提交 c702fb7b 作者: 薛凌堃

语言判断

上级 09af6b88
import os import os
...@@ -192,7 +192,9 @@ def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_ti ...@@ -192,7 +192,9 @@ def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_ti
else: else:
return False return False
content = retData['content'] content = retData['content']
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -203,7 +205,7 @@ def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_ti ...@@ -203,7 +205,7 @@ def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_ti
'deleteFlag': '0', 'deleteFlag': '0',
'id': '', 'id': '',
'keyWords': '', 'keyWords': '',
'lang': 'zh', 'lang': lang,
'origin': '东方财富网', 'origin': '东方财富网',
'publishDate': pub_time, 'publishDate': pub_time,
'sid': '1684032033495392257', 'sid': '1684032033495392257',
......
import os import os
...@@ -278,7 +278,9 @@ def spider(browser, code, social_code, com_name): ...@@ -278,7 +278,9 @@ def spider(browser, code, social_code, com_name):
att_id = '' att_id = ''
browser2.quit() browser2.quit()
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -289,7 +291,7 @@ def spider(browser, code, social_code, com_name): ...@@ -289,7 +291,7 @@ def spider(browser, code, social_code, com_name):
'deleteFlag': '0', 'deleteFlag': '0',
'id': '', 'id': '',
'keyWords': '', 'keyWords': '',
'lang': 'zh', 'lang': lang,
'origin': '东方财富网', 'origin': '东方财富网',
'publishDate': publishDate, 'publishDate': publishDate,
'sid': '1684032033495392257', 'sid': '1684032033495392257',
......
...@@ -308,7 +308,9 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -308,7 +308,9 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
else: else:
return False return False
content = retData['content'] content = retData['content']
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -319,7 +321,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -319,7 +321,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
'deleteFlag': '0', 'deleteFlag': '0',
'id': '', 'id': '',
'keyWords': '', 'keyWords': '',
'lang': 'zh', 'lang': lang,
'origin': origin, 'origin': origin,
'publishDate': pub_time, 'publishDate': pub_time,
'sid': '1684032033495392257', 'sid': '1684032033495392257',
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论