提交 0afdebe1 作者: 薛凌堃

证监会企业年报

上级 4e84d611
import json
import json
import json
from datetime import datetime
from kafka import KafkaProducer
......@@ -113,6 +114,11 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
else:
continue
# print(name)
# 将时间年月日字符串转换为datetime对象
date_object = datetime.strptime(pub_time, "%Y-%m-%d")
# 将datetime对象转换为年月日时分秒字符串
datetime_string = date_object.strftime("%Y-%m-%d %H:%M:%S")
report_type = td_list[4].text.strip()
# print(report_type)
if report_type == '年报':
......@@ -143,13 +149,16 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
return False
#插入数据库获取att_id
num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time)
origin = '证监会'
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time,origin)
if att_id:
pass
else:
return False
content = retData['content']
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': att_id,
......@@ -160,9 +169,9 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '证监会',
'publishDate': pub_time,
'lang': lang,
'origin': origin,
'publishDate': datetime_string,
'sid': '1684032033495392257',
'sourceAddress': pdf_url, # 原文链接
'summary': '',
......@@ -174,7 +183,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论