证监会企业年报

0afdebe1 · 薛凌堃 · 4e84d611 · 0afdebe1
--- a/comData/annualReport/证监会-年报.py
+++ b/comData/annualReport/证监会-年报.py
-import json
+import json
 import json
+from datetime import datetime

 from kafka import KafkaProducer

@@ -113,6 +114,11 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                else:
                    continue
            # print(name)
+            # 将时间年月日字符串转换为datetime对象
+            date_object = datetime.strptime(pub_time, "%Y-%m-%d")
+
+            # 将datetime对象转换为年月日时分秒字符串
+            datetime_string = date_object.strftime("%Y-%m-%d %H:%M:%S")
            report_type = td_list[4].text.strip()
            # print(report_type)
            if report_type == '年报':
@@ -143,13 +149,16 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                        return False
                    #插入数据库获取att_id
                    num = num + 1
-                    att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time)
+                    origin = '证监会'
+                    att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time,origin)
                    if att_id:
                        pass
                    else:
                        return False
                    content = retData['content']
-
+                    lang = baseCore.detect_language(content)
+                    if lang == 'cn':
+                        lang = 'zh'
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    dic_news = {
                        'attachmentIds': att_id,
@@ -160,9 +169,9 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                        'deleteFlag': '0',
                        'id': '',
                        'keyWords': '',
-                        'lang': 'zh',
-                        'origin': '证监会',
-                        'publishDate': pub_time,
+                        'lang': lang,
+                        'origin': origin,
+                        'publishDate': datetime_string,
                        'sid': '1684032033495392257',
                        'sourceAddress': pdf_url,  # 原文链接
                        'summary': '',
@@ -174,7 +183,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                    # print(dic_news)
                    # 将相应字段通过kafka传输保存
                    try:
-                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
                        kafka_result = producer.send("researchReportTopic",
                                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))