提交 9dc7843b 作者: 薛凌堃

美国证券交易委员会kafka大小调整

上级 a74f108a
...@@ -520,7 +520,7 @@ def fbspdfurlinfo(): ...@@ -520,7 +520,7 @@ def fbspdfurlinfo():
if __name__ == "__main__": if __name__ == "__main__":
start = time.time() start = time.time()
fbspdfurlinfo() # fbspdfurlinfo()
# danxiangguanjun() # danxiangguanjun()
# kegaishifan() # kegaishifan()
# shuangbaiqiye() # shuangbaiqiye()
...@@ -536,7 +536,7 @@ if __name__ == "__main__": ...@@ -536,7 +536,7 @@ if __name__ == "__main__":
# FBS() # FBS()
# MengZhi() # MengZhi()
# NQEnterprise() # NQEnterprise()
# SEC_CIK() SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
......
...@@ -29,7 +29,7 @@ type_id = 1 ...@@ -29,7 +29,7 @@ type_id = 1
create_by = 'XueLingKun' create_by = 'XueLingKun'
taskType = '企业年报' taskType = '企业年报'
#付俊雪的需要改为巨潮资讯网1_福布斯2000_PDF_60_付 #付俊雪的需要改为巨潮资讯网1_福布斯2000_PDF_60_付
file_path = 'D:\\BaiduNetdiskDownload\\1_福布斯2000_PDF_55_邵' file_path = 'D:\\BaiduNetdiskDownload\\1_福布斯2000_PDF_60_付'
log.info(f'=============当前pid为{baseCore.getPID()}==============') log.info(f'=============当前pid为{baseCore.getPID()}==============')
def sendKafka(dic_news): def sendKafka(dic_news):
...@@ -180,7 +180,7 @@ if __name__=='__main__': ...@@ -180,7 +180,7 @@ if __name__=='__main__':
'id': '', 'id': '',
'keyWords': '', 'keyWords': '',
'lang': 'zh', 'lang': 'zh',
'origin': '企业官网', 'origin': '巨潮资讯网',
'publishDate': file_year + '-12-31', 'publishDate': file_year + '-12-31',
'sid': '1684032033495392257', 'sid': '1684032033495392257',
'sourceAddress': '', # 原文链接 'sourceAddress': '', # 原文链接
......
...@@ -190,7 +190,7 @@ def spider(com_name,cik,up_okCount): ...@@ -190,7 +190,7 @@ def spider(com_name,cik,up_okCount):
# print(dic_news) # print(dic_news)
# 将相应字段通过kafka传输保存 # 将相应字段通过kafka传输保存
try: try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],compression_type='gzip',batch_size=1638400,linger_ms=1,buffer_memory=33445532*2,max_request_size=8388608) #,batch_size=20480000,buffer_memory=64000000) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],compression_type='gzip',batch_size=1638400,linger_ms=1,buffer_memory=33445532*2,max_request_size=1024*1024*50) #,batch_size=20480000,buffer_memory=64000000)
kafka_result = producer.send("researchReportTopic", kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8')) json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
...@@ -203,7 +203,8 @@ def spider(com_name,cik,up_okCount): ...@@ -203,7 +203,8 @@ def spider(com_name,cik,up_okCount):
} }
log.info(dic_result) log.info(dic_result)
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) values(%s,%s,%s,%s,%s,%s,%s,now())'''
# 动态信息列表 # 动态信息列表
up_okCount = up_okCount + 1 up_okCount = up_okCount + 1
list_info = [ list_info = [
...@@ -211,6 +212,9 @@ def spider(com_name,cik,up_okCount): ...@@ -211,6 +212,9 @@ def spider(com_name,cik,up_okCount):
news_url, news_url,
'SEC', 'SEC',
'1', '1',
filingDate,
title,
content[:500]
] ]
cursor_.execute(insert_sql, tuple(list_info)) cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit() cnx_.commit()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论