提交 13609e3d 作者: 薛凌堃

雪球网年报

上级 d30620e6
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -35,8 +35,10 @@ chromedriver = r'D:/cmd100/chromedriver.exe' ...@@ -35,8 +35,10 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
log = baseCore.getLogger() log = baseCore.getLogger()
requests.adapters.DEFAULT_RETRIES = 3 requests.adapters.DEFAULT_RETRIES = 3
#11数据库
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4') cnx = baseCore.cnx_
cursor = baseCore.cursor_
#144数据库
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
cursor_ = baseCore.cursor cursor_ = baseCore.cursor
...@@ -159,94 +161,92 @@ def spider_annual_report(dict_info,num): ...@@ -159,94 +161,92 @@ def spider_annual_report(dict_info,num):
# name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '') # name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
# name_pdf = pdf_name_a + '.pdf' # name_pdf = pdf_name_a + '.pdf'
with cnx.cursor() as cursor: if '年' in year:
if '年' in year: year = year.split('年')[0]
year = year.split('年')[0] else:
else: pass
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone()
if selects:
log.info(f'com_name:{com_name}、{year}已存在')
continue
else:
#上传文件至obs服务器
retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun')
if retData['state']:
pass pass
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone()
if selects:
log.info(f'com_name:{com_name}、{year}已存在')
continue
else: else:
#上传文件至obs服务器 log.info(f'====pdf解析失败====')
retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun') continue
if retData['state']: num = num + 1
pass try:
else: origin = '雪球网'
log.info(f'====pdf解析失败====') att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time,origin)
continue content = retData['content']
num = num + 1 state = 1
try: takeTime = baseCore.getTimeCost(start_time, time.time())
origin = '雪球网' baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time,origin) except Exception as e:
content = retData['content'] exception = '数据库传输失败'
state = 1 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功') baseCore.recordLog(social_code, taskType, state, takeTime, year_url, f'{exception} - --{e}')
except Exception as e: return False
exception = '数据库传输失败' #发送数据到kafka
state = 0 lang = baseCore.detect_language(content)
takeTime = baseCore.getTimeCost(start_time, time.time()) if lang == 'cn':
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, f'{exception} - --{e}') lang = 'zh'
return False time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#发送数据到kafka dic_news = {
lang = baseCore.detect_language(content) 'attachmentIds': att_id,
if lang == 'cn': 'author': '',
lang = 'zh' 'content': content,
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 'contentWithTag': '',
dic_news = { 'createDate': time_now,
'attachmentIds': att_id, 'deleteFlag': '0',
'author': '', 'id': '',
'content': content, 'keyWords': '',
'contentWithTag': '', 'lang': lang,
'createDate': time_now, 'origin': origin,
'deleteFlag': '0', 'publishDate': datetime_string,
'id': '', 'sid': '1684032033495392257',
'keyWords': '', 'sourceAddress': year_url, # 原文链接
'lang': lang, 'summary': '',
'origin': origin, 'title': name_pdf.replace(',pdf', ''),
'publishDate': datetime_string, 'type': 1,
'sid': '1684032033495392257', 'socialCreditCode': social_code,
'sourceAddress': year_url, # 原文链接 'year': year
'summary': '', }
'title': name_pdf.replace(',pdf', ''), # 将相应字段通过kafka传输保存
'type': 1, try:
'socialCreditCode': social_code, producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
'year': year kafka_result = producer.send("researchReportTopic",
} json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10)) print(kafka_result.get(timeout=10))
dic_result = { dic_result = {
'success': 'ture', 'success': 'ture',
'message': '操作成功', 'message': '操作成功',
'code': '200', 'code': '200',
} }
log.info(dic_result) log.info(dic_result)
# return True # return True
except Exception as e: except Exception as e:
dic_result = { dic_result = {
'success': 'false', 'success': 'false',
'message': '操作失败', 'message': '操作失败',
'code': '204', 'code': '204',
'e': e 'e': e
} }
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
log.info(dic_result) log.info(dic_result)
return False return False
# num = num + 1 time.sleep(2)
time.sleep(2)
# browser.quit() # browser.quit()
return True return True
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论