提交 d30620e6 作者: 薛凌堃

雪球网年报

上级 3d1b07fd
......@@ -767,11 +767,15 @@ class BaseCore:
log = self.getLogger()
log.error(f'OBS发送失败')
return retData
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -177,7 +177,7 @@ def spider_annual_report(dict_info,num):
pass
else:
log.info(f'====pdf解析失败====')
return False
continue
num = num + 1
try:
origin = '雪球网'
......@@ -259,8 +259,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '91100000100003962T'
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '913412007050444417'
if not social_code:
time.sleep(20)
continue
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论