提交 d30620e6 作者: 薛凌堃

雪球网年报

上级 3d1b07fd
...@@ -767,11 +767,15 @@ class BaseCore: ...@@ -767,11 +767,15 @@ class BaseCore:
log = self.getLogger() log = self.getLogger()
log.error(f'OBS发送失败') log.error(f'OBS发送失败')
return retData return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc: with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count page_size = doc.page_count
for page in doc.pages(): for page in doc.pages():
retData['content'] += page.get_text() retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1: if page_size < 1:
# pdf解析失败 # pdf解析失败
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -177,7 +177,7 @@ def spider_annual_report(dict_info,num): ...@@ -177,7 +177,7 @@ def spider_annual_report(dict_info,num):
pass pass
else: else:
log.info(f'====pdf解析失败====') log.info(f'====pdf解析失败====')
return False continue
num = num + 1 num = num + 1
try: try:
origin = '雪球网' origin = '雪球网'
...@@ -259,8 +259,8 @@ if __name__ == '__main__': ...@@ -259,8 +259,8 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode') # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '91100000100003962T' social_code = '913412007050444417'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
continue continue
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论