提交 632d5a17 作者: 薛凌堃

企业公告维护

上级 8cf6e366
import os
import os
......@@ -46,22 +46,41 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
pass
except:
time.sleep(3)
continue
page_size = 1
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
......@@ -85,7 +104,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
......
import os
import os
......@@ -48,6 +48,7 @@ def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
......@@ -56,16 +57,41 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
if category == '.pdf':
try:
ip = baseCore.get_proxy()
response = requests.get(pdf_url, headers=headers,verify=False,proxies=ip, timeout=20)
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
break
except Exception as e:
time.sleep(60)
continue
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
......@@ -73,12 +99,6 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except:
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
except:
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
......@@ -95,11 +115,12 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论