提交 1da1551d 作者: 薛凌堃

工具包注释fdfs方法

上级 54dd8a54
......@@ -25,8 +25,8 @@ from DBUtils.PooledDB import PooledDB
from fdfs_client.client import get_tracker_conf, Fdfs_client
import uuid
# Build the module-level fdfs client: read the tracker settings from a
# hard-coded local client.conf path, then create the Fdfs_client that
# upLoadToServe uses via client.upload_by_buffer().
# NOTE(review): the path is an absolute Windows path; presumably this only
# works on the original developer machine - confirm before deploying elsewhere.
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf)
# tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
# client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
......@@ -613,48 +613,48 @@ class BaseCore:
self.r.expire(key, 3600)
time.sleep(2)
# 上传至文件服务器,并解析pdf的内容和页数
def upLoadToServe(self, pdf_url, type_id, social_code):
    """Download a PDF, extract its text and page count, and upload it to the file server.

    The work is done in three independent stages so that a failure in one
    stage never repeats the side effects of another:

    1. download the PDF bytes (up to 3 attempts),
    2. parse the bytes with fitz to get the page count and text,
    3. upload the bytes through the module-level fdfs ``client``
       (up to 3 attempts), only after parsing succeeded.

    Args:
        pdf_url: URL the PDF is fetched from.
        type_id: Stored unchanged under ``'type_id'`` in the result.
        social_code: Stored unchanged under ``'item_id'`` in the result.

    Returns:
        dict: The attachment record. ``'state'`` is True only when download,
        parsing (at least one page) and upload all succeeded; in that case
        ``'path'``, ``'full_path'``, ``'file_size'``, ``'create_time'``,
        ``'page_size'`` and ``'content'`` are filled in. On any failure the
        record is returned with ``'state'`` False and those fields left as
        empty strings.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Stage 1: download. `except Exception` (not a bare except) keeps
    # Ctrl-C / SystemExit working while still retrying on network errors.
    resp_content = None
    for _ in range(3):
        try:
            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            time.sleep(3)
    if resp_content is None:
        # Nothing was downloaded; bail out instead of falling through to the
        # parse/upload stages with an undefined buffer.
        print('======pdf下载失败=====')
        return retData

    # Stage 2: parse. The buffer is already in memory, so parsing is
    # deterministic and is attempted once. Text is collected locally so a
    # failure midway never leaks partial content into the result.
    page_size = 0
    text_parts = []
    try:
        with fitz.open(stream=resp_content, filetype='pdf') as doc:
            page_size = doc.page_count
            for page in doc.pages():
                text_parts.append(page.get_text())
    except Exception:
        # A failure while reading any page invalidates the whole parse.
        page_size = 0
    if page_size < 1:
        # PDF could not be parsed (or has no pages).
        print('======pdf解析失败=====')
        return retData

    # Stage 3: upload, only for PDFs that parsed, so unparsable files are
    # never left orphaned on the file server and retries never re-upload a
    # file that was already stored.
    result = None
    for _ in range(3):
        try:
            result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
            break
        except Exception:
            time.sleep(3)
    if result is None:
        print('======pdf上传失败=====')
        return retData

    remote_file_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    # 'path' is the remote file id with the group name stripped.
    retData['path'] = remote_file_id.replace('group1', '')
    retData['full_path'] = remote_file_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['content'] = ''.join(text_parts)
    return retData
# # 上传至文件服务器,并解析pdf的内容和页数
# def upLoadToServe(self, pdf_url, type_id, social_code):
# headers = {}
# retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
# 'full_path': '',
# 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
# 'create_time': '', 'page_size': '', 'content': ''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # pdf解析失败
# print(f'======pdf解析失败=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论