提交 29942683 作者: 薛凌堃

证监会公告脚本调整

上级 285fb7bc
import json
import json
......@@ -37,7 +37,7 @@ def convert_size(size_bytes):
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
......@@ -55,7 +55,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
try:
name = pdf_name + '.pdf'
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
result = obsClient.putContent('zzsn', 'QYNotice/'+name, content=response.content)
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
......@@ -113,12 +113,12 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
# id = ''
# return id
# else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
create_time, page_size,path,'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
......@@ -277,9 +277,9 @@ def InsterInto(social_code, pdf_url,pub_time):
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
#判断文件是否已经存在obs服务器中
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告'
now_time = time.strftime("%Y-%m")
file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
file_path = 'QYNotice/'+pdf_name+'.pdf'
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
log.info('=====文件不存在obs=====')
......@@ -372,19 +372,23 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
except:
pass
# # 先获取页数
# page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
#
# total = re.findall(r'\d+', page)[0]
#
# r_page = int(total) % 15
# if r_page == 0:
# Maxpage = int(total) // 15
# else:
# Maxpage = int(total) // 15 + 1
# log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# # 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1,51):
if Maxpage < 50:
pass
else:
Maxpage = 50
for i in range(1,Maxpage):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论