Commit 29942683    Author: 薛凌堃

Adjust the CSRC (证监会) announcement collection script

Parent 285fb7bc
 import json
 ...
@@ -37,7 +37,7 @@ def convert_size(size_bytes):
 def uptoOBS(pdf_url,pdf_name,type_id,social_code):
     headers = {}
-    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
                'full_path': '',
                'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
                'create_time': '', 'page_size': '', 'content': ''}
@@ -55,7 +55,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
     try:
         name = pdf_name + '.pdf'
         now_time = time.strftime("%Y-%m")
-        result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
+        result = obsClient.putContent('zzsn', 'QYNotice/'+name, content=response.content)
         with fitz.open(stream=response.content, filetype='pdf') as doc:
             page_size = doc.page_count
             for page in doc.pages():
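The storage key changes from the month-partitioned `ZJH/{now_time}/` prefix to a flat `QYNotice/` prefix, so the same announcement always maps to the same object key regardless of when it is collected. Below is a minimal sketch of the new upload path, assuming obsClient is a Huawei OBS ObsClient (esdk-obs-python), fitz is PyMuPDF, and OBS_AK / OBS_SK / OBS_ENDPOINT are placeholders for configuration not shown in the diff:

# Sketch only: the real script builds obsClient and response elsewhere.
import requests
import fitz  # PyMuPDF
from obs import ObsClient  # esdk-obs-python

# Placeholder credentials and endpoint, not values from the diff.
obsClient = ObsClient(access_key_id='OBS_AK', secret_access_key='OBS_SK', server='OBS_ENDPOINT')

def upload_notice_pdf(pdf_url, pdf_name):
    response = requests.get(pdf_url, timeout=30)
    object_key = 'QYNotice/' + pdf_name + '.pdf'   # flat prefix, no month folder
    result = obsClient.putContent('zzsn', object_key, content=response.content)
    # Page count and text are read from the same bytes that were uploaded.
    with fitz.open(stream=response.content, filetype='pdf') as doc:
        page_size = doc.page_count
        content = ''.join(page.get_text() for page in doc.pages())
    return result, page_size, content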
@@ -113,12 +113,12 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
     # id = ''
     # return id
     # else:
-    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
     values = (
         year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
         status, create_by,
-        create_time, page_size)
+        create_time, page_size, path, 'zzsn')
     cursor_.execute(Upsql, values)  # insert
     cnx_.commit()  # commit
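The insert now persists two extra columns, object_key and bucket_name; per the values tuple above, object_key reuses the OBS path and bucket_name is hard-coded to 'zzsn'. A hedged sketch of the widened insert, assuming cnx_ and cursor_ come from a pymysql connection (the connection parameters and the helper name insert_attachment are placeholders, not from the original script):

import pymysql

# Placeholder connection settings; the real script configures these elsewhere.
cnx_ = pymysql.connect(host='127.0.0.1', user='clb', password='***', database='clb', charset='utf8mb4')
cursor_ = cnx_.cursor()

def insert_attachment(year, pdf_name, type_id, item_id, group_name, path, full_path,
                      category, file_size, order_by, status, create_by, create_time, page_size):
    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,
               category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name)
               values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    # object_key duplicates the OBS path; bucket_name is fixed to 'zzsn', as in the diff.
    values = (year, pdf_name, type_id, item_id, group_name, path, full_path, category,
              file_size, order_by, status, create_by, create_time, page_size, path, 'zzsn')
    cursor_.execute(Upsql, values)
    cnx_.commit()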
@@ -277,9 +277,9 @@ def InsterInto(social_code, pdf_url,pub_time):
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
     # Check whether the file already exists on the OBS server
-    # file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
+    # file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告'
     now_time = time.strftime("%Y-%m")
-    file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
+    file_path = 'QYNotice/'+pdf_name+'.pdf'
     response = obsClient.getObjectMetadata('zzsn', file_path)
     if response.status >= 300:
         log.info('=====文件不存在obs=====')
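Because the month folder is gone, the duplicate check can rebuild the object key from the announcement name alone. A small sketch of that check, assuming the same obsClient instance as above (the helper name pdf_already_uploaded is illustrative); a status of 300 or higher from getObjectMetadata means the key is not present in the bucket:

def pdf_already_uploaded(obsClient, pdf_name):
    file_path = 'QYNotice/' + pdf_name + '.pdf'   # no month segment in the key anymore
    resp = obsClient.getObjectMetadata('zzsn', file_path)
    return resp.status < 300   # < 300: object already stored, skip re-download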
@@ -372,19 +372,23 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
         except:
             pass
-        # # First get the number of result pages
-        # page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
-        #
-        # total = re.findall(r'\d+', page)[0]
-        #
-        # r_page = int(total) % 15
-        # if r_page == 0:
-        #     Maxpage = int(total) // 15
-        # else:
-        #     Maxpage = int(total) // 15 + 1
-        # log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
+        # First get the number of result pages
+        page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
+
+        total = re.findall(r'\d+', page)[0]
+
+        r_page = int(total) % 15
+        if r_page == 0:
+            Maxpage = int(total) // 15
+        else:
+            Maxpage = int(total) // 15 + 1
+        log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
         # # The first page uses a different URL from the other pages; adjust the link when on page 1
-        for i in range(1,51):
+        if Maxpage < 50:
+            pass
+        else:
+            Maxpage = 50
+        for i in range(1,Maxpage):
             log.info(f'==========正在采集第{i}页=========')
             if i == 1:
                 href = url
...
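The previously commented-out pagination is re-enabled here: the total record count is read from the pager, converted to a page count at 15 records per page, and capped at 50 pages instead of the old fixed range(1,51). An equivalent sketch of that calculation using math.ceil (the constants 15 and 50 mirror the literals in the diff; the function name result_page_count is illustrative):

import math
import re

PAGE_SIZE = 15   # records per result page
MAX_PAGES = 50   # hard cap kept from the diff

def result_page_count(soup):
    # Pager text looks like "共 N 条", so the first number is the record total.
    pager_text = soup.find('div', class_='pages').find('ul', class_='g-ul').text
    total = int(re.findall(r'\d+', pager_text)[0])
    return min(math.ceil(total / PAGE_SIZE), MAX_PAGES)

Note that range(1, Maxpage) in the diff excludes its upper bound, so the loop stops one page short of Maxpage; range(1, Maxpage + 1) would be needed to visit the final page.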