Commit 29942683    Author: 薛凌堃

Adjust the CSRC (证监会) announcement collection script

Parent 285fb7bc
 import json
 ...
@@ -37,7 +37,7 @@ def convert_size(size_bytes):
 def uptoOBS(pdf_url,pdf_name,type_id,social_code):
     headers = {}
-    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
                'full_path': '',
                'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
                'create_time': '', 'page_size': '', 'content': ''}
@@ -55,7 +55,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
     try:
         name = pdf_name + '.pdf'
         now_time = time.strftime("%Y-%m")
-        result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
+        result = obsClient.putContent('zzsn', 'QYNotice/'+name, content=response.content)
         with fitz.open(stream=response.content, filetype='pdf') as doc:
             page_size = doc.page_count
             for page in doc.pages():
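The storage key changes from the month-partitioned `ZJH/{now_time}/` prefix to a flat `QYNotice/` prefix, so the same announcement always maps to the same object key regardless of when it is collected. Below is a minimal sketch of the new upload path, assuming obsClient is a Huawei OBS ObsClient (esdk-obs-python), fitz is PyMuPDF, and OBS_AK / OBS_SK / OBS_ENDPOINT are placeholders for configuration not shown in the diff:

# Sketch only: the real script builds obsClient and response elsewhere.
import requests
import fitz  # PyMuPDF
from obs import ObsClient  # esdk-obs-python

# Placeholder credentials and endpoint, not values from the diff.
obsClient = ObsClient(access_key_id='OBS_AK', secret_access_key='OBS_SK', server='OBS_ENDPOINT')

def upload_notice_pdf(pdf_url, pdf_name):
    response = requests.get(pdf_url, timeout=30)
    object_key = 'QYNotice/' + pdf_name + '.pdf'   # flat prefix, no month folder
    result = obsClient.putContent('zzsn', object_key, content=response.content)
    # Page count and text are read from the same bytes that were uploaded.
    with fitz.open(stream=response.content, filetype='pdf') as doc:
        page_size = doc.page_count
        content = ''.join(page.get_text() for page in doc.pages())
    return result, page_size, content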
@@ -113,12 +113,12 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
     # id = ''
     # return id
     # else:
-    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
     values = (
         year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
         status, create_by,
-        create_time, page_size)
+        create_time, page_size, path, 'zzsn')
     cursor_.execute(Upsql, values)  # insert
     cnx_.commit()  # commit
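The insert now persists two extra columns, object_key and bucket_name; per the values tuple above, object_key reuses the OBS path and bucket_name is hard-coded to 'zzsn'. A hedged sketch of the widened insert, assuming cnx_ and cursor_ come from a pymysql connection (the connection parameters and the helper name insert_attachment are placeholders, not from the original script):

import pymysql

# Placeholder connection settings; the real script configures these elsewhere.
cnx_ = pymysql.connect(host='127.0.0.1', user='clb', password='***', database='clb', charset='utf8mb4')
cursor_ = cnx_.cursor()

def insert_attachment(year, pdf_name, type_id, item_id, group_name, path, full_path,
                      category, file_size, order_by, status, create_by, create_time, page_size):
    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,
               category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name)
               values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    # object_key duplicates the OBS path; bucket_name is fixed to 'zzsn', as in the diff.
    values = (year, pdf_name, type_id, item_id, group_name, path, full_path, category,
              file_size, order_by, status, create_by, create_time, page_size, path, 'zzsn')
    cursor_.execute(Upsql, values)
    cnx_.commit()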
@@ -277,9 +277,9 @@ def InsterInto(social_code, pdf_url,pub_time):
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
     # Check whether the file already exists on the OBS server
-    # file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
+    # file_path = 'QYNotice//浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告'
     now_time = time.strftime("%Y-%m")
-    file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
+    file_path = 'QYNotice/'+pdf_name+'.pdf'
     response = obsClient.getObjectMetadata('zzsn', file_path)
     if response.status >= 300:
         log.info('=====文件不存在obs=====')
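Because the month folder is gone, the duplicate check can rebuild the object key from the announcement name alone. A small sketch of that check, assuming the same obsClient instance as above (the helper name pdf_already_uploaded is illustrative); a status of 300 or higher from getObjectMetadata means the key is not present in the bucket:

def pdf_already_uploaded(obsClient, pdf_name):
    file_path = 'QYNotice/' + pdf_name + '.pdf'   # no month segment in the key anymore
    resp = obsClient.getObjectMetadata('zzsn', file_path)
    return resp.status < 300   # < 300: object already stored, skip re-download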
@@ -372,19 +372,23 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
         except:
             pass
-        # # First get the number of result pages
-        # page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
-        #
-        # total = re.findall(r'\d+', page)[0]
-        #
-        # r_page = int(total) % 15
-        # if r_page == 0:
-        #     Maxpage = int(total) // 15
-        # else:
-        #     Maxpage = int(total) // 15 + 1
-        # log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
+        # First get the number of result pages
+        page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
+
+        total = re.findall(r'\d+', page)[0]
+
+        r_page = int(total) % 15
+        if r_page == 0:
+            Maxpage = int(total) // 15
+        else:
+            Maxpage = int(total) // 15 + 1
+        log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
         # # The first page uses a different URL from the other pages; adjust the link when on page 1
-        for i in range(1,51):
+        if Maxpage < 50:
+            pass
+        else:
+            Maxpage = 50
+        for i in range(1,Maxpage):
             log.info(f'==========正在采集第{i}页=========')
             if i == 1:
                 href = url
...
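The previously commented-out pagination is re-enabled here: the total record count is read from the pager, converted to a page count at 15 records per page, and capped at 50 pages instead of the old fixed range(1,51). An equivalent sketch of that calculation using math.ceil (the constants 15 and 50 mirror the literals in the diff; the function name result_page_count is illustrative):

import math
import re

PAGE_SIZE = 15   # records per result page
MAX_PAGES = 50   # hard cap kept from the diff

def result_page_count(soup):
    # Pager text looks like "共 N 条", so the first number is the record total.
    pager_text = soup.find('div', class_='pages').find('ul', class_='g-ul').text
    total = int(re.findall(r'\d+', pager_text)[0])
    return min(math.ceil(total / PAGE_SIZE), MAX_PAGES)

Note that range(1, Maxpage) in the diff excludes its upper bound, so the loop stops one page short of Maxpage; range(1, Maxpage + 1) would be needed to visit the final page.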