企业公告调整

727d72b9 · 薛凌堃 · 5641214f · 727d72b9
--- a/comData/noticeReport/证监会-公告.py
+++ b/comData/noticeReport/证监会-公告.py
-import json
+
+
 import json
 import re
 import time
+import uuid
+from datetime import datetime
+
 import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
+from retry import retry
+
 from base import BaseCore
 from obs import ObsClient
 import fitz
@@ -24,6 +30,11 @@ obsClient = ObsClient(
        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # 你的华为云的sk
        server='https://obs.cn-north-1.myhuaweicloud.com'  # 你的桶的地址
    )
+pathType = 'QYNotice/'  # key prefix prepended to the object name when uploading to the 'zzsn' bucket
+
+def getuuid():
+    get_timestamp_uuid = uuid.uuid1()  # timestamp-based (version 1) UUID, intended to be globally unique
+    return get_timestamp_uuid

 #获取文件大小
 def convert_size(size_bytes):
@@ -44,26 +55,28 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    headers['User-Agent'] = baseCore.getRandomUserAgent()
    for i in range(0, 3):
        try:
-            response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
+            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            file_size = int(response.headers.get('Content-Length'))
            break
        except:
            time.sleep(3)
            continue
    page_size = 0
-    for i in range(0, 3):
-        try:
-            name = pdf_name
-            now_time = time.strftime("%Y-%m")
-            result = obsClient.putContent('zzsn', 'QYNotice/'+name, content=response.content)
-            with fitz.open(stream=response.content, filetype='pdf') as doc:
-                page_size = doc.page_count
-                for page in doc.pages():
-                    retData['content'] += page.get_text()
-            break
-        except:
-            time.sleep(3)
-            continue
+    name = str(getuuid()) + '.pdf'
+    now_time = time.strftime("%Y-%m")
+    try:
+        result = getOBSres(pathType, name, response)
+    except:
+        log.error(f'OBS发送失败')
+        return retData
+    try:
+        with fitz.open(stream=response.content, filetype='pdf') as doc:
+            page_size = doc.page_count
+            for page in doc.pages():
+                retData['content'] += page.get_text()
+    except:
+        log.error(f'文件损坏')
+        return retData

    if page_size < 1:
        # pdf解析失败
@@ -73,8 +86,8 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
        try:
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            retData['state'] = True
-            retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
-            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = result['body']['objectUrl']
            retData['file_size'] = convert_size(file_size)
            retData['create_time'] = time_now
            retData['page_size'] = page_size
@@ -86,15 +99,21 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):

        return retData

-def secrchATT(item_id, name, type_id,order_by):
-    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
-    cursor_.execute(sel_sql, (item_id, name, type_id,order_by))
-    select = cursor_.fetchall()
-    selects = select[-1]
+@retry(tries=3, delay=1)  # retry package: at most 3 attempts, 1 s pause between them
+def getOBSres(pathType,name, response):  # upload response.content to bucket 'zzsn' under key pathType + name
+    result = obsClient.putContent('zzsn', pathType + name, content=response.content)
+    # Alternative for a file on disk: obsClient.putFile('zzsn', pathType + name, file_path='<local path of the file to upload>')
+    return result
+
+
+def secrchATT(item_id, retData, type_id,order_by):  # look up the attachment row by item_id, stored path, type_id and order_by
+    sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
+    cursor_.execute(sel_sql, (item_id, retData['path'], type_id,order_by))
+    selects = cursor_.fetchone()  # a single row, or None when nothing matches (assuming a DB-API cursor — TODO confirm)
    return selects

 # 插入到att表 返回附件id
-def tableUpdate(retData, com_name, year, pdf_name, num):
+def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
@@ -115,18 +134,19 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
    #     return id
    # else:
    try:
-        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        values = (
            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
-            create_time, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn')
+            create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
+            pub_time, origin)
        cursor_.execute(Upsql, values)  # 插入
        cnx_.commit()  # 提交
    except Exception as e:
        print(e)
    log.info(f"更新完成:{item_id}===={pdf_name}")
-    selects = secrchATT(item_id, pdf_name, type_id,order_by)
+    selects = secrchATT(item_id, retData, type_id,order_by)
    id = selects[0]
    return id

@@ -300,7 +320,8 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        log.info(f'====pdf解析失败====')
        return False
    num = num + 1
-    att_id = tableUpdate(retData,com_name,year,pdf_name,num)
+    origin = '证监会'
+    att_id = tableUpdate(retData,com_name,year,pdf_name,num,pub_time,origin)
    if att_id:
        pass
    else:
@@ -318,7 +339,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        'id': '',
        'keyWords': '',
        'lang': 'zh',
-        'origin': '证监会',
+        'origin': origin,
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # 原文链接
@@ -394,7 +415,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
        pass
    else:
        Maxpage = 50
-    for i in range(1,Maxpage):
+    for i in range(1,Maxpage+1):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            href = url
@@ -415,17 +436,22 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
            pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
            name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'') + '.pdf'

-            pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
+            pub_time_ = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
            #todo:判断发布日期是否是日期格式
            pattern = r"^\d{4}-\d{2}-\d{2}$"  # 正则表达式匹配YYYY-MM-DD格式的日期
            date_time_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
-            if re.match(pattern, pub_time):
+            if re.match(pattern, pub_time_):
                pass
            else:
-                if re.match(date_time_pattern, pub_time):
+                if re.match(date_time_pattern, pub_time_):
                    pass
                else:
                    continue
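+            # Parse the date-only string ("%Y-%m-%d") into a datetime. NOTE(review): values accepted above via date_time_pattern also carry "HH:MM:SS", for which this strptime call raises ValueError.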
+            date_object = datetime.strptime(pub_time_, "%Y-%m-%d")
+
+            # Format the datetime back into a "YYYY-MM-DD HH:MM:SS" string
+            pub_time = date_object.strftime("%Y-%m-%d %H:%M:%S")
            year = pub_time[:4]
            report_type = td_list[4].text.strip()