提交 a0ee390b 作者: XveLingKun

证监会公告

上级 79448081
......@@ -51,18 +51,29 @@ def convert_size(size_bytes):
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'acw_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; cdn_sec_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; acw_sc__v3=666be84cf4454ec8c2436572df9f9e6dc78b409b; tfstk=fqG-Yit_Tnxu599nN49m-_AIYB8Dsb3ru0u1tkXQNmEI7DCnr3uQAkigmbqor2ZKpoisr_xrxDEIS2DuVDDuAyiif42H87mY9mn0qT0H46hKjmf3V3PSp6FriYf3q3PKRcVpjhAMs4uzaWtMjvDJxuN_WgaCxuNbhEjiRARMs4u5rzTilCYevQh4AkNCFk6Xky48OzwSAowb5Pj7OWiIlEU37k1QAzgfGzznwK58vaZN9vHWr405ar5CObNUelUOK6CLOzwRU4Zxy4hYy8ETn4oFqbimRbcz3KW0TqDtvvi6mTqSBPnYIYKOwcnzRmFo1eJz3JGK9rk2v9Etd4DZd-LWNqF82-U8eaBQwvir98kR8FubNmkab8920rhosJEaHitSoqE7Bvnk06ZoBqiYIjjcs5MZDXe_1gPEsfIB8GqT-TTvk9WUFrfOmApHgpiI6rEMumWFL-U4klYvk9WUFrzYjEyVL9yYu',
'Host': 'static.sse.com.cn',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
try:
file_size = int(response.headers.get('Content-Length'))
break
except:
file_size = 0
break
except Exception as e:
time.sleep(3)
continue
page_size = 0
......@@ -78,7 +89,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
except Exception as e:
log.error(f'文件损坏')
return retData
......@@ -156,10 +167,11 @@ def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
@retry(tries=3, delay=5)
def RequestUrl(url, payload, social_code,start_time):
ip = baseCore.get_proxy()
# ip = baseCore.get_proxy()
# proxy = {'https': 'http://127.0.0.1:8888', 'http': 'http://127.0.0.1:8888'}
response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
# response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
response = requests.post(url=url, headers=headers, data=payload)
# response = requests.post(url=url, data=payload)
response.encoding = response.apparent_encoding
if response.status_code == 200:
......@@ -463,6 +475,13 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time = date_object.strftime("%Y-%m-%d %H:%M:%S")
year = pub_time[:4]
report_type = td_list[4].text.strip()
# 获取当前年份
current_year = datetime.now().year
# print(current_year)
if int(current_year) < int(year):
continue
if str(current_year)[:1] < year[:1]: # 防止年份出现6005这种切出来股票代码的情况
continue
# 判断数据库中是否有该条资讯
ifexist = ifInstert(short_name, social_code, pdf_url)
......@@ -489,7 +508,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
else:
log.info(f'======={short_name}========{code}===已存在')
# continue
break
return
if __name__ == '__main__':
num = 0
......@@ -528,8 +547,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
social_code = '91370000163446410B'
social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode_add')
# social_code = '91370000163446410B'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论