Commit bb122afc by XveLingKun

Merge remote-tracking branch 'origin/master'

import datetime
import re
import time

import requests
from bs4 import BeautifulSoup
from retry import retry

from ClassTool import ClassTool
from BaseCore import BaseCore

baseTool = ClassTool()
baseCore = BaseCore()
log = baseCore.getLogger()
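

# getSoup: download a page through the shared proxy pool and return its parsed
# HTML; the retry decorator below re-attempts transient failures up to five times.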
@retry(tries=5, delay=3)
def getSoup(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'Connection': 'keep-alive',
        # 'Cookie': 'wdcid=043e7d9c8a90fbd9; wdcid=45e06544d2ddbc06; __auc=a67cddac18991603628dac4003a; wdses=14387868be342942; SERVERID=adb2d3a906b8c5e3f02ddd9c20949df0|1714008937|1714008813; wdlast=1714008941',
        'Host': 'www.gov.cn',
        # 'Referer': 'https://www.gov.cn/zhengce/wenjian/zhongyang/home_1.htm',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # A timeout keeps a stalled connection from hanging forever; without one the
    # retry decorator never fires on a hung request (30s is an assumed value).
    req = requests.get(url, headers=headers, proxies=baseCore.get_proxy(), timeout=30)
    # Let requests sniff the real encoding before decoding the body
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    req.close()
    return soup
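

# getTotal: the list page embeds its page count in an inline script as
# "nPageCount = N;"; pull that number out with a regex.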
def getTotal(soup):
    total = soup.find('div', class_='news_box').text
    # Raw string so \d is a regex class, not an invalid string escape
    totalPage = re.findall(r'nPageCount = (\d+);', total)[0]
    return int(totalPage)
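

# getDic: fetch one policy page, split its date/source header into publish time
# and issuing organ, assemble the record, send it to Kafka, and persist it on success.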
def getDic(title, url):
    soup = getSoup(url)
    # Rewrite the page's relative links to absolute URLs
    soup = baseTool.paserUrl(str(soup), url)
    # Header line looks like "YYYY-MM-DD HH:MM 来源:<organ> 字号..."
    dateAndOrigin = soup.find('div', class_='pages-date').text.strip()
    publishDate = dateAndOrigin.split('来源:')[0].strip()
    publishDate = datetime.datetime.strptime(publishDate, '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S')
    pub_org = dateAndOrigin.split('来源:')[1].split('字号')[0].strip()
    contentWithTag = soup.find('div', class_='pages_content')
    content = contentWithTag.text
    contentWithTag = str(contentWithTag)
    id_list = []
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic = {
        'attachmentIds': id_list,  # attachment ids
        'author': '',  # author
        'content': content,  # body text without tags
        'contentWithTag': contentWithTag,  # body text with HTML tags
        'createDate': time_now,  # record creation time
        'deleteFlag': 0,  # delete flag (0 = default/kept, 1 = deleted)
        'id': '',
        'labels': [{'relationId': "1783325599872438274", 'relationName': "中央有关文件", 'labelMark': "policy"}],
        # related label id / related label name / related label mark
        'origin': '中华人民共和国中央人民政府',  # policy publishing body
        'organ': pub_org,  # policy issuing organ
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document reference number
        'publishDate': publishDate,  # publish time
        'writtenDate': None,  # date the document was written
        'sid': '1697458829758697473',  # information source id
        'sourceAddress': url,  # original URL
        'summary': '',  # abstract
        'title': title  # title
    }
    flag = baseTool.sendKafka(dic)
    if flag:
        baseTool.save_data(dic)
    return flag
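

# doJob: walk the paginated list (home.htm, home_1.htm, home_2.htm, ...),
# collecting each new document until it hits an already-stored URL or an
# item published before 2020-01-01.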
def doJob():
    start_time = time.time()
    num = 0
    flg = False
    url = 'https://www.gov.cn/zhengce/wenjian/zhongyang/home.htm'
    soup = getSoup(url)
    totalPage = getTotal(soup)
    for i in range(totalPage):
        if flg:
            break
        # Page 0 is home.htm; later pages follow the home_<n>.htm pattern
        if i == 1:
            url = url.replace('home', 'home_1')
            soup = getSoup(url)
        elif i > 1:
            url = url.replace(f'home_{i - 1}', f'home_{i}')
            soup = getSoup(url)
        liList = soup.find('div', class_='news_box').find_all('li')
        for li in liList:
            title = li.find('a').text.strip()
            href = li.find('a').get('href')
            if 'https://www.gov.cn/' not in href:
                href = href.replace('../../', 'https://www.gov.cn/zhengce/')
            # Stop once we reach a URL that is already in storage
            is_href = baseTool.db_storage.find_one({'网址': href})
            if is_href:
                flg = True
                break
            # Zero-padded ISO dates compare correctly as plain strings
            dateFlg = li.find('span').text.strip()
            if dateFlg < '2020-01-01':
                flg = True
                break
            if getDic(title, href):
                num += 1
            time.sleep(2)
    end_time = time.time()
    log.info(f'Collected {num} central policy documents in {end_time - start_time:.1f}s')
if __name__ == '__main__':
    doJob()
    baseCore.close()