Commit 0cdf68ae by 薛凌堃

Merge remote-tracking branch 'origin/master'

# Central Economic Work Conference (中央经济工作会议)
import datetime
import json
import re
import time
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from base import BaseCore
baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
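# Redis Set used to de-duplicate article URLs that have already been pushed downstream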
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
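# Browser-like request headers for www.12371.cn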
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'http://www.12371.cn/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def is_member_containing_string(key, string):
    """Return True if any member of the Redis Set `key` contains `string` as a substring."""
    cursor = 0
    while True:
        # Walk the Set incrementally with SSCAN
        cursor, members = r.sscan(key, cursor)
        for member in members:
            # Check whether this member contains the given substring
            if string in member.decode("utf-8"):
                return True
        # SSCAN signals the end of the iteration with cursor 0
        if cursor == 0:
            break
    return False


def sendKafka(dic_info):
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("research_center_fourth",
                                     json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
        # r.sadd(info_code + '-test', sourceAddress)
        # send() is asynchronous; wait for the broker acknowledgement before reporting success
        kafka_result.get(timeout=10)
        log.info(f'{dic_info["title"]} sent to Kafka successfully')
        return True
    except Exception as e:
        log.info(f'{dic_info["title"]} failed to send to Kafka === {e}')
        return False
    finally:
        producer.close()


def getData(year, summary, url):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    # timeTag has the form "发布时间:<YYYY年MM月DD日 HH:MM> 来源:<source>"
    timeTag = soup.find('i', class_='time').text.strip()
    publishDate = timeTag.split('发布时间:')[1].split('来源:')[0].strip()
    publishDate = datetime.datetime.strptime(publishDate, "%Y年%m月%d日 %H:%M")
    publishDate = publishDate.strftime('%Y-%m-%d %H:%M:%S')
    title = soup.find('h1', class_='big_title').text.strip()
    title = f'({year})' + title
    contentWithTag = soup.find('div', class_='word')
    # Drop the embedded media block (class 'bfq_img1220') if present
    player = contentWithTag.find('div', class_='bfq_img1220')
    if player:
        player.decompose()
    # Remove the "延伸阅读" (further reading) paragraph and any paragraph that only carries a link
    pList = contentWithTag.find_all('p')
    for p in pList:
        if p.text.strip() == '延伸阅读':
            p.decompose()
            continue
        a = p.find('a')
        if a:
            p.decompose()
    # Strip inline scripts from the article body
    scripts = contentWithTag.find_all('script')
    for script in scripts:
        script.decompose()
    content = contentWithTag.text
    time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    dic_info = {
        # subjectId concatenated with a millisecond timestamp, used as the record id
        'id': '1681549573150879745' + str(int(time.time() * 1000)),
        'title': title,
        'origin': '共产党员网',
        'contentWithTag': str(contentWithTag),
        'content': content,
        'summary': summary,
        'publishDate': publishDate,
        'sid': '1691634024094507010',
        'subjectId': '1681549573150879745',
        'sourceAddress': url,
        'checkStatus': 1,
        'deleteFlag': 0,
        'createDate': time_now,
    }
    return dic_info


def doJob():
    info_code = 'IN-20230816-0006'
    url = 'https://www.12371.cn/special/lczyjjgzhy/'
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    # The special-topic header block carries a link, a year in its title, and a summary paragraph
    hTag = soup.find('div', attrs={'id': 'page_body'}).find('div', class_='dyw638_title_jj').find('h2')
    href = hTag.find('a').get('href')
    summary = soup.find('div', attrs={'id': 'page_body'}).find('div', class_='dyw638_title_jj').find('p').text
    year = re.findall(r'\d+年', hTag.text.strip())[0]
    req_ = requests.get(href, headers=headers)
    req_.encoding = req_.apparent_encoding
    soup_ = BeautifulSoup(req_.text, 'html.parser')
    aList = soup_.find('div', class_='word').find_all('a')
    # Follow every '【详细】' (details) link on the page
    for a in aList:
        if a.text.strip() == '【详细】':
            href_ = a.get('href')
            # Stop once a URL that has already been collected is encountered
            if is_member_containing_string(info_code, href_):
                return
            dic = getData(year, summary, href_)
            if sendKafka(dic):
                r.sadd(info_code, href_)


if __name__ == '__main__':
    doJob()
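
# A minimal sketch (assuming the same Redis connection and info_code as above) for
# inspecting which article URLs have already been recorded in the dedup Set:
#
#   for member in r.smembers('IN-20230816-0006'):
#       print(member.decode('utf-8'))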

@@ -52,7 +52,7 @@ headers = {
 if __name__ == "__main__":
     # Central Comprehensively Deepening Reforms Commission meetings (中央全面深化改革委员会会议)
-    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
+    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
     # Central Leading Group for Comprehensively Deepening Reforms meetings (中央全面深化改革领导小组会议)
     # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
     url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'