Commit 0cdf68ae by 薛凌堃

Merge remote-tracking branch 'origin/master'

# Central Economic Work Conference (中央经济工作会议)
import datetime
import json
import re
import time
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from base import BaseCore
baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
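# Redis Set used to de-duplicate article URLs that have already been pushed downstream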
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
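# Browser-like request headers for www.12371.cn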
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'http://www.12371.cn/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def is_member_containing_string(key, string):
    """Return True if any member of the Redis Set `key` contains `string` as a substring."""
    cursor = 0
    while True:
        # Walk the Set incrementally with SSCAN
        cursor, members = r.sscan(key, cursor)
        for member in members:
            # Check whether this member contains the given substring
            if string in member.decode("utf-8"):
                return True
        # SSCAN signals the end of the iteration with cursor 0
        if cursor == 0:
            break
    return False


def sendKafka(dic_info):
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("research_center_fourth",
                                     json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
        # r.sadd(info_code + '-test', sourceAddress)
        # send() is asynchronous; wait for the broker acknowledgement before reporting success
        kafka_result.get(timeout=10)
        log.info(f'{dic_info["title"]} sent to Kafka successfully')
        return True
    except Exception as e:
        log.info(f'{dic_info["title"]} failed to send to Kafka === {e}')
        return False
    finally:
        producer.close()


def getData(year, summary, url):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    # timeTag has the form "发布时间:<YYYY年MM月DD日 HH:MM> 来源:<source>"
    timeTag = soup.find('i', class_='time').text.strip()
    publishDate = timeTag.split('发布时间:')[1].split('来源:')[0].strip()
    publishDate = datetime.datetime.strptime(publishDate, "%Y年%m月%d日 %H:%M")
    publishDate = publishDate.strftime('%Y-%m-%d %H:%M:%S')
    title = soup.find('h1', class_='big_title').text.strip()
    title = f'({year})' + title
    contentWithTag = soup.find('div', class_='word')
    # Drop the embedded media block (class 'bfq_img1220') if present
    player = contentWithTag.find('div', class_='bfq_img1220')
    if player:
        player.decompose()
    # Remove the "延伸阅读" (further reading) paragraph and any paragraph that only carries a link
    pList = contentWithTag.find_all('p')
    for p in pList:
        if p.text.strip() == '延伸阅读':
            p.decompose()
            continue
        a = p.find('a')
        if a:
            p.decompose()
    # Strip inline scripts from the article body
    scripts = contentWithTag.find_all('script')
    for script in scripts:
        script.decompose()
    content = contentWithTag.text
    time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    dic_info = {
        # subjectId concatenated with a millisecond timestamp, used as the record id
        'id': '1681549573150879745' + str(int(time.time() * 1000)),
        'title': title,
        'origin': '共产党员网',
        'contentWithTag': str(contentWithTag),
        'content': content,
        'summary': summary,
        'publishDate': publishDate,
        'sid': '1691634024094507010',
        'subjectId': '1681549573150879745',
        'sourceAddress': url,
        'checkStatus': 1,
        'deleteFlag': 0,
        'createDate': time_now,
    }
    return dic_info


def doJob():
    info_code = 'IN-20230816-0006'
    url = 'https://www.12371.cn/special/lczyjjgzhy/'
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    # The special-topic header block carries a link, a year in its title, and a summary paragraph
    hTag = soup.find('div', attrs={'id': 'page_body'}).find('div', class_='dyw638_title_jj').find('h2')
    href = hTag.find('a').get('href')
    summary = soup.find('div', attrs={'id': 'page_body'}).find('div', class_='dyw638_title_jj').find('p').text
    year = re.findall(r'\d+年', hTag.text.strip())[0]
    req_ = requests.get(href, headers=headers)
    req_.encoding = req_.apparent_encoding
    soup_ = BeautifulSoup(req_.text, 'html.parser')
    aList = soup_.find('div', class_='word').find_all('a')
    # Follow every '【详细】' (details) link on the page
    for a in aList:
        if a.text.strip() == '【详细】':
            href_ = a.get('href')
            # Stop once a URL that has already been collected is encountered
            if is_member_containing_string(info_code, href_):
                return
            dic = getData(year, summary, href_)
            if sendKafka(dic):
                r.sadd(info_code, href_)


if __name__ == '__main__':
    doJob()
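
# A minimal sketch (assuming the same Redis connection and info_code as above) for
# inspecting which article URLs have already been recorded in the dedup Set:
#
#   for member in r.smembers('IN-20230816-0006'):
#       print(member.decode('utf-8'))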

@@ -52,7 +52,7 @@ headers = {
 if __name__ == "__main__":
     # Central Comprehensively Deepening Reforms Commission meetings (中央全面深化改革委员会会议)
-    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
+    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
     # Central Leading Group for Comprehensively Deepening Reforms meetings (中央全面深化改革领导小组会议)
     # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
     url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'