Commit bb122afc by XveLingKun

Merge remote-tracking branch 'origin/master'

import datetime
import re
import time

import requests
from bs4 import BeautifulSoup
from retry import retry

from ClassTool import ClassTool
from BaseCore import BaseCore

baseTool = ClassTool()
baseCore = BaseCore()
log = baseCore.getLogger()
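

# getSoup: download a page through the shared proxy pool and return its parsed
# HTML; the retry decorator below re-attempts transient failures up to five times.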
@retry(tries=5, delay=3)
def getSoup(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'Connection': 'keep-alive',
        # 'Cookie': 'wdcid=043e7d9c8a90fbd9; wdcid=45e06544d2ddbc06; __auc=a67cddac18991603628dac4003a; wdses=14387868be342942; SERVERID=adb2d3a906b8c5e3f02ddd9c20949df0|1714008937|1714008813; wdlast=1714008941',
        'Host': 'www.gov.cn',
        # 'Referer': 'https://www.gov.cn/zhengce/wenjian/zhongyang/home_1.htm',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # A timeout keeps a stalled connection from hanging forever; without one the
    # retry decorator never fires on a hung request (30s is an assumed value).
    req = requests.get(url, headers=headers, proxies=baseCore.get_proxy(), timeout=30)
    # Let requests sniff the real encoding before decoding the body
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    req.close()
    return soup
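

# getTotal: the list page embeds its page count in an inline script as
# "nPageCount = N;"; pull that number out with a regex.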
def getTotal(soup):
    total = soup.find('div', class_='news_box').text
    # Raw string so \d is a regex class, not an invalid string escape
    totalPage = re.findall(r'nPageCount = (\d+);', total)[0]
    return int(totalPage)
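

# getDic: fetch one policy page, split its date/source header into publish time
# and issuing organ, assemble the record, send it to Kafka, and persist it on success.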
def getDic(title, url):
    soup = getSoup(url)
    # Rewrite the page's relative links to absolute URLs
    soup = baseTool.paserUrl(str(soup), url)
    # Header line looks like "YYYY-MM-DD HH:MM 来源:<organ> 字号..."
    dateAndOrigin = soup.find('div', class_='pages-date').text.strip()
    publishDate = dateAndOrigin.split('来源:')[0].strip()
    publishDate = datetime.datetime.strptime(publishDate, '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S')
    pub_org = dateAndOrigin.split('来源:')[1].split('字号')[0].strip()
    contentWithTag = soup.find('div', class_='pages_content')
    content = contentWithTag.text
    contentWithTag = str(contentWithTag)
    id_list = []
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic = {
        'attachmentIds': id_list,  # attachment ids
        'author': '',  # author
        'content': content,  # body text without tags
        'contentWithTag': contentWithTag,  # body text with HTML tags
        'createDate': time_now,  # record creation time
        'deleteFlag': 0,  # delete flag (0 = default/kept, 1 = deleted)
        'id': '',
        'labels': [{'relationId': "1783325599872438274", 'relationName': "中央有关文件", 'labelMark': "policy"}],
        # related label id / related label name / related label mark
        'origin': '中华人民共和国中央人民政府',  # policy publishing body
        'organ': pub_org,  # policy issuing organ
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document reference number
        'publishDate': publishDate,  # publish time
        'writtenDate': None,  # date the document was written
        'sid': '1697458829758697473',  # information source id
        'sourceAddress': url,  # original URL
        'summary': '',  # abstract
        'title': title  # title
    }
    flag = baseTool.sendKafka(dic)
    if flag:
        baseTool.save_data(dic)
    return flag
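

# doJob: walk the paginated list (home.htm, home_1.htm, home_2.htm, ...),
# collecting each new document until it hits an already-stored URL or an
# item published before 2020-01-01.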
def doJob():
    start_time = time.time()
    num = 0
    flg = False
    url = 'https://www.gov.cn/zhengce/wenjian/zhongyang/home.htm'
    soup = getSoup(url)
    totalPage = getTotal(soup)
    for i in range(totalPage):
        if flg:
            break
        # Page 0 is home.htm; later pages follow the home_<n>.htm pattern
        if i == 1:
            url = url.replace('home', 'home_1')
            soup = getSoup(url)
        elif i > 1:
            url = url.replace(f'home_{i - 1}', f'home_{i}')
            soup = getSoup(url)
        liList = soup.find('div', class_='news_box').find_all('li')
        for li in liList:
            title = li.find('a').text.strip()
            href = li.find('a').get('href')
            if 'https://www.gov.cn/' not in href:
                href = href.replace('../../', 'https://www.gov.cn/zhengce/')
            # Stop once we reach a URL that is already in storage
            is_href = baseTool.db_storage.find_one({'网址': href})
            if is_href:
                flg = True
                break
            # Zero-padded ISO dates compare correctly as plain strings
            dateFlg = li.find('span').text.strip()
            if dateFlg < '2020-01-01':
                flg = True
                break
            if getDic(title, href):
                num += 1
            time.sleep(2)
    end_time = time.time()
    log.info(f'Collected {num} central policy documents in {end_time - start_time:.1f}s')
if __name__ == '__main__':
    doJob()
    baseCore.close()