Commit c9546130  Author: 薛凌堃

Final version of the policies-and-regulations crawler

Parent eeb41ef7
import json
import random
import time
from urllib.parse import urljoin

import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
import pandas as pd
import pymysql
# requests, BeautifulSoup (bs4), urllib3 and the project's baseCore are used below;
# their imports sit in the elided part of this diff.
@@ -12,47 +17,80 @@ log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
def paserUrl(html, listurl):
    # soup = BeautifulSoup(html, 'html.parser')
    # Collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
    # Walk the tags and rewrite relative addresses to absolute ones
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
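# A minimal usage sketch of paserUrl (hypothetical snippet, not part of this commit):
#   page = BeautifulSoup('<a href="../n2588030/c123/content.html">政策</a>', 'html.parser')
#   page = paserUrl(page, 'http://www.sasac.gov.cn/n2588020/index.html')
#   # the link now reads http://www.sasac.gov.cn/n2588030/c123/content.html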
def save_data(dic_news):
    aaa_dic = {
        '附件id': dic_news['attachmentIds'],
        '网址': dic_news['sourceAddress'],
        'tid': dic_news['labels'][0]['relationId'],
        '来源': dic_news['labels'][0]['relationName'],
        '创建时间': dic_news['createDate']
    }
    db_storage.insert_one(aaa_dic)
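# Note: the '网址' (URL) field saved above is the key that work() later queries,
# via db_storage.find_one({'网址': news_url}), to skip articles that were already
# collected. A minimal sketch with a hypothetical URL:
#   if db_storage.find_one({'网址': 'http://www.sasac.gov.cn/n2588020/c123/content.html'}):
#       pass  # already crawled -> work() logs '已采集----------跳过' and continues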
def sendKafka(dic_news):
    start_time = time.time()
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("policy",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        # Sent successfully; record it in the log
        state = 1
        takeTime = baseCore.getTimeCost(start_time, time.time())
        # return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.error(dic_result)
        e = 'Kafka操作失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
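# For reference, a minimal consumer-side sketch for the "policy" topic
# (hypothetical; the broker address is taken from the producer above):
#   from kafka import KafkaConsumer
#   consumer = KafkaConsumer('policy',
#                            bootstrap_servers=['114.115.159.144:9092'],
#                            value_deserializer=lambda m: json.loads(m.decode('utf8')))
#   for message in consumer:
#       print(message.value['title'], message.value['sourceAddress'])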
def work(href_type, ting_type, relationId):
    ip = baseCore.get_proxy()
    log.info(f'\n================厅局类别==={ting_type}========================')
    if 'http' in href_type:
        url_type = href_type
    else:
        url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
    # print(url_type)
    i_res = requests.get(url=url_type, headers=headers, proxies=ip)
    i_soup = BeautifulSoup(i_res.content, 'html.parser')
    time.sleep(2)
    news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
    # Article list
    # print('================新闻列表==================')
    for news in tqdm(news_list):
        try:
            news_href = news.find('a')['href']
        except:
@@ -60,55 +98,185 @@ for type in tqdm(list_type[:2]):
        if 'http' in news_href:
            news_url = news_href
        else:
            news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
        # Skip anything that has already been crawled
        is_href = db_storage.find_one({'网址': news_url})
        if is_href:
            log.info('已采集----------跳过')
            continue
        news_title = news.find('a').text.split('[')[0]
        log.info(f'\n----正在采集: {news_title}-------')
        pub_time = news.find('span').text.replace('[', '').replace(']', '')
        # Article detail page
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
            'Host': 'www.sasac.gov.cn',
            'Pragma': 'no-cache',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
        }
        # news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
        ii_res = requests.get(url=news_url, headers=header, proxies=ip)
        ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
        # TODO: convert relative paths to absolute ones
        ii_soup = paserUrl(ii_soup, news_url)
        # Strip the "scan QR code" widget
        try:
            ii_soup.find('div', id='qr_container').decompose()
        except:
            pass
        # Strip <style> tags
        for styleTag in ii_soup.find_all('style'):
            styleTag.extract()
        time.sleep(2)
        try:
            news_info = ii_soup.find('div', class_='zsy_cotitle')
        except Exception as e:
            log.error(e)
            news_info = ''
        if news_info:
            try:
                # origin
                pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
            except:
                pub_source = ''
            try:
                contentWithTag = ii_soup.find('div', 'zsy_comain')
                content = contentWithTag.text.strip()
            except:
                content = ''
                contentWithTag = ''
            # Skip pages whose extracted text is too short to be a real article
            if len(content) <= 100:
                continue
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_news = {
                'attachmentIds': [],
                'author': '',
                # 'content': content,
                # 'contentWithTag': str(contentWithTag),
                'createDate': time_now,
                'deleteFlag': 0,
                'id': '',
                'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
                'origin': pub_source,
                'organ': '',
                'topicClassification': '',
                'issuedNumber': '',
                'publishDate': pub_time,
                'writtenDate': '',
                'sid': '1697458829758697473',
                'sourceAddress': news_url,
                'summary': '',
                'title': news_title
            }
            sendKafka(dic_news)
            save_data(dic_news)
            log.info(f'{ting_type}-----{news_title}----发送成功')
        else:
            dic_error = {
                '标题': news_title,
                '原文链接': news_url,
                '厅局类别': ting_type
            }
            log.error(dic_error)
# The CCDI/NSC discipline inspection and supervision group stationed at SASAC
def job1(a_type):
    href = a_type['href']
    ting_type = a_type.text
    return href, ting_type
def job():
    url = 'http://www.sasac.gov.cn/n2588020/index.html'
    ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers, proxies=ip)
    soup = BeautifulSoup(res.content, 'html.parser')
    time.sleep(2)
    # List of departments and bureaus
    list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
    a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
    a_type = a_soup.text.strip()
    a_href = a_soup.find('a')['href']
    a_id = '1874'
    list_error = []
    num = 0
    start_time = time.time()
    work(a_href, a_type, a_id)
    for type in tqdm(list_type):
        list_news = []
        href_type = type.find('a')['href']
        ting_type = type.find('a').text
        relationId = mapId_dic[ting_type]
        work(href_type, ting_type, relationId)
        num += 1
    end_time = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
    time.sleep(1)
    # writer.save()
    # df_error = pd.DataFrame(list_error)
    # df_error.to_excel('未采到文章.xlsx', index=False)
if __name__ == '__main__':
    mapId_dic = {
        '办公厅(党委办公厅)': '1643',
        '综合研究局': '1644',
        '政策法规局': '1645',
        '规划发展局': '1646',
        '财务监管与运行评价局': '1647',
        '产权管理局': '1648',
        '企业改革局': '1649',
        '考核分配局': '1650',
        '资本运营与收益管理局': '1651',
        '科技创新和社会责任局': '1652',
        '综合监督局': '1653',
        '监督追责局': '1654',
        '企业领导人员管理一局(董事会工作局)': '1655',
        '企业领导人员管理二局': '1656',
        '党建工作局(党委组织部、党委统战部)': '1657',
        '宣传工作局(党委宣传部)': '1658',
        '国际合作局': '1659',
        '人事局': '1660',
        '机关服务管理局(离退休干部管理局)': '1662',
        '机关党委': '1663',
        '党委巡视工作办公室、国资委巡视组': '1664',
    }
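    # Note: work() resolves relationId via mapId_dic[ting_type], so a bureau name
    # scraped from the page but missing from this map raises KeyError. A defensive
    # lookup (hypothetical alternative, not in this commit) would be:
    #   relationId = mapId_dic.get(ting_type)
    #   if relationId is None:
    #       log.error(f'未配置的厅局: {ting_type}')
    #       continue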
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
        'Host': 'www.sasac.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        job()
    except Exception as e:
        print(e)
    # Create an ExcelWriter object (legacy Excel export, now disabled)
    # writer = pd.ExcelWriter('国务院厅局.xlsx')
......