Commit 362b085c    Author: 薛凌堃

内蒙古人民政府 (Inner Mongolia People's Government)

Parent ca21124d
 import os
@@ -6,10 +6,17 @@ import numpy as np
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '内蒙古自治区人民政府'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate, br',
@@ -64,9 +71,9 @@ def getSoup(url):
 def getPageSize():
-    ip = baseCore.get_proxy()
     url = 'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum=1&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
-    req = requests.get(url, headers=headers, proxies=ip)
+    req = requests.get(url, headers=headers)
     req.encoding = req.apparent_encoding
     total = int(req.json()['data']['total'])
     if total % 10 == 0:
@@ -77,23 +84,22 @@ def getPageSize():
 def getJson(page):
-    ip = baseCore.get_proxy()
+    # ip = baseCore.get_proxy()
     url = f'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum={page}&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
-    req = requests.get(url, headers=headers, proxies=ip)
+    req = requests.get(url, headers=headers)
     req.encoding = req.apparent_encoding
     return req.json()['data']['data']
 def getFjContent(url):
-    ip = baseCore.get_proxy()
-    req = requests.get(url, headers=headers_, proxies=ip)
+    # ip = baseCore.get_proxy()
+    req = requests.get(url, headers=headers_)
     req.encoding = req.apparent_encoding
     return req.content
 def getContent(num, data):
-    fjhref_list = ''
-    fjtitle_list = ''
+    id_list = []
     title = data['title']
     pub_hao = data['docno']
     origin = data['sitedesc']
@@ -102,12 +108,16 @@ def getContent(num, data):
     try:
         writtenDate = data['scrq']
     except:
-        writtenDate = ''
+        writtenDate = None
     summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
     url = data['docpuburl']
+    # Deduplicate by URL against previously collected records
+    is_member = baseCore.r.sismember('REITs::' + webname, url)
+    if is_member:
+        return
     soup = getSoup(url)
     if soup == '':
-        return ''
+        return
     url_ = url.split('/')[-1]
     soup = paserUrl(soup, url.replace(url_, ''))
     contentWithTag = soup.find('div', attrs={'id': 'pare'})
@@ -119,7 +129,9 @@ def getContent(num, data):
         contentWithTag = soup.find('div', attrs={'class': 'zoomCon'})
     if not contentWithTag:
         contentWithTag = soup.find('div', attrs={'id': 'pagecontent'})
-    if writtenDate == '':
+    if not contentWithTag:
+        contentWithTag = soup.find('div', id="docContent")
+    if not writtenDate:
         try:
             tr_list = soup.find('table', class_='m-detailtb').find_all('tr')
             for tr in tr_list:
@@ -154,46 +166,69 @@ def getContent(num, data):
         a_list = contentWithTag.find_all('a')
         for a in a_list:
             href = a.get('href')
-            fjhref_list += href + '\n'
             category = os.path.splitext(href)[1]
             fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
             if '<' in fj_title or '>' in fj_title:
                 fj_title = fj_title.replace('<', '').replace('>', '')
             if category not in fj_title:
                 fj_title = fj_title + category
-            fjtitle_list += fj_title + '\n'
-            fjcontent = getFjContent(href)
-            file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            # Attachment
+            att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                a['href'] = full_path
     except Exception as e:
         log.error(title, '=====', e)
-    content = contentWithTag.text.lstrip().strip()
-    data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-             fjhref_list]
-    return data_
+    try:
+        content = contentWithTag.text.lstrip().strip()
+    except:
+        log.info(url)
+        return
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729041959772860417',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+        return
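Note on the new getContent tail: the function now skips URLs already recorded in a Redis set, assembles a dic_info record, sends it to Kafka via baseCore.sendkafka, marks the URL as collected only after a successful send, and deletes any uploaded attachments if the send fails. Below is a minimal standalone sketch of that dedupe-then-publish-or-rollback flow. It assumes BaseCore wraps a redis-py client and a kafka-python producer (the connection details shown are placeholders), and uses a hypothetical delete_attachment callback in place of baseCore.deliteATT.

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # assumed connection details
producer = KafkaProducer(
    bootstrap_servers='127.0.0.1:9092',  # assumed broker address
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)

def publish_policy(dic_info, url, webname, topic, id_list, delete_attachment):
    """Skip already-seen URLs, send the record to Kafka, roll back attachments on failure."""
    key = 'REITs::' + webname
    if r.sismember(key, url):  # URL already collected, nothing to do
        return False
    try:
        producer.send(topic, dic_info).get(timeout=30)  # block until the broker acks
        r.sadd(key, url)  # mark the URL as collected only after a successful send
        return True
    except Exception:
        for att_id in id_list:  # undo attachment uploads so a retry starts clean
            delete_attachment(att_id)
        return False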
 def doJob():
-    if not os.path.exists('./相关政策/内蒙古自治区人民政府/政策文件'):
-        os.makedirs('./相关政策/内蒙古自治区人民政府/政策文件')
-    data_list = []
     pageSize = getPageSize()
     num = 1
     for page in range(1, pageSize + 1):
         data_json = getJson(page)
         for data_ in data_json:
             if data_['chnldesc'] == '政策文件':
-                data = getContent(num, data_)
-                if data:
-                    data_list.append(data)
-                    num += 1
-                    log.info(f'{data[1]}===采集成功')
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./内蒙古自治区人民政府政策文件.xlsx', index=False)
+                getContent(num, data_)
+                num += 1
 if __name__ == '__main__':
     doJob()
......
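The other substantive change in getContent is attachment handling: instead of downloading each linked file into the local ./相关政策/... folder, every anchor's href is passed to policy.attuributefile, and the tag's href is rewritten to the returned full_path so the stored contentWithTag points at the saved copy. A short sketch of that link-rewriting idea with BeautifulSoup follows; upload_attachment(title, href) is a hypothetical callback standing in for policy.attuributefile and is assumed to return (att_id, full_path) or (None, None).

import os
from bs4 import BeautifulSoup

def rewrite_attachment_links(content_with_tag, num, publish_date, upload_attachment):
    """Upload each linked attachment and point its <a> tag at the stored copy."""
    id_list = []
    for a in content_with_tag.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        ext = os.path.splitext(href)[1]  # keep the original file extension
        title = f'{num}-{publish_date}-{a.text.strip()}'.replace('<', '').replace('>', '')
        if ext and ext not in title:
            title += ext
        att_id, full_path = upload_attachment(title, href)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path  # the saved HTML now references the stored file
    return id_list

# Example usage with a toy uploader that echoes a fake id and path
html = '<div><a href="plan.pdf">implementation plan</a></div>'
tag = BeautifulSoup(html, 'html.parser').div
ids = rewrite_attachment_links(tag, 1, '2023-11-27', lambda t, h: ('att-1', '/files/' + t))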