提交 e0e0f337 作者: 薛凌堃

重庆市人民政府

上级 175a599c
import json import json
...@@ -3,13 +3,18 @@ import time ...@@ -3,13 +3,18 @@ import time
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from base import BaseCore
import os import BaseCore
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '重庆市人民政府'
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
...@@ -112,8 +117,10 @@ def getContent(url): ...@@ -112,8 +117,10 @@ def getContent(url):
id = url.split('policyId=')[1] id = url.split('policyId=')[1]
contentWithTag = getContent_(id) contentWithTag = getContent_(id)
contentWithTag = BeautifulSoup(contentWithTag,'lxml') contentWithTag = BeautifulSoup(contentWithTag,'lxml')
policy.paserUrl(contentWithTag, url)
else: else:
soup = getSoup(url) soup = getSoup(url)
policy.paserUrl(soup, url)
contentWithTag = soup.find('div', class_='view') contentWithTag = soup.find('div', class_='view')
if not contentWithTag: if not contentWithTag:
contentWithTag = soup.find('div',class_='document') contentWithTag = soup.find('div',class_='document')
...@@ -131,7 +138,7 @@ def getContent(url): ...@@ -131,7 +138,7 @@ def getContent(url):
except: except:
pass pass
content = contentWithTag.text.lstrip().strip() content = contentWithTag.text.lstrip().strip()
return content return content, contentWithTag
def getData(data_, num): def getData(data_, num):
...@@ -139,6 +146,11 @@ def getData(data_, num): ...@@ -139,6 +146,11 @@ def getData(data_, num):
publishDate = data_['save_time'] publishDate = data_['save_time']
origin = data_['f_2021325755960'] origin = data_['f_2021325755960']
href = data_['doc_pub_url'] href = data_['doc_pub_url']
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
return
try: try:
writtenDate = data_['f_202121607647'] writtenDate = data_['f_202121607647']
except: except:
...@@ -153,31 +165,47 @@ def getData(data_, num): ...@@ -153,31 +165,47 @@ def getData(data_, num):
pub_hao = '' pub_hao = ''
summary = data_['f_202142777829'] summary = data_['f_202142777829']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip() summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content = getContent(href) content, contentWithTag = getContent(href)
fjtitle_list = '' contentWithTag_str = str(contentWithTag)
fjhref_list = '' time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list, dic_info = {
fjhref_list] 'attachmentIds': [],
return data 'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1729045755020103681',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'采集成功--{title}--{href}')
except Exception as e:
return
return
def doJob(): def doJob():
if not os.path.exists('./相关政策/重庆市人民政府/政策文件'):
os.makedirs('./相关政策/重庆市人民政府/政策文件')
total = getTotal() total = getTotal()
num = 1 num = 1
data_list = []
for page in range(1, total + 1): for page in range(1, total + 1):
data_json = getDataJson(page) data_json = getDataJson(page)
for data_ in data_json: for data_ in data_json:
data = getData(data_, num) getData(data_, num)
num += 1 num += 1
time.sleep(3) time.sleep(3)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/重庆市人民政府/重庆市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论