提交 3d6d75b6 作者: 薛凌堃

福建省人民政府

上级 db64a87a
import time import time
...@@ -5,10 +5,15 @@ import numpy as np ...@@ -5,10 +5,15 @@ import numpy as np
import pandas as pd import pandas as pd
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from base import BaseCore import BaseCore
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '福建省人民政府'
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
...@@ -42,10 +47,10 @@ def getDataJson(data_post): ...@@ -42,10 +47,10 @@ def getDataJson(data_post):
def getContent(num, url, publishDate): def getContent(num, url, publishDate):
id_list = []
url_ = url.split('/')[-1] url_ = url.split('/')[-1]
url_ = url.replace(url_, '') url_ = url.replace(url_, '')
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url) soup = getSoup(url)
contentWithTag = soup.find('div', class_='TRS_Editor') contentWithTag = soup.find('div', class_='TRS_Editor')
try: try:
...@@ -63,27 +68,24 @@ def getContent(num, url, publishDate): ...@@ -63,27 +68,24 @@ def getContent(num, url, publishDate):
a_list = contentWithTag.find_all('a') a_list = contentWithTag.find_all('a')
for a in a_list: for a in a_list:
fj_href = a.get('href').replace('./', url_) fj_href = a.get('href').replace('./', url_)
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip() fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1] category = os.path.splitext(fj_href)[1]
if category not in fj_title: if category not in fj_title:
fj_title = fj_title + category fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n' att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
fjcontent = getFjContent(fj_href) if att_id:
file = f'./相关政策/福建省人民政府/政策文件/{fj_title}' id_list.append(att_id)
with open(file, 'wb') as f: a['href'] = full_path
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
content = contentWithTag.text.lstrip().strip() content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
return content, contentWithTag, id_list
def doJob(): def doJob():
if not os.path.exists('./相关政策/福建省人民政府/政策文件'):
os.makedirs('./相关政策/福建省人民政府/政策文件')
data_posts = [{ data_posts = [{
'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '1', 'jiGuanList': '', 'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '1', 'jiGuanList': '',
'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055', 'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
...@@ -114,10 +116,15 @@ def doJob(): ...@@ -114,10 +116,15 @@ def doJob():
publishDate = data_['crtime'].replace('.','-') publishDate = data_['crtime'].replace('.','-')
origin = data_['docsourcename'] origin = data_['docsourcename']
href = data_['docpuburl'] href = data_['docpuburl']
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
continue
try: try:
writtenDate = data_['pubdate'].replace('.','-') writtenDate = data_['pubdate'].replace('.','-')
except: except:
writtenDate = '' writtenDate = None
try: try:
organ = data_['puborg'] organ = data_['puborg']
except: except:
...@@ -128,16 +135,37 @@ def doJob(): ...@@ -128,16 +135,37 @@ def doJob():
pub_hao = '' pub_hao = ''
summary = data_['doccontent'] summary = data_['doccontent']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip() summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content, fjtitle_list, fjhref_list = getContent(num, href, publishDate[:10]) content, contentWithTag, id_list = getContent(num, href, publishDate[:10])
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list, time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
fjhref_list] contentWithTag_str = str(contentWithTag)
data_list.append(data) dic_info = {
log.info(f'{title}===采集成功') 'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1729043067106865154',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'采集成功--{title}--{href}')
except Exception as e:
for att_id in id_list:
baseCore.deliteATT(att_id)
num += 1 num += 1
time.sleep(1) time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/福建省人民政府/福建省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论