Commit 06d190ff by 薛凌堃

山西省人民政府

Parent 639117a7
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from base import BaseCore
from reits import Policy

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
policy = Policy()

topic = 'policy'
webname = '山西省人民政府'
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate',
@@ -74,8 +81,7 @@ def getFjContent(url):
def getContent(num, data):
    fjhref_list = ''
    fjtitle_list = ''
    id_list = []
    title = data['title']
    pub_hao = data['docno']
    origin = data['sitedesc']
@@ -84,6 +90,10 @@ def getContent(num, data):
    writtenDate = data['scrq']
    summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
    url = data['docpuburl']
    # Deduplicate by URL: skip pages already recorded in Redis
    is_member = baseCore.r.sismember('REITs::' + webname, url)
    if is_member:
        return
    url_ = url.split('/')[-1]
    soup = getSoup(url)
    soup = paserUrl(soup, url.replace(url_, ''))
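    # NOTE: getSoup and paserUrl are defined earlier in this file, outside the
    # visible hunks. A minimal sketch of what they plausibly do, inferred from
    # the requests/BeautifulSoup/urljoin imports and the call sites here; this
    # is an assumption, not this repo's actual code:
    #
    #   def getSoup(url):
    #       # Fetch the page and parse it; the headers/timeout are assumed.
    #       req = requests.get(url, headers=headers, timeout=30)
    #       req.encoding = req.apparent_encoding
    #       return BeautifulSoup(req.text, 'html.parser')
    #
    #   def paserUrl(soup, base_url):
    #       # Rewrite relative href/src attributes to absolute URLs.
    #       for tag in soup.find_all(['a', 'img']):
    #           for attr in ('href', 'src'):
    #               if tag.get(attr):
    #                   tag[attr] = urljoin(base_url, tag[attr])
    #       return soup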
@@ -102,44 +112,65 @@ def getContent(num, data):
        style.decompose()
    except:
        pass
    content = contentWithTag.text.lstrip().strip()
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        fjhref_list += href + '\n'
        category = os.path.splitext(href)[1]
        fj_title = a.text.lstrip()
        if '<' in fj_title or '>' in fj_title:
            fj_title = fj_title.replace('<', '').replace('>', '')
        if category not in fj_title:
            fj_title = fj_title + category
        fjtitle_list += fj_title + '\n'
        # Upload the attachment; on success record its id and point the link
        # at the stored copy so contentWithTag references the archived file
        att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': url,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729041791539326977',
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, url)
        log.info(f'Collected successfully--{title}--{url}')
    except:
        # Roll back: delete any attachments already uploaded for this article
        for att_id in id_list:
            baseCore.deliteATT(att_id)
    return
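
# NOTE: sendkafka and deliteATT are provided by the shared BaseCore module and
# are not shown in this diff. Purely as a hedged sketch, a sendkafka-style
# helper could be built on kafka-python roughly like this; the broker address
# and serialization are assumptions, not the project's real implementation.
def _sendkafka_sketch(dic_info, topic, bootstrap_servers='localhost:9092'):
    import json
    from kafka import KafkaProducer
    # Serialize the record as UTF-8 JSON and block until the broker acks it.
    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        value_serializer=lambda d: json.dumps(d, ensure_ascii=False).encode('utf-8'),
    )
    producer.send(topic, value=dic_info).get(timeout=10)
    producer.flush()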
def doJob():
    if not os.path.exists('./相关政策/山西省人民政府/政策文件'):
        os.makedirs('./相关政策/山西省人民政府/政策文件')
    num = 1
    pageSize = getPageSize()
    for page in range(1, pageSize + 1):
        data_json = getJson(page)
        for i in range(len(data_json)):
            if data_json[i]['chnldesc'] == '政策文件':
                getContent(num, data_json[i])
                num += 1
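
# NOTE: getPageSize and getJson are likewise defined outside the visible hunks.
# A hypothetical illustration of their apparent contract: one page of JSON
# records per call, each record carrying the fields read in getContent
# (title, docno, sitedesc, scrq, zc_doccontent, docpuburl, chnldesc). The
# endpoint URL and response keys below are placeholders, not the site's real API.
SEARCH_API = 'https://example.com/api/policy/search'  # placeholder endpoint

def _getJson_sketch(page):
    # One page of search results as a list of record dicts.
    req = requests.get(SEARCH_API, headers=headers, params={'page': page}, timeout=30)
    return req.json()['datas']  # 'datas' key is an assumption

def _getPageSize_sketch():
    # Total page count reported alongside the first results page.
    req = requests.get(SEARCH_API, headers=headers, params={'page': 1}, timeout=30)
    return req.json()['pageCount']  # 'pageCount' key is an assumption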
if __name__ == '__main__':
    doJob()