提交 f7c275d7 作者: 薛凌堃

四川省人民政府

上级 fdbdc522
import requests
import time
import time
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
import BaseCore
# Shared project helper; further down it provides the logger, the `r` client
# used for de-duplication (presumably Redis, given sismember/sadd), and the
# Kafka sender.
baseCore = BaseCore.BaseCore()
# Module-wide logger obtained from the shared helper.
log = baseCore.getLogger()
from reits import Policy
# Policy helper; used below via paserUrl() and attuributefile().
policy = Policy()
# Topic name passed to baseCore.sendkafka().
topic = 'policy'
# Site name; also appended to 'REITs::' to form the de-duplication set key.
webname = '四川省人民政府'
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -51,84 +60,87 @@ def getDataJson():
def getContent(url, publishDate, num):
    """Fetch one policy detail page and extract its written date and body.

    Args:
        url: Address of the policy detail page.
        publishDate: Publish date of the policy. Not used by this function;
            kept so existing callers keep working.
        num: Sequence number of the policy. Not used by this function; kept
            for the same reason.

    Returns:
        A ``(writtenDate, content, contentWithTag)`` tuple where
        ``writtenDate`` is the text following '成文日期:' in the fourth header
        item (``None`` when it cannot be found), ``content`` is the plain
        text of the article body and ``contentWithTag`` is the body element
        itself.

    Raises:
        IndexError: If the page contains neither a ``.contText`` nor a
            ``#cmsArticleContent`` element.
    """
    soup = getSoup(url)
    # NOTE(review): presumably rewrites relative links inside `soup` in place
    # so the stored HTML stays usable off-site -- confirm against Policy.
    policy.paserUrl(soup, url)

    # The written date is optional: a missing header item or a missing
    # '成文日期:' label both surface as IndexError and simply mean "no date".
    try:
        header_items = soup.select(
            '#szfcontentwrap2022 > div.zfwjwzcontent > div.topbox > ul > li')
        writtenDate = header_items[3].text.split('成文日期:')[1].strip()
    except IndexError:
        writtenDate = None

    # Two page layouts exist; fall back to the second container when the
    # first selector matches nothing.
    body_nodes = soup.select('.contText') or soup.select('#cmsArticleContent')
    contentWithTag = body_nodes[0]
    content = contentWithTag.text
    return writtenDate, content, contentWithTag
def getData(data_, num):
    """Collect a single policy record and publish it to Kafka.

    The record is skipped when its URL is already a member of the
    'REITs::<webname>' set. A link containing '.pdf'/'.PDF' is handed to
    ``policy.attuributefile`` as an attachment; any other link is fetched and
    parsed with ``getContent``.

    Args:
        data_: One search-result item. Read keys: ``data_['data']`` with
            ``title``, ``docDate``, ``url``, ``siteLabel['value']`` and
            ``myValues['DOCPUBNAME']`` / ``myValues['DOCNOVAL']``.
        num: Sequence number of the item, forwarded to the attachment and
            content helpers.

    Returns:
        None. The result is the Kafka message plus the updated
        de-duplication set.
    """
    id_list = []
    record = data_['data']
    title = record['title']
    publishDate = record['docDate']
    origin = record['siteLabel']['value']
    href = record['url']

    # De-duplicate by link: skip anything that was already collected.
    dedup_key = 'REITs::' + webname
    if baseCore.r.sismember(dedup_key, href):
        return

    organ = record['myValues']['DOCPUBNAME']
    pub_hao = record['myValues']['DOCNOVAL']
    summary = ''

    if '.pdf' in href or '.PDF' in href:
        # The policy itself is a PDF: there is no HTML body, only an attachment.
        content = ''
        contentWithTag_str = ''
        writtenDate = None
        fj_title = title + '.pdf'
        att_id, _ = policy.attuributefile(fj_title, href, num, publishDate)
        if att_id:
            id_list.append(att_id)
    else:
        writtenDate, content, contentWithTag = getContent(href, publishDate, num)
        contentWithTag_str = str(contentWithTag)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729046053927178241',
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        # Mark the link as collected only after the message went out.
        baseCore.r.sadd(dedup_key, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        # Record why the item failed instead of dropping the error silently.
        log.error(f'采集失败--{title}--{href}--{e}')
        # NOTE(review): presumably removes the attachments stored above so a
        # failed record leaves nothing orphaned -- confirm against BaseCore.
        for att_id in id_list:
            baseCore.deliteATT(att_id)
def doJob():
    """Fetch the policy listing and process every entry once.

    Each item returned by ``getDataJson`` is passed to ``getData`` together
    with its 1-based sequence number. ``getData`` publishes the record
    itself and returns nothing, so no per-item results are gathered here.
    """
    for num, data_ in enumerate(getDataJson(), start=1):
        getData(data_, num)


if __name__ == '__main__':
    doJob()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论