Commit 80c02904 Author: 薛凌堃

11.29

Parent 783b7172
 import os
 import os
 import time
-import numpy as np
-import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '中华人民共和国中央人民政府'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate, br',
@@ -84,9 +89,8 @@ def getDataJson(page, types):
     return data_list


-def getContent(url, publishDate, num, organ):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContent(url, publishDate, num, organ, id_list):
     soup = getSoup(url)
     if organ == '':
         try:
@@ -100,18 +104,18 @@ def getContent(url, publishDate, num, organ):
     for a in a_list:
         fj_href = a.get('href')
         if '.htm' not in fj_href and '.html' not in fj_href and '.shtml' not in fj_href and '.shtm' not in fj_href:
-            fjhref_list += fj_href + '\n'
             fj_title = a.text.lstrip().strip()
             category = os.path.splitext(fj_href)[1]
             if category not in fj_title:
                 fj_title = fj_title + category
-            fj_title = f'{num}-{publishDate}-{fj_title}'
-            fjtitle_list += fj_title + '\n'
-            fjcontent = getFjContent(fj_href)
-            file = f'./相关政策/国务院/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            # upload the attachment to OBS
+            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                a['href'] = full_path
     try:
         scripts = contentWithTag.find_all('script')
         for script in scripts:
@@ -125,15 +129,17 @@ def getContent(url, publishDate, num, organ):
     except:
         pass
     content = contentWithTag.text.lstrip().strip()
-    return content, fjtitle_list, fjhref_list, organ
+    return content, id_list, contentWithTag, organ


 def getData(data_, num):
+    id_list = []
     title = data_['title'].replace('\n', '').replace('\r', '')
     title = BeautifulSoup(title, 'lxml').text
     publishDate = data_['pubtimeStr'].replace('.', '-')
     origin = '国务院'
     href = data_['url']
     writtenDate = data_['ptime']
     try:
         organ = data_['puborg']
@@ -142,16 +148,43 @@ def getData(data_, num):
     pub_hao = data_['pcode']
     summary = data_['summary']
     summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
-    content, fjtitle_list, fjhref_list, organ = getContent(href, publishDate, num, organ)
-    data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-            fjhref_list]
-    return data
+    content, id_list, contentWithTag, organ = getContent(href, publishDate, num, organ, id_list)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': str(contentWithTag),
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': href,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729028548502597633',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, href)
+        log.info(f'采集成功--{title}--{href}')
+    except:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    # data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
+    #         fjhref_list]

 def doJob():
-    if not os.path.exists('./相关政策/国务院/政策文件'):
-        os.makedirs('./相关政策/国务院/政策文件')
-    data_list = []
+    # if not os.path.exists('./相关政策/国务院/政策文件'):
+    #     os.makedirs('./相关政策/国务院/政策文件')
+    # data_list = []
     href_list = []
     num = 1
     types = ['bumenfile', 'gongwen', 'otherfile', 'gongbao']
@@ -160,16 +193,19 @@ def doJob():
         data_json = getDataJson(page, types)
         for data_ in data_json:
             href = data_['url']
+            # deduplicate by URL
+            is_member = baseCore.r.sismember('REITs::' + webname, href)
+            if is_member:
+                continue
             if href not in href_list:
-                data = getData(data_, num)
+                getData(data_, num)
                 num += 1
-                data_list.append(data)
+                # data_list.append(data)
                 href_list.append(href)
-                log.info(f'{data[1]}===采集成功')
                 time.sleep(3)
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/国务院/国务院政策文件.xlsx', index=False)
+    # df = pd.DataFrame(np.array(data_list))
+    # df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
+    # df.to_excel('./相关政策/国务院/国务院政策文件.xlsx', index=False)


 if __name__ == '__main__':
     ...