Commit 9443f3af  Author: 薛凌堃

黑龙江省人民政府 (Heilongjiang Provincial People's Government)

Parent 95cfdf3b
-import os
 #coding=utf-8
 import os
 import time
-import numpy as np
-import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-import BaseCore
+from base import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '黑龙江省人民政府'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded',
-    'Token': 'db345f2c-20fd-4cc8-9799-b9cd08b96392',
+    'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }
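Note: the script depends on two project-internal helpers, BaseCore (from base) and Policy (from reits), whose definitions are not part of this commit. The stub below is only a sketch of the interface the call sites in this file appear to assume (a Redis client at baseCore.r, a logger, sendkafka, deliteATT, attuributefile); the bodies and connection details are illustrative, not the real modules.

```python
# Inferred interface sketch only; not the actual base.BaseCore / reits.Policy code.
import logging
import redis

class BaseCore:
    def __init__(self):
        # Assumption: `r` is a redis-py client backing the 'REITs::<site>' dedup sets.
        self.r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    def getLogger(self):
        return logging.getLogger('reits-policy')

    def sendkafka(self, dic_info, topic):
        """Assumed to serialize dic_info and publish it to the given Kafka topic."""
        raise NotImplementedError

    def deliteATT(self, att_id):
        """Assumed to delete a stored attachment record by id (rollback path)."""
        raise NotImplementedError


class Policy:
    def attuributefile(self, fj_title, href, num, publishDate):
        """Assumed to fetch and store the attachment at `href`, returning
        (attachment_id, stored_full_path), or a falsy id on failure."""
        raise NotImplementedError
```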
@@ -34,10 +39,11 @@ def getDataJson():
 def getSoup(url):
-    ip = baseCore.get_proxy()
-    req = requests.get(url,headers=headers,proxies=ip)
+    # ip = baseCore.get_proxy()
+    req = requests.get(url, headers=headers)
     req.encoding = req.apparent_encoding
-    soup = BeautifulSoup(req.json()['content']['html'],'lxml')
+    print(req.json())
+    soup = BeautifulSoup(req.json()['content']['html'], 'lxml')
     return soup
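The findById endpoint wraps the article HTML in a JSON envelope (content.html), which is why the response goes through req.json() before BeautifulSoup. A slightly more defensive variant of this fetch, under the same response-shape assumption and reusing the module-level headers and log, might look like:

```python
def getSoup_safe(url):
    # Hypothetical variant of getSoup; assumes the same JSON envelope with content.html.
    req = requests.get(url, headers=headers, timeout=30)
    req.raise_for_status()
    req.encoding = req.apparent_encoding
    try:
        html = req.json()['content']['html']
    except (ValueError, KeyError) as e:
        log.error(f'unexpected response from {url}: {e}')
        return None
    return BeautifulSoup(html, 'lxml')
```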
@@ -49,10 +55,9 @@ def getFjContent(url):
 def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
-    fjhref_list = ''
-    fjtitle_list = ''
+    id_list = []
     url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
-    writtenDate = ''
+    writtenDate = None
     if type == '政策解读':
         origin = organ
         organ = ''
@@ -60,7 +65,10 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
     else:
         origin = '黑龙江省人民政府'
     href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyFile/findById/{id}'
+    # 根据链接判重
+    is_member = baseCore.r.sismember('REITs::' + webname, url)
+    if is_member:
+        return
     soup = getSoup(href_)
     try:
         a_list = soup.find_all('a')
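The added block skips any article whose URL is already in a Redis set keyed per site, so reruns only collect new items. A minimal standalone illustration of the same pattern (the Redis connection details and sample URL are placeholders):

```python
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # assumed connection details
key = 'REITs::黑龙江省人民政府'
url = 'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id=<example-id>'  # placeholder

if r.sismember(key, url):
    print('already collected, skipping')
else:
    # ... fetch, parse and push the record, then mark the URL as seen
    r.sadd(key, url)
```

In the commit itself, sadd only happens after sendkafka succeeds (see the last hunk of getContent), so a failed push is retried on the next run.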
@@ -68,19 +76,17 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
             href = a.get('href')
             if '.html' in href or '.shtml' in href or '.htm' in href:
                 continue
-            fjhref_list += href + '\n'
             category = os.path.splitext(href)[1]
-            fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
+            fj_title = a.text.lstrip().strip()
             if '<' in fj_title or '>' in fj_title:
                 fj_title = fj_title.replace('<', '').replace('>', '')
             if category not in fj_title:
                 fj_title = fj_title + category
-            fjtitle_list += fj_title + '\n'
-            fjcontent = getFjContent(href)
-            file = f'./相关政策/黑龙江省人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title,href,num,publishDate)
+            if att_id:
+                id_list.append(att_id)
+                a['href'] = full_path
     except Exception as e:
         log.error(title, '=====', e)
     try:
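The old branch downloaded each attachment to a local folder via getFjContent; the new branch delegates to policy.attuributefile and rewrites the link in the page to the stored path. That helper's implementation is not in this diff; judging only from the call site, its contract is roughly the following (everything in this sketch is hypothetical):

```python
def attuributefile_sketch(fj_title, href, num, publishDate):
    # Hypothetical stand-in for Policy.attuributefile, inferred from the call site.
    resp = requests.get(href, headers=headers, timeout=60)
    if resp.status_code != 200:
        return None, None
    # Assumption: the real helper uploads resp.content to shared storage and
    # records an attachment row; here the two return values are only faked.
    att_id = f'{num}-{publishDate}-{fj_title}'   # placeholder attachment id
    full_path = f'/attachments/{att_id}'         # placeholder stored path
    return att_id, full_path
```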
@@ -96,15 +102,39 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
     except:
         pass
     content = soup.text.lstrip().strip()
-    data_ = [num, title, writtenDate, origin, url, publishDate, organ, pub_hao, summary, content, fjtitle_list,
-             fjhref_list]
-    return data_
+    contentWithTag_str = str(soup)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042585839841281',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return

 def doJob():
-    if not os.path.exists('./相关政策/黑龙江省人民政府/政策文件'):
-        os.makedirs('./相关政策/黑龙江省人民政府/政策文件')
-    data_list = []
     num = 1
     data_json = getDataJson()
     for data_ in data_json:
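baseCore.sendkafka(dic_info, topic) is the project wrapper around the producer; its internals are not shown in this commit. If it is a thin wrapper over kafka-python, the send would look roughly like the sketch below (broker address and serialization are assumptions). When the send raises, the except branch above deletes the attachments that were just registered, and the URL is never added to the Redis set, so the article is picked up again on the next run.

```python
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=['127.0.0.1:9092'],   # assumed broker address
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)

def sendkafka_sketch(dic_info, topic):
    # Block on the result so delivery errors propagate to the caller,
    # letting it roll back attachments and skip the dedup sadd.
    future = producer.send(topic, dic_info)
    future.get(timeout=10)
```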
@@ -122,12 +152,10 @@ def doJob():
         except:
             organ = ''
         data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
-        data_list.append(data)
+        # data_list.append(data)
         num += 1
         time.sleep(3)
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/黑龙江省人民政府/黑龙江省人民政府政策文件.xlsx', index=False)

 if __name__ == "__main__":
     doJob()