Commit 3fdea62f authored by 薛凌堃

北京市人民政府

Parent 401719e1
import os
import random
import re
import fitz
import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from openpyxl import load_workbook
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.parse import urljoin
import BaseCore
@@ -23,8 +11,14 @@ import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
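# Policy imported from the shared reits module appears to supply the HTTP/upload helpers used below (requestPost, attuributefile).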
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '北京市人民政府'
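# Local helper class, renamed from Policy to Policy1, presumably so it does not shadow the Policy class imported from reits above.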
class Policy1():
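# Fetches a page through a BaseCore proxy and (judging by the name) returns it as a BeautifulSoup document; retried up to 3 times with a 10 s delay.
# A fallback variant getrequest_soup_ is called below when this one keeps failing.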
@retry(tries=3, delay=10)
def getrequest_soup(self, url):
ip = baseCore.get_proxy()
@@ -137,7 +131,7 @@ def getFjContent(url):
def beijing():
if not os.path.exists('./相关政策/北京市人民政府/政策文件'):
os.makedirs('./相关政策/北京市人民政府/政策文件')
policy1 = Policy1()
url = 'https://www.beijing.gov.cn/so/ss/query/s'
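# Search API of beijing.gov.cn; results are fetched page by page via POST (payload_page below carries the page number, presumably built from this base payload).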
payload = {
'siteCode': '1100000088',
@@ -191,24 +185,31 @@ def beijing():
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
for info_ in info_list:
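# Each search hit keeps its fields under 'data': source site, title, label, publish date, detail URL, summary and the formatRows metadata table.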
fjtitle_list = ''
fjhref_list = ''
id_list = []
info = info_['data']
origin = info['siteLabel']['value'].lstrip().strip()
title = info['titleO'].lstrip().strip()
titleLabel = info['titleLabel']['value'].lstrip().strip()
publishDate = info['docDate'].lstrip().strip()
newsUrl = info['url'].lstrip().strip()
# Deduplicate by URL: skip items already recorded in the Redis set
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
summary = info['summary'].lstrip().strip()
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
writtenDate = None
pub_hao = ''
organ = ''
content = ''
topicClassification = ''
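# Detail pages come in two layouts that are parsed differently: '政策解读' (policy interpretation) and '政策文件' (policy document).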
if titleLabel == '政策解读':
try:
newssoup = policy1.getrequest_soup(newsUrl)
except:
newssoup = policy1.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
@@ -223,13 +224,14 @@ def beijing():
except:
pass
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
organ = newssoup.find('div', class_='othermessage').find('p', class_='fl').text.split('来源:')[
1].lstrip().strip()
elif titleLabel == '政策文件':
try:
newssoup = policy1.getrequest_soup(newsUrl)
except:
newssoup = policy1.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
@@ -248,39 +250,66 @@ def beijing():
if '成文日期' in li.text:
writtenDate = li.find('span').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
formatRows = info['formatRows']
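# formatRows is the structured metadata table of the document page: related attachments, issue number, issuing organ and topic classification.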
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
tag_str = ''
value = col['value']
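# value maps attachment URL -> display name; each file is downloaded locally, uploaded to OBS, and the matching link in the page HTML is rewritten to the OBS path.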
for i in range(len(value.keys())):
file_href = list(value.keys())[i]
file_name = list(value.values())[i]
fjcontent = getFjContent(file_href)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
file_name = f'{num}-{publishDate}-{file_name}'
file = f'./相关政策/北京市人民政府/政策文件/{file_name}'
fjtitle_list += file_name + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{file_name}===附件下载成功')
# Upload the attachment to OBS
att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
if att_id:
id_list.append(att_id)
tag = newssoup.find('ul', class_='fujian').find_all('a')[i]
tag['href'] = full_path
tag_str += str(tag) + '<br>'
contentWithTag_str += tag_str
elif '号' in name:
pub_hao = col['value'].lstrip().strip()
elif '发文机构' in name:
organ = col['value'][0].lstrip().strip()
elif '主题分类' in name:
topicClassification = col['value'][0].lstrip().strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if content == '':
continue
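# Record pushed to Kafka; the fixed sid looks like the source/subject id expected by the downstream consumer (assumption).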
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': newsUrl,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1729041207245328385',
}
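# Only mark the URL as collected in Redis after the Kafka send succeeds; on failure the already-uploaded attachments are deleted again.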
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
time.sleep(random.randint(10, 20))
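# Also keep a flat row for the Excel export written at the end of the run.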
data = [num, title, publishDate, origin, newsUrl, writtenDate, organ, pub_hao, summary, content,
fjtitle_list, fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/北京市人民政府/北京市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
beijing()