提交 98200599 作者: 薛凌堃

辽宁省人民政府

上级 362b085c
import time import time
...@@ -6,11 +6,17 @@ import pandas as pd ...@@ -6,11 +6,17 @@ import pandas as pd
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
import BaseCore
from base import BaseCore
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '天津市人民政府'
headers = { headers = {
'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0' 'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
} }
...@@ -22,13 +28,14 @@ def getContent(url): ...@@ -22,13 +28,14 @@ def getContent(url):
soup = BeautifulSoup(req.text, 'html.parser') soup = BeautifulSoup(req.text, 'html.parser')
contentWithTag = soup.find('div', class_='zfwj_detail') contentWithTag = soup.find('div', class_='zfwj_detail')
pub_hao = contentWithTag.find('p', class_='wjh').text.lstrip().strip() pub_hao = contentWithTag.find('p', class_='wjh').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return content, pub_hao return contentWithTag, pub_hao
def doJob(): def doJob():
url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c' url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c'
driver = baseCore.buildDriver() # driver = baseCore.buildDriver()
driver = policy.createDriver()
driver.get(url) driver.get(url)
time.sleep(1) time.sleep(1)
driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')[-1].find_element(By.TAG_NAME, driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')[-1].find_element(By.TAG_NAME,
...@@ -39,24 +46,50 @@ def doJob(): ...@@ -39,24 +46,50 @@ def doJob():
time.sleep(1) time.sleep(1)
div_list = driver.find_elements(By.CLASS_NAME, 'searchMod') div_list = driver.find_elements(By.CLASS_NAME, 'searchMod')
num = 1 num = 1
data_list = []
for div in div_list: for div in div_list:
title = div.find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip() title = div.find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip()
href = div.find_element(By.TAG_NAME, 'a').get_attribute('href') href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
continue
summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(By.TAG_NAME, 'a').text.replace('\n', summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(By.TAG_NAME, 'a').text.replace('\n',
'').lstrip().strip() '').lstrip().strip()
publishDate = div.find_element(By.CLASS_NAME, 'dates').text.split('时间:')[1].replace('年', '-').replace('月', publishDate = div.find_element(By.CLASS_NAME, 'dates').text.split('时间:')[1].replace('年', '-').replace('月',
'-').replace( '-').replace(
'日', '').lstrip().strip() '日', '').lstrip().strip()
content, pub_hao = getContent(href) contentWithTag, pub_hao = getContent(href)
data = [num, title, publishDate, '辽宁省人民政府', href, '', '', pub_hao, summary, content, '', ''] content = contentWithTag.text.lstrip().strip()
data_list.append(data) contentWithTag_str = str(contentWithTag)
log.info(f'{title}===采集成功') time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': '辽宁省人民政府',
'sourceAddress': url,
'writtenDate': '',
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1729042213737967618',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, url)
log.info(f'采集成功--{title}--{url}')
except Exception as e:
continue
num += 1 num += 1
driver.close() driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./辽宁省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论