提交 d8ac5582 作者: 薛凌堃

湖北省人民政府

上级 60ccfd9e
import os import os
...@@ -8,16 +8,21 @@ from bs4 import BeautifulSoup ...@@ -8,16 +8,21 @@ from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service from selenium.webdriver.firefox.service import Service
from base import BaseCore
import time import time
from selenium.webdriver import Firefox from selenium.webdriver import Firefox
from selenium import webdriver from selenium import webdriver
import BaseCore
# Module-level setup, shown as a side-by-side diff: where a statement appears
# twice on one line, the two copies are the two revisions of that line.
# Creates the shared BaseCore helper and its logger.
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
# The next four lines appear in one column only (presumably added by this commit
# -- confirm against the repository). `policy`, `topic` and `webname` are used by
# getContent/getData further down.
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '湖北省人民政府'
# HTTP request headers; not referenced in the visible part of the file
# (presumably used by getDataJson, which is outside this view).
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
} }
...@@ -62,73 +67,96 @@ def getDataJson(page): ...@@ -62,73 +67,96 @@ def getDataJson(page):
# getContent(driver, url, num) -- rendered as a side-by-side diff. On lines that
# carry two statements, the left one belongs to one revision and the right one to
# the other (presumably left = before the commit, right = after).
#
# Both columns: load `url` in the driver, sleep 5 seconds, read the publish date
# from the 'hbgov-article-meta-time' element (text after '发布时间:'), take the
# article body from 'hbgov-article-content', and loop over the images in the body.
# Left column:  works on Selenium elements, accumulates attachment titles/links
#               as newline-joined strings, downloads each image with
#               getFjContent and writes it to a local file.
#               Returns (publishDate, content, fjtitle_list, fjhref_list).
# Right column: parses driver.page_source with BeautifulSoup, passes the soup to
#               policy.paserUrl, hands every image to policy.attuributefile and
#               collects the returned ids.
#               Returns (publishDate, content, contentWithTag, id_list).
def getContent(driver, url, num): def getContent(driver, url, num):
driver.get(url) driver.get(url)
time.sleep(5) time.sleep(5)
fjhref_list = '' id_list = []
fjtitle_list = '' page_source = driver.page_source
publishDate = driver.find_element(By.CLASS_NAME,'hbgov-article-meta-time').text.split('发布时间:')[1].lstrip().strip() soup = BeautifulSoup(page_source, 'html.parser')
contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content') policy.paserUrl(soup, url)
img_list = contentWithTag.find_elements(By.TAG_NAME,'img') publishDate = soup.find(class_='hbgov-article-meta-time').text.split('发布时间:')[1].lstrip().strip()[:10]
time.sleep(2)
contentWithTag = soup.find(class_='hbgov-article-content')
# NOTE(review): the left column selected by tag name 'img'; this line filters by
# CSS class "img" instead. Presumably find_all('img') was intended -- confirm.
img_list = contentWithTag.find_all(class_='img')
num_ = 1 num_ = 1
for img in img_list: for img in img_list:
# NOTE(review): get_attribute is the Selenium element accessor. In the right
# column `img` comes from BeautifulSoup, where attributes are read with
# img.get('title') / img.get('src') -- verify these two calls still work.
fj_title = img.get_attribute('title') fj_title = img.get_attribute('title')
fj_href = img.get_attribute('src') fj_href = img.get_attribute('src')
fjhref_list += fj_href + '\n'
# Untitled images are named by a running counter.
if fj_title == '': if fj_title == '':
fj_title = str(num_) fj_title = str(num_)
num_ += 1 num_ += 1
# Make sure the title ends with the file extension taken from the link.
category = os.path.splitext(fj_href)[1] category = os.path.splitext(fj_href)[1]
if category not in fj_title: if category not in fj_title:
fj_title = fj_title + category fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}' att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
fjtitle_list += fj_title + '\n' if att_id:
fjcontent = getFjContent(fj_href) id_list.append(att_id)
file = f'./相关政策/湖北省人民政府/政策文件/{fj_title}' img['href'] = full_path
# The following single-column lines belong to the local-file download path
# (the revision that writes attachments to disk).
if os.path.exists(file):
fj_title = fj_title.replace(category,f'-{num_}') + category
num_ += 1
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text.lstrip().strip() content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip() return publishDate, content, contentWithTag, id_list
return publishDate, content, fjtitle_list, fjhref_list
# getData(driver, data_, num) -- rendered as a side-by-side diff (two revisions
# per line where the text repeats or differs).
#
# Both columns read DOCTITLE, SITENAME, fileNum, PUBDATE, publisher,
# highlight.DOCCONTENT[0] and DOCPUBURL from the record `data_` and fetch the
# article through getContent.
# Left column:  returns a 12-item list (num, title, publishDate, origin, href,
#               writtenDate, organ, pub_hao, summary, content, attachment
#               titles, attachment links).
# Right column: truncates PUBDATE to 10 characters, skips links already present
#               in the set 'REITs::' + webname (baseCore.r.sismember), builds
#               dic_info, sends it with baseCore.sendkafka(dic_info, topic) and
#               records the link with baseCore.r.sadd. Returns None on every path.
def getData(driver, data_, num): def getData(driver, data_, num):
title = data_['DOCTITLE'] title = data_['DOCTITLE']
origin = data_['SITENAME'] origin = data_['SITENAME']
pub_hao = data_['fileNum'] pub_hao = data_['fileNum']
writtenDate = data_['PUBDATE'] writtenDate = str(data_['PUBDATE'])[:10]
organ = data_['publisher'] organ = data_['publisher']
summary = data_['highlight']['DOCCONTENT'][0] summary = data_['highlight']['DOCCONTENT'][0]
href = data_['DOCPUBURL'] href = data_['DOCPUBURL']
# Right-hand text of the next line: "deduplicate by link".
publishDate, content, fjtitle_list, fjhref_list = getContent(driver, href, num) # 根据链接判重
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list, is_member = baseCore.r.sismember('REITs::' + webname, href)
fjhref_list] if is_member:
return data return
publishDate, content, contentWithTag, id_list = getContent(driver, href, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Payload sent to the message queue; the <em> highlight tags are stripped from
# the summary below.
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary.replace('</em>', '').replace('<em>', ''),
'createDate': time_now,
'sid': '1729044085724860418',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'采集成功--{title}--{href}')
# On failure the attachments collected for this record are removed again.
# NOTE(review): the exception `e` is neither logged nor re-raised, so a failed
# send leaves no trace in the log -- consider logging it.
except Exception as e:
for att_id in id_list:
baseCore.deliteATT(att_id)
return
# doJob() -- rendered as a side-by-side diff (two revisions per line where the
# text repeats or differs).
#
# Both columns start Firefox through geckodriver with an overridden user agent,
# fetch pages 1 and 2 with getDataJson, call getData for every record while
# incrementing `num`, and close the driver at the end. The geckodriver path is
# the only difference on the first line.
# The single-column lines (output directory creation, data_list, the per-record
# log line and the pandas/Excel export) belong to the revision that stores
# results locally.
# NOTE(review): driver.close() is not wrapped in try/finally, so an exception
# inside the loop leaves the browser process running.
def doJob(): def doJob():
service = Service(r'F:\spider\firefox\geckodriver.exe') service = Service(r'D:/soft/geckodriver.exe')
options = Options() options = Options()
options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3") options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
driver = webdriver.Firefox(options=options, service=service) driver = webdriver.Firefox(options=options, service=service)
if not os.path.exists('./相关政策/湖北省人民政府/政策文件'):
os.makedirs('./相关政策/湖北省人民政府/政策文件')
data_list = []
num = 1 num = 1
for page in range(1, 3): for page in range(1, 3):
data_json = getDataJson(page) data_json = getDataJson(page)
for data_ in data_json: for data_ in data_json:
# NOTE(review): in the right-hand column getData returns None on every path,
# so `data` is unused there.
data = getData(driver, data_, num) data = getData(driver, data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1 num += 1
driver.close() driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/湖北省人民政府/湖北省人民政府政策文件.xlsx', index=False)
# #
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论