Commit 687679ca — Author: 薛凌堃

广东省人民政府 (Guangdong Provincial People's Government)

Parent d8ac5582
@@ -6,13 +6,21 @@ import requests
 import datetime
 from bs4 import BeautifulSoup
 from retry import retry
+from base import BaseCore
 import os
 import pandas as pd
 import numpy as np
-import BaseCore

 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+
+policy = Policy()
+topic = 'policy'
+webname = '广东省人民政府'

 headers = {
     'Content-Type': 'application/json',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
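Note: the new module-level names feed the collection pipeline later in the diff — topic is the Kafka topic and webname becomes part of the Redis key used for URL dedupe. For orientation, the key built in getData below is simply:

    dedupe_key = 'REITs::' + webname   # -> 'REITs::广东省人民政府'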
@@ -65,10 +73,10 @@ def getDataJson(url, data_post):

 def getContent(url, publishDate, num):
-    fjhref_list = ''
-    fjtitle_list = ''
+    id_list = []
     soup = getSoup(url)
     time.sleep(2)
+    policy.paserUrl(soup, url)
     try:
         try:
             contentWithTag = soup.select('body > div.con > div.viewList > div.zw')[0]
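Note: policy.paserUrl is defined outside this diff; from the call site it evidently rewrites relative href/src attributes in the parsed page to absolute URLs, which is what lets the hand-rolled 'http:' + fj_href fix-up be deleted in the next hunk. A minimal sketch of that idea, assuming only BeautifulSoup and urljoin (the behavior is inferred, not confirmed by this diff):

    from urllib.parse import urljoin

    def paserUrl_sketch(soup, base_url):
        # Rewrite every relative link/image reference in place so that
        # later attachment handling can treat all URLs as absolute.
        for tag in soup.find_all(['a', 'img']):
            attr = 'href' if tag.name == 'a' else 'src'
            val = tag.get(attr)
            if val and not val.startswith(('http://', 'https://')):
                tag[attr] = urljoin(base_url, val)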
@@ -78,54 +86,21 @@ def getContent(url, publishDate, num):
             contentWithTag = soup.find('div', class_='article-content').find('center')
             if not contentWithTag:
                 contentWithTag = soup.find('div', class_='article-content')
-        img_list = contentWithTag.find_all('img')
-        num_ = 1
-        for img in img_list:
-            fj_href = img.get('src')
-            if "http" not in fj_href and '//www' in fj_href:
-                fj_href = 'http:' + fj_href
-            fjhref_list += fj_href + '\n'
-            fj_title = img.get('alt')
-            if fj_title == '':
-                fj_title = str(num_)
-                num_ += 1
-            category = os.path.splitext(fj_href)[1]
-            if category not in fj_title:
-                fj_title = fj_title + category
-            fj_title = f'{num}-{publishDate}-{fj_title}'
-            fjcontent = getFjContent(fj_href)
-            file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
-            if os.path.exists(file):
-                file = file.replace(category, f'-{num_}{category}')
-                num_ += 1
-            if os.path.exists(file):
-                fj_title = fj_title.replace(category, f'-{num_}{category}')
-                file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
-            fjtitle_list += fj_title + '\n'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
         a_list = contentWithTag.find_all('a')
         for a in a_list:
             fj_href = a.get('href')
-            fjhref_list += fj_href + '\n'
             fj_title = a.text.lstrip().strip()
             if fj_title == '':
-                fj_title = str(num_)
-                num_ += 1
+                fj_title = str(num)
+                num += 1
             category = os.path.splitext(fj_href)[1]
             if category not in fj_title:
                 fj_title = fj_title + category
-            fj_title = f'{num}-{publishDate}-{fj_title}'
-            fjcontent = getFjContent(fj_href)
-            file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
-            if os.path.exists(file):
-                file = file.replace(category, f'-{num_}{category}')
-                num_ += 1
-            fjtitle_list += fj_title + '\n'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
         try:
             scripts = contentWithTag.find_all('script')
             for script in scripts:
@@ -139,9 +114,8 @@ def getContent(url, publishDate, num):
         except:
             pass
     content = contentWithTag.text.lstrip().strip()
-    fjtitle_list = fjtitle_list.lstrip().strip()
-    fjhref_list = fjhref_list.lstrip().strip()
-    return content, fjtitle_list, fjhref_list
+    return content, contentWithTag, id_list


 def ST(txt):
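Note: policy.attuributefile is likewise external to this diff. Judging by the call site, it downloads the attachment at fj_href, hands it to shared storage, and returns an attachment id (collected into attachmentIds) plus a path — replacing the local-disk writes and their filename-collision workaround above. A rough sketch of the contract the caller relies on; the naming scheme and the storage helper are assumptions, not the real implementation:

    import requests

    def upload_to_storage(name, payload):
        # Hypothetical stand-in for the real shared-storage upload;
        # must return the new attachment's id.
        raise NotImplementedError

    def attuributefile_sketch(fj_title, fj_href, num, publishDate):
        resp = requests.get(fj_href, timeout=30)
        if resp.status_code != 200:
            return None, None                           # caller skips id_list.append
        full_path = f'{num}-{publishDate}-{fj_title}'   # assumed naming scheme
        att_id = upload_to_storage(full_path, resp.content)
        return att_id, full_path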
@@ -149,12 +123,17 @@ def ST(txt):
     return txt


-def getData(data_, num):
+def getData(data_, num, sid):
     title = ST(data_['title'])
     log.info(f'{title}===开始采集')
     publishDate = data_['pub_time']
     origin = data_['publisher_src']
     href = data_['url']
+    # Dedupe by article URL
+    is_member = baseCore.r.sismember('REITs::' + webname, href)
+    if is_member:
+        return
     log.info(href)
     writtenDate = data_['date']
     if writtenDate:
@@ -162,17 +141,44 @@ def getData(data_, num):
     organ = data_['source']
     pub_hao = data_['document_number']
     summary = ST(data_['content'])
-    content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
-    data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-            fjhref_list]
-    return data
+    content, contentWithTag, id_list = getContent(href, publishDate, num)
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': href,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': sid,
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, href)
+        log.info(f'采集成功--{title}--{href}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return


-def doJob_1():
-    if not os.path.exists('./相关政策/广东省人民政府/政策文件'):
-        os.makedirs('./相关政策/广东省人民政府/政策文件')
+# Policy documents
+def doJob_1(sid1):
+    # if not os.path.exists('./相关政策/广东省人民政府/政策文件'):
+    #     os.makedirs('./相关政策/广东省人民政府/政策文件')
     pageSize = getPageSize()
-    data_list = []
     num = 1
     url = 'https://search.gd.gov.cn/api/search/file'
     for page in range(1, pageSize + 1):
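Note: the hunk above establishes the new flow: skip URLs already in the Redis set, publish the record to Kafka, mark the URL only after the send succeeds, and on failure delete any attachments that were already uploaded so a retried URL starts clean. Condensed into a standalone sketch (set name taken from the diff; publish/rollback stand in for baseCore.sendkafka and the deliteATT loop):

    def send_once(r, dedupe_key, href, publish, rollback):
        # r: a redis.Redis client (baseCore.r in this repo)
        if r.sismember(dedupe_key, href):   # captured on an earlier run
            return False
        try:
            publish()                        # e.g. sendkafka(dic_info, topic)
            r.sadd(dedupe_key, href)         # record the URL only after success
            return True
        except Exception:
            rollback()                       # e.g. delete uploaded attachments
            return False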
@@ -182,17 +188,15 @@ def doJob_1():
         data_post = json.dumps(data_post)
         data_json = getDataJson(url, data_post)
         for data_ in data_json:
-            data = getData(data_, num)
-            data_list.append(data)
-            log.info(f'{data[1]}===采集成功')
+            getData(data_, num, sid1)
             num += 1
-    return data_list, num
+    return


-def doJob_2(num):
+def doJob_2(sid2):
     url = 'https://search.gd.gov.cn/api/search/all'
     types = ['政策解读', '计划规划']
-    data_list = []
+    num = 1
     for type in types:
         data_post = {"label": f"{type}", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
                      "range": "site", "page": 1, "tag_name": f"{type}", "recommand": 1, "gdbsDivision": "440000",
@@ -200,23 +204,19 @@
         data_post = json.dumps(data_post)
         data_json = getDataJson(url, data_post)
         for data_ in data_json:
-            data = getData(data_, num)
+            getData(data_, num, sid2)
             time.sleep(1)
-            data_list.append(data)
-            log.info(f'{data[1]}===采集成功')
             num += 1
-    return data_list
+    return


 def doJob():
-    data_list = []
-    data_list_, num = doJob_1()
-    data_list += data_list_
-    data_list_ = doJob_2(num)
-    data_list += data_list_
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/广东省人民政府/广东省人民政府政策文件.xlsx', index=False)
+    sid1 = '1729044231736971266'
+    sid2 = '1729044396395048961'
+    doJob_1(sid1)
+    doJob_2(sid2)


 if __name__ == '__main__':