Commit 35ce0bae  Author: 薛凌堃

海南省国资委 (Hainan Provincial SASAC)

Parent 2aca66c0
import time
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
from retry import retry
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

from reits import Policy

policy = Policy()

topic = 'policy'
webname = '海南省人民政府'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
@retry(tries=3, delay=1)
def getSoup(url):
    # Fetch the page through a proxy; @retry re-runs the whole function on failure.
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
...@@ -18,12 +32,19 @@ def getSoup(url):
        return ''
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    if soup:
        return soup
    else:
        raise Exception("重试")
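# Note: because of @retry(tries=3, delay=1), raising Exception("重试") above makes the
# decorator call getSoup again, up to 3 attempts spaced 1 second apart; once the tries
# are exhausted the exception propagates to the caller, which is why every call site
# below wraps getSoup in try/except. A minimal usage sketch (hypothetical URL):
#
#     try:
#         soup = getSoup('https://www.hainan.gov.cn/s?searchWord=REITs')
#     except Exception:
#         soup = ''  # treat the page as unavailable and skip it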
def getPageSize(type):
    url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
    try:
        soup = getSoup(url)
    except:
        return 0
    # Total number of search hits; the site paginates 10 results per page.
    total = int(soup.find('div', class_='results-list').find('span').text.strip())
    if total % 10 == 0:
        pageSize = int(total / 10)
...@@ -33,16 +54,18 @@ def getPageSize(type):
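# For reference, with 10 hits per page the loop in doJob() needs ceil(total / 10) pages.
# Assuming the elided else-branch rounds up, an equivalent one-liner would be
# (a sketch, not the committed code):
#
#     pageSize = -(-total // 10)  # ceiling division, e.g. total = 23 -> 3 pages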
def getContent(url, publishDate, num):
    try:
        soup = getSoup(url)
    except:
        return '', '', ''
    policy.paserUrl(soup, url)
    if soup == '':
        return '', '', ''
    try:
        # 成文日期 (written date) sits in the metadata block between "成文日期:" and "标题".
        writtenDate = soup.find('div', class_='zwgk_comr1').text.replace(' ', '').split('成文日期:')[1].split('标题')[
            0].strip()
    except:
        writtenDate = None
    contentWithTag = soup.find('div', attrs={'id': 'font'})
    try:
...@@ -61,7 +84,7 @@ def getContent(url, publishDate, num):
        content = contentWithTag.text.strip()
    except:
        print(url)
        content = ''  # keep the variable defined so the caller can skip this entry
    return writtenDate, content, contentWithTag
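# Example of the writtenDate extraction above, with illustrative (not real) page text:
# if the metadata block renders as "索引号:xxx成文日期:2023-06-30标题:关于REITs...",
# then .split('成文日期:')[1].split('标题')[0] isolates "2023-06-30".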
def getData(div, num):
...@@ -80,40 +103,62 @@ def getData(div, num):
        summary = div.find('p', class_='p-text-color').text.strip()
    except:
        summary = ''
    writtenDate, content, contentWithTag = getContent(href, publishDate, num)
    if content == '':
        return
    contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': [],
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729042375596158978',
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        # Only mark the URL as collected once the Kafka send has succeeded.
        baseCore.r.sadd('REITs::' + webname, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        return
    return
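# Dedup contract between doJob() and getData(): doJob() skips any href already in the
# Redis set 'REITs::' + webname, and getData() only adds the href after sendkafka
# succeeds, so a failed send leaves the URL eligible for another attempt on the next run.
# Hypothetical illustration, assuming baseCore.r behaves like a redis.Redis client:
#
#     baseCore.r.sismember('REITs::海南省人民政府', href)  # False until collected
#     baseCore.r.sadd('REITs::海南省人民政府', href)        # mark as collected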
def doJob():
    num = 1
    types = [2682, 2677]
    for type in types:
        pageSize = getPageSize(type)
        for page in range(pageSize):
            url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={page}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
            try:
                soup = getSoup(url)
            except:
                continue
            div_list = soup.select('#showPage > div')
            # Discard the trailing div, which is not one of the search results.
            del (div_list[-1])
            for div in div_list:
                href = div.find('a', class_='titlec').get('href')
                # 根据链接判重: deduplicate by URL against the Redis set.
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                getData(div, num)
                num += 1
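# The long search URL above carries many fixed parameters; the ones this script actually
# varies are column (2682 or 2677) and pageNum (the page index, starting at 0), alongside
# the constant searchWord=REITs and pageSize=10. An illustrative way to build the same
# query with urllib.parse.urlencode (a sketch only, remaining parameters omitted; the
# committed code keeps the literal f-string):
#
#     from urllib.parse import urlencode
#     params = {'siteCode': '4600000001', 'searchWord': 'REITs',
#               'column': type, 'pageSize': 10, 'pageNum': page, 'orderBy': 1}
#     url = 'https://www.hainan.gov.cn/s?' + urlencode(params)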
if __name__ == '__main__':
...