Commit 5d788bc9  Author: 薛凌堃

REITs topic

Parent c702fb7b
# REITs topic core toolkit
...
@@ -5,6 +5,7 @@ import random
 import socket
 import sys
 import time
+import uuid
 import fitz
 import logbook
...@@ -252,7 +253,7 @@ class BaseCore: ...@@ -252,7 +253,7 @@ class BaseCore:
charset='utf8mb4') charset='utf8mb4')
self.cursor_ = self.cnx_.cursor() self.cursor_ = self.cnx_.cursor()
# 连接到Redis # 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6) self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
self.pool_caiji = PooledDB( self.pool_caiji = PooledDB(
creator=pymysql, creator=pymysql,
...
@@ -451,6 +452,7 @@ class BaseCore:
     # def doc_page(self,file_path):
     #     doc = Document(file_path)
     #     return len(doc.sections)
     def deliteATT(self,id):
         delitesql = f"delete from clb_sys_attachment where id = '{id}' "
         self.cursor_.execute(delitesql)
...
@@ -492,6 +494,9 @@ class BaseCore:
             id = selects[0]
         return id,full_path
+    def getuuid(self):
+        get_timestamp_uuid = uuid.uuid1()  # build a UUID from the timestamp so it is globally unique
+        return get_timestamp_uuid
     # get file size
     def convert_size(self,size_bytes):
...
@@ -520,37 +525,25 @@ class BaseCore:
             except:
                 time.sleep(3)
                 continue
-        page_size = 0
         for i in range(0, 3):
             try:
-                # name = file_name
-                if category in file_name:
-                    pass
-                else:
-                    file_name = file_name + category
-                result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
+                file_name = str(self.getuuid()) + category
+                result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
                 break
             except:
                 time.sleep(3)
                 continue
-        if page_size < 1:
-            # pdf parsing failed
-            # print(f'======pdf解析失败=====')
-            return retData
-        else:
-            try:
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                retData['state'] = True
-                retData['path'] = result['body']['objectUrl'].split('.com')[1]
-                retData['full_path'] = unquote(result['body']['objectUrl'])
-                retData['file_size'] = self.convert_size(file_size)
-                retData['create_time'] = time_now
-            except Exception as e:
-                print(f'error:{e}')
-                return retData
-            return retData
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = self.convert_size(file_size)
+            retData['create_time'] = time_now
+        except Exception as e:
+            print(f'error:{e}')
+            return retData
         return retData
-        return retData
     def sendkafka(self, post_data, topic):
         try:
...
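Net effect of the BaseCore changes above: an attachment uploaded through uptoOBS is now stored under a collision-free object key built from a time-based UUID plus the file extension (instead of reusing the source file name), the object prefix changes from PolicyDocuments/ to PolicyDocument/, and the old page_size guard is dropped. A minimal sketch of that upload flow, assuming a configured Huawei OBS client with the putContent call shown in the hunk; upload_attachment and its parameters are illustrative, not names from the codebase:

```python
import time
import uuid
from urllib.parse import unquote

def upload_attachment(obs_client, content: bytes, category: str, file_size) -> dict:
    """Sketch of the revised uptoOBS flow: uuid1-based key, 3 retries, retData on success."""
    ret = {'state': False}
    file_name = str(uuid.uuid1()) + category          # e.g. 'c8a1...-....pdf', globally unique
    for _ in range(3):                                # retry the OBS upload up to three times
        try:
            result = obs_client.putContent('zzsn', 'PolicyDocument/' + file_name, content=content)
            break
        except Exception:
            time.sleep(3)
    else:
        return ret                                    # all retries failed, state stays False
    object_url = result['body']['objectUrl']
    ret.update({
        'state': True,
        'path': object_url.split('.com')[1],          # key relative to the bucket domain
        'full_path': unquote(object_url),
        'file_size': file_size,                       # BaseCore.convert_size() formats this in the real code
        'create_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
    })
    return ret
```

Keying objects by uuid1 avoids overwriting uploads when two sources publish attachments with the same file name, at the cost of losing the original name inside the object store.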
 import os
...
@@ -107,7 +107,7 @@ class Policy():
             category = os.path.splitext(file_href)[1]
             if category not in file_name:
                 file_name = file_name + category
-            retData = baseCore.uptoOBS(file_href, '9999', file_name)
+            retData = baseCore.uptoOBS(file_href, '', file_name)
             if retData['state']:
                 pass
             else:
...
@@ -136,7 +136,7 @@ class Policy():
policy = Policy()
# National Development and Reform Commission  https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
-def reform(wb,file_path):
+def reform():
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
...
@@ -153,22 +153,30 @@ def reform(wb,file_path):
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
-    DataList = []
+    # DataList = []
    num = 0
-    path = 'data/国家改革发展委员会'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '中华人民共和国国家发展和改革委员会'
+    # path = 'data/国家改革发展委员会'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
    for page in range(1,3):
        url = f'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page={page}&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
        result = policy.getrequest_json(headers, url)
        data_list = result['data']['resultList']
        for info in data_list:
            num += 1
+            id_list = []
            # info = data_list[1]
            publishDate_ = info['docDate']
            title = info['title']
            summary = info['summary'].replace('<em>','').replace('</em>','')
            newsUrl = info['url']
+            # deduplicate by URL
+            is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+            if is_member:
+                continue
            header = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Encoding': 'gzip, deflate, br',
...
@@ -190,6 +198,7 @@ def reform(wb,file_path):
            }
            newssoup = policy.getrequest_soup(header, newsUrl)
            # print(newssoup)
+            policy.paserUrl(newssoup, newsUrl)
            try:
                pubHao = ''
                source = ''
...
@@ -229,20 +238,19 @@ def reform(wb,file_path):
                pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
                match = re.match(pattern, publishDate)
                if match:
+                    date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
+                    publishDate = date1.strftime("%Y-%m-%d")
                    pass
                else:
                    publishDate = ''
                policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
                policy.deletek(contentWithTag)
                content = contentWithTag.text
                try:
                    policy.paserUrl(newssoup,newsUrl)
                    att = newssoup.find('div', class_='attachment_r')
-                    fu_jian_name = ''
-                    fu_jian_href = ''
                except:
-                    fu_jian_name = ''
-                    fu_jian_href = ''
                    att = ''
                if att:
                    for a in att.find_all('a'):
...@@ -255,49 +263,61 @@ def reform(wb,file_path): ...@@ -255,49 +263,61 @@ def reform(wb,file_path):
pass pass
else: else:
file_name = file_name + category file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}' att_id,full_path = policy.attuributefile(file_name,file_href,num,publishDate_)
fu_jian_name += rename_file + '\n' if att_id:
fu_jian_href += file_href + '\n' id_list.append(att_id)
policy.downloadfile(file_href, f'{path}/{rename_file}') a['href'] = full_path
contentWithTag_str = str(contentWithTag) + str(newssoup.find('div', class_='attachment'))
else:
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
-                    '序号': num,
-                    '标题': title,
-                    '发布时间': publishDate_,
-                    '来源': source,
-                    '原文链接': newsUrl,
-                    '发文时间': publishDate,
-                    '发文机构': '',
-                    '发文字号': pubHao,
-                    '摘要': summary,
-                    '正文': content,
-                    '附件名称': fu_jian_name,
-                    '附件链接': fu_jian_href,
+                    'attachmentIds': id_list,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': contentWithTag_str,
+                    'deleteFlag': 0,
+                    'id': '',
+                    'title': title,
+                    'publishDate': publishDate_,
+                    'origin': source,
+                    'sourceAddress': newsUrl,
+                    'writtenDate': publishDate,
+                    'organ': '',
+                    'topicClassification': '',
+                    'issuedNumber': pubHao,
+                    'summary': summary,
+                    'createDate': time_now,
+                    'sid': '1729029275400646658',
                }
-                DataList.append(dic_info)
+                # DataList.append(dic_info)
                try:
                    baseCore.sendkafka(dic_info, topic)
+                    baseCore.r.sadd('REITs::' + webname, newsUrl)
+                    log.info(f'采集成功--{title}--{newsUrl}')
                except:
-                    sheet_name = "国家发展和改革委员会"
-                    if sheet_name in wb.sheetnames:
-                        log.info(f"{sheet_name}工作表已存在!")
-                    else:
-                        # create a new worksheet
-                        wb.create_sheet(sheet_name)
-                        print(f"{sheet_name}新工作表创建完成!")
-                    # save the Excel file
-                    wb.save(file_path)
-                    baseCore.writerToExcel(DataList, file_path, sheet_name)
+                    for att_id in id_list:
+                        baseCore.deliteATT(att_id)
+                # sheet_name = "国家发展和改革委员会"
+                # if sheet_name in wb.sheetnames:
+                #     log.info(f"{sheet_name}工作表已存在!")
+                # else:
+                #     # create a new worksheet
+                #     wb.create_sheet(sheet_name)
+                #     print(f"{sheet_name}新工作表创建完成!")
+                # # save the Excel file
+                # wb.save(file_path)
+                #
+                # baseCore.writerToExcel(DataList, file_path, sheet_name)
            except Exception as e:
                log.info(f"error!!!{newsUrl}")
                log.info({e})
        log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
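The reform() changes above set the pattern the other crawlers in this file now follow: the article URL is checked against a per-site Redis set before any work is done, the structured record is pushed to Kafka, and only after the send succeeds is the URL added to the set; attachment rows already written are rolled back with deliteATT when the send fails. A condensed sketch of that control flow, where redis_conn, send_to_kafka and delete_attachment stand in for baseCore.r, baseCore.sendkafka and baseCore.deliteATT, and collect_once itself is illustrative:

```python
def collect_once(redis_conn, webname, news_url, build_record, send_to_kafka, delete_attachment, log):
    """Skip already-seen URLs; mark a URL as seen only after the Kafka send succeeds."""
    dedup_key = 'REITs::' + webname                   # one Redis set per source site
    if redis_conn.sismember(dedup_key, news_url):     # already collected on a previous run
        return False
    record, attachment_ids = build_record(news_url)   # parse page, upload attachments, collect their ids
    try:
        send_to_kafka(record)
        redis_conn.sadd(dedup_key, news_url)          # only now is the URL marked as done
        log.info(f'collected -- {record["title"]} -- {news_url}')
        return True
    except Exception:
        for att_id in attachment_ids:                 # roll back attachment rows written for this article
            delete_attachment(att_id)
        return False
```

Because sadd runs only after the Kafka send succeeds, a failed or interrupted run leaves the URL unmarked and the article is retried on the next pass.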
# Securities and futures  https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
-def zhengquanqihuo(wb,file_path):
+def zhengquanqihuo():
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
...
@@ -337,11 +357,12 @@ def zhengquanqihuo(wb,file_path):
    total = pageUtil['rowCount']
    page_size = pageUtil['pageSize']
    Max_page = int(total / page_size)
-    DataList = []
+    # DataList = []
    num = 0
-    path = 'data/证监会'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '证券期货法规数据库系统'
+    # path = 'data/证监会'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
    for page in range(0, Max_page+1):
        payload_page = {
            'pageNo': page + 1,
...
@@ -359,6 +380,7 @@ def zhengquanqihuo(wb,file_path):
        data_page = policy.requestPost(headers, url, payload_page)
        info_list = data_page['pageUtil']['pageList']
        for info in info_list:
+            id_list = []
            num += 1
            try:
                title = info['secFutrsLawName']
...
@@ -369,41 +391,63 @@ def zhengquanqihuo(wb,file_path):
                # print(publishDate)
                secFutrsLawId = info['secFutrsLawId']
                newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
+                # deduplicate by URL
+                is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+                if is_member:
+                    continue
                browser = policy.createDriver()
                browser.get(newsUrl)
                time.sleep(1)
                page_source = browser.page_source
                newssoup = BeautifulSoup(page_source, 'html.parser')
+                policy.paserUrl(newssoup,newsUrl)
                # print(newssoup)
                contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
                content = contentWithTag.text.replace('显示注释', '')
                # print(content)
+                contentWithTag_str = str(contentWithTag)
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
-                    '序号': num,
-                    '标题': title,
-                    '发布时间': publishDate,
-                    '来源': source,
-                    '原文链接': newsUrl,
-                    '发文时间': publishDate,
-                    '发文机构': source,
-                    '发文字号': pubHao,
-                    '摘要': '',
-                    '正文': content,
-                    '附件名称': '',
-                    '附件链接': '',
+                    'attachmentIds': id_list,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': contentWithTag_str,
+                    'deleteFlag': 0,
+                    'id': '',
+                    'title': title,
+                    'publishDate': publishDate,
+                    'origin': source,
+                    'sourceAddress': newsUrl,
+                    'writtenDate': publishDate,
+                    'organ': source,
+                    'issuedNumber': pubHao,
+                    'summary': '',
+                    'topicClassification': '',
+                    'createDate': time_now,
+                    'sid': '1729030277461815298',
                }
-                DataList.append(dic_info)
-                sheet_name = "证监会"
-                if sheet_name in wb.sheetnames:
-                    log.info(f"{sheet_name}工作表已存在!")
-                else:
-                    # create a new worksheet
-                    wb.create_sheet(sheet_name)
-                    print(f"{sheet_name}新工作表创建完成!")
-                # save the Excel file
-                wb.save(file_path)
-                baseCore.writerToExcel(DataList, file_path, sheet_name)
+                try:
+                    baseCore.sendkafka(dic_info, topic)
+                    baseCore.r.sadd('REITs::' + webname, newsUrl)
+                    log.info(f'采集成功--{title}--{newsUrl}')
+                except:
+                    for att_id in id_list:
+                        baseCore.deliteATT(att_id)
+                # DataList.append(dic_info)
+                # sheet_name = "证监会"
+                # if sheet_name in wb.sheetnames:
+                #     log.info(f"{sheet_name}工作表已存在!")
+                # else:
+                #     # create a new worksheet
+                #     wb.create_sheet(sheet_name)
+                #     print(f"{sheet_name}新工作表创建完成!")
+                # # save the Excel file
+                # wb.save(file_path)
+                #
+                # baseCore.writerToExcel(DataList, file_path, sheet_name)
            except Exception as e:
                log.info(f"error!!!{num}")
                log.info({e})
...
@@ -428,9 +472,10 @@ def sse(wb,file_path):
    total_page = result['data']['totalPage']
    DataList = []
    num = 0
-    path = 'data/上海交易所'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '上海证券交易所'
+    # path = 'data/上海交易所'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
    for page in range(0, int(total_page)):
        url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
        data = policy.getrequest_json(headers, url_page)
...
@@ -456,9 +501,14 @@ def sse(wb,file_path):
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
            }
            newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
+            # deduplicate by URL
+            is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+            if is_member:
+                continue
            if '.pdf' in newsUrl:
-                fu_jian_name = ''
-                fu_jian_href = ''
                content = ''
                response = requests.get(newsUrl, timeout=20)
                with fitz.open(stream=response.content, filetype='pdf') as doc:
...
@@ -466,10 +516,10 @@ def sse(wb,file_path):
                        content += page.get_text()
                file_href = newsUrl
                file_name = title
-                rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                fu_jian_name += rename_file + '\n'
-                fu_jian_href += file_href + '\n'
-                policy.downloadfile(file_href, f'{path}/{rename_file}')
+                policy.attuributefile(title, newsUrl, num, publishDate)
                dic_info = {
                    '序号': num,
                    '标题': title,
...
@@ -553,100 +603,6 @@ def sse(wb,file_path):
            baseCore.writerToExcel(DataList, file_path, sheet_name)
-# Beijing Municipal People's Government  https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
-def beijing():
-    url = 'https://www.beijing.gov.cn/so/ss/query/s'
-    payload = {
-        'siteCode': '1100000088',
-        'tab': 'zcfg',
-        'qt': 'REITs',
-        'sort': 'relevance',
-        'keyPlace': '0',
-        'locationCode': '110000000000',
-        'page': '1',
-        'pageSize': '20',
-        'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
-    }
-    headers = {
-        'Accept': 'application/json, text/javascript, */*; q=0.01',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Accept-Language': 'zh-CN,zh;q=0.9',
-        'Connection': 'keep-alive',
-        'Content-Length': '148',
-        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
-        'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
-        'Host': 'www.beijing.gov.cn',
-        'Origin': 'https://www.beijing.gov.cn',
-        'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'same-origin',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
-        'X-Requested-With': 'XMLHttpRequest',
-        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"Windows"'
-    }
-    result = policy.requestPost(headers, url, payload)
-    total = result['totalHits']
-    page_size = result['currentHits']
-    Max_page = int(total / page_size)
-    for page in range(0, Max_page):
-        payload_page = {
-            'siteCode': '1100000088',
-            'tab': 'zcfg',
-            'qt': 'REITs',
-            'sort': 'relevance',
-            'keyPlace': '0',
-            'locationCode': '110000000000',
-            'page': page + 1,
-            'pageSize': '20',
-            'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
-        }
-        data = policy.requestPost(headers, url, payload_page)
-        info_list = data['resultDocs']
-        # print(info_list)
-        for info_ in info_list:
-            info = info_['data']
-            title = info['titleO']
-            titleLabel = info['titleLabel']['value']
-            publishDate = info['docDate']
-            # source = info['siteLabel']['value']
-            newsUrl = info['url']
-            if titleLabel == '政策解读':
-                newssoup = policy.getrequest_soup(headers, newsUrl)
-                print(newssoup)
-                contentWithTag = newssoup.find('div', id='mainText')
-                content = contentWithTag.text
-                source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
-                formatRows = info['formatRows']
-                num = 1
-                for row in formatRows:
-                    for col in row['col']:
-                        name = col['text']
-                        if name == '相关附件':
-                            value = col['value']
-                            file_href = value.keys()
-                            file_name = value.values()
-                            # upload the attachment
-                            policy.attuributefile(file_name,file_href,num,publishDate)
-                            num += 1
-                        value = col['value'][0]
-                        dic_info[name] = value
-                dic_info = {
-                    'title': title,
-                    'publishDate': publishDate,
-                    'source': source,
-                    'newsUrl': newsUrl,
-                    'file_href': file_href
-                }
-                # print(dic_info)
-                # break
# Hebei Provincial People's Government
def hebei():
    path = 'data/河北省人民政府'
...
@@ -851,10 +807,6 @@ def hebei():
            baseCore.writerToExcel(DataList, file_path, sheet_name)
            break
-# Guangdong Provincial People's Government
-def guangdong():
-    pass
# Guizhou Provincial People's Government
def guizhou():
...
@@ -963,12 +915,12 @@ def guizhou():
if __name__=="__main__":
-    file_path = f'data/REITs贵州省人民政府.xlsx'
-    wb = policy.createfile(file_path)
-    # reform(wb,file_path)
+    # file_path = f'data/REITs贵州省人民政府.xlsx'
+    # wb = policy.createfile(file_path)
+    # reform()
    # shenzhen()
-    # zhengquanqihuo(wb,file_path)
-    # sse(wb,file_path)
+    zhengquanqihuo()
+    # sse()
    # hebei()
-    guizhou()
+    # guizhou()
    # zhengquanqihuo()
\ No newline at end of file