Commit 5d788bc9 Author: 薛凌堃

REITs topic

Parent c702fb7b
# REITs topic core toolkit
......@@ -5,6 +5,7 @@ import random
import socket
import sys
import time
import uuid
import fitz
import logbook
......@@ -252,7 +253,7 @@ class BaseCore:
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
self.pool_caiji = PooledDB(
creator=pymysql,
......@@ -451,6 +452,7 @@ class BaseCore:
# def doc_page(self,file_path):
# doc = Document(file_path)
# return len(doc.sections)
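# Delete an attachment record from clb_sys_attachment by id; used below to roll back registered attachments when a downstream send fails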
def deliteATT(self,id):
delitesql = f"delete from clb_sys_attachment where id = '{id}' "
self.cursor_.execute(delitesql)
......@@ -492,6 +494,9 @@ class BaseCore:
id = selects[0]
return id,full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1()  # generate a uuid from the timestamp, guaranteed globally unique
return get_timestamp_uuid
# Get the file size
def convert_size(self,size_bytes):
......@@ -520,37 +525,25 @@ class BaseCore:
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
# name = file_name
if category in file_name:
pass
else:
file_name = file_name + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
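# Name the uploaded object with a time-based uuid (getuuid) and store it under the PolicyDocument/ prefix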
file_name = str(self.getuuid()) + category
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
break
except:
time.sleep(3)
continue
if page_size < 1:
# PDF parsing failed
# print(f'======PDF parsing failed=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
def sendkafka(self, post_data, topic):
try:
......
import os
......@@ -107,7 +107,7 @@ class Policy():
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '9999', file_name)
retData = baseCore.uptoOBS(file_href, '', file_name)
if retData['state']:
pass
else:
......@@ -136,7 +136,7 @@ class Policy():
policy = Policy()
# National Development and Reform Commission https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform(wb,file_path):
def reform():
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -153,22 +153,30 @@ def reform(wb,file_path):
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
DataList = []
# DataList = []
num = 0
path = 'data/国家改革发展委员会'
if not os.path.exists(path):
os.makedirs(path)
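# webname keys the per-site Redis dedup set (REITs::<webname>) checked before each article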
webname = '中华人民共和国国家发展和改革委员会'
# path = 'data/国家改革发展委员会'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(1,3):
url = f'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page={page}&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
result = policy.getrequest_json(headers, url)
data_list = result['data']['resultList']
for info in data_list:
num += 1
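# Collect the attachment ids for this article so they can be removed again if the Kafka send fails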
id_list = []
# info = data_list[1]
publishDate_ = info['docDate']
title = info['title']
summary = info['summary'].replace('<em>','').replace('</em>','')
newsUrl = info['url']
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -190,6 +198,7 @@ def reform(wb,file_path):
}
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
policy.paserUrl(newssoup, newsUrl)
try:
pubHao = ''
source = ''
......@@ -229,20 +238,19 @@ def reform(wb,file_path):
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.match(pattern, publishDate)
if match:
date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
publishDate = date1.strftime("%Y-%m-%d")
pass
else:
publishDate = ''
policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
policy.deletek(contentWithTag)
content = contentWithTag.text
try:
policy.paserUrl(newssoup,newsUrl)
att = newssoup.find('div', class_='attachment_r')
fu_jian_name = ''
fu_jian_href = ''
except:
fu_jian_name = ''
fu_jian_href = ''
att = ''
if att:
for a in att.find_all('a'):
......@@ -255,49 +263,61 @@ def reform(wb,file_path):
pass
else:
file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
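# Upload the attachment through attuributefile, rewrite the in-page link to the stored copy, and keep the returned id for rollback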
att_id,full_path = policy.attuributefile(file_name,file_href,num,publishDate_)
if att_id:
id_list.append(att_id)
a['href'] = full_path
contentWithTag_str = str(contentWithTag) + str(newssoup.find('div', class_='attachment'))
else:
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate_,
'来源': source,
'原文链接': newsUrl,
'发文时间': publishDate,
'发文机构': '',
'发文字号': pubHao,
'摘要': summary,
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate_,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': publishDate,
'organ': '',
'topicClassification': '',
'issuedNumber': pubHao,
'summary': summary,
'createDate': time_now,
'sid': '1729029275400646658',
}
DataList.append(dic_info)
# DataList.append(dic_info)
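# Push the record to Kafka; only mark the URL as seen in Redis after a successful send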
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
sheet_name = "国家发展和改革委员会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# Create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# Save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
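# On failure, delete the attachment records that were already registered so they are not left orphaned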
for att_id in id_list:
baseCore.deliteATT(att_id)
# sheet_name = "国家发展和改革委员会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # Create a new worksheet
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # Save the Excel file
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{newsUrl}")
log.info({e})
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
# Securities and futures https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
def zhengquanqihuo(wb,file_path):
def zhengquanqihuo():
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -337,11 +357,12 @@ def zhengquanqihuo(wb,file_path):
total = pageUtil['rowCount']
page_size = pageUtil['pageSize']
Max_page = int(total / page_size)
DataList = []
# DataList = []
num = 0
path = 'data/证监会'
if not os.path.exists(path):
os.makedirs(path)
webname = '证券期货法规数据库系统'
# path = 'data/证监会'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(0, Max_page+1):
payload_page = {
'pageNo': page + 1,
......@@ -359,6 +380,7 @@ def zhengquanqihuo(wb,file_path):
data_page = policy.requestPost(headers, url, payload_page)
info_list = data_page['pageUtil']['pageList']
for info in info_list:
id_list = []
num += 1
try:
title = info['secFutrsLawName']
......@@ -369,41 +391,63 @@ def zhengquanqihuo(wb,file_path):
# print(publishDate)
secFutrsLawId = info['secFutrsLawId']
newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
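# Load the detail page in a Selenium driver and parse the rendered page_source with BeautifulSoup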
browser = policy.createDriver()
browser.get(newsUrl)
time.sleep(1)
page_source = browser.page_source
newssoup = BeautifulSoup(page_source, 'html.parser')
policy.paserUrl(newssoup,newsUrl)
# print(newssoup)
contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
content = contentWithTag.text.replace('显示注释', '')
# print(content)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate,
'来源': source,
'原文链接': newsUrl,
'发文时间': publishDate,
'发文机构': source,
'发文字号': pubHao,
'摘要': '',
'正文': content,
'附件名称': '',
'附件链接': '',
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': publishDate,
'organ': source,
'issuedNumber': pubHao,
'summary': '',
'topicClassification': '',
'createDate': time_now,
'sid': '1729030277461815298',
}
DataList.append(dic_info)
sheet_name = "证监会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# Create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# Save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
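# Same pattern as above: send to Kafka, record the URL in the Redis dedup set, and roll back attachment records on failure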
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
# DataList.append(dic_info)
# sheet_name = "证监会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # Create a new worksheet
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # Save the Excel file
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{num}")
log.info({e})
......@@ -428,9 +472,10 @@ def sse(wb,file_path):
total_page = result['data']['totalPage']
DataList = []
num = 0
path = 'data/上海交易所'
if not os.path.exists(path):
os.makedirs(path)
webname = '上海证券交易所'
# path = 'data/上海交易所'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(0, int(total_page)):
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
data = policy.getrequest_json(headers, url_page)
......@@ -456,9 +501,14 @@ def sse(wb,file_path):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
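# PDF hits: download the file, extract its text with fitz, and register the PDF itself as the attachment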
if '.pdf' in newsUrl:
fu_jian_name = ''
fu_jian_href = ''
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
......@@ -466,10 +516,10 @@ def sse(wb,file_path):
content += page.get_text()
file_href = newsUrl
file_name = title
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
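# The attachment is now registered through attuributefile instead of being saved under the local data directory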
policy.attuributefile(title, newsUrl, num, publishDate)
dic_info = {
'序号': num,
'标题': title,
......@@ -553,100 +603,6 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name)
# Beijing Municipal People's Government https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size)
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
# print(info_list)
for info_ in info_list:
info = info_['data']
title = info['titleO']
titleLabel = info['titleLabel']['value']
publishDate = info['docDate']
# source = info['siteLabel']['value']
newsUrl = info['url']
if titleLabel == '政策解读':
newssoup = policy.getrequest_soup(headers, newsUrl)
print(newssoup)
contentWithTag = newssoup.find('div', id='mainText')
content = contentWithTag.text
source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
formatRows = info['formatRows']
num = 1
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
file_href = value.keys()
file_name = value.values()
# Upload attachments
policy.attuributefile(file_name,file_href,num,publishDate)
num += 1
value = col['value'][0]
dic_info[name] = value
dic_info = {
'title': title,
'publishDate': publishDate,
'source': source,
'newsUrl': newsUrl,
'file_href': file_href
}
# print(dic_info)
# break
# Hebei Provincial People's Government
def hebei():
path = 'data/河北省人民政府'
......@@ -851,10 +807,6 @@ def hebei():
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
# Guangdong Provincial People's Government
def guangdong():
pass
# Guizhou Provincial People's Government
def guizhou():
......@@ -963,12 +915,12 @@ def guizhou():
if __name__=="__main__":
file_path = f'data/REITs贵州省人民政府.xlsx'
wb = policy.createfile(file_path)
# reform(wb,file_path)
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
# reform()
# shenzhen()
# zhengquanqihuo(wb,file_path)
# sse(wb,file_path)
zhengquanqihuo()
# sse()
# hebei()
guizhou()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file