Commit ca21124d authored by 薛凌堃

深圳交易所 (Shenzhen Stock Exchange)

Parent f7a4f608
import re
import time

import fitz
import requests
from bs4 import BeautifulSoup
from retry import retry

from base import BaseCore
from reits import Policy

policy = Policy()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
topic = 'policy'
webname = '深圳交易所'
def getContentA(url):
    """Download a PDF and extract its plain text with PyMuPDF (fitz)."""
    content = ''
    req = requests.get(url, headers=headers)
    try:
        # Open the response body as an in-memory PDF and concatenate page text
        with fitz.open(stream=req.content, filetype='pdf') as doc:
            for page in doc.pages():
                content += page.get_text()
    except Exception:
        # Not a parseable PDF; callers treat '' as a failure
        return ''
    return content
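# Usage sketch (the URL below is hypothetical, for illustration only -- any
# direct PDF link taken from the listing pages would do):
#   text = getContentA('http://reits.szse.cn/example/notice.pdf')
#   if not text:
#       log.info('PDF could not be parsed')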
def getContentB(url, publishDate, num):
    """Parse an HTML detail page; return text, tagged HTML, attachment ids, counter."""
    id_list = []
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    content = contentWithTag.text.strip()
    # Upload each linked attachment and rewrite its href to the stored path
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
        att_id, full_path = policy.attuributefile(file_name, href, num, publishDate)
        num += 1
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    contentWithTag_str = str(contentWithTag)
    return content, contentWithTag_str, id_list, num
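# Usage sketch (hypothetical URL; `num` is the running attachment counter and is
# threaded through calls so attachment numbering stays continuous per listing page):
#   content, html, ids, num = getContentB('http://reits.szse.cn/example/detail.html', '2023-11-27', 1)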
def doJob():
    # ... (lines collapsed in the diff view @@ -35,6 +57,7 @@)
    urls = [
        # ... earlier list entries not shown in this diff
        'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
        'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
    for url in urls:
        num = 1  # running attachment counter for this listing page
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        # ... (lines collapsed in the diff view @@ -42,13 +65,94 @@: li_list is built from `soup` here)
        for li in li_list:
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            if 'csrcorder' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcorder/')
                origin = '国家发展改革委'
            elif 'csrcannoun' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcannoun/')
                origin = '中国证监会'
            else:
                # Assumption: items on the laws index are published by the exchange
                # itself, matching the hardcoded origin of the PDF branch below
                origin = '深圳证券交易所'
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                # HTML detail page: dedup by link against the Redis set, then parse
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content, contentWithTag_str, id_list, num = getContentB(href, publishDate, num)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': contentWithTag_str,
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': origin,
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')  # '采集成功' = collected successfully
                except Exception:
                    # Kafka send failed: delete the attachments uploaded for this item
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
            else:
                # Non-HTML links point at PDF files: dedup by link, extract the
                # text, and upload the PDF itself as the attachment
                id_list = []
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content = getContentA(href)
                if not content:
                    log.info(f'{title}---{href}')
                    continue
                att_id, full_path = policy.attuributefile(title, href, num, publishDate)
                if att_id:
                    id_list.append(att_id)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': '',
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': '深圳证券交易所',
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')
                except Exception:
                    # Roll back uploaded attachments on send failure
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
if __name__ == '__main__':
    doJob()