Commit ca21124d  Author: 薛凌堃

深圳交易所

Parent f7a4f608
import re
import time
import fitz
import requests
from bs4 import BeautifulSoup
import BaseCore
from retry import retry
from reits import Policy

policy = Policy()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
topic = 'policy'
webname = '深圳交易所'
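
# BaseCore and Policy (from reits.py) are project-internal helpers; judging from the
# calls below they appear to provide a logger (getLogger), a Redis client (baseCore.r)
# used for URL deduplication, a Kafka producer (sendkafka), an attachment deleter
# (deliteATT), and an attachment uploader (policy.attuributefile). Their exact
# signatures are not shown in this commit.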
def getContentA(url):
    # Download a PDF and extract its plain text with PyMuPDF (fitz).
    content = ""
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    try:
        with fitz.open(stream=req.content, filetype='pdf') as doc:
            page_size = doc.page_count
            for page in doc.pages():
                content += page.get_text()
    except:
        # Return an empty string if the response is not a parseable PDF.
        return ''
    return content
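
# Usage sketch for getContentA (the URL below is hypothetical, for illustration only):
#   text = getContentA('http://reits.szse.cn/example/notice.pdf')
# An empty return value means the download or PDF parsing failed; the caller in
# doJob() logs the item and skips it.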
def getContentB(url, publishDate, num):
    # Parse an HTML announcement page, upload its attachments, and rewrite
    # the attachment links to point at the stored copies.
    id_list = []
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    content = contentWithTag.text.strip()
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
        att_id, full_path = policy.attuributefile(file_name, href, num, publishDate)
        num += 1
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    contentWithTag_str = str(contentWithTag)
    return content, contentWithTag_str, id_list, num
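
# getContentB returns the plain text, the serialized HTML with attachment links
# rewritten to the stored copies, the list of uploaded attachment ids, and the
# updated attachment counter num, which doJob() threads through to the next item.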
def doJob():
@@ -35,6 +57,7 @@ def doJob():
            'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
            'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
    for url in urls:
        num = 1
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
@@ -42,13 +65,94 @@ def doJob():
        for li in li_list:
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            origin = ''  # ensure origin is always defined; the CSRC sub-pages override it below
            if 'csrcorder' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcorder/')
                origin = '国家发展改革委'
            elif 'csrcannoun' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcannoun/')
                origin = '中国证监会'
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                # Deduplicate by URL against the Redis set for this site.
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content, contentWithTag_str, id_list, num = getContentB(href, publishDate, num)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': contentWithTag_str,
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': origin,
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')
                except:
                    # If the Kafka push fails, delete the attachments already uploaded for this item.
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
            else:
                id_list = []
                # Deduplicate by URL against the Redis set for this site.
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content = getContentA(href)
                if not content:
                    log.info(f'{title}---{href}')
                    continue
                # Upload the PDF itself as an attachment.
                att_id, full_path = policy.attuributefile(title, href, num, publishDate)
                if att_id:
                    id_list.append(att_id)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': '',
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': '深圳证券交易所',
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')
                except:
                    # If the Kafka push fails, delete the attachments already uploaded for this item.
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
if __name__ == '__main__':
    doJob()
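
The retry decorator is imported at the top of the module but not used in the lines shown above. If the request calls are meant to be retried, a minimal sketch with the retry package could look like the following; the decorator parameters and the fetch helper name are illustrative assumptions, not part of this commit.

from retry import retry
import requests

@retry(tries=3, delay=2, backoff=2)  # assumed policy: 3 attempts, exponential backoff
def fetch(url, headers):
    # Raise on HTTP errors so the decorator re-runs the request.
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    return resp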