Commit d7b3c3cf by LiuLiYuan

REITs topic 12/02

Parent 7ef6f432
import re

import requests
from bs4 import BeautifulSoup
from base import BaseCore
from retry import retry

baseCore = BaseCore.BaseCore()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContentA(url):
    # placeholder: not implemented in this commit
    pass


def getContentB(url):
    # fetch a detail page; attachment handling is still incomplete in this commit
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
    content = contentWithTag.text.strip()


def doJob():
    urls = ['http://reits.szse.cn/lawrule/laws/index.html',
            'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
            'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            # each entry's link and title are embedded in an inline <script>
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                getContentA(href)
            else:
                getContentB(href)


if __name__ == '__main__':
    doJob()
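
For context on the regular-expression step above: the SZSE list pages render each entry's link and title through an inline <script>, so the scraper pulls them out with re.findall. Below is a minimal, self-contained sketch of that step using a made-up <li> snippet; the markup and values are hypothetical, not copied from the live page.

# Illustration only: sample_li is a hypothetical reconstruction of one list entry.
import re
from bs4 import BeautifulSoup

sample_li = '''
<li>
  <script type="text/javascript">
    var curHref = './t20231201_123456.html';
    var curTitle ='关于发布基础设施REITs相关规则的通知';
  </script>
  <span class="time">2023-12-01</span>
</li>
'''

li = BeautifulSoup(sample_li, 'html.parser').find('li')
info = str(li.find('script'))
href = re.findall("curHref = '(.*?)';", info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
title = re.findall("curTitle ='(.*?)';", info)[0]
publishDate = li.find('span', class_='time').text.strip()
print(href, title, publishDate)

The second script below applies the same extraction but resolves the './' prefix against the listing URL itself rather than a hard-coded base, which keeps the links correct across the different rule sections.
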
import os

import re
import time

import requests
from bs4 import BeautifulSoup

import BaseCore
from reits import Policy

policy = Policy()
topic = 'policy'
webname = '深圳证券交易所REITs'
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContent(url, publishDate, num, id_list):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    # the issuing number usually sits in the first <p>; fall back to the second if it is empty
    pub_hao = contentWithTag.find('p').text.strip()
    if pub_hao == '':
        pub_hao = contentWithTag.find_all('p')[1].text.strip()
    if '号' not in pub_hao:
        pub_hao = ''
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        fj_href = a.get('href')
        if not fj_href:
            continue
        fj_title = a.text.strip()
        # keep only links whose extension looks like a downloadable file
        category = os.path.splitext(fj_href)[1]
        if '.' not in category or '.cn' in category:
            continue
        if category not in fj_title:
            fj_title = fj_title + category
        # upload the attachment to OBS
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    content = contentWithTag.text.strip()
    return pub_hao, content, id_list, str(contentWithTag)
def doJob():
    urls = ['http://reits.szse.cn/lawrule/bussrules/latest/index.html',
            'http://reits.szse.cn/lawrule/bussrules/supervise/index.html']
    num = 1
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            id_list = []
            # the link and title are embedded in an inline <script>; resolve './' against the listing URL
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            # deduplicate by link
            is_member = baseCore.r.sismember('REITs::' + webname, href)
            if is_member:
                log.info(f'{title}===already collected')
                continue
            origin = '深圳证券交易所'
            writtenDate = publishDate
            organ = '深圳证券交易所'
            summary = ''
            pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_info = {
                'attachmentIds': id_list,
                'author': '',
                'content': content,
                'contentWithTag': str(contentWithTag),
                'deleteFlag': 0,
                'id': '',
                'title': title,
                'publishDate': publishDate,
                'origin': origin,
                'sourceAddress': href,
                'writtenDate': writtenDate,
                'organ': organ,
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'summary': summary,
                'createDate': time_now,
                'sid': '1730508406971613186',
            }
            try:
                baseCore.sendkafka(dic_info, topic)
                baseCore.r.sadd('REITs::' + webname, href)
                log.info(f'collected successfully--{title}--{href}')
            except Exception:
                # if the Kafka push fails, delete the attachments that were already uploaded
                for att_id in id_list:
                    baseCore.deliteATT(att_id)
            num += 1
            time.sleep(3)


if __name__ == '__main__':
    doJob()
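
The crawl-once behaviour above relies on a Redis set keyed per source site: a link is skipped if sismember finds it, and it is only added with sadd after the Kafka push succeeds, so a failed push can be retried on the next run. A minimal sketch of that pattern with plain redis-py follows; the connection settings and helper names are illustrative, not part of the repository (baseCore.r is assumed to wrap a Redis client).

# Sketch only: hypothetical helpers around the link-dedup set used above.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)   # hypothetical connection settings
dedup_key = 'REITs::深圳证券交易所REITs'

def already_collected(href: str) -> bool:
    # True if the link was recorded by an earlier, successful run
    return bool(r.sismember(dedup_key, href))

def mark_collected(href: str) -> None:
    # call only after the record was pushed to Kafka successfully
    r.sadd(dedup_key, href)
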