Commit 783b7172 Author: 薛凌堃

11.29

Parent fa46345c
 import os
@@ -2,18 +2,23 @@ import os
 import time
 from urllib.parse import urljoin
-import numpy as np
-import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
+from base import BaseCore
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-import BaseCore

 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+
+policy = Policy()
+topic = 'policy'
+webname = '深圳证券交易所'

 headers = {
     'Accept': 'application/json, text/javascript, */*; q=0.01',
     'Accept-Encoding': 'gzip, deflate',
@@ -57,9 +62,8 @@ def getFjContent(url):
     return content


-def getContent(url, publishDate, num):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContent(url, publishDate, num, id_list):
+    num += 1
     ip = baseCore.get_proxy()
     req = requests.get(url, headers=headers, proxies=ip)
     req.encoding = req.apparent_encoding
@@ -67,27 +71,23 @@ def getContent(url, publishDate, num):
     soup = paserUrl(soup, 'http://www.szse.cn/')
     contentWithTag = soup.find('div', class_='des-content')
     a_list = contentWithTag.find_all('a')
-    num_ = 1
     for a in a_list:
         fj_href = a.get('href')
         if not fj_href:
             continue
-        fjhref_list += fj_href + '\n'
         fj_title = a.text.lstrip().strip()
         category = os.path.splitext(fj_href)[1]
         if category not in fj_title:
             fj_title = fj_title + category
-        fj_title = f'{num}-{publishDate}-{fj_title}'
-        fjcontent = getFjContent(fj_href)
-        file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
-        if os.path.exists(file):
-            fj_title = fj_title.replace(category,f'-{num_}{category}')
-            num_ += 1
-            file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
-        fjtitle_list += fj_title + '\n'
-        with open(file, 'wb') as f:
-            f.write(fjcontent)
-        log.info(f'{fj_title}===附件下载成功')
+        # 上传附件至obs
+        att_id,full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+        if att_id:
+            id_list.append(att_id)
+            a['href'] = full_path
     try:
         scripts = contentWithTag.find_all('script')
         for script in scripts:
@@ -102,14 +102,14 @@ def getContent(url, publishDate, num):
         pass
     pub_hao = contentWithTag.find('p').text.lstrip().strip()
     content = contentWithTag.text.lstrip().strip()
-    return pub_hao, content, fjtitle_list, fjhref_list
+    return pub_hao, content, id_list,contentWithTag


 def doJob():
-    if not os.path.exists('./相关政策/深圳证券交易所/政策文件'):
-        os.makedirs('./相关政策/深圳证券交易所/政策文件')
+    # if not os.path.exists('./相关政策/深圳证券交易所/政策文件'):
+    #     os.makedirs('./相关政策/深圳证券交易所/政策文件')
     url = 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=%5B%22rules%22,%22csrcrules%22,%22szseBussrules%22,%22memorandumServicedirect%22,%22publicadvice%22,%22lawruleSearch%22%5D&range=content&searchtype=0'
-    driver = baseCore.buildDriver()
+    driver = policy.createDriver()
     driver.get(url)
     WebDriverWait(driver, 10).until(
         EC.presence_of_element_located((By.CLASS_NAME, 'article-item'))
@@ -118,35 +118,84 @@ def doJob():
     num = 0
     data_list = []
     for div in div_list:
+        id_list = []
         title = div.find_element(By.TAG_NAME, 'a').text.lstrip().strip()
         href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
         publishDate = div.find_element(By.CLASS_NAME, 'pull-right').text.lstrip().strip()
         writtenDate = publishDate
         origin = '深圳证券交易所'
         organ = origin
+        # 文章就为pdf
+        # 根据链接判重
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
         if '.pdf' in href:
-            content = ''
             summary = ''
-            fjtitle_list = title + '.pdf'
-            fjhref_list = href
             pub_hao = ''
             fjcontent = getFjContent(href)
-            file = f'./相关政策/深圳证券交易所/政策文件/{title}.pdf'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{title}===附件下载成功')
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            dic_info = {
+                'attachmentIds': id_list,
+                'author': '',
+                'content': fjcontent,
+                'contentWithTag': '',
+                'deleteFlag': 0,
+                'id': '',
+                'title': title,
+                'publishDate': publishDate,
+                'origin': origin,
+                'sourceAddress': href,
+                'writtenDate': writtenDate,
+                'organ': organ,
+                'topicClassification': '',
+                'issuedNumber': pub_hao,
+                'summary': summary,
+                'createDate': time_now,
+                'sid': '1729029275400646658',
+            }
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, href)
+                log.info(f'采集成功--{title}--{href}')
+            except:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
         else:
             summary = div.find_element(By.CLASS_NAME, 'item-content').text.lstrip().strip()
-            pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
-        data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-                fjhref_list]
-        data_list.append(data)
-        log.info(f'{title}===采集成功')
-        num += 1
+            pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            dic_info = {
+                'attachmentIds': id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': str(contentWithTag),
+                'deleteFlag': 0,
+                'id': '',
+                'title': title,
+                'publishDate': publishDate,
+                'origin': origin,
+                'sourceAddress': href,
+                'writtenDate': writtenDate,
+                'organ': organ,
+                'topicClassification': '',
+                'issuedNumber': pub_hao,
+                'summary': summary,
+                'createDate': time_now,
+                'sid': '1729029275400646658',
+            }
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, href)
+                log.info(f'采集成功--{title}--{href}')
+            except:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
     driver.close()
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx', index=False)
+    # df = pd.DataFrame(np.array(data_list))
+    # df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
+    # df.to_excel('./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx', index=False)


 if __name__ == '__main__':
...
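Note on the new flow: each search result is now deduplicated against the Redis set 'REITs::深圳证券交易所' by source URL, packaged into dic_info, pushed to the Kafka topic 'policy', and only marked as seen after the send succeeds; if the send fails, the attachments already uploaded for that record are deleted again. Below is a minimal standalone sketch of that dedup-send-rollback pattern, assuming redis-py and kafka-python clients with placeholder connection details and a hypothetical delete_attachment callable; the BaseCore/Policy helpers used in this commit presumably wrap something similar, but their real implementations are not shown here.

import json

import redis
from kafka import KafkaProducer

# Assumed connection details; the real ones live inside BaseCore, which is not part of this diff.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
producer = KafkaProducer(
    bootstrap_servers='127.0.0.1:9092',
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)


def collect(href, dic_info, id_list, delete_attachment, topic='policy', webname='深圳证券交易所'):
    """Send one policy record, skipping source URLs that were already collected."""
    key = 'REITs::' + webname
    # 根据链接判重: skip documents whose source URL is already in the Redis set
    if r.sismember(key, href):
        return False
    try:
        # push the record first, and only mark the URL as collected once the send succeeds
        producer.send(topic, dic_info).get(timeout=10)
        r.sadd(key, href)
        return True
    except Exception:
        # roll back: drop the attachments already uploaded for this record
        # (the script calls baseCore.deliteATT(att_id) here)
        for att_id in id_list:
            delete_attachment(att_id)
        return False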