Commit e4cf70b6 Author: LiJunMing

Machinery project research report collection

Parent a33c4898
"""
"""
行业研报
研报采集字段要求 所属行业、报告名称、时间、机构名称、附件、正文内容
"""
# Industry research reports
from bs4 import BeautifulSoup
import requests
import re
import time
import json
import datetime
import redis

from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}
def clean_text(text):
    """
    Strip HTML tags and collapse redundant blank lines.
    :param text: raw HTML or plain text
    :return: cleaned plain text
    """
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text_ = re.sub('\n+', '\n', text.replace('\t', '').replace('\r', ''))
    return text_
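
# A minimal usage sketch of clean_text; the HTML snippet below is illustrative,
# not taken from an actual crawl:
#   clean_text('<p>line one</p>\n\n\n<p>line two</p>')
#   -> 'line one\nline two'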
# Redis-based URL deduplication (draft kept from an earlier revision)
# def add_check_url(article_url):
#     r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
#     res = r.sadd('report_beijing', article_url)  # stored as a Redis set
#     if res == 0:  # sadd returns 0 when the member already exists, i.e. a duplicate
#         return True
#     else:
#         return False
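
# A runnable sketch of the deduplication helper drafted above. The connection
# details and the set key 'report_beijing' come from the commented-out draft;
# whether that key is the right one for this machinery-report task is an assumption.
def add_check_url(article_url):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
    res = r.sadd('report_beijing', article_url)  # add the URL to the dedup set
    return res == 0  # 0 means the URL was already present, i.e. a duplicate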
# Industry research reports
def dongfangcaifu4():
    filename = '机械项目研报.xlsx'
    t = str(int(time.time()) * 1000)
    now = datetime.datetime.now()
    # Current date, formatted as "%Y-%m-%d"
    formatted_date = now.strftime("%Y-%m-%d")
    # Start of the query window: the same month and day, two years back
    pre_year = int(time.strftime('%Y', time.localtime(time.time()))) - 2
    month_day = time.strftime('%m-%d', time.localtime(time.time()))
    pre_date = '{}-{}'.format(pre_year, month_day)
    # The full crawl covers pages 1-1162; this run resumes from page 74
    # for i in range(1, 1163):
    for i in range(74, 75):
        log.info(f'------------- Start collecting page {i} ------------------')
        url = f'https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=1&orgCode=&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
        res = requests.get(url, headers=headers).text
        res_json = json.loads(res)
        list_all = res_json['data']
        # Skip the first 32 items on this page (apparently collected in a previous run)
        for one_news in list_all[32:]:
            dataList = []
            news_title = one_news['title']
            news_date = one_news['publishDate'][:10]
            # Institution name
            news_come = one_news['orgSName']
            # Industry name
            news_industry = one_news['industryName']
            news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
            news_res = requests.get(news_href, headers=headers)
            news_soup = BeautifulSoup(news_res.content, 'html.parser')
            # Drop the page footer if present
            try:
                news_soup.find('div', class_='c-foot').decompose()
            except AttributeError:
                pass
            # Skip reports whose detail page no longer exists
            # (the title reads "Sorry, the page you visited does not exist or has been deleted!")
            try:
                if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text:
                    continue
            except AttributeError:
                continue
            try:
                news_content = news_soup.find(id='ContentBody').text.replace(' ', '').strip()
            except AttributeError:
                # Some report pages put the body in a ctx-content container instead
                news_content = news_soup.find(class_='ctx-content').text.replace(' ', '').strip()
            news_content = clean_text(news_content)
            try:
                news_result = news_soup.find(class_='report-infos')
                news_pdf = news_result.find_all('span')[4].find('a')['href']
            except (AttributeError, IndexError, TypeError):
                # Fallback layout: the PDF link sits inside a span.to-link element
                news_pdf = news_soup.find('span', class_='to-link').find('a')['href']
# print(news_soup)
dic_post = {
'industry': news_industry, # 所属行业
'title': news_title, # 报告名称
'publishDate': news_date, # 时间
'come': news_come, # 机构名称
'url_pdf': news_pdf, # 报告链接
'content': news_content, # 内容
}
dataList.append(dic_post)
log.info(f'成功:{dic_post["title"]},{dic_post["publishDate"]},{i}')
#采集一条数据写入一条数据到excel
baseCore.writerToExcel(dataList,filename)
try:
dongfangcaifu4()
except Exception as e:
log.error(e)