Commit c155afc4  Author: 薛凌堃

Enterprise research-report crawler: proxy IP support and automatic process restart

Parent 04c8d5f3
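This commit threads a proxy lookup (baseCore.get_proxy()) into the PDF download retry loop and, when PDF parsing keeps failing, relaunches the crawler before killing the current process. A minimal sketch of the two patterns, assuming get_proxy() returns a requests-style proxies dict (BaseCore is internal, so that shape is an assumption):

import os
import subprocess
import sys
import requests

def fetch(url, headers, get_proxy):
    # up to 3 attempts, drawing a fresh proxy each time
    for _ in range(3):
        try:
            proxies = get_proxy()  # assumed shape: {'http': 'http://host:port', 'https': 'http://host:port'}
            return requests.get(url, headers=headers, proxies=proxies, verify=False, timeout=20)
        except requests.RequestException:
            continue
    return None

def restart_self():
    # relaunch this script with the same interpreter and argv, then hard-kill
    # the current process (signal 9 falls back to TerminateProcess on Windows)
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)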
import os
import subprocess
import traceback
import urllib
import uuid
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib import parse
@@ -9,6 +13,8 @@ import requests, re, time, pymysql, json, redis
from kafka import KafkaProducer
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
urllib3.disable_warnings()
from obs import ObsClient
import fitz
@@ -18,7 +24,7 @@ sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
@@ -26,7 +32,18 @@ obsClient = ObsClient(
)
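# Hedged sketch (the actual upload call sits in a collapsed hunk): with the
# Huawei OBS SDK, pushing downloaded PDF bytes would look roughly like this;
# the bucket name and object key below are hypothetical.
def _obs_upload_demo(pdf_bytes, object_key, bucket_name='zzsn'):
    resp = obsClient.putContent(bucket_name, object_key, content=pdf_bytes)
    return resp.status < 300  # the SDK reports success via an HTTP-style status code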
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)
# chromedriver = 'D:/chrome/113/chromedriver.exe'
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
@@ -72,7 +89,7 @@ def getuuid():
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
@@ -81,7 +98,7 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
cursor.execute(Upsql, values)  # run the insert
cnx.commit()  # commit the transaction
querySql = '''select id from clb_sys_attachment where type_id=4 and full_path = %s'''  # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
@@ -92,14 +109,22 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
# Redis-based de-duplication
def add_check_url(article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
# res = r.sadd(f'report_pdf_two_history', article_url,3)
res = r.sadd(f'report_pdf_three_history', article_url, 3)  # note: stored as members of a Redis set
if res == 0:  # sadd returns the number of newly added members; 0 means article_url was already present, i.e. a duplicate
return True
else:
return False
# remove the URL from Redis when its upload fails, so it can be retried
def delete_url(article_url):
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.srem('report_pdf_three_history', article_url)
if res > 0:
return True
else:
return False
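# Hedged usage sketch (not part of the original file): how the two helpers
# above pair up in a crawl loop. A URL is marked before downloading; on
# failure the mark is rolled back so a later run can retry it. fetch_and_store
# is a hypothetical stand-in for the real download() defined below.
def _dedup_flow(urls, fetch_and_store):
    for url in urls:
        if add_check_url(url):  # True: URL already in the set, skip it
            continue
        if not fetch_and_store(url):  # failed: un-mark so it is retried later
            delete_url(url)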
def uptoOBS(pdf_url, name_pdf, type_id, pathType):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
@@ -108,6 +133,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()  # draw a proxy for this attempt (note: not passed to requests.get below)
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
@@ -230,6 +256,12 @@ def download(data, order_by):
pass
else:
log.info(f'====pdf解析失败====')
delete_url(sourceAddress)
# get the pid of the current process
current_pid = baseCore.getPID()
# todo: launch a fresh copy of this script, then kill the current process
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return
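# Note on the restart above: Popen relaunches the script with the same
# interpreter and argv; os.kill(pid, 9) is a hard kill (on Windows it falls
# back to TerminateProcess), so no cleanup code in the dying process runs.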
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
@@ -296,17 +328,429 @@ def download(data, order_by):
log.info(dic_result)
return
def Mob():
url = 'https://www.mob.com/mobData/report'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_info = soup.find('span',class_='el-pagination__total').text
max_info = re.findall(r'\d{1,4}', max_info)[0]
# print(type(max_info))
max_page = int((int(max_info)/9) + 1)  # 9 reports per page, so divide and add one to cover the remainder
print(max_page)
i_id = 0
for page in range(max_page):
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
res = requests.get(url=url, headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find('ul', class_='fix')
li_list = result.find_all('li')
# for id in range(1, 149):
id = i_id
for li in li_list:
id += 1
title = li.find('div',class_='title').text
pub_date = li.find('div',class_='date tc').text.strip()  # renamed from `time`, which shadowed the time module
year = re.findall(r'\d{4}', pub_date)[0]
# for id in range(29,178):
real_id = 178 - id
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# href = 'https://www.mob.com/mobdata/report/169'
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
fin_summary = []
for s in summary_list:
summary = s.text
fin_summary.append(summary)
summary = ''.join(fin_summary)
dic_post = {
'title': title,  # report title
'url_pdf': url_pdf,  # report PDF link
'year': year,  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'XueLingKun',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': pub_date,  # publish time
'origin': 'Mob研究院',  # source
'sourceAddress': href,  # original article link
'content': '',  # content
'summary': summary,  # summary
'sid': '1662008807781212161',  # information source id
}
order_by = 1
download(dic_post,order_by)
order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
def yidong_guanxiangtai():
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page=1'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_page = soup.find(class_='results-page')
max_page = max_page.find_all('li',class_='trans')[-2].text
# print((max_page))
for page in range(int(max_page)):
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page={}'.format(page)
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find(class_='content-data-report clearfix')
div_list = result.find_all(class_='reports-list clearfix')
for div in div_list:
info_list = div.find_all(class_='download-book')
for info in info_list:
href = info.find('b').find('a')['href']
title = info.find('b').find('a')['title'].strip()
pub_date = info.find(class_='operate-book').find('p').text  # renamed from `time`, which shadowed the time module
year = re.findall(r'\d{4}', pub_date)[0]
# print(href,title,time)
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
i_result = i_soup.find(class_='report-content l')
p_list = []
plist = i_result.find_all('p')
for p in plist:
if 'img' in str(p) or 'TalkingData' in str(p) or 'http' in str(p) or '请填写相关信息' in str(p):
continue
else:
p = p.text.strip()
p_list.append(p)
# print(p_list)
summary = ''.join(p_list).replace('\n','')
# print(summary)
url_pdf = i_soup.find(class_='operate-verify').find('button')['data-url']
# print(url_pdf)
dic_post = {
'title': title,  # report title
'url_pdf': url_pdf,  # report PDF link
'year': year,  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'XueLingKun',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': pub_date,  # publish time
'origin': '移动观象台',  # source
'sourceAddress': href,  # original article link
'content': '',  # content
'summary': summary,  # summary
'sid': '1662008276140597250',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
# Juliang Suanshu (trendinsight.oceanengine.com)
def juliangsuanshu():
browser = webdriver.Chrome(chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)  # navigate to the target page
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
for one_info in list_all:
info_title = one_info.a.text.strip()
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
list_script = soup_info.find_all('script')
for script in list_script:
if 'window._SSR_DATA' in script.text:
json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '巨量算数',  # source
'sourceAddress': info_url,  # original article link
'content': '',  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008524476948481',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# 36Kr
def ke36():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe'
browser.get(url)  # navigate to the target page
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
for one_info in list_all:
info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href')
browser.get(info_url)  # navigate to the detail page
page_source = browser.page_source  # grab the rendered page source
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10]
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
dic_post = {
'title': info_title,  # report title
'url_pdf': '',  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': '',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '36氪研究院',  # source
'sourceAddress': info_url,  # original article link
'content': info_content,  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008421217378306',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# Qianyan Knowledge Base (wk.askci.com)
def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
for one_info in list_all:
info_title = one_info.a.get('title')
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
info_href = one_info.a.get('href')
info_url = 'https://wk.askci.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_pdf_url = soup_info.find('iframe',{'scrolling':'auto'}).get('src').split('pdfpath=')[1]
info_pdf = urllib.parse.unquote(info_pdf_url)
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '前沿知识库',  # source
'sourceAddress': info_url,  # original article link
'content': '',  # content
'summary': '',  # summary
'sid': '1662008620631367682',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
# # World Economic Forum (older version, kept commented out)
# def shijiejingjiluntan():
# allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
#
# url = f'https://cn.weforum.org/reports'
#
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
#
# list_all = soup.find('div',{'class':'collection-group collection-group--custom js-scroll'}).find_all('div',{'class':'report-listing-tout__content'})
#
# for one_info in list_all:
# info_title = one_info.find('h4').text.strip()
# info_date = one_info.find('div',{'class':'report-listing-tout__date'}).text.strip()
# try:
# info_pdf = one_info.find('a').get('href')
# except:
# info_pdf = ''
# list_date = info_date.replace('月','').split(' ')
# info_date = list_date[2]+'-'+allnum[list_date[1]]+'-'+list_date[0]
#
# info_href = one_info.find('a',{'class':'report-listing-tout__cta'}).get('href')
# info_url = 'https://cn.weforum.org'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
#
# info_zhaiyao = soup_info.find('div',{'class':'report__intro'}).text.strip()
# info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
#
# dic_post = {
# 'title': info_title, # 报告名称
# 'url_pdf': info_pdf, # 报告链接
# 'year': info_date[:4], # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': info_date, # 时间
# 'origin': '世界经济论坛', # 来源
# 'sourceAddress': info_url, # 原文链接
# 'content': info_zhaiyao, # 内容
# 'summary': info_content, # 摘要
# 'sid': '1662008019231088642', # 信息源id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# World Economic Forum
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}  # Chinese month numerals mapped to two-digit months
for i in range(10, 128):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url)  # navigate to the target page
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
time.sleep(2)
list_all = soup.find('div', {'class':'wef-qrllub'}).find_all('div',{'class':'wef-184hs11'})
time.sleep(2)
for one_info in list_all:
tag = one_info.find('div', class_='wef-wx6hgt').find_all('div',class_='wef-0')[1]
info_title = tag.find('a').text.strip()
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d')
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except:
info_pdf = ''
info_href = tag.find('a').get('href')
res_info = requests.get(info_href)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '世界经济论坛',  # source
'sourceAddress': info_href,  # original article link
'content': info_content,  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008019231088642',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# East Money (eastmoney.com)
def dongfangcaifu():
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
cnx2 = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
list_short_name = []
list_social_code = []
with cnx2.cursor() as cursor:
sel_sql = '''select securities_short_name,social_credit_code from sys_base_enterprise_ipo where category = 1'''
cursor.execute(sel_sql)
selects = cursor.fetchall()
@@ -336,11 +780,14 @@ def dongfangcaifu():
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]  # [1:-1] strips the jsonp wrapper parentheses
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
@@ -418,7 +865,6 @@ def dongfangcaifu():
# if len(list_all) != 10:
# break
# East Money, variant 2
def dongfangcaifu2():
list_short_name = ['新', '的', '电', '能']
@@ -525,9 +971,10 @@ def dongfangcaifu3():
# log.info("格式化后的日期为:", formatted_date)
# for i in range(1,1349):
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=0&orgCode=&code=*&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = 'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime=2021-06-13&endTime=2023-06-13&pageNo=1&fields=&qType=0&orgCode=&code=*&rcode=&p=1&pageNum=1&pageNumber=1&_=1686645164397'
res = requests.get(url=url, headers=headers, verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -538,7 +985,7 @@ def dongfangcaifu3():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -548,7 +995,7 @@ def dongfangcaifu3():
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -604,10 +1051,15 @@ def dongfangcaifu4():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=1&orgCode=&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = "https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime=2021-06-27&endTime=2023-06-27&pageNo=6&fields=&qType=1&orgCode=&rcode=&p=6&pageNum=6&pageNumber=6&_=1687831020493"
for _ in range(0, 3):  # retry loop; `_` avoids shadowing the outer page index `i`
try:
res = requests.get(url=url,headers=headers,verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -620,7 +1072,7 @@ def dongfangcaifu4():
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -628,7 +1080,7 @@ def dongfangcaifu4():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -693,10 +1145,11 @@ def dongfangcaifu5():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 10):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/newStockList?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=4&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
log.info(res)
res_json = json.loads(res)
@@ -708,7 +1161,7 @@ def dongfangcaifu5():
# log.info(news_title)
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -716,7 +1169,7 @@ def dongfangcaifu5():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -773,9 +1226,15 @@ def dongfangcaifu6():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=3&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
for _ in range(0, 3):  # retry loop; `_` avoids shadowing the outer page index `i`
try:
res = requests.get(url=url, headers=headers, verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -786,7 +1245,7 @@ def dongfangcaifu6():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -798,7 +1257,7 @@ def dongfangcaifu6():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -862,9 +1321,10 @@ def dongfangcaifu7():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 3):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=2&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -875,7 +1335,7 @@ def dongfangcaifu7():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -887,7 +1347,7 @@ def dongfangcaifu7():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -962,7 +1422,8 @@ if __name__ == '__main__':
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu')
@@ -974,33 +1435,38 @@ if __name__ == '__main__':
# dongfangcaifu2()
# except:
# pass
try:
log.info('dongfangcaifu3')
dongfangcaifu3()
except Exception as e:
log.info(e)
pass
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass