Commit c155afc4  Author: 薛凌堃 (Xue Lingkun)

Enterprise research-report crawler: use proxy IPs and restart the process automatically on failure

Parent 04c8d5f3
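The commit title refers to the self-restart logic added in download(): when PDF parsing fails, the script removes the URL from the Redis de-dup set, launches a fresh copy of itself with the same arguments, and then kills the current process. A minimal standalone sketch of that pattern (standard library only; the real script obtains its PID via baseCore.getPID()):

import os
import subprocess
import sys

def restart_self():
    # Re-launch this script with the same interpreter and arguments,
    # then terminate the current process so only the new copy keeps running.
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)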
import os
import subprocess
import traceback
import urllib
import uuid
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib import parse
@@ -9,6 +13,8 @@ import requests, re, time, pymysql, json, redis
from kafka import KafkaProducer
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
urllib3.disable_warnings()
from obs import ObsClient
import fitz
@@ -18,7 +24,7 @@ sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
@@ -26,7 +32,18 @@ obsClient = ObsClient(
)
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)
# chromedriver = 'D:/chrome/113/chromedriver.exe'
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
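The Selenium options and driver path configured above are consumed by the crawlers added later in this commit (ke36 and shijiejingjiluntan). A usage sketch, assuming the Chrome binary and chromedriver paths above exist on the target machine (Selenium 3-style keyword arguments, matching the calls in this script):

browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
try:
    browser.get('https://36kr.com/academe')  # any of the report pages scraped below
    html = browser.page_source
finally:
    browser.quit()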
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
@@ -72,7 +89,7 @@ def getuuid():
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
@@ -81,7 +98,7 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
cursor.execute(Upsql, values)  # insert
cnx.commit()  # commit
querySql = '''select id from clb_sys_attachment where type_id=4 and full_path = %s'''  # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
@@ -92,14 +109,22 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
# Redis de-duplication
def add_check_url(article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
# res = r.sadd(f'report_pdf_two_history', article_url,3)
res = r.sadd(f'report_pdf_three_history', article_url, 3)  # note: stored as a Redis set
if res == 0:  # 0 means nothing was added, i.e. the URL is a duplicate
return True
else:
return False
# remove the entry from Redis when the upload fails
def delete_url(article_url):
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.srem('report_pdf_three_history', article_url)
if res > 0:
return True
else:
return False
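Together, add_check_url and delete_url give the crawler retry-able de-duplication: r.sadd returns 0 when nothing new was added to the set (the URL was already recorded), and r.srem removes the entry again so a failed download can be retried on a later run. A usage sketch of that flow (process and do_download are hypothetical names; the real caller is download() below):

def process(article_url):
    # Skip URLs already recorded in the report_pdf_three_history set.
    if add_check_url(article_url):
        return
    # do_download stands in for the real fetch/parse/upload step.
    if not do_download(article_url):
        # On failure, forget the URL so a later run retries it.
        delete_url(article_url)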
def uptoOBS(pdf_url, name_pdf, type_id, pathType):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
@@ -108,6 +133,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
@@ -230,6 +256,12 @@ def download(data, order_by):
pass
else:
log.info(f'====pdf解析失败====')
delete_url(sourceAddress)
# get the PID of the current process
current_pid = baseCore.getPID()
# todo: spawn a new process, then kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
@@ -296,17 +328,429 @@ def download(data, order_by):
log.info(dic_result)
return
def Mob():
url = 'https://www.mob.com/mobData/report'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_info = soup.find('span',class_='el-pagination__total').text
max_info = re.findall('\d{1,4}',max_info)[0]
# print(type(max_info))
max_page = int((int(max_info)/9) + 1)
print(max_page)
i_id = 0
for page in range(max_page):
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
res = requests.get(url=url, headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find('ul', class_='fix')
li_list = result.find_all('li')
# for id in range(1, 149):
id = i_id
for li in li_list:
id += 1
title = li.find('div',class_='title').text
time = li.find('div',class_='date tc').text.strip()
year = re.findall('\d{4}',time)[0]
# for id in range(29,178):
real_id = 178 - id
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# href = 'https://www.mob.com/mobdata/report/169'
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
fin_summary = []
for s in summary_list:
summary = s.text
fin_summary.append(summary)
summary = ''.join(fin_summary)
dic_post = {
'title': title, # 报告名称
'url_pdf': url_pdf, # 报告链接
'year': year, # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': time, # 时间
'origin': 'Mob研究院', # 来源
'sourceAddress': href, # 原文链接
'content': '', # 内容
'summary': summary, # 摘要
'sid': '1662008807781212161', # 信息源id
}
order_by = 1
download(dic_post,order_by)
order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
def yidong_guanxiangtai():
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page=1'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_page = soup.find(class_='results-page')
max_page = max_page.find_all('li',class_='trans')[-2].text
# print((max_page))
for page in range(int(max_page)):
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page={}'.format(page)
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find(class_='content-data-report clearfix')
div_list = result.find_all(class_='reports-list clearfix')
for div in div_list:
info_list = div.find_all(class_='download-book')
for info in info_list:
href = info.find('b').find('a')['href']
title = info.find('b').find('a')['title'].strip()
time = info.find(class_='operate-book').find('p').text
year = re.findall('\d{4}', time)[0]
# print(href,title,time)
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
i_result = i_soup.find(class_='report-content l')
p_list = []
plist = i_result.find_all('p')
for p in plist:
if 'img' in str(p) or 'TalkingData' in str(p) or 'http' in str(p) or '请填写相关信息' in str(p):
continue
else:
p = p.text.strip()
p_list.append(p)
# print(p_list)
summary = ''.join(p_list).replace('\n','')
# print(summary)
url_pdf = i_soup.find(class_='operate-verify').find('button')['data-url']
# print(url_pdf)
dic_post = {
'title': title, # 报告名称
'url_pdf': url_pdf, # 报告链接
'year': year, # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': time, # 时间
'origin': '移动观象台', # 来源
'sourceAddress': href, # 原文链接
'content': '', # 内容
'summary': summary, # 摘要
'sid': '1662008276140597250', # 信息源id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
# 巨量算数
def juliangsuanshu():
browser = webdriver.Chrome(chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)#跳到指定页面
page_source = browser.page_source#获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
for one_info in list_all:
info_title = one_info.a.text.strip()
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
list_script = soup_info.find_all('script')
for script in list_script:
if 'window._SSR_DATA' in script.text:
json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
dic_post = {
'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接
'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间
'origin': '巨量算数', # 来源
'sourceAddress': info_url, # 原文链接
'content': '', # 内容
'summary': info_zhaiyao, # 摘要
'sid': '1662008524476948481', # 信息源id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# 36氪
def ke36():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe'
browser.get(url)#跳到指定页面
page_source = browser.page_source#获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
for one_info in list_all:
info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href')
browser.get(info_url)#跳到指定页面
page_source = browser.page_source#获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10]
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
dic_post = {
'title': info_title, # 报告名称
'url_pdf': '', # 报告链接
'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': '', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间
'origin': '36氪研究院', # 来源
'sourceAddress': info_url, # 原文链接
'content': info_content, # 内容
'summary': info_zhaiyao, # 摘要
'sid': '1662008421217378306', # 信息源id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# 前沿知识库
def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
for one_info in list_all:
info_title = one_info.a.get('title')
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
info_href = one_info.a.get('href')
info_url = 'https://wk.askci.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_pdf_url = soup_info.find('iframe',{'scrolling':'auto'}).get('src').split('pdfpath=')[1]
info_pdf = urllib.parse.unquote(info_pdf_url)
dic_post = {
'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接
'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间
'origin': '前沿知识库', # 来源
'sourceAddress': info_url, # 原文链接
'content': '', # 内容
'summary': '', # 摘要
'sid': '1662008620631367682', # 信息源id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
# # 世界经济论坛
# def shijiejingjiluntan():
# allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
#
# url = f'https://cn.weforum.org/reports'
#
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
#
# list_all = soup.find('div',{'class':'collection-group collection-group--custom js-scroll'}).find_all('div',{'class':'report-listing-tout__content'})
#
# for one_info in list_all:
# info_title = one_info.find('h4').text.strip()
# info_date = one_info.find('div',{'class':'report-listing-tout__date'}).text.strip()
# try:
# info_pdf = one_info.find('a').get('href')
# except:
# info_pdf = ''
# list_date = info_date.replace('月','').split(' ')
# info_date = list_date[2]+'-'+allnum[list_date[1]]+'-'+list_date[0]
#
# info_href = one_info.find('a',{'class':'report-listing-tout__cta'}).get('href')
# info_url = 'https://cn.weforum.org'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
#
# info_zhaiyao = soup_info.find('div',{'class':'report__intro'}).text.strip()
# info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
#
# dic_post = {
# 'title': info_title, # 报告名称
# 'url_pdf': info_pdf, # 报告链接
# 'year': info_date[:4], # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': info_date, # 时间
# 'origin': '世界经济论坛', # 来源
# 'sourceAddress': info_url, # 原文链接
# 'content': info_zhaiyao, # 内容
# 'summary': info_content, # 摘要
# 'sid': '1662008019231088642', # 信息源id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# 世界经济论坛
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
for i in range(10, 128):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url) # 跳到指定页面
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source # 获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
time.sleep(2)
list_all = soup.find('div', {'class':'wef-qrllub'}).find_all('div',{'class':'wef-184hs11'})
time.sleep(2)
for one_info in list_all:
tag = one_info.find('div', class_='wef-wx6hgt').find_all('div',class_='wef-0')[1]
info_title = tag.find('a').text.strip()
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d')
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except:
info_pdf = ''
info_href = tag.find('a').get('href')
res_info = requests.get(info_href)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
dic_post = {
'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接
'year': info_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': 'YanBao', # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': info_date, # 时间
'origin': '世界经济论坛', # 来源
'sourceAddress': info_href, # 原文链接
'content': info_content, # 内容
'summary': info_zhaiyao, # 摘要
'sid': '1662008019231088642', # 信息源id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# 东方财富网
def dongfangcaifu():
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
cnx2 = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
list_short_name = []
list_social_code = []
with cnx2.cursor() as cursor:
sel_sql = '''select securities_short_name,social_credit_code from sys_base_enterprise_ipo where category = 1'''
cursor.execute(sel_sql)
selects = cursor.fetchall()
@@ -336,11 +780,14 @@ def dongfangcaifu():
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
@@ -418,7 +865,6 @@ def dongfangcaifu():
# if len(list_all) != 10:
# break
# 东方财富网2
def dongfangcaifu2():
list_short_name = ['新', '的', '电', '能']
@@ -525,9 +971,10 @@ def dongfangcaifu3():
# log.info("格式化后的日期为:", formatted_date)
# for i in range(1,1349):
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=0&orgCode=&code=*&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = 'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime=2021-06-13&endTime=2023-06-13&pageNo=1&fields=&qType=0&orgCode=&code=*&rcode=&p=1&pageNum=1&pageNumber=1&_=1686645164397'
res = requests.get(url=url, headers=headers, verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -538,7 +985,7 @@ def dongfangcaifu3():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-14"
# skip reports published before comparison_date
if news_date < comparison_date:
continue
@@ -548,7 +995,7 @@ def dongfangcaifu3():
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -604,10 +1051,15 @@ def dongfangcaifu4():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=1&orgCode=&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = "https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime=2021-06-27&endTime=2023-06-27&pageNo=6&fields=&qType=1&orgCode=&rcode=&p=6&pageNum=6&pageNumber=6&_=1687831020493"
for i in range(0,3):
try:
res = requests.get(url=url,headers=headers,verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -620,7 +1072,7 @@ def dongfangcaifu4():
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-14"
# skip reports published before comparison_date
if news_date < comparison_date:
continue
@@ -628,7 +1080,7 @@ def dongfangcaifu4():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -693,10 +1145,11 @@ def dongfangcaifu5():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 10):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/newStockList?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=4&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
log.info(res)
res_json = json.loads(res)
@@ -708,7 +1161,7 @@ def dongfangcaifu5():
# log.info(news_title)
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
comparison_date = "2023-12-14"
# skip reports published before comparison_date
if news_date < comparison_date:
continue
@@ -716,7 +1169,7 @@ def dongfangcaifu5():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -773,9 +1226,15 @@ def dongfangcaifu6():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=3&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
for i in range(0, 3):
try:
res = requests.get(url=url, headers=headers, verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -786,7 +1245,7 @@ def dongfangcaifu6():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-14"
# skip reports published before comparison_date
if news_date < comparison_date:
continue
@@ -798,7 +1257,7 @@ def dongfangcaifu6():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -862,9 +1321,10 @@ def dongfangcaifu7():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 3):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=2&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -875,7 +1335,7 @@ def dongfangcaifu7():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-14"
# skip reports published before comparison_date
if news_date < comparison_date:
continue
@@ -887,7 +1347,7 @@ def dongfangcaifu7():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -962,7 +1422,8 @@ if __name__ == '__main__':
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu')
@@ -974,33 +1435,38 @@ if __name__ == '__main__':
# dongfangcaifu2()
# except:
# pass
#
try:
log.info('dongfangcaifu3')
dongfangcaifu3()
except Exception as e:
log.info(e)
# pass
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass