Commit be4f79be Author: 薛凌堃

Research reports

Parent 5d5dff2b
......@@ -257,11 +257,11 @@ def download(data, order_by):
else:
log.info(f'====PDF parse failed====')
delete_url(sourceAddress)
# get the current process PID
current_pid = baseCore.getPID()
# todo: spawn a fresh process, then kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
# # get the current process PID
# current_pid = baseCore.getPID()
# # todo: spawn a fresh process, then kill the current one
# subprocess.Popen([sys.executable] + sys.argv)
# os.kill(current_pid, 9)
return
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
......@@ -328,37 +328,156 @@ def download(data, order_by):
log.info(dic_result)
return
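
For reference, a minimal sketch of the restart-then-exit pattern in the commented-out block above, assuming the script is safe to re-run from the top; os.getpid() stands in for the repo's baseCore.getPID() helper, and signal 9 is a hard kill that skips cleanup handlers:

import os
import subprocess
import sys

def restart_self():
    # relaunch this script with the same interpreter and argv,
    # then hard-kill the current process so only the new copy survives
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)
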
# def Mob():
# url = 'https://www.mob.com/mobData/report'
# res = requests.get(url=url,headers=headers).content
# soup = BeautifulSoup(res,'html.parser')
# max_info = soup.find('span',class_='el-pagination__total').text
# max_info = re.findall('\d{1,4}',max_info)[0]
# # print(type(max_info))
# max_page = int((int(max_info)/9) + 1)
# print(max_page)
# i_id = 0
# for page in range(max_page):
# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
# res = requests.get(url=url, headers=headers).content
# soup = BeautifulSoup(res, 'html.parser')
# result = soup.find('ul', class_='fix')
# li_list = result.find_all('li')
# # for id in range(1, 149):
# id = i_id
# for li in li_list:
# id += 1
# title = li.find('div',class_='title').text
# time = li.find('div',class_='date tc').text.strip()
# year = re.findall('\d{4}',time)[0]
# # for id in range(29,178):
# real_id = 178 - id
# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# # href = 'https://www.mob.com/mobdata/report/169'
# res_href = requests.get(url=href,headers=headers).content
# i_soup = BeautifulSoup(res_href,'html.parser')
# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
# fin_summary = []
# for s in summary_list:
# summary = s.text
# fin_summary.append(summary)
# summary = ''.join(fin_summary)
# dic_post = {
# 'title': title, # report name
# 'url_pdf': url_pdf, # report URL
# 'year': year, # report year
# 'type_id': '4', # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
# 'item_id': 'YanBao', # related record id, e.g. an enterprise credit code
# 'category': 'pdf', # file extension, e.g. pdf
# 'create_by': 'XueLingKun', # creator, CamelCase, e.g. TangYuHang
# 'publishDate': time, # publish date
# 'origin': 'Mob研究院', # source
# 'sourceAddress': href, # original link
# 'content': '', # body content
# 'summary': summary, # abstract
# 'sid': '1662008807781212161', # info source id
# }
# order_by = 1
# download(dic_post,order_by)
# order_by += 1
# # print(dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# i_id += 9
def Mob():
url = 'https://www.mob.com/mobData/report'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_info = soup.find('span',class_='el-pagination__total').text
max_info = re.findall(r'\d{1,4}', max_info)[0]
# print(type(max_info))
max_page = (int(max_info) + 8) // 9  # ceiling division: 9 reports per listing page
print(max_page)
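
A quick check of the ceiling-division idiom used for max_page above (the page size of 9 comes from the listing layout; this is a generic sketch, not Mob-specific):

def page_count(total_items, page_size=9):
    # ceiling division without math.ceil: 0 -> 0 pages, 9 -> 1, 10 -> 2
    return (total_items + page_size - 1) // page_size

assert page_count(0) == 0
assert page_count(9) == 1
assert page_count(10) == 2
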
i_id = 0
for page in range(max_page):
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
res = requests.get(url=url, headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find('ul', class_='fix')
li_list = result.find_all('li')
# for id in range(1, 149):
id = i_id
for li in li_list:
id += 1
title = li.find('div',class_='title').text
date_str = li.find('div', class_='date tc').text.strip()  # renamed to avoid shadowing the time module
year = re.findall(r'\d{4}', date_str)[0]
# for id in range(29,178):
real_id = 178 - id
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# href = 'https://www.mob.com/mobdata/report/169'
res_href = requests.get(url=href,headers=headers).content
# loginfo = baseCore.redicPullData('Mob:loginfo')
# account = loginfo.split('|')[0]
# password = loginfo.split('|')[1]
# usecount = loginfo.split('|')[2]
usecount = 0
# test credentials
# account = '13636711746'
# password = 'Zhenghao123'
# account = '18703752600'
# password = 'Axlk010208!'
# account = '13273737131'
# password = 'liu1230...'
# account = '15237560528'
# password = 'xlk123456!'
# account = '17103126138'
# password = '171BlackOne'
account = '17103128590'
password = '171BlackTwo'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
f_url = 'https://www.mob.com/developer/login'
browser.get(f_url)
browser.find_element(By.CLASS_NAME, 's1').click()
browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
if usecount < 5:
pass
else:
return Mob()
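
The login flow above clicks elements as soon as the page loads, which is flaky on slow renders. A hedged alternative sketch with explicit waits, selectors copied from the code above (WebDriverWait and expected_conditions are the standard Selenium support imports):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login_mob(browser, account, password):
    # wait for the account-login tab before touching the form
    wait = WebDriverWait(browser, 30)
    wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 's1'))).click()
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'input[type="text"]'))).send_keys(account)
    browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(password)
    browser.find_element(
        By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section'
                  '/div[3]/div/form/div[3]/div/button/span').click()
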
# grab the post-login state
# url = browser.current_url
# print(url)
url = 'https://www.mob.com/mobdata/report'
browser.get(url)
# tags = browser.find_elements(By.CLASS_NAME, 'main-title')
# for tag in tags:
# if 'Mob研究院' in tag.text:
# tag.click()
# else:
# continue
# # try:
# # web = tag.find_element(By.CLASS_NAME, "")
# # web.click()
# # break
# # except:
# # continue
cookies_list = browser.get_cookies()
cookies = {}
# pull name/value out of each Selenium cookie and convert to a dict requests can use
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
# cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
# cookies_ = json.dumps(cookies)
session = requests.session()
session.cookies.update(cookies)
for i in range(5):
url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
req = session.get(url=url, headers=headers)
data_json = req.json()
news_list = data_json['data']['list']
for info in news_list:
title = info['title']
publishDate = info['effective_date']
year = publishDate[:4]
report_id = info['report_id']
href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
# tf_url = add_check_url(href)
is_member = r.sismember('report_pdf_three_history', href)
if is_member:
continue
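
For context, a minimal sketch of the Redis-set dedup used here, assuming r is a redis.Redis client; the matching sadd that records a collected URL presumably lives in download()/add_check_url, outside this diff:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # assumed connection params

def seen_before(href):
    # membership test against the set of already-collected report URLs
    return r.sismember('report_pdf_three_history', href)

def mark_seen(href):
    r.sadd('report_pdf_three_history', href)
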
res_href = session.get(url=href, headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
# headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
# headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
# headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
# headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
# headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
news_req = session.get(url=news_url,headers=headers)
pdf_url = news_req.json()['data']
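
A hedged sketch of the download call above as a helper: the session carries the login cookies, headers is the script's module-level dict, and the hard-coded token will expire, so it is treated as a parameter:

def fetch_pdf_url(session, report_id, token):
    # the download API returns JSON whose 'data' field is the signed PDF URL
    api = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
    resp = session.get(api, headers={**headers, 'token': token}, timeout=30)
    resp.raise_for_status()
    return resp.json()['data']
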
fin_summary = []
for s in summary_list:
summary = s.text
......@@ -366,13 +485,13 @@ def Mob():
summary = ''.join(fin_summary)
dic_post = {
'title': title, # report name
'url_pdf': url_pdf, # report URL
'url_pdf': pdf_url, # report URL
'year': year, # report year
'type_id': '4', # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao', # related record id, e.g. an enterprise credit code
'category': 'pdf', # file extension, e.g. pdf
'create_by': 'XueLingKun', # creator, CamelCase, e.g. TangYuHang
'publishDate': time, # publish date
'publishDate': publishDate, # publish date
'origin': 'Mob研究院', # source
'sourceAddress': href, # original link
'content': '', # body content
......@@ -382,12 +501,7 @@ def Mob():
order_by = 1
download(dic_post,order_by)
order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
def yidong_guanxiangtai():
......@@ -452,30 +566,83 @@ def yidong_guanxiangtai():
# print(res.json())
# 巨量算数
def juliangsuanshu():
browser = webdriver.Chrome(chromedriver)
# # 巨量算数
# def juliangsuanshu():
# # browser = webdriver.Chrome(chromedriver)
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
#
# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
# browser.get(url)  # navigate to the target page
#
# page_source = browser.page_source  # grab the page source
# soup = BeautifulSoup(page_source, 'html.parser')
#
# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
# for one_info in list_all:
# info_title = one_info.a.text.strip()
# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
# info_href = one_info.a.get('href')
# info_url = 'https://trendinsight.oceanengine.com'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
# list_script = soup_info.find_all('script')
# for script in list_script:
# if 'window._SSR_DATA' in script.text:
# json_str = script.text
# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
#
# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
#
# dic_post = {
# 'title': info_title, # report name
# 'url_pdf': info_pdf, # report URL
# 'year': info_date[:4], # report year
# 'type_id': '4', # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
# 'item_id': 'YanBao', # related record id, e.g. an enterprise credit code
# 'category': 'pdf', # file extension, e.g. pdf
# 'create_by': 'TangYuHang', # creator, CamelCase, e.g. TangYuHang
# 'publishDate': info_date, # publish date
# 'origin': '巨量算数', # source
# 'sourceAddress': info_url, # original link
# 'content': '', # body content
# 'summary': info_zhaiyao, # abstract
# 'sid': '1662008524476948481', # info source id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# browser.quit()
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)  # navigate to the target page
# 巨量算数
page_source = browser.page_source  # grab the page source
def getnews(browser):
page_source = browser.page_source  # grab the page source
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {
'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
for one_info in list_all:
try:
info_title = one_info.a.text.strip()
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com'+info_href
info_url = 'https://trendinsight.oceanengine.com' + info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
soup_info = BeautifulSoup(res_info.content, 'html.parser')
list_script = soup_info.find_all('script')
for script in list_script:
if 'window._SSR_DATA' in script.text:
json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
......@@ -504,6 +671,26 @@ def juliangsuanshu():
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
except Exception as e:
continue
# todo: click through to the next page
# wait = WebDriverWait(browser, 30)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
# try:
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# except:
# time.sleep(1)
# browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
# return getnews(browser)
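
A sketch of the commented-out next-page todo above, assuming the pager keeps the byted-pager-item-group class and that its last <li> is the "next" control; in practice the getnews recursion should also be depth-bounded:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(browser, timeout=30):
    # wait for the pager to render, then click its last item ("next page")
    wait = WebDriverWait(browser, timeout)
    wait.until(EC.presence_of_element_located(
        (By.CLASS_NAME, 'byted-pager-item-group')))
    try:
        browser.find_element(
            By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
        return True
    except Exception:
        return False
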
def juliangsuanshu():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)  # navigate to the target page
getnews(browser)
browser.quit()
......@@ -560,12 +747,15 @@ def ke36():
# 前沿知识库
def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
for i in range(40,60):
log.info(f'====page {i}====')
url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
# list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
list_all = soup.find('div',{'class':'show_report_list'}).find_all('li')
for one_info in list_all:
info_title = one_info.a.get('title')
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
......@@ -664,7 +854,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
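
The allnum map converts Chinese month numerals to zero-padded numbers; the real call site falls outside this hunk, so the input format below is an assumption:

import re

def parse_cn_date(text):
    # e.g. '2023年十一月' -> '2023-11'; raises if the numeral is unmapped
    year, cn = re.match(r'(\d{4})年(.+?)月', text).groups()
    return f'{year}-{allnum[cn]}'

assert parse_cn_date('2023年十一月') == '2023-11'
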
for i in range(10, 128):
for i in range(76, 128):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
......@@ -672,6 +862,7 @@ def shijiejingjiluntan():
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url) # 跳到指定页面
time.sleep(5)
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source # 获取页面信息
......@@ -685,7 +876,12 @@ def shijiejingjiluntan():
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d')
# if info_date >= '2022-07-21':
# continue
try:
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
except:
info_zhaiyao = ''
try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except:
......@@ -1394,11 +1590,11 @@ def dongfangcaifu7():
if __name__ == '__main__':
# try:
# log.info('mob')
# Mob()
# except:
# pass
try:
log.info('mob')
Mob()
except Exception as e:
log.info(e)
pass
# try:
# log.info('yidong_guanxiangtai')
# yidong_guanxiangtai()
......@@ -1407,7 +1603,7 @@ if __name__ == '__main__':
# try:
# log.info('juliangsuanshu')
# juliangsuanshu()
# except:
# except Exception as e:
# pass
# try:
# log.info('ke36')
......@@ -1417,7 +1613,7 @@ if __name__ == '__main__':
# try:
# log.info('qianyanzhishiku')
# qianyanzhishiku()
# except:
# except Exception as e:
# pass
# try:
# log.info('shijiejingjiluntan')
......@@ -1442,31 +1638,31 @@ if __name__ == '__main__':
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
try:
log.info('dongfangcaifu5')
dongfangcaifu5()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu6')
dongfangcaifu6()
except Exception as e:
log.info(e)
pass
try:
log.info('dongfangcaifu7')
dongfangcaifu7()
except Exception as e:
log.info(e)
pass
#
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass