Commit be4f79be by 薛凌堃 (XueLingKun)

Commit message: 研报 (research reports)

Parent commit: 5d5dff2b
@@ -257,11 +257,11 @@ def download(data, order_by):
    else:
        log.info(f'====pdf解析失败====')
        delete_url(sourceAddress)
-        # 获取当前进程pid
-        current_pid = baseCore.getPID()
-        # todo: 重新启动新进程,杀死当前进程
-        subprocess.Popen([sys.executable] + sys.argv)
-        os.kill(current_pid, 9)
+        # # 获取当前进程pid
+        # current_pid = baseCore.getPID()
+        # # todo: 重新启动新进程,杀死当前进程
+        # subprocess.Popen([sys.executable] + sys.argv)
+        # os.kill(current_pid, 9)
        return
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    page_size = retData['page_size']
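For reference, a minimal standalone sketch of the restart-and-kill step that this hunk comments out (os.getpid() stands in here for baseCore.getPID(), which is defined elsewhere in this repository):

import os
import subprocess
import sys

def restart_self():
    # Spawn a fresh copy of the current script with the same arguments,
    # then forcibly terminate the running process (mirrors the commented-out lines).
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)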
@@ -328,37 +328,156 @@ def download(data, order_by):
    log.info(dic_result)
    return
+# def Mob():
+# url = 'https://www.mob.com/mobData/report'
+# res = requests.get(url=url,headers=headers).content
+# soup = BeautifulSoup(res,'html.parser')
+# max_info = soup.find('span',class_='el-pagination__total').text
+# max_info = re.findall('\d{1,4}',max_info)[0]
+# # print(type(max_info))
+# max_page = int((int(max_info)/9) + 1)
+# print(max_page)
+# i_id = 0
+# for page in range(max_page):
+# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
+# res = requests.get(url=url, headers=headers).content
+# soup = BeautifulSoup(res, 'html.parser')
+# result = soup.find('ul', class_='fix')
+# li_list = result.find_all('li')
+# # for id in range(1, 149):
+# id = i_id
+# for li in li_list:
+# id += 1
+# title = li.find('div',class_='title').text
+# time = li.find('div',class_='date tc').text.strip()
+# year = re.findall('\d{4}',time)[0]
+# # for id in range(29,178):
+# real_id = 178 - id
+# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
+# # href = 'https://www.mob.com/mobdata/report/169'
+# res_href = requests.get(url=href,headers=headers).content
+# i_soup = BeautifulSoup(res_href,'html.parser')
+# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
+# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
+# fin_summary = []
+# for s in summary_list:
+# summary = s.text
+# fin_summary.append(summary)
+# summary = ''.join(fin_summary)
+# dic_post = {
+# 'title': title, # 报告名称
+# 'url_pdf': url_pdf, # 报告链接
+# 'year': year, # 报告年份
+# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
+# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
+# 'category': 'pdf', # 文件后缀名,如:pdf
+# 'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
+# 'publishDate': time, # 时间
+# 'origin': 'Mob研究院', # 来源
+# 'sourceAddress': href, # 原文链接
+# 'content': '', # 内容
+# 'summary': summary, # 摘要
+# 'sid': '1662008807781212161', # 信息源id
+# }
+# order_by = 1
+# download(dic_post,order_by)
+# order_by += 1
+# # print(dic_post)
+# # url = 'http://114.115.155.139:5002/report_download'
+# # # report-list
+# # res = requests.post(url, data=json.dumps(dic_post))
+# # print(res.json())
+# i_id += 9
def Mob():
-    url = 'https://www.mob.com/mobData/report'
-    res = requests.get(url=url,headers=headers).content
-    soup = BeautifulSoup(res,'html.parser')
-    max_info = soup.find('span',class_='el-pagination__total').text
-    max_info = re.findall('\d{1,4}',max_info)[0]
-    # print(type(max_info))
-    max_page = int((int(max_info)/9) + 1)
-    print(max_page)
-    i_id = 0
-    for page in range(max_page):
-        url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
-        res = requests.get(url=url, headers=headers).content
-        soup = BeautifulSoup(res, 'html.parser')
-        result = soup.find('ul', class_='fix')
-        li_list = result.find_all('li')
-        # for id in range(1, 149):
-        id = i_id
-        for li in li_list:
-            id += 1
-            title = li.find('div',class_='title').text
-            time = li.find('div',class_='date tc').text.strip()
-            year = re.findall('\d{4}',time)[0]
-            # for id in range(29,178):
-            real_id = 178 - id
-            href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
-            # href = 'https://www.mob.com/mobdata/report/169'
-            res_href = requests.get(url=href,headers=headers).content
+    # loginfo = baseCore.redicPullData('Mob:loginfo')
+    # account = loginfo.split('|')[0]
+    # password = loginfo.split('|')[1]
+    # usecount = loginfo.split('|')[2]
+    usecount = 0
+    # 测试用
+    # account = '13636711746'
+    # password = 'Zhenghao123'
+    # account = '18703752600'
+    # password = 'Axlk010208!'
+    # account = '13273737131'
+    # password = 'liu1230...'
+    # account = '15237560528'
+    # password = 'xlk123456!'
+    # account = '17103126138'
+    # password = '171BlackOne'
+    account = '17103128590'
+    password = '171BlackTwo'
+    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+    f_url = 'https://www.mob.com/developer/login'
+    browser.get(f_url)
+    browser.find_element(By.CLASS_NAME, 's1').click()
+    browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
+    browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
+    browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
+    if usecount < 5:
+        pass
+    else:
+        return Mob()
+    # 获取登录的信息
+    # url = browser.current_url
+    # print(url)
+    url = 'https://www.mob.com/mobdata/report'
+    browser.get(url)
+    # tags = browser.find_elements(By.CLASS_NAME, 'main-title')
+    # for tag in tags:
+    # if 'Mob研究院' in tag.text:
+    # tag.click()
+    # else:
+    # continue
+    # # try:
+    # # web = tag.find_element(By.CLASS_NAME, "")
+    # # web.click()
+    # # break
+    # # except:
+    # # continue
+    cookies_list = browser.get_cookies()
+    cookies = {}
+    # 获取cookie中的name和value,转化成requests可以使用的形式
+    for cookie in cookies_list:
+        cookies[cookie['name']] = cookie['value']
+    # cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
+    # cookies_ = json.dumps(cookies)
+    session = requests.session()
+    session.cookies.update(cookies)
+    for i in range(5):
+        url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
+        req = session.get(url=url, headers=headers)
+        data_json = req.json()
+        news_list = data_json['data']['list']
+        for info in news_list:
+            title = info['title']
+            publishDate = info['effective_date']
+            year = publishDate[:4]
+            report_id = info['report_id']
+            href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
+            # tf_url = add_check_url(href)
+            is_member = r.sismember('report_pdf_three_history', href)
+            if is_member:
+                continue
+            res_href = session.get(url=href, headers=headers).content
            i_soup = BeautifulSoup(res_href,'html.parser')
-            url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
            summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
+            news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
+            # headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
+            # headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
+            # headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
+            # headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
+            # headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
+            headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
+            news_req = session.get(url=news_url,headers=headers)
+            pdf_url = news_req.json()['data']
            fin_summary = []
            for s in summary_list:
                summary = s.text
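The rewritten Mob() above logs in through Selenium and then reuses the browser cookies for the report-list API calls. A minimal sketch of that cookie handoff, assuming browser is an already logged-in webdriver instance:

import requests

def session_from_browser(browser):
    # Copy each cookie's name/value from the logged-in Selenium browser into a
    # requests session so later API calls keep the login state.
    cookies = {cookie['name']: cookie['value'] for cookie in browser.get_cookies()}
    session = requests.session()
    session.cookies.update(cookies)
    return session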
@@ -366,13 +485,13 @@ def Mob():
            summary = ''.join(fin_summary)
            dic_post = {
                'title': title, # 报告名称
-                'url_pdf': url_pdf, # 报告链接
+                'url_pdf': pdf_url, # 报告链接
                'year': year, # 报告年份
                'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
                'item_id': 'YanBao', # 关联记录id,如:企业信用代码
                'category': 'pdf', # 文件后缀名,如:pdf
                'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
-                'publishDate': time, # 时间
+                'publishDate': publishDate, # 时间
                'origin': 'Mob研究院', # 来源
                'sourceAddress': href, # 原文链接
                'content': '', # 内容
@@ -382,12 +501,7 @@ def Mob():
            order_by = 1
            download(dic_post,order_by)
            order_by += 1
-            # print(dic_post)
-            # url = 'http://114.115.155.139:5002/report_download'
-            # # report-list
-            # res = requests.post(url, data=json.dumps(dic_post))
-            # print(res.json())
-        i_id += 9

def yidong_guanxiangtai():
@@ -452,30 +566,83 @@ def yidong_guanxiangtai():
        # print(res.json())

-# 巨量算数
-def juliangsuanshu():
-    browser = webdriver.Chrome(chromedriver)
-    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
-    browser.get(url)#跳到指定页面
-    page_source = browser.page_source#获取页面信息
+# # 巨量算数
+# def juliangsuanshu():
+# # browser = webdriver.Chrome(chromedriver)
+# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+#
+# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
+# browser.get(url)#跳到指定页面
+#
+# page_source = browser.page_source#获取页面信息
+# soup = BeautifulSoup(page_source, 'html.parser')
+#
+# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
+# for one_info in list_all:
+# info_title = one_info.a.text.strip()
+# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
+# info_href = one_info.a.get('href')
+# info_url = 'https://trendinsight.oceanengine.com'+info_href
+#
+# res_info = requests.get(info_url)
+# soup_info = BeautifulSoup(res_info.content,'html.parser')
+# list_script = soup_info.find_all('script')
+# for script in list_script:
+# if 'window._SSR_DATA' in script.text:
+# json_str = script.text
+# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
+#
+# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
+# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
+#
+# dic_post = {
+# 'title': info_title, # 报告名称
+# 'url_pdf': info_pdf, # 报告链接
+# 'year': info_date[:4], # 报告年份
+# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
+# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
+# 'category': 'pdf', # 文件后缀名,如:pdf
+# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
+# 'publishDate': info_date, # 时间
+# 'origin': '巨量算数', # 来源
+# 'sourceAddress': info_url, # 原文链接
+# 'content': '', # 内容
+# 'summary': info_zhaiyao, # 摘要
+# 'sid': '1662008524476948481', # 信息源id
+# }
+# order_by = 1
+# download(dic_post, order_by)
+# order_by += 1
+# # print(page,dic_post)
+# # url = 'http://114.115.155.139:5002/report_download'
+# # # report-list
+# # res = requests.post(url, data=json.dumps(dic_post))
+# # print(res.json())
+# time.sleep(2)
+# browser.quit()
+# 巨量算数
+def getnews(browser):
+    page_source = browser.page_source # 获取页面信息
    soup = BeautifulSoup(page_source, 'html.parser')
-    list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
+    list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {
+        'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
    for one_info in list_all:
+        try:
            info_title = one_info.a.text.strip()
-            info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
+            info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
            info_href = one_info.a.get('href')
-            info_url = 'https://trendinsight.oceanengine.com'+info_href
+            info_url = 'https://trendinsight.oceanengine.com' + info_href
            res_info = requests.get(info_url)
-            soup_info = BeautifulSoup(res_info.content,'html.parser')
+            soup_info = BeautifulSoup(res_info.content, 'html.parser')
            list_script = soup_info.find_all('script')
            for script in list_script:
                if 'window._SSR_DATA' in script.text:
                    json_str = script.text
-                    info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
+                    info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
            info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
            info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
@@ -504,6 +671,26 @@ def juliangsuanshu():
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
+        except Exception as e:
+            continue
+    # todo:点击下一页
+    # wait = WebDriverWait(browser, 30)
+    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
+    # try:
+    # browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
+    # except:
+    # time.sleep(1)
+    # browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
+    # return getnews(browser)
+def juliangsuanshu():
+    # browser = webdriver.Chrome(chromedriver)
+    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
+    browser.get(url)#跳到指定页面
+    getnews(browser)
    browser.quit()
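getnews() reads the report details from the window._SSR_DATA blob embedded in each detail page. A minimal sketch of that extraction step, assuming the page HTML has already been fetched:

import json
from bs4 import BeautifulSoup

def extract_ssr_data(html):
    # Find the <script> tag that assigns window._SSR_DATA and parse its JSON payload.
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        if 'window._SSR_DATA' in script.text:
            return json.loads(script.text.replace('window._SSR_DATA = ', ''))
    return None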
@@ -560,12 +747,15 @@ def ke36():
# 前沿知识库
def qianyanzhishiku():
-    url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
+    for i in range(40,60):
+        log.info(f'====第{i}页====')
+        url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
        res = requests.get(url)
        soup = BeautifulSoup(res.content,'html.parser')
-        list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
+        # list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
+        list_all = soup.find('div',{'class':'show_report_list'}).find_all('li')
        for one_info in list_all:
            info_title = one_info.a.get('title')
            info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
@@ -664,7 +854,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
    allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
-    for i in range(10, 128):
+    for i in range(76, 128):
        # res = requests.get(url)
        # soup = BeautifulSoup(res.content,'html.parser')
@@ -672,6 +862,7 @@ def shijiejingjiluntan():
        url = f'https://cn.weforum.org/publications/?page={i}'
        browser.get(url) # 跳到指定页面
+        time.sleep(5)
        wait = WebDriverWait(browser, 30)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
        page_source = browser.page_source # 获取页面信息
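The loop above loads each publications page and waits for the report cards before parsing. A minimal sketch of that wait-then-parse pattern, with the class name passed in as a parameter rather than hard-coded:

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_and_parse(browser, url, class_name):
    # Open the page, wait until an element with the given class is present,
    # then hand the rendered HTML to BeautifulSoup.
    browser.get(url)
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    return BeautifulSoup(browser.page_source, 'html.parser')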
@@ -685,7 +876,12 @@ def shijiejingjiluntan():
            info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
            datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
            info_date = datetime_obj.strftime('%Y-%m-%d')
+            # if info_date >= '2022-07-21':
+            # continue
+            try:
                info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
+            except:
+                info_zhaiyao = ''
            try:
                info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
            except:
@@ -1394,11 +1590,11 @@ def dongfangcaifu7():
if __name__ == '__main__':
-    # try:
-    # log.info('mob')
-    # Mob()
-    # except:
-    # pass
+    try:
+        log.info('mob')
+        Mob()
+    except Exception as e:
+        pass
    # try:
    # log.info('yidong_guanxiangtai')
    # yidong_guanxiangtai()
@@ -1407,7 +1603,7 @@ if __name__ == '__main__':
    # try:
    # log.info('juliangsuanshu')
    # juliangsuanshu()
-    # except:
+    # except Exception as e:
    # pass
    # try:
    # log.info('ke36')
@@ -1417,7 +1613,7 @@ if __name__ == '__main__':
    # try:
    # log.info('qianyanzhishiku')
    # qianyanzhishiku()
-    # except:
+    # except Exception as e:
    # pass
    # try:
    # log.info('shijiejingjiluntan')
@@ -1442,31 +1638,31 @@ if __name__ == '__main__':
    # except Exception as e:
    # log.info(e)
    # pass
+    #
    # try:
    # log.info('dongfangcaifu4')
    # dongfangcaifu4()
    # except Exception as e:
    # log.info(e)
    # pass
-    try:
-        log.info('dongfangcaifu5')
-        dongfangcaifu5()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu5')
+    # dongfangcaifu5()
+    # except Exception as e:
+    # log.info(e)
+    # pass
-    try:
-        log.info('dongfangcaifu6')
-        dongfangcaifu6()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu6')
+    # dongfangcaifu6()
+    # except Exception as e:
+    # log.info(e)
+    # pass
-    try:
-        log.info('dongfangcaifu7')
-        dongfangcaifu7()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu7')
+    # dongfangcaifu7()
+    # except Exception as e:
+    # log.info(e)
+    # pass