丁双波 / zzsn_spider · Commits · be4f79be

Commit be4f79be, authored Jan 02, 2024 by 薛凌堃
研报 (research reports)
Parent: 5d5dff2b
Showing 1 changed file with 351 additions and 155 deletions.
comData/YanBao/resentYanbao.py
@@ -257,11 +257,11 @@ def download(data, order_by):
        else:
            log.info(f'====pdf解析失败====')
            delete_url(sourceAddress)
            # get the PID of the current process
            current_pid = baseCore.getPID()
            # todo: start a new process, then kill the current one
            subprocess.Popen([sys.executable] + sys.argv)
            os.kill(current_pid, 9)
            # # get the PID of the current process
            # current_pid = baseCore.getPID()
            # # todo: start a new process, then kill the current one
            # subprocess.Popen([sys.executable] + sys.argv)
            # os.kill(current_pid, 9)
            return
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        page_size = retData['page_size']
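A note on the restart logic above: spawning a fresh copy of the script with the same interpreter and arguments, then killing the current process, is a blunt but effective way to recover from a wedged PDF parser. A minimal standalone sketch of the pattern (baseCore.getPID() in the diff is a project helper; os.getpid() from the standard library does the same):

    import os
    import subprocess
    import sys

    def restart_self():
        # Launch a fresh copy of this script with the same arguments...
        subprocess.Popen([sys.executable] + sys.argv)
        # ...then kill the current process outright (signal 9, not catchable).
        os.kill(os.getpid(), 9)

Because the replacement starts before the old process dies, no cleanup handlers in the old process ever run; that is the point here, but it also means open files and locks are dropped uncleanly.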
@@ -328,37 +328,156 @@ def download(data, order_by):
    log.info(dic_result)
    return
# def Mob():
# url = 'https://www.mob.com/mobData/report'
# res = requests.get(url=url,headers=headers).content
# soup = BeautifulSoup(res,'html.parser')
# max_info = soup.find('span',class_='el-pagination__total').text
# max_info = re.findall('\d{1,4}',max_info)[0]
# # print(type(max_info))
# max_page = int((int(max_info)/9) + 1)
# print(max_page)
# i_id = 0
# for page in range(max_page):
# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
# res = requests.get(url=url, headers=headers).content
# soup = BeautifulSoup(res, 'html.parser')
# result = soup.find('ul', class_='fix')
# li_list = result.find_all('li')
# # for id in range(1, 149):
# id = i_id
# for li in li_list:
# id += 1
# title = li.find('div',class_='title').text
# time = li.find('div',class_='date tc').text.strip()
# year = re.findall('\d{4}',time)[0]
# # for id in range(29,178):
# real_id = 178 - id
# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# # href = 'https://www.mob.com/mobdata/report/169'
# res_href = requests.get(url=href,headers=headers).content
# i_soup = BeautifulSoup(res_href,'html.parser')
# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
# fin_summary = []
# for s in summary_list:
# summary = s.text
# fin_summary.append(summary)
# summary = ''.join(fin_summary)
# dic_post = {
# 'title': title, # report title
# 'url_pdf': url_pdf, # report link
# 'year': year, # report year
# 'type_id': '4', # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
# 'item_id': 'YanBao', # related record id, e.g. enterprise credit code
# 'category': 'pdf', # file extension, e.g. pdf
# 'create_by': 'XueLingKun', # creator, in CamelCase, e.g. TangYuHang
# 'publishDate': time, # time
# 'origin': 'Mob研究院', # source
# 'sourceAddress': href, # original article link
# 'content': '', # content
# 'summary': summary, # summary
# 'sid': '1662008807781212161', # information source id
# }
# order_by = 1
# download(dic_post,order_by)
# order_by += 1
# # print(dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# i_id += 9
def Mob():
    url = 'https://www.mob.com/mobData/report'
    res = requests.get(url=url, headers=headers).content
    soup = BeautifulSoup(res, 'html.parser')
    max_info = soup.find('span', class_='el-pagination__total').text
    max_info = re.findall('\d{1,4}', max_info)[0]
    # print(type(max_info))
    max_page = int((int(max_info) / 9) + 1)
    print(max_page)
    i_id = 0
    for page in range(max_page):
        url = 'https://www.mob.com/mobdata/report?page={}'.format(page + 1)
        res = requests.get(url=url, headers=headers).content
        soup = BeautifulSoup(res, 'html.parser')
        result = soup.find('ul', class_='fix')
        li_list = result.find_all('li')
        # for id in range(1, 149):
        id = i_id
        for li in li_list:
            id += 1
            title = li.find('div', class_='title').text
            time = li.find('div', class_='date tc').text.strip()
            year = re.findall('\d{4}', time)[0]
            # for id in range(29,178):
            real_id = 178 - id
            href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
            # href = 'https://www.mob.com/mobdata/report/169'
            res_href = requests.get(url=href, headers=headers).content
            # loginfo = baseCore.redicPullData('Mob:loginfo')
            # account = loginfo.split('|')[0]
            # password = loginfo.split('|')[1]
            # usecount = loginfo.split('|')[2]
            usecount = 0  # for testing
            # account = '13636711746'
            # password = 'Zhenghao123'
            # account = '18703752600'
            # password = 'Axlk010208!'
            # account = '13273737131'
            # password = 'liu1230...'
            # account = '15237560528'
            # password = 'xlk123456!'
            # account = '17103126138'
            # password = '171BlackOne'
            account = '17103128590'
            password = '171BlackTwo'
            browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
            f_url = 'https://www.mob.com/developer/login'
            browser.get(f_url)
            browser.find_element(By.CLASS_NAME, 's1').click()
            browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
            browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
            browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
            if usecount < 5:
                pass
            else:
                return Mob()
            # get the login info
            # url = browser.current_url
            # print(url)
            url = 'https://www.mob.com/mobdata/report'
            browser.get(url)
            # tags = browser.find_elements(By.CLASS_NAME, 'main-title')
            # for tag in tags:
            #     if 'Mob研究院' in tag.text:
            #         tag.click()
            #     else:
            #         continue
            #     # try:
            #     #     web = tag.find_element(By.CLASS_NAME, "")
            #     #     web.click()
            #     #     break
            #     # except:
            #     #     continue
            cookies_list = browser.get_cookies()
            cookies = {}
            # take the name and value of each cookie and convert them into a form requests can use
            for cookie in cookies_list:
                cookies[cookie['name']] = cookie['value']
            # cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
            # cookies_ = json.dumps(cookies)
            session = requests.session()
            session.cookies.update(cookies)
            for i in range(5):
                url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
                req = session.get(url=url, headers=headers)
                data_json = req.json()
                news_list = data_json['data']['list']
                for info in news_list:
                    title = info['title']
                    publishDate = info['effective_date']
                    year = publishDate[:4]
                    report_id = info['report_id']
                    href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
                    # tf_url = add_check_url(href)
                    is_member = r.sismember('report_pdf_three_history', href)
                    if is_member:
                        continue
                    res_href = session.get(url=href, headers=headers).content
                    i_soup = BeautifulSoup(res_href, 'html.parser')
                    url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
                    summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
                    news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
                    # headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
                    # headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
                    # headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
                    # headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
                    # headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
                    headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
                    news_req = session.get(url=news_url, headers=headers)
                    pdf_url = news_req.json()['data']
                    fin_summary = []
                    for s in summary_list:
                        summary = s.text
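The step worth noting in the new Mob() is the cookie handoff: log in once with Selenium, then copy the browser cookies into a requests session so the paginated JSON API can be called without driving the browser. A minimal sketch of that step, assuming an already-authenticated driver:

    import requests

    def session_from_driver(driver):
        # Copy Selenium's cookie jar into a plain requests session.
        session = requests.Session()
        for cookie in driver.get_cookies():
            # Each Selenium cookie is a dict with at least 'name' and 'value';
            # passing domain/path keeps the cookie scoping intact.
            session.cookies.set(cookie['name'], cookie['value'],
                                domain=cookie.get('domain'),
                                path=cookie.get('path', '/'))
        return session

The diff's version drops domain and path, which works as long as every request goes to the same host.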
@@ -366,13 +485,13 @@ def Mob():
                    summary = ''.join(fin_summary)
                    dic_post = {
                        'title': title,  # report title
                        'url_pdf': url_pdf,  # report link
                        'url_pdf': pdf_url,  # report link
                        'year': year,  # report year
                        'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
                        'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
                        'category': 'pdf',  # file extension, e.g. pdf
                        'create_by': 'XueLingKun',  # creator, in CamelCase, e.g. TangYuHang
                        'publishDate': time,  # time
                        'publishDate': publishDate,  # time
                        'origin': 'Mob研究院',  # source
                        'sourceAddress': href,  # original article link
                        'content': '',  # content
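A caution on the dic_post literal above: the doubled 'url_pdf' and 'publishDate' entries are the removed and added sides of the diff rendered next to each other. If both lines ever survived into one dict literal, Python would keep only the last value per key, silently:

    d = {'url_pdf': 'old', 'url_pdf': 'new'}
    print(d)  # {'url_pdf': 'new'}: duplicate keys, last one wins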
@@ -382,12 +501,7 @@ def Mob():
                    order_by = 1
                    download(dic_post, order_by)
                    order_by += 1
                    # print(dic_post)
                    # url = 'http://114.115.155.139:5002/report_download'
                    # # report-list
                    # res = requests.post(url, data=json.dumps(dic_post))
                    # print(res.json())
        i_id += 9
def yidong_guanxiangtai():
@@ -452,58 +566,131 @@ def yidong_guanxiangtai():
    # print(res.json())
# 巨量算数
def juliangsuanshu():
    browser = webdriver.Chrome(chromedriver)
# # 巨量算数
# def juliangsuanshu():
# # browser = webdriver.Chrome(chromedriver)
# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
#
# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
# browser.get(url)  # jump to the target page
#
# page_source = browser.page_source  # grab the page source
# soup = BeautifulSoup(page_source, 'html.parser')
#
# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
# for one_info in list_all:
# info_title = one_info.a.text.strip()
# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
# info_href = one_info.a.get('href')
# info_url = 'https://trendinsight.oceanengine.com'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
# list_script = soup_info.find_all('script')
# for script in list_script:
# if 'window._SSR_DATA' in script.text:
# json_str = script.text
# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
#
# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
#
# dic_post = {
# 'title': info_title, # report title
# 'url_pdf': info_pdf, # report link
# 'year': info_date[:4], # report year
# 'type_id': '4', # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
# 'item_id': 'YanBao', # related record id, e.g. enterprise credit code
# 'category': 'pdf', # file extension, e.g. pdf
# 'create_by': 'TangYuHang', # creator, in CamelCase, e.g. TangYuHang
# 'publishDate': info_date, # time
# 'origin': '巨量算数', # source
# 'sourceAddress': info_url, # original article link
# 'content': '', # content
# 'summary': info_zhaiyao, # summary
# 'sid': '1662008524476948481', # information source id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# browser.quit()
    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
    browser.get(url)  # jump to the target page
# 巨量算数
    page_source = browser.page_source  # grab the page source
def getnews(browser):
    page_source = browser.page_source  # grab the page source
    soup = BeautifulSoup(page_source, 'html.parser')
    list_all = soup.find('div', {'class': 'index-module__reportList--nit0R'}).find_all('div', {'class': 'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
    list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
    for one_info in list_all:
        info_title = one_info.a.text.strip()
        info_date = one_info.find('div', {'class': 'card-module__releaseTime--MbbUa'}).text.split(':')[1]
        info_href = one_info.a.get('href')
        info_url = 'https://trendinsight.oceanengine.com' + info_href
        res_info = requests.get(info_url)
        soup_info = BeautifulSoup(res_info.content, 'html.parser')
        list_script = soup_info.find_all('script')
        for script in list_script:
            if 'window._SSR_DATA' in script.text:
                json_str = script.text
                info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
        info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
        info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
        try:
            info_title = one_info.a.text.strip()
            info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
            info_href = one_info.a.get('href')
            info_url = 'https://trendinsight.oceanengine.com' + info_href
            res_info = requests.get(info_url)
            soup_info = BeautifulSoup(res_info.content, 'html.parser')
            list_script = soup_info.find_all('script')
            for script in list_script:
                if 'window._SSR_DATA' in script.text:
                    json_str = script.text
                    info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
            info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
            info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
            dic_post = {
                'title': info_title,  # report title
                'url_pdf': info_pdf,  # report link
                'year': info_date[:4],  # report year
                'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
                'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
                'category': 'pdf',  # file extension, e.g. pdf
                'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
                'publishDate': info_date,  # time
                'origin': '巨量算数',  # source
                'sourceAddress': info_url,  # original article link
                'content': '',  # content
                'summary': info_zhaiyao,  # summary
                'sid': '1662008524476948481',  # information source id
            }
            order_by = 1
            download(dic_post, order_by)
            order_by += 1
            # print(page,dic_post)
            # url = 'http://114.115.155.139:5002/report_download'
            # # report-list
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
            dic_post = {
                'title': info_title,  # report title
                'url_pdf': info_pdf,  # report link
                'year': info_date[:4],  # report year
                'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
                'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
                'category': 'pdf',  # file extension, e.g. pdf
                'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
                'publishDate': info_date,  # time
                'origin': '巨量算数',  # source
                'sourceAddress': info_url,  # original article link
                'content': '',  # content
                'summary': info_zhaiyao,  # summary
                'sid': '1662008524476948481',  # information source id
            }
            order_by = 1
            download(dic_post, order_by)
            order_by += 1
            # print(page,dic_post)
            # url = 'http://114.115.155.139:5002/report_download'
            # # report-list
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
        except Exception as e:
            continue
    # todo: click through to the next page
    # wait = WebDriverWait(browser, 30)
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
    # try:
    #     browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
    # except:
    #     time.sleep(1)
    #     browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
    # return getnews(browser)
def juliangsuanshu():
    # browser = webdriver.Chrome(chromedriver)
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
    browser.get(url)  # jump to the target page
    getnews(browser)
    browser.quit()
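Both the old and new versions of this scraper read the report metadata out of the page's server-rendered state object rather than the visible DOM, which tends to survive markup changes better than class-name selectors. A minimal sketch of the technique, assuming a page that embeds window._SSR_DATA = {...} in a script tag:

    import json

    import requests
    from bs4 import BeautifulSoup

    def extract_ssr_state(page_url):
        # Find the script tag carrying the SSR payload and parse it as JSON.
        soup = BeautifulSoup(requests.get(page_url).content, 'html.parser')
        for script in soup.find_all('script'):
            if 'window._SSR_DATA' in script.text:
                return json.loads(script.text.replace('window._SSR_DATA = ', '', 1))
        raise ValueError('no SSR state found')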
@@ -560,47 +747,50 @@ def ke36():
# 前沿知识库
def qianyanzhishiku():
    url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
    for i in range(40, 60):
        log.info(f'====第{i}页====')
        url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'html.parser')
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'html.parser')
        list_all = soup.find('div', {'class': 'quality_report pt-20 pb-40'}).find_all('li')
        for one_info in list_all:
            info_title = one_info.a.get('title')
            info_date = one_info.find('div', {'class': 'time'}).text.replace('年', '-').replace('月', '-01')
            info_href = one_info.a.get('href')
            info_url = 'https://wk.askci.com' + info_href
        # list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
        list_all = soup.find('div', {'class': 'show_report_list'}).find_all('li')
        for one_info in list_all:
            info_title = one_info.a.get('title')
            info_date = one_info.find('div', {'class': 'time'}).text.replace('年', '-').replace('月', '-01')
            info_href = one_info.a.get('href')
            info_url = 'https://wk.askci.com' + info_href
            res_info = requests.get(info_url)
            soup_info = BeautifulSoup(res_info.content, 'html.parser')
            info_pdf_url = soup_info.find('iframe', {'scrolling': 'auto'}).get('src').split('pdfpath=')[1]
            info_pdf = urllib.parse.unquote(info_pdf_url)
            res_info = requests.get(info_url)
            soup_info = BeautifulSoup(res_info.content, 'html.parser')
            info_pdf_url = soup_info.find('iframe', {'scrolling': 'auto'}).get('src').split('pdfpath=')[1]
            info_pdf = urllib.parse.unquote(info_pdf_url)
            dic_post = {
                'title': info_title,  # report title
                'url_pdf': info_pdf,  # report link
                'year': info_date[:4],  # report year
                'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
                'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
                'category': 'pdf',  # file extension, e.g. pdf
                'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
                'publishDate': info_date,  # time
                'origin': '前沿知识库',  # source
                'sourceAddress': info_url,  # original article link
                'content': '',  # content
                'summary': '',  # summary
                'sid': '1662008620631367682',  # information source id
            }
            order_by = 1
            download(dic_post, order_by)
            order_by += 1
            # print(page,dic_post)
            # url = 'http://114.115.155.139:5002/report_download'
            # # report-list
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
            dic_post = {
                'title': info_title,  # report title
                'url_pdf': info_pdf,  # report link
                'year': info_date[:4],  # report year
                'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research report: 4)
                'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
                'category': 'pdf',  # file extension, e.g. pdf
                'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
                'publishDate': info_date,  # time
                'origin': '前沿知识库',  # source
                'sourceAddress': info_url,  # original article link
                'content': '',  # content
                'summary': '',  # summary
                'sid': '1662008620631367682',  # information source id
            }
            order_by = 1
            download(dic_post, order_by)
            order_by += 1
            # print(page,dic_post)
            # url = 'http://114.115.155.139:5002/report_download'
            # # report-list
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
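The PDF URL here comes out of a viewer iframe whose src carries the real path as a percent-encoded pdfpath query parameter. A small sketch of the decode step, with a hypothetical src value:

    import urllib.parse

    # Hypothetical iframe src of the form .../viewer.html?pdfpath=<encoded URL>
    src = 'viewer.html?pdfpath=https%3A%2F%2Fwk.askci.com%2Ffiles%2Fsample.pdf'
    info_pdf = urllib.parse.unquote(src.split('pdfpath=')[1])
    print(info_pdf)  # https://wk.askci.com/files/sample.pdf

Parsing the query string properly (urllib.parse.urlparse plus parse_qs) would be more robust than split('pdfpath=') if the parameter order ever changed.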
# # 世界经济论坛
@@ -664,7 +854,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
    allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06',
              '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
    for i in range(10, 128):
    for i in range(76, 128):
        # res = requests.get(url)
        # soup = BeautifulSoup(res.content,'html.parser')
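The allnum table maps Chinese month numerals onto two-digit month strings. A sketch of how such a lookup normalizes a date like '2023年十一月' (the sample input is hypothetical; the actual page format may differ):

    import re

    allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06',
              '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}

    def normalize_cn_month(text):
        # '2023年十一月' -> '2023-11'
        year, month = re.match(r'(\d{4})年(.+?)月', text).groups()
        return f'{year}-{allnum[month]}'

    print(normalize_cn_month('2023年十一月'))  # 2023-11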
@@ -672,6 +862,7 @@ def shijiejingjiluntan():
        url = f'https://cn.weforum.org/publications/?page={i}'
        browser.get(url)  # jump to the target page
        time.sleep(5)
        wait = WebDriverWait(browser, 30)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
        page_source = browser.page_source  # grab the page source
@@ -685,7 +876,12 @@ def shijiejingjiluntan():
            info_date = one_info.find('div', {'class': 'wef-1nvfeoy'}).find('time')['datetime']
            datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
            info_date = datetime_obj.strftime('%Y-%m-%d')
            info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
            # if info_date >= '2022-07-21':
            #     continue
            try:
                info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
            except:
                info_zhaiyao = ''
            try:
                info_pdf = one_info.find('div', {'class': 'wef-1nvfeoy'}).find('a').get('href')
            except:
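The strptime/strftime round trip above is the usual way to normalize a <time datetime=...> attribute to a plain date. Self-contained, with a hypothetical input value:

    from datetime import datetime

    raw = '2024-01-02T08:30:00Z'  # hypothetical <time datetime=...> value
    info_date = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
    print(info_date)  # 2024-01-02

On Python 3.11+, datetime.fromisoformat(raw) accepts the trailing Z directly, which avoids hard-coding the format string.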
@@ -1394,11 +1590,11 @@ def dongfangcaifu7():
if __name__ == '__main__':
    # try:
    #     log.info('mob')
    #     Mob()
    # except:
    #     pass
    try:
        log.info('mob')
        Mob()
    except Exception as e:
        pass
    # try:
    #     log.info('yidong_guanxiangtai')
    #     yidong_guanxiangtai()
@@ -1407,7 +1603,7 @@ if __name__ == '__main__':
    # try:
    #     log.info('juliangsuanshu')
    #     juliangsuanshu()
    # except:
    # except Exception as e:
    #     pass
    # try:
    #     log.info('ke36')
@@ -1417,7 +1613,7 @@ if __name__ == '__main__':
    # try:
    #     log.info('qianyanzhishiku')
    #     qianyanzhishiku()
    # except:
    # except Exception as e:
    #     pass
    # try:
    #     log.info('shijiejingjiluntan')
@@ -1442,31 +1638,31 @@ if __name__ == '__main__':
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu4')
    #     dongfangcaifu4()
    # except Exception as e:
    #     log.info(e)
    #     pass
    try:
        log.info('dongfangcaifu5')
        dongfangcaifu5()
    except Exception as e:
        log.info(e)
        pass
    try:
        log.info('dongfangcaifu6')
        dongfangcaifu6()
    except Exception as e:
        log.info(e)
        pass
    try:
        log.info('dongfangcaifu7')
        dongfangcaifu7()
    except Exception as e:
        log.info(e)
        pass
    #
    # try:
    #     log.info('dongfangcaifu5')
    #     dongfangcaifu5()
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu6')
    #     dongfangcaifu6()
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu7')
    #     dongfangcaifu7()
    # except Exception as e:
    #     log.info(e)
    #     pass