Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
ee09212f
提交
ee09212f
authored
3月 01, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
知网专家采集
上级
241a04dd
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
454 行增加
和
3 行删除
+454
-3
5.py
comData/ZW_paper/5.py
+3
-3
51.py
comData/ZW_paper/51.py
+451
-0
没有找到文件。
comData/ZW_paper/5.py
浏览文件 @
ee09212f
...
@@ -51,9 +51,9 @@ def parse_excel():
...
@@ -51,9 +51,9 @@ def parse_excel():
def
get_content1
():
def
get_content1
():
print_result_list
=
[]
print_result_list
=
[]
result_dict_list
=
[]
result_dict_list
=
[]
# query = {"专家库主键id":"141
"}
query
=
{
"专家库主键id"
:
"143
"
}
#
for db_dict in db_storage.find(query):
for
db_dict
in
db_storage
.
find
(
query
):
for
db_dict
in
db_storage
.
find
():
#
for db_dict in db_storage.find():
del
db_dict
[
'_id'
]
del
db_dict
[
'_id'
]
result_dict_list
.
append
(
db_dict
)
result_dict_list
.
append
(
db_dict
)
for
result_dict
in
result_dict_list
:
for
result_dict
in
result_dict_list
:
...
...
comData/ZW_paper/51.py
0 → 100644
浏览文件 @
ee09212f
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-4-9
import
xlrd
from
tqdm
import
tqdm
import
pymongo
import
pymysql
import
time
import
requests
from
pyquery
import
PyQuery
as
pq
from
selenium
import
webdriver
import
json
from
requests.packages
import
urllib3
urllib3
.
disable_warnings
()
# Module-level MongoDB handle: collection "知网-研究中心专家" (CNKI research-centre
# experts) inside database ZZSN on a remote server.  Both parse_excel() and
# get_content1() read/write through this shared handle.
# NOTE(review): credentials are hard-coded in source — consider moving them to
# environment variables / configuration.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['知网-研究中心专家']
# CNKI experts: read the Excel sheet and load it into MongoDB.
def parse_excel():
    """Read '知网-研究中心专家.xlsx' and bulk-insert one document per data row.

    The first spreadsheet row supplies the field names; each following row is
    zipped with those names into a dict, normalised, printed, and collected.
    Raises ValueError if a row's '专家库主键id' or '页数' cell is empty or
    non-numeric (unchanged from the original behaviour).
    """
    result_dict_list = []
    data = xlrd.open_workbook('知网-研究中心专家.xlsx').sheets()[0]
    # Row 0 holds the column headers used as MongoDB field names.
    rows_tag = data.row_values(0)
    n_rows = data.nrows
    for i in range(1, n_rows):
        # Zip headers with the row's cells into a record dict.
        result_dict = dict(zip(rows_tag, data.row_values(i)))
        c = result_dict['专家库主键id']
        # if c:
        #     c = str(int(c))
        d = result_dict['专家库知网code码']
        if d:
            # Excel stores numeric cells as floats; normalise "123.0" -> "123".
            d = str(int(d))
        total_page = result_dict['页数']
        a_dict = {
            '云协作专家': result_dict['云协作专家'],
            '专家库主键id': str(int(c)),
            '专家库知网code码': d,
            '页数': int(total_page),
            '代码': result_dict['代码'],
        }
        print(a_dict)
        result_dict_list.append(a_dict)
    # Fix: pymongo's insert_many() raises InvalidOperation when given an empty
    # list, so skip the insert if the sheet contained no data rows.
    if result_dict_list:
        db_storage.insert_many(result_dict_list)
# CNKI experts: crawl each expert's paper list from kns.cnki.net and push it
# to the downstream API, then dump a summary spreadsheet.
# TODO: when updating — if a required field is empty, do not push the record.
def get_content1():
    """For every expert document in MongoDB (专家库主键id > 144), POST the
    expert's stored search payload ('代码') to the CNKI grid endpoint,
    page through the result table, scrape each paper's metadata + abstract,
    and push the collected papers to the reptile/autoSaveExpertPaper API.
    Finally writes all pushed records to 'experct_data.xlsx'.
    """
    print_result_list = []   # one post_dict per expert, for the final Excel dump
    result_dict_list = []    # expert documents pulled from MongoDB
    query = {"专家库主键id": {"$gt": 144}}
    for db_dict in db_storage.find(query):
        # for db_dict in db_storage.find():
        del db_dict['_id']   # ObjectId is not needed downstream
        result_dict_list.append(db_dict)
    for result_dict in result_dict_list:
        try:
            a = result_dict['云协作专家']             # expert display name
            # b = result_dict['分类id']
            c = int(result_dict['专家库主键id'])      # expert primary key
            # d = result_dict['专家库知网code码']
            d = ''   # ikCode deliberately left blank in the push payload
            # total_page = result_dict['页数'] + 1
            # NOTE(review): a very long sample 'payload_' literal (a full CNKI
            # QueryJson/SearchSql form body) was commented out here; the live
            # payload now comes from the record's '代码' field below.
            payload_ = result_dict['代码']
            if payload_:
                pass
            else:
                print(f'{a}代码为空')
                continue
            aaa_dict_list = []     # papers collected for this expert
            set_title = set(())    # de-duplication by title
            # print(f'正在采集{a},共{total_page - 1}页')
            time.sleep(2)
            # for page in tqdm(range(1, total_page)):
            #     if total_page == 2:
            #         payload = payload_
            #     else:
            #         if '&CurPage=1&' in payload_:
            #             payload = payload_.replace('&CurPage=1&', f'&CurPage={page}&')
            #         else:
            #             payload = payload_.replace('&CurPage=2&', f'&CurPage={page}&')
            headers = {
                'Connection': 'keep-alive',
                'Accept': 'text/html, */*; q=0.01',
                'X-Requested-With': 'XMLHttpRequest',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Origin': 'https://kns.cnki.net',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-Mode': 'cors',
                'Sec-Fetch-Dest': 'empty',
                'Referer': 'https://kns.cnki.net/kns8/defaultresult/index',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                # NOTE(review): hard-coded session cookie — will expire; presumably
                # needs refreshing before each run.
                'Cookie': 'Ecp_ClientId=1210719153902607419; cnkiUserKey=124a20fb-4ebb-86f9-fcc8-5ba2e8da45a2; Ecp_ClientIp=221.15.216.161; Ecp_IpLoginFail=211126125.41.173.138; ASP.NET_SessionId=k5molf2cg114sjxy0lhjceyp; SID_kns8=123106; CurrSortField=%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27); CurrSortFieldType=desc; SID_kns_new=kns123123; dsorder=pubdate; dSearchFold=undefined; dstyle=listmode; language=undefined; SID_kcms=124103; _pk_ref=%5B%22%22%2C%22%22%2C1637905762%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dk_wIlObu07-p-iE_2Ec5ow7fGwitei5-u-u-hlhx-Z3%26wd%3D%26eqid%3Db0aef0a90017e0810000000261a0492d%22%5D; _pk_ses=*; dperpage=50; searchTimeFlag=1; _pk_id=abbb4caf-5c9c-4e46-b660-e356d71710f1.1626680366.29.1637909392.1637899142.; CurrSortField=%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27); CurrSortFieldType=desc'
            }
            # url = "https://kns.cnki.net/KNS8/Brief/GetGridTableHtml"
            url = "https://kns.cnki.net/kns8s/brief/grid"
            # First request: only used to read the total hit count and derive
            # the page count from the payload's pageSize.
            resp_text = requests.post(url, headers=headers, verify=False, data=payload_).text
            time.sleep(15)
            doc_resp = pq(resp_text)
            doc_items = doc_resp('.result-table-list tr').items()
            total_count = doc_resp('.pagerTitleCell em').text()
            if 'pageSize=10' in payload_:
                total_page = int(total_count) // 10
                if int(total_count) % 10 != 0:
                    total_page += 1
            if 'pageSize=20' in payload_:
                total_page = int(total_count) // 20
                if int(total_count) % 20 != 0:
                    total_page += 1
            # NOTE(review): if the payload contains neither pageSize=10 nor
            # pageSize=20, total_page is undefined here and the NameError is
            # swallowed by the outer except.
            print(f'正在采集{a},共{total_page}页')
            for page in tqdm(range(1, total_page + 1)):
                # for page in tqdm(range(1, 2 + 1)):
                # Rewrite the page number inside the form payload.
                if '&pageNum=1&' in payload_:
                    payload = payload_.replace('&pageNum=1&', f'&pageNum={page}&')
                else:
                    payload = payload_.replace('&pageNum=2&', f'&pageNum={page}&')
                # Retry loop: repeats on connection errors and on CAPTCHA pages.
                while True:
                    try:
                        resp_text = requests.post(url, headers=headers, verify=False, data=payload).text
                        time.sleep(15)
                        doc_resp = pq(resp_text)
                        doc_items = doc_resp('.result-table-list tr').items()
                        total_count = doc_resp('.pagerTitleCell em').text()
                        total_page = int(total_count) // 20
                        # NOTE(review): '// 20 != 0' looks like it was meant to be
                        # '% 20 != 0' (remainder check) — left as-is.
                        if int(total_count) // 20 != 0:
                            total_page += 1
                        if '请输入验证码' in doc_resp.text():
                            # CAPTCHA page: back off 10 minutes, then retry.
                            print('验证码!')
                            time.sleep(600)
                            continue
                        break
                    except Exception as e:
                        print(f'连接超时!==={e}')
                        time.sleep(10)
                        continue
                # One <tr> per paper in the result grid.
                for doc_item in doc_items:
                    title = doc_item('.name').text().replace('\n', '').replace('免费', '').strip()
                    if title == '':
                        continue   # header / separator rows have no title
                    source = doc_item('.source').text()
                    date = doc_item('.date').text()
                    if date:
                        date = date.split(' ')[0]   # keep date part, drop time
                    data = doc_item('.data').text()
                    quote = doc_item('.quote').text()
                    try:
                        quote = int(quote)
                    except:
                        quote = 0   # missing citation count -> 0
                    download = doc_item('.download').text()
                    try:
                        download = int(download)
                    except:
                        download = 0   # missing download count -> 0
                    author_items1 = doc_item('.author .KnowledgeNetLink').items()
                    authors = ''
                    for author_item1 in author_items1:
                        author1 = author_item1.text()
                        authors += author1 + '; '
                    if authors:
                        authors = authors[:-2]   # drop the trailing '; '
                    # Follow the paper's detail link to scrape the abstract.
                    try:
                        title_href = doc_item('.name .fz14').attr('href')
                        # dbcode = title_href.split('DbCode=')[1].split('&yx')[0]
                        # dbname = title_href.split('DbName=')[1].split('&DbCode')[0]
                        # filename = title_href.split('FileName=')[1].split('&DbName')[0]
                        # new_title_href = f'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={dbcode}&dbname={dbname}&filename=(unknown)'
                        headers11 = {
                            'Connection': 'keep-alive',
                            'Cache-Control': 'max-age=0',
                            'sec-ch-ua': '"Chromium";v="21", " Not;A Brand";v="99"',
                            'sec-ch-ua-mobile': '?0',
                            'sec-ch-ua-platform': '"Windows"',
                            'Upgrade-Insecure-Requests': '1',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                            'Sec-Fetch-Site': 'none',
                            'Sec-Fetch-Mode': 'navigate',
                            'Sec-Fetch-User': '?1',
                            'Sec-Fetch-Dest': 'document',
                            'Accept-Language': 'zh-CN,zh;q=0.9',
                            'Cookie': 'cangjieConfig_NZKPT2=%7B%22status%22%3Atrue%2C%22startTime%22%3A%222021-12-23%22%2C%22endTime%22%3A%222022-05-26%22%2C%22orginHosts%22%3A%22kns.cnki.net%22%2C%22type%22%3A%22mix%22%2C%22poolSize%22%3A%2210%22%2C%22intervalTime%22%3A10000%2C%22persist%22%3Afalse%7D; Hm_lvt_38f33a73da35494cc56a660420d5b6be=1653730524; Hm_lpvt_38f33a73da35494cc56a660420d5b6be=1653731762; Ecp_ClientId=a220517174803460325; knsLeftGroupSelectItem=1%3B2%3B; Ecp_ClientIp=1.193.57.250; ASP.NET_SessionId=g3lo0aj14clgsxzbfps5ae2i; SID_kns8=123145; dblang=ch; _pk_ses=*; SID_kns_new=kns123165; Ecp_IpLoginFail=2205281.193.37.253; CurrSortField=%e7%9b%b8%e5%85%b3%e5%ba%a6%2frelevant%2c(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27time%27)+desc; CurrSortFieldType=desc; SID_kcms=015126022; SID_docpre=006007; yeswholedownload=%3Btzjd199805016; _pk_id=2af0698c-d628-4f4c-94ea-1175726c8139.1652780940.2.1653731390.1653730486.'
                        }
                        resp_text = requests.request("GET", title_href, headers=headers11, verify=False).text
                        doc_href = pq(resp_text)
                        summary = doc_href('.abstract-text').text()
                        if not summary:
                            # older detail-page layout keeps the abstract here
                            summary = doc_href('#ChDivSummary').text()
                    except:
                        summary = ''   # detail page failed -> record dropped below
                    time.sleep(20)
                    if title in set_title:
                        continue   # duplicate title, already collected
                    set_title.add(title)
                    aaa_dict = {
                        'title': title,
                        'author': authors,
                        'agency': source,
                        'pubdate': date,
                        'baseData': data,
                        'citedcount': quote,
                        'downloadcount': download,
                        'summary': summary,
                    }
                    # Skip records missing any required field (see TODO above).
                    if aaa_dict['title'] == '' or aaa_dict['summary'] == '' or aaa_dict['author'] == '' or aaa_dict['citedcount'] == '':
                        continue
                    else:
                        aaa_dict_list.append(aaa_dict)
            post_dict = {
                "authorId": int(c),
                "authorName": a,
                "ikCode": d,
                "papers": aaa_dict_list
            }
            # print(post_dict)
            print(len(aaa_dict_list))
            print_result_list.append(post_dict)
            # print(aaa_dict_list)
            # Push the expert's papers; retry every 10s until the API accepts.
            while aaa_dict_list != []:
                try:
                    # 192.168.1.88:8008
                    post_url = 'http://114.116.19.92:8088/api/reptile/autoSaveExpertPaper'
                    # post_url = 'http://192.168.1.88:8008/api/reptile/autoSaveExpertPaper'
                    headers = {'Content-Type': 'application/json'}
                    resp_json = requests.post(url=post_url, headers=headers, verify=False, data=json.dumps(post_dict)).json()
                    print('推送:', resp_json['msg'])
                    break
                except Exception as e:
                    print(e)
                    print('数据传接口失败!')
                    time.sleep(10)
                    continue
        except Exception as e:
            # Any per-expert failure: log, wait, move on to the next expert.
            print(e)
            time.sleep(30)
            print('出错,重试中!')
            continue
    # Dump everything that was pushed to a spreadsheet for manual review.
    import pandas as pd
    df = pd.DataFrame(print_result_list)
    df.to_excel('experct_data.xlsx', index=False)
# Yahoo Finance: flatten a financial-statement table into dicts.
def deal_table(doc_resp):
    """Parse one fully-expanded Yahoo Finance statement table.

    doc_resp: a pyquery document of the rendered financials page.

    Returns a dict with three keys (keys are runtime identifiers used by the
    caller — do not translate):
      '表头'  – list of column headers (period labels),
      '目录'  – nested dict mirroring the row hierarchy (up to 7 levels a..g),
      '内容'  – flat dict mapping every row label to its list of cell values.

    Each hierarchy level repeats the same pattern: children()[0] holds the
    row's own label + values (text split on newlines: [0] is the label,
    [1:] are the cell values), children()[1] holds the child rows.
    NOTE(review): level nesting below is reconstructed from the repetitive
    per-level code; leaf level g maps to an empty dict.
    """
    all_dict = {}
    resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
    # resp1_table[0] = header row, resp1_table[1] = body rows.
    catalogue_title = pq(resp1_table[0]).text().split('\n')
    doc_items = pq(resp1_table[1]).children()
    catalogue_dict = {}
    content_dict = {}
    for doc_item in doc_items:
        if pq(doc_item).text() == '':
            continue   # skip empty spacer rows
        # Level a: top-level statement line.
        a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
        a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
        content_dict[a] = a_list
        b_dict = {}
        for doc_item1 in pq(doc_item).children()[1]:
            # Level b.
            b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
            if not b:
                continue
            b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
            content_dict[b] = b_list
            c_dict = {}
            for doc_item2 in pq(doc_item1).children()[1]:
                # Level c.
                c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
                if not c:
                    continue
                c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
                content_dict[c] = c_list
                d_dict = {}
                for doc_item3 in pq(doc_item2).children()[1]:
                    # Level d.
                    d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
                    if not d:
                        continue
                    d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
                    content_dict[d] = d_list
                    e_dict = {}
                    for doc_item4 in pq(doc_item3).children()[1]:
                        # Level e.
                        e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
                        if not e:
                            continue
                        e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
                        content_dict[e] = e_list
                        f_dict = {}
                        for doc_item5 in pq(doc_item4).children()[1]:
                            # Level f.
                            f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
                            if not f:
                                continue
                            f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
                            content_dict[f] = f_list
                            g_dict = {}
                            for doc_item6 in pq(doc_item5).children()[1]:
                                # Level g: deepest level, children not descended.
                                g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
                                if not g:
                                    continue
                                g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
                                content_dict[g] = g_list
                                g_dict[g] = {}
                            f_dict[f] = g_dict
                        e_dict[e] = f_dict
                    d_dict[d] = e_dict
                c_dict[c] = d_dict
            b_dict[b] = c_dict
        catalogue_dict[a] = b_dict
    all_dict['表头'] = catalogue_title
    all_dict['目录'] = catalogue_dict
    all_dict['内容'] = content_dict
    return all_dict
# Yahoo Finance: sync six financial-statement tables per stock into MySQL.
def get_content2():
    """For every row in config_finance_data_sync (origin_type = 1), drive a
    headless Chrome through the Yahoo Finance financials page, scrape six
    tables (Income Statement / Balance Sheet / Cash Flow, each annual +
    quarterly, all expanded), flatten them with deal_table(), write the
    result back to MySQL, then ping the sync endpoint.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-certificate-errors')
    # Hide the webdriver automation banner / fingerprint.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('--headless')
    executable_path = r"D:\chrome\chromedriver.exe"
    driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
    # NOTE(review): hard-coded MySQL credentials — consider externalising.
    conn = pymysql.Connect(host='114.115.159.144', port=3306, user='root', passwd='zzsn9988', db='clb_project', charset='utf8')
    cursor = conn.cursor()
    sql1 = """SELECT id, stock_code from config_finance_data_sync WHERE origin_type = 1"""
    cursor.execute(sql1)
    result_data = cursor.fetchall()
    for data in result_data:
        try:
            data_list = list(data)
            print(data_list)
            stock = data_list[1]   # ticker symbol
            orc_id = data_list[0]  # row id, used for the write-back and sync call
            url = f'https://finance.yahoo.com/quote/{stock}/financials?p={stock}'
            try:
                print(f'正在采集:{url}')
                driver.get(url)
                time.sleep(8)
                # Table 1: annual income statement ("Expand All" may be absent).
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp1 = pq(driver.page_source)
                financials1 = deal_table(doc_resp1)
                # Table 2: quarterly income statement.
                driver.find_element_by_xpath('//div/span[text()="Quarterly"]').click()
                time.sleep(8)
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp2 = pq(driver.page_source)
                financials2 = deal_table(doc_resp2)
                # Table 3: annual balance sheet.
                driver.find_element_by_xpath('//div/span[text()="Balance Sheet"]').click()
                time.sleep(8)
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp3 = pq(driver.page_source)
                financials3 = deal_table(doc_resp3)
                # Table 4: quarterly balance sheet.
                driver.find_element_by_xpath('//div/span[text()="Quarterly"]').click()
                time.sleep(8)
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp4 = pq(driver.page_source)
                financials4 = deal_table(doc_resp4)
                # Table 5: annual cash flow.
                driver.find_element_by_xpath('//div/span[text()="Cash Flow"]').click()
                time.sleep(8)
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp5 = pq(driver.page_source)
                financials5 = deal_table(doc_resp5)
                # Table 6: quarterly cash flow.
                driver.find_element_by_xpath('//div/span[text()="Quarterly"]').click()
                time.sleep(8)
                try:
                    driver.find_element_by_xpath('//div/span[text()="Expand All"]').click()
                    time.sleep(8)
                except:
                    pass
                doc_resp6 = pq(driver.page_source)
                financials6 = deal_table(doc_resp6)
                financials_dict = {
                    '表1': financials1,
                    '表2': financials2,
                    '表3': financials3,
                    '表4': financials4,
                    '表5': financials5,
                    '表6': financials6,
                }
                # Join the six '目录' hierarchies with '&&&&' separators.
                mu_lus = ''
                for i in range(1, 7):
                    mu_lu = financials_dict[f'表{i}']['目录']
                    mu_lu = json.dumps(mu_lu, ensure_ascii=False, indent=4)
                    mu_lus += mu_lu + '&&&&'
                level_relation = mu_lus[:-4]   # drop trailing '&&&&'
                # Flatten headers + contents of all six tables into one text blob.
                financials = ''
                for i in range(1, 7):
                    a_list = financials_dict[f'表{i}']['表头']
                    for a in a_list:
                        financials += a + '\n'
                    b_dict = financials_dict[f'表{i}']['内容']
                    for key, values in b_dict.items():
                        financials += key + '\n'
                        for b in values:
                            financials += b + '\n'
                    financials += '&&&&' + '\n'
                financials = financials.strip()
                # Drop trailing separator and normalise the inter-table breaks.
                content = financials[:-4].strip().replace('\n&&&&\n', '&&&&')
                sql = "UPDATE config_finance_data_sync SET level_relation=%s, content=%s WHERE ID = %s"
                val = (level_relation, content, orc_id)
                cursor.execute(sql, val)
                conn.commit()
                # Notify the sync service; failed ids are appended to a log file.
                get_url = f'http://114.115.215.250:8089/synFinanceData/yh?id={orc_id}'
                try:
                    resp = requests.get(get_url)
                except:
                    with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8') as f:
                        f.write(orc_id + '\n')
            except:
                print(f'采集:{url}失败')
                pass
        except:
            # Outer failure (e.g. DB error): back off one hour, then continue.
            time.sleep(60 * 60)
            print('出错,重试中!')
            continue
    driver.close()
if __name__ == '__main__':
    # parse_excel()    # one-off: load the expert list from Excel into MongoDB
    get_content1()
    # get_content2()   # Yahoo Finance financial-statement sync
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论