丁双波 / zzsn_spider / Commits / 862e97ab

Commit 862e97ab, authored Jan 31, 2024 by 薛凌堃
Commit message: 1/31
Parent: 1d1053c8

Showing 8 changed files with 1012 additions and 393 deletions (+1012 -393)
comData/Tyc/CorePerson.py        +275  -147
comData/Tyc/CorePerson2.py       +284  -186
comData/YanBao/resentYanbao.py   +37   -35
comData/dingzhi/dfsm_sasac.py    +145  -0   (new file)
comData/dingzhi/gzyw_sasac.py    +157  -0   (new file)
comData/dingzhi/zzcx.py          +52   -0   (new file)
comData/policylaw/ClassTool.py   +2    -1
test.py                          +60   -24
comData/Tyc/CorePerson.py  (+275 -147)

@@ -2,32 +2,99 @@
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import os
import subprocess
import sys
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()

headers = {
    'Cookie': 'HWWAFSESID=b6312a4594bea18413c; HWWAFSESTIME=1686818921445; csrfToken=e7sNDKWelJwlcjnm6Rlny887; TYCID=6ff6bc600b5911ee89d35bf79a73a3b1; bannerFlag=true; ssuid=1534238432; refresh_page=0; _ga=GA1.2.1790752229.1688467828; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22307016917%22%2C%22first_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4YmUzZTMzN2U0YmYtMGQ4NTcxNmQzNjZlNDQtMjYwMzFkNTEtMTA0OTA4OC0xODhiZTNlMzM3ZjE5ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNzAxNjkxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22307016917%22%7D%2C%22%24device_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%7D; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=7; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1693986307; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213592481839%22%7D; tyc-user-info-save-time=1693986377592; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5Mzk4NjM3NywiZXhwIjoxNjk2NTc4Mzc3fQ.xeK54nMtB5wt7ipdOjhrzdplT1azvezrTuoD1b8i3OguqMB97ZOR1pFbRsP7vsKRdZ3Fsf5Y5ZqlmRKAVHGraA; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1693986412',
    # 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
    'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}

cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0


def get_proxy(ip_num):
    sql = "select proxy from clb_proxy"
    cursor_.execute(sql)
    proxy_lists = cursor_.fetchall()
    cnx_.commit()
    ip_list = []
    for proxy_ in proxy_lists:
        ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
    proxy_list = []
    for str_ip in ip_list:
        str_ip_list = str_ip.split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": str_ip_list[0],
            "port": str_ip_list[1],
        }
        proxy = {
            "http": proxyMeta,
            "https": proxyMeta
        }
        proxy_list.append(proxy)
    return proxy_list[ip_num]


@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
    url = f"https://www.tianyancha.com/company/{tycid}"
    ip = get_proxy(ip_num)
    response = requests.get(url=url, headers=headers, proxies=ip)
    if response.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url, ip_num):
    ip = get_proxy(ip_num)
    res = requests.get(url=url, headers=headers, proxies=ip)
    if res.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        # social_code = '9135020056842712XB'
        social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue

@@ -35,15 +102,29 @@ def doJob():
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                pass
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                #数据重新塞入redis
                baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
                continue
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)

@@ -58,28 +139,111 @@ def doJob():
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                ip = baseCore.get_proxy()
                # res = requests.get(url,headers=headers,proxies=ip)  # ,verify=False
                res = requests.get(url, headers=headers)  # ,verify=False
                time.sleep(1)
                list_all = res.json()['data']['dataList']
                #todo:先确定接口走哪个
                try:
                    charge = get_html(tycid, ip_num)
                except Exception as e:
                    charge = -1
                    log.info(e)
                total_page = 0
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    # 获取当前进程pid
                    current_pid = baseCore.getPID()
                    # todo: 重新启动新进程,杀死当前进程
                    subprocess.Popen([sys.executable] + sys.argv)
                    os.kill(current_pid, 9)
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1, ip_num)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2, ip_num)
                    except:
                        total_page2 = 0
                    time.sleep(2)
                    try:
                        total_page3 = get_page(url3, ip_num)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    continue
                #todo:获取页数
                time.sleep(2)
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        ip = baseCore.get_proxy()
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers, proxies=ip)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']

@@ -135,131 +299,95 @@ def doJob():
                            num = num + 1
                            list_one_info.append(dic_json)
                        # list_all_2.append(dic_json_img)
                    else:
                        t = int(time.time() * 1000)
                        url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                        ip = baseCore.get_proxy()
                        # res = requests.get(url, headers=headers, proxies=ip)  # ,verify=False
                        res = requests.get(url, headers=headers)  # ,verify=False
                        time.sleep(1)
                        list_all = res.json()['data']['dataList']
                        if list_all:
                            for one_info in list_all:
                                name = one_info['personal_name']
                                try:
                                    sex = one_info['gender2']
                                except:
                                    sex = ''
                                education = ''
                                position = one_info['position_name']
                                Salary = ''
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
                                sex = one_info['gender2']
                            except:
                                sex = ''
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": '',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        t = int(time.time() * 1000)
                        url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                        ip = baseCore.get_proxy()
                        # res = requests.get(url, headers=headers, proxies=ip)  # ,verify=False
                        res = requests.get(url, headers=headers)  # ,verify=False
                        time.sleep(1)
                        list_all = res.json()['data']['result']
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    sex = one_info['sex']
                                except:
                                    sex = ''
                                try:
                                    education = one_info['education']
                                except:
                                    education = ''
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                try:
                                    Salary = one_info['salary']
                                except:
                                    Salary = ''
                                birthYear = ''
                                try:
                                    shareRatio = one_info['percent']
                                except:
                                    shareRatio = ''
                                try:
                                    benefitShare = one_info['finalBenefitShares']
                                except:
                                    benefitShare = ''
                                try:
                                    currentTerm = one_info['term']
                                except:
                                    currentTerm = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
                                person_res = requests.get(person_url, headers=headers)
                                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                                try:
                                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                                except:
                                    personInfo = ''
                                try:
                                    person_img = one_info['logo']
                                except:
                                    person_img = '--'
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": sex,
                                    "education": education,
                                    "position": position,
                                    "salary": Salary,
                                    "birthYear": birthYear,
                                    "shareNum": '',
                                    "shareRatio": shareRatio,
                                    "benefitShare": benefitShare,
                                    "currentTerm": currentTerm,
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": sex,
                                    "education": education,
                                    "position": position,
                                    "salary": Salary,
                                    "birthYear": birthYear,
                                    "shareNum": '',
                                    "shareRatio": shareRatio,
                                    "benefitShare": benefitShare,
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
                                    "sort": str(num)
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                        else:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
                                person_res = requests.get(person_url, headers=headers)
                                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                                try:
                                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                                except:
                                    personInfo = ''
                                try:
                                    person_img = one_info['logo']
                                except:
                                    person_img = '--'
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
                                    "sort": str(num)
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    continue

@@ -272,7 +400,7 @@ def doJob():
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
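Note on the page loop above: the bound range(1, int((total_page / 20) + 1) + 1) is derived from the fixed pageSize of 20 used by these endpoints. The sketch below is an illustration only (page_count is a hypothetical helper, not part of this commit) showing the equivalent ceiling calculation.

import math

def page_count(total: int, page_size: int = 20) -> int:
    # Pages needed to cover `total` records at `page_size` records per page.
    return math.ceil(total / page_size)

# Same bound as range(1, int((total_page / 20) + 1) + 1) for most inputs,
# except that the original requests one extra (empty) page when total_page
# is an exact multiple of 20, e.g. total_page = 40 -> pages 1..3 instead of 1..2.
for page in range(1, page_count(57) + 1):  # iterates over pages 1, 2, 3
    pass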
comData/Tyc/CorePerson2.py  (+284 -186)

@@ -7,6 +7,8 @@ import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()

@@ -19,77 +21,207 @@ headers = {
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
requests.adapters.DEFAULT_RETRIES = 5
ip_num = 0


@retry(tries=3, delay=1)
def get_html(tycid):
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        pass
    else:
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url):
    # ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers)
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        social_code = '91510000207312079C'
        # social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue
        if 'ZZSN' in social_code:
            continue
        start = time.time()
        try:
            # data = baseCore.getInfomation(social_code)
            # if len(data) != 0:
            #     pass
            # else:
            #     #数据重新塞入redis
            #     baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
            #     continue
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # 数据重新塞入redis
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            # id = data[0]
            # com_name = data[1]
            # xydm = data[2]
            tycid = ''
            # tycid = data[11]
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(social_code)
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # todo:写入数据库
                        # updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        # cursor_.execute(updateSql)
                        # cnx_.commit()
                        #
                        # todo:写入数据库
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except Exception as e:
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            # count = data[17]
            log.info(f"---{social_code}----{tycid}----开始采集核心人员")
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                ip = baseCore.get_proxy()
                res = requests.get(url, headers=headers, proxies=ip, verify=False)
                # todo:先确定接口走哪个
                try:
                    charge = get_html(tycid)
                except Exception as e:
                    charge = -1
                    log.info(e)
                time.sleep(2)
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2)
                    except:
                        total_page2 = 0
                    time.sleep(1)
                    list_all = res.json()['data']['dataList']
                    try:
                        total_page3 = get_page(url3)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    break
                # todo:获取页数
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        # ip = baseCore.get_proxy()
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']
                            education = one_info['education']
                            position = one_info['position']
                            Salary = one_info['salary']
                            #todo:获取当前年份
                            # todo:获取当前年份
                            now = datetime.datetime.now()
                            year = now.year
                            try:

@@ -105,183 +237,149 @@ def doJob():
                            except:
                                person_img = '--'
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            dic_json_img = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "头像": person_img,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                            # list_all_2.append(dic_json_img)
                    else:
                        t = int(time.time() * 1000)
                        url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                        ip = baseCore.get_proxy()
                        res = requests.get(url, headers=headers, proxies=ip, verify=False)
                        time.sleep(1)
                        list_all = res.json()['data']['dataList']
                        if list_all:
                            for one_info in list_all:
                                name = one_info['personal_name']
                                try:
                                    sex = one_info['gender2']
                                except:
                                    sex = ''
                                education = ''
                                position = one_info['position_name']
                                Salary = ''
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
                                sex = one_info['gender2']
                            except:
                                sex = ''
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": '',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        t = int(time.time() * 1000)
                        url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                        ip = baseCore.get_proxy()
                        res = requests.get(url, headers=headers, proxies=ip, verify=False)
                        time.sleep(1)
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        for one_info in list_all:
                            name = one_info['name']
                        try:
                            list_all = res.json()['data']['result']
                        except Exception as e:
                            log.info(res.json())
                            continue
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    sex = one_info['sex']
                                except:
                                    sex = ''
                                try:
                                    education = one_info['education']
                                except:
                                    education = ''
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                try:
                                    Salary = one_info['salary']
                                except:
                                    Salary = ''
                                birthYear = ''
                                try:
                                    shareRatio = one_info['percent']
                                except:
                                    shareRatio = ''
                                try:
                                    benefitShare = one_info['finalBenefitShares']
                                except:
                                    benefitShare = ''
                                try:
                                    currentTerm = one_info['term']
                                except:
                                    currentTerm = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                person_res = requests.get(person_url, headers=headers, proxies=ip)
                                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                                try:
                                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                                except:
                                    personInfo = ''
                                try:
                                    person_img = one_info['logo']
                                except:
                                    person_img = '--'
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": sex,
                                    "education": education,
                                    "position": position,
                                    "salary": Salary,
                                    "birthYear": birthYear,
                                    "shareNum": '',
                                    "shareRatio": shareRatio,
                                    "benefitShare": benefitShare,
                                    "currentTerm": currentTerm,
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": sex,
                                    "education": education,
                                    "position": position,
                                    "salary": Salary,
                                    "birthYear": birthYear,
                                    "shareNum": '',
                                    "shareRatio": shareRatio,
                                    "benefitShare": benefitShare,
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
                                    "sort": str(num)
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                            try:
                                position = one_info['typeSore']
                            except:
                                position = ''
                            person_id = one_info['id']
                            person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                            # person_res = requests.get(person_url, headers=headers, proxies=ip)
                            person_res = requests.get(person_url, headers=headers)
                            person_soup = BeautifulSoup(person_res.content, 'html.parser')
                            try:
                                personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                            except:
                                personInfo = ''
                            try:
                                person_img = one_info['logo']
                            except:
                                person_img = '--'
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": '',
                                "education": '',
                                "position": position,
                                "salary": '',
                                "birthYear": '',
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": '',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            dic_json_img = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": '',
                                "education": '',
                                "position": position,
                                "salary": '',
                                "birthYear": '',
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": '',
                                "personInfo": personInfo,
                                "头像": person_img,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    log.info(f'---{social_code}---无高管信息---')
                    continue
                else:
                    pass
                response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300, verify=False)
                print(response.text)
                log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # df_img = pd.DataFrame(list_all_2)
    # df_img.to_excel('企业主要人员-头像.xlsx',index=False)


if __name__ == "__main__":
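For context, get_html and get_page in both CorePerson scripts rely on the retry package: raising inside the decorated function makes @retry(tries=3, delay=1) call it again. A minimal sketch of that pattern (fetch_json is a hypothetical stand-in, not code from this commit):

from retry import retry
import requests

@retry(tries=3, delay=1)  # on any exception, re-run up to 3 times, sleeping 1 s between attempts
def fetch_json(url, headers):
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code != 200:
        raise RuntimeError(f'unexpected status {res.status_code}')  # triggers the next retry attempt
    return res.json()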
comData/YanBao/resentYanbao.py  (+37 -35)

@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
            break
        except Exception as e:
            time.sleep(3)
            log.info(e)
            continue
    if page_size < 1:

@@ -206,7 +207,8 @@ def download(data, order_by,header):
        come = data['come']
    except:
        come = ''
    if publishDate < '2024-01-29':
        return
    tf_url = add_check_url(sourceAddress)
    if tf_url:
        dic_result = {

@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
    #     qianyanzhishiku()
    # except Exception as e:
    #     pass
    try:
        log.info('shijiejingjiluntan')
        shijiejingjiluntan()
    except Exception as e:
        log.info(e)
        pass
    # try:
    #     log.info('shijiejingjiluntan')
    #     shijiejingjiluntan()
    # except Exception as e:
    #     log.info(e)
    #     pass

    # try:
    #     log.info('dongfangcaifu')
    #     dongfangcaifu()

@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu4')
    #     dongfangcaifu4()
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu5')
    #     dongfangcaifu5()
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu6')
    #     dongfangcaifu6()
    # except Exception as e:
    #     log.info(e)
    #     pass
    #
    # try:
    #     log.info('dongfangcaifu7')
    #     dongfangcaifu7()
    # except Exception as e:
    #     log.info(e)
    #     pass

    try:
        log.info('dongfangcaifu4')
        dongfangcaifu4()
    except Exception as e:
        log.info(e)
        pass

    try:
        log.info('dongfangcaifu5')
        dongfangcaifu5()
    except Exception as e:
        log.info(e)
        pass

    try:
        log.info('dongfangcaifu6')
        dongfangcaifu6()
    except Exception as e:
        log.info(e)
        pass

    try:
        log.info('dongfangcaifu7')
        dongfangcaifu7()
    except Exception as e:
        log.info(e)
        pass
comData/dingzhi/dfsm_sasac.py  (new file, +145)

import requests
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


def two_dfsm_mtgc():
    info_list = []
    """
    地方扫描
    """
    url_list = [
        'http://www.sasac.gov.cn/n2588025/n2588129/index.html',
        # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
    ]
    for url in url_list:
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378,int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' in href:
                        href = href
                    else:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source,pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
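The page count in two_dfsm_mtgc() comes from string-splitting the td.pages element on maxPageNum<id>=. A regex-based equivalent, shown only as a hedged sketch (max_page_num is hypothetical and assumes the element text really contains maxPageNum<id>=<count>, as the split implies):

import re

def max_page_num(pages_html: str) -> int:
    # Pull <count> out of markup containing 'maxPageNum<id>=<count>' (quoted or not),
    # which the script otherwise extracts with split(f'maxPageNum{pages_tag}=').
    m = re.search(r'maxPageNum\d+\s*=\s*"?(\d+)', pages_html)
    return int(m.group(1)) if m else 1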
comData/dingzhi/gzyw_sasac.py  (new file, +157)

import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}

#国资要闻
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td',id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages) + 1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            #http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' in href:
                    href = href
                else:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source,pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content,6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
comData/dingzhi/zzcx.py  (new file, +52)

"""
中证智能财讯
"""
import json

import requests
from bs4 import BeautifulSoup


def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
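zzcx() reads data.pages from the first listES response and then re-posts the same payload for every page. A compact generator doing the same walk, shown only as a sketch (fetch_page and iter_records are hypothetical helpers; the endpoint and payload shape are taken from the code above):

import json
import requests

LIST_URL = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'

def fetch_page(page_no, headers):
    # One page of the listES feed, using the same JSON body zzcx() builds.
    payload = json.dumps({"pageNo": page_no, "pageSize": 15, "statusList": [0], "keyword": ""})
    return requests.post(LIST_URL, data=payload, headers=headers).json()['data']

def iter_records(headers):
    first = fetch_page(1, headers)
    yield from first['records']
    for page_no in range(2, int(first['pages']) + 1):
        yield from fetch_page(page_no, headers)['records']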
comData/policylaw/ClassTool.py  (+2 -1)

@@ -85,7 +85,8 @@ class ClassTool():
                   '来源': dic_news['labels'][0]['relationName'],
                   '创建时间': dic_news['createDate'],
                   '带标签内容': dic_news['contentWithTag'][:100],
                   '发布时间': dic_news['publishDate']
                   '发布时间': dic_news['publishDate'],
                   '标题': dic_news['title']
                   }
        self.db_storage.insert_one(aaa_dic)
test.py  (+60 -24)

@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
#
#     code = use_ocr(out_img_path)
#     验证码输入框元素.send_keys(code)

# import requests
# headers = {
#     # 'Accept': '*/*',
#     # 'Accept-Encoding': 'gzip, deflate, br',
#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
#     # 'Cache-Control': 'no-cache',
#     # 'Connection': 'keep-alive',
#     # 'Host': 'search-api-web.eastmoney.com',
#     # 'Pragma': 'no-cache',
#     # 'Sec-Fetch-Dest': 'script',
#     # 'Sec-Fetch-Mode': 'no-cors',
#     # 'Sec-Fetch-Site': 'same-site',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
#     # 'sec-ch-ua-mobile': '?0',
#     # 'sec-ch-ua-platform': '"Windows"'
# }
# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
#
#
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
# with open('./a.pdf','wb') as f:
#     f.write(res.content)

import datetime
import json

import requests

headers = {
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    # 'Cache-Control': 'no-cache',
    # 'Connection': 'keep-alive',
    # 'Host': 'search-api-web.eastmoney.com',
    # 'Pragma': 'no-cache',
    # 'Sec-Fetch-Dest': 'script',
    # 'Sec-Fetch-Mode': 'no-cors',
    # 'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
    # 'sec-ch-ua-mobile': '?0',
    # 'sec-ch-ua-platform': '"Windows"'
}
url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"

# res = requests.get(url).text[1:-1]
res = requests.get(url=url, headers=headers)
with open('./a.pdf', 'wb') as f:
    f.write(res.content)

import pymongo
from base import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软['数据源_0504']
datas = db_storage.find({'postCode': '2'}).limit(5)
for data in datas:
    title = data['titleForeign']
    contentWithTag = data['richTextForeign']
    summary = data['contentForeign']
    dic_info = {
        'title': title,
        'summary': summary,
        'contentWithTag': contentWithTag
    }
    headers = {
        'Content-Type': 'application/json',
    }
    dic_info_ = json.dumps(dic_info)
    # print(dic_info_)
    # with open('./data.json','w') as f:
    #     f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)