丁双波 / zzsn_spider · Commits

Commit 9ed327a7, authored Feb 06, 2024 by 薛凌堃
Parent: b1d1cafd
Showing 2 changed files, with 1 addition and 405 deletions (+1, -405)
comData/Tyc/CorePerson2.py         +0 -404
comData/Tyc/baseinfouptime_tyc.py  +1 -1
comData/Tyc/CorePerson2.py (deleted, 100644 → 0)
# Collect the remaining core-personnel records:
# first fetch each company's Tianyancha id, then use that id to pull its core personnel.
import datetime
import json
import random
import time

import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from lxml import etree
from retry import retry

from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
    'Cookie': 'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=581fac60bfe911eeb3fc09360952f0ba; ssuid=1162354300; _ga=GA1.2.1333101206.1706683384; _gid=GA1.2.604055726.1706683384; tyc-user-phone=%255B%252218837538506%2522%252C%2522152%25203756%25200528%2522%255D; HWWAFSESID=b306585832394f6d3b; HWWAFSESTIME=1706751848880; csrfToken=DUIyVpHXj6o8vOwT9idnR4hd; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706671944,1706751850; bannerFlag=true; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2215822283785%22%2C%22userId%22%3A%22269298908%22%7D; tyc-user-info-save-time=1706751947161; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgyMjI4Mzc4NSIsImlhdCI6MTcwNjc1MTk0NiwiZXhwIjoxNzA5MzQzOTQ2fQ.W-hQ1QBEoDkHYqcSFjTEukemZJpHi-iYzqqnpYR-uaKi6ecS3HNp_dUs8UuzSiYyZH4WQjc-98Z-3hysQGEr_Q; searchSessionId=1706751998.12338612; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22269298908%22%2C%22first_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQ5MzJlZjg1NWEtMGVkMTRiODAyY2YzMDE4LTNlNjA0ODA5LTIwNzM2MDAtMThkNWQ5MzJlZjkyMGEiLCIkaWRlbnRpdHlfbG9naW5faWQiOiIyNjkyOTg5MDgifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22269298908%22%7D%2C%22%24device_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706752204',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
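# The Cookie above carries a logged-in Tianyancha session; auth_token is a
# time-limited JWT, so the script cannot run as-is once it expires. A hypothetical
# way to supply a fresh value without editing the source:
#   import os
#   headers['Cookie'] = os.environ.get('TYC_COOKIE', headers['Cookie'])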
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
@retry(tries=3, delay=1)
def get_html(tycid):
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code != 200:
        raise RuntimeError(f'status code {response.status_code}')
        # return -1
    # Parse twice: lxml for the xpath lookup, BeautifulSoup for the class_-based find() calls.
    tree = etree.HTML(response.content)
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        model = tree.xpath('//*[@id="page-root"]/div[3]/div[1]/div[3]/div/div[3]/div[2]/div[2]/div[3]/div/div[1]/div[1]/span/h3')[0]
        corp = model.text
        if corp == '主要人员':
            tmp_field = soup.find('div', class_='index_dim-tab-container__kysLO').find('div', class_='dim-tab-root').find('span').text
            if '最新公示' in tmp_field:
                total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
                return int(total)
            return 0
        else:
            return 0
    except:
        return -1
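# get_html's return value drives the endpoint choice in doJob below:
#   -1   -> the company page could not be fetched or parsed (requeue),
#    0   -> the "主要人员" block has no "最新公示" tab (unlisted company; staff endpoint),
#    n>0 -> the executive count shown on the "最新公示" tab (listed-company endpoints).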
@retry(tries=3, delay=1)
def get_page(url):
    ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers, proxies=ip)
    time.sleep(1)
    if res.status_code != 200:
        raise RuntimeError(f'status code {res.status_code}')
    total_page_ = res.json()['data']['total']
    return total_page_
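# Usage sketch for get_page (hypothetical gid; the endpoint is assumed to return a
# JSON body shaped like {"data": {"total": N, "dataList": [...]}}, which is how
# doJob reads it below):
#   t = int(time.time() * 1000)
#   total = get_page(f'https://capi.tianyancha.com/cloud-company-background/'
#                    f'company/dim/staff?_={t}&gid=123456&pageSize=20&pageNum=1')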
def doJob():
    while True:
        # Look up the company's basic info using the social credit code pulled from Redis.
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # If Redis has run out of data, wait.
        social_code = '91150400701461969E'
        if social_code is None:
            time.sleep(20)
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # Company missing from the scrape table: fall back to the base table.
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                count = 0
                # Write the company into the scrape table.
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s,%s)"
                cursor_.execute(insert, (com_name, xydm))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
                # id = data[0]
                # com_name = data[1]
                # xydm = data[2]
                # tycid = data[11]
            if tycid is None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # todo: write the resolved Tianyancha id back to the database
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            # todo: decide which API endpoint to use first
            try:
                charge = get_html(tycid)
            except Exception as e:
                charge = -1
                log.info(e)
                baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                log.info(f'{id}---{xydm}------没有高管信息')
            time.sleep(2)
            t = int(time.time() * 1000)
            if charge == -1:
                # Put back into redis: the company page itself could not be fetched.
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                log.info(f"{id}---{xydm}----{tycid}----请求失败")
                break
            elif charge == 0:
                log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page1 = get_page(url1)
                except:
                    total_page1 = 0
                url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                total_page = total_page1
                flag = 2
            else:
                log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page2 = get_page(url2)
                except:
                    total_page2 = 0
                time.sleep(1)
                try:
                    total_page3 = get_page(url3)
                except:
                    total_page3 = 0
                if total_page2 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page2
                    flag = 1
                elif total_page3 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page3
                    flag = 3
                else:
                    total_page = 0
                    flag = 0
                    baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
                    continue
            if total_page == 0:
                # Put back into redis: the total count could not be fetched.
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                continue
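            # flag recap: 1 -> A-share listed endpoint (noRepeatSeniorExecutive),
            # 3 -> HK listed endpoint (getHkNoRepeatSeniorExecutive),
            # 2 -> unlisted staff endpoint; flag 0 (page/API mismatch) was requeued above.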
            # todo: compute the page count
            for page in range(1, int((total_page / 20) + 1) + 1):
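                # int((total_page / 20) + 1) + 1 walks ceil(total_page / 20) pages,
                # but requests one extra, empty page whenever total_page is an exact
                # multiple of 20; math.ceil(total_page / 20) + 1 would be the exact bound.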
                for c in range(3):
                    ip = baseCore.get_proxy()
                    url_ = url.format(t, tycid, page)
                    res = requests.get(url_, headers=headers, proxies=ip)  # , verify=False
                    time.sleep(1)
                    if res.status_code == 200:
                        break
                    else:
                        if c == 2:
                            res = ''
                            break
                        continue
                if res:
                    pass
                else:
                    # Put back into redis after three failed attempts for this page.
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                    continue
                try:
                    list_all = res.json()['data']['dataList']
                except:
                    list_all = res.json()['data']['result']
                if list_all:
                    pass
                else:
                    log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                if flag == 1:
                    for one_info in list_all:
                        name = one_info['name']
                        sex = one_info['sex']
                        education = one_info['education']
                        position = one_info['position']
                        Salary = one_info['salary']
                        # todo: get the current year
                        now = datetime.datetime.now()
                        year = now.year
                        try:
                            birthYear = year - int(one_info['age'])
                        except:
                            birthYear = ''
                        StockKeepings = one_info['numberOfShares']
                        currentTerm = one_info['term']
                        personInfo = one_info['resume']
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                        # list_all_2.append(dic_json_img)
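                # Note: dic_json_img (which adds the "头像" avatar field) is assembled but
                # never appended; its Excel export at the bottom of the file is commented out too.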
                elif flag == 3:
                    for one_info in list_all:
                        name = one_info['personal_name']
                        try:
                            sex = one_info['gender2']
                        except:
                            sex = ''
                        education = ''
                        position = one_info['position_name']
                        Salary = ''
                        try:
                            birthYear = one_info['year_of_birth']
                        except:
                            birthYear = ''
                        personInfo = one_info['resume_cn']
                        try:
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                        except:
                            currentTerm = ''
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm + '至-',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                else:
                    for one_info in list_all:
                        name = one_info['name']
                        try:
                            position = one_info['typeSore']
                        except:
                            position = ''
                        person_id = one_info['id']
                        person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                        # person_res = requests.get(person_url, headers=headers, proxies=ip)
                        person_res = requests.get(person_url, headers=headers)
                        person_soup = BeautifulSoup(person_res.content, 'html.parser')
                        try:
                            personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                        except:
                            personInfo = ''
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
            # print(list_one_info)
            json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
            else:
                pass
            response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300, verify=False)
            print(response.text)
            log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # Put back into redis on any unexpected failure.
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
        time.sleep(5)
        # break


# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx', index=False)
if __name__ == "__main__":
    doJob()
\ No newline at end of file
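For reference, the payload doJob posts to /sync/executive is a JSON array of the
dic_json records built above; a single-element sample (all field values illustrative):

    [{"socialCreditCode": "91150400701461969E", "name": "张三", "sex": "男",
      "education": "本科", "position": "董事长", "salary": "--", "birthYear": 1970,
      "shareNum": "100", "shareRatio": "", "benefitShare": "",
      "currentTerm": "2020-01-01至-", "personInfo": "……", "sort": "1"}]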
comData/Tyc/baseinfouptime_tyc.py

@@ -221,7 +221,7 @@ def spiderinfo(company_url, receptname, file_name):
         if matched:
             sourceUpdateTime = sourceUpdateTime_
         else:
-            sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d")
+            sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
     except:
         redaytowork(com_name, social_code, file_name)
     aa_dict = {
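The only functional change in this file: the parsed source update time now keeps
hour, minute, and second precision instead of being truncated to the date. For
example, datetime(2024, 2, 6, 9, 30, 5) formats as '2024-02-06 09:30:05' under the
new pattern rather than '2024-02-06'.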