Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
f65aad31
提交
f65aad31
authored
8月 17, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查-股东信息
上级
5753a353
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
351 行增加
和
0 行删除
+351
-0
gudongxinxi.py
enterprise_tyc/gudongxinxi.py
+351
-0
没有找到文件。
enterprise_tyc/gudongxinxi.py
0 → 100644
浏览文件 @
f65aad31
# -*- coding: utf-8 -*-
import
json
import
requests
,
time
from
bs4
import
BeautifulSoup
import
urllib3
from
retry
import
retry
from
base
import
BaseCore
from
classtool
import
Token
,
sendData
,
Driver
,
Login
from
enterprise_tyc.getTycId
import
getTycIdByDB
# Shared project core object; the rest of this module uses its logger,
# its Redis client (`baseCore.r`) and its database handles.
baseCore = BaseCore.BaseCore()

# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

log = baseCore.getLogger()

# Project helpers: token/cookie bookkeeping, browser-driver factory, login flow.
token = Token()
edge = Driver()
login = Login()

# Database handles re-exported from baseCore.
# NOTE(review): the trailing underscore is swapped relative to the baseCore
# attribute names (cnx_ <- baseCore.cnx, cnx <- baseCore.cnx_) — confirm which
# database each pair points at before using them.
cnx_, cursor_ = baseCore.cnx, baseCore.cursor
cnx, cursor = baseCore.cnx_, baseCore.cursor_

# Module-level scratch lists; nothing in this file reads them.
list_all_1, list_all_2 = [], []
@retry(tries=3, delay=1)
def get_html(tycid, driver):
    """Open a company's Tianyancha page and read the shareholder count shown on it.

    The count is parsed from the first ``<span>`` of the ``dim-tab-root`` tab
    bar inside the ``data-dim="holder"`` section: the text following the
    marker '股东信息' or, if present, '最新公示' (the latter wins when both occur).

    Args:
        tycid: Tianyancha company id, interpolated into the company URL.
        driver: browser driver exposing ``get(url=...)`` and ``page_source``.

    Returns:
        int: the shareholder count on success.
        -1: locating the holder section raised an exception.
        -2: the page has no ``data-dim="holder"`` section.
        0: the section exists but the count could not be read or parsed.
        None: the section title contains neither '股东信息' nor '主要股东'.

    Exceptions from ``driver.get`` / ``page_source`` are not caught here, so
    the ``@retry`` decorator re-runs the whole function for those.
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    driver.get(url=url)
    # Fixed pause between navigation and reading the DOM.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        div_part = soup.find('div', attrs={'data-dim': 'holder'})
    except Exception:
        return -1
    if div_part is None:
        return -2

    try:
        tmp_field = div_part.find('h3', class_='dimHeader_main-title-txt__GPoaZ').text
        if '股东信息' in tmp_field:
            log.info('股东信息')
        elif '主要股东' in tmp_field:
            # Otherwise the section is the "major shareholders" variant.
            log.info('主要股东')
        else:
            # Unrecognised section title: no count is reported.
            return None

        # Look the tab label up once instead of re-walking the DOM per check.
        tab_text = div_part.find('div', class_='dim-tab-root').find('span').get_text()
        total = None
        # Order matters: a '最新公示' match overrides a '股东信息' match.
        for marker in ('股东信息', '最新公示'):
            if marker in tab_text:
                total = tab_text.split(marker)[1].replace(' ', '')
        if total is None:
            # Neither marker present in the tab label.
            return 0
        return int(total)
    except Exception:
        # Missing elements (AttributeError on None) or a non-numeric count.
        return 0
@retry(tries=5, delay=3)
def get_page(url, s, headers):
    """GET a holder-list endpoint and return its total together with the parsed body.

    Args:
        url: fully formatted endpoint URL.
        s: session-like object exposing ``get(url=..., headers=..., timeout=...)``.
        headers: request headers to send.

    Returns:
        tuple: ``(total, data_page)`` where ``total`` is ``data_page['data']['total']``
        and ``data_page`` is the decoded JSON response.

    Raises:
        RuntimeError: the response status is not 200.
        Exception: request errors, JSON decoding errors, or a response that
            lacks ``data.total`` propagate unchanged. Any raised exception makes
            the ``@retry`` decorator call the function again.
    """
    res = s.get(url=url, headers=headers, timeout=(5, 10))
    if res.status_code != 200:
        # An explicit exception instead of a bare `raise` with no active
        # exception, which only failed by accident and carried no context.
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    data_page = res.json()
    total_page_ = data_page['data']['total']
    return total_page_, data_page
@retry(tries=5, delay=3)
def get_page1(url, s, headers):
    """GET a holder-list endpoint whose total is nested under ``data.stockHolder``.

    Args:
        url: fully formatted endpoint URL.
        s: session-like object exposing ``get(url=..., headers=..., timeout=...)``.
        headers: request headers to send.

    Returns:
        tuple: ``(total, data_page)`` where ``total`` is
        ``data_page['data']['stockHolder']['total']`` and ``data_page`` is the
        decoded JSON response.

    Raises:
        RuntimeError: the response status is not 200.
        Exception: request errors, JSON decoding errors, or a response that
            lacks ``data.stockHolder.total`` propagate unchanged. Any raised
            exception makes the ``@retry`` decorator call the function again.
    """
    res = s.get(url=url, headers=headers, timeout=(5, 10))
    if res.status_code != 200:
        # An explicit exception instead of a bare `raise` with no active
        # exception, which only failed by accident and carried no context.
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    data_page = res.json()
    total_page_ = data_page['data']['stockHolder']['total']
    return total_page_, data_page
@retry(tries=5, delay=3)
def post_page(url, s, headers, payload):
    """POST a JSON payload to a holder-list endpoint and return its total and body.

    Args:
        url: endpoint URL.
        s: session-like object exposing ``post(url=..., headers=..., data=..., timeout=...)``.
        headers: request headers to send.
        payload: JSON-serialisable request body; sent as ``json.dumps(payload)``.

    Returns:
        tuple: ``(total, json_info)`` where ``total`` is ``json_info['data']['total']``
        and ``json_info`` is the decoded JSON response.

    Raises:
        RuntimeError: the response status is not 200.
        Exception: request errors, JSON decoding errors, or a response that
            lacks ``data.total`` propagate unchanged. Any raised exception makes
            the ``@retry`` decorator call the function again.
    """
    res = s.post(url=url, headers=headers, data=json.dumps(payload), timeout=(5, 10))
    if res.status_code != 200:
        # An explicit exception instead of a bare `raise` with no active
        # exception, which only failed by accident and carried no context.
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    json_info = res.json()
    total_page_ = json_info['data']['total']
    return total_page_, json_info
# Redis list this spider consumes; companies that fail are pushed back onto it.
_HOLDER_QUEUE = 'shareHolderEnterprise:gnqy_socialCode'

# Holders per page. Page counting and the `sort` rank both assume this value,
# and the URL templates below hard-code the same number.
_PAGE_SIZE = 20

# First-page probe of the "latest announcement" holder list (POST).
_ANNOUNCEMENT_FIRST_URL = 'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holder/latest/announcement'
# Follow-up pages of the same list.
# NOTE(review): the probe posts to `holder`, follow-up pages to `holderV2` —
# kept exactly as found; confirm the difference is intentional.
_ANNOUNCEMENT_PAGE_URL = 'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holderV2/latest/announcement?'
# GET templates, formatted with (tycid, page number).
_TOP_TEN_URL = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1'
_HK_URL = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/hk?date=&gid={}&sortField=&sortType=-100&pageSize=20&pageNum={}&percentLevel=-100&keyword='


def _announcement_payload(tycid, page_num):
    """Build the JSON body for the "latest announcement" POST endpoint."""
    return {
        "gid": f"{tycid}",
        "pageSize": _PAGE_SIZE,
        "pageNum": page_num,
        "sortField": "",
        "sortType": "-100",
        "historyType": 1,
    }


def _select_endpoint(tycid, charge, s, headers):
    """Probe the three holder endpoints and choose the one to paginate.

    The announcement endpoint is chosen when its total equals ``charge`` (the
    count read from the web page); otherwise the top-ten endpoint when its total
    matches; otherwise the HK endpoint unconditionally. A probe that fails after
    its retries counts as total 0 with an empty body.

    Returns:
        tuple: ``(url, total, first_page_json, flag)`` with flag 1 for the
        announcement endpoint, 3 for top-ten, 0 for HK.
    """
    try:
        total_page2, data_page2 = post_page(
            _ANNOUNCEMENT_FIRST_URL, s, headers, _announcement_payload(tycid, 1))
    except Exception:
        total_page2, data_page2 = 0, {}
    time.sleep(1)
    # Both GET probes use the same page size as later pages, so page 1 really
    # holds ranks 1-20 and page 2 continues at 21 without a gap.
    try:
        total_page3, data_page3 = get_page(_TOP_TEN_URL.format(tycid, 1), s, headers)
    except Exception:
        total_page3, data_page3 = 0, {}
    try:
        total_page1, data_page1 = get_page1(_HK_URL.format(tycid, 1), s, headers)
    except Exception:
        total_page1, data_page1 = 0, {}

    if total_page2 == charge:
        return _ANNOUNCEMENT_PAGE_URL, total_page2, data_page2, 1
    if total_page3 == charge:
        return _TOP_TEN_URL, total_page3, data_page3, 3
    return _HK_URL, total_page1, data_page1, 0


def _fetch_holder_page(s, headers, url, flag, tycid, page):
    """Fetch one follow-up page, trying up to three times.

    Returns the decoded JSON of the last response received (its ``errorCode``
    may still be non-zero), or None when every attempt failed at the request
    level. JSON decoding errors propagate to the caller.
    """
    data_page = None
    for _ in range(3):
        try:
            if flag == 1:
                payload = _announcement_payload(tycid, f"{page}")
                res = s.post(url=url, headers=headers, data=json.dumps(payload), timeout=(5, 10))
            else:
                res = s.get(url.format(tycid, page), headers=headers, timeout=(5, 10))
        except requests.exceptions.RequestException as e:
            log.info(e)
            time.sleep(1)
            continue
        try:
            data_page = res.json()
        finally:
            res.close()
        if data_page['errorCode'] == 0:
            break
    return data_page


def _extract_holder_list(data_page):
    """Return the holder entries of a page, wherever the endpoint nests them."""
    data = data_page['data']
    for key in ('holderList', 'result'):
        try:
            return data[key]
        except (KeyError, TypeError):
            continue
    return data['stockHolder']['result']


def _build_holder_record(holder_info, flag, socialCreditCode, sort):
    """Map one raw holder entry to the record sent downstream.

    Source field names differ per endpoint (``flag``); HK entries (flag 0)
    carry no year.
    """
    if flag == 1:
        return {
            'socialCreditCode': socialCreditCode,
            'name': holder_info['shareHolderName'],            # holder name
            'shareHoldRation': holder_info['percent'],         # holding ratio
            'shareHoldNum': holder_info['shareholdingNum'],    # number of shares
            'shareHoldUnit': holder_info['shareholdingNumUnit'],  # share unit
            'shareType': holder_info['shareType'],             # share type
            'year': holder_info['yearReport'],                 # publication year
            'sort': sort,
        }
    if flag == 3:
        return {
            'socialCreditCode': socialCreditCode,
            'name': holder_info['name'],                       # holder name
            'shareHoldRation': holder_info['proportion'],      # holding ratio
            'shareHoldNum': holder_info['holdingNum'],         # number of shares
            'shareHoldUnit': holder_info['shareUnit'],         # share unit
            'shareType': holder_info['shareType'],             # share type
            'year': holder_info['publishDate'],                # publication date
            'sort': sort,
        }
    return {
        'socialCreditCode': socialCreditCode,
        'name': holder_info['holder_name'],                            # holder name
        'shareHoldRation': holder_info['longHeldRatioWithUnit'],       # holding ratio
        'shareHoldNum': holder_info['held_total_num_long_position'],   # number of shares
        'shareHoldUnit': holder_info['shareUnit'],                     # share unit
        'shareType': holder_info['shareTypeName'],                     # share type
        'sort': sort,
    }


def doJob():
    """Run the shareholder spider forever.

    Each iteration logs in, pops one ``socialCreditCode|...`` entry from the
    Redis queue, resolves its Tianyancha id, reads the shareholder count from
    the company page, picks the API endpoint whose total matches that count,
    pages through it and posts the collected records to the sync service.

    A company is pushed back onto the queue when the detail page or the API
    cannot be read, or when any unexpected exception occurs; a company that
    fails part-way through pagination is re-queued once and nothing is sent
    for it, so the retry does not produce duplicate partial data.
    """
    driver = edge.create_driver()
    driver.get('https://www.tianyancha.com/')
    driver.maximize_window()

    while True:
        # TODO: cookie handling
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        driver, id_cookie, s, update_headers = login.login(driver)
        if not id_cookie:
            continue
        headers.update(update_headers)

        info = baseCore.r.blpop([_HOLDER_QUEUE], 2)
        if not info:
            log.info('数据已全部采集完')
            time.sleep(60 * 60)
            continue
        info = info[1].decode()
        socialCreditCode = info.split('|')[0]
        start = time.time()

        try:
            # Resolve the Tianyancha id for the credit code taken from Redis.
            tycid = getTycIdByDB(socialCreditCode, cursor, start, info, s)
            if not tycid:
                continue

            try:
                charge = get_html(tycid, driver)
            except Exception:
                charge = -1
            if charge == -1:
                token.updateTokeen(id_cookie, 2)
                time.sleep(3)
                log.info(f'{socialCreditCode}==={tycid}===详情页获取失败')
                baseCore.r.rpush(_HOLDER_QUEUE, info)
                continue
            if charge == -2:
                # The company page has no shareholder section: nothing to collect.
                token.updateTokeen(id_cookie, 2)
                log.info(f'{socialCreditCode}==={tycid}===没有股东信息')
                continue

            url, total_page, data_page_one, flag = _select_endpoint(tycid, charge, s, headers)
            if total_page == 0:
                token.updateTokeen(id_cookie, 3)
                log.info(f'{socialCreditCode}==={tycid}===接口数据获取失败')
                baseCore.r.rpush(_HOLDER_QUEUE, info)
                continue

            log.info(f'总数为{total_page}')
            # Ceiling division: number of pages needed for `total_page` holders.
            page_count = int((total_page + _PAGE_SIZE - 1) // _PAGE_SIZE)

            dics = []
            failed = False
            for page in range(1, page_count + 1):
                if page == 1:
                    # The probe response already is page 1.
                    data_page = data_page_one
                else:
                    data_page = _fetch_holder_page(s, headers, url, flag, tycid, page)
                if data_page is None or data_page['errorCode'] != 0:
                    failed = True
                    break

                list_all = _extract_holder_list(data_page)
                log.info(f'----flag:{flag}----')
                log.info(f'-----list_all:{len(list_all)}----')
                for idx, holder_info in enumerate(list_all):
                    # 1-based rank across all pages.
                    sort = idx + 1 + _PAGE_SIZE * (page - 1)
                    dics.append(_build_holder_record(holder_info, flag, socialCreditCode, sort))

            if failed:
                token.updateTokeen(id_cookie, 3)
                baseCore.r.rpush(_HOLDER_QUEUE, info)
                log.info(f'{socialCreditCode}==={tycid}===接口数据获取失败')
                continue

            token.updateTokeen(id_cookie, 3)
            time.sleep(5)
            try:
                req = sendData('http://114.115.236.206:8088/sync/shareHolder', dics)
                log.info('数据发送成功')
                takeTime = baseCore.getTimeCost(start, time.time())
                log.info(f'{socialCreditCode}==={req.text}===耗时{takeTime}')
            except Exception as e:
                log.error(f'数据发送结果口失败==={e}')
        except Exception as e:
            token.updateTokeen(id_cookie, 3)
            log.info(f'==={socialCreditCode}=====企业股东采集失败===重新放入redis====')
            log.info(e)
            # Push back onto the queue this spider reads from, so the company
            # is actually retried by this job.
            baseCore.rePutIntoR(_HOLDER_QUEUE, info)
            time.sleep(5)
# Script entry point: start the endless collection loop when run directly.
if __name__ == "__main__":
    doJob()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论