丁双波 / zzsn_spider / Commits / 862e97ab

Commit 862e97ab, authored 2024-01-31 by 薛凌堃
Commit message: 1/31
Parent commit: 1d1053c8
Showing 8 changed files with 878 additions and 259 deletions (+878 -259)
comData/Tyc/CorePerson.py        +205 -77
comData/Tyc/CorePerson2.py       +220 -122
comData/YanBao/resentYanbao.py   +37  -35
comData/dingzhi/dfsm_sasac.py    +145 -0
comData/dingzhi/gzyw_sasac.py    +157 -0
comData/dingzhi/zzcx.py          +52  -0
comData/policylaw/ClassTool.py   +2   -1
test.py                          +60  -24
comData/Tyc/CorePerson.py @ 862e97ab

@@ -2,32 +2,99 @@
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import os
import subprocess
import sys
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()

headers = {
    'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
    # 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}

cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0


def get_proxy(ip_num):
    sql = "select proxy from clb_proxy"
    cursor_.execute(sql)
    proxy_lists = cursor_.fetchall()
    cnx_.commit()
    ip_list = []
    for proxy_ in proxy_lists:
        ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
    proxy_list = []
    for str_ip in ip_list:
        str_ip_list = str_ip.split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": str_ip_list[0],
            "port": str_ip_list[1],
        }
        proxy = {
            "http": proxyMeta,
            "https": proxyMeta
        }
        proxy_list.append(proxy)
    return proxy_list[ip_num]


@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
    url = f"https://www.tianyancha.com/company/{tycid}"
    ip = get_proxy(ip_num)
    response = requests.get(url=url, headers=headers, proxies=ip)
    if response.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url, ip_num):
    ip = get_proxy(ip_num)
    res = requests.get(url=url, headers=headers, proxies=ip)
    if res.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        # social_code = '9135020056842712XB'
        social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue
...
@@ -35,15 +102,29 @@ def doJob():
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                #数据重新塞入redis
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)
...
@@ -58,28 +139,111 @@ def doJob():
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                #todo:先确定接口走哪个
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                try:
                    url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                    charge = get_html(tycid, ip_num)
                except Exception as e:
                    charge = -1
                    log.info(e)
                total_page = 0
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    # 获取当前进程pid
                    current_pid = baseCore.getPID()
                    # todo: 重新启动新进程,杀死当前进程
                    subprocess.Popen([sys.executable] + sys.argv)
                    os.kill(current_pid, 9)
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1, ip_num)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2, ip_num)
                    except:
                        total_page2 = 0
                    time.sleep(2)
                    try:
                        total_page3 = get_page(url3, ip_num)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    continue
                #todo:获取页数
                time.sleep(2)
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        ip = baseCore.get_proxy()
                        # res = requests.get(url,headers=headers,proxies=ip) # ,verify=False
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers, proxies=ip)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']
...
@@ -135,15 +299,7 @@ def doJob():
                            num = num + 1
                            list_one_info.append(dic_json)
                            # list_all_2.append(dic_json_img)
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
...
@@ -153,8 +309,13 @@ def doJob():
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
...
@@ -166,53 +327,20 @@ def doJob():
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
...
@@ -229,29 +357,29 @@ def doJob():
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
...
@@ -259,7 +387,7 @@ def doJob():
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    continue
...
@@ -272,7 +400,7 @@ def doJob():
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
...
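A short, self-contained sketch of the retry-plus-proxy-rotation pattern that get_html/get_page rely on above: the retry decorator re-runs the function on any exception, and a non-200 response bumps a counter so the next attempt can pick a different proxy. The proxy addresses, the URL and the shared counter below are illustrative assumptions, not values from this repository (where the proxy list comes from the clb_proxy table):

import requests
from retry import retry

PROXIES = [  # placeholder proxies for the sketch only
    {"http": "http://127.0.0.1:8001", "https": "http://127.0.0.1:8001"},
    {"http": "http://127.0.0.1:8002", "https": "http://127.0.0.1:8002"},
]
attempt = {"n": 0}  # shared counter so each retry moves on to the next proxy

@retry(tries=3, delay=1)
def fetch(url):
    proxy = PROXIES[attempt["n"] % len(PROXIES)]
    resp = requests.get(url, proxies=proxy, timeout=10)
    if resp.status_code != 200:
        attempt["n"] += 1  # rotate before the decorator retries
        raise RuntimeError(f"HTTP {resp.status_code}")
    return resp

# fetch('https://www.example.com')  # gives up after three failed attempts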
comData/Tyc/CorePerson2.py @ 862e97ab

@@ -7,6 +7,8 @@ import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
...
@@ -19,77 +21,207 @@ headers = {
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
requests.adapters.DEFAULT_RETRIES = 5
ip_num = 0


@retry(tries=3, delay=1)
def get_html(tycid):
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        pass
    else:
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url):
    # ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers)
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        # social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue
        if 'ZZSN' in social_code:
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # 数据重新塞入redis
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            # id = data[0]
            # com_name = data[1]
            # xydm = data[2]
            # tycid = data[11]
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # todo:写入数据库
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                # todo:先确定接口走哪个
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                try:
                    url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                    charge = get_html(tycid)
                except Exception as e:
                    charge = -1
                    log.info(e)
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2)
                    except:
                        total_page2 = 0
                    time.sleep(1)
                    try:
                        total_page3 = get_page(url3)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    break
                # todo:获取页数
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        # ip = baseCore.get_proxy()
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']
                            education = one_info['education']
                            position = one_info['position']
                            Salary = one_info['salary']
                            #todo:获取当前年份
                            now = datetime.datetime.now()
                            year = now.year
                            try:
...
@@ -105,47 +237,40 @@ def doJob():
                            except:
                                person_img = '--'
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            dic_json_img = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "头像": person_img,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                            # list_all_2.append(dic_json_img)
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
...
@@ -155,8 +280,13 @@ def doJob():
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
...
@@ -168,59 +298,24 @@ def doJob():
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        try:
                            list_all = res.json()['data']['result']
                        except Exception as e:
                            log.info(res.json())
                            continue
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
                                person_res = requests.get(person_url, headers=headers)
                                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                                try:
                                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
...
@@ -233,29 +328,29 @@ def doJob():
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
...
@@ -263,25 +358,28 @@ def doJob():
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    log.indo(f'---{social_code}---无高管信息---')
                    continue
                else:
                    pass
                response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300, verify=False)
                print(response.text)
                log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # df_img = pd.DataFrame(list_all_2)
    # df_img.to_excel('企业主要人员-头像.xlsx',index=False)


if __name__ == "__main__":
...
comData/YanBao/resentYanbao.py
浏览文件 @
862e97ab
...
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
...
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
break
break
except
Exception
as
e
:
except
Exception
as
e
:
time
.
sleep
(
3
)
time
.
sleep
(
3
)
log
.
info
(
e
)
continue
continue
if
page_size
<
1
:
if
page_size
<
1
:
...
@@ -206,7 +207,8 @@ def download(data, order_by,header):
...
@@ -206,7 +207,8 @@ def download(data, order_by,header):
come
=
data
[
'come'
]
come
=
data
[
'come'
]
except
:
except
:
come
=
''
come
=
''
if
publishDate
<
'2024-01-29'
:
return
tf_url
=
add_check_url
(
sourceAddress
)
tf_url
=
add_check_url
(
sourceAddress
)
if
tf_url
:
if
tf_url
:
dic_result
=
{
dic_result
=
{
...
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
...
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
# qianyanzhishiku()
# qianyanzhishiku()
# except Exception as e:
# except Exception as e:
# pass
# pass
try
:
#
try:
log
.
info
(
'shijiejingjiluntan'
)
#
log.info('shijiejingjiluntan')
shijiejingjiluntan
()
#
shijiejingjiluntan()
except
Exception
as
e
:
#
except Exception as e:
log
.
info
(
e
)
#
log.info(e)
pass
#
pass
# try:
# try:
# log.info('dongfangcaifu')
# log.info('dongfangcaifu')
# dongfangcaifu()
# dongfangcaifu()
...
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
...
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
# except Exception as e:
# except Exception as e:
# log.info(e)
# log.info(e)
# pass
# pass
#
#
try:
try
:
#
log.info('dongfangcaifu4')
log
.
info
(
'dongfangcaifu4'
)
#
dongfangcaifu4()
dongfangcaifu4
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu5')
log
.
info
(
'dongfangcaifu5'
)
#
dongfangcaifu5()
dongfangcaifu5
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu6')
log
.
info
(
'dongfangcaifu6'
)
#
dongfangcaifu6()
dongfangcaifu6
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu7')
log
.
info
(
'dongfangcaifu7'
)
#
dongfangcaifu7()
dongfangcaifu7
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
comData/dingzhi/dfsm_sasac.py (new file, 0 → 100644) @ 862e97ab

import requests
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


def two_dfsm_mtgc():
    info_list = []
    """
    地方扫描
    """
    url_list = [
        'http://www.sasac.gov.cn/n2588025/n2588129/index.html',
        # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
    ]
    for url in url_list:
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378,int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' in href:
                        href = href
                    else:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source,pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
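The new sasac.gov.cn spiders above walk a section's archive backwards: the landing page exposes maxPageNum for the list component in an inline script, page 1 stays index.html, and page N maps to index_<pages_tag>_<maxPageNum+1-N>.html. A small sketch of that URL arithmetic; the tag and page count below are made-up example values rather than scraped ones:

def archive_urls(base, pages_tag, max_page_num, first_n=3):
    # page 1 is the landing page; older pages count down from max_page_num
    urls = []
    for page in range(1, first_n + 1):
        if page == 1:
            urls.append(f'{base}/index.html')
        else:
            urls.append(f'{base}/index_{pages_tag}_{max_page_num + 1 - page}.html')
    return urls

print(archive_urls('http://www.sasac.gov.cn/n2588025/n2588129', '4278129', 377))
# ['.../index.html', '.../index_4278129_376.html', '.../index_4278129_375.html']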
comData/dingzhi/gzyw_sasac.py (new file, 0 → 100644) @ 862e97ab

import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


#国资要闻
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td',id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages) + 1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            #http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' in href:
                    href = href
                else:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source,pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content,6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
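Both sasac spiders deduplicate on the article URL: a Redis set named after the info_code remembers what has already been pushed to Kafka, and the URL is added to the set only after the producer.send() call goes out. A minimal sketch of that dedup-then-publish step, using placeholder Redis and Kafka endpoints rather than the production addresses used above:

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='127.0.0.1', port=6379, db=5)               # placeholder Redis
producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])   # placeholder broker

def publish_once(info_code, href, record):
    if r.sismember(info_code + '-test', href):
        return False                                  # already collected, skip
    producer.send('crawlerInfo', json.dumps(record, ensure_ascii=False).encode('utf8'))
    r.sadd(info_code + '-test', href)                 # remember it only after sending
    return True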
comData/dingzhi/zzcx.py (new file, 0 → 100644) @ 862e97ab

"""
中证智能财讯
"""
import json

import requests
from bs4 import BeautifulSoup


def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
comData/policylaw/ClassTool.py @ 862e97ab

@@ -85,7 +85,8 @@ class ClassTool():
            '来源': dic_news['labels'][0]['relationName'],
            '创建时间': dic_news['createDate'],
            '带标签内容': dic_news['contentWithTag'][:100],
            '发布时间': dic_news['publishDate'],
            '标题': dic_news['title']
        }
        self.db_storage.insert_one(aaa_dic)
...
test.py @ 862e97ab

@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
# import requests
# headers = {
#     # 'Accept': '*/*',
#     # 'Accept-Encoding': 'gzip, deflate, br',
#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
#     # 'Cache-Control': 'no-cache',
#     # 'Connection': 'keep-alive',
#     # 'Host': 'search-api-web.eastmoney.com',
#     # 'Pragma': 'no-cache',
#     # 'Sec-Fetch-Dest': 'script',
#     # 'Sec-Fetch-Mode': 'no-cors',
#     # 'Sec-Fetch-Site': 'same-site',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
#     # 'sec-ch-ua-mobile': '?0',
#     # 'sec-ch-ua-platform': '"Windows"'
# }
# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
#
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
# with open('./a.pdf','wb') as f:
#     f.write(res.content)
import datetime
import json
import requests
import pymongo
from base import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软['数据源_0504']

datas = db_storage.find({'postCode': '2'}).limit(5)
for data in datas:
    title = data['titleForeign']
    contentWithTag = data['richTextForeign']
    summary = data['contentForeign']
    dic_info = {
        'title': title,
        'summary': summary,
        'contentWithTag': contentWithTag
    }
    headers = {
        'Content-Type': 'application/json',
    }
    dic_info_ = json.dumps(dic_info)
    # print(dic_info_)
    # with open('./data.json','w') as f:
    #     f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)
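The reworked test.py reads a handful of documents from MongoDB and posts their title/summary/contentWithTag as JSON to the translate service, then logs the raw response. A small sketch of that call with a timeout and basic error handling; the endpoint is the one used above, while the wrapper function and its defaults are illustrative assumptions:

import json
import requests

def translate(doc, endpoint='http://117.78.23.14:5000/translate'):
    # serialize the three fields the service expects and post them as JSON
    payload = json.dumps({
        'title': doc.get('title', ''),
        'summary': doc.get('summary', ''),
        'contentWithTag': doc.get('contentWithTag', ''),
    })
    try:
        resp = requests.post(endpoint, data=payload,
                             headers={'Content-Type': 'application/json'}, timeout=60)
        return resp.text
    except requests.RequestException as exc:
        return f'translate failed: {exc}'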