Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
1f595f59
提交
1f595f59
authored
12月 13, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查脚本维护
上级
472a45d5
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
25 行增加
和
33 行删除
+25
-33
getTycId.py
comData/Tyc/getTycId.py
+6
-6
newsbucai.py
comData/Tyc/newsbucai.py
+19
-27
没有找到文件。
comData/Tyc/getTycId.py
浏览文件 @
1f595f59
...
@@ -47,11 +47,11 @@ cnx_ = baseCore.cnx
...
@@ -47,11 +47,11 @@ cnx_ = baseCore.cnx
cursor_
=
baseCore
.
cursor
cursor_
=
baseCore
.
cursor
taskType
=
'天眼查企业id/天眼查'
taskType
=
'天眼查企业id/天眼查'
#根据信用代码获取天眼查id 企业名字等信息
#根据信用代码获取天眼查id 企业名字等信息
def
getTycIdByXYDM
(
xydm
):
def
getTycIdByXYDM
(
com_name
):
retData
=
{
'state'
:
False
,
'tycData'
:
None
,
'reput'
:
True
}
retData
=
{
'state'
:
False
,
'tycData'
:
None
,
'reput'
:
True
}
url
=
f
"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
url
=
f
"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip
=
baseCore
.
get_proxy
()
ip
=
baseCore
.
get_proxy
()
paramJsonData
=
{
'keyword'
:
xydm
}
paramJsonData
=
{
'keyword'
:
com_name
}
try
:
try
:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
...
@@ -62,21 +62,21 @@ def getTycIdByXYDM(xydm):
...
@@ -62,21 +62,21 @@ def getTycIdByXYDM(xydm):
if
retJsonData
[
'data'
]
and
retJsonData
[
'state'
]
==
'ok'
:
if
retJsonData
[
'data'
]
and
retJsonData
[
'state'
]
==
'ok'
:
pass
pass
else
:
else
:
log
.
error
(
f
"---{
xydm
}-未查询到该企业---"
)
log
.
error
(
f
"---{
com_name
}-未查询到该企业---"
)
retData
[
'reput'
]
=
False
retData
[
'reput'
]
=
False
return
retData
return
retData
matchType
=
retJsonData
[
'data'
][
0
][
'matchType'
]
matchType
=
retJsonData
[
'data'
][
0
][
'matchType'
]
if
matchType
==
'信用代码
匹配'
:
if
matchType
==
'公司名称
匹配'
:
retData
[
'state'
]
=
True
retData
[
'state'
]
=
True
retData
[
'tycData'
]
=
retJsonData
[
'data'
][
0
]
retData
[
'tycData'
]
=
retJsonData
[
'data'
][
0
]
response
.
close
()
response
.
close
()
return
retData
return
retData
else
:
else
:
log
.
error
(
f
"{
xydm
}------{retJsonData}"
)
log
.
error
(
f
"{
com_name
}------{retJsonData}"
)
response
.
close
()
response
.
close
()
return
retData
return
retData
except
Exception
as
e
:
except
Exception
as
e
:
log
.
error
(
f
"---{
xydm
}--{e}---"
)
log
.
error
(
f
"---{
com_name
}--{e}---"
)
return
retData
return
retData
...
...
comData/Tyc/newsbucai.py
浏览文件 @
1f595f59
...
@@ -52,7 +52,7 @@ headers = {
...
@@ -52,7 +52,7 @@ headers = {
cnx_
=
baseCore
.
cnx
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
cursor_
=
baseCore
.
cursor
taskType
=
'企业动态/天眼查/
补采专精特新
'
taskType
=
'企业动态/天眼查/'
def
reqDetailmsg
(
url
,
headers
):
def
reqDetailmsg
(
url
,
headers
):
...
@@ -81,14 +81,14 @@ def beinWork(tyc_code, social_code,start_time):
...
@@ -81,14 +81,14 @@ def beinWork(tyc_code, social_code,start_time):
for
m
in
range
(
0
,
3
):
for
m
in
range
(
0
,
3
):
ip
=
baseCore
.
get_proxy
()
ip
=
baseCore
.
get_proxy
()
headers
[
'User-Agent'
]
=
baseCore
.
getRandomUserAgent
()
headers
[
'User-Agent'
]
=
baseCore
.
getRandomUserAgent
()
response
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
proxies
=
ip
,
verify
=
False
)
response
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
time
.
sleep
(
random
.
randint
(
3
,
5
))
time
.
sleep
(
random
.
randint
(
3
,
5
))
break
break
if
(
response
.
status_code
==
200
):
if
(
response
.
status_code
==
200
):
pass
pass
except
Exception
as
e
:
except
Exception
as
e
:
#todo:重新放入redis中
#todo:重新放入redis中
baseCore
.
rePutIntoR
(
'N
oticeEnterprise:gnqy_socialCode
'
,
social_code
)
baseCore
.
rePutIntoR
(
'N
ewsResend:newsInfo
'
,
social_code
)
log
.
error
(
f
"{tyc_code}-----获取总数接口失败"
)
log
.
error
(
f
"{tyc_code}-----获取总数接口失败"
)
error
=
'获取总数接口失败'
error
=
'获取总数接口失败'
state
=
0
state
=
0
...
@@ -125,7 +125,7 @@ def beinWork(tyc_code, social_code,start_time):
...
@@ -125,7 +125,7 @@ def beinWork(tyc_code, social_code,start_time):
up_okCount
=
0
up_okCount
=
0
up_errorCount
=
0
up_errorCount
=
0
up_repetCount
=
0
up_repetCount
=
0
for
num
in
range
(
1
,
totalPage
+
1
):
for
num
in
range
(
1
,
10
):
time
.
sleep
(
3
)
time
.
sleep
(
3
)
log
.
info
(
f
"获取分页数据--{tyc_code}----分页{num}----开始"
)
log
.
info
(
f
"获取分页数据--{tyc_code}----分页{num}----开始"
)
start_page
=
time
.
time
()
start_page
=
time
.
time
()
...
@@ -134,7 +134,7 @@ def beinWork(tyc_code, social_code,start_time):
...
@@ -134,7 +134,7 @@ def beinWork(tyc_code, social_code,start_time):
try
:
try
:
ip
=
baseCore
.
get_proxy
()
ip
=
baseCore
.
get_proxy
()
headers
[
'User-Agent'
]
=
baseCore
.
getRandomUserAgent
()
headers
[
'User-Agent'
]
=
baseCore
.
getRandomUserAgent
()
response_page
=
requests
.
get
(
url
=
url_page
,
headers
=
headers
,
proxies
=
ip
,
verify
=
False
)
response_page
=
requests
.
get
(
url
=
url_page
,
headers
=
headers
,
verify
=
False
)
# time.sleep(3)
# time.sleep(3)
break
break
except
:
except
:
...
@@ -168,24 +168,15 @@ def beinWork(tyc_code, social_code,start_time):
...
@@ -168,24 +168,15 @@ def beinWork(tyc_code, social_code,start_time):
source
=
info_page
[
'website'
]
source
=
info_page
[
'website'
]
link
=
info_page
[
'uri'
]
link
=
info_page
[
'uri'
]
try
:
try
:
sel_sql
=
'''select social_credit_code from brpa_source_article_news where source_address =
%
s and social_credit_code=
%
s and type='2' '''
time_struct
=
time
.
localtime
(
int
(
info_page
[
'rtm'
]
/
1000
))
# 首先把时间戳转换为结构化时间
cursor_
.
execute
(
sel_sql
,
(
link
,
social_code
))
time_format
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time_struct
)
# 把结构化时间转换为格式化时间
except
Exception
as
e
:
if
time_format
>
'2023-12-10 00:00'
:
print
(
e
)
pass
selects
=
cursor_
.
fetchone
()
else
:
if
selects
:
log
.
info
(
f
'{tyc_code}-----{social_code}----{link}:已经存在'
)
# todo:如果该条数据存在则说明该条数据之后的都已经采集完成,就可以跳出函数,执行下一个企业
retData
[
'up_okCount'
]
=
up_okCount
retData
[
'up_okCount'
]
=
up_okCount
retData
[
'up_errorCount'
]
=
up_errorCount
retData
[
'up_errorCount'
]
=
up_errorCount
retData
[
'up_repetCount'
]
=
up_repetCount
retData
[
'up_repetCount'
]
=
up_repetCount
# return retData
return
retData
continue
try
:
time_struct
=
time
.
localtime
(
int
(
info_page
[
'rtm'
]
/
1000
))
# 首先把时间戳转换为结构化时间
time_format
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time_struct
)
# 把结构化时间转换为格式化时间
except
:
except
:
time_format
=
baseCore
.
getNowTime
(
1
)
time_format
=
baseCore
.
getNowTime
(
1
)
try
:
try
:
...
@@ -303,8 +294,8 @@ def beinWork(tyc_code, social_code,start_time):
...
@@ -303,8 +294,8 @@ def beinWork(tyc_code, social_code,start_time):
def
doJob
():
def
doJob
():
while
True
:
while
True
:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code
=
baseCore
.
redicPullData
(
'News
Enterprise:gnqybc_socialCode
'
)
social_code
=
baseCore
.
redicPullData
(
'News
Resend:newsInfo
'
)
# social_code = '91
2301001275921118
'
# social_code = '91
320000733334390E
'
# 判断 如果Redis中已经没有数据,则等待
# 判断 如果Redis中已经没有数据,则等待
if
social_code
==
None
:
if
social_code
==
None
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
...
@@ -316,14 +307,15 @@ def doJob():
...
@@ -316,14 +307,15 @@ def doJob():
pass
pass
else
:
else
:
#数据重新塞入redis
#数据重新塞入redis
baseCore
.
rePutIntoR
(
'News
Enterprise:gnqybc_socialCode
'
,
social_code
)
baseCore
.
rePutIntoR
(
'News
Resend:newsInfo
'
,
social_code
)
continue
continue
id
=
data
[
0
]
id
=
data
[
0
]
com_name
=
data
[
1
]
xydm
=
data
[
2
]
xydm
=
data
[
2
]
tycid
=
data
[
11
]
tycid
=
data
[
11
]
if
tycid
==
None
or
tycid
==
''
:
if
tycid
==
None
or
tycid
==
''
:
try
:
try
:
retData
=
getTycIdByXYDM
(
xydm
)
retData
=
getTycIdByXYDM
(
com_name
)
if
retData
[
'tycData'
]
and
retData
[
'reput'
]:
if
retData
[
'tycData'
]
and
retData
[
'reput'
]:
tycid
=
retData
[
'tycData'
][
'id'
]
tycid
=
retData
[
'tycData'
][
'id'
]
# todo:写入数据库
# todo:写入数据库
...
@@ -335,7 +327,7 @@ def doJob():
...
@@ -335,7 +327,7 @@ def doJob():
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取天眼查id失败'
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取天眼查id失败'
)
log
.
info
(
f
'======={social_code}====重新放入redis===='
)
log
.
info
(
f
'======={social_code}====重新放入redis===='
)
baseCore
.
rePutIntoR
(
'News
Enterprise:gnqybc_socialCode
'
,
social_code
)
baseCore
.
rePutIntoR
(
'News
Resend:newsInfo
'
,
social_code
)
continue
continue
elif
not
retData
[
'reput'
]
and
not
retData
[
'tycData'
]:
elif
not
retData
[
'reput'
]
and
not
retData
[
'tycData'
]:
continue
continue
...
@@ -343,7 +335,7 @@ def doJob():
...
@@ -343,7 +335,7 @@ def doJob():
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取天眼查id失败'
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取天眼查id失败'
)
baseCore
.
rePutIntoR
(
'News
Enterprise:gnqybc_socialCode
'
,
social_code
)
baseCore
.
rePutIntoR
(
'News
Resend:newsInfo
'
,
social_code
)
continue
continue
count
=
data
[
17
]
count
=
data
[
17
]
log
.
info
(
f
"{id}---{xydm}----{tycid}----开始处理"
)
log
.
info
(
f
"{id}---{xydm}----{tycid}----开始处理"
)
...
@@ -363,7 +355,7 @@ def doJob():
...
@@ -363,7 +355,7 @@ def doJob():
except
Exception
as
e
:
except
Exception
as
e
:
log
.
info
(
f
'==={social_code}=====获取企业信息失败===='
)
log
.
info
(
f
'==={social_code}=====获取企业信息失败===='
)
#重新塞入redis
#重新塞入redis
baseCore
.
rePutIntoR
(
'News
Enterprise:gnqybc_socialCode
'
,
social_code
)
baseCore
.
rePutIntoR
(
'News
Resend:newsInfo
'
,
social_code
)
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
f
'获取企业信息失败--{e}'
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
f
'获取企业信息失败--{e}'
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论