Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
cdc4a715
提交
cdc4a715
authored
10月 21, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
独角兽榜单基本信息
上级
41c6aaa2
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
65 行增加
和
46 行删除
+65
-46
dujs_1020_baseinfo.py
comData/BaseInfo_qcc/dujs_1020_baseinfo.py
+65
-46
没有找到文件。
comData/BaseInfo_qcc/
fbs
baseinfo.py
→
comData/BaseInfo_qcc/
dujs_1020_
baseinfo.py
浏览文件 @
cdc4a715
...
@@ -9,19 +9,14 @@ import json
...
@@ -9,19 +9,14 @@ import json
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
from
base.BaseCore
import
BaseCore
from
base.BaseCore
import
BaseCore
from
getQccId
import
find_id_by_name
from
getQccId
import
find_id_by_name
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
baseCore
=
BaseCore
()
cnx_
=
baseCore
.
cnx
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
cursor_
=
baseCore
.
cursor
baseCore
=
BaseCore
()
cnx
=
baseCore
.
cnx
cursor
=
baseCore
.
cursor
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
# 通过企查查id获取企业基本信息
# 通过企查查id获取企业基本信息
def
info_by_id
(
com_id
,
com_name
):
def
info_by_id
(
com_id
,
com_name
,
social_code
):
aa_dict_list
=
[]
aa_dict_list
=
[]
t
=
str
(
int
(
time
.
time
())
*
1000
)
t
=
str
(
int
(
time
.
time
())
*
1000
)
...
@@ -29,14 +24,17 @@ def info_by_id(com_id,com_name):
...
@@ -29,14 +24,17 @@ def info_by_id(com_id,com_name):
url
=
"https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}"
.
format
(
token
,
t
,
com_id
)
url
=
"https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}"
.
format
(
token
,
t
,
com_id
)
resp_dict
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
resp_dict
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
log
.
info
(
resp_dict
)
time
.
sleep
(
2
)
time
.
sleep
(
2
)
com_jc_name
=
''
com_jc_name
=
''
try
:
try
:
result_dict
=
resp_dict
[
'result'
][
'Company'
]
result_dict
=
resp_dict
[
'result'
][
'Company'
]
except
:
except
:
print
(
com_name
+
":获取失败"
)
log
.
info
(
com_name
+
":获取失败===========重新放入redis"
)
#
baseCore
.
rePutIntoR
(
'dujs_1020:baseinfo_socialcode'
,
social_code
)
return
aa_dict_list
company_name
=
result_dict
[
'Name'
]
company_name
=
result_dict
[
'Name'
]
CreditCode
=
result_dict
[
'CreditCode'
]
CreditCode
=
result_dict
[
'CreditCode'
]
if
CreditCode
is
None
:
if
CreditCode
is
None
:
...
@@ -309,11 +307,12 @@ def info_by_id(com_id,com_name):
...
@@ -309,11 +307,12 @@ def info_by_id(com_id,com_name):
}
}
aa_dict_list
.
append
(
aa_dict
)
aa_dict_list
.
append
(
aa_dict
)
print
(
company_name
+
":爬取完成"
)
log
.
info
(
company_name
+
":爬取完成"
)
return
aa_dict_list
return
aa_dict_list
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
taskType
=
'基本信息/企查查/
福布斯
'
taskType
=
'基本信息/企查查/
单项双百企业冠军
'
headers
=
{
headers
=
{
'Host'
:
'xcx.qcc.com'
,
'Host'
:
'xcx.qcc.com'
,
'Connection'
:
'keep-alive'
,
'Connection'
:
'keep-alive'
,
...
@@ -325,54 +324,73 @@ if __name__ == '__main__':
...
@@ -325,54 +324,73 @@ if __name__ == '__main__':
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br,'
'Accept-Encoding'
:
'gzip, deflate, br,'
}
}
list_weicha
=
[]
name_list
=
[]
#从redis里拿数据
#从redis里拿数据
while
True
:
while
True
:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token
=
baseCore
.
GetToken
()
token
=
baseCore
.
GetToken
()
list_weicha
=
[]
dataList
=
[]
list_all_info
=
[]
if
token
:
name_list
=
[]
pass
else
:
log
.
info
(
'==========已无token=========='
)
time
.
sleep
(
30
)
continue
# list_all_info = []
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
social_code
=
baseCore
.
redicPullData
(
'BaseInfoEnterpriseFbs:gnqy_social_
code'
)
# social_code = baseCore.redicPullData('dujs_1020:baseinfo_social
code')
# social_code = '91110000710924945A
'
social_code
=
'91310115067758342E
'
if
social_code
is
None
:
if
social_code
==
''
or
social_code
is
None
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
log
.
info
(
f
'----当前企业{social_code}-----'
)
dic_info
=
baseCore
.
getInfomation
(
social_code
)
dic_info
=
baseCore
.
getInfomation
(
social_code
)
#
log
.
info
(
f
'----当前企业{social_code}--开始处理---'
)
count
=
dic_info
[
13
]
count
=
dic_info
[
14
]
com_name
=
dic_info
[
1
]
com_name
=
dic_info
[
1
]
social_code
=
dic_info
[
2
]
social_code
=
dic_info
[
2
]
# 企查查id
# 企查查id
company_id
=
dic_info
[
12
]
company_id
=
dic_info
[
12
]
#
如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if
company_id
==
None
:
if
company_id
==
None
:
if
social_code
:
if
social_code
:
company_id
=
find_id_by_name
(
start_time
,
token
,
social_code
)
company_id
=
find_id_by_name
(
start_time
,
token
,
social_code
)
else
:
else
:
company_id
=
find_id_by_name
(
start_time
,
token
,
com_name
)
company_id
=
find_id_by_name
(
start_time
,
token
,
com_name
)
# todo:写入数据库
if
company_id
==
'null'
:
updateSql
=
f
"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
log
.
info
(
'=====搜索不到该企业===='
)
cursor_
.
execute
(
updateSql
)
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
cnx_
.
commit
()
baseCore
.
rePutIntoR
(
'dujs_1020:baseinfo_socialcode'
,
social_code
+
':搜索不到'
)
post_data_list
=
info_by_id
(
company_id
,
com_name
)
if
company_id
==
""
:
print
(
com_name
+
":企业ID获取失败"
)
list_weicha
.
append
(
com_name
+
":企业ID获取失败"
)
continue
continue
else
:
if
not
company_id
:
log
.
info
(
f
'====={social_code}===={company_id}=====获取企业id成功====='
)
log
.
info
(
social_code
+
":企业ID获取失败===重新放入redis"
)
try
:
list_weicha
.
append
(
social_code
+
":企业ID获取失败"
)
post_data_list
=
info_by_id
(
company_id
,
com_nam
e
)
baseCore
.
rePutIntoR
(
'dujs_1020:baseinfo_socialcode'
,
social_cod
e
)
except
:
baseCore
.
delete_token
(
token
)
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis
====='
)
log
.
info
(
'=====已重新放入redis,失效token已删除=
====='
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterpriseFbs:gnqy_social_code'
,
social_code
)
time
.
sleep
(
20
)
continue
continue
else
:
log
.
info
(
f
'====={com_name}===={company_id}=====获取企业id成功====='
)
# todo:写入数据库
updateSql
=
f
"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_
.
execute
(
updateSql
)
cnx_
.
commit
()
try
:
post_data_list
=
info_by_id
(
company_id
,
com_name
,
social_code
)
except
:
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
rePutIntoR
(
'dujs_1020:baseinfo_socialcode'
,
social_code
)
continue
if
post_data_list
:
pass
else
:
log
.
info
(
f
'======{social_code}====企查查token失效===='
)
time
.
sleep
(
20
)
continue
for
post_data
in
post_data_list
:
for
post_data
in
post_data_list
:
list_all_info
.
append
(
post_data
)
if
post_data
is
None
:
if
post_data
is
None
:
print
(
com_name
+
":企业信息获取失败"
)
print
(
com_name
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_name
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_name
+
":企业信息获取失败"
)
...
@@ -396,17 +414,18 @@ if __name__ == '__main__':
...
@@ -396,17 +414,18 @@ if __name__ == '__main__':
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
get_socialcode
,
taskType
,
state
,
takeTime
,
''
,
exception
)
baseCore
.
recordLog
(
get_socialcode
,
taskType
,
state
,
takeTime
,
''
,
exception
)
log
.
info
(
f
"{get_name}--{get_socialcode}--kafka传输失败"
)
log
.
info
(
f
"{get_name}--{get_socialcode}--kafka传输失败"
)
# break
# 信息采集完成后将该企业的采集次数更新
# 信息采集完成后将该企业的采集次数更新
runType
=
'BaseInfoRunCount'
runType
=
'BaseInfoRunCount'
count
+=
1
count
+=
1
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
'_'
)[:
10
]
break
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
'_'
)[:
10
]
companyName
=
pd
.
DataFrame
(
name_list
)
companyName
=
pd
.
DataFrame
(
name_list
)
companyName
.
to_excel
(
f
'./data/企业名称对比_{nowtime}.xlsx'
,
index
=
False
)
companyName
.
to_excel
(
f
'./data/企业名称对比_{nowtime}.xlsx'
,
index
=
False
)
false_com
=
pd
.
DataFrame
(
list_weicha
)
false_com
=
pd
.
DataFrame
(
list_weicha
)
false_com
.
to_excel
(
f
'./data/采集失败企业名单_{nowtime}.xlsx'
,
index
=
False
)
false_com
.
to_excel
(
f
'./data/采集失败企业名单_{nowtime}.xlsx'
,
index
=
False
)
baseCore
.
close
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论