Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
b923d30f
提交
b923d30f
authored
10月 27, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
中国100大企业基本信息
上级
1bb5b282
全部展开
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
117 行增加
和
55 行删除
+117
-55
BaseCore.py
comData/newlist/china100/BaseCore.py
+0
-0
baseinfo_start.bat
comData/newlist/china100/baseinfo_start.bat
+6
-0
china100.py
comData/newlist/china100/china100.py
+71
-42
getQccId.py
comData/newlist/china100/getQccId.py
+40
-13
没有找到文件。
comData/newlist/china100/BaseCore.py
0 → 100644
浏览文件 @
b923d30f
差异被折叠。
点击展开。
comData/newlist/china100/baseinfo_start.bat
0 → 100644
浏览文件 @
b923d30f
title dujiaoshoubaseinfo
call activate
call conda activate zzsn@3.8.0
python baseinfo_dujiaoshou.py
pause
\ No newline at end of file
comData/
dfcfwGpdm/NQenterprise/NQbase_info
.py
→
comData/
newlist/china100/china100
.py
浏览文件 @
b923d30f
...
...
@@ -7,16 +7,17 @@ import requests
import
json
from
kafka
import
KafkaProducer
from
base.
BaseCore
import
BaseCore
from
BaseCore
import
BaseCore
from
getQccId
import
find_id_by_name
baseCore
=
BaseCore
()
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
log
=
baseCore
.
getLogger
()
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
# 通过企查查id获取企业基本信息
def
info_by_id
(
com_id
,
com_name
,
gpdm
):
def
info_by_id
(
com_id
,
com_name
):
aa_dict_list
=
[]
t
=
str
(
int
(
time
.
time
())
*
1000
)
...
...
@@ -31,7 +32,7 @@ def info_by_id(com_id,com_name,gpdm):
result_dict
=
resp_dict
[
'result'
][
'Company'
]
except
:
log
.
info
(
com_name
+
":获取失败===========重新放入redis"
)
baseCore
.
rePutIntoR
(
'
EnterpriseIpo:nq_gpdm'
,
gpdm
)
baseCore
.
rePutIntoR
(
'
china100:baseinfo'
,
com_name
)
return
aa_dict_list
company_name
=
result_dict
[
'Name'
]
...
...
@@ -306,12 +307,12 @@ def info_by_id(com_id,com_name,gpdm):
}
aa_dict_list
.
append
(
aa_dict
)
print
(
company_name
+
":爬取完成"
)
log
.
info
(
company_name
+
":爬取完成"
)
return
aa_dict_list
if
__name__
==
'__main__'
:
taskType
=
'基本信息/企查查'
taskType
=
'基本信息/企查查
/中国100强
'
headers
=
{
'Host'
:
'xcx.qcc.com'
,
'Connection'
:
'keep-alive'
,
...
...
@@ -323,65 +324,97 @@ if __name__ == '__main__':
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br,'
}
list_weicha
=
[]
name_list
=
[]
#从redis里拿数据
while
True
:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token
=
'b4eb43143abdcf395f1335f322ca29e5'
list_weicha
=
[]
list_all_info
=
[]
name_list
=
[]
token
=
baseCore
.
GetToken
()
dataList
=
[]
if
token
:
pass
else
:
log
.
info
(
'==========已无token=========='
)
time
.
sleep
(
30
)
continue
# list_all_info = []
start_time
=
time
.
time
()
# 获取企业信息
# com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
com_code
=
'873349'
if
'.NQ'
in
com_code
:
com_code1
=
com_code
else
:
com_code1
=
com_code
+
'.NQ'
social_code
=
baseCore
.
redicPullData
(
'china100:baseinfo'
)
company_id
=
find_id_by_name
(
start_time
,
token
,
com_code
)
# com_name = '卓新市万达铸业有限公司'
if
social_code
==
''
or
social_code
is
None
:
time
.
sleep
(
20
)
continue
if
'搜索不到'
in
social_code
:
continue
else
:
pass
dic_info
=
baseCore
.
getInfomation
(
social_code
)
log
.
info
(
f
'----当前企业{social_code}--开始处理---'
)
com_name
=
dic_info
[
1
]
#企查查id
company_id
=
dic_info
[
3
]
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if
company_id
==
None
or
company_id
==
False
:
if
social_code
:
company_id
=
find_id_by_name
(
start_time
,
token
,
social_code
)
else
:
company_id
=
find_id_by_name
(
start_time
,
token
,
com_name
)
if
company_id
==
'null'
:
log
.
info
(
'=====搜索不到该企业===='
)
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
social_code
+
':搜索不到'
)
continue
if
not
company_id
:
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
list_weicha
.
append
(
com_code
+
":企业ID获取失败"
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo:nq_gpdm'
,
com_code
)
log
.
info
(
'-----已重新放入redis-----'
)
log
.
info
(
com_name
+
":企业ID获取失败===重新放入redis"
)
list_weicha
.
append
(
com_name
+
":企业ID获取失败"
)
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
com_name
)
baseCore
.
delete_token
(
token
)
log
.
info
(
'=====已重新放入redis,失效token已删除======'
)
time
.
sleep
(
20
)
continue
else
:
log
.
info
(
f
'====={com_cod
e}===={company_id}=====获取企业id成功====='
)
# todo:企查查id写入gpdm表中
updateSql
=
f
"update gpdm set QCCID = '{company_id}' where gpdm = '{com_cod
e}'"
cursor_
.
execute
(
updateSql
)
log
.
info
(
f
'====={com_nam
e}===={company_id}=====获取企业id成功====='
)
# todo:写入数据库
updateqccid
=
f
"update China100 set qccid = '{company_id}' where CompanyName = '{com_nam
e}'"
cursor_
.
execute
(
updateqccid
)
cnx_
.
commit
()
try
:
post_data_list
=
info_by_id
(
company_id
,
''
,
com_code1
)
post_data_list
=
info_by_id
(
company_id
,
com_name
)
except
:
log
.
info
(
f
'====={com_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo:nq_gpdm'
,
com_code
)
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
com_name
)
baseCore
.
delete_token
(
token
)
log
.
info
(
'=====已重新放入redis,失效token已删除======'
)
continue
if
post_data_list
:
pass
else
:
log
.
info
(
f
'======{com
_code}====企查查token失效===='
)
# log.info(f'======{social
_code}====企查查token失效====')
time
.
sleep
(
20
)
continue
for
post_data
in
post_data_list
:
list_all_info
.
append
(
post_data
)
#
list_all_info.append(post_data)
if
post_data
is
None
:
print
(
com_
cod
e
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_
cod
e
+
":企业信息获取失败"
)
print
(
com_
nam
e
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_
nam
e
+
":企业信息获取失败"
)
continue
get_name
=
post_data
[
'name'
]
get_socialcode
=
post_data
[
'socialCreditCode'
]
#todo:将信用代码更新到表中
updatesocialcode
=
f
"update China100 set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_
.
execute
(
updatesocialcode
)
cnx_
.
commit
()
name_compile
=
{
'yuan_name'
:
com_
cod
e
,
'yuan_name'
:
com_
nam
e
,
'get_name'
:
get_name
}
name_list
.
append
(
name_compile
)
log
.
info
(
f
'采集{com_code}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}'
)
# dataList.append(post_data)
baseCore
.
writerToExcel
(
name_list
,
'中国100强企业.xlsx'
)
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}'
)
try
:
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
post_data
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
...
...
@@ -392,13 +425,9 @@ if __name__ == '__main__':
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
get_socialcode
,
taskType
,
state
,
takeTime
,
''
,
exception
)
log
.
info
(
f
"{get_name}--{get_socialcode}--kafka传输失败"
)
# 信息采集完成后将该企业的采集次数更新
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
'_'
)[:
10
]
companyName
=
pd
.
DataFrame
(
name_list
)
companyName
.
to_excel
(
f
'./data/企业名称对比_{nowtime}.xlsx'
,
index
=
False
)
false_com
=
pd
.
DataFrame
(
list_weicha
)
false_com
.
to_excel
(
f
'./data/采集失败企业名单_{nowtime}.xlsx'
,
index
=
False
)
# break
...
...
comData/
dfcfwGpdm/NQenterprise
/getQccId.py
→
comData/
newlist/china100
/getQccId.py
浏览文件 @
b923d30f
...
...
@@ -5,21 +5,43 @@ import time
from
urllib.parse
import
quote
import
requests
import
urllib3
from
base.
BaseCore
import
BaseCore
from
BaseCore
import
BaseCore
baseCore
=
BaseCore
()
log
=
baseCore
.
getLogger
()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers
=
{
'Host'
:
'xcx.qcc.com'
,
'Connection'
:
'keep-alive'
,
'Qcc-Platform'
:
'mp-weixin'
,
'Qcc-Timestamp'
:
''
,
'x-request-device-type'
:
'Android'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391'
,
'Content-Type'
:
'application/json'
,
'Qcc-Version'
:
'1.0.0'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat'
,
'content-type'
:
'application/json'
,
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br,'
}
'authMini'
:
'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5'
,
'xweb_xhr'
:
'1'
,
'xcx-version'
:
'2023.09.27'
,
'Qcc-Platform'
:
'mp-weixin'
,
'Qcc-CurrentPage'
:
'/company-subpackages/business/index'
,
'Qcc-Timestamp'
:
'1696661787803'
,
'Qcc-RefPage'
:
'/company-subpackages/detail/index'
,
'Accept'
:
'*/*'
,
'Sec-Fetch-Site'
:
'cross-site'
,
'Sec-Fetch-Mode'
:
'cors'
,
'Sec-Fetch-Dest'
:
'empty'
,
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Language'
:
'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id
def
find_id_by_name
(
start
,
token
,
name
):
urllib3
.
disable_warnings
()
...
...
@@ -32,8 +54,8 @@ def find_id_by_name(start,token,name):
try
:
resp_dict
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
break
except
:
print
(
'
重试'
)
except
Exception
as
e
:
print
(
f
'{e}-------------
重试'
)
time
.
sleep
(
5
)
continue
time
.
sleep
(
2
)
...
...
@@ -46,19 +68,23 @@ def find_id_by_name(start,token,name):
KeyNo
=
False
log
.
info
(
f
'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}'
)
return
KeyNo
if
resp_dict
[
'status'
]
==
40102
:
KeyNo
=
False
log
.
info
(
f
'=======无效的session=====时间{baseCore.getTimeCost(start, time.time())}'
)
return
KeyNo
try
:
if
resp_dict
[
'result'
][
'Result'
]:
result_dict
=
resp_dict
[
'result'
][
'Result'
][
0
]
KeyNo
=
result_dict
[
'KeyNo'
]
Name
=
result_dict
[
'Name'
]
.
replace
(
'<em>'
,
''
)
.
replace
(
'</em>'
,
''
)
.
strip
()
if
Name
==
''
:
KeyNo
=
''
KeyNo
=
'
null
'
else
:
KeyNo
=
''
KeyNo
=
'
null
'
except
:
KeyNo
=
False
log
.
info
(
f
'====token失效====时间{baseCore.getTimeCost(start,time.time())}'
)
return
KeyNo
print
(
"{},企业代码为:{}"
.
format
(
qcc_key
,
KeyNo
))
log
.
info
(
"{},企业代码为:{}"
.
format
(
qcc_key
,
KeyNo
))
return
KeyNo
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论