Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
77b8e47a
提交
77b8e47a
authored
9月 12, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
新三板企业企查查id获取
上级
1a93992c
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
411 行增加
和
2 行删除
+411
-2
NQbase_info.py
comData/dfcfwGpdm/NQenterprise/NQbase_info.py
+406
-0
NQgetid.py
comData/dfcfwGpdm/NQenterprise/NQgetid.py
+5
-2
没有找到文件。
comData/dfcfwGpdm/NQenterprise/NQbase_info.py
0 → 100644
浏览文件 @
77b8e47a
# -*- coding: utf-8 -*-
import
pandas
as
pd
import
time
import
requests
import
json
from
kafka
import
KafkaProducer
from
base.BaseCore
import
BaseCore
from
getQccId
import
find_id_by_name
# Shared project helper object; the rest of this file uses it for redis queue
# access, token retrieval, logging and small text utilities.
baseCore = BaseCore()
# Database connection and cursor taken from the helper (used below for the
# `update gpdm ...` statement and its commit).
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Logger used throughout this file.
log = baseCore.getLogger()
# Fetch a company's basic registration information from Qichacha (QCC) by its QCC id.
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)


def _dig(data, *path, default=''):
    """Follow ``path`` through nested dicts/lists; return ``default`` if any step fails.

    A value that is present but ``None`` is returned as ``None``.
    """
    try:
        for key in path:
            data = data[key]
    except _LOOKUP_ERRORS:
        return default
    return data


def _text(data, *path):
    """Like ``_dig`` but additionally maps a ``None`` value to ``''``."""
    value = _dig(data, *path)
    return '' if value is None else value


def _fetch_qcc(endpoint, com_id):
    """Stamp the shared request headers, call one QCC endpoint and return the decoded JSON."""
    stamp = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = stamp
    url = endpoint.format(token, stamp, com_id)
    # NOTE(review): TLS verification is disabled and no timeout is set, as in the original.
    return requests.get(url=url, headers=headers, verify=False).json()


def info_by_id(com_id, com_name, gpdm):
    """Collect the basic-information record of one company from two QCC endpoints.

    The function reads the module-level names ``headers`` and ``token``; in this
    file both are only assigned inside the ``if __name__ == '__main__'`` block,
    so they must exist before this function is called.

    Args:
        com_id: QCC company id, used as the ``unique`` query parameter.
        com_name: Company name; only used in the failure log message.
        gpdm: Stock code that is pushed back to the redis key
            ``EnterpriseIpo:nq_gpdm`` when the first response has no
            ``result.Company`` entry.

    Returns:
        A list holding one dict of company fields, or an empty list when the
        first response does not contain ``result.Company``.

    Raises:
        KeyError / TypeError: if the company entry lacks ``Name``,
            ``CreditCode`` or ``Address`` (these are read without a fallback).
        Whatever ``requests.get(...).json()`` raises on network/decoding errors.
    """
    aa_dict_list = []

    resp_dict = _fetch_qcc(
        "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}",
        com_id,
    )
    time.sleep(2)
    com_jc_name = ''  # short name is not provided by these endpoints; always empty

    try:
        result_dict = resp_dict['result']['Company']
    except _LOOKUP_ERRORS:
        log.info(com_name + ":获取失败===========重新放入redis")
        baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm', gpdm)
        return aa_dict_list

    # ---- fields taken from the first ("detail") response ----
    company_name = result_dict['Name']
    CreditCode = result_dict['CreditCode']
    if CreditCode is None:
        CreditCode = ''

    OperName = _text(result_dict, 'Oper', 'Name')
    if baseCore.str_have_num(OperName):
        # the value is discarded when the helper reports it contains digits
        OperName = ''

    Status = _text(result_dict, 'ShortStatus')
    StartDate = _text(result_dict, 'StartDate')
    RegistCapi = _text(result_dict, 'RegistCapi')
    EconKind = _text(result_dict, 'EconKind')

    # Area parts keep a None value as-is; a None part makes the concatenation
    # fail, in which case region falls back to ''.
    Province = _dig(result_dict, 'Area', 'Province')
    City = _dig(result_dict, 'Area', 'City')
    County = _dig(result_dict, 'Area', 'County')
    try:
        region = Province + City + County
    except TypeError:
        region = ''

    # Former names joined by single spaces. The original appended "name + ' '"
    # and then called .strip() without keeping the result, which left a
    # trailing space in the output; the stripped value is now actually used.
    try:
        OriginalName = ' '.join(
            former['Name'] for former in result_dict['OriginalName']
        ).strip()
    except _LOOKUP_ERRORS:
        OriginalName = ''

    Address = result_dict['Address']
    if Address is None:
        Address = ''

    PhoneNumber = _text(result_dict, 'companyExtendInfo', 'Tel')

    # Website: prefer companyExtendInfo, fall back to the first ContactInfo entry.
    WebSite = _dig(result_dict, 'companyExtendInfo', 'WebSite', default=None)
    if WebSite is None:
        WebSite = _dig(result_dict, 'ContactInfo', 'WebSite', 0, 'Url')

    Email = _text(result_dict, 'companyExtendInfo', 'Email')
    Desc = _text(result_dict, 'companyExtendInfo', 'Desc')
    Info = _text(result_dict, 'companyExtendInfo', 'Info')

    company_name = baseCore.hant_2_hans(company_name)

    # ---- fields taken from the second ("getEntDetail") response ----
    resp_dict2 = _fetch_qcc(
        "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}",
        com_id,
    )
    time.sleep(1)
    com2 = _dig(resp_dict2, 'result', 'Company')

    Scope = _dig(com2, 'Scope')
    CheckDate = _text(com2, 'CheckDate')
    TaxpayerType = _text(com2, 'TaxpayerType')  # taxpayer qualification
    # The original normalised None to '' here and then re-read the field
    # without the check, undoing the normalisation; read it once.
    No = _text(com2, 'No')
    IxCode = _dig(com2, 'IxCode')
    OrgNo = _dig(com2, 'OrgNo')

    # Insured headcount: value of the CommonList entry whose KeyDesc matches.
    can_bao = ''
    try:
        for entry in com2['CommonList']:
            try:
                if entry['KeyDesc'] == '参保人数':
                    can_bao = entry['Value']
            except _LOOKUP_ERRORS:
                pass
    except _LOOKUP_ERRORS:
        can_bao = ''

    TermStart = _dig(com2, 'TermStart')
    TeamEnd = _dig(com2, 'TeamEnd')
    RecCap = _dig(com2, 'RecCap')
    SubIndustry = _dig(com2, 'IndustryArray', -1)
    BelongOrg = _dig(com2, 'BelongOrg')
    EnglishName = _dig(com2, 'EnglishName')

    aa_dict = {
        'qccId': com_id,  # QCC company id
        'name': company_name,  # company name
        'shortName': com_jc_name,  # company short name
        'socialCreditCode': CreditCode,  # unified social credit code
        'legalPerson': OperName,  # legal representative
        'officialPhone': PhoneNumber,  # phone
        'officialUrl': WebSite,  # official website
        'officialEmail': Email,  # e-mail
        'briefInfo': Desc,  # brief introduction
        'registerStatus': Status,  # registration status
        'incorporationDate': StartDate,  # date of establishment
        'capital': RegistCapi,  # registered capital
        'paidCapital': RecCap,  # paid-in capital
        'approvalDate': CheckDate,  # approval date
        'organizationCode': OrgNo,  # organization code
        'registerNo': No,  # business registration number
        'taxpayerNo': CreditCode,  # taxpayer id (filled with the credit code, as before)
        'type': EconKind,  # company type
        'businessStartDate': TermStart,  # business term start
        'businessEndDate': TeamEnd,  # business term end
        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
        'industry': SubIndustry,  # industry
        'region': region,
        'province': Province,  # province
        'city': City,  # city
        'county': County,  # county
        'registerDepartment': BelongOrg,  # registration authority
        'scale': Info,  # staff size
        'insured': can_bao,  # insured headcount
        'beforeName': OriginalName,  # former names
        'englishName': EnglishName,  # English name
        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
        'address': Address,  # address
        'businessRange': Scope,  # business scope
        'status': 0,  # status
    }
    aa_dict_list.append(aa_dict)
    print(company_name + ":爬取完成")
    return aa_dict_list
if __name__ == '__main__':
    # Script entry point: endlessly pull stock codes from redis, resolve each to
    # a QCC company id, store that id in the `gpdm` table, fetch the company's
    # basic information and publish every record to Kafka.
    taskType = '基本信息/企查查'
    # Request headers shared with info_by_id(), which overwrites
    # 'Qcc-Timestamp' before every request.
    headers = {
        'Host': 'xcx.qcc.com',
        'Connection': 'keep-alive',
        'Qcc-Platform': 'mp-weixin',
        'Qcc-Timestamp': '',
        'Qcc-Version': '1.0.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
        'content-type': 'application/json',
        'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
        'Accept-Encoding': 'gzip, deflate, br,'
    }
    # Take work items from redis
    while True:
        # TODO: the token has to be re-captured roughly every two hours; it is read from the database
        token = baseCore.GetToken()
        list_weicha = []
        list_all_info = []
        name_list = []
        start_time = time.time()
        # Get the next company code
        # NOTE(review): if redicPullData can return None for an empty queue, the
        # membership test below raises TypeError and ends the script - confirm.
        com_code = baseCore.redicPullData('EnterpriseIpoqccid:nq_gpdm')
        com_code1 = com_code if '.NQ' in com_code else com_code + '.NQ'

        company_id = find_id_by_name(start_time, token, com_code)
        if not company_id:
            log.info(com_code + ":企业ID获取失败===重新放入redis")
            list_weicha.append(com_code + ":企业ID获取失败")
            baseCore.rePutIntoR('EnterpriseIpoqccid:nq_gpdm', com_code)
            log.info('-----已重新放入redis-----')
            time.sleep(20)
            continue

        log.info(f'====={com_code}===={company_id}=====获取企业id成功=====')
        # todo: write the QCC id into the gpdm table
        # NOTE(review): the statement is assembled with an f-string; if the DB
        # driver supports placeholders, pass company_id / com_code as parameters.
        updateSql = f"update gpdm set QCCID = '{company_id}' where gpdm = '{com_code}'"
        cursor_.execute(updateSql)
        cnx_.commit()

        try:
            post_data_list = info_by_id(company_id, '', com_code1)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C can still stop the loop.
            log.info(f'====={com_code}=====获取基本信息失败,重新放入redis=====')
            # NOTE(review): this key differs from the one the code was pulled
            # from ('EnterpriseIpoqccid:nq_gpdm') - confirm it is intended.
            baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', com_code)
            continue

        if not post_data_list:
            log.info(f'======{com_code}====企查查token失效====')
            time.sleep(20)
            continue

        for post_data in post_data_list:
            list_all_info.append(post_data)
            if post_data is None:
                print(com_code + ":企业信息获取失败")
                list_weicha.append(com_code + ":企业信息获取失败")
                continue
            get_name = post_data['name']
            get_socialcode = post_data['socialCreditCode']
            name_compile = {
                'yuan_name': com_code,
                'get_name': get_name
            }
            name_list.append(name_compile)
            log.info(f'采集{com_code}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')

            producer = None
            try:
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
                kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
            except Exception:
                exception = 'kafka传输失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
                log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
            finally:
                # A producer is created per record; close it so its connections
                # and background thread are released instead of piling up.
                if producer is not None:
                    producer.close()

            # After collection finishes, update this company's collection count
            # (no such update follows in this file).

    # NOTE(review): the loop above has no `break`, so as laid out here the
    # exports below never run. Indentation was reconstructed - confirm whether
    # they were meant to run inside the loop.
    nowtime = baseCore.getNowTime(1).replace('-', '_')[:10]
    companyName = pd.DataFrame(name_list)
    companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx', index=False)
    false_com = pd.DataFrame(list_weicha)
    false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx', index=False)
comData/dfcfwGpdm/NQenterprise/NQgetid.py
浏览文件 @
77b8e47a
...
@@ -334,9 +334,12 @@ if __name__ == '__main__':
...
@@ -334,9 +334,12 @@ if __name__ == '__main__':
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
com_code
=
baseCore
.
redicPullData
(
'EnterpriseIpoqccid:nq_gpdm'
)
com_code
=
baseCore
.
redicPullData
(
'EnterpriseIpoqccid:nq_gpdm'
)
com_code
=
com_code
+
'.NQ'
if
'.NQ'
in
com_code
:
com_code1
=
com_code
else
:
com_code1
=
com_code
+
'.NQ'
company_id
=
find_id_by_name
(
start_time
,
token
,
com_code
)
company_id
=
find_id_by_name
(
start_time
,
token
,
com_code
1
)
if
not
company_id
:
if
not
company_id
:
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论