Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
1a93992c
提交
1a93992c
authored
9月 12, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
新三板企业企查查id获取
上级
ea2b7efb
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
470 行增加
和
1 行删除
+470
-1
NQgetid.py
comData/dfcfwGpdm/NQenterprise/NQgetid.py
+403
-0
getQccId.py
comData/dfcfwGpdm/NQenterprise/getQccId.py
+65
-0
wxList.py
comData/weixin_solo/wxList.py
+2
-1
没有找到文件。
comData/dfcfwGpdm/NQenterprise/NQgetid.py
0 → 100644
浏览文件 @
1a93992c
# -*- coding: utf-8 -*-
import
pandas
as
pd
import
time
import
requests
import
json
from
kafka
import
KafkaProducer
from
base.BaseCore
import
BaseCore
from
getQccId
import
find_id_by_name
# Project helper object; the DB connection, cursor and logger used by this
# script are all taken from it.
baseCore = BaseCore()
cnx_, cursor_ = baseCore.cnx, baseCore.cursor
log = baseCore.getLogger()
def _dig(data, *path, default=''):
    """Follow ``path`` (dict keys / list indexes) through nested JSON data.

    Returns ``default`` as soon as one step cannot be resolved (missing key,
    index out of range, or a non-subscriptable value such as ``None``).
    A value that is present but ``None`` is returned unchanged.
    """
    try:
        for step in path:
            data = data[step]
    except (KeyError, IndexError, TypeError):
        return default
    return data


def _dig_text(data, *path):
    """Same lookup as ``_dig`` but a stored ``None`` is mapped to ``''``."""
    value = _dig(data, *path)
    return '' if value is None else value


# Fetch a company's basic information from QCC by its QCC id.
def info_by_id(com_id, com_name, gpdm):
    """Fetch basic company information from two QCC mini-program endpoints.

    Relies on the module-level names ``headers`` and ``token`` (both are
    assigned in the ``__main__`` section of this file), ``baseCore`` and
    ``log``.

    Args:
        com_id: QCC company id, sent as the ``unique`` query parameter.
        com_name: Company name, only used in the failure log message.
        gpdm: Value pushed back to the ``EnterpriseIpo:nq_gpdm`` redis key
            when the first response carries no company record.

    Returns:
        A list holding one dict of company fields, or an empty list when
        the first response has no usable ``result.Company`` object.

    Raises:
        KeyError: if the company record has no ``Name`` entry.
        Exception: network / JSON decoding errors from ``requests`` are not
            caught here and propagate to the caller.
    """
    aa_dict_list = []

    # ---- first endpoint: core registration data -------------------------
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
    resp_dict = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(2)

    result_dict = _dig(resp_dict, 'result', 'Company', default=None)
    if not isinstance(result_dict, dict):
        # Also covers "Company": null, which previously crashed on the
        # first field access instead of being re-queued.
        log.info(com_name + ":获取失败===========重新放入redis")
        baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm', gpdm)
        return aa_dict_list

    com_jc_name = ''  # short name is never filled in by this function
    company_name = result_dict['Name']
    credit_code = _dig_text(result_dict, 'CreditCode')

    legal_person = _dig_text(result_dict, 'Oper', 'Name')
    if baseCore.str_have_num(legal_person):
        legal_person = ''

    status = _dig_text(result_dict, 'ShortStatus')
    start_date = _dig_text(result_dict, 'StartDate')
    regist_capi = _dig_text(result_dict, 'RegistCapi')
    econ_kind = _dig_text(result_dict, 'EconKind')

    # Area parts keep a stored None; in that case the concatenation fails
    # and region falls back to ''.
    province = _dig(result_dict, 'Area', 'Province')
    city = _dig(result_dict, 'Area', 'City')
    county = _dig(result_dict, 'Area', 'County')
    try:
        region = province + city + county
    except TypeError:
        region = ''

    # Former names, space separated. The stripped value is now actually
    # kept (the original called .strip() and discarded the result).
    try:
        original_name = ' '.join(
            item['Name'] for item in result_dict['OriginalName']
        ).strip()
    except (KeyError, IndexError, TypeError):
        original_name = ''

    address = _dig_text(result_dict, 'Address')
    phone_number = _dig_text(result_dict, 'companyExtendInfo', 'Tel')

    web_site = _dig(result_dict, 'companyExtendInfo', 'WebSite', default=None)
    if web_site is None:
        web_site = _dig(result_dict, 'ContactInfo', 'WebSite', 0, 'Url')

    email = _dig_text(result_dict, 'companyExtendInfo', 'Email')
    desc = _dig_text(result_dict, 'companyExtendInfo', 'Desc')
    info = _dig_text(result_dict, 'companyExtendInfo', 'Info')

    company_name = baseCore.hant_2_hans(company_name)

    # ---- second endpoint: fields the first one no longer returns --------
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t, com_id)
    resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(1)

    # When the record is missing every lookup below yields its default.
    com2 = _dig(resp_dict2, 'result', 'Company')

    scope = _dig(com2, 'Scope')
    check_date = _dig_text(com2, 'CheckDate')
    taxpayer_type = _dig_text(com2, 'TaxpayerType')  # taxpayer qualification
    register_no = _dig(com2, 'No')
    ix_code = _dig(com2, 'IxCode')
    org_no = _dig(com2, 'OrgNo')
    term_start = _dig(com2, 'TermStart')
    team_end = _dig(com2, 'TeamEnd')
    rec_cap = _dig(com2, 'RecCap')
    sub_industry = _dig(com2, 'IndustryArray', -1)
    belong_org = _dig(com2, 'BelongOrg')
    english_name = _dig(com2, 'EnglishName')

    # Insured head-count: the last matching CommonList entry wins;
    # malformed entries are skipped.
    can_bao = ''
    try:
        for common_item in com2['CommonList']:
            try:
                if common_item['KeyDesc'] == '参保人数':
                    can_bao = common_item['Value']
            except (KeyError, IndexError, TypeError):
                pass
    except (KeyError, TypeError):
        can_bao = ''

    aa_dict = {
        'qccId': com_id,  # QCC company id
        'name': company_name,  # company name
        'shortName': com_jc_name,  # company short name
        'socialCreditCode': credit_code,  # unified social credit code
        'legalPerson': legal_person,  # legal representative
        'officialPhone': phone_number,  # phone
        'officialUrl': web_site,  # official website
        'officialEmail': email,  # e-mail
        'briefInfo': desc,  # introduction
        'registerStatus': status,  # registration status
        'incorporationDate': start_date,  # date of establishment
        'capital': regist_capi,  # registered capital
        'paidCapital': rec_cap,  # paid-in capital
        'approvalDate': check_date,  # approval date
        'organizationCode': org_no,  # organization code
        'registerNo': register_no,  # business registration number
        'taxpayerNo': credit_code,  # taxpayer id (the credit code is sent here)
        'type': econ_kind,  # company type
        'businessStartDate': term_start,  # business term from
        'businessEndDate': team_end,  # business term to
        'taxpayerQualification': taxpayer_type,  # taxpayer qualification
        'industry': sub_industry,  # industry
        'region': region,
        'province': province,  # province
        'city': city,  # city
        'county': county,  # county
        'registerDepartment': belong_org,  # registration authority
        'scale': info,  # staff size
        'insured': can_bao,  # number of insured employees
        'beforeName': original_name,  # former names
        'englishName': english_name,  # English name
        'importExportEnterpriseCode': ix_code,  # import/export enterprise code
        'address': address,  # address
        'businessRange': scope,  # business scope
        'status': 0,  # status
    }
    aa_dict_list.append(aa_dict)
    print(company_name + ":爬取完成")
    return aa_dict_list
if __name__ == '__main__':
    taskType = '基本信息/企查查'
    # Kept at module level: info_by_id() reads `headers` (and `token`,
    # assigned in the loop below) as globals.
    headers = {
        'Host': 'xcx.qcc.com',
        'Connection': 'keep-alive',
        'Qcc-Platform': 'mp-weixin',
        'Qcc-Timestamp': '',
        'Qcc-Version': '1.0.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
        'content-type': 'application/json',
        'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
        'Accept-Encoding': 'gzip, deflate, br,'
    }
    # Work through the stock codes queued in redis, one per iteration.
    while True:
        # TODO: the token has to be re-captured roughly every two hours;
        # it is read from the database.
        token = baseCore.GetToken()
        # These three lists are only consumed by the commented-out code
        # that follows this block.
        list_weicha = []
        list_all_info = []
        name_list = []
        start_time = time.time()

        # Next stock code to resolve.
        raw_code = baseCore.redicPullData('EnterpriseIpoqccid:nq_gpdm')
        if not raw_code:
            # NOTE(review): presumably a falsy value means the queue is
            # empty - confirm against BaseCore.redicPullData. Without this
            # guard `None + '.NQ'` raises TypeError.
            log.info('redis已经没有数据了,等待后重试')
            time.sleep(20)
            continue
        # Entries re-queued by the earlier version of this script already
        # carry the suffix; do not append it a second time.
        com_code = raw_code if raw_code.endswith('.NQ') else raw_code + '.NQ'

        company_id = find_id_by_name(start_time, token, com_code)
        if not company_id:
            log.info(com_code + ":企业ID获取失败===重新放入redis")
            list_weicha.append(com_code + ":企业ID获取失败")
            # Re-queue the code exactly as it was pulled. Putting back the
            # suffixed com_code made the next pull produce 'xxx.NQ.NQ'.
            baseCore.rePutIntoR('EnterpriseIpoqccid:nq_gpdm', raw_code)
            log.info('-----已重新放入redis-----')
            time.sleep(20)
            continue

        log.info(f'====={com_code}===={company_id}=====获取企业id成功=====')
        # Store the QCC id in the gpdm table. Bound parameters are used
        # because company_id comes from a remote API response.
        # NOTE(review): assumes a DB-API driver with %s placeholders
        # (e.g. pymysql / mysql-connector) - confirm against BaseCore.
        updateSql = "update gpdm set QCCID = %s where gpdm = %s"
        cursor_.execute(updateSql, (company_id, com_code))
        cnx_.commit()
# try:
# post_data_list = info_by_id(company_id, '',com_code)
# except:
# log.info(f'====={com_code}=====获取基本信息失败,重新放入redis=====')
# baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', com_code)
# continue
# if post_data_list:
# pass
# else:
# log.info(f'======{com_code}====企查查token失效====')
# time.sleep(20)
# continue
# for post_data in post_data_list:
# list_all_info.append(post_data)
# if post_data is None:
# print(com_code + ":企业信息获取失败")
# list_weicha.append(com_code + ":企业信息获取失败")
# continue
# get_name = post_data['name']
# get_socialcode = post_data['socialCreditCode']
# name_compile = {
# 'yuan_name':com_code,
# 'get_name':get_name
# }
# name_list.append(name_compile)
#
# log.info(f'采集{com_code}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
# try:
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
# print(kafka_result.get(timeout=10))
# except:
# exception = 'kafka传输失败'
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
# log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# # 信息采集完成后将该企业的采集次数更新
# nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
# companyName = pd.DataFrame(name_list)
# companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
# false_com = pd.DataFrame(list_weicha)
# false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
comData/dfcfwGpdm/NQenterprise/getQccId.py
0 → 100644
浏览文件 @
1a93992c
# -*- coding: utf-8 -*-
import
time
from
urllib.parse
import
quote
import
requests
import
urllib3
from
base.BaseCore
import
BaseCore
# Project helper object and the logger obtained from it.
baseCore = BaseCore()
log = baseCore.getLogger()

# Request headers sent to xcx.qcc.com. 'Qcc-Timestamp' starts empty and is
# overwritten by find_id_by_name() before each search.
headers = {
    'Host': 'xcx.qcc.com',
    'Connection': 'keep-alive',
    'Qcc-Platform': 'mp-weixin',
    'Qcc-Timestamp': '',
    'Qcc-Version': '1.0.0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
    'content-type': 'application/json',
    'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
    'Accept-Encoding': 'gzip, deflate, br,'
}
# Look up a company's QCC id by company name or credit code.
def find_id_by_name(start, token, name):
    """Search QCC for ``name`` and return the id of the first hit.

    Uses the module-level ``headers`` dict (its 'Qcc-Timestamp' entry is
    overwritten on every call), ``baseCore`` and ``log``.

    Args:
        start: Start timestamp, only passed to ``baseCore.getTimeCost``
            for the log messages.
        token: QCC session token placed in the query string.
        name: Search key; it is URL-quoted before being sent.

    Returns:
        The ``KeyNo`` of the first search hit; ``''`` when there is no
        hit or the hit has an empty name; ``False`` when the request kept
        failing, the API answered with status 40101 / 401, or the response
        did not have the expected structure.
    """
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    # '&registCapiBegin=&registCapiEnd=' restored: the text had been
    # garbled into '®istCapi...' by HTML-entity decoding of '&reg'.
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="

    resp_dict = None
    for _ in range(5):
        try:
            # The timeout turns a stalled connection into a retry instead
            # of blocking forever.
            resp_dict = requests.get(url=url, headers=headers, verify=False, timeout=30).json()
            break
        except Exception:
            print('重试')
            time.sleep(5)
    if resp_dict is None:
        # All attempts failed; previously this fell through to an
        # UnboundLocalError on resp_dict.
        log.info(f'====请求失败,已重试5次====时间{baseCore.getTimeCost(start, time.time())}')
        return False
    time.sleep(2)

    # Known error payloads:
    # {'status': 40101, 'message': '无效的sessionToken!'}
    # {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
    status = resp_dict.get('status') if isinstance(resp_dict, dict) else None
    if status == 40101:
        log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
        return False
    if status == 401:
        log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
        return False

    try:
        hits = resp_dict['result']['Result']
        if hits:
            first_hit = hits[0]
            KeyNo = first_hit['KeyNo']
            # The name comes back with <em> highlight tags around the match.
            Name = first_hit['Name'].replace('<em>', '').replace('</em>', '').strip()
            if Name == '':
                KeyNo = ''
        else:
            KeyNo = ''
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected response structure is reported as an expired token,
        # as before.
        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
        return False

    print("{},企业代码为:{}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
comData/weixin_solo/wxList.py
浏览文件 @
1a93992c
...
...
@@ -262,7 +262,8 @@ if __name__=="__main__":
log
.
info
(
"redis已经没有数据了,重新放置数据"
)
getFromSql
()
time
.
sleep
(
60
)
infoSourceCode
=
baseCore
.
redicPullData
(
'WeiXinGZH:infoSourceCode'
)
continue
# infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
getWxList
(
infoSourceCode
)
# infoSourceCode = 'IN-20220917-0159'
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论