Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
52e227da
提交
52e227da
authored
2月 04, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查基本信息维护
上级
a86fe277
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
151 行增加
和
79 行删除
+151
-79
classtool.py
comData/BaseInfo_qcc/classtool.py
+2
-2
baseinfo0130_tyc.py
comData/Tyc/baseinfo0130_tyc.py
+149
-77
没有找到文件。
comData/BaseInfo_qcc/classtool.py
浏览文件 @
52e227da
...
...
@@ -49,8 +49,8 @@ class File():
class
Token
():
# 获取token
def
getToken
(
self
):
cursor
.
execute
(
f
"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1"
)
# cursor.execute(f" select id, cookies from QCC_token
")
#
cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
cursor
.
execute
(
f
" select id, cookies from QCC_token where id = 63
"
)
# rows = cursor.fetchall()
# cnx.commit()
# if rows:
...
...
comData/Tyc/baseinfo0130_tyc.py
浏览文件 @
52e227da
...
...
@@ -81,15 +81,30 @@ def baseinfo(com_soup):
# print(info)
value
=
cominfo
.
text
.
replace
(
''
,
''
)
.
replace
(
'
\ue657
'
,
''
)
.
replace
(
'
\ue655
'
,
''
)
if
name
==
'法定代表人'
:
value
=
cominfo
.
find
(
'a'
)
.
text
try
:
value
=
cominfo
.
find
(
'a'
)
.
text
except
:
value
=
None
if
name
==
'电话'
:
value
=
cominfo
.
find
(
'span'
)
.
text
try
:
value
=
cominfo
.
find
(
'span'
)
.
text
except
:
value
=
None
if
name
==
'邮箱'
:
value
=
cominfo
.
find
(
'a'
)
.
text
try
:
value
=
cominfo
.
find
(
'a'
)
.
text
except
:
value
=
None
if
name
==
'网址'
:
value
=
cominfo
.
find
(
'a'
)
.
text
try
:
value
=
cominfo
.
find
(
'a'
)
.
text
except
:
value
=
None
if
name
==
'地址'
:
value
=
cominfo
.
find
(
'span'
)
.
text
try
:
value
=
cominfo
.
find
(
'span'
)
.
text
except
:
value
=
None
data
[
name
]
=
value
# print("==================")
...
...
@@ -141,7 +156,10 @@ def dic_handle(result_dic):
try
:
Status
=
result_dic
[
'经营状态'
]
except
:
Status
=
None
try
:
Status
=
result_dic
[
'公司现状'
]
except
:
Status
=
None
try
:
StartDate
=
result_dic
[
'成立日期'
]
...
...
@@ -198,31 +216,31 @@ def dic_handle(result_dic):
except
:
TaxpayerType
=
None
# try:
# SubIndustry = result_dic['国标行业']
# except:
# SubIndustry = ''
try
:
region
=
result_dic
[
'所属地区
'
]
SubIndustry
=
result_dic
[
'国标行业
'
]
except
:
region
=
None
try
:
pattern
=
r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
matches
=
re
.
match
(
pattern
,
region
)
Province
=
matches
.
group
(
1
)
City
=
matches
.
group
(
2
)
County
=
matches
.
group
(
3
)
if
Province
is
None
:
for
zxs
in
zxss
:
if
zxs
in
region
:
Province
=
zxs
break
SubIndustry
=
None
except
:
Province
=
None
City
=
None
County
=
None
# try:
# region = result_dic['所属地区']
# except:
# region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
try
:
BelongOrg
=
result_dic
[
'登记机关'
]
...
...
@@ -285,11 +303,11 @@ def dic_handle(result_dic):
'businessStartDate'
:
TermStart
,
# 营业期限自
'businessEndDate'
:
TeamEnd
,
# 营业期限至
'taxpayerQualification'
:
TaxpayerType
,
# 纳税人资质
'industry'
:
None
,
# 所属行业
'region'
:
region
,
'province'
:
Provinc
e
,
# 所属省
'city'
:
City
,
# 所属市
'county'
:
County
,
# 所属县
'industry'
:
SubIndustry
,
# 所属行业
'region'
:
None
,
'province'
:
Non
e
,
# 所属省
'city'
:
None
,
# 所属市
'county'
:
None
,
# 所属县
'registerDepartment'
:
BelongOrg
,
# 登记机关
'scale'
:
Info
,
# 人员规模
'insured'
:
can_bao
,
# 参保人数
...
...
@@ -326,7 +344,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
log
.
info
(
f
'----当前企业{social_code}-{com_name}--开始处理---'
)
count
=
0
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if
social_code
:
if
social_code
and
'ZZSN'
not
in
social_code
and
'ZD'
not
in
social_code
:
soup
=
checklogin
(
social_code
)
else
:
soup
=
checklogin
(
com_name
)
...
...
@@ -410,7 +428,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
# print(td_count)
td_list
=
tr
.
find_all
(
'td'
)
td_count
=
len
(
td_list
)
name_list
=
[
td_list
[
i
]
.
text
for
i
in
range
(
td_count
)
if
i
%
2
==
0
]
# print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
...
...
@@ -428,7 +445,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
tag
.
deletep
(
value_tag
,
'span'
,
'class'
,
'index_branch-report__Nyf_Y'
)
# for value_tag in value_tag_list:
value_list
.
append
(
value_tag
.
text
.
replace
(
'
\xa0
'
,
''
))
# print(value_list)
if
len
(
name_list
)
==
len
(
value_list
):
for
i
in
range
(
len
(
name_list
)):
...
...
@@ -439,10 +455,30 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
if
name_list
[
i
]
==
'法定代表人'
:
value_list
[
i
]
=
value_list
[
i
]
.
split
(
'任职'
)[
0
]
dic_buseniss
[
name_list
[
i
]]
=
value_list
[
i
]
del
dic_buseniss
[
'天眼评分'
]
try
:
del
dic_buseniss
[
'天眼评分'
]
except
:
pass
# print(dic_buseniss)
result_dict
=
getinfo
(
dic_buseniss
,
data_baseinfo
)
# 主要针对香港台湾企业,社会信用代码传为给定的
try
:
result_dict
[
'统一社会信用代码'
]
except
:
# log.info('未获取到统一社会信用代码')
if
social_code
:
result_dict
[
'统一社会信用代码'
]
=
social_code
else
:
# 如果未给定社会信用代码,则返回
return
False
if
result_dict
[
'企业名称'
]
.
startswith
(
'('
)
and
result_dict
[
'企业名称'
]
.
endswith
(
')'
):
result_dict
[
'企业名称'
]
=
result_dict
[
'企业名称'
][
1
:
-
1
]
if
result_dict
[
'企业名称'
]
==
'-'
and
com_name
:
result_dict
[
'企业名称'
]
=
com_name
elif
not
com_name
:
return
False
else
:
pass
# print(result_dict)
# 采集成功的企业
data
=
[
com_name
,
result_dict
[
'企业名称'
],
social_code
,
result_dict
[
'统一社会信用代码'
]]
...
...
@@ -460,9 +496,28 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingType'
]
=
listType
# print(aa_dic)
sendkafka
(
aa_dic
)
# print(aa_dic)
else
:
data_baseinfo
=
baseinfo
(
com_soup
)
# 主要针对香港台湾企业,社会信用代码传为给定的
try
:
data_baseinfo
[
'统一社会信用代码'
]
except
:
log
.
info
(
'未获取到统一社会信用代码'
)
if
social_code
:
data_baseinfo
[
'统一社会信用代码'
]
=
social_code
else
:
# 如果未给定社会信用代码,则返回
return
False
if
data_baseinfo
[
'企业名称'
]
.
startswith
(
'('
)
and
data_baseinfo
[
'企业名称'
]
.
endswith
(
')'
):
data_baseinfo
[
'企业名称'
]
=
data_baseinfo
[
'企业名称'
][
1
:
-
1
]
if
data_baseinfo
[
'企业名称'
]
==
'-'
and
com_name
:
data_baseinfo
[
'企业名称'
]
=
com_name
elif
not
com_name
:
return
False
else
:
pass
# 采集成功的企业
data
=
[
com_name
,
data_baseinfo
[
'企业名称'
],
social_code
,
data_baseinfo
[
'统一社会信用代码'
]]
file
.
appenddata
(
file_name
,
'获取基本信息成功企业'
,
data
)
...
...
@@ -479,11 +534,18 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingType'
]
=
listType
sendkafka
(
aa_dic
)
def remove_parentheses(text):
    """Strip parentheses and spaces from a name before comparing it.

    Both Chinese (full-width) and English (ASCII) round brackets are
    removed; the text between them is kept. ASCII spaces are removed too.

    Args:
        text: The string to normalize.

    Returns:
        The string with every parenthesis character and space removed.
    """
    # Remove Chinese (full-width) parentheses. They are written as escapes
    # (U+FF08 / U+FF09) so they cannot be mistaken for ASCII brackets: the
    # pattern r'(|)' is an empty regex group that only matches the empty
    # string and therefore removes nothing.
    text = re.sub(r'[\uff08\uff09]', '', text)
    # Remove English (ASCII) parentheses.
    text = re.sub(r'\(|\)', '', text)
    return text.replace(' ', '')
# 判断名称是否统一
def
spiderwork
(
soup
,
receptname
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
company_url
=
''
try
:
company_list
=
soup
.
find
(
'div'
,
class_
=
'index_search-box__7YVh6'
)
company_list
=
soup
.
find
_all
(
'div'
,
class_
=
'index_search-box__7YVh6'
)
except
:
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
...
...
@@ -496,7 +558,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
info_t
=
compamy
.
find
(
'div'
,
class_
=
'index_name__qEdWi'
)
getname
=
info_t
.
find
(
'span'
)
.
text
log
.
info
(
f
'接收到的企业名称--{receptname}---采到的企业名称--{getname}'
)
if
receptname
and
getname
==
receptname
:
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
break
...
...
@@ -504,7 +565,13 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
break
else
:
continue
jian_name
=
remove_parentheses
(
baseCore
.
hant_2_hans
(
getname
))
if
remove_parentheses
(
receptname
)
==
jian_name
:
log
.
info
(
f
'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}'
)
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
break
else
:
continue
if
company_url
:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
...
...
@@ -512,30 +579,33 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
# 判断是否是曾用名
getname
=
''
for
child
in
company_list
[
0
]
.
find_all
():
if
child
.
has_attr
(
'class'
):
print
(
child
[
'class'
])
if
'index_name'
in
child
[
'class'
]:
if
'index_name'
in
child
[
'class'
]
[
0
]
:
getname
=
child
.
text
company_url
=
child
.
find
(
'a'
)[
'href'
]
break
else
:
# 没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
time
.
sleep
(
2
)
return
False
# tr = company_list[:1][0]
# info_t = tr.find('div', class_='index_name__qEdWi')
# getname = info_t.find('span').text
log
.
info
(
f
'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}'
)
beforename
=
ifbeforename
(
company_url
)
if
beforename
==
receptname
:
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
if
getname
:
log
.
info
(
f
'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}'
)
beforename
=
ifbeforename
(
company_url
)
if
beforename
==
receptname
:
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
# 没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
time
.
sleep
(
2
)
return
False
else
:
#没有搜到相同的企业名称
#
没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
time
.
sleep
(
2
)
return
False
return
True
...
...
@@ -546,7 +616,7 @@ if __name__ == '__main__':
# driver, id_cookie = login()
while
True
:
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./国内企业基本信息采集情况.xlsx'
file_name
=
f
'./
data/
国内企业基本信息采集情况.xlsx'
file
.
createFile
(
file_name
)
headers
=
{
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
...
...
@@ -564,8 +634,9 @@ if __name__ == '__main__':
s
.
cookies
.
update
(
cookies
)
start_time
=
time
.
time
()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field
=
'91110000710925016E||'
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
# company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if
company_field
==
'end'
:
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore
.
sendEmail
(
file_name
)
...
...
@@ -592,26 +663,26 @@ if __name__ == '__main__':
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
continue
continue
# company_field_ = f'|{company_field}'
social_code
=
company_field
.
split
(
'|'
)[
0
]
com_name
=
company_field
.
split
(
'|'
)[
1
]
.
replace
(
' '
,
''
)
#
ynDomestic = company_field.split('|')[15]
#
countryName = company_field.split('|')[16]
#
securitiesCode = company_field.split('|')[17]
#
securitiesShortName = company_field.split('|')[18]
#
listingDate = company_field.split('|')[21]
#
category = company_field.split('|')[19]
#
exchange = company_field.split('|')[20]
#
listType = company_field.split('|')[21]
ynDomestic
=
None
countryName
=
None
securitiesCode
=
None
securitiesShortName
=
None
listingDate
=
None
category
=
None
exchange
=
None
listType
=
None
com_name
=
company_field
.
split
(
'|'
)[
2
]
.
replace
(
' '
,
''
)
ynDomestic
=
company_field
.
split
(
'|'
)[
15
]
countryName
=
company_field
.
split
(
'|'
)[
16
]
securitiesCode
=
company_field
.
split
(
'|'
)[
17
]
securitiesShortName
=
company_field
.
split
(
'|'
)[
18
]
listingDate
=
company_field
.
split
(
'|'
)[
21
]
category
=
company_field
.
split
(
'|'
)[
19
]
exchange
=
company_field
.
split
(
'|'
)[
20
]
listType
=
company_field
.
split
(
'|'
)[
21
]
#
ynDomestic = None
#
countryName = None
#
securitiesCode = None
#
securitiesShortName = None
#
listingDate = None
#
category = None
#
exchange = None
#
listType = None
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
...
...
@@ -622,5 +693,5 @@ if __name__ == '__main__':
# 信息采集完成后将该企业的采集次数更新
# runType = 'BaseInfoRunCount'
# baseCore.updateRun(social_code, runType, count)
break
#
break
baseCore
.
close
()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论