Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
f7886002
提交
f7886002
authored
12月 27, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
企查查脚本维护
上级
08e4725c
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
39 行增加
和
31 行删除
+39
-31
baseinfo1113.py
comData/BaseInfo_qcc/baseinfo1113.py
+37
-29
baseinfo1122.py
comData/BaseInfo_qcc/baseinfo1122.py
+2
-2
没有找到文件。
comData/BaseInfo_qcc/baseinfo1113.py
浏览文件 @
f7886002
...
...
@@ -292,7 +292,7 @@ def dic_handle(result_dic):
return
aa_dict
# 采集准备
def
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
# if social_code:
# dic_info = baseCore.getInfomation(social_code)
...
...
@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else
:
# 开始采集
try
:
if
spiderwork
(
soup
,
com_name
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
if
spiderwork
(
soup
,
com_name
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
count
+=
1
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}'
)
token
.
updateTokeen
(
id_cookie
,
3
)
...
...
@@ -373,7 +373,7 @@ def ifbeforename(company_url):
return
''
# 采集基本信息和工商信息
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
qccid
=
company_url
.
split
(
'firm/'
)[
1
]
.
split
(
'.html'
)[
0
]
# 将采集到的企查查id更新
updateSql
=
f
"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
...
...
@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingDate'
]
=
listingDate
aa_dic
[
'category'
]
=
category
aa_dic
[
'exchange'
]
=
exchange
aa_dic
[
'listingType'
]
=
listType
# print(aa_dic)
sendkafka
(
aa_dic
)
...
...
@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingDate'
]
=
listingDate
aa_dic
[
'category'
]
=
category
aa_dic
[
'exchange'
]
=
exchange
aa_dic
[
'listingType'
]
=
listType
sendkafka
(
aa_dic
)
# 判断名称是否统一
def
spiderwork
(
soup
,
receptname
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
spiderwork
(
soup
,
receptname
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
company_url
=
''
try
:
company_list
=
soup
.
find
(
'table'
,
class_
=
'app-ltable ntable ntable-list ntable ntable-list'
)
...
...
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
# 判断是否是曾用名
tr
=
tr_list
[:
1
][
0
]
...
...
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
beforename
=
ifbeforename
(
company_url
)
if
beforename
==
receptname
:
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
#没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
...
...
@@ -549,6 +549,7 @@ if __name__ == '__main__':
else
:
log
.
info
(
'==========已无cookies=========='
)
time
.
sleep
(
30
)
continue
id_cookie
=
cookieinfo
[
0
]
cookie_
=
json
.
loads
(
cookieinfo
[
1
])
...
...
@@ -579,8 +580,8 @@ if __name__ == '__main__':
}
start_time
=
time
.
time
()
# 获取企业信息
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
# company_field = '91220101606092819L
||'
#
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field
=
'913300007125582210
||'
if
company_field
==
'end'
:
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore
.
sendEmail
(
file_name
)
...
...
@@ -595,6 +596,11 @@ if __name__ == '__main__':
while
flag
:
log
.
info
(
'--------已没有数据---------'
)
time
.
sleep
(
30
)
if
not
baseCore
.
check_mysql_conn
(
cnx_
):
# 144数据库
cnx_
=
baseCore
.
cnx
cursor_
=
cnx_
.
cursor
()
log
.
info
(
'===11数据库重新连接成功==='
)
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
if
company_field
:
flag
=
False
...
...
@@ -604,26 +610,28 @@ if __name__ == '__main__':
continue
social_code
=
company_field
.
split
(
'|'
)[
0
]
com_name
=
company_field
.
split
(
'|'
)[
2
]
.
replace
(
' '
,
''
)
ynDomestic
=
company_field
.
split
(
'|'
)[
15
]
countryName
=
company_field
.
split
(
'|'
)[
16
]
securitiesCode
=
company_field
.
split
(
'|'
)[
17
]
securitiesShortName
=
company_field
.
split
(
'|'
)[
18
]
listingDate
=
company_field
.
split
(
'|'
)[
21
]
category
=
company_field
.
split
(
'|'
)[
19
]
exchange
=
company_field
.
split
(
'|'
)[
20
]
# ynDomestic = ''
# countryName = ''
# securitiesCode = ''
# securitiesShortName = ''
# listingDate = ''
# category = ''
# exchange = ''
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
com_name
=
company_field
.
split
(
'|'
)[
1
]
.
replace
(
' '
,
''
)
# ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19]
# exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21]
ynDomestic
=
''
countryName
=
''
securitiesCode
=
''
securitiesShortName
=
''
listingDate
=
''
category
=
''
exchange
=
''
listType
=
''
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
time
.
sleep
(
2
)
#
break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
...
...
comData/BaseInfo_qcc/baseinfo1122.py
浏览文件 @
f7886002
...
...
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
# 判断是否是曾用名
tr
=
tr_list
[:
1
][
0
]
...
...
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
beforename
=
ifbeforename
(
company_url
)
if
beforename
==
receptname
:
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
#没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论