Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
8cf6e366
提交
8cf6e366
authored
2月 23, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
企业新增及更新
上级
5e520511
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
162 行增加
和
111 行删除
+162
-111
CorePerson.py
comData/Tyc/CorePerson.py
+2
-1
CorePerson_Update.py
comData/Tyc/CorePerson_Update.py
+5
-2
baseinfo0130_tyc.py
comData/Tyc/baseinfo0130_tyc.py
+84
-61
baseinfotyc_update.py
comData/Tyc/baseinfotyc_update.py
+68
-45
baseinfouptime_tyc.py
comData/Tyc/baseinfouptime_tyc.py
+1
-1
classtool.py
comData/Tyc/classtool.py
+2
-1
没有找到文件。
comData/Tyc/CorePerson.py
浏览文件 @
8cf6e366
...
@@ -87,7 +87,8 @@ def doJob():
...
@@ -87,7 +87,8 @@ def doJob():
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'version'
:
'TYC-Web'
'version'
:
'TYC-Web'
}
}
cookies_list
,
id_cookie
=
token
.
get_cookies
()
cookies_list
,
id_cookie
,
user_name
=
token
.
get_cookies
()
log
.
info
(
f
'=====当前使用的是{user_name}的cookie======'
)
cookies
=
{}
cookies
=
{}
for
cookie
in
cookies_list
:
for
cookie
in
cookies_list
:
cookies
[
cookie
[
'name'
]]
=
cookie
[
'value'
]
cookies
[
cookie
[
'name'
]]
=
cookie
[
'value'
]
...
...
comData/Tyc/CorePerson_Update.py
浏览文件 @
8cf6e366
...
@@ -87,7 +87,8 @@ def doJob():
...
@@ -87,7 +87,8 @@ def doJob():
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'version'
:
'TYC-Web'
'version'
:
'TYC-Web'
}
}
cookies_list
,
id_cookie
=
token
.
get_cookies
()
cookies_list
,
id_cookie
,
user_name
=
token
.
get_cookies
()
log
.
info
(
f
'=====当前使用的是{user_name}的cookie======'
)
cookies
=
{}
cookies
=
{}
for
cookie
in
cookies_list
:
for
cookie
in
cookies_list
:
cookies
[
cookie
[
'name'
]]
=
cookie
[
'value'
]
cookies
[
cookie
[
'name'
]]
=
cookie
[
'value'
]
...
@@ -212,7 +213,7 @@ def doJob():
...
@@ -212,7 +213,7 @@ def doJob():
total_page
=
0
total_page
=
0
flag
=
0
flag
=
0
baseCore
.
rePutIntoR
(
'UpdateCoreperson:Map'
,
item
)
baseCore
.
rePutIntoR
(
'UpdateCoreperson:Map'
,
item
)
log
.
info
(
f
'{id}---{xydm}----{tycid}----页面和接口数据不对应'
)
log
.
info
(
f
'{id}---{xydm}----{tycid}----页面和接口数据不对应
---{charge}---{total_page2}---{total_page3}
'
)
continue
continue
if
total_page
==
0
:
if
total_page
==
0
:
token
.
updateTokeen
(
id_cookie
,
2
)
token
.
updateTokeen
(
id_cookie
,
2
)
...
@@ -223,6 +224,8 @@ def doJob():
...
@@ -223,6 +224,8 @@ def doJob():
# # todo:获取页数
# # todo:获取页数
# total_page = 34
# total_page = 34
# flag = 2
# flag = 2
# todo: 测试程序是否执行到这一步
log
.
info
(
f
'总数为{total_page}'
)
for
page
in
range
(
1
,
int
((
total_page
/
20
)
+
1
)
+
1
):
for
page
in
range
(
1
,
int
((
total_page
/
20
)
+
1
)
+
1
):
res
=
None
res
=
None
for
c
in
range
(
3
):
for
c
in
range
(
3
):
...
...
comData/Tyc/baseinfo0130_tyc.py
浏览文件 @
8cf6e366
...
@@ -57,6 +57,12 @@ def sendkafka(post_data):
...
@@ -57,6 +57,12 @@ def sendkafka(post_data):
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
exception
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
exception
)
log
.
info
(
f
"{com_name}--{social_code}--kafka传输失败"
)
log
.
info
(
f
"{com_name}--{social_code}--kafka传输失败"
)
def Lreputredis(company_field):
    """Re-queue ``company_field`` on the Redis work list, before the 'end' marker.

    Every existing ``'end'`` entry is removed from the list first, then
    ``company_field`` is appended, then a single ``'end'`` entry is appended
    again, so the re-queued item sits immediately ahead of the marker.

    Args:
        company_field: The list entry to put back on the queue.

    NOTE(review): ``baseCore.r`` is presumably a redis-py client; the three
    commands are issued separately, not as one transaction - confirm that is
    acceptable if several workers share this list.
    """
    queue_key = 'BaseInfoEnterprise:gnqy_socialCode'
    redis_client = baseCore.r
    # A count of 0 asks LREM to drop every matching 'end' entry, not just one.
    redis_client.lrem(queue_key, 0, 'end')
    # Append the item first, then the marker, so the marker stays last.
    for entry in (company_field, 'end'):
        redis_client.rpush(queue_key, entry)
# 合并基本信息和工商信息字段
# 合并基本信息和工商信息字段
def
getinfo
(
dict1
,
dict2
):
def
getinfo
(
dict1
,
dict2
):
# 取出两个字典的key值集合
# 取出两个字典的key值集合
...
@@ -352,7 +358,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
...
@@ -352,7 +358,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
soup
=
checklogin
(
com_name
)
soup
=
checklogin
(
com_name
)
if
not
soup
:
if
not
soup
:
log
.
info
(
"登录失效===重新放入redis"
)
log
.
info
(
"登录失效===重新放入redis"
)
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis
(
company_field
)
token
.
updateTokeen
(
id_cookie
,
2
)
token
.
updateTokeen
(
id_cookie
,
2
)
# log.info('=====已重新放入redis,失效cookies已删除======')
# log.info('=====已重新放入redis,失效cookies已删除======')
time
.
sleep
(
20
)
time
.
sleep
(
20
)
...
@@ -361,18 +368,23 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
...
@@ -361,18 +368,23 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
try
:
try
:
searchinfo
=
soup
.
find
(
'div'
,
class_
=
'index_content-tool-title__K1Z6C'
)
.
find
(
'span'
,
class_
=
'index_title-count__lDSjB'
)
.
text
searchinfo
=
soup
.
find
(
'div'
,
class_
=
'index_content-tool-title__K1Z6C'
)
.
find
(
'span'
,
class_
=
'index_title-count__lDSjB'
)
.
text
except
:
except
:
log
.
info
(
"登录失效===重新放入redis"
)
try
:
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
# todo:可能是搜不到该企业
token
.
updateTokeen
(
id_cookie
,
2
)
errormessage
=
soup
.
find
(
'div'
,
class_
=
'index_no-data-reason-title__V3gFY'
)
.
text
log
.
info
(
'=====已重新放入redis,cookies已封号======'
)
if
'抱歉'
in
errormessage
:
time
.
sleep
(
20
)
log
.
info
(
'=====搜索不到该企业===='
)
return
count
data
=
[
com_name
,
social_code
]
if
searchinfo
==
'0'
:
# todo:搜不到的企业需要返回到一个表格中
log
.
info
(
'=====搜索不到该企业===='
)
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
data
=
[
com_name
,
social_code
]
return
count
# todo:搜不到的企业需要返回到一个表格中
except
:
file
.
appenddata
(
file_name
,
'需处理企业'
,
data
)
log
.
info
(
"登录失效===重新放入redis"
)
return
count
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis
(
company_field
)
token
.
updateTokeen
(
id_cookie
,
2
)
# log.info('=====已重新放入redis,cookies已封号======')
time
.
sleep
(
20
)
return
count
else
:
else
:
# 开始采集
# 开始采集
try
:
try
:
...
@@ -385,7 +397,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
...
@@ -385,7 +397,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
return
count
return
count
except
Exception
as
e
:
except
Exception
as
e
:
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis
(
company_field
)
token
.
updateTokeen
(
id_cookie
,
2
)
token
.
updateTokeen
(
id_cookie
,
2
)
log
.
info
(
'=====已重新放入redis,cookies已封号======'
)
log
.
info
(
'=====已重新放入redis,cookies已封号======'
)
return
count
return
count
...
@@ -578,45 +591,50 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
...
@@ -578,45 +591,50 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
# req = requests.post(post_url, data=dic_info)
# req = requests.post(post_url, data=dic_info)
else
:
else
:
data_baseinfo
=
baseinfo
(
com_soup
)
# todo: 重新放入redis 删除end再放入ruend
# 主要针对香港台湾企业,社会信用代码传为给定的
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
try
:
Lreputredis
(
company_field
)
data_baseinfo
[
'统一社会信用代码'
]
log
.
error
(
f
'未找到工商信息,重新塞入redis'
)
except
:
log
.
info
(
'未获取到统一社会信用代码'
)
# data_baseinfo = baseinfo(com_soup)
if
social_code
:
# # 主要针对香港台湾企业,社会信用代码传为给定的
data_baseinfo
[
'统一社会信用代码'
]
=
social_code
# try:
else
:
# data_baseinfo['统一社会信用代码']
# 如果未给定社会信用代码,则返回
# except:
return
False
# log.info('未获取到统一社会信用代码')
if
data_baseinfo
[
'企业名称'
]
.
startswith
(
'('
)
and
data_baseinfo
[
'企业名称'
]
.
endswith
(
')'
):
# if social_code:
data_baseinfo
[
'企业名称'
]
=
data_baseinfo
[
'企业名称'
][
1
:
-
1
]
# data_baseinfo['统一社会信用代码'] = social_code
if
data_baseinfo
[
'企业名称'
]
==
'-'
and
com_name
:
# else:
data_baseinfo
[
'企业名称'
]
=
com_name
# # 如果未给定社会信用代码,则返回
elif
not
com_name
:
# return False
return
False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
else
:
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
pass
# if data_baseinfo['企业名称'] == '-' and com_name:
# 采集成功的企业
# data_baseinfo['企业名称'] = com_name
data
=
[
com_name
,
data_baseinfo
[
'企业名称'
],
social_code
,
data_baseinfo
[
'统一社会信用代码'
]]
# elif not com_name:
file
.
appenddata
(
file_name
,
'获取基本信息成功企业'
,
data
)
# return False
# 将字段转化成英文驼峰
# else:
aa_dic
=
dic_handle
(
data_baseinfo
)
# pass
aa_dic
[
'sourceUpdateTime'
]
=
sourceUpdateTime
# # 采集成功的企业
aa_dic
[
'qccId'
]
=
qccid
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
aa_dic
[
'ynDomestic'
]
=
ynDomestic
# file.appenddata(file_name, '获取基本信息成功企业', data)
aa_dic
[
'countryName'
]
=
countryName
# # 将字段转化成英文驼峰
aa_dic
[
'securitiesCode'
]
=
securitiesCode
# aa_dic = dic_handle(data_baseinfo)
aa_dic
[
'securitiesShortName'
]
=
securitiesShortName
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic
[
'listingDate'
]
=
listingDate
# aa_dic['qccId'] = qccid
aa_dic
[
'category'
]
=
category
# aa_dic['ynDomestic'] = ynDomestic
aa_dic
[
'exchange'
]
=
exchange
# aa_dic['countryName'] = countryName
aa_dic
[
'listingType'
]
=
listType
# aa_dic['securitiesCode'] = securitiesCode
# sendkafka(aa_dic)
# aa_dic['securitiesShortName'] = securitiesShortName
print
(
aa_dic
)
# aa_dic['listingDate'] = listingDate
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# aa_dic['category'] = category
# dic_info = json.dumps(aa_dic)
# aa_dic['exchange'] = exchange
# req = requests.post(post_url, data=dic_info)
# aa_dic['listingType'] = listType
# # sendkafka(aa_dic)
# print(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def
remove_parentheses
(
text
):
def
remove_parentheses
(
text
):
# 清除中文小括号
# 清除中文小括号
...
@@ -632,7 +650,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
...
@@ -632,7 +650,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_list
=
soup
.
find_all
(
'div'
,
class_
=
'index_search-box__7YVh6'
)
company_list
=
soup
.
find_all
(
'div'
,
class_
=
'index_search-box__7YVh6'
)
except
:
except
:
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis
(
company_field
)
token
.
updateTokeen
(
id_cookie
,
2
)
token
.
updateTokeen
(
id_cookie
,
2
)
log
.
info
(
'=====已重新放入redis,cookies已封号======'
)
log
.
info
(
'=====已重新放入redis,cookies已封号======'
)
return
False
return
False
...
@@ -695,12 +714,10 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
...
@@ -695,12 +714,10 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return
True
return
True
def
login
():
def
login
():
driver
=
create_driver
()
url
=
'https://www.tianyancha.com/'
driver
.
get
(
url
)
driver
.
maximize_window
()
# time.sleep(10)
# time.sleep(10)
cookies_list
,
id_cookie
=
token
.
get_cookies
()
cookies_list
,
id_cookie
,
user_name
=
token
.
get_cookies
()
log
.
info
(
f
'=====当前使用的是{user_name}的cookie======'
)
for
cookie
in
cookies_list
:
for
cookie
in
cookies_list
:
driver
.
add_cookie
(
cookie
)
driver
.
add_cookie
(
cookie
)
time
.
sleep
(
5
)
time
.
sleep
(
5
)
...
@@ -713,8 +730,13 @@ def login():
...
@@ -713,8 +730,13 @@ def login():
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
taskType
=
'基本信息/天眼查'
taskType
=
'基本信息/天眼查'
driver
,
id_cookie
=
login
()
# driver, id_cookie = login()
driver
=
create_driver
()
url
=
'https://www.tianyancha.com/'
driver
.
get
(
url
)
driver
.
maximize_window
()
while
True
:
while
True
:
driver
,
id_cookie
=
login
()
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./data/国内企业基本信息采集情况.xlsx'
file_name
=
f
'./data/国内企业基本信息采集情况.xlsx'
file
.
createFile
(
file_name
)
file
.
createFile
(
file_name
)
...
@@ -761,7 +783,8 @@ if __name__ == '__main__':
...
@@ -761,7 +783,8 @@ if __name__ == '__main__':
if
company_field
:
if
company_field
:
flag
=
False
flag
=
False
log
.
info
(
"-----已添加数据------"
)
log
.
info
(
"-----已添加数据------"
)
baseCore
.
r
.
lpush
(
'BaseInfoEnterprise:gnqy_socialCode'
,
company_field
)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis
(
company_field
)
continue
continue
continue
continue
# company_field_ = f'|{company_field}'
# company_field_ = f'|{company_field}'
...
...
comData/Tyc/baseinfotyc_update.py
浏览文件 @
8cf6e366
...
@@ -7,9 +7,11 @@ import datetime
...
@@ -7,9 +7,11 @@ import datetime
import
pymongo
import
pymongo
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
dateutil.relativedelta
import
relativedelta
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
import
urllib3
import
urllib3
from
retry
import
retry
from
selenium.webdriver.support.wait
import
WebDriverWait
from
selenium.webdriver.support.wait
import
WebDriverWait
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'天眼查登录信息'
]
'天眼查登录信息'
]
...
@@ -385,7 +387,7 @@ def redaytowork(com_name, social_code, file_name):
...
@@ -385,7 +387,7 @@ def redaytowork(com_name, social_code, file_name):
if
spiderwork
(
soup
,
com_name
,
file_name
):
if
spiderwork
(
soup
,
com_name
,
file_name
):
count
+=
1
count
+=
1
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}'
)
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}'
)
#
token.updateTokeen(id_cookie,3)
token
.
updateTokeen
(
id_cookie
,
3
)
return
count
return
count
else
:
else
:
return
count
return
count
...
@@ -430,8 +432,8 @@ def paserTime(publishtime):
...
@@ -430,8 +432,8 @@ def paserTime(publishtime):
elif
'月前'
in
publishtime
:
elif
'月前'
in
publishtime
:
numbers
=
re
.
findall
(
r'\d+'
,
publishtime
)
numbers
=
re
.
findall
(
r'\d+'
,
publishtime
)
day
=
int
(
numbers
[
0
])
day
=
int
(
numbers
[
0
])
delta
=
datetime
.
tim
edelta
(
months
=
day
)
publishtime
=
current_datetime
-
relativ
edelta
(
months
=
day
)
publishtime
=
current_datetime
-
delta
#
publishtime = current_datetime - delta
elif
'周前'
in
publishtime
:
elif
'周前'
in
publishtime
:
numbers
=
re
.
findall
(
r'\d+'
,
publishtime
)
numbers
=
re
.
findall
(
r'\d+'
,
publishtime
)
day
=
int
(
numbers
[
0
])
day
=
int
(
numbers
[
0
])
...
@@ -472,6 +474,16 @@ def paserTime(publishtime):
...
@@ -472,6 +474,16 @@ def paserTime(publishtime):
print
(
'时间解析异常!!'
)
print
(
'时间解析异常!!'
)
return
publishtime
return
publishtime
@retry(tries=2, delay=3)
def getBusinessinfo(com_soup):
    """Return the business-registration table found inside the baseInfo section.

    Looks up the ``<div data-dim="baseInfo">`` container in ``com_soup`` and,
    inside it, a ``<table>`` whose class is ``index_tableBox__ZadJW`` (a
    variant with a trailing space is tried as a fallback).

    Args:
        com_soup: Parsed page object exposing ``find()`` (presumably a
            BeautifulSoup document - confirm against the caller).

    Returns:
        The first matching table element.

    Raises:
        RuntimeError: If the baseInfo container or the table is not found.

    NOTE(review): the decorator re-runs this lookup on the same, already
    parsed ``com_soup``; a second attempt presumably cannot find anything
    new - confirm whether the retry is needed.
    """
    base_section = com_soup.find('div', attrs={'data-dim': 'baseInfo'})
    if base_section is None:
        # Previously this fell through to an AttributeError on None; raise the
        # same "not found" error the table lookup uses so the cause is clear.
        raise RuntimeError('工商信息未找到')
    # Try the exact class name first, then the trailing-space variant.
    for class_name in ('index_tableBox__ZadJW', 'index_tableBox__ZadJW '):
        businessinfo = base_section.find('table', {'class': class_name})
        if businessinfo:
            return businessinfo
    raise RuntimeError('工商信息未找到')
# 采集基本信息和工商信息
# 采集基本信息和工商信息
def
spiderinfo
(
company_url
,
receptname
,
file_name
):
def
spiderinfo
(
company_url
,
receptname
,
file_name
):
...
@@ -495,7 +507,7 @@ def spiderinfo(company_url, receptname, file_name):
...
@@ -495,7 +507,7 @@ def spiderinfo(company_url, receptname, file_name):
return
return
try
:
try
:
businessinfo
=
com_soup
.
find
(
'table'
,
{
'class'
:
'index_tableBox__ZadJW'
}
)
businessinfo
=
getBusinessinfo
(
com_soup
)
except
:
except
:
businessinfo
=
''
businessinfo
=
''
if
businessinfo
:
if
businessinfo
:
...
@@ -576,37 +588,42 @@ def spiderinfo(company_url, receptname, file_name):
...
@@ -576,37 +588,42 @@ def spiderinfo(company_url, receptname, file_name):
# req = requests.post(post_url, data=dic_info)
# req = requests.post(post_url, data=dic_info)
else
:
else
:
data_baseinfo
=
baseinfo
(
com_soup
)
# todo: 重新放入redis
# 主要针对香港台湾企业,社会信用代码传为给定的
baseCore
.
r
.
lpush
(
'UpdateBasdeInfo:SocialCode_CompanyName'
,
company_field
)
try
:
log
.
error
(
f
'未找到工商信息,重新塞入redis'
)
data_baseinfo
[
'统一社会信用代码'
]
token
.
updateTokeen
(
id_cookie
,
3
)
except
:
log
.
info
(
'未获取到统一社会信用代码'
)
# data_baseinfo = baseinfo(com_soup)
if
social_code
:
# # 主要针对香港台湾企业,社会信用代码传为给定的
data_baseinfo
[
'统一社会信用代码'
]
=
social_code
# try:
else
:
# data_baseinfo['统一社会信用代码']
# 如果未给定社会信用代码,则返回
# except:
return
False
# log.info('未获取到统一社会信用代码')
if
data_baseinfo
[
'企业名称'
]
.
startswith
(
'('
)
and
data_baseinfo
[
'企业名称'
]
.
endswith
(
')'
):
# if social_code:
data_baseinfo
[
'企业名称'
]
=
data_baseinfo
[
'企业名称'
][
1
:
-
1
]
# data_baseinfo['统一社会信用代码'] = social_code
if
data_baseinfo
[
'企业名称'
]
==
'-'
and
com_name
:
# else:
data_baseinfo
[
'企业名称'
]
=
com_name
# # 如果未给定社会信用代码,则返回
elif
not
com_name
:
# return False
return
False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
else
:
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
pass
# if data_baseinfo['企业名称'] == '-' and com_name:
# 采集成功的企业
# data_baseinfo['企业名称'] = com_name
data
=
[
com_name
,
data_baseinfo
[
'企业名称'
],
social_code
,
data_baseinfo
[
'统一社会信用代码'
]]
# elif not com_name:
file
.
appenddata
(
file_name
,
'获取基本信息成功企业'
,
data
)
# return False
# 将字段转化成英文驼峰
# else:
aa_dic
=
dic_handle
(
data_baseinfo
)
# pass
aa_dic
[
'sourceUpdateTime'
]
=
sourceUpdateTime
# # 采集成功的企业
aa_dic
[
'qccId'
]
=
qccid
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# sendkafka(aa_dic)
# file.appenddata(file_name, '获取基本信息成功企业', data)
log
.
info
(
aa_dic
)
# # 将字段转化成英文驼峰
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# aa_dic = dic_handle(data_baseinfo)
# dic_info = json.dumps(aa_dic)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# req = requests.post(post_url, data=dic_info)
# aa_dic['qccId'] = qccid
# # sendkafka(aa_dic)
# log.info(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def
remove_parentheses
(
text
):
def
remove_parentheses
(
text
):
# 清除中文小括号
# 清除中文小括号
...
@@ -682,12 +699,10 @@ def spiderwork(soup, receptname, file_name):
...
@@ -682,12 +699,10 @@ def spiderwork(soup, receptname, file_name):
return
True
return
True
def
login
():
def
login
():
driver
=
create_driver
()
url
=
'https://www.tianyancha.com/'
driver
.
get
(
url
)
driver
.
maximize_window
()
# time.sleep(10)
# time.sleep(10)
cookies_list
,
id_cookie
=
token
.
get_cookies
()
cookies_list
,
id_cookie
,
user_name
=
token
.
get_cookies
()
log
.
info
(
f
'=====当前使用的是{user_name}的cookie======'
)
for
cookie
in
cookies_list
:
for
cookie
in
cookies_list
:
driver
.
add_cookie
(
cookie
)
driver
.
add_cookie
(
cookie
)
time
.
sleep
(
5
)
time
.
sleep
(
5
)
...
@@ -695,7 +710,7 @@ def login():
...
@@ -695,7 +710,7 @@ def login():
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
# # driver.get('https://www.qcc.com/')
time
.
sleep
(
60
)
time
.
sleep
(
5
)
return
driver
,
id_cookie
return
driver
,
id_cookie
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
@@ -704,15 +719,22 @@ if __name__ == '__main__':
...
@@ -704,15 +719,22 @@ if __name__ == '__main__':
# #手动登录
# #手动登录
# driver.get('https://www.tianyancha.com/')
# driver.get('https://www.tianyancha.com/')
#todo:绕过验证使用cookies登录
#todo:绕过验证使用cookies登录
driver
,
id_cookie
=
login
()
# driver, id_cookie = login()
driver
=
create_driver
()
url
=
'https://www.tianyancha.com/'
driver
.
get
(
url
)
driver
.
maximize_window
()
while
True
:
while
True
:
# todo:绕过验证使用cookies登录
driver
,
id_cookie
=
login
()
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./data/国内企业基本信息更新.xlsx'
file_name
=
f
'./data/国内企业基本信息更新.xlsx'
file
.
createFile
(
file_name
)
file
.
createFile
(
file_name
)
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
company_field
=
baseCore
.
redicPullData
(
'UpdateBasdeInfo:SocialCode_CompanyName'
)
#
company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
# company_field = '913100006073602992|光明乳业
股份有限公司'
company_field
=
'91330000742906207U|浙江我武生物科技
股份有限公司'
if
company_field
==
'end'
:
if
company_field
==
'end'
:
# 本轮处理完毕,需要发送邮件,并且进入下一轮
# 本轮处理完毕,需要发送邮件,并且进入下一轮
...
@@ -775,5 +797,5 @@ if __name__ == '__main__':
...
@@ -775,5 +797,5 @@ if __name__ == '__main__':
company_url
=
'https://www.tianyancha.com/company/'
+
tycid
company_url
=
'https://www.tianyancha.com/company/'
+
tycid
spiderinfo
(
company_url
,
com_name
,
file_name
)
spiderinfo
(
company_url
,
com_name
,
file_name
)
time
.
sleep
(
10
)
time
.
sleep
(
10
)
#
break
break
baseCore
.
close
()
baseCore
.
close
()
\ No newline at end of file
comData/Tyc/baseinfouptime_tyc.py
浏览文件 @
8cf6e366
...
@@ -328,7 +328,7 @@ if __name__ == '__main__':
...
@@ -328,7 +328,7 @@ if __name__ == '__main__':
driver
.
get
(
'https://www.tianyancha.com/'
)
driver
.
get
(
'https://www.tianyancha.com/'
)
while
True
:
while
True
:
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./data/国内企业基本信息
采集情况
.xlsx'
file_name
=
f
'./data/国内企业基本信息
更新
.xlsx'
file
.
createFile
(
file_name
)
file
.
createFile
(
file_name
)
# cookies_list, id_cookie = token.get_cookies()
# cookies_list, id_cookie = token.get_cookies()
# cookies = {}
# cookies = {}
...
...
comData/Tyc/classtool.py
浏览文件 @
8cf6e366
...
@@ -59,7 +59,8 @@ class Token():
...
@@ -59,7 +59,8 @@ class Token():
result
=
db_storage
.
find_one
(
query
,
sort
=
[(
'updateTime'
,
1
)])
result
=
db_storage
.
find_one
(
query
,
sort
=
[(
'updateTime'
,
1
)])
cookies
=
result
[
'cookies'
]
cookies
=
result
[
'cookies'
]
id_token
=
result
[
'_id'
]
id_token
=
result
[
'_id'
]
return
cookies
,
id_token
user_name
=
result
[
'name'
]
return
cookies
,
id_token
,
user_name
# 删除失效的token
# 删除失效的token
def
delete_token
(
self
,
cookie_
):
def
delete_token
(
self
,
cookie_
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论