Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
7bf2e193
提交
7bf2e193
authored
2月 20, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查基本信息
上级
1c479868
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
40 行增加
和
22 行删除
+40
-22
baseinfo0130_tyc.py
comData/Tyc/baseinfo0130_tyc.py
+40
-22
没有找到文件。
comData/Tyc/baseinfo0130_tyc.py
浏览文件 @
7bf2e193
...
...
@@ -13,7 +13,7 @@ from selenium.webdriver.support.wait import WebDriverWait
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'天眼查登录信息'
]
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
from
dateutil.relativedelta
import
relativedelta
import
sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base'
)
...
...
@@ -320,17 +320,18 @@ def dic_handle(result_dic):
}
return
aa_dict
# 检查登陆状态
def
checklogin
(
key
):
t
=
int
(
time
.
time
())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url
=
f
'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
# ip = baseCore.get_proxy(
)
# req = requests.get(headers=headers, url=url, proxies=ip
)
req
=
s
.
get
(
headers
=
headers
,
url
=
url
)
time
.
sleep
(
1
)
soup
=
BeautifulSoup
(
req
.
content
,
'html.parser'
)
driver
.
get
(
url
)
time
.
sleep
(
2
)
page_source
=
driver
.
page_source
soup
=
BeautifulSoup
(
page_source
,
'html.parser'
)
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
...
...
@@ -390,9 +391,9 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
def
ifbeforename
(
company_url
):
req_
=
s
.
get
(
headers
=
headers
,
url
=
company_url
)
com_soup
=
BeautifulSoup
(
req_
.
content
,
'html.parser'
)
driver
.
get
(
company_url
)
time
.
sleep
(
2
)
com_soup
=
BeautifulSoup
(
driver
.
page_source
,
'html.parser'
)
try
:
businessinfo
=
com_soup
.
find
(
'table'
,
{
'class'
:
'index_tableBox__ZadJW'
})
except
:
...
...
@@ -412,9 +413,10 @@ def ifbeforename(company_url):
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
qccid
=
company_url
.
split
(
'company/'
)[
1
]
log
.
info
(
f
'====={qccid}====='
)
req_
=
s
.
get
(
headers
=
headers
,
url
=
company_url
)
com_soup
=
BeautifulSoup
(
req_
.
content
,
'html.parser'
)
driver
.
get
(
company_url
)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail
=
driver
.
page_source
com_soup
=
BeautifulSoup
(
page_source_detail
,
'html.parser'
)
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime
=
com_soup
.
find
(
'div'
,
class_
=
'index_detail-refresh__6W7U4'
)
.
find
(
'span'
)
.
text
...
...
@@ -502,9 +504,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
print
(
aa_dic
)
# sendkafka(aa_dic)
# print(aa_dic)
post_url
=
'http://192.168.1.41:8088/enterprise/check/judge'
dic_info
=
json
.
dumps
(
aa_dic
)
req
=
requests
.
post
(
post_url
,
data
=
dic_info
)
#
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
#
dic_info = json.dumps(aa_dic)
#
req = requests.post(post_url, data=dic_info)
else
:
data_baseinfo
=
baseinfo
(
com_soup
)
...
...
@@ -543,9 +545,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingType'
]
=
listType
# sendkafka(aa_dic)
print
(
aa_dic
)
post_url
=
'http://192.168.1.41:8088/enterprise/check/judge'
dic_info
=
json
.
dumps
(
aa_dic
)
req
=
requests
.
post
(
post_url
,
data
=
dic_info
)
#
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
#
dic_info = json.dumps(aa_dic)
#
req = requests.post(post_url, data=dic_info)
def
remove_parentheses
(
text
):
# 清除中文小括号
...
...
@@ -623,10 +625,26 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return
False
return
True
def
login
():
driver
=
create_driver
()
url
=
'https://www.tianyancha.com/'
driver
.
get
(
url
)
driver
.
maximize_window
()
# time.sleep(10)
cookies_list
,
id_cookie
=
token
.
get_cookies
()
for
cookie
in
cookies_list
:
driver
.
add_cookie
(
cookie
)
time
.
sleep
(
5
)
driver
.
refresh
()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time
.
sleep
(
5
)
return
driver
,
id_cookie
if
__name__
==
'__main__'
:
taskType
=
'基本信息/天眼查'
#
driver, id_cookie = login()
driver
,
id_cookie
=
login
()
while
True
:
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./data/国内企业基本信息采集情况.xlsx'
...
...
@@ -644,12 +662,12 @@ if __name__ == '__main__':
# cookies = {}
# for cookie in cookies_list:
# cookies[cookie['name']] = cookie['value']
s
=
requests
.
Session
()
#
s = requests.Session()
# s.cookies.update(cookies)
start_time
=
time
.
time
()
# 获取企业信息
#
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field
=
'|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
#
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if
company_field
==
'end'
:
# 本轮处理完毕,需要发送邮件,并且进入下一轮
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论