Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
4f59604c
提交
4f59604c
authored
2月 20, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查基本信息
上级
7bf2e193
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
73 行增加
和
4 行删除
+73
-4
baseinfo0130_tyc.py
comData/Tyc/baseinfo0130_tyc.py
+73
-4
没有找到文件。
comData/Tyc/baseinfo0130_tyc.py
浏览文件 @
4f59604c
# -*- coding: utf-8 -*-
import
datetime
import
json
import
re
import
time
...
...
@@ -409,6 +410,64 @@ def ifbeforename(company_url):
else
:
return
''
# Parse a relative or absolute Chinese time expression
def paserTime(publishtime, now=None):
    """Convert a Chinese time expression into a ``datetime``.

    Supported forms (checked in this order):

    * ``N年前`` / ``N月前`` / ``N周前`` / ``N天前`` -- N years / months /
      weeks / days before the reference time (a year counts as 365 days)
    * ``前天`` / ``昨天`` -- two days / one day before the reference time
    * ``今天``, ``N小时前``, ``N分钟前``, ``N小时M分钟前`` -- today, or the
      given hours and/or minutes before the reference time
    * ``YYYY年MM月DD日`` -- an absolute date
    * ``MM月DD日`` -- a date in the reference time's year

    Args:
        publishtime: Text to parse; surrounding whitespace is ignored.
        now: Reference ``datetime`` for relative expressions. Defaults to
            ``datetime.datetime.now()``.

    Returns:
        A ``datetime.datetime`` when the text is recognised and parsed.
        Otherwise the stripped input string is returned unchanged, so
        callers must be prepared to receive a ``str``.
    """
    current_datetime = datetime.datetime.now() if now is None else now
    text = publishtime.strip()

    def _first_number():
        # Raises IndexError when the text contains no digits at all.
        return int(re.findall(r'\d+', text)[0])

    try:
        if '年前' in text:
            return current_datetime - datetime.timedelta(days=365 * _first_number())
        if '月前' in text:
            # timedelta has no month unit; relativedelta does calendar-aware
            # month arithmetic.
            # NOTE(review): relativedelta is assumed to be imported at module
            # level (dateutil) -- the import is not visible here; confirm.
            return current_datetime - relativedelta(months=_first_number())
        if '周前' in text:
            return current_datetime - datetime.timedelta(weeks=_first_number())
        if '天前' in text:
            return current_datetime - datetime.timedelta(days=_first_number())
        if '前天' in text:
            return current_datetime - datetime.timedelta(days=2)
        if '昨天' in text:
            return current_datetime - datetime.timedelta(days=1)
        if '今天' in text or '小时前' in text or '分钟前' in text:
            # Extract hours and minutes independently so that "3小时前",
            # "5分钟前" and "3小时20分钟前" are all handled.
            hour_match = re.search(r'(\d+)\s*小时', text)
            minute_match = re.search(r'(\d+)\s*分钟', text)
            if hour_match is None and minute_match is None and '今天' not in text:
                # e.g. "半小时前": no numeric amount to subtract.
                raise ValueError(text)
            hours = int(hour_match.group(1)) if hour_match else 0
            minutes = int(minute_match.group(1)) if minute_match else 0
            return current_datetime - datetime.timedelta(hours=hours, minutes=minutes)
        if '年' in text and '月' in text:
            return datetime.datetime.strptime(text, '%Y年%m月%d日')
        if '月' in text and '日' in text:
            # The year is omitted, so assume the reference time's year.
            dated = str(current_datetime.year) + '年' + text
            return datetime.datetime.strptime(dated, '%Y年%m月%d日')
    except (ValueError, IndexError, OverflowError):
        # Best effort: report the failure and fall through to returning the
        # unparsed text.
        print('时间解析异常!!')
    return text
# 采集基本信息和工商信息
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
qccid
=
company_url
.
split
(
'company/'
)[
1
]
...
...
@@ -418,7 +477,17 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
page_source_detail
=
driver
.
page_source
com_soup
=
BeautifulSoup
(
page_source_detail
,
'html.parser'
)
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime
=
com_soup
.
find
(
'div'
,
class_
=
'index_detail-refresh__6W7U4'
)
.
find
(
'span'
)
.
text
try
:
sourceUpdateTime_
=
com_soup
.
find
(
'div'
,
class_
=
'index_detail-refresh__6W7U4'
)
.
find
(
'span'
)
.
text
pattern
=
r'\d{4}-\d{2}-\d{2}'
matched
=
re
.
findall
(
pattern
,
sourceUpdateTime_
)
if
matched
:
sourceUpdateTime
=
sourceUpdateTime_
else
:
sourceUpdateTime
=
paserTime
(
sourceUpdateTime_
)
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
except
:
log
.
info
(
f
'天眼查无该企业{social_code}'
)
return
try
:
businessinfo
=
com_soup
.
find
(
'table'
,
{
'class'
:
'index_tableBox__ZadJW'
})
...
...
@@ -666,8 +735,8 @@ if __name__ == '__main__':
# s.cookies.update(cookies)
start_time
=
time
.
time
()
# 获取企业信息
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
#
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
#
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field
=
'|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if
company_field
==
'end'
:
# 本轮处理完毕,需要发送邮件,并且进入下一轮
...
...
@@ -719,7 +788,7 @@ if __name__ == '__main__':
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
time
.
sleep
(
10
)
#
break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论