提交 4f59604c 作者: 薛凌堃

天眼查基本信息

上级 7bf2e193
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import datetime
import json import json
import re import re
import time import time
...@@ -409,6 +410,64 @@ def ifbeforename(company_url): ...@@ -409,6 +410,64 @@ def ifbeforename(company_url):
else: else:
return '' return ''
#解析时间
def _subtract_months(moment, months):
    """Return *moment* moved back by *months* calendar months.

    The day of month is clamped to the last valid day of the target month
    (e.g. 31 March minus one month gives 28/29 February).
    """
    # Local import keeps this block self-contained; only the standard
    # library is needed for calendar-month arithmetic.
    import calendar

    month_index = moment.year * 12 + (moment.month - 1) - months
    year, zero_based_month = divmod(month_index, 12)
    month = zero_based_month + 1
    last_day = calendar.monthrange(year, month)[1]
    return moment.replace(year=year, month=month, day=min(moment.day, last_day))


def paserTime(publishtime):
    """Convert a Chinese relative or absolute date string to a datetime.

    Recognised forms (checked in this order):
      * "N年前" / "N月前" / "N周前" / "N天前"  - N years/months/weeks/days ago
      * "前天" / "昨天"                        - two days / one day ago
      * "今天", "N小时前", "N分钟前", "N小时M分钟前" - today / hours and minutes ago
      * "YYYY年MM月DD日"                       - absolute date
      * "MM月DD日"                             - absolute date in the current year

    Args:
        publishtime: The raw text to parse; surrounding whitespace is ignored.

    Returns:
        A ``datetime.datetime`` when parsing succeeds. When the text matches
        none of the forms above, or parsing raises, the stripped input string
        is returned unchanged, so callers must be prepared for a ``str``.
    """
    current_datetime = datetime.datetime.now()
    publishtime = publishtime.strip()
    print(publishtime)
    try:
        if '年前' in publishtime:
            amount = int(re.findall(r'\d+', publishtime)[0])
            # A year is approximated as 365 days.
            publishtime = current_datetime - datetime.timedelta(days=365 * amount)
        elif '月前' in publishtime:
            amount = int(re.findall(r'\d+', publishtime)[0])
            # timedelta has no month unit, so step back by calendar months.
            publishtime = _subtract_months(current_datetime, amount)
        elif '周前' in publishtime:
            amount = int(re.findall(r'\d+', publishtime)[0])
            publishtime = current_datetime - datetime.timedelta(weeks=amount)
        elif '天前' in publishtime:
            amount = int(re.findall(r'\d+', publishtime)[0])
            publishtime = current_datetime - datetime.timedelta(days=amount)
        elif '前天' in publishtime:
            publishtime = current_datetime - datetime.timedelta(days=2)
        elif '昨天' in publishtime:
            publishtime = current_datetime - datetime.timedelta(days=1)
        elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
            # Pull the numbers out with a regex: splitting on the unit words
            # left the trailing "前" behind for inputs such as "3小时前", and
            # int("前") raised, so every hour-only value failed to parse.
            hour_match = re.search(r'(\d+)\s*小时', publishtime)
            minute_match = re.search(r'(\d+)\s*分钟', publishtime)
            if hour_match is None and minute_match is None and '今天' not in publishtime:
                # e.g. "半小时前": no usable number, treat as unparseable.
                raise ValueError(publishtime)
            hours = int(hour_match.group(1)) if hour_match else 0
            minutes = int(minute_match.group(1)) if minute_match else 0
            publishtime = current_datetime - datetime.timedelta(hours=hours, minutes=minutes)
        elif '年' in publishtime and '月' in publishtime:
            publishtime = datetime.datetime.strptime(publishtime, '%Y年%m月%d日')
        elif '月' in publishtime and '日' in publishtime:
            # No year given: assume the current year.
            with_year = str(current_datetime.year) + '年' + publishtime
            publishtime = datetime.datetime.strptime(with_year, '%Y年%m月%d日')
    except Exception:
        # Best effort: report the failure and fall through to return the
        # (stripped) input text unchanged.
        print('时间解析异常!!')
    return publishtime
# 采集基本信息和工商信息 # 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name): def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('company/')[1] qccid = company_url.split('company/')[1]
...@@ -418,7 +477,17 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -418,7 +477,17 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
page_source_detail = driver.page_source page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser') com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器 #todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text try:
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
log.info(f'天眼查无该企业{social_code}')
return
try: try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'}) businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
...@@ -666,8 +735,8 @@ if __name__ == '__main__': ...@@ -666,8 +735,8 @@ if __name__ == '__main__':
# s.cookies.update(cookies) # s.cookies.update(cookies)
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||' company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
...@@ -719,7 +788,7 @@ if __name__ == '__main__': ...@@ -719,7 +788,7 @@ if __name__ == '__main__':
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name) listType, ynDomestic, countryName, file_name)
time.sleep(10) time.sleep(10)
# break break
# baseCore.r.close() # baseCore.r.close()
# baseCore.sendEmail(file_name) # baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论