天眼查基本信息

4f59604c · 薛凌堃 · 7bf2e193 · 4f59604c
--- a/comData/Tyc/baseinfo0130_tyc.py
+++ b/comData/Tyc/baseinfo0130_tyc.py
 # -*- coding: utf-8 -*-
+import datetime
 import json
 import re
 import time
@@ -409,6 +410,64 @@ def ifbeforename(company_url):
    else:
        return ''
# Parse a publish/update time label into a datetime
def paserTime(publishtime):
    """Convert a Chinese relative or absolute time label to a ``datetime``.

    Recognised forms, checked in this order:

    * ``N年前`` / ``N个月前`` / ``N周前`` / ``N天前`` -- N calendar years,
      calendar months, weeks or days before now.
    * ``前天`` / ``昨天`` -- two days / one day before now.
    * ``今天``, ``N小时前``, ``N分钟前``, ``N小时M分钟前`` -- the given
      hours and minutes before now (a bare ``今天`` maps to now).
    * ``YYYY年MM月DD日`` -- an absolute date.
    * ``MM月DD日`` -- an absolute date in the current year.

    Args:
        publishtime: The raw label text; surrounding whitespace is ignored.

    Returns:
        A naive ``datetime.datetime`` when the label is understood.
        Otherwise the stripped input string is returned unchanged (and a
        notice is printed if a recognised form could not be parsed), so
        callers must be prepared for a ``str`` result.
    """
    now = datetime.datetime.now()
    text = publishtime.strip()

    def first_number():
        # Relative labels carry their count as the first run of digits.
        found = re.search(r'\d+', text)
        if found is None:
            raise ValueError('no number in time label')
        return int(found.group())

    try:
        if '年前' in text:
            # Calendar-accurate: same date N years back, not N * 365 days.
            return _subtract_months(now, 12 * first_number())
        if '月前' in text:
            return _subtract_months(now, first_number())
        if '周前' in text:
            return now - datetime.timedelta(weeks=first_number())
        if '天前' in text:
            return now - datetime.timedelta(days=first_number())
        if '前天' in text:
            return now - datetime.timedelta(days=2)
        if '昨天' in text:
            return now - datetime.timedelta(days=1)
        if '今天' in text or '小时前' in text or '分钟前' in text:
            # Pull each unit out independently so "3小时前", "5分钟前" and
            # "1小时20分钟前" all work; splitting on the unit text left a
            # trailing "前" that could not be converted to an int.
            hours = re.search(r'(\d+)\s*小时', text)
            minutes = re.search(r'(\d+)\s*分钟', text)
            if hours is None and minutes is None and '今天' not in text:
                raise ValueError('no hour or minute count in time label')
            return now - datetime.timedelta(
                hours=int(hours.group(1)) if hours else 0,
                minutes=int(minutes.group(1)) if minutes else 0,
            )
        if '年' in text and '月' in text:
            return datetime.datetime.strptime(text, '%Y年%m月%d日')
        if '月' in text and '日' in text:
            # No year in the label: assume the current year.
            return datetime.datetime.strptime(
                str(now.year) + '年' + text, '%Y年%m月%d日'
            )
    except (ValueError, IndexError, OverflowError):
        print('时间解析异常！！')
    return text


def _subtract_months(moment, months):
    """Return *moment* moved back by *months* calendar months.

    The day of month is clamped to the last day of the target month
    (e.g. 31 March minus one month gives the last day of February).
    Raises ``ValueError`` if the result falls outside the supported
    ``datetime`` year range.
    """
    import calendar

    # Work on a zero-based month index so divmod handles year borrow.
    year, month_zero_based = divmod(
        moment.year * 12 + (moment.month - 1) - months, 12
    )
    month = month_zero_based + 1
    last_day = calendar.monthrange(year, month)[1]
    return moment.replace(year=year, month=month, day=min(moment.day, last_day))
 # 采集基本信息和工商信息
 def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('company/')[1]
@@ -418,7 +477,17 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
    page_source_detail = driver.page_source
    com_soup = BeautifulSoup(page_source_detail, 'html.parser')
    #todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
-    sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
+    try:
+        sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
+        pattern = r'\d{4}-\d{2}-\d{2}'
+        matched = re.findall(pattern, sourceUpdateTime_)
+        if matched:
+            sourceUpdateTime = sourceUpdateTime_
+        else:
+            sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
+    except:
+        log.info(f'天眼查无该企业{social_code}')
+        return
    try:
        businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
@@ -666,8 +735,8 @@ if __name__ == '__main__':
        # s.cookies.update(cookies)
        start_time = time.time()
        # 获取企业信息
-        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-        # company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
+        company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
        if company_field == 'end':
            # 本轮处理完毕，需要发送邮件，并且进入下一轮
@@ -719,7 +788,7 @@ if __name__ == '__main__':
        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
                            listType, ynDomestic, countryName, file_name)
        time.sleep(10)
-        # break
+        break
        # baseCore.r.close()
        # baseCore.sendEmail(file_name)
        # 信息采集完成后将该企业的采集次数更新