天眼查基本信息

7bf2e193 · 薛凌堃 · 1c479868 · 7bf2e193
--- a/comData/Tyc/baseinfo0130_tyc.py
+++ b/comData/Tyc/baseinfo0130_tyc.py
@@ -13,7 +13,7 @@ from selenium.webdriver.support.wait import WebDriverWait
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '天眼查登录信息']
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
+from dateutil.relativedelta import relativedelta
 import sys
 # sys.path.append('D:\\KK\\zzsn_spider\\base')
 sys.path.append('D:\\kkwork\\zzsn_spider\\base')
@@ -320,17 +320,18 @@ def dic_handle(result_dic):
    }

    return aa_dict
+
 # 检查登陆状态
 def checklogin(key):

    t = int(time.time())
    # url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
    url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
-    # ip = baseCore.get_proxy()
-    # req = requests.get(headers=headers, url=url, proxies=ip)
-    req = s.get(headers=headers, url=url)
-    time.sleep(1)
-    soup = BeautifulSoup(req.content, 'html.parser')
+    driver.get(url)
+    time.sleep(2)
+
+    page_source = driver.page_source
+    soup = BeautifulSoup(page_source, 'html.parser')
    # todo:检查未登录状态
    # if soup.find('title').text == '会员登录 - 企查查':
    #     log.info('状态---未登录')
@@ -390,9 +391,9 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin


 def ifbeforename(company_url):
-
-    req_ = s.get(headers=headers, url=company_url)
-    com_soup = BeautifulSoup(req_.content, 'html.parser')
+    driver.get(company_url)
+    time.sleep(2)
+    com_soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
    except:
@@ -412,9 +413,10 @@ def ifbeforename(company_url):
 def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('company/')[1]
    log.info(f'====={qccid}=====')
-
-    req_ = s.get(headers=headers, url=company_url)
-    com_soup = BeautifulSoup(req_.content, 'html.parser')
+    driver.get(company_url)
+    # req_ = s.get(headers=headers, url=company_url)
+    page_source_detail = driver.page_source
+    com_soup = BeautifulSoup(page_source_detail, 'html.parser')
    #todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
    sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text

@@ -502,9 +504,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        print(aa_dic)
        # sendkafka(aa_dic)
        # print(aa_dic)
-        post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
-        dic_info = json.dumps(aa_dic)
-        req = requests.post(post_url, data=dic_info)
+        # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
+        # dic_info = json.dumps(aa_dic)
+        # req = requests.post(post_url, data=dic_info)

    else:
        data_baseinfo = baseinfo(com_soup)
@@ -543,9 +545,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingType'] = listType
        # sendkafka(aa_dic)
        print(aa_dic)
-        post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
-        dic_info = json.dumps(aa_dic)
-        req = requests.post(post_url, data=dic_info)
+        # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
+        # dic_info = json.dumps(aa_dic)
+        # req = requests.post(post_url, data=dic_info)

 def remove_parentheses(text):
    # 清除中文小括号
@@ -623,10 +625,26 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
            return False
    return True

+def login():  # Open tianyancha.com in a new driver, add the cookies from token.get_cookies(), return (driver, id_cookie).
+    driver = create_driver()
+    url = 'https://www.tianyancha.com/'
+    driver.get(url)  # load the site before add_cookie below; presumably required so the cookies attach to this domain - confirm
+    driver.maximize_window()
+    # time.sleep(10)
+    cookies_list, id_cookie = token.get_cookies()  # id_cookie is not used here, only handed back to the caller
+    for cookie in cookies_list:
+        driver.add_cookie(cookie)
+    time.sleep(5)  # fixed 5 s pause before the reload
+    driver.refresh()  # reload the page now that the cookies have been added
+    # url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
+    # driver.get(url_test)
+    # # driver.get('https://www.qcc.com/')
+    time.sleep(5)  # fixed 5 s pause after the reload
+    return driver,id_cookie

 if __name__ == '__main__':
    taskType = '基本信息/天眼查'
-    # driver, id_cookie = login()
+    driver, id_cookie = login()
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        file_name = f'./data/国内企业基本信息采集情况.xlsx'
@@ -644,12 +662,12 @@ if __name__ == '__main__':
        # cookies = {}
        # for cookie in cookies_list:
        #     cookies[cookie['name']] = cookie['value']
-        s = requests.Session()
+        # s = requests.Session()
        # s.cookies.update(cookies)
        start_time = time.time()
        # 获取企业信息
-        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-        company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
+        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'

        if company_field == 'end':
            # 本轮处理完毕，需要发送邮件，并且进入下一轮