Commit 785f3d85 Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# Supplement the remaining core personnel information
# First collect the Tianyancha id, then use that id to collect the core personnel information
"""
Tianyancha personnel information
Problem 1: page and API data are inconsistent -- current approach: handle such companies separately
Problem 2: the personnel total taken from the page is not accurate enough -- current approach: change the parent-tag lookup logic
"""
import datetime
import json
import os
import subprocess
import sys
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
......@@ -17,10 +16,6 @@ from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
......@@ -30,71 +25,72 @@ cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
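# Legacy proxy helper: reads "host-port" strings from the clb_proxy table and turns
# them into requests-style proxy dicts; the reworked code below relies on
# baseCore.get_proxy() instead.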
def get_proxy(ip_num):
sql = "select proxy from clb_proxy"
cursor_.execute(sql)
proxy_lists = cursor_.fetchall()
cnx_.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[ip_num]
from lxml import etree
from classtool import Token, File, Tag
token = Token()
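# get_html: fetch the company page and read the total shown after "最新公示" in the
# staff section (div with data-dim="staff"); roughly, -1 signals a missing or
# unexpected staff block and 0 a parse failure, with @retry giving three attempts.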
@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
def get_html(tycid, s, headers):
url = f"https://www.tianyancha.com/company/{tycid}"
ip = get_proxy(ip_num)
response = requests.get(url=url, headers=headers, proxies=ip)
# ip = baseCore.get_proxy()
response = s.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
ip_num += 1
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
return -1
except:
return 0
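# get_page: call one of the capi staff endpoints through the shared session and
# return data.total, i.e. how many people that endpoint reports; also retried 3 times.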
@retry(tries=3, delay=1)
def get_page(url, ip_num):
ip = get_proxy(ip_num)
res = requests.get(url=url, headers=headers, proxies=ip)
if res.status_code == 200:
pass
else:
ip_num += 1
raise
def get_page(url, s, headers):
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
time.sleep(1)
total_page_ = res.json()['data']['total']
if res.status_code != 200:
raise
data_page = res.json()
total_page_ = data_page['data']['total']
return total_page_
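# doJob: main loop -- rotate a cookie from the token pool into a requests.Session,
# pull one enterprise from Redis, make sure it has a Tianyancha id, then pick the
# staff endpoint whose total matches the page and collect every executive.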
def doJob():
# for social_code in social_code_list:
while True:
# todo: set up how cookies are used
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
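# Cookies come from the token pool (classtool.Token); loading them into one Session
# keeps the login state for every page and capi request in this iteration.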
# Use the social credit code pulled from Redis to fetch the corresponding basic info from the database
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# If there is no more data in Redis, wait
social_code = '91320691550279691N'
social_code = '911101067916069050'
if social_code == None:
time.sleep(20)
continue
......@@ -108,26 +104,28 @@ def doJob():
tycid = data[11]
count = data[17]
else:
#Re-insert the data into redis
# Re-insert the data into redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
#The enterprise is not in the database yet and needs to be added
pass
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# Write it into the database
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name, xydm))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
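# No Tianyancha id on record yet -- try to resolve it from the company name.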
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name)
retData = getTycIdByXYDM(com_name, s)
if retData['state']:
tycid = retData['tycData']['id']
# # todo: write to the database
......@@ -147,35 +145,29 @@ def doJob():
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
#todo: first determine which endpoint to use
try:
charge = get_html(tycid, ip_num)
except Exception as e:
charge = get_html(tycid, s, headers)
# The page request failed on all three attempts
except:
charge = -1
log.info(e)
total_page = 0
t = int(time.time() * 1000)
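# charge comes from get_html(): -1 is treated below as a failed page probe, 0 as
# "no 最新公示 tab", and a positive value as the personnel total shown on the page.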
if charge == -1:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
# Get the pid of the current process
current_pid = baseCore.getPID()
# todo: start a new process and kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, ip_num)
total_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
......@@ -186,12 +178,12 @@ def doJob():
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, ip_num)
total_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
time.sleep(2)
time.sleep(1)
try:
total_page3 = get_page(url3, ip_num)
total_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
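# Two listed-company endpoints are probed (noRepeatSeniorExecutive and its Hk variant);
# the one whose total matches the page count (charge) is the one that gets paged through.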
if total_page2 == charge:
......@@ -206,33 +198,38 @@ def doJob():
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
#todo: get the number of pages
time.sleep(2)
for page in range(1, int((total_page/20) + 1)+1):
# # todo: get the number of pages
# total_page = 34
# flag = 2
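# Each API page returns at most pageSize=20 entries, hence total/20 (+1) pages.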
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers, proxies=ip) # ,verify=False
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = requests.get(url_, headers=headers, proxies=ip, verify=False)
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
try:
......@@ -250,7 +247,7 @@ def doJob():
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
#todo: get the current year
# todo: get the current year
now = datetime.datetime.now()
year = now.year
try:
......@@ -266,37 +263,37 @@ def doJob():
except:
person_img = '--'
dic_json = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"头像":person_img,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num+1
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
elif flag == 3:
......@@ -314,8 +311,11 @@ def doJob():
except:
birthYear = ''
personInfo = one_info['resume_cn']
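# employ_date appears to be a millisecond timestamp; convert it to YYYY-MM-DD for the
# start of the current term (empty string if it is missing or malformed).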
timestamp = int(int(one_info['employ_date'])/10000)
try:
timestamp = int(one_info['employ_date']) / 1000
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
except:
currentTerm = ''
dic_json = {
"socialCreditCode": social_code,
"name": name,
......@@ -327,7 +327,7 @@ def doJob():
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm+'至-',
"currentTerm": currentTerm + '至-',
"personInfo": personInfo,
"sort": str(num)
}
......@@ -393,19 +393,23 @@ def doJob():
continue
else:
pass
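# Push the assembled executive list to the internal sync service; on success
# token.updateTokeen(id_cookie, 3) is called, presumably marking the cookie as still valid.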
response = requests.post('http://114.115.236.206:8088/sync/executive',data=json_updata,timeout=300, verify=False)
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
verify=False)
print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
# break
break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
......
......@@ -21,45 +21,29 @@ headers = {
'Connection': 'keep-alive',
'Content-Length': '32',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# headers = {
# 'X-TYCID':'30c1289042f511ee9182cd1e1bcaa517',
# # 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5MjkzMzIxMiwiZXhwIjoxNjk1NTI1MjEyfQ.BKxDem8fpgeDHrIgm3qCoF76ueHtQSG1DggiTl4FAaoNKt4gem6NTX1XYndPXqVj9TXfl-8yp2kKE3jY66dyig',
# 'version':'TYC-Web',
# 'Content-Type':'application/json;charset=UTF-8'
# }
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '天眼查企业id/天眼查'
# Get the Tianyancha id, company name and other info based on the credit code
def getTycIdByXYDM(com_name):
def getTycIdByXYDM(com_name, s):
retData={'state':False,'tycData':None,'reput':True}
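# retData: 'state' flags whether a match was found, 'tycData' carries the matched
# company record, and 'reput' presumably tells the caller whether to re-queue the code.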
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip = baseCore.get_proxy()
paramJsonData = {'keyword':com_name}
paramJsonData = {'keyword': com_name}
try:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
# response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
response = requests.post(url,json=paramJsonData,headers=headers,verify=False)
response = s.post(url, json=paramJsonData, headers=headers)
time.sleep(random.randint(3, 5))
retJsonData = json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state']== 'ok':
if retJsonData['data'] and retJsonData['state'] == 'ok':
pass
else:
log.error(f"---{com_name}-未查询到该企业---")
......