Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
c442ebb4
提交
c442ebb4
authored
2月 20, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查核心人员更新
上级
0fe52c2b
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
459 行增加
和
12 行删除
+459
-12
CorePerson.py
comData/Tyc/CorePerson.py
+25
-12
CorePerson_Update.py
comData/Tyc/CorePerson_Update.py
+434
-0
没有找到文件。
comData/Tyc/CorePerson.py
浏览文件 @
c442ebb4
...
@@ -45,15 +45,18 @@ def get_html(tycid, s, headers):
...
@@ -45,15 +45,18 @@ def get_html(tycid, s, headers):
# div_part.find('div', class_='dimHeader_root__XTCLe')
# div_part.find('div', class_='dimHeader_root__XTCLe')
except
:
except
:
return
-
1
return
-
1
try
:
if
div_part
is
None
:
tmp_field
=
div_part
.
find
(
'div'
,
class_
=
'dim-tab-root'
)
.
find
(
'span'
)
.
text
return
-
2
if
'最新公示'
in
tmp_field
:
else
:
total
=
div_part
.
find
(
'div'
,
class_
=
'dim-tab-root'
)
.
find
(
'span'
)
.
get_text
()
.
split
(
'最新公示'
)[
1
]
.
replace
(
' '
,
''
)
try
:
return
int
(
total
)
tmp_field
=
div_part
.
find
(
'div'
,
class_
=
'dim-tab-root'
)
.
find
(
'span'
)
.
text
else
:
if
'最新公示'
in
tmp_field
:
return
-
1
total
=
div_part
.
find
(
'div'
,
class_
=
'dim-tab-root'
)
.
find
(
'span'
)
.
get_text
()
.
split
(
'最新公示'
)[
1
]
.
replace
(
' '
,
''
)
except
:
return
int
(
total
)
return
0
else
:
return
-
1
except
:
return
0
@retry
(
tries
=
3
,
delay
=
1
)
@retry
(
tries
=
3
,
delay
=
1
)
...
@@ -64,7 +67,10 @@ def get_page(url, s, headers):
...
@@ -64,7 +67,10 @@ def get_page(url, s, headers):
if
res
.
status_code
!=
200
:
if
res
.
status_code
!=
200
:
raise
raise
data_page
=
res
.
json
()
data_page
=
res
.
json
()
total_page_
=
data_page
[
'data'
][
'total'
]
try
:
total_page_
=
data_page
[
'data'
][
'total'
]
except
:
raise
return
total_page_
return
total_page_
...
@@ -77,7 +83,7 @@ def doJob():
...
@@ -77,7 +83,7 @@ def doJob():
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cache-Control'
:
'max-age=0'
,
'Cache-Control'
:
'max-age=0'
,
'Connection'
:
'keep-alive'
,
#
'Connection': 'keep-alive',
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'version'
:
'TYC-Web'
'version'
:
'TYC-Web'
}
}
...
@@ -90,7 +96,7 @@ def doJob():
...
@@ -90,7 +96,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# 判断 如果Redis中已经没有数据,则等待
social_code
=
'91
440300MA5EU1QM0T
'
social_code
=
'91
110108780992804C
'
if
social_code
==
None
:
if
social_code
==
None
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
...
@@ -163,6 +169,11 @@ def doJob():
...
@@ -163,6 +169,11 @@ def doJob():
log
.
info
(
f
"{id}---{xydm}----{tycid}----请求失败----重新放入redis"
)
log
.
info
(
f
"{id}---{xydm}----{tycid}----请求失败----重新放入redis"
)
time
.
sleep
(
2
)
time
.
sleep
(
2
)
continue
continue
elif
charge
==
-
2
:
# 该企业没有人员信息
log
.
info
(
f
"{id}---{xydm}----{tycid}----没有核心人员"
)
continue
elif
charge
==
0
:
elif
charge
==
0
:
log
.
info
(
f
"{id}---{xydm}----{tycid}----没有最新公示"
)
log
.
info
(
f
"{id}---{xydm}----{tycid}----没有最新公示"
)
url1
=
f
'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url1
=
f
'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
...
@@ -240,6 +251,8 @@ def doJob():
...
@@ -240,6 +251,8 @@ def doJob():
pass
pass
else
:
else
:
log
.
info
(
f
'{id}---{xydm}----{tycid}----没有高管信息'
)
log
.
info
(
f
'{id}---{xydm}----{tycid}----没有高管信息'
)
# todo: 关闭连接
res
.
close
()
if
flag
==
1
:
if
flag
==
1
:
for
one_info
in
list_all
:
for
one_info
in
list_all
:
name
=
one_info
[
'name'
]
name
=
one_info
[
'name'
]
...
...
comData/Tyc/CorePerson_Update.py
0 → 100644
浏览文件 @
c442ebb4
"""
天眼查人员信息
问题1:页面和接口数据不一致 目前方法 单独处理
问题2:页面人员总数拿的不够准确 目前方法 修改获取父标签逻辑 已解决
"""
import
datetime
import
json
import
requests
,
time
from
bs4
import
BeautifulSoup
import
urllib3
from
retry
import
retry
from
base.BaseCore
import
BaseCore
from
getTycId
import
getTycIdByXYDM
baseCore
=
BaseCore
()
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
log
=
baseCore
.
getLogger
()
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
cnx
=
baseCore
.
cnx_
cursor
=
baseCore
.
cursor_
list_all_1
=
[]
list_all_2
=
[]
taskType
=
'天眼查/核心人员更新'
from
lxml
import
etree
from
classtool
import
Token
,
File
,
Tag
token
=
Token
()
@retry(tries=3, delay=1)
def get_html(tycid, s, headers):
    """Fetch the Tianyancha company page and read the core-staff count.

    Args:
        tycid: Tianyancha company id used to build the page URL.
        s: a ``requests.Session`` carrying the login cookies.
        headers: HTTP headers to send with the request.

    Returns:
        int: the count shown after '最新公示' on the staff tab;
        -1 when the staff section could not be located/parsed, or the tab
        exists but carries no '最新公示' label;
        -2 when the page has no staff section at all (no personnel);
        0 when the staff section exists but the count could not be read.

    Raises:
        Exception: on a non-200 response, so ``@retry`` re-attempts the
        request up to 3 times; callers treat the final failure as -1.
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = s.get(url=url, headers=headers)
    if response.status_code != 200:
        # The original used a bare `raise` outside an except block, which
        # raises a misleading "No active exception to re-raise" RuntimeError.
        # Raise an explicit exception instead; @retry still catches it.
        raise Exception(f'get_html: status {response.status_code} for {url}')
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        div_part = soup.find('div', attrs={'data-dim': 'staff'})
        # div_part.find('div', class_='dimHeader_root__XTCLe')
    except Exception:
        return -1
    if div_part is None:
        # Page rendered but has no staff block -> company has no personnel.
        return -2
    try:
        # Hoist the tab lookup: the original repeated the same find() chain
        # to read the text twice.
        tab_span = div_part.find('div', class_='dim-tab-root').find('span')
        tmp_field = tab_span.text
        if '最新公示' in tmp_field:
            total = tab_span.get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            # Tab present but not the '最新公示' variant.
            return -1
    except Exception:
        # Tab structure missing (AttributeError on None.find) or the count
        # was not numeric -- preserve the original "0 = could not read" code.
        return 0
@retry(tries=3, delay=1)
def get_page(url, s, headers):
    """Request a Tianyancha paging API and return the total record count.

    Args:
        url: full API URL (already contains ``_``/``gid``/``pageSize``/``pageNum``).
        s: a ``requests.Session`` carrying the login cookies.
        headers: HTTP headers to send with the request.

    Returns:
        The value of ``data['total']`` from the JSON response.

    Raises:
        Exception: on a non-200 response or a payload without ``data.total``,
        so ``@retry`` re-attempts up to 3 times; callers catch the final
        failure and fall back to a total of 0.
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip)
    time.sleep(1)  # throttle so the API is not hammered
    if res.status_code != 200:
        # Explicit exception instead of the original bare `raise` (which
        # raises "No active exception to re-raise" outside an except block).
        raise Exception(f'get_page: status {res.status_code} for {url}')
    data_page = res.json()
    try:
        total_page_ = data_page['data']['total']
    except (KeyError, TypeError) as e:
        # Payload shape changed or the request was silently rejected.
        raise Exception('get_page: response payload has no data.total') from e
    return total_page_
def doJob():
    """Endless worker: pull one company per iteration from Redis and refresh
    its core-personnel (executive/staff) records from Tianyancha.

    Per-company flow:
      1. Pop 'social_code|company_name' from Redis; sleep when the queue is empty.
      2. Look the company up via baseCore; insert a stub EnterpriseInfo row
         when it is missing.
      3. Resolve the Tianyancha id (tycid) when the DB has none.
      4. Scrape the company page for the staff count (get_html / '最新公示').
      5. Pick the paging API whose total matches the page count
         (flag 1 = listed, flag 3 = HK listed, flag 2 = generic staff),
         walk its pages and normalise each person into a dict record.
      6. Serialise the records to JSON (the POST to the sync service is
         currently commented out) and mark the cookie token as good.

    Any unexpected exception requeues the item, records the failure and
    breaks out of the loop.
    """
    # for social_code in social_code_list:
    while True:
        # TODO: manage cookie usage
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            # 'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        cookies_list, id_cookie = token.get_cookies()
        cookies = {}
        for cookie in cookies_list:
            cookies[cookie['name']] = cookie['value']
        s = requests.Session()
        s.cookies.update(cookies)
        # Pull the next 'social_code|company_name' item from Redis.
        item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
        # When Redis has no more data, wait and poll again.
        # social_code = '91110108780992804C'
        if item == None:
            time.sleep(20)
            continue
        start = time.time()
        social_code = item.split('|')[0]
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]  # NOTE(review): shadows the builtin id()
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # Company missing from EnterpriseInfo -- fall back to the
                # base enterprise table.
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                if data:
                    pass
                else:
                    # Company absent from the DB entirely -- an insert would
                    # be needed here.
                    # NOTE(review): data is None on this path, so the indexing
                    # below raises and the outer except requeues the item.
                    pass
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0  # NOTE(review): likely a typo for `count`; never read
                # Insert a stub row into EnterpriseInfo.
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s,%s)"
                cursor_.execute(insert, (com_name, xydm))
                cnx_.commit()
                tycid = ''
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name, s)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # Persist the freshly resolved Tianyancha id.
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('UpdateCoreperson:Error', item)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('UpdateCoreperson:Error', item)
                    continue
            # NOTE(review): on the fallback branch above, `data` is a
            # sys_base_enterprise row -- confirm it actually has 18 columns.
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1  # running sort index across all pages
            try:
                charge = get_html(tycid, s, headers)
            # all three page-request retries failed
            except:
                charge = -1
            t = int(time.time() * 1000)
            if charge == -1:
                token.updateTokeen(id_cookie, 2)
                # requeue the item into Redis for a later retry
                baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
                time.sleep(2)
                continue
            elif charge == -2:
                # The company page has no personnel section at all.
                log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
                continue
            elif charge == 0:
                # No '最新公示' tab -> use the generic staff API.
                log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page1 = get_page(url1, s, headers)
                except:
                    total_page1 = 0
                url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                total_page = total_page1
                flag = 2
            else:
                # Page shows a '最新公示' count -> probe the two listed-company
                # APIs and keep whichever total matches the page.
                log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page2 = get_page(url2, s, headers)
                except:
                    total_page2 = 0
                time.sleep(1)
                try:
                    total_page3 = get_page(url3, s, headers)
                except:
                    total_page3 = 0
                if total_page2 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page2
                    flag = 1
                else:
                    if total_page3 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page3
                        flag = 3
                    else:
                        # Neither API agrees with the page count (known issue 1):
                        # park the item on the Map queue for manual handling.
                        total_page = 0
                        flag = 0
                        baseCore.rePutIntoR('UpdateCoreperson:Map', item)
                        log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
                        continue
            if total_page == 0:
                token.updateTokeen(id_cookie, 2)
                # requeue the item into Redis for a later retry
                baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                continue
            # # TODO: page count override for testing
            # total_page = 34
            # flag = 2
            # 20 records per page; walk every page of the chosen API.
            for page in range(1, int((total_page / 20) + 1) + 1):
                res = None
                # Up to 3 attempts per page, each through a fresh proxy.
                for c in range(3):
                    ip = baseCore.get_proxy()
                    url_ = url.format(t, tycid, page)
                    # url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
                    res = requests.get(url_, headers=headers, proxies=ip, verify=False)  # ,verify=False
                    time.sleep(1)
                    if res.status_code == 200:
                        break
                    else:
                        if c == 2:
                            break
                        continue
                # NOTE: requests.Response is falsy for 4xx/5xx status codes,
                # so this branch also catches a final non-200 response, not
                # just res is None.
                if res:
                    pass
                else:
                    token.updateTokeen(id_cookie, 2)
                    # requeue the item into Redis for a later retry
                    baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                    log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                    continue
                # TODO: test logging
                log.info(f'{id}---{xydm}----{tycid}----{res.json()}')
                # The two API families name their payload list differently.
                try:
                    list_all = res.json()['data']['dataList']
                except:
                    list_all = res.json()['data']['result']
                if list_all:
                    pass
                else:
                    log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                # TODO: close the underlying connection
                res.close()
                if flag == 1:
                    # Mainland listed-company executives.
                    for one_info in list_all:
                        name = one_info['name']
                        sex = one_info['sex']
                        education = one_info['education']
                        position = one_info['position']
                        Salary = one_info['salary']
                        # Derive birth year from the current year minus age.
                        now = datetime.datetime.now()
                        year = now.year
                        try:
                            birthYear = year - int(one_info['age'])
                        except:
                            birthYear = ''
                        StockKeepings = one_info['numberOfShares']
                        currentTerm = one_info['term']
                        personInfo = one_info['resume']
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                        # list_all_2.append(dic_json_img)
                elif flag == 3:
                    # HK listed-company executives (different field names).
                    for one_info in list_all:
                        name = one_info['personal_name']
                        try:
                            sex = one_info['gender2']
                        except:
                            sex = ''
                        education = ''
                        position = one_info['position_name']
                        Salary = ''
                        try:
                            birthYear = one_info['year_of_birth']
                        except:
                            birthYear = ''
                        personInfo = one_info['resume_cn']
                        try:
                            # employ_date is a millisecond epoch timestamp.
                            timestamp = int(one_info['employ_date']) / 1000
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                        except:
                            currentTerm = ''
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm + '至-',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                else:
                    # Generic staff API (flag == 2): resume must be scraped
                    # from each person's own page.
                    for one_info in list_all:
                        name = one_info['name']
                        try:
                            position = one_info['typeSore']
                        except:
                            position = ''
                        person_id = one_info['id']
                        person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                        # person_res = requests.get(person_url, headers=headers, proxies=ip)
                        person_res = requests.get(person_url, headers=headers)
                        person_soup = BeautifulSoup(person_res.content, 'html.parser')
                        try:
                            personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                        except:
                            personInfo = ''
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
            # print(list_one_info)
            json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
            else:
                pass
            # response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
            #                          verify=False)
            # print(response.text)
            log.info('=========成功======')
            # Mark the cookie token as healthy and pace the next company.
            token.updateTokeen(id_cookie, 3)
            time.sleep(10)
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # requeue the item into Redis for a later retry
            baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
            break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
    # Script entry point: run the endless update worker.
    doJob()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论