提交 9ed327a7 作者: 薛凌堃

02/06

上级 b1d1cafd
#补充剩余核心人员信息
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
# Shared crawler infrastructure: core helper object, logger, DB handles.
baseCore = BaseCore()
# The sync endpoint is called with verify=False; silence the TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
# Static request headers carrying a logged-in Tianyancha session.
# NOTE(review): the Cookie/auth_token is a captured session and will expire —
# refresh it before long runs, or requests will start failing.
headers = {
    'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=581fac60bfe911eeb3fc09360952f0ba; ssuid=1162354300; _ga=GA1.2.1333101206.1706683384; _gid=GA1.2.604055726.1706683384; tyc-user-phone=%255B%252218837538506%2522%252C%2522152%25203756%25200528%2522%255D; HWWAFSESID=b306585832394f6d3b; HWWAFSESTIME=1706751848880; csrfToken=DUIyVpHXj6o8vOwT9idnR4hd; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706671944,1706751850; bannerFlag=true; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2215822283785%22%2C%22userId%22%3A%22269298908%22%7D; tyc-user-info-save-time=1706751947161; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgyMjI4Mzc4NSIsImlhdCI6MTcwNjc1MTk0NiwiZXhwIjoxNzA5MzQzOTQ2fQ.W-hQ1QBEoDkHYqcSFjTEukemZJpHi-iYzqqnpYR-uaKi6ecS3HNp_dUs8UuzSiYyZH4WQjc-98Z-3hysQGEr_Q; searchSessionId=1706751998.12338612; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22269298908%22%2C%22first_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQ5MzJlZjg1NWEtMGVkMTRiODAyY2YzMDE4LTNlNjA0ODA5LTIwNzM2MDAtMThkNWQ5MzJlZjkyMGEiLCIkaWRlbnRpdHlfbG9naW5faWQiOiIyNjkyOTg5MDgifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22269298908%22%7D%2C%22%24device_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706752204',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
# Two DB connections exposed by BaseCore:
#   cnx_/cursor_ -> writes to EnterpriseInfo
#   cnx/cursor   -> reads from sys_base_enterprise
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Accumulators for optional Excel export (export code is commented out below).
list_all_1 = []
list_all_2 = []
# Task label recorded in the crawl log.
taskType = '天眼查/核心人员'
ip_num = 0
from lxml import etree
@retry(tries=3, delay=1)
def get_html(tycid):
    """Fetch the Tianyancha company page and decide which staff API applies.

    Args:
        tycid: Tianyancha numeric company id.

    Returns:
        int: the "最新公示" (latest disclosure) executive count when the
             主要人员 section advertises one (listed company) — the caller
             matches this count against the listed-company endpoints;
             0 when the section exists but has no 最新公示 tab (use the
             generic staff endpoint);
             -1 when the section cannot be found/parsed.

    Raises:
        ConnectionError: on a non-200 response, so @retry re-attempts
            up to 3 times.  (The original bare ``raise`` had no active
            exception and would itself crash with RuntimeError.)
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code != 200:
        raise ConnectionError(f"status {response.status_code} for {url}")
    # Parse with BeautifulSoup throughout.  The previous version mixed
    # lxml.etree (whose xpath() returns a *list*, so ``.text`` raised) with
    # BeautifulSoup-style ``find(..., class_=...)`` calls on lxml elements,
    # and ended in an empty ``try:``/``except:`` pair — a SyntaxError.
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        container = soup.find('div', class_='index_dim-tab-container__kysLO')
        title = container.find('h3')
        if title is None or title.get_text(strip=True) != '主要人员':
            # Section heading not where we expect it — treat as unparseable.
            return -1
        tab = container.find('div', class_='dim-tab-root').find('span')
        tab_text = tab.get_text()
        if '最新公示' in tab_text:
            # Text looks like "最新公示 12" — take the trailing number.
            total = tab_text.split('最新公示')[1].replace(' ', '')
            return int(total)
        return 0
    except:
        # Any markup change / missing node: signal "page not parseable".
        return -1
@retry(tries=3, delay=1)
def get_page(url):
    """Return the ``data.total`` record count reported by a staff API URL.

    Goes through a rotating proxy from BaseCore.  A non-200 response (or a
    JSON decode/key error) raises, and @retry re-attempts up to 3 times.

    Args:
        url: fully-formed Tianyancha staff API URL (pageNum=1 probe).

    Returns:
        int: total number of personnel records the endpoint reports.

    Raises:
        ConnectionError: on a non-200 response.  (The original bare
            ``raise`` had no active exception to re-raise and would crash
            with RuntimeError instead of a meaningful error.)
    """
    ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers, proxies=ip)
    time.sleep(1)  # throttle to avoid triggering anti-crawl measures
    if res.status_code != 200:
        raise ConnectionError(f"status {res.status_code} for {url}")
    return res.json()['data']['total']
def _build_listed_row(one_info, social_code, sort_no):
    """Map one A-share executive record (noRepeatSeniorExecutive, flag==1)
    onto the /sync/executive payload schema."""
    try:
        # The API reports an age, not a birth year; derive the year from now.
        birthYear = datetime.datetime.now().year - int(one_info['age'])
    except:
        birthYear = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['name'],
        "sex": one_info['sex'],
        "education": one_info['education'],
        "position": one_info['position'],
        "salary": one_info['salary'],
        "birthYear": birthYear,
        "shareNum": one_info['numberOfShares'],
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": one_info['term'],
        "personInfo": one_info['resume'],
        "sort": str(sort_no)
    }


def _build_hk_row(one_info, social_code, sort_no):
    """Map one HK-listed executive record (getHkNoRepeatSeniorExecutive,
    flag==3) onto the /sync/executive payload schema."""
    try:
        sex = one_info['gender2']
    except:
        sex = ''
    try:
        birthYear = one_info['year_of_birth']
    except:
        birthYear = ''
    try:
        # employ_date looks like a millisecond timestamp padded by 10_000;
        # TODO(review): confirm the unit against live API responses.
        timestamp = int(int(one_info['employ_date']) / 10000)
        currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
    except:
        currentTerm = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['personal_name'],
        "sex": sex,
        "education": '',
        "position": one_info['position_name'],
        "salary": '',
        "birthYear": birthYear,
        "shareNum": '',
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": currentTerm + '至-',
        "personInfo": one_info['resume_cn'],
        "sort": str(sort_no)
    }


def _build_basic_row(one_info, social_code, tycid, sort_no):
    """Map one generic staff record (dim/staff, flag==2); fetches the
    person's profile page to scrape the resume text."""
    try:
        position = one_info['typeSore']
    except:
        position = ''
    person_url = f"https://www.tianyancha.com/human/{one_info['id']}-c{tycid}"
    person_res = requests.get(person_url, headers=headers)
    person_soup = BeautifulSoup(person_res.content, 'html.parser')
    try:
        personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
    except:
        personInfo = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['name'],
        "sex": '',
        "education": '',
        "position": position,
        "salary": '',
        "birthYear": '',
        "shareNum": '',
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": '',
        "personInfo": personInfo,
        "sort": str(sort_no)
    }


def doJob():
    """Worker loop: pull social credit codes from Redis one at a time,
    resolve each company's Tianyancha id, scrape its core-personnel list
    and POST the result to the sync service.

    Runs until the queue handling hits an unrecoverable page failure
    (charge == -1 path breaks the loop); failed codes are re-queued under
    CorPersonEnterprise{Error,None,Map}:gnqy_socialCode keys.
    """
    while True:
        # Pull the next social credit code from Redis; fixed: the previous
        # version hard-coded one code (debug leftover), which made this
        # endless loop re-process the same company forever.
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        if social_code is None:
            # Queue drained — wait before polling again.
            time.sleep(20)
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # Company not in EnterpriseInfo yet: look it up in the base
                # table and seed a new EnterpriseInfo row.
                # Parameterized query (was an injectable f-string).
                sql = "SELECT * FROM sys_base_enterprise WHERE social_credit_code = %s"
                cursor.execute(sql, (social_code,))
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                count = 0  # fixed: was misspelled `conut`, leaving count unbound
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
                cursor_.execute(insert, (com_name, xydm))
                cnx_.commit()
                tycid = ''
            if tycid is None or tycid == '':
                # Resolve the Tianyancha id by company name and persist it.
                try:
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
                        cursor_.execute(updateSql, (tycid, xydm))
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            # NOTE: the old code re-read `count = data[17]` here, which could
            # IndexError when `data` was the sys_base_enterprise row; `count`
            # is already set correctly on both branches above.
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            # Probe the company page to decide which API family applies.
            try:
                charge = get_html(tycid)
            except Exception as e:
                charge = -1
                log.info(e)
                baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                log.info(f'{id}---{xydm}------没有高管信息')
            time.sleep(2)
            t = int(time.time() * 1000)
            if charge == -1:
                # Page unusable: re-queue and stop the worker.
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                log.info(f"{id}---{xydm}----{tycid}----请求失败")
                break
            elif charge == 0:
                # Unlisted company: generic dim/staff endpoint.
                log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                probe = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page = get_page(probe)
                except:
                    total_page = 0
                url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                flag = 2
            else:
                # Listed company: the page advertised `charge` executives —
                # pick whichever listed endpoint reports the same total.
                log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                probe_a = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                probe_hk = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page2 = get_page(probe_a)
                except:
                    total_page2 = 0
                time.sleep(1)
                try:
                    total_page3 = get_page(probe_hk)
                except:
                    total_page3 = 0
                if total_page2 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page2
                    flag = 1
                elif total_page3 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page3
                    flag = 3
                else:
                    # Neither endpoint matches the page count — park for review.
                    baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
                    continue
            if total_page == 0:
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                continue
            # Walk all pages (20 records per page).
            for page in range(1, int((total_page / 20) + 1) + 1):
                res = ''
                for attempt in range(3):
                    ip = baseCore.get_proxy()
                    res = requests.get(url.format(t, tycid, page), headers=headers, proxies=ip)
                    time.sleep(1)
                    if res.status_code == 200:
                        break
                    if attempt == 2:
                        res = ''
                if not res:
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                    continue
                # The two API families name the record list differently.
                try:
                    list_all = res.json()['data']['dataList']
                except:
                    list_all = res.json()['data']['result']
                if not list_all:
                    log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    continue
                if flag == 1:
                    for one_info in list_all:
                        list_one_info.append(_build_listed_row(one_info, social_code, num))
                        num += 1
                elif flag == 3:
                    for one_info in list_all:
                        list_one_info.append(_build_hk_row(one_info, social_code, num))
                        num += 1
                else:
                    for one_info in list_all:
                        list_one_info.append(_build_basic_row(one_info, social_code, tycid, num))
                        num += 1
            json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
            # Internal sync service with a self-signed cert, hence verify=False.
            response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
                                     verify=False)
            print(response.text)
            log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
# Script entry point: run the worker loop until it breaks or is interrupted.
if __name__ == "__main__":
    doJob()
\ No newline at end of file
...@@ -221,7 +221,7 @@ def spiderinfo(company_url, receptname, file_name): ...@@ -221,7 +221,7 @@ def spiderinfo(company_url, receptname, file_name):
if matched: if matched:
sourceUpdateTime = sourceUpdateTime_ sourceUpdateTime = sourceUpdateTime_
else: else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d") sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except: except:
redaytowork(com_name, social_code, file_name) redaytowork(com_name, social_code, file_name)
aa_dict = { aa_dict = {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论