Commit 785f3d85 Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# Supplement the remaining core personnel information
# First collect the Tianyancha id, then use that id to collect the core personnel information
"""
Tianyancha personnel information
Problem 1: page and API data are inconsistent -- current approach: handle such companies separately
Problem 2: the personnel total taken from the page is not accurate enough -- current approach: change the parent-tag lookup logic
"""
import datetime
import json
import os
import subprocess
import sys
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
......@@ -17,10 +16,6 @@ from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
......@@ -30,71 +25,72 @@ cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0
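# Legacy proxy helper: reads "host-port" strings from the clb_proxy table and turns
# them into requests-style proxy dicts; the reworked code below relies on
# baseCore.get_proxy() instead.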
def get_proxy(ip_num):
sql = "select proxy from clb_proxy"
cursor_.execute(sql)
proxy_lists = cursor_.fetchall()
cnx_.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[ip_num]
from lxml import etree
from classtool import Token, File, Tag
token = Token()
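# get_html: fetch the company page and read the total shown after "最新公示" in the
# staff section (div with data-dim="staff"); roughly, -1 signals a missing or
# unexpected staff block and 0 a parse failure, with @retry giving three attempts.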
@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
def get_html(tycid, s, headers):
url = f"https://www.tianyancha.com/company/{tycid}"
ip = get_proxy(ip_num)
response = requests.get(url=url, headers=headers, proxies=ip)
# ip = baseCore.get_proxy()
response = s.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
ip_num += 1
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return 0
return -1
except:
return 0
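# get_page: call one of the capi staff endpoints through the shared session and
# return data.total, i.e. how many people that endpoint reports; also retried 3 times.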
@retry(tries=3, delay=1)
def get_page(url, ip_num):
ip = get_proxy(ip_num)
res = requests.get(url=url, headers=headers, proxies=ip)
if res.status_code == 200:
pass
else:
ip_num += 1
raise
def get_page(url, s, headers):
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
time.sleep(1)
total_page_ = res.json()['data']['total']
if res.status_code != 200:
raise
data_page = res.json()
total_page_ = data_page['data']['total']
return total_page_
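# doJob: main loop -- rotate a cookie from the token pool into a requests.Session,
# pull one enterprise from Redis, make sure it has a Tianyancha id, then pick the
# staff endpoint whose total matches the page and collect every executive.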
def doJob():
# for social_code in social_code_list:
while True:
# todo: set up how cookies are used
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
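# Cookies come from the token pool (classtool.Token); loading them into one Session
# keeps the login state for every page and capi request in this iteration.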
# Use the social credit code pulled from Redis to fetch the corresponding basic info from the database
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# If there is no more data in Redis, wait
social_code = '91320691550279691N'
social_code = '911101067916069050'
if social_code == None:
time.sleep(20)
continue
......@@ -108,26 +104,28 @@ def doJob():
tycid = data[11]
count = data[17]
else:
#Re-insert the data into redis
# Re-insert the data into redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
#The enterprise is not in the database yet and needs to be added
pass
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# Write it into the database
insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s, %s, %s)"
cursor_.execute(insert, (com_name, xydm, social_code))
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name, xydm))
cnx_.commit()
tycid = ''
# baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
# continue
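# No Tianyancha id on record yet -- try to resolve it from the company name.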
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name)
retData = getTycIdByXYDM(com_name, s)
if retData['state']:
tycid = retData['tycData']['id']
# # todo: write to the database
......@@ -147,35 +145,29 @@ def doJob():
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
#todo: first determine which endpoint to use
try:
charge = get_html(tycid, ip_num)
except Exception as e:
charge = get_html(tycid, s, headers)
# The page request failed on all three attempts
except:
charge = -1
log.info(e)
total_page = 0
t = int(time.time() * 1000)
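# charge comes from get_html(): -1 is treated below as a failed page probe, 0 as
# "no 最新公示 tab", and a positive value as the personnel total shown on the page.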
if charge == -1:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
log.info(f"{id}---{xydm}----{tycid}----请求失败")
# Get the pid of the current process
current_pid = baseCore.getPID()
# todo: start a new process and kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, ip_num)
total_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
......@@ -186,12 +178,12 @@ def doJob():
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, ip_num)
total_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
time.sleep(2)
time.sleep(1)
try:
total_page3 = get_page(url3, ip_num)
total_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
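# Two listed-company endpoints are probed (noRepeatSeniorExecutive and its Hk variant);
# the one whose total matches the page count (charge) is the one that gets paged through.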
if total_page2 == charge:
......@@ -206,33 +198,38 @@ def doJob():
else:
total_page = 0
flag = 0
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
#todo: get the number of pages
time.sleep(2)
for page in range(1, int((total_page/20) + 1)+1):
# # todo: get the number of pages
# total_page = 34
# flag = 2
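# Each API page returns at most pageSize=20 entries, hence total/20 (+1) pages.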
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
res = requests.get(url_, headers=headers, proxies=ip) # ,verify=False
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = requests.get(url_, headers=headers, proxies=ip, verify=False)
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
res = ''
break
continue
if res:
pass
else:
token.updateTokeen(id_cookie, 2)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
try:
......@@ -250,7 +247,7 @@ def doJob():
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
#todo: get the current year
# todo: get the current year
now = datetime.datetime.now()
year = now.year
try:
......@@ -266,37 +263,37 @@ def doJob():
except:
person_img = '--'
dic_json = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"头像":person_img,
"sort":str(num)
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num+1
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
elif flag == 3:
......@@ -314,8 +311,11 @@ def doJob():
except:
birthYear = ''
personInfo = one_info['resume_cn']
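# employ_date appears to be a millisecond timestamp; convert it to YYYY-MM-DD for the
# start of the current term (empty string if it is missing or malformed).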
timestamp = int(int(one_info['employ_date'])/10000)
try:
timestamp = int(one_info['employ_date']) / 1000
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
except:
currentTerm = ''
dic_json = {
"socialCreditCode": social_code,
"name": name,
......@@ -327,7 +327,7 @@ def doJob():
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm+'至-',
"currentTerm": currentTerm + '至-',
"personInfo": personInfo,
"sort": str(num)
}
......@@ -393,19 +393,23 @@ def doJob():
continue
else:
pass
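# Push the assembled executive list to the internal sync service; on success
# token.updateTokeen(id_cookie, 3) is called, presumably marking the cookie as still valid.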
response = requests.post('http://114.115.236.206:8088/sync/executive',data=json_updata,timeout=300, verify=False)
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
verify=False)
print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# Re-insert into redis
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
# break
break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
......
......@@ -21,45 +21,29 @@ headers = {
'Connection': 'keep-alive',
'Content-Length': '32',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# headers = {
# 'X-TYCID':'30c1289042f511ee9182cd1e1bcaa517',
# # 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5MjkzMzIxMiwiZXhwIjoxNjk1NTI1MjEyfQ.BKxDem8fpgeDHrIgm3qCoF76ueHtQSG1DggiTl4FAaoNKt4gem6NTX1XYndPXqVj9TXfl-8yp2kKE3jY66dyig',
# 'version':'TYC-Web',
# 'Content-Type':'application/json;charset=UTF-8'
# }
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '天眼查企业id/天眼查'
# Get the Tianyancha id, company name and other info based on the credit code
def getTycIdByXYDM(com_name):
def getTycIdByXYDM(com_name, s):
retData={'state':False,'tycData':None,'reput':True}
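# retData: 'state' flags whether a match was found, 'tycData' carries the matched
# company record, and 'reput' presumably tells the caller whether to re-queue the code.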
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip = baseCore.get_proxy()
paramJsonData = {'keyword':com_name}
paramJsonData = {'keyword': com_name}
try:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
# response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
response = requests.post(url,json=paramJsonData,headers=headers,verify=False)
response = s.post(url, json=paramJsonData, headers=headers)
time.sleep(random.randint(3, 5))
retJsonData = json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state']== 'ok':
if retJsonData['data'] and retJsonData['state'] == 'ok':
pass
else:
log.error(f"---{com_name}-未查询到该企业---")
......