提交 c442ebb4 作者: 薛凌堃

天眼查核心人员更新

上级 0fe52c2b
...@@ -45,15 +45,18 @@ def get_html(tycid, s, headers): ...@@ -45,15 +45,18 @@ def get_html(tycid, s, headers):
# div_part.find('div', class_='dimHeader_root__XTCLe') # div_part.find('div', class_='dimHeader_root__XTCLe')
except: except:
return -1 return -1
try: if div_part is None:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text return -2
if '最新公示' in tmp_field: else:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '') try:
return int(total) tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
else: if '最新公示' in tmp_field:
return -1 total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
except: return int(total)
return 0 else:
return -1
except:
return 0
@retry(tries=3, delay=1) @retry(tries=3, delay=1)
...@@ -64,7 +67,10 @@ def get_page(url, s, headers): ...@@ -64,7 +67,10 @@ def get_page(url, s, headers):
if res.status_code != 200: if res.status_code != 200:
raise raise
data_page = res.json() data_page = res.json()
total_page_ = data_page['data']['total'] try:
total_page_ = data_page['data']['total']
except:
raise
return total_page_ return total_page_
...@@ -77,7 +83,7 @@ def doJob(): ...@@ -77,7 +83,7 @@ def doJob():
'Accept-Encoding': 'gzip, deflate, br', 'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0', 'Cache-Control': 'max-age=0',
'Connection': 'keep-alive', # 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web' 'version': 'TYC-Web'
} }
...@@ -90,7 +96,7 @@ def doJob(): ...@@ -90,7 +96,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
social_code = '91440300MA5EU1QM0T' social_code = '91110108780992804C'
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
...@@ -163,6 +169,11 @@ def doJob(): ...@@ -163,6 +169,11 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis") log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2) time.sleep(2)
continue continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0: elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示") log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1' url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
...@@ -240,6 +251,8 @@ def doJob(): ...@@ -240,6 +251,8 @@ def doJob():
pass pass
else: else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息') log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
if flag == 1: if flag == 1:
for one_info in list_all: for one_info in list_all:
name = one_info['name'] name = one_info['name']
......
"""
天眼查人员信息
问题1:页面和接口数据不一致 目前方法 单独处理
问题2:页面人员总数拿的不够准确 目前方法 修改获取父标签逻辑 已解决
"""
import datetime
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员更新'
from lxml import etree
from classtool import Token, File, Tag
token = Token()
@retry(tries=3, delay=1)
def get_html(tycid, s, headers):
    """Fetch the Tianyancha company page and probe its core-staff section.

    Args:
        tycid: Tianyancha company id (gid).
        s: requests.Session carrying the auth cookies.
        headers: HTTP headers to send.

    Returns:
        int >= 1: staff total parsed from the '最新公示' tab,
        -1: staff section exists but has no '最新公示' tab,
        -2: page has no staff section at all,
        0:  tab markup could not be parsed (count unknown).

    Raises:
        Exception: on a non-200 response, so @retry re-attempts up to 3 times;
        the caller maps a final failure to charge == -1.
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = s.get(url=url, headers=headers)
    if response.status_code != 200:
        # A bare `raise` outside an except block only produced an opaque
        # RuntimeError; raise an explicit exception so @retry/logs have context.
        raise Exception(f"tianyancha company page request failed: {response.status_code}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # soup.find does not raise; None simply means the section is absent.
    div_part = soup.find('div', attrs={'data-dim': 'staff'})
    if div_part is None:
        return -2
    try:
        # Look the tab <span> up once instead of querying the DOM twice.
        tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = tmp_field.split('最新公示')[1].replace(' ', '')
            return int(total)
        return -1
    except (AttributeError, ValueError, IndexError):
        # Markup changed or the count was not numeric — caller treats 0 as
        # "unknown, fall back to the generic staff endpoint".
        return 0
@retry(tries=3, delay=1)
def get_page(url, s, headers):
    """Call a Tianyancha list API through a proxy and return data['total'].

    Args:
        url: fully-formatted API url (pageNum=1 probe).
        s: requests.Session carrying the auth cookies.
        headers: HTTP headers to send.

    Returns:
        The 'total' field of the JSON payload.

    Raises:
        Exception: on non-200 status, invalid JSON, or an unexpected payload
        shape; @retry re-attempts up to 3 times and the caller catches the
        final failure (treating it as total == 0).
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip)
    time.sleep(1)  # throttle to avoid tripping anti-crawler limits
    if res.status_code != 200:
        # Explicit exception instead of a bare `raise` (which is invalid
        # outside an except block and produced an opaque RuntimeError).
        raise Exception(f"total request failed: {res.status_code}")
    data_page = res.json()
    try:
        return data_page['data']['total']
    except (KeyError, TypeError) as e:
        # Payload shape changed (e.g. token rejected) — surface it to @retry
        # instead of the original no-op `except: raise` wrapper.
        raise Exception(f"unexpected response payload: {e}") from e
def doJob():
    """Worker loop: pull one "social_code|company_name" item at a time from
    Redis and re-crawl that company's core personnel from Tianyancha.

    Per item:
      1. refresh cookies/token, look the company up in the local DB,
         resolving and persisting its Tianyancha id (TYCID) if missing;
      2. probe the company page via get_html() to pick the staff API variant:
         flag 1 = listed-company endpoint, flag 3 = HK-listed endpoint,
         flag 2 = generic staff endpoint, flag 0 = page/API mismatch;
      3. page through the chosen endpoint, building one dict per person in
         list_one_info (the POST to the sync service is commented out).

    Failures push the item back onto a Redis queue and record a task log.
    """
    # for social_code in social_code_list:
    while True:
        # TODO: manage cookie usage
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            # 'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        cookies_list, id_cookie = token.get_cookies()
        cookies = {}
        for cookie in cookies_list:
            cookies[cookie['name']] = cookie['value']
        s = requests.Session()
        s.cookies.update(cookies)
        # Pull the next item from Redis; its social credit code keys the
        # base-info lookup in the database.
        item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
        # If Redis has no more data, wait and poll again.
        # social_code = '91110108780992804C'
        if item == None:
            time.sleep(20)
            continue
        start = time.time()
        social_code = item.split('|')[0]
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # Not in EnterpriseInfo yet — fall back to the base table.
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                if data:
                    pass
                else:
                    # Company missing from the base table too; would need inserting.
                    # NOTE(review): this branch falls through with data=None and
                    # crashes on data[0] below — confirm it cannot occur in practice.
                    pass
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0  # NOTE(review): typo for `count`? this value is never read.
                # Register the company in EnterpriseInfo.
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
                cursor_.execute(insert, (com_name, xydm))
                cnx_.commit()
                tycid = ''
            if tycid == None or tycid == '':
                # Resolve the Tianyancha id from the company name.
                try:
                    retData = getTycIdByXYDM(com_name, s)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # Persist the resolved id back to the database.
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('UpdateCoreperson:Error', item)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('UpdateCoreperson:Error', item)
                    continue
            # NOTE(review): when data came from sys_base_enterprise this index
            # may not exist (row layout differs) — confirm.
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1  # running sort order across all pages/people
            try:
                charge = get_html(tycid, s, headers)
            # all three page-request attempts failed
            except:
                charge = -1
            t = int(time.time() * 1000)
            if charge == -1:
                token.updateTokeen(id_cookie, 2)
                # Push the item back onto the queue for a later retry.
                baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
                time.sleep(2)
                continue
            elif charge == -2:
                # The company page has no personnel section at all.
                log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
                continue
            elif charge == 0:
                # No '最新公示' tab: use the generic staff endpoint (flag 2).
                log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page1 = get_page(url1, s, headers)
                except:
                    total_page1 = 0
                url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                total_page = total_page1
                flag = 2
            else:
                # Page shows a '最新公示' total: pick whichever listed-company
                # endpoint (mainland vs HK) agrees with that total.
                log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page2 = get_page(url2, s, headers)
                except:
                    total_page2 = 0
                time.sleep(1)
                try:
                    total_page3 = get_page(url3, s, headers)
                except:
                    total_page3 = 0
                if total_page2 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page2
                    flag = 1
                else:
                    if total_page3 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page3
                        flag = 3
                    else:
                        # Page total matches neither API — park the item on a
                        # separate queue for manual handling.
                        total_page = 0
                        flag = 0
                        baseCore.rePutIntoR('UpdateCoreperson:Map', item)
                        log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
                        continue
            if total_page == 0:
                token.updateTokeen(id_cookie, 2)
                # Push the item back onto the queue for a later retry.
                baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                continue
            # # TODO: page-count override used during testing
            # total_page = 34
            # flag = 2
            for page in range(1, int((total_page / 20) + 1) + 1):
                res = None
                # Up to 3 attempts per page, each with a fresh proxy.
                for c in range(3):
                    ip = baseCore.get_proxy()
                    url_ = url.format(t, tycid, page)
                    # url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
                    res = requests.get(url_, headers=headers, proxies=ip, verify=False)  # ,verify=False
                    time.sleep(1)
                    if res.status_code == 200:
                        break
                    else:
                        if c == 2:
                            break
                        continue
                # NOTE(review): truthiness of a Response reflects status < 400,
                # so this also rejects a final non-200 response — confirm intended.
                if res:
                    pass
                else:
                    token.updateTokeen(id_cookie, 2)
                    # Push the item back onto the queue for a later retry.
                    baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
                    log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                    continue
                # TODO: test logging
                log.info(f'{id}---{xydm}----{tycid}----{res.json()}')
                # The payload key differs between the endpoint variants.
                try:
                    list_all = res.json()['data']['dataList']
                except:
                    list_all = res.json()['data']['result']
                if list_all:
                    pass
                else:
                    log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                # Close the underlying connection.
                res.close()
                if flag == 1:
                    # Mainland listed-company payload.
                    for one_info in list_all:
                        name = one_info['name']
                        sex = one_info['sex']
                        education = one_info['education']
                        position = one_info['position']
                        Salary = one_info['salary']
                        # Derive birth year from current year minus reported age.
                        now = datetime.datetime.now()
                        year = now.year
                        try:
                            birthYear = year - int(one_info['age'])
                        except:
                            birthYear = ''
                        StockKeepings = one_info['numberOfShares']
                        currentTerm = one_info['term']
                        personInfo = one_info['resume']
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": StockKeepings,
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm,
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                        # list_all_2.append(dic_json_img)
                elif flag == 3:
                    # HK listed-company payload (different field names).
                    for one_info in list_all:
                        name = one_info['personal_name']
                        try:
                            sex = one_info['gender2']
                        except:
                            sex = ''
                        education = ''
                        position = one_info['position_name']
                        Salary = ''
                        try:
                            birthYear = one_info['year_of_birth']
                        except:
                            birthYear = ''
                        personInfo = one_info['resume_cn']
                        try:
                            # employ_date is epoch milliseconds.
                            timestamp = int(one_info['employ_date']) / 1000
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                        except:
                            currentTerm = ''
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": sex,
                            "education": education,
                            "position": position,
                            "salary": Salary,
                            "birthYear": birthYear,
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": currentTerm + '至-',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
                else:
                    # Generic staff payload (flag 2): sparse fields, so scrape
                    # each person's detail page for the resume text.
                    for one_info in list_all:
                        name = one_info['name']
                        try:
                            position = one_info['typeSore']
                        except:
                            position = ''
                        person_id = one_info['id']
                        person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                        # person_res = requests.get(person_url, headers=headers, proxies=ip)
                        person_res = requests.get(person_url, headers=headers)
                        person_soup = BeautifulSoup(person_res.content, 'html.parser')
                        try:
                            personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                        except:
                            personInfo = ''
                        try:
                            person_img = one_info['logo']
                        except:
                            person_img = '--'
                        dic_json = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "sort": str(num)
                        }
                        dic_json_img = {
                            "socialCreditCode": social_code,
                            "name": name,
                            "sex": '',
                            "education": '',
                            "position": position,
                            "salary": '',
                            "birthYear": '',
                            "shareNum": '',
                            "shareRatio": '',
                            "benefitShare": '',
                            "currentTerm": '',
                            "personInfo": personInfo,
                            "头像": person_img,
                            "sort": str(num)
                        }
                        num = num + 1
                        list_one_info.append(dic_json)
            # print(list_one_info)
            json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
            else:
                pass
            # response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
            #                          verify=False)
            # print(response.text)
            log.info('=========成功======')
            token.updateTokeen(id_cookie, 3)
            time.sleep(10)
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # Push the item back onto the queue for a later retry.
            baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
            # NOTE(review): this break exits the whole worker loop on the first
            # failure even though the item was re-queued — confirm it is not a
            # leftover from debugging.
            break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
# Script entry point: run the update worker loop.
if __name__ == "__main__":
    doJob()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论