提交 9ed327a7 作者: 薛凌堃

02/06

上级 b1d1cafd
#补充剩余核心人员信息
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
# Shared crawler infrastructure: core helper object, logger, DB handles.
baseCore = BaseCore()
# The sync endpoint is called with verify=False; silence the TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
# Static request headers carrying a logged-in Tianyancha session.
# NOTE(review): the Cookie/auth_token is a captured session and will expire —
# refresh it before long runs, or requests will start failing.
headers = {
    'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=581fac60bfe911eeb3fc09360952f0ba; ssuid=1162354300; _ga=GA1.2.1333101206.1706683384; _gid=GA1.2.604055726.1706683384; tyc-user-phone=%255B%252218837538506%2522%252C%2522152%25203756%25200528%2522%255D; HWWAFSESID=b306585832394f6d3b; HWWAFSESTIME=1706751848880; csrfToken=DUIyVpHXj6o8vOwT9idnR4hd; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706671944,1706751850; bannerFlag=true; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2215822283785%22%2C%22userId%22%3A%22269298908%22%7D; tyc-user-info-save-time=1706751947161; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTgyMjI4Mzc4NSIsImlhdCI6MTcwNjc1MTk0NiwiZXhwIjoxNzA5MzQzOTQ2fQ.W-hQ1QBEoDkHYqcSFjTEukemZJpHi-iYzqqnpYR-uaKi6ecS3HNp_dUs8UuzSiYyZH4WQjc-98Z-3hysQGEr_Q; searchSessionId=1706751998.12338612; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22269298908%22%2C%22first_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQ5MzJlZjg1NWEtMGVkMTRiODAyY2YzMDE4LTNlNjA0ODA5LTIwNzM2MDAtMThkNWQ5MzJlZjkyMGEiLCIkaWRlbnRpdHlfbG9naW5faWQiOiIyNjkyOTg5MDgifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22269298908%22%7D%2C%22%24device_id%22%3A%2218d5d932ef855a-0ed14b802cf3018-3e604809-2073600-18d5d932ef920a%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706752204',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
# Two DB connections exposed by BaseCore:
#   cnx_/cursor_ -> writes to EnterpriseInfo
#   cnx/cursor   -> reads from sys_base_enterprise
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Accumulators for optional Excel export (export code is commented out below).
list_all_1 = []
list_all_2 = []
# Task label recorded in the crawl log.
taskType = '天眼查/核心人员'
ip_num = 0
from lxml import etree
@retry(tries=3, delay=1)
def get_html(tycid):
    """Fetch the Tianyancha company page and decide which staff API applies.

    Args:
        tycid: Tianyancha numeric company id.

    Returns:
        int: the "最新公示" (latest disclosure) executive count when the
             主要人员 section advertises one (listed company) — the caller
             matches this count against the listed-company endpoints;
             0 when the section exists but has no 最新公示 tab (use the
             generic staff endpoint);
             -1 when the section cannot be found/parsed.

    Raises:
        ConnectionError: on a non-200 response, so @retry re-attempts
            up to 3 times.  (The original bare ``raise`` had no active
            exception and would itself crash with RuntimeError.)
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code != 200:
        raise ConnectionError(f"status {response.status_code} for {url}")
    # Parse with BeautifulSoup throughout.  The previous version mixed
    # lxml.etree (whose xpath() returns a *list*, so ``.text`` raised) with
    # BeautifulSoup-style ``find(..., class_=...)`` calls on lxml elements,
    # and ended in an empty ``try:``/``except:`` pair — a SyntaxError.
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        container = soup.find('div', class_='index_dim-tab-container__kysLO')
        title = container.find('h3')
        if title is None or title.get_text(strip=True) != '主要人员':
            # Section heading not where we expect it — treat as unparseable.
            return -1
        tab = container.find('div', class_='dim-tab-root').find('span')
        tab_text = tab.get_text()
        if '最新公示' in tab_text:
            # Text looks like "最新公示 12" — take the trailing number.
            total = tab_text.split('最新公示')[1].replace(' ', '')
            return int(total)
        return 0
    except:
        # Any markup change / missing node: signal "page not parseable".
        return -1
@retry(tries=3, delay=1)
def get_page(url):
    """Return the ``data.total`` record count reported by a staff API URL.

    Goes through a rotating proxy from BaseCore.  A non-200 response (or a
    JSON decode/key error) raises, and @retry re-attempts up to 3 times.

    Args:
        url: fully-formed Tianyancha staff API URL (pageNum=1 probe).

    Returns:
        int: total number of personnel records the endpoint reports.

    Raises:
        ConnectionError: on a non-200 response.  (The original bare
            ``raise`` had no active exception to re-raise and would crash
            with RuntimeError instead of a meaningful error.)
    """
    ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers, proxies=ip)
    time.sleep(1)  # throttle to avoid triggering anti-crawl measures
    if res.status_code != 200:
        raise ConnectionError(f"status {res.status_code} for {url}")
    return res.json()['data']['total']
def _build_listed_row(one_info, social_code, sort_no):
    """Map one A-share executive record (noRepeatSeniorExecutive, flag==1)
    onto the /sync/executive payload schema."""
    try:
        # The API reports an age, not a birth year; derive the year from now.
        birthYear = datetime.datetime.now().year - int(one_info['age'])
    except:
        birthYear = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['name'],
        "sex": one_info['sex'],
        "education": one_info['education'],
        "position": one_info['position'],
        "salary": one_info['salary'],
        "birthYear": birthYear,
        "shareNum": one_info['numberOfShares'],
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": one_info['term'],
        "personInfo": one_info['resume'],
        "sort": str(sort_no)
    }


def _build_hk_row(one_info, social_code, sort_no):
    """Map one HK-listed executive record (getHkNoRepeatSeniorExecutive,
    flag==3) onto the /sync/executive payload schema."""
    try:
        sex = one_info['gender2']
    except:
        sex = ''
    try:
        birthYear = one_info['year_of_birth']
    except:
        birthYear = ''
    try:
        # employ_date looks like a millisecond timestamp padded by 10_000;
        # TODO(review): confirm the unit against live API responses.
        timestamp = int(int(one_info['employ_date']) / 10000)
        currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
    except:
        currentTerm = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['personal_name'],
        "sex": sex,
        "education": '',
        "position": one_info['position_name'],
        "salary": '',
        "birthYear": birthYear,
        "shareNum": '',
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": currentTerm + '至-',
        "personInfo": one_info['resume_cn'],
        "sort": str(sort_no)
    }


def _build_basic_row(one_info, social_code, tycid, sort_no):
    """Map one generic staff record (dim/staff, flag==2); fetches the
    person's profile page to scrape the resume text."""
    try:
        position = one_info['typeSore']
    except:
        position = ''
    person_url = f"https://www.tianyancha.com/human/{one_info['id']}-c{tycid}"
    person_res = requests.get(person_url, headers=headers)
    person_soup = BeautifulSoup(person_res.content, 'html.parser')
    try:
        personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
    except:
        personInfo = ''
    return {
        "socialCreditCode": social_code,
        "name": one_info['name'],
        "sex": '',
        "education": '',
        "position": position,
        "salary": '',
        "birthYear": '',
        "shareNum": '',
        "shareRatio": '',
        "benefitShare": '',
        "currentTerm": '',
        "personInfo": personInfo,
        "sort": str(sort_no)
    }


def doJob():
    """Worker loop: pull social credit codes from Redis one at a time,
    resolve each company's Tianyancha id, scrape its core-personnel list
    and POST the result to the sync service.

    Runs until the queue handling hits an unrecoverable page failure
    (charge == -1 path breaks the loop); failed codes are re-queued under
    CorPersonEnterprise{Error,None,Map}:gnqy_socialCode keys.
    """
    while True:
        # Pull the next social credit code from Redis; fixed: the previous
        # version hard-coded one code (debug leftover), which made this
        # endless loop re-process the same company forever.
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        if social_code is None:
            # Queue drained — wait before polling again.
            time.sleep(20)
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # Company not in EnterpriseInfo yet: look it up in the base
                # table and seed a new EnterpriseInfo row.
                # Parameterized query (was an injectable f-string).
                sql = "SELECT * FROM sys_base_enterprise WHERE social_credit_code = %s"
                cursor.execute(sql, (social_code,))
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                count = 0  # fixed: was misspelled `conut`, leaving count unbound
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
                cursor_.execute(insert, (com_name, xydm))
                cnx_.commit()
                tycid = ''
            if tycid is None or tycid == '':
                # Resolve the Tianyancha id by company name and persist it.
                try:
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
                        cursor_.execute(updateSql, (tycid, xydm))
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            # NOTE: the old code re-read `count = data[17]` here, which could
            # IndexError when `data` was the sys_base_enterprise row; `count`
            # is already set correctly on both branches above.
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            # Probe the company page to decide which API family applies.
            try:
                charge = get_html(tycid)
            except Exception as e:
                charge = -1
                log.info(e)
                baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                log.info(f'{id}---{xydm}------没有高管信息')
            time.sleep(2)
            t = int(time.time() * 1000)
            if charge == -1:
                # Page unusable: re-queue and stop the worker.
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                log.info(f"{id}---{xydm}----{tycid}----请求失败")
                break
            elif charge == 0:
                # Unlisted company: generic dim/staff endpoint.
                log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                probe = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page = get_page(probe)
                except:
                    total_page = 0
                url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                flag = 2
            else:
                # Listed company: the page advertised `charge` executives —
                # pick whichever listed endpoint reports the same total.
                log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                probe_a = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                probe_hk = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                try:
                    total_page2 = get_page(probe_a)
                except:
                    total_page2 = 0
                time.sleep(1)
                try:
                    total_page3 = get_page(probe_hk)
                except:
                    total_page3 = 0
                if total_page2 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page2
                    flag = 1
                elif total_page3 == charge:
                    url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page3
                    flag = 3
                else:
                    # Neither endpoint matches the page count — park for review.
                    baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
                    continue
            if total_page == 0:
                baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                continue
            # Walk all pages (20 records per page).
            for page in range(1, int((total_page / 20) + 1) + 1):
                res = ''
                for attempt in range(3):
                    ip = baseCore.get_proxy()
                    res = requests.get(url.format(t, tycid, page), headers=headers, proxies=ip)
                    time.sleep(1)
                    if res.status_code == 200:
                        break
                    if attempt == 2:
                        res = ''
                if not res:
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                    continue
                # The two API families name the record list differently.
                try:
                    list_all = res.json()['data']['dataList']
                except:
                    list_all = res.json()['data']['result']
                if not list_all:
                    log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    continue
                if flag == 1:
                    for one_info in list_all:
                        list_one_info.append(_build_listed_row(one_info, social_code, num))
                        num += 1
                elif flag == 3:
                    for one_info in list_all:
                        list_one_info.append(_build_hk_row(one_info, social_code, num))
                        num += 1
                else:
                    for one_info in list_all:
                        list_one_info.append(_build_basic_row(one_info, social_code, tycid, num))
                        num += 1
            json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
            # Internal sync service with a self-signed cert, hence verify=False.
            response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
                                     verify=False)
            print(response.text)
            log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
# Script entry point: run the worker loop until it breaks or is interrupted.
if __name__ == "__main__":
    doJob()
\ No newline at end of file
...@@ -221,7 +221,7 @@ def spiderinfo(company_url, receptname, file_name): ...@@ -221,7 +221,7 @@ def spiderinfo(company_url, receptname, file_name):
if matched: if matched:
sourceUpdateTime = sourceUpdateTime_ sourceUpdateTime = sourceUpdateTime_
else: else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d") sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except: except:
redaytowork(com_name, social_code, file_name) redaytowork(com_name, social_code, file_name)
aa_dict = { aa_dict = {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论