提交 47c1de7d 作者: XveLingKun

核心人员更新采集

上级 a37579f1
......@@ -29,17 +29,15 @@ from lxml import etree
from classtool import Token, File, Tag
token = Token()
@retry(tries=3, delay=1)
def get_html(tycid, s, headers):
def get_html(tycid, driver, headers):
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
response = s.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
driver.get(url=url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
try:
div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe')
......@@ -51,7 +49,8 @@ def get_html(tycid, s, headers):
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(
' ', '')
return int(total)
else:
return -1
......@@ -59,55 +58,96 @@ def get_html(tycid, s, headers):
return 0
@retry(tries=5, delay=2)
def get_page(url, s, headers):
    """Fetch one page of a tianyancha staff-listing API and return its total.

    Args:
        url: fully-formed API url (timestamp/gid/pageSize/pageNum baked in).
        s: requests.Session already carrying the login cookies.
        headers: request headers (includes the 'version': 'TYC-Web' marker).

    Returns:
        (total, data_page): the ``data.total`` count from the payload and
        the full decoded JSON payload itself.

    Raises:
        RuntimeError: on a non-200 response or an unexpected payload shape,
            so the @retry decorator re-attempts (up to 5 tries, 2s delay).
    """
    ip = baseCore.get_proxy()
    time.sleep(1)
    res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
    # res = s.get(url=url, headers=headers, verify=False)
    if res.status_code != 200:
        # Explicit exception instead of a bare `raise`: a bare `raise` with
        # no active exception would only produce a generic RuntimeError
        # ("No active exception to re-raise") with no context.
        raise RuntimeError(f'get_page: HTTP {res.status_code} for {url}')
    data_page = res.json()
    # log.info(f'接口获取总数---{data_page}')
    try:
        total_page_ = data_page['data']['total']
    except (KeyError, TypeError):
        # Payload did not contain data.total — let @retry re-attempt.
        raise RuntimeError(f'get_page: unexpected payload shape for {url}')
    return total_page_, data_page
from selenium import webdriver
def create_driver(path=r'D:\soft\msedgedriver.exe'):
    """Create a Microsoft Edge WebDriver session for scraping.

    Args:
        path: filesystem path to the msedgedriver executable. Defaults to
            the previously hard-coded location, so existing callers are
            unaffected; parameterized so other machines can supply theirs.

    Returns:
        A ``selenium.webdriver.Edge`` session configured to start maximized.
    """
    # options = webdriver.EdgeOptions()
    # Desired capabilities for Edge; "--start-maximized" opens the browser
    # window maximized.
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            "args": ["--start-maximized"],
        }
    }
    # NOTE(review): executable_path/capabilities kwargs are the Selenium 3
    # style API (removed in Selenium 4's Service/Options interface) —
    # confirm the pinned selenium version before upgrading.
    session = webdriver.Edge(executable_path=path, capabilities=options)
    return session
def login(driver):
    """Attach a pooled account's cookies to both the Selenium driver and a
    requests.Session so page loads and API calls share one login.

    Returns (driver, id_cookie, session) on success, or ('', '', '') when
    the cookie pool is empty (after a 30-minute back-off).
    """
    cookies = {}
    cookies_list, id_cookie, user_name = token.get_cookies()
    if cookies_list:
        pass
    else:
        # Cookie pool exhausted: wait 30 minutes, then signal failure with
        # empty sentinels so the caller can retry the whole login.
        log.info("没有账号了,等待30分钟")
        time.sleep(30 * 60)
        return '', '', ''
    log.info(f'=====当前使用的是{user_name}的cookie======')
    # Inject every cookie into the browser first, then refresh so the site
    # recognizes the logged-in state before any scraping happens.
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(3)
    driver.refresh()
    time.sleep(3)
    # Mirror the same cookies into a requests.Session for the JSON APIs,
    # keyed by cookie name as requests expects.
    for cookie in cookies_list:
        cookies[cookie['name']] = cookie['value']
    s = requests.Session()
    s.cookies.update(cookies)
    return driver, id_cookie, s
def doJob():
# for social_code in social_code_list:
while True:
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
for i in range(1000):
# while True:
# todo:设置cookies的使用
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie, user_name = token.get_cookies()
if cookies_list:
driver, id_cookie, s = login(driver)
if id_cookie:
pass
else:
log.info("没有账号了,等待30分钟")
time.sleep(30 * 60)
return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
item = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
social_code = '91370212MA3MJMA0XW'
if social_code == None:
time.sleep(20)
# social_code = '91110108780992804C'
if item == None:
time.sleep(30 * 60)
continue
start = time.time()
social_code = item.split('|')[0]
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
......@@ -125,7 +165,7 @@ def doJob():
if data:
pass
else:
#数据库中并没有该企业 需要新增
# 数据库中并没有该企业 需要新增
pass
id = data[0]
com_name = data[3]
......@@ -150,20 +190,20 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', item)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', item)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
try:
charge = get_html(tycid, s, headers)
charge = get_html(tycid, driver, headers)
# 页面请求三次都失败
except:
charge = -1
......@@ -172,94 +212,124 @@ def doJob():
if charge == -1:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
time.sleep(3)
continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, s, headers)
total_page1, data_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
data_page1 = {}
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
data_page_one = data_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, s, headers)
total_page2, data_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
data_page2 = {}
time.sleep(1)
try:
total_page3 = get_page(url3, s, headers)
total_page3, data_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
data_page3 = {}
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
data_page_one = data_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
data_page_one = data_page3
flag = 3
else:
total_page = 0
flag = 0
baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', item)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应---{charge}---{total_page2}---{total_page3}')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
# token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
# # todo:获取页数
# total_page = 34
# flag = 2
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = requests.get(url_, headers=headers, proxies=ip, verify=False) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
# todo: 测试程序是否执行到这一步
log.info(f'总数为{total_page}')
if int(total_page % 20) == 0:
maxpage = int((total_page / 20) + 1)
else:
maxpage = int((total_page / 20) + 1) + 1
for page in range(1, maxpage):
if page == 1:
data_page = data_page_one
errorCode = data_page['errorCode']
else:
res = None
for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
try:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
continue
if res:
res.close()
if errorCode == 0:
pass
else:
token.updateTokeen(id_cookie, 2)
# token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
# baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f'{id}---{xydm}----{tycid}--{data_page}--高管信息请求失败')
continue
# todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{data_page["data"]["total"]}')
try:
list_all = res.json()['data']['dataList']
list_all = data_page['data']['dataList']
except:
list_all = res.json()['data']['result']
list_all = data_page['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
# res.close()
log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
if flag == 1:
for one_info in list_all:
name = one_info['name']
......@@ -363,13 +433,27 @@ def doJob():
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
person_soup = None
while True:
try:
ip = baseCore.get_proxy()
person_res = requests.get(person_url, headers=headers, proxies=ip, timeout=(5, 10))
person_soup = BeautifulSoup(person_res.content, 'html.parser')
break
except requests.exceptions.Timeout:
log.info('请求超时')
time.sleep(1)
except requests.exceptions.RequestException as e:
log.info(e)
except:
log.info('简介请求失败')
time.sleep(3)
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
......@@ -418,17 +502,22 @@ def doJob():
print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
# time.sleep(randint(5,10))
time.sleep(5)
except Exception as e:
# 4月28日采集失败不更新封号时间,更新使用时间
token.updateTokeen(id_cookie, 3)
# token.updateTokeen(id_cookie, 2)
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
# break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
......
......@@ -56,12 +56,11 @@ def get_html(tycid, driver, headers):
return 0
@retry(tries=5, delay=2)
@retry(tries=5, delay=3)
def get_page(url, s, headers):
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
# res = s.get(url=url, headers=headers, verify=False)
time.sleep(1)
if res.status_code != 200:
raise
data_page = res.json()
......@@ -120,7 +119,7 @@ def doJob():
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
for i in range(10):
for i in range(1000):
# while True:
# todo:设置cookies的使用
headers = {
......@@ -138,9 +137,9 @@ def doJob():
else:
continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# 判断 如果Redis中已经没有数据,则等待
item = '914403003060602251|深圳爱尔创口腔技术有限公司'
# social_code = '91110108780992804C'
if item == None:
time.sleep(30 * 60)
continue
......@@ -217,9 +216,11 @@ def doJob():
continue
elif charge == -2:
# 该企业没有人员信息
token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
......@@ -288,25 +289,22 @@ def doJob():
errorCode = data_page['errorCode']
else:
res = None
for c in range(3):
for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
try:
for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = s.get(url_, headers=headers, proxies=ip) # ,verify=False
# res = s.get(url_, headers=headers) # ,verify=False
# res = requests.get(url_, headers=headers, verify=False) # ,verify=False
time.sleep(randint(2, 4))
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
break
except:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
res.close()
if errorCode == 0:
pass
......@@ -329,9 +327,9 @@ def doJob():
# todo: 关闭连接
# res.close()
log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
......@@ -433,13 +431,28 @@ def doJob():
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
person_soup = None
while True:
try:
ip = baseCore.get_proxy()
person_res = requests.get(person_url, headers=headers, proxies=ip, timeout=(5, 10))
person_soup = BeautifulSoup(person_res.content, 'html.parser')
break
except requests.exceptions.Timeout:
log.info('请求超时')
time.sleep(1)
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
except:
log.info('简介请求失败')
time.sleep(3)
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
......
Markdown 格式
0%
您将要添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论