提交 47c1de7d 作者: XveLingKun

核心人员更新采集

上级 a37579f1
...@@ -29,17 +29,15 @@ from lxml import etree ...@@ -29,17 +29,15 @@ from lxml import etree
from classtool import Token, File, Tag from classtool import Token, File, Tag
token = Token() token = Token()
@retry(tries=3, delay=1) @retry(tries=3, delay=1)
def get_html(tycid, s, headers): def get_html(tycid, driver, headers):
url = f"https://www.tianyancha.com/company/{tycid}" url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy() driver.get(url=url)
response = s.get(url=url, headers=headers) time.sleep(5)
if response.status_code == 200: page_source = driver.page_source
pass
else: soup = BeautifulSoup(page_source, 'html.parser')
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try: try:
div_part = soup.find('div', attrs={'data-dim': 'staff'}) div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe') # div_part.find('div', class_='dimHeader_root__XTCLe')
...@@ -51,7 +49,8 @@ def get_html(tycid, s, headers): ...@@ -51,7 +49,8 @@ def get_html(tycid, s, headers):
try: try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field: if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '') total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(
' ', '')
return int(total) return int(total)
else: else:
return -1 return -1
...@@ -59,55 +58,96 @@ def get_html(tycid, s, headers): ...@@ -59,55 +58,96 @@ def get_html(tycid, s, headers):
return 0 return 0
@retry(tries=3, delay=1) @retry(tries=5, delay=2)
def get_page(url, s, headers): def get_page(url, s, headers):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip) res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
time.sleep(1) # res = s.get(url=url, headers=headers, verify=False)
if res.status_code != 200: if res.status_code != 200:
raise raise
data_page = res.json() data_page = res.json()
# log.info(f'接口获取总数---{data_page}')
try: try:
total_page_ = data_page['data']['total'] total_page_ = data_page['data']['total']
except: except:
raise raise
return total_page_ return total_page_, data_page
from selenium import webdriver
def create_driver():
    """Start a Microsoft Edge WebDriver session and return it.

    The driver executable is read from a fixed local path, and the browser
    is asked to start with a maximized window.

    Returns:
        The object produced by ``webdriver.Edge``.
    """
    # NOTE(review): `executable_path` / `capabilities` are the keyword names
    # of the older Selenium constructor -- confirm against the installed version.
    edge_options = {
        "extensions": [],
        "args": ["--start-maximized"],  # start the browser window maximized
    }
    capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": edge_options,
    }
    driver_path = r'D:\soft\msedgedriver.exe'
    return webdriver.Edge(executable_path=driver_path, capabilities=capabilities)
def login(driver):
    """Load one stored account's cookies into the browser and a requests session.

    Cookies come from ``token.get_cookies()``. They are added to ``driver``,
    the page is refreshed so the browser picks them up, and the same
    name/value pairs are copied into a new ``requests.Session``.

    Args:
        driver: The WebDriver session to log in. The cookies are added to
            whatever page it currently has open.

    Returns:
        A 3-tuple ``(driver, id_cookie, session)``. When no account cookies
        are available the function waits 30 minutes and returns
        ``(driver, '', '')``; callers detect this through the empty
        ``id_cookie``.
    """
    cookies_list, id_cookie, user_name = token.get_cookies()
    if not cookies_list:
        log.info("没有账号了,等待30分钟")
        time.sleep(30 * 60)
        # Hand the live driver back instead of '' -- the caller rebinds its
        # `driver` variable from this tuple, and returning '' here would make
        # the next login() call fail on ''.add_cookie.
        return driver, '', ''

    log.info(f'=====当前使用的是{user_name}的cookie======')
    # NOTE(review): the original indentation was lost in this view; the pauses
    # and the refresh are assumed to run once, after all cookies are added.
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(3)
    driver.refresh()
    time.sleep(3)

    # Mirror the browser cookies into a requests session for the API calls.
    s = requests.Session()
    s.cookies.update({cookie['name']: cookie['value'] for cookie in cookies_list})
    return driver, id_cookie, s
def doJob(): def doJob():
# for social_code in social_code_list: # for social_code in social_code_list:
while True: driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
for i in range(1000):
# while True:
# todo:设置cookies的使用 # todo:设置cookies的使用
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br', 'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0', 'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive', 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web' 'version': 'TYC-Web'
} }
cookies_list, id_cookie, user_name = token.get_cookies() driver, id_cookie, s = login(driver)
if cookies_list: if id_cookie:
pass pass
else: else:
log.info("没有账号了,等待30分钟") continue
time.sleep(30 * 60)
return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode') item = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
social_code = '91370212MA3MJMA0XW' # social_code = '91110108780992804C'
if social_code == None: if item == None:
time.sleep(20) time.sleep(30 * 60)
continue continue
start = time.time() start = time.time()
social_code = item.split('|')[0]
try: try:
data = baseCore.getInfomation(social_code) data = baseCore.getInfomation(social_code)
if len(data) != 0: if len(data) != 0:
...@@ -125,7 +165,7 @@ def doJob(): ...@@ -125,7 +165,7 @@ def doJob():
if data: if data:
pass pass
else: else:
#数据库中并没有该企业 需要新增 # 数据库中并没有该企业 需要新增
pass pass
id = data[0] id = data[0]
com_name = data[3] com_name = data[3]
...@@ -150,20 +190,20 @@ def doJob(): ...@@ -150,20 +190,20 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====') log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', item)
continue continue
except: except:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', item)
continue continue
count = data[17] count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员") log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = [] list_one_info = []
num = 1 num = 1
try: try:
charge = get_html(tycid, s, headers) charge = get_html(tycid, driver, headers)
# 页面请求三次都失败 # 页面请求三次都失败
except: except:
charge = -1 charge = -1
...@@ -172,94 +212,124 @@ def doJob(): ...@@ -172,94 +212,124 @@ def doJob():
if charge == -1: if charge == -1:
token.updateTokeen(id_cookie, 2) token.updateTokeen(id_cookie, 2)
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis") log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2) time.sleep(3)
continue continue
elif charge == -2: elif charge == -2:
# 该企业没有人员信息 # 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue continue
elif charge == 0: elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示") log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1' url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try: try:
total_page1 = get_page(url1, s, headers) total_page1, data_page1 = get_page(url1, s, headers)
except: except:
total_page1 = 0 total_page1 = 0
data_page1 = {}
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1 total_page = total_page1
data_page_one = data_page1
flag = 2 flag = 2
else: else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示") log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1' url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1' url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try: try:
total_page2 = get_page(url2, s, headers) total_page2, data_page2 = get_page(url2, s, headers)
except: except:
total_page2 = 0 total_page2 = 0
data_page2 = {}
time.sleep(1) time.sleep(1)
try: try:
total_page3 = get_page(url3, s, headers) total_page3, data_page3 = get_page(url3, s, headers)
except: except:
total_page3 = 0 total_page3 = 0
data_page3 = {}
if total_page2 == charge: if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2 total_page = total_page2
data_page_one = data_page2
flag = 1 flag = 1
else: else:
if total_page3 == charge: if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3 total_page = total_page3
data_page_one = data_page3
flag = 3 flag = 3
else: else:
total_page = 0 total_page = 0
flag = 0 flag = 0
baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterpriseMap:gnqy_socialCode', item)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应') log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应---{charge}---{total_page2}---{total_page3}')
continue continue
if total_page == 0: if total_page == 0:
token.updateTokeen(id_cookie, 2) # token.updateTokeen(id_cookie, 2)
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====') log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue continue
# # todo:获取页数 # # todo:获取页数
# total_page = 34 # total_page = 34
# flag = 2 # flag = 2
for page in range(1, int((total_page / 20) + 1) + 1): # todo: 测试程序是否执行到这一步
res = None log.info(f'总数为{total_page}')
for c in range(3): if int(total_page % 20) == 0:
ip = baseCore.get_proxy() maxpage = int((total_page / 20) + 1)
url_ = url.format(t, tycid, page) else:
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1' maxpage = int((total_page / 20) + 1) + 1
res = requests.get(url_, headers=headers, proxies=ip, verify=False) # ,verify=False for page in range(1, maxpage):
time.sleep(1) if page == 1:
if res.status_code == 200: data_page = data_page_one
break errorCode = data_page['errorCode']
else: else:
if c == 2: res = None
for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
try:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break break
continue res.close()
if res: if errorCode == 0:
pass pass
else: else:
token.updateTokeen(id_cookie, 2) # token.updateTokeen(id_cookie, 2)
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) # baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败') log.info(f'{id}---{xydm}----{tycid}--{data_page}--高管信息请求失败')
continue continue
# todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{data_page["data"]["total"]}')
try: try:
list_all = res.json()['data']['dataList'] list_all = data_page['data']['dataList']
except: except:
list_all = res.json()['data']['result'] list_all = data_page['data']['result']
if list_all: if list_all:
pass pass
else: else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息') log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接 # todo: 关闭连接
res.close() # res.close()
log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
if flag == 1: if flag == 1:
for one_info in list_all: for one_info in list_all:
name = one_info['name'] name = one_info['name']
...@@ -363,13 +433,27 @@ def doJob(): ...@@ -363,13 +433,27 @@ def doJob():
person_id = one_info['id'] person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}' person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip) person_soup = None
person_res = requests.get(person_url, headers=headers) while True:
person_soup = BeautifulSoup(person_res.content, 'html.parser') try:
ip = baseCore.get_proxy()
person_res = requests.get(person_url, headers=headers, proxies=ip, timeout=(5, 10))
person_soup = BeautifulSoup(person_res.content, 'html.parser')
break
except requests.exceptions.Timeout:
log.info('请求超时')
time.sleep(1)
except requests.exceptions.RequestException as e:
log.info(e)
except:
log.info('简介请求失败')
time.sleep(3)
try: try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip() personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except: except:
personInfo = '' personInfo = ''
try: try:
person_img = one_info['logo'] person_img = one_info['logo']
except: except:
...@@ -418,17 +502,22 @@ def doJob(): ...@@ -418,17 +502,22 @@ def doJob():
print(response.text) print(response.text)
log.info('=========成功======') log.info('=========成功======')
token.updateTokeen(id_cookie, 3) token.updateTokeen(id_cookie, 3)
time.sleep(10) # time.sleep(randint(5,10))
time.sleep(5)
except Exception as e: except Exception as e:
# 4月28日采集失败不更新封号时间,更新使用时间
token.updateTokeen(id_cookie, 3)
# token.updateTokeen(id_cookie, 2)
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====') log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e) log.info(e)
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', item)
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
break # break
# df_img = pd.DataFrame(list_all_2) # df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False) # df_img.to_excel('企业主要人员-头像.xlsx',index=False)
......
...@@ -56,12 +56,11 @@ def get_html(tycid, driver, headers): ...@@ -56,12 +56,11 @@ def get_html(tycid, driver, headers):
return 0 return 0
@retry(tries=5, delay=2) @retry(tries=5, delay=3)
def get_page(url, s, headers): def get_page(url, s, headers):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip) res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
# res = s.get(url=url, headers=headers, verify=False) # res = s.get(url=url, headers=headers, verify=False)
time.sleep(1)
if res.status_code != 200: if res.status_code != 200:
raise raise
data_page = res.json() data_page = res.json()
...@@ -120,7 +119,7 @@ def doJob(): ...@@ -120,7 +119,7 @@ def doJob():
url = 'https://www.tianyancha.com/' url = 'https://www.tianyancha.com/'
driver.get(url) driver.get(url)
driver.maximize_window() driver.maximize_window()
for i in range(10): for i in range(1000):
# while True: # while True:
# todo:设置cookies的使用 # todo:设置cookies的使用
headers = { headers = {
...@@ -138,9 +137,9 @@ def doJob(): ...@@ -138,9 +137,9 @@ def doJob():
else: else:
continue continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName') item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
item = '914403003060602251|深圳爱尔创口腔技术有限公司' # social_code = '91110108780992804C'
if item == None: if item == None:
time.sleep(30 * 60) time.sleep(30 * 60)
continue continue
...@@ -217,9 +216,11 @@ def doJob(): ...@@ -217,9 +216,11 @@ def doJob():
continue continue
elif charge == -2: elif charge == -2:
# 该企业没有人员信息 # 该企业没有人员信息
token.updateTokeen(id_cookie, 2) token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item) baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis") log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员") # log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue continue
...@@ -288,25 +289,22 @@ def doJob(): ...@@ -288,25 +289,22 @@ def doJob():
errorCode = data_page['errorCode'] errorCode = data_page['errorCode']
else: else:
res = None res = None
for c in range(3): for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
try: try:
for d in range(3): res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
ip = baseCore.get_proxy() except requests.exceptions.RequestException as e:
url_ = url.format(t, tycid, page) log.info(e)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1' time.sleep(1)
res = s.get(url_, headers=headers, proxies=ip) # ,verify=False continue
# res = s.get(url_, headers=headers) # ,verify=False data_page = res.json()
# res = requests.get(url_, headers=headers, verify=False) # ,verify=False errorCode = res.json()['errorCode']
time.sleep(randint(2, 4)) if errorCode != 0:
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
break
except:
continue continue
else:
break
res.close() res.close()
if errorCode == 0: if errorCode == 0:
pass pass
...@@ -329,9 +327,9 @@ def doJob(): ...@@ -329,9 +327,9 @@ def doJob():
# todo: 关闭连接 # todo: 关闭连接
# res.close() # res.close()
log.info(f'----flag:{flag}----') log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
if flag == 1: if flag == 1:
for one_info in list_all: for one_info in list_all:
name = one_info['name'] name = one_info['name']
sex = one_info['sex'] sex = one_info['sex']
education = one_info['education'] education = one_info['education']
...@@ -433,13 +431,28 @@ def doJob(): ...@@ -433,13 +431,28 @@ def doJob():
person_id = one_info['id'] person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}' person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip) person_soup = None
person_res = requests.get(person_url, headers=headers) while True:
person_soup = BeautifulSoup(person_res.content, 'html.parser') try:
ip = baseCore.get_proxy()
person_res = requests.get(person_url, headers=headers, proxies=ip, timeout=(5, 10))
person_soup = BeautifulSoup(person_res.content, 'html.parser')
break
except requests.exceptions.Timeout:
log.info('请求超时')
time.sleep(1)
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
except:
log.info('简介请求失败')
time.sleep(3)
try: try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip() personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except: except:
personInfo = '' personInfo = ''
try: try:
person_img = one_info['logo'] person_img = one_info['logo']
except: except:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论