提交 47c1de7d 作者: XveLingKun

核心人员更新采集

上级 a37579f1
......@@ -56,12 +56,11 @@ def get_html(tycid, driver, headers):
return 0
@retry(tries=5, delay=2)
@retry(tries=5, delay=3)
def get_page(url, s, headers):
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
# res = s.get(url=url, headers=headers, verify=False)
time.sleep(1)
if res.status_code != 200:
raise
data_page = res.json()
......@@ -120,7 +119,7 @@ def doJob():
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
for i in range(10):
for i in range(1000):
# while True:
# todo:设置cookies的使用
headers = {
......@@ -138,9 +137,9 @@ def doJob():
else:
continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# 判断 如果Redis中已经没有数据,则等待
item = '914403003060602251|深圳爱尔创口腔技术有限公司'
# social_code = '91110108780992804C'
if item == None:
time.sleep(30 * 60)
continue
......@@ -217,9 +216,11 @@ def doJob():
continue
elif charge == -2:
# 该企业没有人员信息
token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----没有核心人员或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
......@@ -288,25 +289,22 @@ def doJob():
errorCode = data_page['errorCode']
else:
res = None
for c in range(3):
try:
for d in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = s.get(url_, headers=headers, proxies=ip) # ,verify=False
# res = s.get(url_, headers=headers) # ,verify=False
# res = requests.get(url_, headers=headers, verify=False) # ,verify=False
time.sleep(randint(2, 4))
try:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
break
except:
continue
res.close()
if errorCode == 0:
pass
......@@ -329,9 +327,9 @@ def doJob():
# todo: 关闭连接
# res.close()
log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
......@@ -433,13 +431,28 @@ def doJob():
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = None
while True:
try:
ip = baseCore.get_proxy()
person_res = requests.get(person_url, headers=headers, proxies=ip, timeout=(5, 10))
person_soup = BeautifulSoup(person_res.content, 'html.parser')
break
except requests.exceptions.Timeout:
log.info('请求超时')
time.sleep(1)
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
except:
log.info('简介请求失败')
time.sleep(3)
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论