Commit 71156c0a  Author: XveLingKun

企业核心人员更新 采集程序调整 完成 (Enterprise core-personnel update: collector program adjustments — done)

Parent 4a4b6a87
"""
天眼查人员信息
问题1:页面和接口数据不一致 目前方法 单独处理
问题2:页面人员总数拿的不够准确 目前方法 修改获取父标签逻辑 已解决
"""
import datetime
import json
from random import randint
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from base import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
baseCore = BaseCore.BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
......@@ -22,22 +17,24 @@ cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
from random import randint
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员更新'
# from lxml import etree
from lxml import etree
from classtool import Token, File, Tag
token = Token()
@retry(tries=3, delay=1)
def get_html(tycid, driver, headers):
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
driver.get(url=url) #, proxies=ip)
time.sleep(5)
driver.get(url=url)
time.sleep(3)
page_source = driver.page_source
# return -1
soup = BeautifulSoup(page_source, 'html.parser')
try:
div_part = soup.find('div', attrs={'data-dim': 'staff'})
......@@ -50,7 +47,8 @@ def get_html(tycid, driver, headers):
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(
' ', '')
return int(total)
else:
return -1
......@@ -58,14 +56,16 @@ def get_html(tycid, driver, headers):
return 0
@retry(tries=3, delay=1)
@retry(tries=5, delay=2)
def get_page(url, s, headers):
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
# res = s.get(url=url, headers=headers, verify=False)
time.sleep(1)
if res.status_code != 200:
raise
data_page = res.json()
# log.info(f'接口获取总数---{data_page}')
try:
total_page_ = data_page['data']['total']
except:
......@@ -74,6 +74,8 @@ def get_page(url, s, headers):
from selenium import webdriver
def create_driver():
path = r'D:\soft\msedgedriver.exe'
......@@ -88,6 +90,7 @@ def create_driver():
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
def login(driver):
cookies = {}
cookies_list, id_cookie, user_name = token.get_cookies()
......@@ -95,20 +98,21 @@ def login(driver):
pass
else:
log.info("没有账号了,等待30分钟")
time.sleep(30*60)
time.sleep(30 * 60)
return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(3)
driver.refresh()
time.sleep(5)
time.sleep(3)
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
return driver, id_cookie,s
return driver, id_cookie, s
def doJob():
# for social_code in social_code_list:
......@@ -116,7 +120,8 @@ def doJob():
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
for i in range(10):
# while True:
# todo:设置cookies的使用
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
......@@ -134,12 +139,10 @@ def doJob():
continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# item = '913600007969593637|江西国泰集团股份有限公司'
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C'
if item == None:
time.sleep(20)
time.sleep(30 * 60)
continue
start = time.time()
social_code = item.split('|')[0]
......@@ -161,7 +164,7 @@ def doJob():
if data:
pass
else:
#数据库中并没有该企业 需要新增
# 数据库中并没有该企业 需要新增
pass
id = data[0]
com_name = data[3]
......@@ -198,7 +201,6 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
data_page = {}
try:
charge = get_html(tycid, driver, headers)
# 页面请求三次都失败
......@@ -211,7 +213,7 @@ def doJob():
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
time.sleep(3)
continue
elif charge == -2:
# 该企业没有人员信息
......@@ -225,33 +227,39 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1, data_page = get_page(url1, s, headers)
total_page1, data_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
data_page1 = {}
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
data_page_one = data_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2, data_page = get_page(url2, s, headers)
total_page2, data_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
data_page2 = {}
time.sleep(1)
try:
total_page3, data_page = get_page(url3, s, headers)
total_page3, data_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
data_page3 = {}
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
data_page_one = data_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
data_page_one = data_page3
flag = 3
else:
total_page = 0
......@@ -273,9 +281,10 @@ def doJob():
if int(total_page % 20) == 0:
maxpage = int((total_page / 20) + 1)
else:
maxpage = int((total_page/20) + 1) +1
maxpage = int((total_page / 20) + 1) + 1
for page in range(1, maxpage):
if page == 1:
data_page = data_page_one
errorCode = data_page['errorCode']
else:
res = None
......@@ -286,6 +295,7 @@ def doJob():
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = s.get(url_, headers=headers, proxies=ip) # ,verify=False
# res = s.get(url_, headers=headers) # ,verify=False
# res = requests.get(url_, headers=headers, verify=False) # ,verify=False
time.sleep(randint(2, 4))
data_page = res.json()
......@@ -297,7 +307,7 @@ def doJob():
break
except:
continue
res.close()
if errorCode == 0:
pass
else:
......@@ -307,7 +317,7 @@ def doJob():
log.info(f'{id}---{xydm}----{tycid}--{data_page}--高管信息请求失败')
continue
# todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{data_page}')
log.info(f'{id}---{xydm}----{tycid}----{data_page["data"]["total"]}')
try:
list_all = data_page['data']['dataList']
except:
......@@ -318,8 +328,10 @@ def doJob():
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
# res.close()
log.info(f'----flag:{flag}----')
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
......@@ -355,22 +367,22 @@ def doJob():
"personInfo": personInfo,
"sort": str(num)
}
# dic_json_img = {
# "socialCreditCode": social_code,
# "name": name,
# "sex": sex,
# "education": education,
# "position": position,
# "salary": Salary,
# "birthYear": birthYear,
# "shareNum": StockKeepings,
# "shareRatio": '',
# "benefitShare": '',
# "currentTerm": currentTerm,
# "personInfo": personInfo,
# "头像": person_img,
# "sort": str(num)
# }
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
......@@ -447,22 +459,22 @@ def doJob():
"personInfo": personInfo,
"sort": str(num)
}
# dic_json_img = {
# "socialCreditCode": social_code,
# "name": name,
# "sex": '',
# "education": '',
# "position": position,
# "salary": '',
# "birthYear": '',
# "shareNum": '',
# "shareRatio": '',
# "benefitShare": '',
# "currentTerm": '',
# "personInfo": personInfo,
# "头像": person_img,
# "sort": str(num)
# }
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
......@@ -476,8 +488,12 @@ def doJob():
print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
# time.sleep(randint(5,10))
time.sleep(5)
except Exception as e:
# 4月28日采集失败不更新封号时间,更新使用时间
token.updateTokeen(id_cookie, 3)
# token.updateTokeen(id_cookie, 2)
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
......@@ -486,7 +502,8 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
# break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
......
Markdown format
0%
You added 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment