提交 71156c0a 作者: XveLingKun

企业核心人员更新 采集程序调整 完成

上级 4a4b6a87
"""
天眼查人员信息
问题1:页面和接口数据不一致。目前方法:单独处理。
问题2:页面人员总数拿得不够准确。目前方法:修改获取父标签的逻辑(已解决)。
"""
import datetime import datetime
import json import json
from random import randint
import requests, time import requests, time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib3 import urllib3
from retry import retry from retry import retry
from base.BaseCore import BaseCore from base import BaseCore
from getTycId import getTycIdByXYDM from getTycId import getTycIdByXYDM
baseCore = BaseCore()
baseCore = BaseCore.BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger() log = baseCore.getLogger()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
...@@ -22,22 +17,24 @@ cursor_ = baseCore.cursor ...@@ -22,22 +17,24 @@ cursor_ = baseCore.cursor
cnx = baseCore.cnx_ cnx = baseCore.cnx_
cursor = baseCore.cursor_ cursor = baseCore.cursor_
from random import randint
list_all_1 = [] list_all_1 = []
list_all_2 = [] list_all_2 = []
taskType = '天眼查/核心人员更新' taskType = '天眼查/核心人员更新'
# from lxml import etree from lxml import etree
from classtool import Token, File, Tag from classtool import Token, File, Tag
token = Token() token = Token()
@retry(tries=3, delay=1) @retry(tries=3, delay=1)
def get_html(tycid, driver, headers): def get_html(tycid, driver, headers):
url = f"https://www.tianyancha.com/company/{tycid}" url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy() driver.get(url=url)
driver.get(url=url) #, proxies=ip) time.sleep(3)
time.sleep(5)
page_source = driver.page_source page_source = driver.page_source
# return -1
soup = BeautifulSoup(page_source, 'html.parser') soup = BeautifulSoup(page_source, 'html.parser')
try: try:
div_part = soup.find('div', attrs={'data-dim': 'staff'}) div_part = soup.find('div', attrs={'data-dim': 'staff'})
...@@ -50,7 +47,8 @@ def get_html(tycid, driver, headers): ...@@ -50,7 +47,8 @@ def get_html(tycid, driver, headers):
try: try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field: if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '') total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(
' ', '')
return int(total) return int(total)
else: else:
return -1 return -1
...@@ -58,14 +56,16 @@ def get_html(tycid, driver, headers): ...@@ -58,14 +56,16 @@ def get_html(tycid, driver, headers):
return 0 return 0
@retry(tries=3, delay=1) @retry(tries=5, delay=2)
def get_page(url, s, headers): def get_page(url, s, headers):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip) res = s.get(url=url, headers=headers, proxies=ip)
# res = s.get(url=url, headers=headers, verify=False)
time.sleep(1) time.sleep(1)
if res.status_code != 200: if res.status_code != 200:
raise raise
data_page = res.json() data_page = res.json()
# log.info(f'接口获取总数---{data_page}')
try: try:
total_page_ = data_page['data']['total'] total_page_ = data_page['data']['total']
except: except:
...@@ -74,6 +74,8 @@ def get_page(url, s, headers): ...@@ -74,6 +74,8 @@ def get_page(url, s, headers):
from selenium import webdriver from selenium import webdriver
def create_driver(): def create_driver():
path = r'D:\soft\msedgedriver.exe' path = r'D:\soft\msedgedriver.exe'
...@@ -88,6 +90,7 @@ def create_driver(): ...@@ -88,6 +90,7 @@ def create_driver():
session = webdriver.Edge(executable_path=path, capabilities=options) session = webdriver.Edge(executable_path=path, capabilities=options)
return session return session
def login(driver): def login(driver):
cookies = {} cookies = {}
cookies_list, id_cookie, user_name = token.get_cookies() cookies_list, id_cookie, user_name = token.get_cookies()
...@@ -95,20 +98,21 @@ def login(driver): ...@@ -95,20 +98,21 @@ def login(driver):
pass pass
else: else:
log.info("没有账号了,等待30分钟") log.info("没有账号了,等待30分钟")
time.sleep(30*60) time.sleep(30 * 60)
return '', '', '' return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======') log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list: for cookie in cookies_list:
driver.add_cookie(cookie) driver.add_cookie(cookie)
time.sleep(3) time.sleep(3)
driver.refresh() driver.refresh()
time.sleep(5) time.sleep(3)
for cookie in cookies_list: for cookie in cookies_list:
cookies[cookie['name']] = cookie['value'] cookies[cookie['name']] = cookie['value']
s = requests.Session() s = requests.Session()
s.cookies.update(cookies) s.cookies.update(cookies)
return driver, id_cookie,s return driver, id_cookie, s
def doJob(): def doJob():
# for social_code in social_code_list: # for social_code in social_code_list:
...@@ -116,7 +120,8 @@ def doJob(): ...@@ -116,7 +120,8 @@ def doJob():
url = 'https://www.tianyancha.com/' url = 'https://www.tianyancha.com/'
driver.get(url) driver.get(url)
driver.maximize_window() driver.maximize_window()
while True: for i in range(10):
# while True:
# todo:设置cookies的使用 # todo:设置cookies的使用
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
...@@ -134,12 +139,10 @@ def doJob(): ...@@ -134,12 +139,10 @@ def doJob():
continue continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName') item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# item = '913600007969593637|江西国泰集团股份有限公司'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C' # social_code = '91110108780992804C'
if item == None: if item == None:
time.sleep(20) time.sleep(30 * 60)
continue continue
start = time.time() start = time.time()
social_code = item.split('|')[0] social_code = item.split('|')[0]
...@@ -161,7 +164,7 @@ def doJob(): ...@@ -161,7 +164,7 @@ def doJob():
if data: if data:
pass pass
else: else:
#数据库中并没有该企业 需要新增 # 数据库中并没有该企业 需要新增
pass pass
id = data[0] id = data[0]
com_name = data[3] com_name = data[3]
...@@ -198,7 +201,6 @@ def doJob(): ...@@ -198,7 +201,6 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员") log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = [] list_one_info = []
num = 1 num = 1
data_page = {}
try: try:
charge = get_html(tycid, driver, headers) charge = get_html(tycid, driver, headers)
# 页面请求三次都失败 # 页面请求三次都失败
...@@ -211,7 +213,7 @@ def doJob(): ...@@ -211,7 +213,7 @@ def doJob():
# 重新塞入redis # 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item) baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis") log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2) time.sleep(3)
continue continue
elif charge == -2: elif charge == -2:
# 该企业没有人员信息 # 该企业没有人员信息
...@@ -225,33 +227,39 @@ def doJob(): ...@@ -225,33 +227,39 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----没有最新公示") log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1' url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try: try:
total_page1, data_page = get_page(url1, s, headers) total_page1, data_page1 = get_page(url1, s, headers)
except: except:
total_page1 = 0 total_page1 = 0
data_page1 = {}
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1 total_page = total_page1
data_page_one = data_page1
flag = 2 flag = 2
else: else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示") log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1' url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1' url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try: try:
total_page2, data_page = get_page(url2, s, headers) total_page2, data_page2 = get_page(url2, s, headers)
except: except:
total_page2 = 0 total_page2 = 0
data_page2 = {}
time.sleep(1) time.sleep(1)
try: try:
total_page3, data_page = get_page(url3, s, headers) total_page3, data_page3 = get_page(url3, s, headers)
except: except:
total_page3 = 0 total_page3 = 0
data_page3 = {}
if total_page2 == charge: if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2 total_page = total_page2
data_page_one = data_page2
flag = 1 flag = 1
else: else:
if total_page3 == charge: if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}' url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3 total_page = total_page3
data_page_one = data_page3
flag = 3 flag = 3
else: else:
total_page = 0 total_page = 0
...@@ -273,9 +281,10 @@ def doJob(): ...@@ -273,9 +281,10 @@ def doJob():
if int(total_page % 20) == 0: if int(total_page % 20) == 0:
maxpage = int((total_page / 20) + 1) maxpage = int((total_page / 20) + 1)
else: else:
maxpage = int((total_page/20) + 1) +1 maxpage = int((total_page / 20) + 1) + 1
for page in range(1, maxpage): for page in range(1, maxpage):
if page == 1: if page == 1:
data_page = data_page_one
errorCode = data_page['errorCode'] errorCode = data_page['errorCode']
else: else:
res = None res = None
...@@ -286,6 +295,7 @@ def doJob(): ...@@ -286,6 +295,7 @@ def doJob():
url_ = url.format(t, tycid, page) url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1' # url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = s.get(url_, headers=headers, proxies=ip) # ,verify=False res = s.get(url_, headers=headers, proxies=ip) # ,verify=False
# res = s.get(url_, headers=headers) # ,verify=False
# res = requests.get(url_, headers=headers, verify=False) # ,verify=False # res = requests.get(url_, headers=headers, verify=False) # ,verify=False
time.sleep(randint(2, 4)) time.sleep(randint(2, 4))
data_page = res.json() data_page = res.json()
...@@ -297,7 +307,7 @@ def doJob(): ...@@ -297,7 +307,7 @@ def doJob():
break break
except: except:
continue continue
res.close()
if errorCode == 0: if errorCode == 0:
pass pass
else: else:
...@@ -307,7 +317,7 @@ def doJob(): ...@@ -307,7 +317,7 @@ def doJob():
log.info(f'{id}---{xydm}----{tycid}--{data_page}--高管信息请求失败') log.info(f'{id}---{xydm}----{tycid}--{data_page}--高管信息请求失败')
continue continue
# todo:test测试 # todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{data_page}') log.info(f'{id}---{xydm}----{tycid}----{data_page["data"]["total"]}')
try: try:
list_all = data_page['data']['dataList'] list_all = data_page['data']['dataList']
except: except:
...@@ -318,8 +328,10 @@ def doJob(): ...@@ -318,8 +328,10 @@ def doJob():
log.info(f'{id}---{xydm}----{tycid}----没有高管信息') log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接 # todo: 关闭连接
# res.close() # res.close()
log.info(f'----flag:{flag}----')
if flag == 1: if flag == 1:
for one_info in list_all: for one_info in list_all:
name = one_info['name'] name = one_info['name']
sex = one_info['sex'] sex = one_info['sex']
education = one_info['education'] education = one_info['education']
...@@ -355,22 +367,22 @@ def doJob(): ...@@ -355,22 +367,22 @@ def doJob():
"personInfo": personInfo, "personInfo": personInfo,
"sort": str(num) "sort": str(num)
} }
# dic_json_img = { dic_json_img = {
# "socialCreditCode": social_code, "socialCreditCode": social_code,
# "name": name, "name": name,
# "sex": sex, "sex": sex,
# "education": education, "education": education,
# "position": position, "position": position,
# "salary": Salary, "salary": Salary,
# "birthYear": birthYear, "birthYear": birthYear,
# "shareNum": StockKeepings, "shareNum": StockKeepings,
# "shareRatio": '', "shareRatio": '',
# "benefitShare": '', "benefitShare": '',
# "currentTerm": currentTerm, "currentTerm": currentTerm,
# "personInfo": personInfo, "personInfo": personInfo,
# "头像": person_img, "头像": person_img,
# "sort": str(num) "sort": str(num)
# } }
num = num + 1 num = num + 1
list_one_info.append(dic_json) list_one_info.append(dic_json)
# list_all_2.append(dic_json_img) # list_all_2.append(dic_json_img)
...@@ -447,22 +459,22 @@ def doJob(): ...@@ -447,22 +459,22 @@ def doJob():
"personInfo": personInfo, "personInfo": personInfo,
"sort": str(num) "sort": str(num)
} }
# dic_json_img = { dic_json_img = {
# "socialCreditCode": social_code, "socialCreditCode": social_code,
# "name": name, "name": name,
# "sex": '', "sex": '',
# "education": '', "education": '',
# "position": position, "position": position,
# "salary": '', "salary": '',
# "birthYear": '', "birthYear": '',
# "shareNum": '', "shareNum": '',
# "shareRatio": '', "shareRatio": '',
# "benefitShare": '', "benefitShare": '',
# "currentTerm": '', "currentTerm": '',
# "personInfo": personInfo, "personInfo": personInfo,
# "头像": person_img, "头像": person_img,
# "sort": str(num) "sort": str(num)
# } }
num = num + 1 num = num + 1
list_one_info.append(dic_json) list_one_info.append(dic_json)
# print(list_one_info) # print(list_one_info)
...@@ -476,8 +488,12 @@ def doJob(): ...@@ -476,8 +488,12 @@ def doJob():
print(response.text) print(response.text)
log.info('=========成功======') log.info('=========成功======')
token.updateTokeen(id_cookie, 3) token.updateTokeen(id_cookie, 3)
time.sleep(10) # time.sleep(randint(5,10))
time.sleep(5)
except Exception as e: except Exception as e:
# 4月28日采集失败不更新封号时间,更新使用时间
token.updateTokeen(id_cookie, 3)
# token.updateTokeen(id_cookie, 2)
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====') log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e) log.info(e)
# 重新塞入redis # 重新塞入redis
...@@ -486,7 +502,8 @@ def doJob(): ...@@ -486,7 +502,8 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
break # break
# df_img = pd.DataFrame(list_all_2) # df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False) # df_img.to_excel('企业主要人员-头像.xlsx',index=False)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论