提交 f65aad31 作者: XveLingKun

天眼查-股东信息

上级 5753a353
# -*- coding: utf-8 -*-
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base import BaseCore
from classtool import Token, sendData, Driver, Login
from enterprise_tyc.getTycId import getTycIdByDB
# Shared project helper object. In this file it supplies the logger
# (getLogger), a Redis-style client (baseCore.r, used with blpop/rpush in
# doJob) and the database handles bound below.
baseCore = BaseCore.BaseCore()
# Suppress urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
# Helpers from classtool: token bookkeeping (token.updateTokeen), browser
# driver factory (edge.create_driver) and the login routine (login.login).
token = Token()
edge = Driver()
login = Login()
# NOTE(review): the underscore suffixes are crossed over here -- the local
# `cnx_`/`cursor_` hold baseCore.cnx/baseCore.cursor, while the local
# `cnx`/`cursor` hold baseCore.cnx_/baseCore.cursor_. Only `cursor` is used
# in the visible code (passed to getTycIdByDB); confirm which database each
# handle points at before relying on the names.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Not referenced anywhere in the visible part of this file.
list_all_1 = []
list_all_2 = []
@retry(tries=3, delay=1)
def get_html(tycid, driver):
    """Load a company's detail page and read the shareholder count from it.

    Args:
        tycid: Tianyancha company id, interpolated into the detail-page URL.
        driver: Browser driver exposing ``get(url=...)`` and ``page_source``.

    Returns:
        -1 if looking up the shareholder section itself raised,
        -2 if the page has no ``data-dim="holder"`` section,
        0 if the section exists but no count could be parsed from it,
        otherwise the count shown in the section's tab label.

    Any exception raised while loading the page propagates to ``@retry``.
    """
    url = f"https://www.tianyancha.com/company/{tycid}"
    driver.get(url=url)
    # Give the page time to render before the DOM is read.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        div_part = soup.find('div', attrs={'data-dim': 'holder'})
    except Exception:
        return -1
    if div_part is None:
        return -2

    try:
        tmp_field = div_part.find('h3', class_='dimHeader_main-title-txt__GPoaZ').text
        # The header only decides which label is logged; the count is read
        # from the tab label in the same way for both kinds of section.
        if '股东信息' in tmp_field:
            log.info('股东信息')
        elif '主要股东' in tmp_field:
            log.info('主要股东')

        tab_text = div_part.find('div', class_='dim-tab-root').find('span').get_text()
        total = None
        # Checked in this order so that '最新公示' wins when both markers are
        # present, as in the original pair of consecutive `if` statements.
        for marker in ('股东信息', '最新公示'):
            if marker in tab_text:
                total = tab_text.split(marker)[1].replace(' ', '')
        if total is None:
            return 0
        return int(total)
    except Exception:
        # Missing nodes (AttributeError) or a non-numeric label (ValueError):
        # report "no count" rather than failing the whole company.
        return 0
@retry(tries=5, delay=3)
def get_page(url, s, headers):
    """GET *url* and return ``(total, payload)`` from its JSON body.

    Args:
        url: Fully formatted endpoint URL.
        s: Session-like object exposing ``get``.
        headers: Request headers to send.

    Returns:
        A tuple of ``payload['data']['total']`` and the decoded payload.

    Raises:
        RuntimeError: The response status is not 200.
        KeyError, TypeError: The payload lacks ``data.total``.

    Every exception triggers another attempt through ``@retry`` until the
    tries are exhausted, after which it reaches the caller.
    """
    res = s.get(url=url, headers=headers, timeout=(5, 10))
    if res.status_code != 200:
        # An explicit error replaces a bare `raise`, which only worked by
        # producing "RuntimeError: No active exception to reraise".
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    data_page = res.json()
    total_page_ = data_page['data']['total']
    return total_page_, data_page
@retry(tries=5, delay=3)
def get_page1(url, s, headers):
    """GET *url* and return ``(total, payload)`` for a ``stockHolder`` body.

    Args:
        url: Fully formatted endpoint URL.
        s: Session-like object exposing ``get``.
        headers: Request headers to send.

    Returns:
        A tuple of ``payload['data']['stockHolder']['total']`` and the
        decoded payload.

    Raises:
        RuntimeError: The response status is not 200.
        KeyError, TypeError: The payload lacks ``data.stockHolder.total``.

    Every exception triggers another attempt through ``@retry`` until the
    tries are exhausted, after which it reaches the caller.
    """
    res = s.get(url=url, headers=headers, timeout=(5, 10))
    if res.status_code != 200:
        # An explicit error replaces a bare `raise`, which only worked by
        # producing "RuntimeError: No active exception to reraise".
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    data_page = res.json()
    total_page_ = data_page['data']['stockHolder']['total']
    return total_page_, data_page
@retry(tries=5, delay=3)
def post_page(url, s, headers, payload):
    """POST *payload* as JSON to *url* and return ``(total, payload_out)``.

    Args:
        url: Endpoint URL.
        s: Session-like object exposing ``post``.
        headers: Request headers to send.
        payload: JSON-serialisable request body.

    Returns:
        A tuple of ``response['data']['total']`` and the decoded response.

    Raises:
        RuntimeError: The response status is not 200.
        KeyError, TypeError: The response lacks ``data.total``.

    Every exception triggers another attempt through ``@retry`` until the
    tries are exhausted, after which it reaches the caller.
    """
    res = s.post(url=url, headers=headers, data=json.dumps(payload), timeout=(5, 10))
    if res.status_code != 200:
        # An explicit error replaces a bare `raise`, which only worked by
        # producing "RuntimeError: No active exception to reraise".
        raise RuntimeError(f'unexpected HTTP status {res.status_code} for {url}')
    json_info = res.json()
    total_page_ = json_info['data']['total']
    return total_page_, json_info
# Redis list this job consumes from and re-queues failed companies into.
_HOLDER_QUEUE_KEY = 'shareHolderEnterprise:gnqy_socialCode'
# Rows per API page; the paging maths and the `sort` rank both depend on it.
_HOLDER_PAGE_SIZE = 20

# NOTE(review): the first announcement request goes to `dim/holder/...` while
# later pages go to `dim/holderV2/...`; kept exactly as the original had it.
_ANNOUNCEMENT_FIRST_URL = 'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holder/latest/announcement'
_ANNOUNCEMENT_PAGE_URL = 'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holderV2/latest/announcement?'
# Templates take (tycid, page number).
_TOP_TEN_URL = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1'
_HK_URL = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/hk?date=&gid={}&sortField=&sortType=-100&pageSize=20&pageNum={}&percentLevel=-100&keyword='

# Output key -> response key, per data source (`flag`). Insertion order is the
# order of the keys in the record that is sent downstream.
_HOLDER_FIELDS = {
    1: {
        'name': 'shareHolderName',
        'shareHoldRation': 'percent',
        'shareHoldNum': 'shareholdingNum',
        'shareHoldUnit': 'shareholdingNumUnit',
        'shareType': 'shareType',
        'year': 'yearReport',
    },
    3: {
        'name': 'name',
        'shareHoldRation': 'proportion',
        'shareHoldNum': 'holdingNum',
        'shareHoldUnit': 'shareUnit',
        'shareType': 'shareType',
        'year': 'publishDate',
    },
    0: {
        'name': 'holder_name',
        'shareHoldRation': 'longHeldRatioWithUnit',
        'shareHoldNum': 'held_total_num_long_position',
        'shareHoldUnit': 'shareUnit',
        'shareType': 'shareTypeName',
    },
}


def _page_count(total, page_size=_HOLDER_PAGE_SIZE):
    """Return how many pages of *page_size* rows are needed for *total* rows."""
    return (total + page_size - 1) // page_size


def _holder_rows(data_page):
    """Return the row list from whichever of the three response shapes came back."""
    data = data_page['data']
    try:
        return data['holderList']
    except (KeyError, TypeError):
        pass
    try:
        return data['result']
    except (KeyError, TypeError):
        return data['stockHolder']['result']


def _holder_record(flag, holder_info, socialCreditCode, sort):
    """Map one API row onto the record layout sent downstream."""
    record = {'socialCreditCode': socialCreditCode}
    # Any flag other than 1 or 3 uses the flag-0 layout, as the original
    # if/elif/else did.
    for out_key, src_key in _HOLDER_FIELDS.get(flag, _HOLDER_FIELDS[0]).items():
        record[out_key] = holder_info[src_key]
    record['sort'] = sort
    return record


def _fetch_holder_page(s, headers, url, flag, tycid, page, attempts=3):
    """Fetch one page after the first; return ``(payload, errorCode)``.

    Returns ``(None, None)`` when every attempt failed at the transport
    level, so the caller sees a non-zero error code instead of crashing on
    a missing response.
    """
    data_page, error_code = None, None
    for _ in range(attempts):
        try:
            if flag == 1:
                payload = {"gid": f"{tycid}", "pageSize": 20, "pageNum": f"{page}", "sortField": "",
                           "sortType": "-100", "historyType": 1}
                res = s.post(url=url, headers=headers, data=json.dumps(payload), timeout=(5, 10))
            else:
                res = s.get(url.format(tycid, page), headers=headers, timeout=(5, 10))
        except requests.exceptions.RequestException as e:
            log.info(e)
            time.sleep(1)
            continue
        try:
            data_page = res.json()
        finally:
            res.close()
        error_code = data_page['errorCode']
        if error_code == 0:
            break
    return data_page, error_code


def doJob():
    """Consume company codes from Redis and push their shareholder lists downstream.

    Runs forever. Each iteration logs in, pops one ``code|...`` entry from
    the shareholder queue, resolves its Tianyancha id, reads the expected
    row count from the detail page, picks the API whose total matches that
    count, pages through it and sends the collected rows with ``sendData``.
    Companies that fail are pushed back onto the same queue.
    """
    driver = edge.create_driver()
    driver.get('https://www.tianyancha.com/')
    driver.maximize_window()
    while True:
        # TODO: wire up cookie handling.
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        driver, id_cookie, s, update_headers = login.login(driver)
        if not id_cookie:
            continue
        headers.update(update_headers)

        info = baseCore.r.blpop([_HOLDER_QUEUE_KEY], 2)
        if not info:
            log.info('数据已全部采集完')
            time.sleep(60 * 60)
            continue
        info = info[1].decode()
        socialCreditCode = info.split('|')[0]
        start = time.time()
        dics = []
        # Resolve the company in the database from the code taken off Redis.
        try:
            tycid = getTycIdByDB(socialCreditCode, cursor, start, info, s)
            if tycid:
                try:
                    charge = get_html(tycid, driver)
                except Exception:
                    charge = -1
                if charge == -1:
                    token.updateTokeen(id_cookie, 2)
                    time.sleep(3)
                    log.info(f'{socialCreditCode}==={tycid}===详情页获取失败')
                    baseCore.r.rpush(_HOLDER_QUEUE_KEY, info)
                    continue
                elif charge == -2:
                    # The company has no shareholder section; do not re-queue.
                    token.updateTokeen(id_cookie, 2)
                    log.info(f'{socialCreditCode}==={tycid}===没有股东信息')
                    continue

                # Probe all three sources. Page 1 is requested with the same
                # page size as later pages, so no rows are skipped between
                # page 1 and page 2.
                payload = {"gid": f"{tycid}", "pageSize": 20, "pageNum": 1, "sortField": "", "sortType": "-100", "historyType": 1}
                try:
                    total_page2, data_page2 = post_page(_ANNOUNCEMENT_FIRST_URL, s, headers, payload)
                except Exception:
                    total_page2, data_page2 = 0, {}
                time.sleep(1)
                try:
                    total_page3, data_page3 = get_page(_TOP_TEN_URL.format(tycid, 1), s, headers)
                except Exception:
                    total_page3, data_page3 = 0, {}
                try:
                    total_page1, data_page1 = get_page1(_HK_URL.format(tycid, 1), s, headers)
                except Exception:
                    total_page1, data_page1 = 0, {}

                # Use the source whose total agrees with the count on the page.
                if total_page2 == charge:
                    url, total_page, data_page_one, flag = _ANNOUNCEMENT_PAGE_URL, total_page2, data_page2, 1
                elif total_page3 == charge:
                    url, total_page, data_page_one, flag = _TOP_TEN_URL, total_page3, data_page3, 3
                else:
                    url, total_page, data_page_one, flag = _HK_URL, total_page1, data_page1, 0

                if total_page == 0:
                    token.updateTokeen(id_cookie, 3)
                    # Put the company back on the queue.
                    log.info(f'{socialCreditCode}==={tycid}===接口数据获取失败')
                    baseCore.r.rpush(_HOLDER_QUEUE_KEY, info)
                    continue

                log.info(f'总数为{total_page}')
                page_failed = False
                for page in range(1, _page_count(total_page) + 1):
                    if page == 1:
                        data_page = data_page_one
                        errorCode = data_page['errorCode']
                    else:
                        data_page, errorCode = _fetch_holder_page(s, headers, url, flag, tycid, page)
                    if errorCode != 0:
                        token.updateTokeen(id_cookie, 3)
                        # Re-queue once and abandon this company, so that a
                        # partial list is never sent downstream.
                        baseCore.r.rpush(_HOLDER_QUEUE_KEY, info)
                        log.info(f'{socialCreditCode}==={tycid}===接口数据获取失败')
                        page_failed = True
                        break

                    list_all = _holder_rows(data_page)
                    log.info(f'----flag:{flag}----')
                    log.info(f'-----list_all:{len(list_all)}----')
                    for idx, holder_info in enumerate(list_all):
                        # 1-based rank across all pages.
                        sort = idx + 1 + _HOLDER_PAGE_SIZE * (page - 1)
                        dics.append(_holder_record(flag, holder_info, socialCreditCode, sort))
                if page_failed:
                    continue

                token.updateTokeen(id_cookie, 3)
                time.sleep(5)
                try:
                    req = sendData('http://114.115.236.206:8088/sync/shareHolder', dics)
                    log.info('数据发送成功')
                    takeTime = baseCore.getTimeCost(start, time.time())
                    log.info(f'{socialCreditCode}==={req.text}===耗时{takeTime}')
                except Exception as e:
                    log.error(f'数据发送结果口失败==={e}')
        except Exception as e:
            token.updateTokeen(id_cookie, 3)
            log.info(f'==={socialCreditCode}=====企业股东采集失败===重新放入redis====')
            log.info(e)
            # Re-queue on the shareholder queue this job reads from; the
            # original pushed to the change-record queue here.
            baseCore.rePutIntoR(_HOLDER_QUEUE_KEY, info)
            time.sleep(5)
# Start the collection loop only when this file is executed as a script,
# not when it is imported.
if "__main__" == __name__:
    doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论