提交 8ddc2b96 作者: XveLingKun

采集企业信用代码和标签

上级 67e03a3e
"""
采集企业信用代码和企业标签
"""
import json
import time
import requests
import urllib3
from bs4 import BeautifulSoup
from retry import retry
from getTycId import getTycIdByXYDM
from base import BaseCore
from selenium import webdriver
from classtool import Token, Info
# Shared helper from classtool; used below for token.get_cookies() and
# token.updateTokeen().
token = Token()
# Shared helper from classtool; used below for info.update_info().
info = Info()
# Project core helper; provides the logger plus the redis pull / re-put
# calls used by the main loop.
baseCore = BaseCore.BaseCore()
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
def create_driver(driver_path=r'D:\soft\msedgedriver.exe'):
    """Start a Microsoft Edge WebDriver session with a maximized window.

    Args:
        driver_path: Path of the msedgedriver executable. Defaults to the
            location that was previously hard-coded, so existing calls
            without arguments behave as before.

    Returns:
        The object returned by ``webdriver.Edge``.
    """
    capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            # Ask Edge to start with a maximized window.
            "args": ["--start-maximized"],
        },
    }
    # NOTE(review): `executable_path` and `capabilities` are Selenium 3-style
    # keyword arguments; recent Selenium 4 releases expect Service/Options
    # objects instead. Confirm the Selenium version this project pins before
    # upgrading.
    return webdriver.Edge(executable_path=driver_path, capabilities=capabilities)
def login(driver):
    """Load an account's cookies into the browser and into a requests session.

    Cookies come from ``token.get_cookies()``. When none are available the
    function logs, sleeps for 30 minutes and returns three empty strings.
    Note that the first element is then ``''`` rather than the driver, so
    callers should check the result before rebinding their driver variable.

    Args:
        driver: WebDriver whose current page should receive the cookies.

    Returns:
        ``(driver, id_cookie, session)`` on success, where ``session`` is a
        ``requests.Session`` carrying the same cookies; ``('', '', '')`` when
        no account is available.
    """
    cookies_list, id_cookie, user_name = token.get_cookies()
    if not cookies_list:
        log.info("没有账号了,等待30分钟")
        time.sleep(30 * 60)
        return '', '', ''

    log.info(f'=====当前使用的是{user_name}的cookie======')
    for entry in cookies_list:
        driver.add_cookie(entry)
    time.sleep(3)
    # Reload the page after the cookies have been added to the browser.
    driver.refresh()

    # Mirror the same cookies into a requests session as name -> value pairs.
    cookie_values = {entry['name']: entry['value'] for entry in cookies_list}
    session = requests.Session()
    session.cookies.update(cookie_values)
    time.sleep(3)
    return driver, id_cookie, session
@retry(tries=3, delay=1)
def get_html(tycid, driver, dic_info):
    """Open a company page and store its filtered tag titles in ``dic_info``.

    The tag list is read from the JSON embedded in the page's
    ``__NEXT_DATA__`` script element. Any exception raised here (for example
    when that element or one of the JSON keys is missing) is subject to the
    ``@retry(tries=3, delay=1)`` decorator.

    Args:
        tycid: Company id interpolated into the page URL.
        driver: WebDriver used to load the page.
        dic_info: Dict that receives the tag list under ``'股东企业标签'``;
            it is modified in place.

    Returns:
        The same ``dic_info`` object.
    """
    driver.get(url=f"https://www.tianyancha.com/company/{tycid}")
    time.sleep(3)  # fixed pause before reading the page source

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    next_data = json.loads(soup.find('script', attrs={'id': '__NEXT_DATA__'}).text)
    first_query = next_data['props']['pageProps']['dehydratedState']['queries'][0]
    tags = first_query['state']['data']['data']['tagListV2']

    # Titles that are never kept.
    excluded_titles = ('存续', '曾用名', '竞争风险', '司法案件', '合作风险', '股权出质', '仍注册')
    # Tags with color '#FF463C' are dropped as well
    # (presumably the red/risk badge color -- TODO confirm).
    dic_info['股东企业标签'] = [
        tag['title']
        for tag in tags
        if tag['title'] not in excluded_titles and tag['color'] != '#FF463C'
    ]
    return dic_info
if __name__ == "__main__":
    # Script entry point: pull "no|company name" items from redis, resolve the
    # company's Tianyancha id and credit code, scrape its tags and store both.
    driver = create_driver()
    url = 'https://www.tianyancha.com/'
    driver.get(url)
    driver.maximize_window()
    try:
        while True:
            item = baseCore.redicPullData('shareHolderInfo')
            if not item:
                # Nothing was pulled; back off briefly instead of crashing on
                # item.split() below. The 10 s pause is an arbitrary choice.
                log.info('redis 队列为空,等待10秒')
                time.sleep(10)
                continue

            # login() returns ('', '', '') when no account is available, so
            # only rebind `driver` after a successful login; otherwise the
            # browser handle would be replaced by an empty string.
            logged_in_driver, id_cookie, s = login(driver)
            if not id_cookie:
                # Do not lose the item that was already pulled.
                # NOTE(review): items are pulled from 'shareHolderInfo' but
                # re-queued under 'shareHorder' everywhere in this loop --
                # confirm whether the differing key is intentional.
                log.info(f'{item} 重新放入redis')
                baseCore.rePutIntoR('shareHorder', item)
                continue
            driver = logged_in_driver

            fields = item.split('|')
            com_name = fields[1]
            # com_name = '杭州君瀚股权投资合伙企业(有限合伙)'
            no = fields[0]
            # no = '3'
            dic_info = {}

            try:
                retData = getTycIdByXYDM(com_name, s)
            except Exception as exc:
                # Treat a failed lookup like an empty result, but keep the
                # cause visible in the log.
                retData = {}
                log.info(f'获取天眼查ID失败: {exc!r}')

            if retData:
                log.info(f'retData: {retData}')
            if not (retData and retData['state']):
                # Lookup failed or returned a negative state: update the
                # account's token record and re-queue the item.
                token.updateTokeen(id_cookie, 3)
                log.info(f'{com_name} 重新放入redis')
                baseCore.rePutIntoR('shareHorder', item)
                continue

            tycid = retData['tycData']['id']
            xydm = retData['tycData']['taxCode']
            if not xydm:
                log.info('未找到该企业,或该企业没有信用代码')
            dic_info['股东企业信用代码'] = xydm
            dic_result = get_html(tycid, driver, dic_info)
            # Persist the collected fields.
            info.update_info(no, dic_result)
            token.updateTokeen(id_cookie, 3)
            log.info(f'{xydm}---{com_name}---更新完成')
            time.sleep(1)
    finally:
        # Always shut the browser down, including on Ctrl-C or an
        # unhandled error.
        driver.quit()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论