提交 19ca1446 作者: XveLingKun

天眼查-分支机构

上级 44d3aa3a
# -*- coding: utf-8 -*-
import time
import requests
import urllib3
from retry import retry
from classtool import Token, sendData, Driver, Login
from base import BaseCore
from enterprise_tyc.getTycId import getTycIdByDB
"""分支机构"""
# Alternative construction kept for reference:
# baseCore = BaseCore.BaseCore(sqlflg=False)
# Shared project helper: supplies the logger, the redis client (`baseCore.r`),
# proxies and the connections/cursors used below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Project helpers imported from classtool; each is instantiated once, at import time.
token = Token()
edge = Driver()
login = Login()
# Suppress urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Two connection/cursor pairs exposed by BaseCore (presumably database
# connections -- confirm against BaseCore).
# NOTE(review): the local names are crossed relative to the attribute names
# (cnx_ <- baseCore.cnx, cnx <- baseCore.cnx_); confirm this is intentional.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Human-readable task label ("Tianyancha / branch offices").
taskType = "天眼查/分支机构"
@retry(tries=3, delay=5)
def getJson(url, headers, s):
    """Request ``url`` through a proxy and return the ``data`` part of the JSON reply.

    The function is wrapped with ``retry(tries=3, delay=5)``, so any exception
    raised here (network error, invalid JSON, non-zero ``errorCode``) triggers
    a new attempt; the last failure propagates to the caller.

    Args:
        url: Fully built API URL.
        headers: HTTP headers to send with the request.
        s: Object exposing ``get(url, headers=..., proxies=..., timeout=...)``
            (presumably the ``requests`` session returned by ``login.login()``
            -- confirm against the caller).

    Returns:
        The value stored under the ``data`` key of the decoded JSON response.

    Raises:
        RuntimeError: The response reports an ``errorCode`` other than 0.
        KeyError: The response has no ``errorCode`` or ``data`` key.
    """
    ip = baseCore.get_proxy()
    log.info(f'当前使用的ip是{ip}')
    req = s.get(url, headers=headers, proxies=ip, timeout=(5, 10))
    try:
        dataJson = req.json()
    finally:
        # Always release the connection, including when decoding fails.
        req.close()
    errorCode = dataJson['errorCode']
    if errorCode != 0:
        # The original used a bare ``raise`` with no active exception, which
        # surfaces as an uninformative RuntimeError; raise the same type
        # explicitly so the log shows what the API actually answered.
        raise RuntimeError(f'unexpected errorCode {errorCode!r} returned for {url}')
    return dataJson['data']
def _branchRecord(dataJson, socialCreditCode):
    """Convert one branch entry of the API response into the sync payload dict.

    Args:
        dataJson: One element of the ``result`` list of a branchList response.
        socialCreditCode: Unified social credit code of the parent company.

    Returns:
        A dict with the keys ``area``, ``establishDate``, ``head``, ``name``,
        ``registerStatus`` and ``socialCreditCode``.

    Raises:
        KeyError: ``dataJson`` lacks one of the fields read below.
    """
    # 'estiblishTime' is the key the response is read with; do not "correct"
    # the spelling without checking the API.
    establishDate = dataJson['estiblishTime']
    if not establishDate:
        establishDate = ''
    elif len(establishDate) == 10:
        # Ten characters means a date-only value; append a midnight time part.
        establishDate += ' 00:00:00'
    return {
        "area": dataJson['area'],
        "establishDate": establishDate,
        "head": dataJson['legalPersonName'],
        "name": dataJson['name'],
        "registerStatus": dataJson['regStatus'],
        "socialCreditCode": socialCreditCode,
    }


def doJob():
    """Collect branch-office records for queued companies; never returns.

    Each loop iteration:

    1. Logs in through ``login.login()`` and merges the returned headers into
       the base request headers; retries immediately when no cookie id comes back.
    2. Pops one entry from the redis list ``BranchEnterprise:gnqy_socialCode``
       (2 second blocking timeout). An empty queue puts the worker to sleep
       for one hour.
    3. Resolves the Tianyancha company id with ``getTycIdByDB`` and, when one
       is found, pages through the ``branchList`` API ten records at a time.
    4. Posts the collected records to the sync endpoint via ``sendData``.

    Any exception raised while collecting calls ``token.updateTokeen(id_cookie, 3)``,
    logs the error and puts the entry back on the queue with
    ``baseCore.rePutIntoR``.
    """
    queueKey = 'BranchEnterprise:gnqy_socialCode'
    pageSize = 10
    # The base headers never change, so build them once and copy per iteration.
    baseHeaders = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'version': 'TYC-Web',
    }
    while True:
        # NOTE(review): the returned driver is never used or closed in this
        # function; confirm login.login() does not create a new one per call.
        _driver, id_cookie, s, update_headers = login.login()
        if not id_cookie:
            continue
        headers = dict(baseHeaders)
        headers.update(update_headers)

        info = baseCore.r.blpop([queueKey], 2)
        if not info:
            log.info('数据已全部采集完')
            time.sleep(60 * 60)
            continue
        # blpop returns (list_name, value). The original called the misspelled
        # ``deocde()`` here, which raised AttributeError on every item; decode
        # only when the client hands back bytes.
        info = info[1]
        if isinstance(info, bytes):
            info = info.decode()

        start = time.time()
        socialCreditCode = info.split('|')[0]
        log.info(f'开始采集统一社会信用代码为{socialCreditCode}的企业信息')
        try:
            tycId = getTycIdByDB(socialCreditCode, cursor, start, info, s)
            if tycId:
                baseUrl = f'https://capi.tianyancha.com/cloud-company-background/company/branchList?gid={tycId}&pageSize={pageSize}'
                datasJson = getJson(f'{baseUrl}&pageNum=1', headers, s)
                try:
                    total = datasJson['total']
                except (KeyError, TypeError):
                    # Payload is missing 'total' or is not a mapping: requeue and move on.
                    log.info(datasJson)
                    log.info(f'{socialCreditCode}==={tycId}===分支机构查询失败')
                    baseCore.r.rpush(queueKey, info)
                    continue
                # Ceiling division: number of pages needed to cover `total` records.
                totalPage = (total + pageSize - 1) // pageSize
                dics = []
                for page in range(1, totalPage + 1):
                    if page != 1:
                        # Page 1 was already fetched above to learn `total`.
                        datasJson = getJson(f'{baseUrl}&pageNum={page}', headers, s)
                    dics.extend(
                        _branchRecord(dataJson, socialCreditCode)
                        for dataJson in datasJson['result']
                    )
                log.info(f'{socialCreditCode}==={tycId}===共采集{len(dics)}条记录')
                if dics:
                    req = sendData('http://114.115.236.206:8088/sync/branch', dics)
                    log.info(f'{socialCreditCode}==={req.text}')
                    takeTime = baseCore.getTimeCost(start, time.time())
                    log.info(f'{socialCreditCode}==={req.text}===耗时{takeTime}')
                else:
                    log.info(f'{socialCreditCode}===分支结构为空')
            # NOTE(review): the source lost its indentation; this pause is
            # placed so it runs after every processed entry -- confirm.
            time.sleep(5)
        except Exception as e:
            token.updateTokeen(id_cookie, 3)
            log.info(f'==={socialCreditCode}=====企业分支机构采集失败===重新放入redis====')
            log.info(e)
            # Put the entry back on the queue so it is retried later.
            baseCore.rePutIntoR(queueKey, info)
            time.sleep(5)
# Script entry point: start the collection loop only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论