提交 617cfcb9 作者: XveLingKun

天眼查-变更记录

上级 be4f16b4
# -*- coding: utf-8 -*-
import time
import requests
import urllib3
from retry import retry
from selenium import webdriver
from bs4 import BeautifulSoup
from classtool import Token, sendData, Driver, Login
from base import BaseCore
from enterprise_tyc.getTycId import getTycIdByDB
"""变更记录"""
# Alternative construction kept for reference (passes sqlflg=False).
# baseCore = BaseCore.BaseCore(sqlflg=False)
# Shared project helper; this file uses it for the logger, proxies, the
# Redis client (baseCore.r) and the database handles below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Helper objects from classtool: token bookkeeping, browser-driver factory
# and the login routine used by doJob().
token = Token()
edge = Driver()
login = Login()
# Silence urllib3's InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# NOTE(review): the local names and the attribute names are crossed here
# (cnx_ <- baseCore.cnx, cnx <- baseCore.cnx_; same for the cursors).
# Presumably intentional so that `cursor` points at the database that
# getTycIdByDB() needs -- confirm against BaseCore.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Task label ("Tianyancha / change records"); only referenced by
# commented-out logging code in the visible part of this file.
taskType = "天眼查/变更记录"
@retry(tries=5, delay=5)
def getJson(url, headers, s):
    """Request *url* through a proxy and return the ``data`` part of the JSON reply.

    The ``retry`` decorator re-invokes this function (up to 5 tries, 5 seconds
    apart) whenever it raises, so every attempt asks ``baseCore`` for a proxy
    again.

    Args:
        url: Full API URL to request.
        headers: HTTP headers to send with the request.
        s: Session-like object whose ``get`` method performs the request
            (assumed to be a ``requests.Session`` -- confirm against
            ``Login.login``).

    Returns:
        The value stored under the ``data`` key of the decoded JSON body.

    Raises:
        RuntimeError: If the reply's ``errorCode`` is not ``0``.
    """
    ip = baseCore.get_proxy()
    log.info(f'当前使用的ip是{ip}')
    req = s.get(url, headers=headers, proxies=ip, timeout=(5, 10))
    try:
        dataJson = req.json()
    finally:
        # Release the connection even when the body is not valid JSON.
        req.close()
    if dataJson['errorCode'] != 0:
        # The original used a bare ``raise`` with no active exception, which
        # surfaces as an uninformative RuntimeError; keep the same exception
        # type but say what went wrong.
        raise RuntimeError(
            f"unexpected errorCode {dataJson['errorCode']!r} for {url}"
        )
    return dataJson['data']
def _cleanParagraphs(html):
    """Return *html* reduced to bare ``<p>`` elements that contain only text.

    Each ``<p>`` found in the input is copied into a fresh document as a new
    attribute-less ``<p>`` whose content is the original paragraph's text.
    A missing value (``None``) is treated as empty input.
    """
    source = BeautifulSoup(html or '', 'lxml')
    cleaned = BeautifulSoup('', 'lxml')
    for pTag in source.find_all('p'):
        newPTag = cleaned.new_tag('p')
        newPTag.string = pTag.text
        cleaned.append(newPTag)
    return str(cleaned)


def _changeRecordUrl(tycId, page, pageSize):
    """Build the change-record API URL for one result page of a company."""
    return (
        'https://capi.tianyancha.com/cloud-company-background/company/changeinfoEm'
        f'?gid={tycId}&pageNum={page}&pageSize={pageSize}&changeItem=-100'
    )


def doJob():
    """Collect Tianyancha change records for the companies queued in Redis.

    Opens a browser, logs in, then repeatedly pops an entry from the
    ``ChangeRecordEnterprise:gnqy_socialCode`` list, resolves the company's
    Tianyancha id and pages through its change-record API, turning each
    record into a dict (date, item, before, after, social credit code).
    On any failure the entry is pushed back to Redis. The browser is closed
    when the function exits.
    """
    redisKey = 'ChangeRecordEnterprise:gnqy_socialCode'
    pageSize = 10
    baseHeaders = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'version': 'TYC-Web',
    }
    driver = edge.create_driver()
    try:
        driver.get('https://www.tianyancha.com/')
        driver.maximize_window()
        while True:
            headers = dict(baseHeaders)
            driver, id_cookie, s, update_headers = login.login(driver)
            if not id_cookie:
                # Login did not yield a usable cookie; try again.
                continue
            headers.update(update_headers)
            # Pop the next queued entry (social credit code) from Redis,
            # waiting at most 2 seconds.
            info = baseCore.r.blpop([redisKey], 2)
            if not info:
                log.info('数据已全部采集完')
                time.sleep(60 * 60)
                continue
            # blpop returns (key, value); the value is bytes unless the client
            # decodes responses. The original called the misspelled
            # ``.deocde()``, which always raised AttributeError.
            rawInfo = info[1]
            info = rawInfo.decode() if isinstance(rawInfo, bytes) else rawInfo
            start = time.time()
            socialCreditCode = info.split('|')[0]
            try:
                tycId = getTycIdByDB(socialCreditCode, cursor, start, info, s)
                if not tycId:
                    continue
                # The first page also tells us how many records exist.
                datasJson = getJson(_changeRecordUrl(tycId, 1, pageSize), headers, s)
                total = datasJson['total']
                totalPage = int((total + pageSize - 1) // pageSize)
                dics = []
                for page in range(1, totalPage + 1):
                    if page != 1:
                        # Page 1 was already fetched above; only request the rest.
                        datasJson = getJson(_changeRecordUrl(tycId, page, pageSize), headers, s)
                    for dataJson in datasJson['result']:
                        dics.append({
                            "changeDate": dataJson['changeTime'],  # change time
                            "changeItem": dataJson['changeItem'],  # changed item
                            "changeBefore": _cleanParagraphs(dataJson['contentBefore']),  # before
                            "changeAfter": _cleanParagraphs(dataJson['contentAfter']),  # after
                            "socialCreditCode": socialCreditCode,  # social credit code
                        })
                    # Pause between pages.
                    time.sleep(5)
                if dics:
                    # NOTE(review): uploading the records is disabled in this
                    # revision; the original call was
                    # sendData('http://114.115.236.206:8088/sync/changeRecord', dics)
                    takeTime = baseCore.getTimeCost(start, time.time())
                    log.info(f'{socialCreditCode}=====耗时{takeTime}')
                else:
                    log.info(f'{socialCreditCode}===变更信息为空')
            except Exception as e:
                # NOTE(review): the meaning of status 3 is defined in
                # classtool.Token -- confirm.
                token.updateTokeen(id_cookie, 3)
                log.info(f'==={socialCreditCode}=====企业变更记录采集失败===重新放入redis====')
                log.info(e)
                # Put the entry back into Redis so it is retried later.
                baseCore.rePutIntoR(redisKey, info)
                time.sleep(5)
            # NOTE(review): the source had lost its indentation; this `break`
            # is reconstructed at loop level (stop after one processed entry,
            # which looks like a debugging aid). Confirm the intended placement.
            break
    finally:
        # Always shut the browser down so no driver process is left behind.
        if driver is not None:
            driver.quit()
# Script entry point: start the crawl only when this file is run directly,
# not when it is imported.
if __name__ == "__main__":
    doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论