提交 d711ce98 作者: 薛凌堃

补采天眼查企业动态

上级 f424eb1b
# Company-news collector: pulls target companies from Redis, scrapes Tianyancha news.
import json
import random
import requests, time, pymysql
import jieba
import sys
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
# Requests are made with verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Warm up jieba's dictionary at import time so the first real cut is fast.
jieba.cut("必须加载jieba")
# Initialize the smart article extractor for Chinese-language pages.
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# NOTE(review): hardcoded DB credentials in source — should be moved to config/env.
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
# Page size used by the Tianyancha news API (ps= query parameter).
pageSize = 10
log.info(f'======================当前脚本进程为{baseCore.getPID()}==============================')
# Browser-like headers for capi.tianyancha.com; User-Agent is rotated per request.
# NOTE(review): X-AUTH-TOKEN is a hardcoded JWT with an expiry — it will go stale.
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5NzE5MDMwMywiZXhwIjoxNjk5NzgyMzAzfQ.awXuS-59RzK35r0aUJq4Rj83JzyAOvsdUfL_ojp66CVQMjlLv_ZDD9g5gCoZKE21LN1JYRMLNZhuWsHhxapROw',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# Second DB connection pair owned by BaseCore (used for brpa_source_article_news).
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Task label written into the run/audit log for every recordLog call.
taskType = '企业动态/天眼查/补采20W+'
def beinWork(tyc_code, social_code, start_time):
    """Collect the news list for one company from Tianyancha and ingest it.

    Fetches the paged news API for ``tyc_code``, skips articles already in
    ``brpa_source_article_news``, extracts article bodies with the smart
    extractor, stores new rows, and forwards each article to Kafka.

    :param tyc_code: Tianyancha company id (used in the API URL).
    :param social_code: unified social credit code (DB/Kafka key).
    :param start_time: epoch seconds when processing started (for logs).
    :return: dict with 'total', 'up_okCount', 'up_errorCount',
             'up_repetCount' (and 'state': True when total is 0).
    """
    time.sleep(3)
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
    t = time.time()
    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
    response = None
    try:
        # FIX: the original loop had no try/except, so the first network error
        # aborted instead of retrying; now it retries up to 3 times like the
        # per-page loop below.
        for m in range(0, 3):
            try:
                ip = baseCore.get_proxy()
                headers['User-Agent'] = baseCore.getRandomUserAgent()
                response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
                time.sleep(random.randint(3, 5))
                break
            except:
                pass
        # If all attempts failed, response is None and this raises into the
        # except branch below (same error path as the original).
        if (response.status_code == 200):
            pass
    except Exception as e:
        log.error(f"{tyc_code}-----获取总数接口失败")
        error = '获取总数接口失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
        return retData
    try:
        json_1 = json.loads(response.content.decode('utf-8'))
        total = json_1['data']['total']
    except:
        log.error(f"{tyc_code}-----获取总数失败")
        e = '获取总数失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
        return retData
    if (total > 0):
        # Ceiling division: number of pages at pageSize items per page.
        if (total % pageSize == 0):
            totalPage = total // pageSize
        else:
            totalPage = total // pageSize + 1
    else:
        log.error(f"{tyc_code}--------总数为0")
        retData['state'] = True
        return retData
    log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
    retData['total'] = total
    up_okCount = 0
    up_errorCount = 0
    up_repetCount = 0
    for num in range(1, totalPage + 1):
        time.sleep(3)
        log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
        start_page = time.time()
        url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
        response_page = None
        for m in range(0, 3):
            try:
                ip = baseCore.get_proxy()
                headers['User-Agent'] = baseCore.getRandomUserAgent()
                response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
                time.sleep(1)
                break
            except:
                pass
        # FIX: guard against response_page being unbound/None when all three
        # attempts failed (the original raised an uncaught NameError here).
        if response_page is not None and response_page.status_code == 200:
            pass
        else:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            e = '获取分页数据失败'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
            up_errorCount = up_errorCount + pageSize
            continue
        try:
            json_page = json.loads(response_page.content.decode('utf-8'))
            info_list_page = json_page['data']['items']
        except:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            e = '获取分页数据失败'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
            up_errorCount = up_errorCount + pageSize
            continue
        pageIndex = 0
        for info_page in info_list_page:
            pageIndex = pageIndex + 1
            title = info_page['title']
            source = info_page['website']
            link = info_page['uri']
            # Dedup check: skip articles already stored for this company.
            try:
                sel_sql = '''select social_credit_code from brpa_source_article_news where source_address = %s and social_credit_code=%s and type='2' '''
                cursor_.execute(sel_sql, (link, social_code))
            except Exception as e:
                print(e)
            selects = cursor_.fetchone()
            if selects:
                log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
                # TODO: if this item exists, everything after it was already
                # collected — the function could return early here.
                retData['up_okCount'] = up_okCount
                retData['up_errorCount'] = up_errorCount
                retData['up_repetCount'] = up_repetCount
                # return retData
                continue
            # 'rtm' is a millisecond epoch timestamp; fall back to "now" when
            # missing or malformed.
            try:
                time_struct = time.localtime(int(info_page['rtm'] / 1000))
                time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct)
            except:
                time_format = baseCore.getNowTime(1)
            try:
                # FIX: fetch the article once and reuse the result — the
                # original called extract_by_url(link) twice, downloading the
                # page twice.
                article = smart.extract_by_url(link)
                contentText = article.text          # body with HTML tags
                content = article.cleaned_text      # plain-text body
            except Exception as e:
                contentText = ''
            if contentText == '':
                log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
                e = '获取正文失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
                up_errorCount = up_errorCount + 1
                # FIX: parameterized insert — the original interpolated scraped
                # title/abstract into the SQL string (injection / quote breakage).
                try:
                    insert_err_sql = "insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values(%s,%s,%s,%s,%s,%s,now(),1,%s,%s)"
                    cursor.execute(insert_err_sql, (social_code, source, link, title, time_format, info_page['abstracts'], num, pageIndex))
                    cnx.commit()
                except:
                    pass
                continue
            try:
                insert_sql = '''insert into brpa_source_article_news(social_credit_code,source_address,origin,type,publish_time,content,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
                up_okCount = up_okCount + 1
                list_info = [
                    social_code,
                    link,
                    f'天眼查-{source}',
                    '2',
                    time_format,
                    content[:500]
                ]
                cursor_.execute(insert_sql, tuple(list_info))
                cnx_.commit()
                # Log one line per stored article so per-company progress is visible.
                log.info(f'{social_code}----{link}:新增一条')
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Payload forwarded to Kafka after the DB insert succeeds.
                dic_news = {
                    'attachmentIds': '',
                    'author': '',
                    'content': content,
                    'contentWithTag': contentText,
                    'createDate': time_now,
                    'deleteFlag': '0',
                    'id': '',
                    'keyWords': '',
                    'lang': 'zh',
                    'origin': source,
                    'publishDate': time_format,
                    # 'sid': '1684032033495392257',
                    'sid': '1714852232679067649',
                    'sourceAddress': link,  # original article URL
                    'summary': info_page['abstracts'],
                    'title': title,
                    'type': 2,
                    'socialCreditCode': social_code,
                    'year': time_format[:4]
                }
            except Exception as e:
                log.info(f'传输失败:{social_code}----{link}')
                error = '数据库传输失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
                continue
            # Forward the record to Kafka.
            # NOTE(review): a new KafkaProducer per message is expensive —
            # consider reusing one producer; kept as-is to preserve the
            # per-message failure isolation of the original.
            try:
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("researchReportTopic",
                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, '成功')
            except Exception as e:
                dic_result = {
                    'success': 'false',
                    'message': '操作失败',
                    'code': '204',
                    'e': e
                }
                log.error(dic_result)
                e = 'Kafka操作失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
        log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
    retData['up_okCount'] = up_okCount
    retData['up_errorCount'] = up_errorCount
    retData['up_repetCount'] = up_repetCount
    return retData
# 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
def doJob():
    """Main worker loop: pull social credit codes from Redis and collect news.

    For each code popped from 'NewsEnterprise:gnqybc_socialCode' it looks up
    the company record, resolves a missing Tianyancha id via getTycIdByXYDM,
    runs beinWork() to collect the company's news, and bumps the run counter.
    On any failure the code is pushed back into Redis for a later retry.
    Runs forever (sleeps 20s when the queue is empty).
    """
    while True:
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        # social_code = '91440300665899831W'
        # Queue drained — wait and poll again.
        if social_code == None:
            time.sleep(20)
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                pass
            else:
                # No company record found — requeue the code for later.
                baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                continue
            id = data[0]
            xydm = data[2]
            tycid = data[11]
            # Resolve the Tianyancha id when the record doesn't have one yet.
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # FIX: parameterized update — the original interpolated
                        # values into the SQL string via an f-string.
                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
                        cursor_.execute(updateSql, (tycid, xydm))
                        cnx_.commit()
                    elif not retData['tycData'] and retData['reput']:
                        # Lookup ran but found nothing — log and requeue.
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        # Lookup says the company should not be retried — drop it.
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始处理")
            start_time = time.time()
            # Collect this company's news feed.
            retData = beinWork(tycid, xydm, start_time)
            # Bump the per-company run counter after collection.
            runType = 'NewsRunCount'
            count += 1
            baseCore.updateRun(social_code, runType, count)
            total = retData['total']
            up_okCount = retData['up_okCount']
            up_errorCount = retData['up_errorCount']
            up_repetCount = retData['up_repetCount']
            log.info(
                f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
        except Exception as e:
            log.info(f'==={social_code}=====获取企业信息失败====')
            # Requeue so the company is retried later.
            baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # Unreachable while the loop above runs forever; kept for a clean shutdown
    # path if the loop is ever given an exit condition.
    cursor.close()
    cnx.close()
    # Release BaseCore-held resources (its DB connection, etc.).
    baseCore.close()
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论