提交 621196c7 作者: 薛凌堃

天眼查企业动态

上级 80e7804c
......@@ -458,6 +458,7 @@ class BaseCore:
print(e)
self.cnx.commit()
#获取企查查token
def GetToken(self):
#获取企查查token
query = "select token from QCC_token "
......@@ -476,6 +477,7 @@ class BaseCore:
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
......@@ -488,4 +490,8 @@ class BaseCore:
combined_data.to_excel(filename, index=False)
# return combined_data
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, item, key='NewsEnterprise:gwqy_socialCode'):
    """Push ``item`` back onto the tail of a Redis list.

    Used to re-queue an enterprise whose collection failed or was
    interrupted.

    Args:
        item: Value to re-queue (callers pass the enterprise's social
            credit code).
        key: Name of the Redis list to append to. Defaults to the queue
            this method originally hard-coded, so existing
            ``rePutIntoR(item)`` calls behave exactly as before.
    """
    # RPUSH appends at the tail of the list stored at ``key``.
    self.r.rpush(key, item)
"""
增量采集:
取state为3、update_state为空的企业 表示上次采集成功的企业,
新增update_state字段,取一个企业更新为2,表示该企业正在采集。
采集完毕更新为1.
表示已经采集完成。根据date_time 来排列 每次就不会拿到重复的数据。
okCount
errorCount
repectCount
新增三个字段分别对应更新的up_okCount up_errorCount up_repectCount ,
记录这些更新的数据 然后加到原来的数据上表示该企业已采集多少动态
8.8日改版,企业动态也传kafka
"""
import json
import requests, time, pymysql
import jieba
import sys
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
from base.BaseCore import BaseCore
from base.smart import smart_extractor
# sys.path.append('D:\\KK\\zzsn_spider\\base')
......@@ -53,8 +38,8 @@ headers = {
taskType = '企业动态/天眼查'
def beinWork(tyc_code, social_code):
start_time = time.time()
def beinWork(tyc_code, social_code,start_time):
time.sleep(3)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
......@@ -230,12 +215,13 @@ def beinWork(tyc_code, social_code):
'sid': '1684032033495392257',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
'title': contentText,
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
log.info(f'传输失败:{social_code}----{link}')
e = '数据库传输失败'
state = 0
......@@ -263,6 +249,7 @@ def beinWork(tyc_code, social_code):
baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
......@@ -276,8 +263,6 @@ def beinWork(tyc_code, social_code):
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
......@@ -295,30 +280,49 @@ def doJob():
if social_code == 'None':
time.sleep(20)
continue
data = baseCore.getInfomation(social_code)
id = data[0]
xydm = data[2]
tycid = data[11]
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
start = time.time()
try:
data = baseCore.getInfomation(social_code)
id = data[0]
xydm = data[2]
tycid = data[11]
if tycid == None:
try:
retData = getTycIdByXYDM(xydm)
tycid = retData['tycData']['id']
#todo:写入数据库
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR(social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
# updateBeginSql = f"update ssqy_tyc set update_state=2,date_time=now() where id={id}"
# cursor.execute(updateBeginSql)
# cnx.commit()
# updateBeginSql = f"update ssqy_tyc set update_state=2,date_time=now() where id={id}"
# cursor.execute(updateBeginSql)
# cnx.commit()
# 开始采集企业动态
retData = beinWork(tycid, xydm,start_time)
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
except:
log.info(f'==={social_code}=====获取企业信息失败====')
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取企业信息失败')
# 开始采集企业动态
retData = beinWork(tycid, xydm)
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
cursor.close()
cnx.close()
......@@ -328,4 +332,6 @@ def doJob():
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论