提交 cdc4a715 作者: 薛凌堃

独角兽榜单基本信息

上级 41c6aaa2
...@@ -9,19 +9,14 @@ import json ...@@ -9,19 +9,14 @@ import json
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
from getQccId import find_id_by_name from getQccId import find_id_by_name
from base.BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
cursor_ = baseCore.cursor cursor_ = baseCore.cursor
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger() log = baseCore.getLogger()
# 通过企查查id获取企业基本信息 # 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name): def info_by_id(com_id,com_name,social_code):
aa_dict_list = [] aa_dict_list = []
t = str(int(time.time()) * 1000) t = str(int(time.time()) * 1000)
...@@ -29,14 +24,17 @@ def info_by_id(com_id,com_name): ...@@ -29,14 +24,17 @@ def info_by_id(com_id,com_name):
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id) url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json() resp_dict = requests.get(url=url, headers=headers, verify=False).json()
log.info(resp_dict)
time.sleep(2) time.sleep(2)
com_jc_name = '' com_jc_name = ''
try: try:
result_dict = resp_dict['result']['Company'] result_dict = resp_dict['result']['Company']
except: except:
print(com_name + ":获取失败") log.info(com_name + ":获取失败===========重新放入redis")
# baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode',social_code)
return aa_dict_list
company_name = result_dict['Name'] company_name = result_dict['Name']
CreditCode = result_dict['CreditCode'] CreditCode = result_dict['CreditCode']
if CreditCode is None: if CreditCode is None:
...@@ -309,11 +307,12 @@ def info_by_id(com_id,com_name): ...@@ -309,11 +307,12 @@ def info_by_id(com_id,com_name):
} }
aa_dict_list.append(aa_dict) aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成") log.info(company_name + ":爬取完成")
return aa_dict_list return aa_dict_list
if __name__ == '__main__': if __name__ == '__main__':
taskType = '基本信息/企查查/福布斯' taskType = '基本信息/企查查/单项双百企业冠军'
headers = { headers = {
'Host': 'xcx.qcc.com', 'Host': 'xcx.qcc.com',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
...@@ -325,54 +324,73 @@ if __name__ == '__main__': ...@@ -325,54 +324,73 @@ if __name__ == '__main__':
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html', 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,' 'Accept-Encoding': 'gzip, deflate, br,'
} }
list_weicha = []
name_list = []
#从redis里拿数据 #从redis里拿数据
while True: while True:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = baseCore.GetToken() token = baseCore.GetToken()
list_weicha = [] dataList = []
list_all_info = [] if token:
name_list = [] pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('BaseInfoEnterpriseFbs:gnqy_social_code') # social_code = baseCore.redicPullData('dujs_1020:baseinfo_socialcode')
# social_code = '91110000710924945A' social_code = '91310115067758342E'
if social_code is None: if social_code == '' or social_code is None:
time.sleep(20) time.sleep(20)
continue continue
log.info(f'----当前企业{social_code}-----')
dic_info = baseCore.getInfomation(social_code) dic_info = baseCore.getInfomation(social_code)
# log.info(f'----当前企业{social_code}--开始处理---')
count = dic_info[13]
count = dic_info[14]
com_name = dic_info[1] com_name = dic_info[1]
social_code = dic_info[2] social_code = dic_info[2]
# 企查查id # 企查查id
company_id = dic_info[12] company_id = dic_info[12]
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码 #如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if company_id == None: if company_id == None:
if social_code: if social_code:
company_id = find_id_by_name(start_time, token, social_code) company_id = find_id_by_name(start_time,token,social_code)
else: else:
company_id = find_id_by_name(start_time, token, com_name) company_id = find_id_by_name(start_time,token,com_name)
# todo:写入数据库 if company_id == 'null':
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'" log.info('=====搜索不到该企业====')
cursor_.execute(updateSql) #todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
cnx_.commit() baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode', social_code + ':搜索不到')
post_data_list = info_by_id(company_id, com_name)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
continue continue
else: if not company_id:
log.info(f'====={social_code}===={company_id}=====获取企业id成功=====') log.info(social_code + ":企业ID获取失败===重新放入redis")
try: list_weicha.append(social_code + ":企业ID获取失败")
post_data_list = info_by_id(company_id, com_name) baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode',social_code)
except: baseCore.delete_token(token)
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====') log.info('=====已重新放入redis,失效token已删除======')
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gnqy_social_code', social_code) time.sleep(20)
continue continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# todo:写入数据库
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name, social_code)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('dujs_1020:baseinfo_socialcode', social_code)
continue
if post_data_list:
pass
else:
log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list: for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None: if post_data is None:
print(com_name + ":企业信息获取失败") print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败") list_weicha.append(com_name + ":企业信息获取失败")
...@@ -396,17 +414,18 @@ if __name__ == '__main__': ...@@ -396,17 +414,18 @@ if __name__ == '__main__':
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception) baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败") log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# break
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
runType = 'BaseInfoRunCount' runType = 'BaseInfoRunCount'
count += 1 count += 1
baseCore.updateRun(social_code, runType, count) baseCore.updateRun(social_code, runType, count)
nowtime = baseCore.getNowTime(1).replace('-', '_')[:10] break
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list) companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx', index=False) companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha) false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx', index=False) false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
baseCore.close()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论