提交 b923d30f 作者: 薛凌堃

中国100大企业基本信息

上级 1bb5b282
:: Launcher for baseinfo_dujiaoshou.py: sets up the conda environment, runs the
:: script, and leaves the console window open afterwards.

:: Label the console window so this job can be told apart from other running jobs.
title dujiaoshoubaseinfo
:: "call" is used so control returns to this script after the invoked script ends.
:: NOTE(review): presumably this is conda's own "activate" script (base env) and
:: must be reachable on PATH - confirm on the target machine.
call activate
:: Switch to the conda environment named "zzsn@3.8.0".
call conda activate zzsn@3.8.0
:: Run the script with whichever python the activated environment provides.
python baseinfo_dujiaoshou.py
:: Wait for a key press so the output stays visible after the script exits.
pause
\ No newline at end of file
...@@ -7,16 +7,17 @@ import requests ...@@ -7,16 +7,17 @@ import requests
import json import json
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore from BaseCore import BaseCore
from getQccId import find_id_by_name from getQccId import find_id_by_name
baseCore = BaseCore() baseCore = BaseCore()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
cursor_ = baseCore.cursor cursor_ = baseCore.cursor
log = baseCore.getLogger() log = baseCore.getLogger()
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 通过企查查id获取企业基本信息 # 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name,gpdm): def info_by_id(com_id,com_name):
aa_dict_list = [] aa_dict_list = []
t = str(int(time.time()) * 1000) t = str(int(time.time()) * 1000)
...@@ -31,7 +32,7 @@ def info_by_id(com_id,com_name,gpdm): ...@@ -31,7 +32,7 @@ def info_by_id(com_id,com_name,gpdm):
result_dict = resp_dict['result']['Company'] result_dict = resp_dict['result']['Company']
except: except:
log.info(com_name + ":获取失败===========重新放入redis") log.info(com_name + ":获取失败===========重新放入redis")
baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm',gpdm) baseCore.rePutIntoR('china100:baseinfo',com_name)
return aa_dict_list return aa_dict_list
company_name = result_dict['Name'] company_name = result_dict['Name']
...@@ -306,12 +307,12 @@ def info_by_id(com_id,com_name,gpdm): ...@@ -306,12 +307,12 @@ def info_by_id(com_id,com_name,gpdm):
} }
aa_dict_list.append(aa_dict) aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成") log.info(company_name + ":爬取完成")
return aa_dict_list return aa_dict_list
if __name__ == '__main__': if __name__ == '__main__':
taskType = '基本信息/企查查' taskType = '基本信息/企查查/中国100强'
headers = { headers = {
'Host': 'xcx.qcc.com', 'Host': 'xcx.qcc.com',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
...@@ -323,65 +324,97 @@ if __name__ == '__main__': ...@@ -323,65 +324,97 @@ if __name__ == '__main__':
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html', 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,' 'Accept-Encoding': 'gzip, deflate, br,'
} }
list_weicha = []
name_list = []
#从redis里拿数据 #从redis里拿数据
while True: while True:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得 # TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = 'b4eb43143abdcf395f1335f322ca29e5' token = baseCore.GetToken()
list_weicha = [] dataList = []
list_all_info = [] if token:
name_list = [] pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm') social_code = baseCore.redicPullData('china100:baseinfo')
com_code = '873349'
if '.NQ' in com_code:
com_code1 = com_code
else:
com_code1 = com_code + '.NQ'
company_id = find_id_by_name(start_time,token,com_code) # com_name = '卓新市万达铸业有限公司'
if social_code == '' or social_code is None:
if not company_id:
log.info(com_code + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_code + ":企业ID获取失败")
baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm',com_code)
log.info('-----已重新放入redis-----')
time.sleep(20) time.sleep(20)
continue continue
if '搜索不到' in social_code:
continue
else: else:
log.info(f'====={com_code}===={company_id}=====获取企业id成功=====') pass
# todo:企查查id写入gpdm表中 dic_info = baseCore.getInfomation(social_code)
updateSql = f"update gpdm set QCCID = '{company_id}' where gpdm = '{com_code}'" log.info(f'----当前企业{social_code}--开始处理---')
cursor_.execute(updateSql)
cnx_.commit() com_name = dic_info[1]
#企查查id
company_id = dic_info[3]
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if company_id == None or company_id == False:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
if company_id == 'null':
log.info('=====搜索不到该企业====')
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
baseCore.rePutIntoR('china100:baseinfo', social_code + ':搜索不到')
continue
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('china100:baseinfo',com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# todo:写入数据库
updateqccid = f"update China100 set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_.execute(updateqccid)
cnx_.commit()
try: try:
post_data_list = info_by_id(company_id, '',com_code1) post_data_list = info_by_id(company_id, com_name)
except: except:
log.info(f'====={com_code}=====获取基本信息失败,重新放入redis=====') log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm', com_code) baseCore.rePutIntoR('china100:baseinfo', com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
continue continue
if post_data_list: if post_data_list:
pass pass
else: else:
log.info(f'======{com_code}====企查查token失效====') # log.info(f'======{social_code}====企查查token失效====')
time.sleep(20) time.sleep(20)
continue continue
for post_data in post_data_list: for post_data in post_data_list:
list_all_info.append(post_data) # list_all_info.append(post_data)
if post_data is None: if post_data is None:
print(com_code + ":企业信息获取失败") print(com_name + ":企业信息获取失败")
list_weicha.append(com_code + ":企业信息获取失败") list_weicha.append(com_name + ":企业信息获取失败")
continue continue
get_name = post_data['name'] get_name = post_data['name']
get_socialcode = post_data['socialCreditCode'] get_socialcode = post_data['socialCreditCode']
#todo:将信用代码更新到表中
updatesocialcode = f"update China100 set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_.execute(updatesocialcode)
cnx_.commit()
name_compile = { name_compile = {
'yuan_name':com_code, 'yuan_name':com_name,
'get_name':get_name 'get_name':get_name
} }
name_list.append(name_compile) name_list.append(name_compile)
# dataList.append(post_data)
log.info(f'采集{com_code}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}') baseCore.writerToExcel(name_list,'中国100强企业.xlsx')
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try: try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2)) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8')) kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
...@@ -392,13 +425,9 @@ if __name__ == '__main__': ...@@ -392,13 +425,9 @@ if __name__ == '__main__':
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception) baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败") log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# 信息采集完成后将该企业的采集次数更新
# break
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
......
...@@ -5,21 +5,43 @@ import time ...@@ -5,21 +5,43 @@ import time
from urllib.parse import quote from urllib.parse import quote
import requests import requests
import urllib3 import urllib3
from base.BaseCore import BaseCore from BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers = { headers = {
'Host': 'xcx.qcc.com', 'Host': 'xcx.qcc.com',
'Connection': 'keep-alive', 'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin', 'x-request-device-type': 'Android',
'Qcc-Timestamp': '', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0', 'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat', 'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'content-type': 'application/json', 'xweb_xhr': '1',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html', 'xcx-version': '2023.09.27',
'Accept-Encoding': 'gzip, deflate, br,' 'Qcc-Platform': 'mp-weixin',
} 'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id # 通过企业名称或信用代码获取企查查id
def find_id_by_name(start,token,name): def find_id_by_name(start,token,name):
urllib3.disable_warnings() urllib3.disable_warnings()
...@@ -32,8 +54,8 @@ def find_id_by_name(start,token,name): ...@@ -32,8 +54,8 @@ def find_id_by_name(start,token,name):
try: try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json() resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break break
except: except Exception as e:
print('重试') print(f'{e}-------------重试')
time.sleep(5) time.sleep(5)
continue continue
time.sleep(2) time.sleep(2)
...@@ -46,19 +68,23 @@ def find_id_by_name(start,token,name): ...@@ -46,19 +68,23 @@ def find_id_by_name(start,token,name):
KeyNo = False KeyNo = False
log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}') log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo return KeyNo
if resp_dict['status']==40102:
KeyNo = False
log.info(f'=======无效的session=====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
try: try:
if resp_dict['result']['Result']: if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0] result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo'] KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip() Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '': if Name == '':
KeyNo = '' KeyNo = 'null'
else: else:
KeyNo = '' KeyNo = 'null'
except: except:
KeyNo = False KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}') log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
return KeyNo return KeyNo
print("{},企业代码为:{}".format(qcc_key, KeyNo)) log.info("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo return KeyNo
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论