提交 52e227da 作者: 薛凌堃

天眼查基本信息维护

上级 a86fe277
...@@ -49,8 +49,8 @@ class File(): ...@@ -49,8 +49,8 @@ class File():
class Token(): class Token():
# 获取token # 获取token
def getToken(self): def getToken(self):
cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1") # cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
# cursor.execute(f" select id, cookies from QCC_token") cursor.execute(f" select id, cookies from QCC_token where id = 63")
# rows = cursor.fetchall() # rows = cursor.fetchall()
# cnx.commit() # cnx.commit()
# if rows: # if rows:
......
...@@ -81,15 +81,30 @@ def baseinfo(com_soup): ...@@ -81,15 +81,30 @@ def baseinfo(com_soup):
# print(info) # print(info)
value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '') value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '')
if name == '法定代表人': if name == '法定代表人':
value = cominfo.find('a').text try:
value = cominfo.find('a').text
except:
value = None
if name == '电话': if name == '电话':
value = cominfo.find('span').text try:
value = cominfo.find('span').text
except:
value = None
if name == '邮箱': if name == '邮箱':
value = cominfo.find('a').text try:
value = cominfo.find('a').text
except:
value = None
if name == '网址': if name == '网址':
value = cominfo.find('a').text try:
value = cominfo.find('a').text
except:
value = None
if name == '地址': if name == '地址':
value = cominfo.find('span').text try:
value = cominfo.find('span').text
except:
value = None
data[name] = value data[name] = value
# print("==================") # print("==================")
...@@ -141,7 +156,10 @@ def dic_handle(result_dic): ...@@ -141,7 +156,10 @@ def dic_handle(result_dic):
try: try:
Status = result_dic['经营状态'] Status = result_dic['经营状态']
except: except:
Status = None try:
Status = result_dic['公司现状']
except:
Status = None
try: try:
StartDate = result_dic['成立日期'] StartDate = result_dic['成立日期']
...@@ -198,31 +216,31 @@ def dic_handle(result_dic): ...@@ -198,31 +216,31 @@ def dic_handle(result_dic):
except: except:
TaxpayerType = None TaxpayerType = None
# try:
# SubIndustry = result_dic['国标行业']
# except:
# SubIndustry = ''
try: try:
region = result_dic['所属地区'] SubIndustry = result_dic['国标行业']
except: except:
region = None SubIndustry = None
try:
pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
matches = re.match(pattern, region)
Province = matches.group(1)
City = matches.group(2)
County = matches.group(3)
if Province is None:
for zxs in zxss:
if zxs in region:
Province = zxs
break
except: # try:
Province = None # region = result_dic['所属地区']
City = None # except:
County = None # region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
try: try:
BelongOrg = result_dic['登记机关'] BelongOrg = result_dic['登记机关']
...@@ -285,11 +303,11 @@ def dic_handle(result_dic): ...@@ -285,11 +303,11 @@ def dic_handle(result_dic):
'businessStartDate': TermStart, # 营业期限自 'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至 'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质 'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': None, # 所属行业 'industry': SubIndustry, # 所属行业
'region': region, 'region': None,
'province': Province, # 所属省 'province': None, # 所属省
'city': City, # 所属市 'city': None, # 所属市
'county': County, # 所属县 'county': None, # 所属县
'registerDepartment': BelongOrg, # 登记机关 'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模 'scale': Info, # 人员规模
'insured': can_bao, # 参保人数 'insured': can_bao, # 参保人数
...@@ -326,7 +344,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin ...@@ -326,7 +344,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
log.info(f'----当前企业{social_code}-{com_name}--开始处理---') log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
count = 0 count = 0
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码 # 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if social_code: if social_code and 'ZZSN' not in social_code and 'ZD' not in social_code:
soup = checklogin(social_code) soup = checklogin(social_code)
else: else:
soup = checklogin(com_name) soup = checklogin(com_name)
...@@ -410,7 +428,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -410,7 +428,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
# print(td_count) # print(td_count)
td_list = tr.find_all('td') td_list = tr.find_all('td')
td_count = len(td_list) td_count = len(td_list)
name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0] name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
# print(name_list) # print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0] # value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
...@@ -428,7 +445,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -428,7 +445,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y') tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
# for value_tag in value_tag_list: # for value_tag in value_tag_list:
value_list.append(value_tag.text.replace('\xa0', '')) value_list.append(value_tag.text.replace('\xa0', ''))
# print(value_list) # print(value_list)
if len(name_list) == len(value_list): if len(name_list) == len(value_list):
for i in range(len(name_list)): for i in range(len(name_list)):
...@@ -439,10 +455,30 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -439,10 +455,30 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
if name_list[i] == '法定代表人': if name_list[i] == '法定代表人':
value_list[i] = value_list[i].split('任职')[0] value_list[i] = value_list[i].split('任职')[0]
dic_buseniss[name_list[i]] = value_list[i] dic_buseniss[name_list[i]] = value_list[i]
del dic_buseniss['天眼评分'] try:
del dic_buseniss['天眼评分']
except:
pass
# print(dic_buseniss) # print(dic_buseniss)
result_dict = getinfo(dic_buseniss, data_baseinfo) result_dict = getinfo(dic_buseniss, data_baseinfo)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
result_dict['统一社会信用代码']
except:
# log.info('未获取到统一社会信用代码')
if social_code:
result_dict['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
result_dict['企业名称'] = result_dict['企业名称'][1:-1]
if result_dict['企业名称'] == '-' and com_name:
result_dict['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# print(result_dict) # print(result_dict)
# 采集成功的企业 # 采集成功的企业
data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']] data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
...@@ -460,9 +496,28 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -460,9 +496,28 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType aa_dic['listingType'] = listType
# print(aa_dic) # print(aa_dic)
sendkafka(aa_dic) sendkafka(aa_dic)
# print(aa_dic)
else: else:
data_baseinfo = baseinfo(com_soup) data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业 # 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']] data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data) file.appenddata(file_name, '获取基本信息成功企业', data)
...@@ -479,11 +534,18 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -479,11 +534,18 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType aa_dic['listingType'] = listType
sendkafka(aa_dic) sendkafka(aa_dic)
def remove_parentheses(text):
    """Return *text* with round brackets and ASCII spaces removed.

    Both ASCII parentheses ``(`` ``)`` and full-width (CJK) parentheses
    U+FF08 / U+FF09 are stripped, then every ASCII space is dropped, so
    that names differing only in bracket style or spacing normalise to
    the same string.

    Args:
        text: The string to normalise.

    Returns:
        A copy of ``text`` without any of the characters listed above.
    """
    # The earlier full-width pattern r'(|)' was an empty alternation group:
    # it only ever matched the empty string, so full-width brackets were
    # left in place. A single character class handles both bracket styles;
    # the full-width ones are written as escapes to survive re-encoding.
    text = re.sub(r'[()\uff08\uff09]', '', text)
    return text.replace(' ', '')
# 判断名称是否统一 # 判断名称是否统一
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name): def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = '' company_url = ''
try: try:
company_list = soup.find('div', class_='index_search-box__7YVh6') company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except: except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====') log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field) baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
...@@ -496,7 +558,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -496,7 +558,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
info_t = compamy.find('div', class_='index_name__qEdWi') info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}') log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname: if receptname and getname == receptname:
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
break break
...@@ -504,7 +565,13 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -504,7 +565,13 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
break break
else: else:
continue jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url: if company_url:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html' # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html' # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
...@@ -512,30 +579,33 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -512,30 +579,33 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
# 判断是否是曾用名 # 判断是否是曾用名
getname = ''
for child in company_list[0].find_all(): for child in company_list[0].find_all():
if child.has_attr('class'): if child.has_attr('class'):
print(child['class']) print(child['class'])
if 'index_name' in child['class']: if 'index_name' in child['class'][0]:
getname = child.text getname = child.text
company_url = child.find('a')['href'] company_url = child.find('a')['href']
break break
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
# tr = company_list[:1][0] # tr = company_list[:1][0]
# info_t = tr.find('div', class_='index_name__qEdWi') # info_t = tr.find('div', class_='index_name__qEdWi')
# getname = info_t.find('span').text # getname = info_t.find('span').text
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}') if getname:
beforename = ifbeforename(company_url) log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if beforename == receptname: beforename = ifbeforename(company_url)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name) if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType,
ynDomestic, countryName, file_name)
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
else: else:
#没有搜到相同的企业名称 # 没有搜到相同的企业名称
data = [com_name, social_code] data = [com_name, social_code]
file.appenddata(file_name, '需处理企业',data) file.appenddata(file_name, '需处理企业', data)
time.sleep(2) time.sleep(2)
return False return False
return True return True
...@@ -546,7 +616,7 @@ if __name__ == '__main__': ...@@ -546,7 +616,7 @@ if __name__ == '__main__':
# driver, id_cookie = login() # driver, id_cookie = login()
while True: while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8] nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./国内企业基本信息采集情况.xlsx' file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name) file.createFile(file_name)
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
...@@ -564,8 +634,9 @@ if __name__ == '__main__': ...@@ -564,8 +634,9 @@ if __name__ == '__main__':
s.cookies.update(cookies) s.cookies.update(cookies)
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '91110000710925016E||' # company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name) baseCore.sendEmail(file_name)
...@@ -592,26 +663,26 @@ if __name__ == '__main__': ...@@ -592,26 +663,26 @@ if __name__ == '__main__':
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field) baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
continue continue
continue continue
# company_field_ = f'|{company_field}'
social_code = company_field.split('|')[0] social_code = company_field.split('|')[0]
com_name = company_field.split('|')[1].replace(' ', '') com_name = company_field.split('|')[2].replace(' ', '')
# ynDomestic = company_field.split('|')[15] ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16] countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17] securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18] securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21] listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19] category = company_field.split('|')[19]
# exchange = company_field.split('|')[20] exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21] listType = company_field.split('|')[21]
ynDomestic = None # ynDomestic = None
countryName = None # countryName = None
securitiesCode = None # securitiesCode = None
securitiesShortName = None # securitiesShortName = None
listingDate = None # listingDate = None
category = None # category = None
exchange = None # exchange = None
listType = None # listType = None
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name) listType, ynDomestic, countryName, file_name)
...@@ -622,5 +693,5 @@ if __name__ == '__main__': ...@@ -622,5 +693,5 @@ if __name__ == '__main__':
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
# runType = 'BaseInfoRunCount' # runType = 'BaseInfoRunCount'
# baseCore.updateRun(social_code, runType, count) # baseCore.updateRun(social_code, runType, count)
break # break
baseCore.close() baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论