提交 52e227da 作者: 薛凌堃

天眼查基本信息维护

上级 a86fe277
......@@ -49,8 +49,8 @@ class File():
class Token():
# 获取token
def getToken(self):
cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
# cursor.execute(f" select id, cookies from QCC_token")
# cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
cursor.execute(f" select id, cookies from QCC_token where id = 63")
# rows = cursor.fetchall()
# cnx.commit()
# if rows:
......
......@@ -81,15 +81,30 @@ def baseinfo(com_soup):
# print(info)
value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '')
if name == '法定代表人':
value = cominfo.find('a').text
try:
value = cominfo.find('a').text
except:
value = None
if name == '电话':
value = cominfo.find('span').text
try:
value = cominfo.find('span').text
except:
value = None
if name == '邮箱':
value = cominfo.find('a').text
try:
value = cominfo.find('a').text
except:
value = None
if name == '网址':
value = cominfo.find('a').text
try:
value = cominfo.find('a').text
except:
value = None
if name == '地址':
value = cominfo.find('span').text
try:
value = cominfo.find('span').text
except:
value = None
data[name] = value
# print("==================")
......@@ -141,7 +156,10 @@ def dic_handle(result_dic):
try:
Status = result_dic['经营状态']
except:
Status = None
try:
Status = result_dic['公司现状']
except:
Status = None
try:
StartDate = result_dic['成立日期']
......@@ -198,31 +216,31 @@ def dic_handle(result_dic):
except:
TaxpayerType = None
# try:
# SubIndustry = result_dic['国标行业']
# except:
# SubIndustry = ''
try:
region = result_dic['所属地区']
SubIndustry = result_dic['国标行业']
except:
region = None
try:
pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
matches = re.match(pattern, region)
Province = matches.group(1)
City = matches.group(2)
County = matches.group(3)
if Province is None:
for zxs in zxss:
if zxs in region:
Province = zxs
break
SubIndustry = None
except:
Province = None
City = None
County = None
# try:
# region = result_dic['所属地区']
# except:
# region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
try:
BelongOrg = result_dic['登记机关']
......@@ -285,11 +303,11 @@ def dic_handle(result_dic):
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': None, # 所属行业
'region': region,
'province': Province, # 所属省
'city': City, # 所属市
'county': County, # 所属县
'industry': SubIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
......@@ -326,7 +344,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
count = 0
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if social_code:
if social_code and 'ZZSN' not in social_code and 'ZD' not in social_code:
soup = checklogin(social_code)
else:
soup = checklogin(com_name)
......@@ -410,7 +428,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
# print(td_count)
td_list = tr.find_all('td')
td_count = len(td_list)
name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
# print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
......@@ -428,7 +445,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
# for value_tag in value_tag_list:
value_list.append(value_tag.text.replace('\xa0', ''))
# print(value_list)
if len(name_list) == len(value_list):
for i in range(len(name_list)):
......@@ -439,10 +455,30 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
if name_list[i] == '法定代表人':
value_list[i] = value_list[i].split('任职')[0]
dic_buseniss[name_list[i]] = value_list[i]
del dic_buseniss['天眼评分']
try:
del dic_buseniss['天眼评分']
except:
pass
# print(dic_buseniss)
result_dict = getinfo(dic_buseniss, data_baseinfo)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
result_dict['统一社会信用代码']
except:
# log.info('未获取到统一社会信用代码')
if social_code:
result_dict['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
result_dict['企业名称'] = result_dict['企业名称'][1:-1]
if result_dict['企业名称'] == '-' and com_name:
result_dict['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# print(result_dict)
# 采集成功的企业
data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
......@@ -460,9 +496,28 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType
# print(aa_dic)
sendkafka(aa_dic)
# print(aa_dic)
else:
data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
......@@ -479,11 +534,18 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType
sendkafka(aa_dic)
def remove_parentheses(text):
    """Return *text* with parentheses and ASCII spaces removed.

    Used to normalise company names before comparing them, so that names
    differing only in bracket style or spacing compare equal.

    Both full-width (Chinese) parentheses U+FF08 / U+FF09 and ASCII
    parentheses are stripped; the text between them is kept.

    Args:
        text: The string to normalise.

    Returns:
        The string without any parentheses or ASCII space characters.
    """
    # The previous pattern r'(|)' was a group with two empty alternatives:
    # it matched only the empty string, so full-width parentheses were never
    # removed. A character class handles both bracket styles in one pass.
    # The full-width characters are written as escapes so the pattern cannot
    # be silently altered by encoding or width normalisation of this file.
    text = re.sub(r'[\uFF08\uFF09()]', '', text)
    # Only the ASCII space is dropped (other whitespace is left untouched).
    return text.replace(' ', '')
# 判断名称是否统一
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = ''
try:
company_list = soup.find('div', class_='index_search-box__7YVh6')
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
......@@ -496,7 +558,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
company_url = info_t.find('a')['href']
break
......@@ -504,7 +565,13 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
break
else:
continue
jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
......@@ -512,30 +579,33 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else:
# 判断是否是曾用名
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
print(child['class'])
if 'index_name' in child['class']:
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
break
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
# tr = company_list[:1][0]
# info_t = tr.find('div', class_='index_name__qEdWi')
# getname = info_t.find('span').text
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
if getname:
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType,
ynDomestic, countryName, file_name)
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
else:
#没有搜到相同的企业名称
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业',data)
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
return True
......@@ -546,7 +616,7 @@ if __name__ == '__main__':
# driver, id_cookie = login()
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./国内企业基本信息采集情况.xlsx'
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
......@@ -564,8 +634,9 @@ if __name__ == '__main__':
s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '91110000710925016E||'
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name)
......@@ -592,26 +663,26 @@ if __name__ == '__main__':
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
continue
continue
# company_field_ = f'|{company_field}'
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[1].replace(' ', '')
# ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19]
# exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21]
ynDomestic = None
countryName = None
securitiesCode = None
securitiesShortName = None
listingDate = None
category = None
exchange = None
listType = None
com_name = company_field.split('|')[2].replace(' ', '')
ynDomestic = company_field.split('|')[15]
countryName = company_field.split('|')[16]
securitiesCode = company_field.split('|')[17]
securitiesShortName = company_field.split('|')[18]
listingDate = company_field.split('|')[21]
category = company_field.split('|')[19]
exchange = company_field.split('|')[20]
listType = company_field.split('|')[21]
# ynDomestic = None
# countryName = None
# securitiesCode = None
# securitiesShortName = None
# listingDate = None
# category = None
# exchange = None
# listType = None
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name)
......@@ -622,5 +693,5 @@ if __name__ == '__main__':
# 信息采集完成后将该企业的采集次数更新
# runType = 'BaseInfoRunCount'
# baseCore.updateRun(social_code, runType, count)
break
# break
baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论