Commit c71cd037 Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

@@ -22,6 +22,9 @@ import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log = baseCore.getLogger()
from classtool import Token, File, Tag
@@ -57,270 +60,6 @@ def sendkafka(post_data):
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
# Merge the basic-info and business-registration fields
def getinfo(dict1, dict2):
# Collect the key sets of the two dicts
keys1 = set(dict1.keys())
keys2 = set(dict2.keys())
# Take the union of the keys
union_keys = keys1 | keys2
# For every key in the union, pick a value from the two dicts to form the merged dict
result_dict = {key: dict1.get(key, None) or dict2.get(key, None) for key in union_keys}
return result_dict
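# A quick illustration of getinfo's precedence (hypothetical values): dict1's value
# wins unless it is falsy, in which case dict2's value fills the gap.
#   getinfo({'电话': None, '地址': '上海市'}, {'电话': '021-123', '地址': '北京市'})
#   -> {'电话': '021-123', '地址': '上海市'}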
# Fetch the basic information
def baseinfo(com_soup):
baseinfo = com_soup.find('div', class_='index_detail__JSmQM')
cominfo_list = baseinfo.find_all('div', class_='index_detail-info-item__oAOqL') #name
data = {}
for cominfo in cominfo_list:
name = cominfo.find('span', class_='index_detail-label__oRf2J').text.replace(':', '').replace(' ', '')
# print(name)
tag.deletep(cominfo, 'span', 'class', 'index_detail-label__oRf2J')
tag.deletep(cominfo, 'i', 'class', 'index_detail-text-desc__myXYK')
# print(info)
value = cominfo.text.replace('\ue657', '').replace('\ue655', '')  # strip private-use icon glyphs
# Fields whose value lives in an <a> tag
if name in ('法定代表人', '邮箱', '网址'):
try:
value = cominfo.find('a').text
except:
value = None
# Fields whose value lives in a <span> tag
if name in ('电话', '地址'):
try:
value = cominfo.find('span').text
except:
value = None
data[name] = value
# print("==================")
briefTag = baseinfo.find('div', class_='index_detail-linewrap__AKtCa index_-intro__ma3Qd')
span_list = briefTag.find_all('span')
for span in span_list:
if len(span.attrs) == 0:
data['简介'] = span.text.split('通过天眼查大数据分析')[0]
break
return data
def dic_handle(result_dic):
zxss = ['北京市', '天津市', '上海市', '重庆市']
company_name = result_dic.get('企业名称')
CreditCode = result_dic.get('统一社会信用代码')
OperName = result_dic.get('法定代表人')
PhoneNumber = result_dic.get('电话')
WebSite = result_dic.get('网址')
Email = result_dic.get('邮箱')
Desc = result_dic.get('简介')
Status = result_dic.get('经营状态', result_dic.get('公司现状'))
StartDate = result_dic.get('成立日期')
RecCap = result_dic.get('实缴资本')
RegistCapi = result_dic.get('注册资本')
CheckDate = result_dic.get('核准日期')
OrgNo = result_dic.get('组织机构代码')
No = result_dic.get('工商注册号')
taxpayerNo = result_dic.get('纳税人识别号')
EconKind = result_dic.get('企业类型')
try:
TermStart = result_dic['营业期限'].split('至')[0]
except:
TermStart = None
try:
TeamEnd = result_dic['营业期限'].split('至')[1]
except:
TeamEnd = None
TaxpayerType = result_dic.get('纳税人资质')
SubIndustry = result_dic.get('国标行业')
# try:
# region = result_dic['所属地区']
# except:
# region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
BelongOrg = result_dic.get('登记机关')
Info = result_dic.get('人员规模')
can_bao = result_dic.get('参保人数')
OriginalName = result_dic.get('曾用名')
EnglishName = result_dic.get('英文名称')
IxCode = result_dic.get('进出口企业代码')
Address = result_dic.get('地址')
Scope = result_dic.get('经营范围')
aa_dict = {
'name': company_name, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': taxpayerNo, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
return aa_dict
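# A minimal sketch of dic_handle in use (hypothetical input); any source field
# missing from result_dic simply comes through as None in the payload.
#   dic_handle({'企业名称': '示例公司', '统一社会信用代码': '91310000XXXXXXXXXX'})
#   -> {'name': '示例公司', 'socialCreditCode': '91310000XXXXXXXXXX', ..., 'status': 0}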
# Check login status
def checklogin(key):
@@ -340,7 +79,7 @@ def checklogin(key):
return soup
# Prepare for collection
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
def redaytowork(com_name, social_code, file_name):
log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
count = 0
@@ -351,7 +90,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
@@ -361,7 +100,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
@@ -375,7 +114,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else:
# 开始采集
try:
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
if spiderwork(soup, com_name, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
# token.updateTokeen(id_cookie,3)
@@ -384,16 +123,15 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
@@ -411,55 +149,64 @@ def ifbeforename(company_url):
# Parse relative publish times
def paserTime(publishtime):
timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime=publishtime.strip()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(months= day)
day = int(numbers[0])
delta = datetime.timedelta(days=30 * day)  # timedelta has no 'months' argument; approximate a month as 30 days
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(weeks= day)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day=int(numbers[0])
delta = datetime.timedelta(days= day)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days= 2)
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
if '小时' in publishtime:
hour = publishtime.split("小时")[0]
else:
hour = 0
if hour != 0:
rest = publishtime.split("小时")[1]
minute = rest.split("分钟")[0] if '分钟' in rest else 0
elif '分钟' in publishtime:
minute = publishtime.split("分钟")[0]
else:
minute = 0  # plain '今天' carries no hour/minute component
delta = datetime.timedelta(hours=int(hour), minutes=int(minute))
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime :
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime=str(current_year)+'年'+publishtime
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
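# Example outputs (hypothetical, assuming "now" is 2024-02-05 12:00):
#   paserTime('3天前')         -> 2024-02-02 12:00
#   paserTime('2小时30分钟前')  -> 2024-02-05 09:30
#   paserTime('2月4日')        -> 2024-02-04 00:00 (the current year is prepended)
# A string that matches no branch is returned unchanged.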
# Collect basic info and business-registration info
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
def spiderinfo(company_url, receptname, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
driver.get(company_url)
@@ -467,139 +214,38 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
# TODO: the Tianyancha update time cannot be fetched with a plain request; a simulated browser is required
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
publishtime = paserTime(publishTag)
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
businessinfo = ''
if businessinfo:
data_baseinfo = baseinfo(com_soup)
# print(data_baseinfo)
tr_list = businessinfo.find_all('tr')
dic_buseniss = {}
for tr in tr_list:
# td_count = len(tr.find_all('td'))
# print(td_count)
td_list = tr.find_all('td')
td_count = len(td_list)
name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
# print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
value_list = []
for i in range(td_count):
if i % 2 != 0:
value_tag = td_list[i]
# print(value_tag)
# print("==============")
tag.deletep(value_tag, 'span', 'class', 'index_history-operate__t3kjv')
tag.deletep(value_tag, 'div', 'class', '_efcb8')
tag.deletep(value_tag, 'span', 'class', 'index_legal-bottom-info__bYvYZ')
tag.deletep(value_tag, 'a', 'class', 'ml8 link-click')
tag.deletep(value_tag, 'span', 'class', 'index_report-jump__z__UW')
tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
# for value_tag in value_tag_list:
value_list.append(value_tag.text.replace('\xa0', ''))
# print(value_list)
if len(name_list) == len(value_list):
for i in range(len(name_list)):
dic_buseniss[name_list[i]] = value_list[i]
if '曾用名' in value_list[i]:
dic_buseniss['曾用名'] = value_list[i].split('曾用名')[1].split('更多')[0]
dic_buseniss[name_list[i]] = value_list[i].split('曾用名')[0]
if name_list[i] == '法定代表人':
value_list[i] = value_list[i].split('任职')[0]
dic_buseniss[name_list[i]] = value_list[i]
try:
del dic_buseniss['天眼评分']
except:
pass
# print(dic_buseniss)
result_dict = getinfo(dic_buseniss, data_baseinfo)
# Mainly for Hong Kong/Taiwan companies: fall back to the supplied social credit code
try:
result_dict['统一社会信用代码']
except:
# log.info('未获取到统一社会信用代码')
if social_code:
result_dict['统一社会信用代码'] = social_code
else:
# Return if no social credit code was supplied
return False
if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
result_dict['企业名称'] = result_dict['企业名称'][1:-1]
if result_dict['企业名称'] == '-' and com_name:
result_dict['企业名称'] = com_name
elif not com_name:
return False
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
pass
# print(result_dict)
# Companies collected successfully
data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d")
except:
return redaytowork(com_name, social_code, file_name)  # re-queue; returning avoids using an unset sourceUpdateTime below
aa_dict = {
'name': receptname, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': social_code, # 统一社会信用代码
'sourceUpdateTime': sourceUpdateTime,
'qccId': qccid
}
print(aa_dict)
# sendkafka(aa_dic)
header = {
'Content-Type': 'application/json',
}
post_url = 'http://114.115.236.206:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dict)
req = requests.post(post_url, data=dic_info, headers=header)
if req.status_code == 200:
file.appenddata(file_name, '获取基本信息成功企业', data)
# Convert the fields to English camelCase
aa_dic = dic_handle(result_dict)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName
aa_dic['securitiesCode'] = securitiesCode
aa_dic['securitiesShortName'] = securitiesShortName
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
print(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
header = {
'Content-Type': 'application/json',
}
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info, headers=header)
print(req.text)
else:
data_baseinfo = baseinfo(com_soup)
# Mainly for Hong Kong/Taiwan companies: fall back to the supplied social credit code
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# Return if no social credit code was supplied
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# Companies collected successfully
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# Convert the fields to English camelCase
aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName
aa_dic['securitiesCode'] = securitiesCode
aa_dic['securitiesShortName'] = securitiesShortName
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# sendkafka(aa_dic)
print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
log.info(f'====={social_code}=====发送数据失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
def remove_parentheses(text):
# Strip Chinese parentheses
@@ -609,13 +255,13 @@ def remove_parentheses(text):
return text.replace(' ', '')
# Check whether the names match
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
def spiderwork(soup, receptname, file_name):
company_url = ''
try:
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
@@ -640,16 +286,14 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
else:
continue
if company_url:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# Collect basic info and business-registration info
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
spiderinfo(company_url, receptname, file_name)
else:
# Check whether it is a former name
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
print(child['class'])
# print(child['class'])
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
@@ -661,8 +305,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType,
ynDomestic, countryName, file_name)
spiderinfo(company_url, receptname, file_name)
else:
# No identical company name was found
data = [com_name, social_code]
@@ -677,20 +320,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return False
return True
from selenium import webdriver
def create_driver():
path = r'D:\soft\msedgedriver.exe'
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
driver = webdriver.Edge(executable_path=path, capabilities=options)
return driver
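# Note: executable_path/capabilities is the Selenium 3.x calling convention. Under
# Selenium 4, an equivalent sketch (assuming the same driver path) would be:
# from selenium.webdriver.edge.service import Service
# from selenium.webdriver.edge.options import Options
# def create_driver():
#     options = Options()
#     options.add_argument('--start-maximized')
#     return webdriver.Edge(service=Service(r'D:\soft\msedgedriver.exe'), options=options)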
if __name__ == '__main__':
taskType = '基本信息/天眼查'
driver = create_driver()
@@ -699,15 +328,6 @@ if __name__ == '__main__':
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Cookie':'TYCID=6f6298905d3011ee96146793e725899d; ssuid=3467188160; _ga=GA1.2.1049062268.1697190322; HWWAFSESID=2eb035742bde209aa60; HWWAFSESTIME=1706586308439; csrfToken=bT_looAjInHGeAnvjjl12L9v; bannerFlag=true; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=0; tyc-user-phone=%255B%252216603863075%2522%252C%2522152%25203756%25200528%2522%252C%2522159%25200367%25203315%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22310689501%22%2C%22first_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThhZDY5NmEyZWY2ODAtMGFlNWNkOTI5M2ExNTM4LTI2MDMxZjUxLTkyMTYwMC0xOGFkNjk2YTJmMGRjNSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMxMDY4OTUwMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22310689501%22%7D%2C%22%24device_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%7D; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2218703752600%22%2C%22userId%22%3A%22310689501%22%7D; tyc-user-info-save-time=1707008605562; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwNzAwODYwNSwiZXhwIjoxNzA5NjAwNjA1fQ.i8WEUrXjG2X__SnGGlnjwNXyOEdXlslrnvzvKZ_xlVA0rdjdsYHdaieAzkmIjoKbuv6Lc4Eqpb70hWIlq2zeoQ; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1705286979,1706586312; searchSessionId=1707118324.99879267;'
}
# cookies_list, id_cookie = token.get_cookies()
# cookies = {}
# for cookie in cookies_list:
@@ -716,8 +336,8 @@ if __name__ == '__main__':
# s.cookies.update(cookies)
start_time = time.time()
# Fetch company info
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
# company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
company_field = '913100006073602992|光明乳业股份有限公司'
if company_field == 'end':
# This round is done; send the notification email and start the next round
@@ -738,45 +358,47 @@ if __name__ == '__main__':
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
continue
continue
# company_field_ = f'|{company_field}'
social_code = company_field.split('|')[0]
if social_code and 'ZZSN' not in social_code and 'ZD' not in social_code:
continue
# TODO: look up the Tianyancha id
com_name = company_field.split('|')[2].replace(' ', '')
com_name = company_field.split('|')[1].replace(' ', '')
# ynDomestic = company_field.split('|')[15]
# countryName = company_field.split('|')[16]
# securitiesCode = company_field.split('|')[17]
# securitiesShortName = company_field.split('|')[18]
# listingDate = company_field.split('|')[21]
# category = company_field.split('|')[19]
# exchange = company_field.split('|')[20]
# listType = company_field.split('|')[21]
ynDomestic = None
countryName = None
securitiesCode = None
securitiesShortName = None
listingDate = None
category = None
exchange = None
listType = None
if 'ZZSN' in social_code and 'ZD' in social_code:
continue
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name)
# TODO: look up the Tianyancha id
data = baseCore.getInfomation(social_code)
if len(data) != 0:
tycid = data[11]
else:
# Re-queue the data into redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
# The company is not in the database and needs to be inserted
pass
com_name_c = data[3]
xydm = data[1]
# Write into the database
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name_c, xydm))
cnx_.commit()
tycid = ''
if tycid is None or tycid == '':
count = redaytowork(com_name, social_code, file_name)
else:
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, social_code, file_name)
time.sleep(10)
# break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# After collection finishes, update the company's collection count
# runType = 'BaseInfoRunCount'
# baseCore.updateRun(social_code, runType, count)
# break
baseCore.close()
\ No newline at end of file