提交 f7886002 作者: 薛凌堃

企查查脚本维护

上级 08e4725c
...@@ -292,7 +292,7 @@ def dic_handle(result_dic): ...@@ -292,7 +292,7 @@ def dic_handle(result_dic):
return aa_dict return aa_dict
# 采集准备 # 采集准备
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name): def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
# if social_code: # if social_code:
# dic_info = baseCore.getInfomation(social_code) # dic_info = baseCore.getInfomation(social_code)
...@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin ...@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else: else:
# 开始采集 # 开始采集
try: try:
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name): if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1 count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}') log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3) token.updateTokeen(id_cookie,3)
...@@ -373,7 +373,7 @@ def ifbeforename(company_url): ...@@ -373,7 +373,7 @@ def ifbeforename(company_url):
return '' return ''
# 采集基本信息和工商信息 # 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name): def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0] qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新 # 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'" updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
...@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate aa_dic['listingDate'] = listingDate
aa_dic['category'] = category aa_dic['category'] = category
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# print(aa_dic) # print(aa_dic)
sendkafka(aa_dic) sendkafka(aa_dic)
...@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate aa_dic['listingDate'] = listingDate
aa_dic['category'] = category aa_dic['category'] = category
aa_dic['exchange'] = exchange aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
sendkafka(aa_dic) sendkafka(aa_dic)
# 判断名称是否统一 # 判断名称是否统一
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name): def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = '' company_url = ''
try: try:
company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list') company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
...@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html' # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html' # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息 # 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
# 判断是否是曾用名 # 判断是否是曾用名
tr = tr_list[:1][0] tr = tr_list[:1][0]
...@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url) beforename = ifbeforename(company_url)
if beforename == receptname: if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
else: else:
#没有搜到相同的企业名称 #没有搜到相同的企业名称
data = [com_name, social_code] data = [com_name, social_code]
...@@ -549,6 +549,7 @@ if __name__ == '__main__': ...@@ -549,6 +549,7 @@ if __name__ == '__main__':
else: else:
log.info('==========已无cookies==========') log.info('==========已无cookies==========')
time.sleep(30) time.sleep(30)
continue continue
id_cookie = cookieinfo[0] id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1]) cookie_ = json.loads(cookieinfo[1])
...@@ -579,8 +580,8 @@ if __name__ == '__main__': ...@@ -579,8 +580,8 @@ if __name__ == '__main__':
} }
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '91220101606092819L||' company_field = '913300007125582210||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name) baseCore.sendEmail(file_name)
...@@ -595,6 +596,11 @@ if __name__ == '__main__': ...@@ -595,6 +596,11 @@ if __name__ == '__main__':
while flag: while flag:
log.info('--------已没有数据---------') log.info('--------已没有数据---------')
time.sleep(30) time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
if company_field: if company_field:
flag = False flag = False
...@@ -604,26 +610,28 @@ if __name__ == '__main__': ...@@ -604,26 +610,28 @@ if __name__ == '__main__':
continue continue
social_code = company_field.split('|')[0] social_code = company_field.split('|')[0]
com_name = company_field.split('|')[2].replace(' ', '') com_name = company_field.split('|')[1].replace(' ', '')
ynDomestic = company_field.split('|')[15] # ynDomestic = company_field.split('|')[15]
countryName = company_field.split('|')[16] # countryName = company_field.split('|')[16]
securitiesCode = company_field.split('|')[17] # securitiesCode = company_field.split('|')[17]
securitiesShortName = company_field.split('|')[18] # securitiesShortName = company_field.split('|')[18]
listingDate = company_field.split('|')[21] # listingDate = company_field.split('|')[21]
category = company_field.split('|')[19] # category = company_field.split('|')[19]
exchange = company_field.split('|')[20] # exchange = company_field.split('|')[20]
# ynDomestic = '' # listType = company_field.split('|')[21]
# countryName = '' ynDomestic = ''
# securitiesCode = '' countryName = ''
# securitiesShortName = '' securitiesCode = ''
# listingDate = '' securitiesShortName = ''
# category = '' listingDate = ''
# exchange = '' category = ''
exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name) listType = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(2) time.sleep(2)
# break break
# baseCore.r.close() # baseCore.r.close()
# baseCore.sendEmail(file_name) # baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
......
...@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html' # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html' # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息 # 采集基本信息和工商信息
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
# 判断是否是曾用名 # 判断是否是曾用名
tr = tr_list[:1][0] tr = tr_list[:1][0]
...@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url) beforename = ifbeforename(company_url)
if beforename == receptname: if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name) spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
else: else:
#没有搜到相同的企业名称 #没有搜到相同的企业名称
data = [com_name, social_code] data = [com_name, social_code]
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论