提交 8cf6e366 作者: 薛凌堃

企业新增及更新

上级 5e520511
......@@ -87,7 +87,8 @@ def doJob():
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
......
......@@ -87,7 +87,8 @@ def doJob():
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
......@@ -212,7 +213,7 @@ def doJob():
total_page = 0
flag = 0
baseCore.rePutIntoR('UpdateCoreperson:Map', item)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应')
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应---{charge}---{total_page2}---{total_page3}')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
......@@ -223,6 +224,8 @@ def doJob():
# # todo:获取页数
# total_page = 34
# flag = 2
# todo: 测试程序是否执行到这一步
log.info(f'总数为{total_page}')
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
......
......@@ -57,6 +57,12 @@ def sendkafka(post_data):
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
def Lreputredis(company_field):
    """Re-queue a company record on the Redis work list, keeping 'end' last.

    Strips every 'end' sentinel from the list, appends the record, then
    re-appends 'end' so the end-of-round marker always trails the queue.
    """
    queue_key = 'BaseInfoEnterprise:gnqy_socialCode'
    redis_conn = baseCore.r
    # Drop all sentinel entries before re-queueing (count=0 removes every match).
    redis_conn.lrem(queue_key, 0, 'end')
    for value in (company_field, 'end'):
        redis_conn.rpush(queue_key, value)
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
# 取出两个字典的key值集合
......@@ -352,7 +358,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
......@@ -361,18 +368,23 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
if searchinfo == '0':
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
except:
log.info("登录失效===重新放入redis")
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
else:
# 开始采集
try:
......@@ -385,7 +397,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
......@@ -578,45 +591,50 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
# req = requests.post(post_url, data=dic_info)
else:
data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName
aa_dic['securitiesCode'] = securitiesCode
aa_dic['securitiesShortName'] = securitiesShortName
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# sendkafka(aa_dic)
print(aa_dic)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
# todo: 重新放入redis 删除end再放入ruend
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
log.error(f'未找到工商信息,重新塞入redis')
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# aa_dic['ynDomestic'] = ynDomestic
# aa_dic['countryName'] = countryName
# aa_dic['securitiesCode'] = securitiesCode
# aa_dic['securitiesShortName'] = securitiesShortName
# aa_dic['listingDate'] = listingDate
# aa_dic['category'] = category
# aa_dic['exchange'] = exchange
# aa_dic['listingType'] = listType
# # sendkafka(aa_dic)
# print(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
......@@ -632,7 +650,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
......@@ -695,12 +714,10 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return True
def login():
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
# time.sleep(10)
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
......@@ -713,8 +730,13 @@ def login():
if __name__ == '__main__':
taskType = '基本信息/天眼查'
driver, id_cookie = login()
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
......@@ -761,7 +783,8 @@ if __name__ == '__main__':
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
continue
continue
# company_field_ = f'|{company_field}'
......
......@@ -7,9 +7,11 @@ import datetime
import pymongo
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from kafka import KafkaProducer
import urllib3
from retry import retry
from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
......@@ -385,7 +387,7 @@ def redaytowork(com_name, social_code, file_name):
if spiderwork(soup, com_name, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
# token.updateTokeen(id_cookie,3)
token.updateTokeen(id_cookie,3)
return count
else:
return count
......@@ -430,8 +432,8 @@ def paserTime(publishtime):
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(months=day)
publishtime = current_datetime - delta
publishtime = current_datetime - relativedelta(months=day)
# publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
......@@ -472,6 +474,16 @@ def paserTime(publishtime):
print('时间解析异常!!')
return publishtime
@retry(tries=2,delay=3)
def getBusinessinfo(com_soup):
    """Locate the business-registration table inside the 'baseInfo' section.

    Tries both class-name variants observed on the site (with and without a
    trailing space). Raises RuntimeError when neither matches so the @retry
    decorator re-attempts the lookup before giving up.
    """
    base_section = com_soup.find('div', attrs={'data-dim': 'baseInfo'})
    for table_class in ('index_tableBox__ZadJW', 'index_tableBox__ZadJW '):
        table = base_section.find('table', {'class': table_class})
        if table:
            return table
    raise RuntimeError('工商信息未找到')
# 采集基本信息和工商信息
def spiderinfo(company_url, receptname, file_name):
......@@ -495,7 +507,7 @@ def spiderinfo(company_url, receptname, file_name):
return
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
businessinfo = getBusinessinfo(com_soup)
except:
businessinfo = ''
if businessinfo:
......@@ -576,37 +588,42 @@ def spiderinfo(company_url, receptname, file_name):
# req = requests.post(post_url, data=dic_info)
else:
data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
# sendkafka(aa_dic)
log.info(aa_dic)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
# todo: 重新放入redis
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
log.error(f'未找到工商信息,重新塞入redis')
token.updateTokeen(id_cookie, 3)
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# # sendkafka(aa_dic)
# log.info(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
......@@ -682,12 +699,10 @@ def spiderwork(soup, receptname, file_name):
return True
def login():
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
# time.sleep(10)
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
......@@ -695,7 +710,7 @@ def login():
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time.sleep(60)
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
......@@ -704,15 +719,22 @@ if __name__ == '__main__':
# #手动登录
# driver.get('https://www.tianyancha.com/')
#todo:绕过验证使用cookies登录
driver, id_cookie = login()
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
# todo:绕过验证使用cookies登录
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
start_time = time.time()
# 获取企业信息
company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
# company_field = '913100006073602992|光明乳业股份有限公司'
# company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
company_field = '91330000742906207U|浙江我武生物科技股份有限公司'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
......@@ -775,5 +797,5 @@ if __name__ == '__main__':
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, com_name, file_name)
time.sleep(10)
# break
break
baseCore.close()
\ No newline at end of file
......@@ -328,7 +328,7 @@ if __name__ == '__main__':
driver.get('https://www.tianyancha.com/')
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
# cookies_list, id_cookie = token.get_cookies()
# cookies = {}
......
......@@ -59,7 +59,8 @@ class Token():
result = db_storage.find_one(query, sort=[('updateTime', 1)])
cookies = result['cookies']
id_token = result['_id']
return cookies, id_token
user_name = result['name']
return cookies, id_token, user_name
# 删除失效的token
def delete_token(self, cookie_):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论