提交 4dc1d9b9 作者: XveLingKun

企业基本信息更新

上级 71c65de8
......@@ -8,7 +8,7 @@ import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from selenium.webdriver.edge.service import Service
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
......@@ -38,7 +38,7 @@ from selenium.webdriver.common.by import By
def create_driver():
path = r'D:\soft\msedgedriver.exe'
service = Service(path)
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
......@@ -47,7 +47,7 @@ def create_driver():
}
}
session = webdriver.Edge(executable_path=path, capabilities=options)
session = webdriver.Edge(service=service, capabilities=options)
return session
......@@ -142,7 +142,7 @@ def redaytowork(com_name, social_code, securitiesCode, securitiesShortName, list
else:
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis====={e}')
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie, 2)
......@@ -162,7 +162,7 @@ def ifbeforename(company_url):
try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = \
businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ',
businessinfo.find('div', class_='index_history-container__VywXO').find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ',
'').replace(
'…', '').replace('\n', '').replace('复制', '').split('(')[0]
except:
......@@ -209,8 +209,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
econKind = script['companyOrgType']
termStart = int(script['fromTime'])
termStart = datetime.datetime.fromtimestamp(termStart / 1000).strftime('%Y-%m-%d %H:%M:%S')
try:
termEnd = script['toTime']
termEnd = datetime.datetime.fromtimestamp(termEnd / 1000).strftime('%Y-%m-%d %H:%M:%S')
except:
termEnd = '无固定期限'
taxpayerType = script['taxQualification']
subIndustry = script['industryInfo']['nameLevel3']
belogOrg = script['regInstitute']
......@@ -275,7 +278,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
if value == 'None':
aa_dic[key] = None
# 发送kafka
# sendkafka(aa_dic)
sendkafka(aa_dic)
def remove_parentheses(text):
......@@ -305,7 +308,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
if receptname and (getname == receptname):
company_url = info_t.find('a')['href']
break
elif not receptname:
......@@ -404,7 +407,7 @@ if __name__ == '__main__':
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
company_field = '|江苏协昌电子科技股份有限公司|江苏协昌电子科技股份有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论