提交 4dc1d9b9 作者: XveLingKun

企业基本信息更新

上级 71c65de8
...@@ -8,7 +8,7 @@ import pymongo ...@@ -8,7 +8,7 @@ import pymongo
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from kafka import KafkaProducer from kafka import KafkaProducer
from selenium.webdriver.edge.service import Service
import urllib3 import urllib3
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
...@@ -38,7 +38,7 @@ from selenium.webdriver.common.by import By ...@@ -38,7 +38,7 @@ from selenium.webdriver.common.by import By
def create_driver(): def create_driver():
path = r'D:\soft\msedgedriver.exe' path = r'D:\soft\msedgedriver.exe'
service = Service(path)
# options = webdriver.EdgeOptions() # options = webdriver.EdgeOptions()
options = { options = {
"browserName": "MicrosoftEdge", "browserName": "MicrosoftEdge",
...@@ -47,7 +47,7 @@ def create_driver(): ...@@ -47,7 +47,7 @@ def create_driver():
} }
} }
session = webdriver.Edge(executable_path=path, capabilities=options) session = webdriver.Edge(service=service, capabilities=options)
return session return session
...@@ -142,7 +142,7 @@ def redaytowork(com_name, social_code, securitiesCode, securitiesShortName, list ...@@ -142,7 +142,7 @@ def redaytowork(com_name, social_code, securitiesCode, securitiesShortName, list
else: else:
return count return count
except Exception as e: except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====') log.info(f'====={social_code}=====获取基本信息失败,重新放入redis====={e}')
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field) # baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field) Lreputredis(company_field)
token.updateTokeen(id_cookie, 2) token.updateTokeen(id_cookie, 2)
...@@ -162,7 +162,7 @@ def ifbeforename(company_url): ...@@ -162,7 +162,7 @@ def ifbeforename(company_url):
try: try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = \ value = \
businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ', businessinfo.find('div', class_='index_history-container__VywXO').find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ',
'').replace( '').replace(
'…', '').replace('\n', '').replace('复制', '').split('(')[0] '…', '').replace('\n', '').replace('复制', '').split('(')[0]
except: except:
...@@ -209,8 +209,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -209,8 +209,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
econKind = script['companyOrgType'] econKind = script['companyOrgType']
termStart = int(script['fromTime']) termStart = int(script['fromTime'])
termStart = datetime.datetime.fromtimestamp(termStart / 1000).strftime('%Y-%m-%d %H:%M:%S') termStart = datetime.datetime.fromtimestamp(termStart / 1000).strftime('%Y-%m-%d %H:%M:%S')
try:
termEnd = script['toTime'] termEnd = script['toTime']
termEnd = datetime.datetime.fromtimestamp(termEnd / 1000).strftime('%Y-%m-%d %H:%M:%S') termEnd = datetime.datetime.fromtimestamp(termEnd / 1000).strftime('%Y-%m-%d %H:%M:%S')
except:
termEnd = '无固定期限'
taxpayerType = script['taxQualification'] taxpayerType = script['taxQualification']
subIndustry = script['industryInfo']['nameLevel3'] subIndustry = script['industryInfo']['nameLevel3']
belogOrg = script['regInstitute'] belogOrg = script['regInstitute']
...@@ -275,7 +278,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -275,7 +278,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
if value == 'None': if value == 'None':
aa_dic[key] = None aa_dic[key] = None
# 发送kafka # 发送kafka
# sendkafka(aa_dic) sendkafka(aa_dic)
def remove_parentheses(text): def remove_parentheses(text):
...@@ -305,7 +308,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -305,7 +308,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
info_t = compamy.find('div', class_='index_name__qEdWi') info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}') log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname: if receptname and (getname == receptname):
company_url = info_t.find('a')['href'] company_url = info_t.find('a')['href']
break break
elif not receptname: elif not receptname:
...@@ -404,7 +407,7 @@ if __name__ == '__main__': ...@@ -404,7 +407,7 @@ if __name__ == '__main__':
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||' company_field = '|江苏协昌电子科技股份有限公司|江苏协昌电子科技股份有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论