提交 7bf2e193 作者: 薛凌堃

天眼查基本信息

上级 1c479868
...@@ -13,7 +13,7 @@ from selenium.webdriver.support.wait import WebDriverWait ...@@ -13,7 +13,7 @@ from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base') # sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base') sys.path.append('D:\\kkwork\\zzsn_spider\\base')
...@@ -320,17 +320,18 @@ def dic_handle(result_dic): ...@@ -320,17 +320,18 @@ def dic_handle(result_dic):
} }
return aa_dict return aa_dict
# 检查登陆状态 # 检查登陆状态
def checklogin(key): def checklogin(key):
t = int(time.time()) t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563' # url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}' url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
# ip = baseCore.get_proxy() driver.get(url)
# req = requests.get(headers=headers, url=url, proxies=ip) time.sleep(2)
req = s.get(headers=headers, url=url)
time.sleep(1) page_source = driver.page_source
soup = BeautifulSoup(req.content, 'html.parser') soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态 # todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查': # if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录') # log.info('状态---未登录')
...@@ -390,9 +391,9 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin ...@@ -390,9 +391,9 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
def ifbeforename(company_url): def ifbeforename(company_url):
driver.get(company_url)
req_ = s.get(headers=headers, url=company_url) time.sleep(2)
com_soup = BeautifulSoup(req_.content, 'html.parser') com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try: try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'}) businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except: except:
...@@ -412,9 +413,10 @@ def ifbeforename(company_url): ...@@ -412,9 +413,10 @@ def ifbeforename(company_url):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name): def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('company/')[1] qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====') log.info(f'====={qccid}=====')
driver.get(company_url)
req_ = s.get(headers=headers, url=company_url) # req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser') page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器 #todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
...@@ -502,9 +504,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -502,9 +504,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
print(aa_dic) print(aa_dic)
# sendkafka(aa_dic) # sendkafka(aa_dic)
# print(aa_dic) # print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge' # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic) # dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info) # req = requests.post(post_url, data=dic_info)
else: else:
data_baseinfo = baseinfo(com_soup) data_baseinfo = baseinfo(com_soup)
...@@ -543,9 +545,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca ...@@ -543,9 +545,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType aa_dic['listingType'] = listType
# sendkafka(aa_dic) # sendkafka(aa_dic)
print(aa_dic) print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge' # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic) # dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info) # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text): def remove_parentheses(text):
# 清除中文小括号 # 清除中文小括号
...@@ -623,10 +625,26 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat ...@@ -623,10 +625,26 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return False return False
return True return True
def login():
    """Open a browser session on tianyancha.com and load stored cookies into it.

    The driver comes from ``create_driver()`` and the cookies from
    ``token.get_cookies()``; both are defined elsewhere in this file.

    Returns:
        tuple: ``(driver, id_cookie)`` where ``driver`` is the browser driver
        with the cookies applied and ``id_cookie`` is the second value
        returned by ``token.get_cookies()``.

    Raises:
        Any exception raised while navigating, fetching cookies or adding
        them is re-raised after the browser has been closed.
    """
    driver = create_driver()
    try:
        # The site must be loaded before cookies are added: the browser only
        # accepts cookies for the domain of the current page.
        driver.get('https://www.tianyancha.com/')
        driver.maximize_window()
        cookies_list, id_cookie = token.get_cookies()
        for cookie in cookies_list:
            driver.add_cookie(cookie)
        time.sleep(5)
        # Reload so the page is rendered with the injected cookies.
        driver.refresh()
        time.sleep(5)
    except BaseException:
        # Do not leave an orphaned browser process behind on failure.
        driver.quit()
        raise
    return driver, id_cookie
if __name__ == '__main__': if __name__ == '__main__':
taskType = '基本信息/天眼查' taskType = '基本信息/天眼查'
# driver, id_cookie = login() driver, id_cookie = login()
while True: while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8] nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx' file_name = f'./data/国内企业基本信息采集情况.xlsx'
...@@ -644,12 +662,12 @@ if __name__ == '__main__': ...@@ -644,12 +662,12 @@ if __name__ == '__main__':
# cookies = {} # cookies = {}
# for cookie in cookies_list: # for cookie in cookies_list:
# cookies[cookie['name']] = cookie['value'] # cookies[cookie['name']] = cookie['value']
s = requests.Session() # s = requests.Session()
# s.cookies.update(cookies) # s.cookies.update(cookies)
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode') company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||' # company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end': if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮 # 本轮处理完毕,需要发送邮件,并且进入下一轮
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论