提交 7bf2e193 作者: 薛凌堃

天眼查基本信息

上级 1c479868
......@@ -13,7 +13,7 @@ from selenium.webdriver.support.wait import WebDriverWait
# Handle to the '天眼查登录信息' entry of the ZZSN database on the MongoDB server
# (with pymongo this indexing yields a collection object).
# NOTE(review): the server address and the admin username/password are
# hard-coded in source here; they should come from configuration or the
# environment, and the exposed password should be rotated.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
......@@ -320,17 +320,18 @@ def dic_handle(result_dic):
}
return aa_dict
# 检查登录状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
# ip = baseCore.get_proxy()
# req = requests.get(headers=headers, url=url, proxies=ip)
req = s.get(headers=headers, url=url)
time.sleep(1)
soup = BeautifulSoup(req.content, 'html.parser')
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
......@@ -390,9 +391,9 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
def ifbeforename(company_url):
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
......@@ -412,9 +413,10 @@ def ifbeforename(company_url):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
......@@ -502,9 +504,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
print(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
else:
data_baseinfo = baseinfo(com_soup)
......@@ -543,9 +545,9 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingType'] = listType
# sendkafka(aa_dic)
print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
......@@ -623,10 +625,26 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return False
return True
# Start a browser session on tianyancha.com and load stored cookies into it.
# Returns a (driver, id_cookie) tuple: the driver created by create_driver()
# and the second value returned by token.get_cookies(), passed through unchanged.
# NOTE(review): indentation is lost in this diff view, so it cannot be told
# from here whether the time.sleep(5) after add_cookie runs once per cookie
# or once after the loop - confirm against the real file.
def login():
# create_driver() is defined elsewhere; presumably returns a Selenium WebDriver.
driver = create_driver()
url = 'https://www.tianyancha.com/'
# Open the site before adding cookies (Selenium only accepts cookies for the
# domain that is currently loaded - assuming driver is a Selenium WebDriver).
driver.get(url)
driver.maximize_window()
# time.sleep(10)
# token.get_cookies() yields the cookie list to inject plus an id for it.
cookies_list, id_cookie = token.get_cookies()
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
# Reload so the page is requested again with the cookies added above.
driver.refresh()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
# Fixed 5-second pause after the refresh before handing the driver back.
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/天眼查'
# driver, id_cookie = login()
driver, id_cookie = login()
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
......@@ -644,12 +662,12 @@ if __name__ == '__main__':
# cookies = {}
# for cookie in cookies_list:
# cookies[cookie['name']] = cookie['value']
s = requests.Session()
# s = requests.Session()
# s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
# company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论