Commit 017e1b47 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

......@@ -421,15 +421,15 @@ class BaseCore:
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument(self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
with open('../../base/stealth.min.js') as f:
js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": js
})
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
# Get company information by social credit code
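The hunk above builds a Chrome driver with a random user agent and injects stealth.min.js through the Chrome DevTools Protocol before any page script runs. A minimal spot-check of that injection, assuming a driver produced by buildDriver (the probe below is illustrative and not part of the original code):
from base.BaseCore import BaseCore

baseCore = BaseCore()
path = r'F:\spider\115\chromedriver.exe'  # chromedriver path taken from the diff below
driver = baseCore.buildDriver(path)
driver.get('https://finance.yahoo.com')
# With stealth.min.js applied via Page.addScriptToEvaluateOnNewDocument,
# navigator.webdriver should no longer report True.
print(driver.execute_script('return navigator.webdriver'))
driver.quit()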
......@@ -481,7 +481,7 @@ class BaseCore:
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# Read the existing xlsx file
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
existing_data = pd.read_excel(filename,engine='openpyxl')
# Create the new data
new_data = pd.DataFrame(data=detailList)
# Append the new data to the end of the existing data
......
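writerToExcel appends freshly scraped rows to an existing workbook; the dtype=str change keeps identifiers with leading zeros intact. A minimal sketch of that read-concat-write pattern with pandas (the function name here is only illustrative):
import pandas as pd

def append_to_excel(detailList, filename):
    # Read the existing sheet as strings so identifiers such as credit codes
    # keep their leading zeros, matching the dtype=str change above.
    existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
    new_data = pd.DataFrame(data=detailList)
    combined = pd.concat([existing_data, new_data], ignore_index=True)
    combined.to_excel(filename, index=False, engine='openpyxl')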
# Fetch corporate news from Yahoo Finance
# Fetch corporate news from Yahoo Finance
......@@ -8,9 +8,9 @@ import sys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
sys.path.append('D:/zzsn_spider/base')
import BaseCore
from smart import smart_extractor
from base import BaseCore
from base.smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
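The import hunk replaces the hard-coded sys.path.append with package-style imports. A sketch of the project layout this assumes, inferred from the paths in the diff rather than confirmed:
# zzsn_spider/
#   base/
#     __init__.py
#     BaseCore.py          # BaseCore class: buildDriver, writerToExcel, ...
#     stealth.min.js
#     smart/
#       __init__.py
#       smart_extractor.py
# With the project root on PYTHONPATH, the new imports resolve:
from base import BaseCore
from base.smart import smart_extractor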
......@@ -49,6 +49,8 @@ def getZx(xydm, url, title, cnx, path):
content = contentElement.replace("'", "''")
driverContent.close()
# driverContent.quit()
# List of news info fields
list_info = [
xydm,
......@@ -159,28 +161,24 @@ def getLastUrl():
def scroll(xydm,name,gpdm):
last_url_ = ''
try:
last_url = getLastUrl()
except:
log.error(f"{name}--{gpdm}--failed to get the last link")
while True:
js = "var q=document.documentElement.scrollTop=100000"
driver.execute_script(js)
time.sleep(1)
try:
last_url_ = getLastUrl()
last_url = getLastUrl()
except Exception as e:
log.error(f"{name}--{gpdm}--failed to get the last link")
break
try:
selects = selectUrl(last_url_,xydm)
except:
break
if selects:
break
# try:
# selects = selectUrl(last_url_,xydm)
# except:
# break
# if selects:
# break
if last_url_ == last_url:
break
last_url = last_url_
last_url_ = last_url
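scroll() keeps scrolling the Yahoo Finance news list until the last visible link stops changing. A stripped-down sketch of that loop, assuming a get_last_url helper equivalent to getLastUrl above (the helper and limits here are hypothetical):
import time

def scroll_to_end(driver, get_last_url, pause=1, max_rounds=200):
    # Scroll until lazy loading stops producing new items, i.e. the
    # last link in the list no longer changes between rounds.
    last_url = get_last_url()
    for _ in range(max_rounds):
        driver.execute_script('document.documentElement.scrollTop = 100000')
        time.sleep(pause)
        current = get_last_url()
        if current == last_url:
            break
        last_url = current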
# Re-queue accounts that failed to be collected back into Redis
def rePutIntoR(item):
......@@ -188,7 +186,7 @@ def rePutIntoR(item):
if __name__ == "__main__":
path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
path = r'F:\spider\115\chromedriver.exe'
driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
......@@ -198,6 +196,9 @@ if __name__ == "__main__":
social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
# If Redis has no more data, wait
if not social_code :
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
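The main loop pulls social credit codes from a Redis list and idles for 20 seconds when the queue is empty or returns the 'None' sentinel. A minimal sketch of that pattern with redis-py, using the same key as the diff (the connection details are placeholders):
import time
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)  # placeholder connection

while True:
    social_code = r.lpop('NewsEnterprise:gwqy_socialCode')
    if not social_code or social_code == 'None':
        time.sleep(20)
        continue
    # ... crawl the enterprise identified by social_code ...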
......@@ -283,7 +284,10 @@ if __name__ == "__main__":
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
break
# Used for incremental crawls
# break
# Used for full crawls
continue
title = a_ele.text.lstrip().strip().replace("'", "''")
exception = getZx(xydm, news_url, title, cnx, path)
if exception == '':
......@@ -311,7 +315,7 @@ if __name__ == "__main__":
log.info('=========== The connection has been closed ======== waiting to reconnect ===========')
driver.quit()
driver = baseCore.buildDriver(path)
time.sleep(1200)
time.sleep(5)
continue
cursor.close()
......
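When the remote WebDriver connection is reported closed, the loop above now quits the stale driver, rebuilds it, and retries after 5 seconds instead of 1200. A minimal sketch of that recovery step, assuming the same buildDriver helper:
import time
from selenium.common.exceptions import WebDriverException

def rebuild_driver(driver, baseCore, path, wait=5):
    # Drop the dead session and start a fresh one before retrying.
    try:
        driver.quit()
    except WebDriverException:
        pass  # the session may already be gone
    time.sleep(wait)
    return baseCore.buildDriver(path)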
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore
baseCore = BaseCore()
......@@ -6,13 +8,25 @@ log =baseCore.getLogger()
if __name__ == '__main__':
log.info("ok")
# Get the next sequence number
print(baseCore.getNextSeq())
print(baseCore.getNextSeq())
# Get a random user agent
print(baseCore.getRandomUserAgent())
# Get a proxy from the proxy pool
print(baseCore.get_proxy())
# Release related resources
baseCore.close()
\ No newline at end of file
path = r'F:\spider\115\chromedriver.exe'
driver = baseCore.buildDriver(path,headless=False)
# service = Service(r'F:\spider\115\chromedriver.exe')
# chrome_options = webdriver.ChromeOptions()
# # chrome_options.add_argument('--headless')
# # chrome_options.add_argument('--disable-gpu')
# chrome_options.add_experimental_option(
# "excludeSwitches", ["enable-automation"])
# chrome_options.add_experimental_option('useAutomationExtension', False)
# chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
# chrome_options.add_argument('user-agent='+baseCore.getRandomUserAgent())
#
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
gpdm = '9021.T'
url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
driver.get(url)
\ No newline at end of file
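The test script stops at driver.get(url); a small follow-up sketch of waiting for the press-release list to render before scraping (the CSS selector below is a guess, not taken from the original code):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Wait up to 30 seconds for at least one headline link to appear.
wait = WebDriverWait(driver, 30)
items = wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, 'li.js-stream-content a')))
print(len(items))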