提交 184485c7 作者: 薛凌堃

24/01/06

上级 55610b8f
......@@ -13,9 +13,10 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu_test
groupId=python_google
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
......@@ -168,6 +168,8 @@ class GoogleSpider(object):
try:
driver.get(url)
# 等待页面加载完成
time.sleep(3)
driver.refresh()
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
html=driver.page_source
......@@ -256,6 +258,7 @@ class GoogleSpider(object):
self.driver.get(self.url)
# 等待页面加载完成
time.sleep(3)
self.driver.refresh()
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
search_input = self.driver.find_element('xpath', '//textarea[@title="Google 搜索"]')
......@@ -265,7 +268,11 @@ class GoogleSpider(object):
time.sleep(3)
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
try:
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click()
except:
self.driver.find_element('xpath', '//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span').click()
time.sleep(3)
self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
time.sleep(2)
......@@ -273,7 +280,8 @@ class GoogleSpider(object):
time.sleep(2)
self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
except Exception as e:
print(e)
self.logger.info(f'--点击按钮失效----{e}')
return
self.logger.info("开始抓取首页..." + self.searchkw )
time.sleep(5)
flag, lists = self.parse_page()
......@@ -446,7 +454,7 @@ class GoogleSpider(object):
detailurl=detailmsg['detailUrl']
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e:
content=''
contentWithTag=''
......
......@@ -40,7 +40,7 @@ class GoogleTaskJob(object):
try:
for record in consumer:
try:
logger.info("value:",record.value)
logger.info(f"value:{record.value}")
keymsg=record.value
if keymsg:
break
......@@ -176,7 +176,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
[redis]
[redis]
......@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论