提交 17a85405 作者: 薛凌堃

搜狗搜索维护

上级 7bf6c3cf
......@@ -105,7 +105,7 @@ class SougouSpider(object):
# 解析页面
def parse_page(self):
self.logger.info('解析搜狗列表页')
self.logger.info(f'{self.searchkw}解析搜狗列表页...')
response = self.driver.page_source
response = response.replace('<em>', '')
response = response.replace('</em>', '')
......@@ -178,7 +178,7 @@ class SougouSpider(object):
'publishTag':publishTag
}
lists.append(detailmsg)
self.logger.info(f'列表获取信息的条数{len(lists)}')
self.logger.info(f'{self.searchkw}---列表获取信息的条数{len(lists)}')
return lists
#获取当前时间
......@@ -233,7 +233,7 @@ class SougouSpider(object):
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
......@@ -265,7 +265,7 @@ class SougouSpider(object):
wait = WebDriverWait(self.driver, 30)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)
self.logger.info("开始抓取首页...")
self.logger.info(f"{self.searchkw}开始抓取首页...")
try:
flag, lists = self.parse_page()
if len(lists)<1:
......@@ -275,10 +275,15 @@ class SougouSpider(object):
return
if len(lists)==0:
time.sleep(5)
repeatCounts = 0
for detail in lists:
durl=detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
repeatCounts += 1
if repeatCounts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}首页已存在50%以上,结束抓取")
return
continue
self.detailList.put(detail)
......@@ -289,10 +294,10 @@ class SougouSpider(object):
timeFlag=False
while hasnext == '下一页':
try:
if self.page_num ==21:
if self.page_num == 5:
break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
self.logger.info(f"{self.searchkw}开始抓取第{self.page_num}页...")
try:
self.driver.find_element(By.XPATH, '//a[@id="sogou_next"]').click()
except Exception as e:
......@@ -302,6 +307,7 @@ class SougouSpider(object):
flag, lists = self.parse_page()
if len(lists)<1:
break
repeated_counts = 0
for detail in lists:
publishTag=detail['publishTag']
# if publishTag:
......@@ -314,6 +320,11 @@ class SougouSpider(object):
durl = detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}")
repeated_counts += 1
if repeated_counts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
return
continue
self.detailList.put(detail)
if timeFlag:
......@@ -328,7 +339,7 @@ class SougouSpider(object):
except Exception as e:
time.sleep(5)
break
self.logger.info("抓取完毕")
self.logger.info(f"{self.searchkw}列表抓取完毕")
#获取资讯内容信息
......@@ -423,7 +434,7 @@ class SougouSpider(object):
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
self.logger.info("%s:%s\n" % (title, detailUrl))
self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
try:
# # js = "window.open('"+detailUrl+"')"
# # self.driver.execute_script(js)
......
......@@ -169,8 +169,8 @@ class SougouTaskJob(object):
sougouSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
finally:
sougouSpider.driver.quit()
# finally:
# sougouSpider.driver.quit()
if sougouSpider.detailList.qsize() != 0:
try:
sougouSpider.get_detail_html()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论