Commit 17a85405  Author: 薛凌堃

Sogou search maintenance

Parent 7bf6c3cf
@@ -105,7 +105,7 @@ class SougouSpider(object):
     # 解析页面
     def parse_page(self):
-        self.logger.info('解析搜狗列表页')
+        self.logger.info(f'{self.searchkw}解析搜狗列表页...')
         response = self.driver.page_source
         response = response.replace('<em>', '')
         response = response.replace('</em>', '')
@@ -178,7 +178,7 @@ class SougouSpider(object):
                 'publishTag':publishTag
             }
             lists.append(detailmsg)
-        self.logger.info(f'列表获取信息的条数{len(lists)}')
+        self.logger.info(f'{self.searchkw}---列表获取信息的条数{len(lists)}')
         return lists
     #获取当前时间
@@ -233,7 +233,7 @@ class SougouSpider(object):
             current_datetime = datetime.datetime.now()
             delta = datetime.timedelta(days= 1)
             publishtime = current_datetime - delta
-        elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
+        elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime :
             delta = datetime.timedelta(hours= 5)
             publishtime = current_datetime - delta
         elif '年' in publishtime and '月' in publishtime :
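The hunk above is part of the spider's publish-time normalization: Sogou's relative timestamps ("昨天", "今天", "N小时前", "N分钟前") and full "年/月" dates are converted into concrete datetimes. Below is a minimal, self-contained sketch of that conversion; the branch behaviour (one day back for "昨天", a flat five-hour offset for same-day items) follows the diff, while the function name and the regex fallback are illustrative assumptions rather than the project's exact code.

```python
import datetime
import re

def normalize_publishtime(publishtime: str) -> datetime.datetime:
    """Best-effort conversion of Sogou's relative dates to a datetime."""
    now = datetime.datetime.now()
    if '昨天' in publishtime:
        # "yesterday" -> one day back
        return now - datetime.timedelta(days=1)
    if '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
        # same-day items get a flat five-hour offset, as in the diff
        return now - datetime.timedelta(hours=5)
    if '年' in publishtime and '月' in publishtime:
        # absolute dates such as "2023年10月9日"
        m = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})', publishtime)
        if m:
            year, month, day = map(int, m.groups())
            return datetime.datetime(year, month, day)
    # unrecognized format: fall back to the crawl time
    return now

print(normalize_publishtime('3小时前'))
```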
@@ -265,7 +265,7 @@ class SougouSpider(object):
         wait = WebDriverWait(self.driver, 30)
         wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
         time.sleep(3)
-        self.logger.info("开始抓取首页...")
+        self.logger.info(f"{self.searchkw}开始抓取首页...")
         try:
             flag, lists = self.parse_page()
             if len(lists)<1:
@@ -275,10 +275,15 @@ class SougouSpider(object):
                 return
             if len(lists)==0:
                 time.sleep(5)
+            repeatCounts = 0
             for detail in lists:
                 durl=detail['detailUrl']
                 is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
                 if is_member:
+                    repeatCounts += 1
+                    if repeatCounts / len(lists) > 0.5:
+                        self.logger.info(f"{self.searchkw}首页已存在50%以上,结束抓取")
+                        return
                     continue
                 self.detailList.put(detail)
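The added `repeatCounts` logic makes the first-page crawl incremental: every detail URL is checked against the per-keyword Redis set (`'pysougou_' + self.wordsCode`), and once more than half of the links on the page are already known the crawl stops instead of paging on. A rough sketch of that pattern, assuming a local Redis instance and a plain `queue.Queue` standing in for `detailList`:

```python
from queue import Queue

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def enqueue_new_details(lists, words_code, detail_queue):
    """Queue unseen URLs; signal an early stop when >50% are repeats."""
    repeat_counts = 0
    for detail in lists:
        durl = detail['detailUrl']
        if r.sismember('pysougou_' + words_code, durl):
            repeat_counts += 1
            if repeat_counts / len(lists) > 0.5:
                return False  # mostly old links: caller should stop crawling
            continue
        detail_queue.put(detail)
    return True
```

Returning a flag is just for the sketch; the commit itself returns directly from inside `get_page_html`.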
@@ -289,10 +294,10 @@ class SougouSpider(object):
         timeFlag=False
         while hasnext == '下一页':
             try:
-                if self.page_num ==21:
+                if self.page_num == 5:
                     break
                 self.page_num = self.page_num + 1
-                self.logger.info("开始抓取第%s页..." % self.page_num)
+                self.logger.info(f"{self.searchkw}开始抓取第{self.page_num}页...")
                 try:
                     self.driver.find_element(By.XPATH, '//a[@id="sogou_next"]').click()
                 except Exception as e:
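This hunk tightens the paging loop: the hard cap drops from 21 to 5 pages, and each log line now carries the keyword. The loop itself keeps clicking the `//a[@id="sogou_next"]` link until it disappears. A condensed Selenium sketch of that loop (the driver construction, the `parse_page` callback, and the sleep lengths are assumptions; the locator and the 5-page cap come from the diff):

```python
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

MAX_PAGES = 5  # cap lowered from 21 to 5 in this commit

def crawl_result_pages(driver, parse_page):
    page_num = 1
    while page_num < MAX_PAGES:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body')))
        parse_page()  # extract the current result list
        try:
            # follow Sogou's "下一页" link; stop when it is gone
            driver.find_element(By.XPATH, '//a[@id="sogou_next"]').click()
        except Exception:
            break
        page_num += 1
        time.sleep(3)
```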
@@ -302,6 +307,7 @@ class SougouSpider(object):
                 flag, lists = self.parse_page()
                 if len(lists)<1:
                     break
+                repeated_counts = 0
                 for detail in lists:
                     publishTag=detail['publishTag']
                     # if publishTag:
@@ -314,6 +320,11 @@ class SougouSpider(object):
                     durl = detail['detailUrl']
                     is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
                     if is_member:
+                        self.logger.info(f"{self.searchkw}已存在{detail['title']}")
+                        repeated_counts += 1
+                        if repeated_counts / len(lists) > 0.5:
+                            self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
+                            return
                         continue
                     self.detailList.put(detail)
                     if timeFlag:
@@ -328,7 +339,7 @@ class SougouSpider(object):
             except Exception as e:
                 time.sleep(5)
                 break
-        self.logger.info("抓取完毕")
+        self.logger.info(f"{self.searchkw}列表抓取完毕")
     #获取资讯内容信息
@@ -423,7 +434,7 @@ class SougouSpider(object):
             detailmsg=self.detailList.get()
             title = detailmsg['title']
             detailUrl = detailmsg['detailUrl']
-            self.logger.info("%s:%s\n" % (title, detailUrl))
+            self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
             try:
                 # # js = "window.open('"+detailUrl+"')"
                 # # self.driver.execute_script(js)
......
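`detailList` is the hand-off between the two crawl phases: `get_page_html` `put`s each new result dict onto the queue, and `get_detail_html` (the hunk above) `get`s them back off and logs which detail page it is about to parse. A toy illustration of that producer/consumer hand-off with `queue.Queue` (the sample title and URL are placeholders):

```python
from queue import Queue

detail_list = Queue()

# producer side: the list pages push one dict per new result
detail_list.put({'title': '示例标题', 'detailUrl': 'https://example.com/news/1'})

# consumer side: drain the queue once the list crawl has finished
while detail_list.qsize() != 0:
    detailmsg = detail_list.get()
    print("%s:%s开始解析详情数据" % (detailmsg['title'], detailmsg['detailUrl']))
```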
@@ -169,8 +169,8 @@ class SougouTaskJob(object):
             sougouSpider.get_page_html()
         except Exception as e:
             logger.info('搜狗搜索异常'+searchkw)
-        finally:
-            sougouSpider.driver.quit()
+        # finally:
+        #     sougouSpider.driver.quit()
         if sougouSpider.detailList.qsize() != 0:
             try:
                 sougouSpider.get_detail_html()
......
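Commenting out the `finally: sougouSpider.driver.quit()` keeps the WebDriver alive after the list crawl so the same browser session can be reused by `get_detail_html`; the trade-off is that nothing now guarantees the driver is closed if the detail phase raises. One possible alternative is a small context manager that quits only after both phases, sketched below (the wiring into `SougouSpider` in the usage comment is hypothetical):

```python
from contextlib import contextmanager

from selenium import webdriver

@contextmanager
def shared_driver():
    """Yield one Chrome driver and guarantee quit() after both crawl phases."""
    driver = webdriver.Chrome()
    try:
        yield driver
    finally:
        driver.quit()

# Usage sketch (hypothetical wiring into the task job):
# with shared_driver() as driver:
#     sougouSpider.driver = driver
#     sougouSpider.get_page_html()
#     if sougouSpider.detailList.qsize() != 0:
#         sougouSpider.get_detail_html()
```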