提交 9b2d7df4 作者: 薛凌堃

谷歌搜索维护

上级 17a85405
...@@ -237,7 +237,7 @@ class GoogleSpider(object): ...@@ -237,7 +237,7 @@ class GoogleSpider(object):
current_datetime = datetime.datetime.now() current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1) delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime : elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5) delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime : elif '年' in publishtime and '月' in publishtime :
...@@ -288,11 +288,15 @@ class GoogleSpider(object): ...@@ -288,11 +288,15 @@ class GoogleSpider(object):
flag, lists = self.parse_page() flag, lists = self.parse_page()
if len(lists)<1: if len(lists)<1:
time.sleep(6) time.sleep(6)
repeatCounts = 0
for detail in lists: for detail in lists:
durl=detail['detailUrl'] durl=detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl) is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member: if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}") repeatCounts += 1
if repeatCounts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}首页已存在50%以上,结束抓取")
return
continue continue
self.detailList.put(detail) self.detailList.put(detail)
...@@ -305,8 +309,8 @@ class GoogleSpider(object): ...@@ -305,8 +309,8 @@ class GoogleSpider(object):
hasnext = '' hasnext = ''
timeFlag = False timeFlag = False
while hasnext == '下一页': while hasnext == '下一页':
# if self.page_num==5: if self.page_num==5:
# break break
self.page_num = self.page_num + 1 self.page_num = self.page_num + 1
self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...") self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
try: try:
...@@ -315,11 +319,16 @@ class GoogleSpider(object): ...@@ -315,11 +319,16 @@ class GoogleSpider(object):
break break
time.sleep(5) time.sleep(5)
flag, lists = self.parse_page() flag, lists = self.parse_page()
repeated_counts = 0
for detail in lists: for detail in lists:
durl = detail['detailUrl'] durl = detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl) is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member: if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}") self.logger.info(f"{self.searchkw}已存在{detail['title']}")
repeated_counts += 1
if repeated_counts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
return
continue continue
publishTag=detail['publishTag'] publishTag=detail['publishTag']
# if publishTag: # if publishTag:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论