提交 9b2d7df4 作者: 薛凌堃

谷歌搜索维护

上级 17a85405
......@@ -237,7 +237,7 @@ class GoogleSpider(object):
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days= 1)
publishtime = current_datetime - delta
elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime :
delta = datetime.timedelta(hours= 5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime :
......@@ -288,11 +288,15 @@ class GoogleSpider(object):
flag, lists = self.parse_page()
if len(lists)<1:
time.sleep(6)
repeatCounts = 0
for detail in lists:
durl=detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}")
repeatCounts += 1
if repeatCounts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}首页已存在50%以上,结束抓取")
return
continue
self.detailList.put(detail)
......@@ -305,8 +309,8 @@ class GoogleSpider(object):
hasnext = ''
timeFlag = False
while hasnext == '下一页':
# if self.page_num==5:
# break
if self.page_num==5:
break
self.page_num = self.page_num + 1
self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
try:
......@@ -315,11 +319,16 @@ class GoogleSpider(object):
break
time.sleep(5)
flag, lists = self.parse_page()
repeated_counts = 0
for detail in lists:
durl = detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}")
repeated_counts += 1
if repeated_counts / len(lists) > 0.5:
self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
return
continue
publishTag=detail['publishTag']
# if publishTag:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论