提交 845ae514 作者: 刘伟刚

百度采集修改3

上级 ed571f70
#coding=utf-8 #coding=utf-8
...@@ -266,13 +266,13 @@ class BaiduSpider(object): ...@@ -266,13 +266,13 @@ class BaiduSpider(object):
break break
for detail in lists: for detail in lists:
publishTag=detail['publishTag'] publishTag=detail['publishTag']
if publishTag: # if publishTag:
pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S") # pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
needDate='2022-01-01 00:00:00' # needDate='2022-01-01 00:00:00'
needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S") # needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
if pubtime < needTime: # if pubtime < needTime:
timeFlag = True # timeFlag = True
break # break
is_member = self.r.sismember('pybaidu_baidu_'+self.wordsCode, durl) is_member = self.r.sismember('pybaidu_baidu_'+self.wordsCode, durl)
if is_member: if is_member:
continue continue
...@@ -398,7 +398,7 @@ class BaiduSpider(object): ...@@ -398,7 +398,7 @@ class BaiduSpider(object):
processitem=self.getProcessitem(bdetail) processitem=self.getProcessitem(bdetail)
try: try:
self.sendkafka(processitem) self.sendkafka(processitem)
self.r.sadd('pybaidu_test_'+self.wordsCode, processitem['sourceAddress']) self.r.sadd('pybaidu_baidu_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e: except Exception as e:
self.logger.info("放入kafka失败!") self.logger.info("放入kafka失败!")
#插入数据库 #插入数据库
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -190,29 +190,7 @@ if __name__ == '__main__': ...@@ -190,29 +190,7 @@ if __name__ == '__main__':
while True: while True:
try: try:
codeList=[] codeList=[]
codeList.append('KW-20221114-0007') codeList.append('KW-20230818-0003')
codeList.append('KW-20221114-0006')
codeList.append('KW-20221114-0005')
codeList.append('KW-20221114-0009')
codeList.append('KW-20221114-0011')
codeList.append('KW-20221114-0012')
codeList.append('KW-20221114-0013')
codeList.append('KW-20221114-0014')
codeList.append('KW-20221114-0018')
codeList.append('KW-20221213-0006')
codeList.append('KW-20221114-0008')
codeList.append('KW-20221114-0015')
codeList.append('KW-20221114-0016')
codeList.append('KW-20221114-0017')
codeList.append('KW-20221114-0019')
codeList.append('KW-20221114-0022')
codeList.append('KW-20221114-0023')
codeList.append('KW-20221114-0024')
codeList.append('KW-20221114-0025')
codeList.append('KW-20221114-0026')
codeList.append('KW-20221114-0027')
codeList.append('KW-20221114-0020')
codeList.append('KW-20221114-0021')
for codeid in codeList: for codeid in codeList:
try: try:
# keymsg=baiduTaskJob.getkafka() # keymsg=baiduTaskJob.getkafka()
......
...@@ -12,6 +12,9 @@ pip install tqdm -i https://pypi.douban.com/simple ...@@ -12,6 +12,9 @@ pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/ pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0 selenium==3.141.0
selenium-wire==5.1.0 selenium-wire==5.1.0
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论