提交 845ae514 作者: 刘伟刚

百度采集修改3

上级 ed571f70
#coding=utf-8
#coding=utf-8
......@@ -266,13 +266,13 @@ class BaiduSpider(object):
break
for detail in lists:
publishTag=detail['publishTag']
if publishTag:
pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
needDate='2022-01-01 00:00:00'
needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
if pubtime < needTime:
timeFlag = True
break
# if publishTag:
# pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
# needDate='2022-01-01 00:00:00'
# needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
# if pubtime < needTime:
# timeFlag = True
# break
is_member = self.r.sismember('pybaidu_baidu_'+self.wordsCode, durl)
if is_member:
continue
......@@ -398,7 +398,7 @@ class BaiduSpider(object):
processitem=self.getProcessitem(bdetail)
try:
self.sendkafka(processitem)
self.r.sadd('pybaidu_test_'+self.wordsCode, processitem['sourceAddress'])
self.r.sadd('pybaidu_baidu_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -190,29 +190,7 @@ if __name__ == '__main__':
while True:
try:
codeList=[]
codeList.append('KW-20221114-0007')
codeList.append('KW-20221114-0006')
codeList.append('KW-20221114-0005')
codeList.append('KW-20221114-0009')
codeList.append('KW-20221114-0011')
codeList.append('KW-20221114-0012')
codeList.append('KW-20221114-0013')
codeList.append('KW-20221114-0014')
codeList.append('KW-20221114-0018')
codeList.append('KW-20221213-0006')
codeList.append('KW-20221114-0008')
codeList.append('KW-20221114-0015')
codeList.append('KW-20221114-0016')
codeList.append('KW-20221114-0017')
codeList.append('KW-20221114-0019')
codeList.append('KW-20221114-0022')
codeList.append('KW-20221114-0023')
codeList.append('KW-20221114-0024')
codeList.append('KW-20221114-0025')
codeList.append('KW-20221114-0026')
codeList.append('KW-20221114-0027')
codeList.append('KW-20221114-0020')
codeList.append('KW-20221114-0021')
codeList.append('KW-20230818-0003')
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
......
......@@ -12,6 +12,9 @@ pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
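pip install selenium==3.141.0 -i https://mirrors.aliyun.com/pypi/simple
pip install selenium-wire==5.1.0 -i https://mirrors.aliyun.com/pypi/simple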
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论