Commit 66f88aa6 Author: 刘伟刚

Baidu collection modification 2

Parent 4a8ab091
#coding=utf-8
@@ -251,8 +251,8 @@ class BaiduSpider(object):
        timeFlag=False
        while hasnext == '下一页 >':
            try:
                # if self.page_num==2:
                #     break
                if self.page_num==3:
                    break
                self.page_num = self.page_num + 1
                self.logger.info("开始抓取第%s页..." % self.page_num)
                try:
......
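The hunk above now stops paging after the third result page. Below is a minimal sketch of that capped-pagination pattern; MAX_PAGES, get_next_button_text() and crawl_current_page() are hypothetical stand-ins, not the spider's real methods.

# Sketch of the capped pagination loop (hypothetical helpers, not the project's real API).
MAX_PAGES = 3  # stop after this many result pages, matching the change above

def crawl_result_pages(spider):
    page_num = 1
    hasnext = spider.get_next_button_text()      # hypothetical: text of the "next page" link
    while hasnext == '下一页 >':
        if page_num >= MAX_PAGES:
            break
        page_num += 1
        spider.logger.info("开始抓取第%s页..." % page_num)
        spider.crawl_current_page()              # hypothetical: parse the current result page
        hasnext = spider.get_next_button_text()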
# -*- coding: utf-8 -*-
@@ -102,7 +102,7 @@ class BaiduTaskJob(object):
            for kw in res:
                kwstr+=kw+"+"
            kwList.append(kwstr.strip('+'))
        else:
        elif '|' in keywords:
            k3=keywords.split("|")
            kwList=k3
        return kwList
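For context, here is a self-contained sketch consistent with the fragment above: '+' joins terms into one combined query, while '|' separates alternative queries. The real getkeywords also appears to expand parenthesised groups (see the sample expression commented out in the __main__ block further down), which this simplified sketch omits.

def split_keywords(keywords):
    # Simplified stand-in for getkeywords: handles only the plain '+' and '|' cases.
    kwList = []
    if '+' in keywords and '|' not in keywords:
        kwList.append(keywords)           # '+'-combined terms stay together as one query
    elif '|' in keywords:
        kwList = keywords.split("|")      # each alternative becomes its own query
    else:
        kwList.append(keywords)
    return kwList

print(split_keywords("西洋参+电商"))        # ['西洋参+电商']
print(split_keywords("交易市场|网店"))      # ['交易市场', '网店']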
@@ -129,16 +129,17 @@ class BaiduTaskJob(object):
                    }
                    kwList.append(kwmsg)
            else:
                logger.info('+++++')
                keyword=keymsg['keyWord']
                keymsglist=self.getkeywords(keyword)
                for kw in keymsglist:
                    kwmsg={
                        'kw':kw,
                        'wordsCode':wordsCode,
                        'sid':id
                    }
                    kwList.append(kwmsg)
                pass
                # logger.info('+++++')
                # keyword=keymsg['keyWord']
                # keymsglist=self.getkeywords(keyword)
                # for kw in keymsglist:
                #     kwmsg={
                #         'kw':kw,
                #         'wordsCode':wordsCode,
                #         'sid':id
                #     }
                #     kwList.append(kwmsg)
        return kwList
@@ -178,7 +179,7 @@ if __name__ == '__main__':
                    continue
                if kwList:
                    # Create a thread pool with the specified number of worker threads
                    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                        # Submit one task per keyword item to the pool
                        results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                        # Collect the results of the submitted tasks
......
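The result-handling code is cut off by the diff above. A common way to drain such a futures list, shown here only as a sketch (logger and results come from the surrounding script), is concurrent.futures.as_completed:

import concurrent.futures

# Sketch: wait for each submitted runSpider task and log any failure.
for future in concurrent.futures.as_completed(results):
    try:
        future.result()              # re-raises any exception thrown inside runSpider
    except Exception as e:
        logger.info("runSpider task failed: %s" % e)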
# -*- coding: utf-8 -*-
@@ -143,21 +143,41 @@ class BaiduTaskJob(object):
        return kwList

    # def runSpider(self,kwmsg):
    #     try:
    #         searchkw=kwmsg['kw']
    #         wordsCode=kwmsg['wordsCode']
    #         sid=kwmsg['sid']
    #
    #         baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
    #         baiduSpider.get_page_html()
    #         baiduSpider.get_detail_html()
    #     except Exception as e:
    #         logger.info('百度搜索异常'+searchkw)
    #     finally:
    #         baiduSpider.driver.quit()
    #         logger.info("关键词采集结束!"+searchkw)

    def runSpider(self,kwmsg):
        try:
            searchkw=kwmsg['kw']
            wordsCode=kwmsg['wordsCode']
            sid=kwmsg['sid']
            baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
            try:
                baiduSpider.get_page_html()
                baiduSpider.get_detail_html()
            except Exception as e:
                logger.info('百度搜索异常'+searchkw)
            finally:
                baiduSpider.driver.quit()
            # if items remain in the detail queue, try detail parsing once more
            if baiduSpider.detailList.qsize() != 0:
                try:
                    baiduSpider.get_detail_html()
                except Exception as e:
                    logger.info('详情解析异常'+searchkw)
                finally:
                    baiduSpider.driver.quit()
        finally:
            logger.info("关键词采集结束!"+searchkw)

if __name__ == '__main__':
    # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
    # keymsglist=getkeywords(ss)
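The new runSpider above can also be exercised outside the thread pool with a hand-built task dict; the keys mirror the kwmsg entries built earlier in this file. A usage sketch with placeholder values, assuming baiduTaskJob is the BaiduTaskJob instance already created in the __main__ block:

# Usage sketch: run a single keyword task directly (all values are placeholders).
kwmsg = {
    'kw': '西洋参+电商',               # a search expression as produced by getkeywords
    'wordsCode': 'KW-20221114-0007',   # a wordsCode like those in the scheduler list below
    'sid': '1',                        # placeholder task id
}
baiduTaskJob.runSpider(kwmsg)          # baiduTaskJob: the instance used in __main__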
@@ -179,7 +199,7 @@ if __name__ == '__main__':
                    continue
                if kwList:
                    # Create a thread pool with the specified number of worker threads
                    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                        # Submit one task per keyword item to the pool
                        results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                        # Collect the results of the submitted tasks
......
# -*- coding: utf-8 -*-
@@ -190,12 +190,29 @@ if __name__ == '__main__':
    while True:
        try:
            codeList=[]
            codeList.append('KW-20230812-0027')
            codeList.append('KW-20230812-0028')
            codeList.append('KW-20230812-0029')
            codeList.append('KW-20230812-0030')
            codeList.append('KW-20230812-0031')
            codeList.append('KW-20230812-0032')
            codeList.append('KW-20221114-0007')
            codeList.append('KW-20221114-0006')
            codeList.append('KW-20221114-0005')
            codeList.append('KW-20221114-0009')
            codeList.append('KW-20221114-0011')
            codeList.append('KW-20221114-0012')
            codeList.append('KW-20221114-0013')
            codeList.append('KW-20221114-0014')
            codeList.append('KW-20221114-0018')
            codeList.append('KW-20221213-0006')
            codeList.append('KW-20221114-0008')
            codeList.append('KW-20221114-0015')
            codeList.append('KW-20221114-0016')
            codeList.append('KW-20221114-0017')
            codeList.append('KW-20221114-0019')
            codeList.append('KW-20221114-0022')
            codeList.append('KW-20221114-0023')
            codeList.append('KW-20221114-0024')
            codeList.append('KW-20221114-0025')
            codeList.append('KW-20221114-0026')
            codeList.append('KW-20221114-0027')
            codeList.append('KW-20221114-0020')
            codeList.append('KW-20221114-0021')
            for codeid in codeList:
                try:
                    # keymsg=baiduTaskJob.getkafka()
@@ -204,7 +221,7 @@ if __name__ == '__main__':
                    # Randomly pick a few keyword entries from the list
                    if len(kwList)<1:
                        continue
                    kwList = random.sample(kwList, 4)
                    # kwList = random.sample(kwList, 4)
                    logger.info(f"需要搜索的关键词:{kwList}")
                except Exception as e:
                    logger.info("从kafka拿取信息失败!")
@@ -212,7 +229,7 @@ if __name__ == '__main__':
                    continue
                if kwList:
                    # Create a thread pool with the specified number of worker threads
                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                        # Submit one task per keyword item to the pool
                        results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                        # Collect the results of the submitted tasks
......
[redis]
@@ -13,7 +13,7 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu_test
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
......
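Presumably the crawler scripts load these settings at startup. A minimal sketch of reading them with the standard library configparser; the file name config.ini is an assumption:

import configparser

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')                    # assumed file name

bootstrap_servers = config.get('kafka', 'bootstrap_servers')   # 114.115.159.144:9092
topic = config.get('kafka', 'topic')                           # keyWordsInfo
group_id = config.get('kafka', 'groupId')                      # python_baidu
chrome_driver = config.get('selenium', 'chrome_driver')        # path to chromedriver.exe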
Servers where the Baidu collector is deployed
@@ -3,7 +3,6 @@
114.115.235.92
114.116.122.247
114.115.153.6
114.116.122.247
192.168.1.150
......