提交 66f88aa6 作者: 刘伟刚

百度采集修改2

上级 4a8ab091
#coding=utf-8 #coding=utf-8
...@@ -251,8 +251,8 @@ class BaiduSpider(object): ...@@ -251,8 +251,8 @@ class BaiduSpider(object):
timeFlag=False timeFlag=False
while hasnext == '下一页 >': while hasnext == '下一页 >':
try: try:
# if self.page_num==2: if self.page_num==3:
# break break
self.page_num = self.page_num + 1 self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num) self.logger.info("开始抓取第%s页..." % self.page_num)
try: try:
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -102,7 +102,7 @@ class BaiduTaskJob(object): ...@@ -102,7 +102,7 @@ class BaiduTaskJob(object):
for kw in res: for kw in res:
kwstr+=kw+"+" kwstr+=kw+"+"
kwList.append(kwstr.strip('+')) kwList.append(kwstr.strip('+'))
else: elif '|' in keywords:
k3=keywords.split("|") k3=keywords.split("|")
kwList=k3 kwList=k3
return kwList return kwList
...@@ -129,16 +129,17 @@ class BaiduTaskJob(object): ...@@ -129,16 +129,17 @@ class BaiduTaskJob(object):
} }
kwList.append(kwmsg) kwList.append(kwmsg)
else: else:
logger.info('+++++') pass
keyword=keymsg['keyWord'] # logger.info('+++++')
keymsglist=self.getkeywords(keyword) # keyword=keymsg['keyWord']
for kw in keymsglist: # keymsglist=self.getkeywords(keyword)
kwmsg={ # for kw in keymsglist:
'kw':kw, # kwmsg={
'wordsCode':wordsCode, # 'kw':kw,
'sid':id # 'wordsCode':wordsCode,
} # 'sid':id
kwList.append(kwmsg) # }
# kwList.append(kwmsg)
return kwList return kwList
...@@ -178,7 +179,7 @@ if __name__ == '__main__': ...@@ -178,7 +179,7 @@ if __name__ == '__main__':
continue continue
if kwList: if kwList:
# 创建一个线程池,指定线程数量为4 # 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据 # 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList] results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果 # 获取任务的执行结果
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -143,21 +143,41 @@ class BaiduTaskJob(object): ...@@ -143,21 +143,41 @@ class BaiduTaskJob(object):
return kwList return kwList
# def runSpider(self,kwmsg):
# try:
# searchkw=kwmsg['kw']
# wordsCode=kwmsg['wordsCode']
# sid=kwmsg['sid']
#
# baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
# baiduSpider.get_page_html()
# baiduSpider.get_detail_html()
# except Exception as e:
# logger.info('百度搜索异常'+searchkw)
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def runSpider(self,kwmsg): def runSpider(self,kwmsg):
try:
searchkw=kwmsg['kw'] searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode'] wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid'] sid=kwmsg['sid']
baiduSpider=BaiduSpider(searchkw,wordsCode,sid) baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
try:
baiduSpider.get_page_html() baiduSpider.get_page_html()
baiduSpider.get_detail_html()
except Exception as e: except Exception as e:
logger.info('百度搜索异常'+searchkw) logger.info('百度搜索异常'+searchkw)
finally: finally:
baiduSpider.driver.quit() baiduSpider.driver.quit()
if baiduSpider.detailList.qsize() != 0:
try:
baiduSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
finally:
baiduSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw) logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__': if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)' # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss) # keymsglist=getkeywords(ss)
...@@ -179,7 +199,7 @@ if __name__ == '__main__': ...@@ -179,7 +199,7 @@ if __name__ == '__main__':
continue continue
if kwList: if kwList:
# 创建一个线程池,指定线程数量为4 # 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
# 提交任务给线程池,每个任务处理一个数据 # 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList] results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果 # 获取任务的执行结果
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -190,12 +190,29 @@ if __name__ == '__main__': ...@@ -190,12 +190,29 @@ if __name__ == '__main__':
while True: while True:
try: try:
codeList=[] codeList=[]
codeList.append('KW-20230812-0027') codeList.append('KW-20221114-0007')
codeList.append('KW-20230812-0028') codeList.append('KW-20221114-0006')
codeList.append('KW-20230812-0029') codeList.append('KW-20221114-0005')
codeList.append('KW-20230812-0030') codeList.append('KW-20221114-0009')
codeList.append('KW-20230812-0031') codeList.append('KW-20221114-0011')
codeList.append('KW-20230812-0032') codeList.append('KW-20221114-0012')
codeList.append('KW-20221114-0013')
codeList.append('KW-20221114-0014')
codeList.append('KW-20221114-0018')
codeList.append('KW-20221213-0006')
codeList.append('KW-20221114-0008')
codeList.append('KW-20221114-0015')
codeList.append('KW-20221114-0016')
codeList.append('KW-20221114-0017')
codeList.append('KW-20221114-0019')
codeList.append('KW-20221114-0022')
codeList.append('KW-20221114-0023')
codeList.append('KW-20221114-0024')
codeList.append('KW-20221114-0025')
codeList.append('KW-20221114-0026')
codeList.append('KW-20221114-0027')
codeList.append('KW-20221114-0020')
codeList.append('KW-20221114-0021')
for codeid in codeList: for codeid in codeList:
try: try:
# keymsg=baiduTaskJob.getkafka() # keymsg=baiduTaskJob.getkafka()
...@@ -204,7 +221,7 @@ if __name__ == '__main__': ...@@ -204,7 +221,7 @@ if __name__ == '__main__':
# 从列表中随机选择5个数据 # 从列表中随机选择5个数据
if len(kwList)<1: if len(kwList)<1:
continue continue
kwList = random.sample(kwList, 4) # kwList = random.sample(kwList, 4)
logger.info(f"需要搜索的关键词:{kwList}") logger.info(f"需要搜索的关键词:{kwList}")
except Exception as e: except Exception as e:
logger.info("从kafka拿取信息失败!") logger.info("从kafka拿取信息失败!")
...@@ -212,7 +229,7 @@ if __name__ == '__main__': ...@@ -212,7 +229,7 @@ if __name__ == '__main__':
continue continue
if kwList: if kwList:
# 创建一个线程池,指定线程数量为4 # 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
# 提交任务给线程池,每个任务处理一个数据 # 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList] results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果 # 获取任务的执行结果
......
[redis] [redis]
...@@ -13,7 +13,7 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut ...@@ -13,7 +13,7 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka] [kafka]
bootstrap_servers=114.115.159.144:9092 bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo topic=keyWordsInfo
groupId=python_baidu_test groupId=python_baidu
[selenium] [selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
......
百度采集部署的服务器 百度采集部署的服务器
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
114.115.235.92 114.115.235.92
114.116.122.247 114.116.122.247
114.115.153.6 114.115.153.6
114.116.122.247
192.168.1.150 192.168.1.150
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论