王景浩 / zzsn_spider / Commits

Commit 66f88aa6
authored Aug 16, 2023 by 刘伟刚
百度采集修改2 (Baidu collection update 2)

Parent: 4a8ab091
Showing 6 changed files with 72 additions and 35 deletions.
百度采集/baidu_comm/baiduSpider.py           +3  -3
百度采集/baidu_comm/baidutaskJob.py          +14 -13
百度采集/baidu_comm/baidutaskJob_comm.py     +26 -6
百度采集/baidu_comm/baidutaskJob_loc.py      +26 -9
百度采集/baidu_comm/config.ini               +2  -2
百度采集/baidu_comm/百度采集说明              +1  -2
百度采集/baidu_comm/baiduSpider.py

#coding=utf-8
...
@@ -251,8 +251,8 @@ class BaiduSpider(object):
        timeFlag = False
        while hasnext == '下一页 >':
            try:
-               # if self.page_num==2:
-               #     break
+               if self.page_num == 3:
+                   break
                self.page_num = self.page_num + 1
                self.logger.info("开始抓取第%s页..." % self.page_num)
                try:
...
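The change above caps the result-page loop at three pages (the commented-out check at page 2 is replaced by an active check at page 3). As a standalone illustration of that page-cap pattern, independent of the Selenium details, here is a minimal sketch; fetch_page and has_next are hypothetical placeholders, not names from the repository:

# Minimal sketch of a page-capped pagination loop.
# fetch_page(n) and has_next(n) stand in for the real Selenium page logic.
MAX_PAGES = 3

def crawl(fetch_page, has_next):
    page_num = 1
    results = []
    while True:
        results.extend(fetch_page(page_num))
        if page_num >= MAX_PAGES or not has_next(page_num):
            break  # stop once the cap is reached, as in the diff above
        page_num += 1
    return results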
百度采集/baidu_comm/baidutaskJob.py

# -*- coding: utf-8 -*-
...
@@ -102,7 +102,7 @@ class BaiduTaskJob(object):
            for kw in res:
                kwstr += kw + "+"
            kwList.append(kwstr.strip('+'))
-       else:
+       elif '|' in keywords:
            k3 = keywords.split("|")
            kwList = k3
        return kwList
...
@@ -129,16 +129,17 @@ class BaiduTaskJob(object):
                }
                kwList.append(kwmsg)
            else:
-               logger.info('+++++')
-               keyword = keymsg['keyWord']
-               keymsglist = self.getkeywords(keyword)
-               for kw in keymsglist:
-                   kwmsg = {
-                       'kw': kw,
-                       'wordsCode': wordsCode,
-                       'sid': id
-                   }
-                   kwList.append(kwmsg)
+               pass
+               # logger.info('+++++')
+               # keyword=keymsg['keyWord']
+               # keymsglist=self.getkeywords(keyword)
+               # for kw in keymsglist:
+               #     kwmsg={
+               #         'kw':kw,
+               #         'wordsCode':wordsCode,
+               #         'sid':id
+               #     }
+               #     kwList.append(kwmsg)
        return kwList
...
@@ -178,7 +179,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
...
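The elif '|' in keywords branch added above distinguishes two keyword formats: terms joined with '+' are kept together as one combined query, while '|' separates independent alternatives that each become their own query. A standalone sketch of that distinction; split_keywords is a hypothetical stand-in for BaiduTaskJob.getkeywords, which also handles bracketed combinations not shown in this hunk:

# Standalone sketch of the two keyword separators handled above.
# split_keywords is a placeholder name, not the repository's method.
def split_keywords(keywords):
    if '+' in keywords and '|' not in keywords:
        # terms joined with '+' stay together as a single combined query
        return [keywords.strip('+')]
    elif '|' in keywords:
        # '|' separates independent alternatives -> one query per term
        return keywords.split('|')
    return [keywords]

print(split_keywords('道地西洋参+电商'))     # ['道地西洋参+电商']
print(split_keywords('销售市场|交易市场'))   # ['销售市场', '交易市场']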
百度采集/baidu_comm/baidutaskJob_comm.py

# -*- coding: utf-8 -*-
...
@@ -143,21 +143,41 @@ class BaiduTaskJob(object):
        return kwList

+   # def runSpider(self,kwmsg):
+   #     try:
+   #         searchkw=kwmsg['kw']
+   #         wordsCode=kwmsg['wordsCode']
+   #         sid=kwmsg['sid']
+   #
+   #         baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
+   #         baiduSpider.get_page_html()
+   #         baiduSpider.get_detail_html()
+   #     except Exception as e:
+   #         logger.info('百度搜索异常'+searchkw)
+   #     finally:
+   #         baiduSpider.driver.quit()
+   #         logger.info("关键词采集结束!"+searchkw)
    def runSpider(self, kwmsg):
-       try:
        searchkw = kwmsg['kw']
        wordsCode = kwmsg['wordsCode']
        sid = kwmsg['sid']
        baiduSpider = BaiduSpider(searchkw, wordsCode, sid)
+       try:
            baiduSpider.get_page_html()
-       baiduSpider.get_detail_html()
        except Exception as e:
            logger.info('百度搜索异常' + searchkw)
        finally:
            baiduSpider.driver.quit()
+       if baiduSpider.detailList.qsize() != 0:
+           try:
+               baiduSpider.get_detail_html()
+           except Exception as e:
+               logger.info('详情解析异常' + searchkw)
+           finally:
+               baiduSpider.driver.quit()
        logger.info("关键词采集结束!" + searchkw)

if __name__ == '__main__':
    # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
    # keymsglist=getkeywords(ss)
...
@@ -179,7 +199,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
...
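Each __main__ block in this commit submits runSpider over the keyword list through a ThreadPoolExecutor; the result-collection code after the '# 获取任务的执行结果' comment is elided by the '...' markers. A minimal self-contained sketch of that submit-and-collect pattern, with run_task as a placeholder for baiduTaskJob.runSpider:

import concurrent.futures

def run_task(data):
    # placeholder for baiduTaskJob.runSpider(data)
    return f"done: {data}"

kwList = [{'kw': 'A'}, {'kw': 'B'}, {'kw': 'C'}]

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # submit one task per keyword message, as in the diffs above
    futures = [executor.submit(run_task, data) for data in kwList]
    # collect results as each task finishes; an exception raised inside a
    # task resurfaces here when .result() is called
    for future in concurrent.futures.as_completed(futures):
        try:
            print(future.result())
        except Exception as exc:
            print(f"task failed: {exc}")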
百度采集/baidu_comm/baidutaskJob_loc.py

# -*- coding: utf-8 -*-
...
@@ -190,12 +190,29 @@ if __name__ == '__main__':
    while True:
        try:
            codeList = []
-           codeList.append('KW-20230812-0027')
-           codeList.append('KW-20230812-0028')
-           codeList.append('KW-20230812-0029')
-           codeList.append('KW-20230812-0030')
-           codeList.append('KW-20230812-0031')
-           codeList.append('KW-20230812-0032')
+           codeList.append('KW-20221114-0007')
+           codeList.append('KW-20221114-0006')
+           codeList.append('KW-20221114-0005')
+           codeList.append('KW-20221114-0009')
+           codeList.append('KW-20221114-0011')
+           codeList.append('KW-20221114-0012')
+           codeList.append('KW-20221114-0013')
+           codeList.append('KW-20221114-0014')
+           codeList.append('KW-20221114-0018')
+           codeList.append('KW-20221213-0006')
+           codeList.append('KW-20221114-0008')
+           codeList.append('KW-20221114-0015')
+           codeList.append('KW-20221114-0016')
+           codeList.append('KW-20221114-0017')
+           codeList.append('KW-20221114-0019')
+           codeList.append('KW-20221114-0022')
+           codeList.append('KW-20221114-0023')
+           codeList.append('KW-20221114-0024')
+           codeList.append('KW-20221114-0025')
+           codeList.append('KW-20221114-0026')
+           codeList.append('KW-20221114-0027')
+           codeList.append('KW-20221114-0020')
+           codeList.append('KW-20221114-0021')
            for codeid in codeList:
                try:
                    # keymsg=baiduTaskJob.getkafka()
...
@@ -204,7 +221,7 @@ if __name__ == '__main__':
                    # 从列表中随机选择5个数据
                    if len(kwList) < 1:
                        continue
-                   kwList = random.sample(kwList, 4)
+                   # kwList = random.sample(kwList, 4)
                    logger.info(f"需要搜索的关键词:{kwList}")
                except Exception as e:
                    logger.info("从kafka拿取信息失败!")
...
@@ -212,7 +229,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
...
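A side note on the sampling line toggled above: random.sample raises ValueError whenever the requested size exceeds the population, so the len(kwList) < 1 guard alone would not make a fixed sample of 4 safe. A purely illustrative, size-capped variant (not part of the commit):

import random

kwList = ['kw1', 'kw2', 'kw3']  # fewer than 4 items

# random.sample(kwList, 4) would raise ValueError here;
# capping the sample size at the list length avoids that.
sampled = random.sample(kwList, min(4, len(kwList)))
print(sampled)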
百度采集/baidu_comm/config.ini

[redis]
...
@@ -13,7 +13,7 @@ url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=ut
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
-groupId=python_baidu_test
+groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
...
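The [kafka] section above is what the task scripts read to reach the keyWordsInfo topic (the '从kafka拿取信息失败' log line in the diffs refers to this path). A minimal sketch of loading that section and opening a consumer, assuming the kafka-python package; the repository's actual consumer code is not part of this commit:

import configparser
from kafka import KafkaConsumer  # assumes the kafka-python package

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

consumer = KafkaConsumer(
    config['kafka']['topic'],                           # keyWordsInfo
    bootstrap_servers=config['kafka']['bootstrap_servers'],
    group_id=config['kafka']['groupId'],                # python_baidu after this commit
    value_deserializer=lambda m: m.decode('utf-8'),
    auto_offset_reset='earliest',
)

for message in consumer:
    print(message.value)  # one keyword task message per record
    break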
百度采集/baidu_comm/百度采集说明

百度采集部署的服务器
...
@@ -3,7 +3,6 @@
114.115.235.92
114.116.122.247
114.115.153.6
-114.116.122.247
192.168.1.150
...