Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
9d49a0cd
提交
9d49a0cd
authored
5月 20, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
谷歌搜索
上级
252c04d3
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
121 行增加
和
36 行删除
+121
-36
baseCore.py
google_comm/baseCore.py
+48
-2
googleSpider.py
google_comm/googleSpider.py
+0
-0
googletaskJob_loc.py
google_comm/googletaskJob_loc.py
+73
-34
没有找到文件。
google_comm/baseCore.py
浏览文件 @
9d49a0cd
import
datetime
import
os
import
os
import
random
import
random
import
redis
import
sys
import
sys
import
time
import
time
import
logbook
import
logbook
...
@@ -211,12 +213,18 @@ class BaseCore:
...
@@ -211,12 +213,18 @@ class BaseCore:
try
:
try
:
self
.
__cursor_proxy
.
close
()
self
.
__cursor_proxy
.
close
()
self
.
__cnx_proxy
.
close
()
self
.
__cnx_proxy
.
close
()
self
.
cursor_
.
close
()
self
.
cnx_
.
close
()
except
:
except
:
pass
pass
def
__init__
(
self
):
def
__init__
(
self
):
self
.
r
=
redis
.
Redis
(
host
=
"114.116.90.53"
,
port
=
6380
,
password
=
'clbzzsn'
,
db
=
6
)
self
.
__cnx_proxy
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'caiji'
,
password
=
'zzsn9988'
,
db
=
'clb_project'
,
self
.
__cnx_proxy
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'caiji'
,
password
=
'zzsn9988'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
charset
=
'utf8mb4'
)
self
.
__cursor_proxy
=
self
.
__cnx_proxy
.
cursor
()
self
.
__cursor_proxy
=
self
.
__cnx_proxy
.
cursor
()
self
.
cnx_
=
pymysql
.
connect
(
host
=
'114.116.44.11'
,
user
=
'caiji'
,
password
=
'f7s0&7qqtK'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
self
.
cursor_
=
self
.
cnx_
.
cursor
()
pass
pass
# 计算耗时
# 计算耗时
...
@@ -347,4 +355,42 @@ class BaseCore:
...
@@ -347,4 +355,42 @@ class BaseCore:
ip_list
.
append
(
proxy
)
ip_list
.
append
(
proxy
)
return
ip_list
return
ip_list
\ No newline at end of file
# Pop and return one element from a Redis list.
def redicPullData(self, key):
    """Pop the leftmost element of the Redis list ``key``.

    The client held in ``self.r`` is pinged first; if the ping raises,
    ``self.r`` is replaced with a freshly constructed client before the pop.

    Args:
        key: Name of the Redis list to pop from.

    Returns:
        The popped value after calling its ``decode()`` method, or ``None``
        when ``lpop`` returns a falsy value (nothing to pop).
    """
    try:
        self.r.ping()
    except Exception:
        # Previously a bare ``except:``, which also trapped KeyboardInterrupt
        # and SystemExit and made the process hard to stop during a reconnect.
        # NOTE(review): the connection settings are hard-coded and duplicated
        # from the constructor -- consider reading them from configuration.
        self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    item = self.r.lpop(key)
    return item.decode() if item else None
def getSidName(self, sid):
    """Return the ``words_name`` of the ``key_words`` row whose id is ``sid``.

    Args:
        sid: Identifier matched against the ``id`` column. It is sent as a
            string, matching the quoted comparison the query used before.

    Returns:
        The first column of the fetched row, or ``None`` when no row matches.
    """
    # Bind ``sid`` as a query parameter instead of interpolating it into the
    # SQL text, so the driver escapes it and a crafted value cannot alter the
    # statement.
    sqlSelect = "SELECT words_name FROM `key_words` WHERE id = %s"
    self.cursor_.execute(sqlSelect, (str(sid),))
    row = self.cursor_.fetchone()
    if row is None:
        # No matching row: subscripting the result used to raise TypeError.
        return None
    return row[0]
# Get the PID of the running script process.
def getPID(self):
    """Return the id of the current process, as reported by ``os.getpid()``."""
    return os.getpid()
def getUniqueCode(self, abbr, serverId, threadId):
    """Build a unique code string from a Redis-issued time code.

    The result is the concatenation of: ``abbr``, the current local date
    formatted with ``'%Y%m%d'`` minus its first two characters, ``serverId``,
    the process id forced to exactly four characters, ``threadId`` and the
    time code popped from the ``timeCode:google`` Redis list.

    Args:
        abbr: String prefix of the code.
        serverId: String placed after the date part (concatenated as-is).
        threadId: Value placed after the pid part (converted with ``str``).

    Returns:
        The assembled code string.
    """
    # Keep polling until the list yields a value: each blpop call waits up to
    # 2 seconds, followed by a 2 second pause when nothing was returned.
    while True:
        popped = self.r.blpop(['timeCode:google'], 2)
        if popped:
            # Index 1 of the reply is the popped value; it is decoded as UTF-8.
            timeCode = popped[1].decode('utf-8')
            break
        time.sleep(2)
    # Left-pad short pids with zeros and keep only the first four characters
    # of longer ones, so the pid part is always four characters wide.
    pid = str(self.getPID()).zfill(4)[:4]
    date_part = datetime.datetime.now().strftime('%Y%m%d')[2:]
    return abbr + date_part + serverId + pid + str(threadId) + str(timeCode)
\ No newline at end of file
google_comm/googleSpider.py
浏览文件 @
9d49a0cd
差异被折叠。
点击展开。
google_comm/googletaskJob_loc.py
浏览文件 @
9d49a0cd
...
@@ -27,6 +27,9 @@ class GoogleTaskJob(object):
...
@@ -27,6 +27,9 @@ class GoogleTaskJob(object):
self
.
r
=
redis
.
Redis
(
host
=
self
.
config
.
get
(
'redis'
,
'host'
),
self
.
r
=
redis
.
Redis
(
host
=
self
.
config
.
get
(
'redis'
,
'host'
),
port
=
self
.
config
.
get
(
'redis'
,
'port'
),
port
=
self
.
config
.
get
(
'redis'
,
'port'
),
password
=
self
.
config
.
get
(
'redis'
,
'pass'
),
db
=
0
)
password
=
self
.
config
.
get
(
'redis'
,
'pass'
),
db
=
0
)
self
.
r_6
=
redis
.
Redis
(
host
=
self
.
config
.
get
(
'redis'
,
'host'
),
port
=
self
.
config
.
get
(
'redis'
,
'port'
),
password
=
self
.
config
.
get
(
'redis'
,
'pass'
),
db
=
6
)
def
getkafka
(
self
):
def
getkafka
(
self
):
# Kafka集群的地址
# Kafka集群的地址
...
@@ -108,35 +111,36 @@ class GoogleTaskJob(object):
...
@@ -108,35 +111,36 @@ class GoogleTaskJob(object):
def
paserKeyMsg
(
self
,
keymsg
):
def
paserKeyMsg
(
self
,
keymsg
):
num
=
1
logger
.
info
(
'----------'
)
logger
.
info
(
'----------'
)
wordsCode
=
keymsg
[
'wordsCode'
]
wordsCode
=
keymsg
[
'wordsCode'
]
id
=
keymsg
[
'id'
]
id
=
keymsg
[
'id'
]
try
:
keyword
=
keymsg
[
'keyWord'
]
searchEngines
=
keymsg
[
'searchEngines'
]
kwList
=
[]
if
'java.util.ArrayList'
in
searchEngines
:
keymsglist
=
self
.
getkeywords
(
keyword
)
searchEngines
=
searchEngines
[
1
]
except
Exception
as
e
:
for
kw
in
keymsglist
:
searchEngines
=
[]
kwmsg
=
{
kwList
=
[]
'kw'
:
kw
,
if
searchEngines
:
'wordsCode'
:
wordsCode
,
if
'4'
in
searchEngines
:
'sid'
:
id
keyword
=
keymsg
[
'keyWord'
]
}
keymsglist
=
self
.
getkeywords
(
keyword
)
kwList
.
append
((
num
,
kwmsg
))
for
kw
in
keymsglist
:
num
+=
1
kwmsg
=
{
'kw'
:
kw
,
'wordsCode'
:
wordsCode
,
'sid'
:
id
}
kwList
.
append
(
kwmsg
)
return
kwList
return
kwList
def
runSpider
(
self
,
kwmsg
):
def
runSpider
(
self
,
threadId
,
kwmsg
,
item
,
bangdan_name
):
if
'lay'
in
kwmsg
[
'kw'
]:
com_name
=
item
.
split
(
'|'
)[
2
]
else
:
com_name
=
item
.
split
(
'|'
)[
1
]
searchkw
=
com_name
+
' '
+
kwmsg
[
'kw'
]
searchkw
=
kwmsg
[
'kw'
]
print
(
f
'======拼接的关键词是{searchkw}=={com_name}===='
)
wordsCode
=
kwmsg
[
'wordsCode'
]
wordsCode
=
kwmsg
[
'wordsCode'
]
sid
=
kwmsg
[
'sid'
]
sid
=
kwmsg
[
'sid'
]
googleSpider
=
GoogleSpider
(
searchkw
,
wordsCode
,
sid
)
googleSpider
=
GoogleSpider
(
threadId
,
searchkw
,
wordsCode
,
sid
,
item
,
bangdan_name
)
try
:
try
:
googleSpider
.
get_page_html
()
googleSpider
.
get_page_html
()
...
@@ -151,7 +155,28 @@ class GoogleTaskJob(object):
...
@@ -151,7 +155,28 @@ class GoogleTaskJob(object):
finally
:
finally
:
googleSpider
.
driver
.
quit
()
googleSpider
.
driver
.
quit
()
logger
.
info
(
"关键词采集结束!"
+
searchkw
)
logger
.
info
(
"关键词采集结束!"
+
searchkw
)
import
random
def get_comname(self):
    """Pull the next company record from the Redis list of company names.

    Returns:
        The value returned by ``baseCore.redicPullData`` for the key
        ``GOOGLE_KEYWORDS:COMPANY_NAME:2023_500``, or ``None`` (after logging
        a notice) when that value is falsy.
    """
    # TODO: read the company name stored in redis and append it to the keyword
    # Sample record: ZZSN22080900000001|沃尔玛|WMT|1
    company = baseCore.redicPullData('GOOGLE_KEYWORDS:COMPANY_NAME:2023_500')
    # company = 'ZZSN22080900000001|沃尔玛|WMT|1'
    if not company:
        logger.info('====已无企业===')
        return None
    return company
# Pop and return one element from a Redis list.
def redicPullData(key, r):
    """Pop the leftmost element of the Redis list ``key`` using client ``r``.

    ``r`` is pinged first; if the ping raises, a freshly constructed client is
    used for the pop instead (the caller's object is not modified).

    Args:
        key: Name of the Redis list to pop from.
        r: Redis client object providing ``ping`` and ``lpop``.

    Returns:
        The popped value after calling its ``decode()`` method, or ``None``
        when ``lpop`` returns a falsy value (nothing to pop).
    """
    try:
        r.ping()
    except Exception:
        # Previously a bare ``except:``, which also trapped KeyboardInterrupt
        # and SystemExit and made the process hard to stop during a reconnect.
        # NOTE(review): the connection settings are hard-coded -- consider
        # reading them from configuration.
        r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    item = r.lpop(key)
    return item.decode() if item else None
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss)
# keymsglist=getkeywords(ss)
...
@@ -164,14 +189,28 @@ if __name__ == '__main__':
...
@@ -164,14 +189,28 @@ if __name__ == '__main__':
print
(
'---------------'
)
print
(
'---------------'
)
while
True
:
while
True
:
try
:
try
:
codeids
=
[]
# try:
# codeid='KW-20230727-0001'
# googleTaskJob.r.ping()
codeids
.
append
(
'KW-20240318-0001'
)
# except:
for
codeid
in
codeids
:
# googleTaskJob.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# all_keys = 'GOOGLE_KEYWORDS:COMPANY_NAME*'
# keys = googleTaskJob.r.scan_iter(f"{key}*")
# for key in keys:
item
=
googleTaskJob
.
get_comname
()
bangdan_name
=
'2023年世界500强'
if
item
:
pass
else
:
break
codeList
=
[
'KW-20240516-0002'
]
for
codeid
in
codeList
:
try
:
try
:
#
keymsg=baiduTaskJob.getkafka()
#keymsg=baiduTaskJob.getkafka()
keymsg
=
googleTaskJob
.
getkeyFromredis
(
codeid
)
keymsg
=
googleTaskJob
.
getkeyFromredis
(
codeid
)
kwList
=
googleTaskJob
.
paserKeyMsg
(
keymsg
)
kwList
=
googleTaskJob
.
paserKeyMsg
(
keymsg
)
# kwList=reversed(kwList)
# kwList=reversed(kwList)
# 从列表中随机选择5个数据
# 从列表中随机选择5个数据
# kwList = random.sample(kwList, 4)
# kwList = random.sample(kwList, 4)
...
@@ -182,9 +221,9 @@ if __name__ == '__main__':
...
@@ -182,9 +221,9 @@ if __name__ == '__main__':
continue
continue
if
kwList
:
if
kwList
:
# 创建一个线程池,指定线程数量为4
# 创建一个线程池,指定线程数量为4
with
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
1
)
as
executor
:
with
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
4
)
as
executor
:
# 提交任务给线程池,每个任务处理一个数据
# 提交任务给线程池,每个任务处理一个数据
results
=
[
executor
.
submit
(
googleTaskJob
.
runSpider
,
data
)
for
data
in
kwList
]
results
=
[
executor
.
submit
(
googleTaskJob
.
runSpider
,
num
,
data
,
item
,
bangdan_name
)
for
num
,
data
in
kwList
]
# 获取任务的执行结果
# 获取任务的执行结果
for
future
in
concurrent
.
futures
.
as_completed
(
results
):
for
future
in
concurrent
.
futures
.
as_completed
(
results
):
try
:
try
:
...
@@ -195,5 +234,5 @@ if __name__ == '__main__':
...
@@ -195,5 +234,5 @@ if __name__ == '__main__':
# 处理任务执行过程中的异常
# 处理任务执行过程中的异常
logger
.
info
(
f
"任务执行exception: {e}"
)
logger
.
info
(
f
"任务执行exception: {e}"
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
info
(
'采集异常
'
)
logger
.
info
(
f
'采集异常{e}
'
)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论