王景浩 / zzsn_spider · Commits

Commit 9d49a0cd
Authored May 20, 2024 by XveLingKun
Parent: 252c04d3

Showing 3 changed files with 397 additions and 239 deletions:

  google_comm/baseCore.py            +48   -2
  google_comm/googleSpider.py        +276  -203
  google_comm/googletaskJob_loc.py   +73   -34
google_comm/baseCore.py

+import datetime
 import os
 import random
+import redis
 import sys
 import time
 import logbook
 ...
@@ -211,12 +213,18 @@ class BaseCore:
         try:
             self.__cursor_proxy.close()
             self.__cnx_proxy.close()
+            self.cursor_.close()
+            self.cnx_.close()
         except:
             pass

     def __init__(self):
+        self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
         self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
                                            charset='utf8mb4')
         self.__cursor_proxy = self.__cnx_proxy.cursor()
+        self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
+                                    charset='utf8mb4')
+        self.cursor_ = self.cnx_.cursor()
         pass

     # measure elapsed time
     ...
@@ -347,4 +355,42 @@ class BaseCore:
             ip_list.append(proxy)
         return ip_list
-\ No newline at end of file

+    # pop (and remove) one element from a Redis list
+    def redicPullData(self, key):
+        try:
+            self.r.ping()
+        except:
+            self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
+        item = self.r.lpop(key)
+        return item.decode() if item else None
+
+    def getSidName(self, sid):
+        sqlSelect = f"SELECT words_name FROM `key_words` WHERE id = '{sid}'"
+        self.cursor_.execute(sqlSelect)
+        data = self.cursor_.fetchone()[0]
+        return data
+
+    # get the PID of the running script
+    def getPID(self):
+        PID = os.getpid()
+        return PID
+
+    def getUniqueCode(self, abbr, serverId, threadId):
+        while True:
+            timeCode = self.r.blpop(['timeCode:google'], 2)
+            if timeCode:
+                timeCode = timeCode[1]
+                timeCode = timeCode.decode('utf-8')
+                break
+            else:
+                time.sleep(2)
+        pid = str(self.getPID())
+        if len(pid) < 4:
+            pid = pid.zfill(4)
+        elif len(pid) > 4:
+            pid = pid[0:4]
+        uniqueCode = abbr + str(datetime.datetime.now().strftime('%Y%m%d'))[2:] + serverId + pid + str(threadId) + str(timeCode)
+        return uniqueCode
\ No newline at end of file
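Taken together, the new BaseCore helpers build a unique code out of a crawl-type abbreviation, a 6-digit date, a server number, a 4-digit PID, a thread id, and a time code popped from the timeCode:google Redis queue. A standalone sketch of the same composition, with the Redis dependency replaced by a fixed sample value (the 'GG' and '195' arguments mirror a commented call later in this commit; the time code here is made up):

import datetime
import os

def compose_unique_code(abbr, server_id, thread_id, time_code):
    # 4-digit process id: zero-padded if short, truncated if long, as in getUniqueCode
    pid = str(os.getpid())
    pid = pid.zfill(4) if len(pid) < 4 else pid[0:4]
    # date as '%Y%m%d' with the century dropped, e.g. '240520'
    date6 = datetime.datetime.now().strftime('%Y%m%d')[2:]
    return abbr + date6 + server_id + pid + str(thread_id) + str(time_code)

# e.g. 'GG' + '240520' + '195' + '1234' + '3' + '0001'
print(compose_unique_code('GG', '195', 3, '0001'))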
google_comm/googleSpider.py

@@ -2,7 +2,7 @@ from urllib.parse import urljoin
 import langid
 import pymysql
-from gne import GeneralNewsExtractor
 from retry import retry
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
@@ -15,11 +15,12 @@ import threading
 import time
 from lxml import etree
 from queue import Queue
 import re, sys
 import datetime
 import redis
 from kafka import KafkaProducer
 import json
+import uuid
 from baseCore import BaseCore
 import configparser
@@ -31,14 +32,15 @@ import requests
 # extract plain text from HTML
 from bs4 import BeautifulSoup


 class GoogleSpider(object):

-    def __init__(self, searchkw, wordsCode, sid):
+    def __init__(self, threadId, searchkw, wordsCode, sid, item, bangdan_name):
         # create a ConfigParser object
         self.config = configparser.ConfigParser()
         # read the config file
         self.config.read('config.ini')
-        baseCore = BaseCore()
-        self.logger = baseCore.getLogger()
+        self.baseCore = BaseCore()
+        self.logger = self.baseCore.getLogger()
         # self.url = f'https://www.google.com/search?q={searchkw}&tbm=nws&source=lnms&sa=X&ved=2ahUKEwicke6y37OAAxWJGIgKHQWAASUQ0pQJegQIDRAB&biw=1366&bih=372&dpr=1'
         # self.url = f'https://www.google.com.hk/search?q={searchkw}&sca_esv=555819424&tbs=sbd:1&tbm=nws&ei=CezVZPaGCaqC4-EPqZi_oAk&start=90&sa=N&ved=2ahUKEwi2r_qGk9SAAxUqwTgGHSnMD5QQ8tMDegQIAhAU&biw=1366&bih=619&dpr=1'
         self.url = f'https://www.google.com.hk'
@@ -46,12 +48,15 @@ class GoogleSpider(object):
             port=self.config.get('redis', 'port'),
             password=self.config.get('redis', 'pass'), db=0)
         self.page_num = 1
         chrome_driver = self.config.get('selenium', 'chrome_driver')
         self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
         path = Service(chrome_driver)
         chrome_options = webdriver.ChromeOptions()
         chrome_options.binary_location = self.config.get('selenium', 'binary_location')
+        chrome_options.add_argument(rf'user-data-dir=D:\seleniumTmp\baidu{uuid.uuid1()}')
+        chrome_options.add_argument("--disable-component-update")
+        chrome_options.add_argument("--disable-extensions")
         self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
         # driver = webdriver.Chrome(chrome_options=chrome_options)
         self.qtitle = Queue()
         self.qurl = Queue()
 ...
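The new options give every crawler session its own throwaway Chrome profile directory, so parallel instances do not share cookies or profile locks. A minimal standalone sketch of the same idea (the D:\seleniumTmp path comes from the diff; note the diff itself passes the deprecated chrome_options= keyword, while current Selenium takes options=):

import uuid
from selenium import webdriver

options = webdriver.ChromeOptions()
# one unique profile per session via uuid1()
options.add_argument(rf'user-data-dir=D:\seleniumTmp\baidu{uuid.uuid1()}')
options.add_argument("--disable-component-update")
options.add_argument("--disable-extensions")
# driver = webdriver.Chrome(options=options)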
@@ -59,33 +64,41 @@ class GoogleSpider(object):
         self.searchkw = searchkw
         self.wordsCode = wordsCode
         self.sid = sid
+        self.threadId = threadId
+        self.item = item
+        self.bangdan_name = bangdan_name

     def createDriver(self):
         chrome_driver = self.config.get('selenium', 'chrome_driver')
         path = Service(chrome_driver)
         chrome_options = webdriver.ChromeOptions()
         chrome_options.binary_location = self.config.get('selenium', 'binary_location')
         # set a proxy
         # proxy = "127.0.0.1:8080"  # proxy address and port
         # chrome_options.add_argument('--proxy-server=http://' + proxy)
         self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)

     # insert the list rows into the table (comment still names baidu_search_result)
     def itemInsertToTable(self, items):
         itemdata = []
         conx, cursorM = self.connMysql()
+        companyinfo = self.item
+        social_code = str(companyinfo.split('|')[0])
+        ch_name = companyinfo.split('|')[1]
+        en_name = companyinfo.split('|')[2]
+        rank = self.bangdan_name + '|' + str(companyinfo.split('|')[3])
         for item in items:
             nowtime = self.getNowDate()
-            data = (self.sid, self.wordsCode, item['title'], item['detailurl'], item['source'], item['publishtime'], item['content'], item['contentHtml'], '1', item['kword'], nowtime)
+            data = (social_code, en_name, ch_name, rank, item['title'], item['content'], item['detailurl'], item['publishtime'], item['source'], nowtime)
             itemdata.append(data)

-        sql = "INSERT into google_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        sql = "INSERT into Company_layoff (企业信用代码,企业英文名称,企业中文名称,所在榜单排名,标题,内容,链接,发布时间,来源,创建时间) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
         cursorM.executemany(sql, itemdata)
         self.logger.info("数据插入数据库成功!")
         # execute the insert
         conx.commit()
         self.closeSql(conx, cursorM)

     def connMysql(self):
         # create the MySQL connection
 ...
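The company descriptor handed to the spider is a single pipe-delimited string, which itemInsertToTable now splits into credit code, Chinese name, English name, and list rank. A standalone sketch of that split (the sample value 'ZZSN22080900000001|沃尔玛|WMT|1' appears as a comment later in this commit, and bangdan_name comes from the task job):

companyinfo = 'ZZSN22080900000001|沃尔玛|WMT|1'   # credit code | Chinese name | English name | rank
bangdan_name = '2023年世界500强'

social_code = companyinfo.split('|')[0]   # 'ZZSN22080900000001'
ch_name = companyinfo.split('|')[1]       # '沃尔玛'
en_name = companyinfo.split('|')[2]       # 'WMT'
rank = bangdan_name + '|' + companyinfo.split('|')[3]   # '2023年世界500强|1'
print(social_code, ch_name, en_name, rank)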
@@ -95,62 +108,63 @@ class GoogleSpider(object):
             database=self.config.get('mysql', 'database'))
         # create a cursor object
         cursorM = conx.cursor()
         return conx, cursorM

     def closeSql(self, conx, cursorM):
         # close the cursor and the connection
         cursorM.close()
         conx.close()

     # parse the results page
     def parse_page(self):
         try:
             response = self.driver.page_source
             html = etree.HTML(response)
             lists = self.xpath_paser(html)
             flag = html.xpath('//tr[@jsname="TeSSVd"]//a[last()]//@class')[0]
         except:
             lists = []
             flag = ''
         return flag, lists

     def xpath_paser(self, html):
         lists = []
         itemTag = html.xpath('//div[@class="SoaBEf"]')
         for itemTag in itemTag:
             try:
                 title = itemTag.xpath('.//div[@class="n0jPhd ynAwRc MBeuO nDgy9d"]/text()')[0]
                 title = str(title)
             except Exception as e:
                 title = ''
             try:
                 detailUrl = itemTag.xpath('.//a[@class="WlydOe"]/@href')[0]
                 detailUrl = str(detailUrl)
             except Exception as e:
                 detailUrl = ''
             try:
                 sourceTag = itemTag.xpath('.//div[@class="MgUUmf NUnG9d"]//text()')[0]
                 sourceTag = str(sourceTag)
             except Exception as e:
                 print(e)
                 sourceTag = ''
             try:
                 publishTag = itemTag.xpath('.//div[@class="OSrXXb rbYSKb LfVVr"]/span/text()')[0]
                 publishTag = str(publishTag)
                 publishtime = self.paserTime(publishTag)
                 publishTag = publishtime.strftime("%Y-%m-%d %H:%M:%S")
             except Exception as e:
                 publishTag = ''
             detailmsg = {
                 'title': title,
                 'detailUrl': detailUrl,
                 'sourceTag': sourceTag,
                 'publishTag': publishTag
             }
             lists.append(detailmsg)
         return lists

     # get the current time
     def getNowDate(self):
@@ -159,13 +173,13 @@ class GoogleSpider(object):
         currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
         return currentdate

     def webDriver(self, url):
         chrome_driver = self.config.get('selenium', 'chrome_driver')
         path = Service(chrome_driver)
         chrome_options = webdriver.ChromeOptions()
         chrome_options.binary_location = self.config.get('selenium', 'binary_location')
         driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
         html = ''
         try:
             driver.get(url)
             # wait for the page to finish loading
@@ -173,7 +187,7 @@ class GoogleSpider(object):
             driver.refresh()
             wait = WebDriverWait(driver, 20)
             wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
             html = driver.page_source
         except Exception as e:
             self.logger.info('请求失败')
         finally:
 ...
@@ -182,77 +196,81 @@ class GoogleSpider(object):
             return html

     def extractorMsg(self, url, title):
         content = ''
         contentWithTag = ''
         lang = ''
         try:
             lang = self.detect_language(title)
             sm = SmartExtractor(lang)
             article = sm.extract_by_url(url=url)
             content = article.cleaned_text
             contentWithTag = article.text
         except Exception as e:
             try:
                 raw_html = self.webDriver(url)
                 sm = SmartExtractor(lang)
                 article = sm.extract_by_html(raw_html)
                 content = article.cleaned_text
                 contentWithTag = article.text
             except Exception as e:
                 print('抽取失败!!')
         return content, contentWithTag

     def paserTime(self, publishtime):
         timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
         current_datetime = datetime.datetime.now()
         publishtime = publishtime.strip()
         print(publishtime)
         try:
             if '年前' in publishtime:        # N years ago
                 numbers = re.findall(r'\d+', publishtime)
                 day = int(numbers[0])
                 delta = datetime.timedelta(days=365 * day)
                 publishtime = current_datetime - delta
             elif '月前' in publishtime:      # N months ago, approximated as 30 days each
                 numbers = re.findall(r'\d+', publishtime)
                 day = int(numbers[0])
                 delta = datetime.timedelta(days=30 * day)
                 publishtime = current_datetime - delta
             elif '周前' in publishtime:      # N weeks ago
                 numbers = re.findall(r'\d+', publishtime)
                 day = int(numbers[0])
                 delta = datetime.timedelta(weeks=day)
                 publishtime = current_datetime - delta
             elif '天前' in publishtime:      # N days ago
                 numbers = re.findall(r'\d+', publishtime)
                 day = int(numbers[0])
                 delta = datetime.timedelta(days=day)
                 publishtime = current_datetime - delta
             elif '前天' in publishtime:      # the day before yesterday
                 delta = datetime.timedelta(days=2)
                 publishtime = current_datetime - delta
             elif '昨天' in publishtime:      # yesterday
                 current_datetime = datetime.datetime.now()
                 delta = datetime.timedelta(days=1)
                 publishtime = current_datetime - delta
             elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:  # today / hours ago / minutes ago
                 delta = datetime.timedelta(hours=5)
                 publishtime = current_datetime - delta
             elif '年' in publishtime and '月' in publishtime:   # absolute date with a year
                 time_format = '%Y年%m月%d日'
                 publishtime = datetime.datetime.strptime(publishtime, time_format)
             elif '月' in publishtime and '日' in publishtime:   # month and day only: assume the current year
                 current_year = current_datetime.year
                 time_format = '%Y年%m月%d日'
                 publishtime = str(current_year) + '年' + publishtime
                 publishtime = datetime.datetime.strptime(publishtime, time_format)
         except Exception as e:
             print('时间解析异常!!')
         return publishtime

     @retry(tries=3, delay=3)
+    def get_buket_news(self):
+        self.driver.find_element('xpath', '//div[contains(@class, "YmvwI") and contains(text(), "新闻")]').click()
+
+    @retry(tries=3, delay=3)
     # crawl the results page by page
     def get_page_html(self):
         self.logger.info(f"{self.searchkw}...进入google首页...")
 ...
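paserTime normalizes Google's relative Chinese timestamps ('3天前', '昨天', '5月19日', and so on) to datetimes, approximating a month as 30 days and a year as 365. A trimmed standalone version of the same mapping for three common cases (function name and sample inputs are illustrative, not from the repo):

import datetime
import re

def parse_relative_time(text):
    now = datetime.datetime.now()
    text = text.strip()
    if '天前' in text:                 # 'N days ago'
        days = int(re.findall(r'\d+', text)[0])
        return now - datetime.timedelta(days=days)
    if '昨天' in text:                 # 'yesterday'
        return now - datetime.timedelta(days=1)
    if '月' in text and '日' in text:  # 'M月D日': assume the current year
        return datetime.datetime.strptime(f'{now.year}年{text}', '%Y年%m月%d日')
    return now

print(parse_relative_time('3天前'))
print(parse_relative_time('5月19日'))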
@@ -266,33 +284,44 @@ class GoogleSpider(object):
         search_input.send_keys(self.searchkw)
         search_input.submit()
         try:
+            time.sleep(3)
             wait = WebDriverWait(self.driver, 20)
-            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "crJ18e")))
-            try:
-                self.driver.find_element('xpath', '//div[contains(@class, "YmvwI") and contains(text(), "新闻")]').click()
-            except:
-                self.logger.info('点击新闻按钮失效')
-                return
+            time.sleep(3)
+            for i in range(3):
+                try:
+                    self.get_buket_news()
+                    break
+                except Exception as e:
+                    self.logger.info(f'点击新闻按钮失效')
+                    self.driver.refresh()
+                    time.sleep(3)
+                    if i < 3:
+                        continue
+                    else:
+                        return
             time.sleep(3)
             self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
             time.sleep(2)
-            self.driver.find_element('xpath', '//div[@class="hdtb-mn-hd"]/div[text()="按相关性排序"]').click()
+            # self.driver.find_element('xpath', '//div[@class="hdtb-mn-hd"]/div[text()="按相关性排序"]').click()
+            self.driver.find_element('xpath', '//*[@id="tn_1"]/span[3]/g-popup/div[1]/div/div/div[text()="按相关性排序"]').click()
             time.sleep(2)
-            self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
+            # self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
+            self.driver.find_element('xpath', '//*[@id="lb"]/div/g-menu/g-menu-item[2]/div/a[text()="按日期排序"]').click()
         except Exception as e:
-            self.logger.info(f'--{self.searchkw}--点击按钮失效----{e}')
+            self.logger.info(f'--{self.searchkw}--点击按钮失效')
             return
         self.logger.info(f"{self.searchkw}...开始抓取首页...")
         time.sleep(5)
         flag, lists = self.parse_page()
         if len(lists) < 1:
             time.sleep(6)
         repeatCounts = 0
         for detail in lists:
             durl = detail['detailUrl']
             is_member = self.r.sismember('pygoogle_' + self.wordsCode, durl)
             if is_member:
                 repeatCounts += 1
                 if repeatCounts / len(lists) > 0.5:
 ...
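Both the first page and the pagination loop bail out once more than half of the listed URLs are already in the per-wordsCode Redis set (pygoogle_<wordsCode>), which is how a re-run stops early on ground it has covered before. The threshold check in isolation (values are illustrative; the set stands in for r.sismember):

lists = ['https://a.example/1', 'https://a.example/2', 'https://a.example/3', 'https://a.example/4']
seen = {'https://a.example/1', 'https://a.example/2', 'https://a.example/3'}

repeatCounts = sum(1 for durl in lists if durl in seen)
if repeatCounts / len(lists) > 0.5:
    print('more than half already crawled: stop paging')   # 3/4 = 0.75 here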
@@ -310,8 +339,8 @@ class GoogleSpider(object):
         hasnext = ''
         timeFlag = False
         while hasnext == '下一页':
-            if self.page_num == 5:
-                break
+            # if self.page_num == 5:
+            #     break
             self.page_num = self.page_num + 1
             self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
             try:
@@ -323,7 +352,7 @@ class GoogleSpider(object):
                 repeated_counts = 0
                 for detail in lists:
                     durl = detail['detailUrl']
                     is_member = self.r.sismember('pygoogle_' + self.wordsCode, durl)
                     if is_member:
                         self.logger.info(f"{self.searchkw}已存在{detail['title']}")
                         repeated_counts += 1
@@ -331,14 +360,14 @@ class GoogleSpider(object):
                         self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
                             return
                         continue
                     publishTag = detail['publishTag']
-                    # if publishTag:
-                    #     pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
-                    # needDate='2022-01-01 00:00:00'
-                    # needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
-                    # if pubtime < needTime:
-                    #     timeFlag = True
-                    # break
+                    if publishTag:
+                        pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
+                        needDate = '2022-01-01 00:00:00'
+                        needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
+                        if pubtime < needTime:
+                            timeFlag = True
+                            break
                     self.detailList.put(detail)
                 if timeFlag:
                     break
 ...
@@ -349,37 +378,74 @@ class GoogleSpider(object):
                 hasnext = hasnext.strip()
                 self.logger.info(hasnext)
             except Exception as e:
                 hasnext = ''
         self.logger.info(f"{self.searchkw}...列表抓取完毕")

     def getRequest(self, url):
         html = ''
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
         }
         try:
             print(url)
             res = requests.get(url=url, timeout=30)
             if res.status_code == 200:
                 res.encoding = res.apparent_encoding  # use the auto-detected encoding
                 html = res.text
             else:
                 html = ''
             if html == '':
                 for i in range(1, 3):
                     time.sleep(1)
                     html = self.getRequest(url)
         except Exception as e:
             print(e)
         return html

+    def sendMonitor(self, processitem):
+        self.logger.info(processitem['uniqueCode'])
+        sidName = self.baseCore.getSidName(processitem['sid'])
+        monitor = {
+            "title": processitem['title'],                  # title
+            "sourceAddress": processitem['sourceAddress'],  # source link
+            "uniqueCode": processitem['uniqueCode'],        # unique code: crawl type + 6-digit date + server no. + thread no. + custom number
+            "operateType": "DATA_CRAWLER",                  # operation type, fixed value
+            "handlerBody": {
+                "success": True,                            # success/failure state, fixed value
+                "handlerStatus": "CRAWLED"                  # handling status, fixed value
+            },
+            "source": {
+                "sourceId": processitem['sid'],             # source id
+                "sourceName": sidName,                      # source name
+                "sourceType": 4,                            # source type (sourceType enum)
+            },
+            "processTime": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # yyyy-MM-dd HH:mm:ss
+            "server": {
+                "serverIp": "94.74.96.195",                 # server IP
+                "serverHostName": "数据采集服务",            # server name
+                "processId": self.baseCore.getPID()         # process id
+            }
+        }
+        producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers],
+                                 max_request_size=1024 * 1024 * 20, api_version=(2, 7, 0))
+        try:
+            kafka_result = producer.send("crawlerInfo", json.dumps(monitor, ensure_ascii=False).encode('utf8'))
+            self.logger.info('监控数据发送Kafka成功')
+        except Exception as e:
+            monitor = json.dumps(monitor, ensure_ascii=False)
+            monitorDic = {'lifecycle_data_crawler': monitor}
+            self.baseCore.r.xadd('data_lifecycle_log_data_crawler-redis', monitorDic, id='*')
+            self.logger.info('数据监控发送Kafka失败,已放置Redis中')

     # fetch the detail pages
     def get_detail_html(self):
         while True:
             if self.detailList.qsize() != 0:
                 detailmsg = self.detailList.get()
                 title = detailmsg['title']
                 detailUrl = detailmsg['detailUrl']
                 self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
 ...
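sendMonitor is new in this commit (though the call to it lands commented out further down): it wraps each crawled item into a lifecycle monitor message for the crawlerInfo Kafka topic and falls back to a Redis stream when Kafka is unreachable. A minimal sketch of that fallback pattern, assuming the kafka-python and redis packages and with a placeholder broker address and abbreviated payload (the stream and field names are from the diff):

import json
import redis
from kafka import KafkaProducer

monitor = {"uniqueCode": "GG240520195123430001", "operateType": "DATA_CRAWLER"}  # abbreviated

try:
    producer = KafkaProducer(bootstrap_servers=['broker:9092'])  # placeholder address
    producer.send("crawlerInfo", json.dumps(monitor, ensure_ascii=False).encode('utf8'))
except Exception:
    # Kafka unavailable: park the message on a Redis stream for later replay
    r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    r.xadd('data_lifecycle_log_data_crawler-redis',
           {'lifecycle_data_crawler': json.dumps(monitor, ensure_ascii=False)}, id='*')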
@@ -392,12 +458,12 @@ class GoogleSpider(object):
                 # self.driver.get(detailUrl)
                 # response = self.driver.page_source
                 try:
                     bdetail = self.getDetailmsg(detailmsg)
                     # 'content': content,
                     # 'contentHtml': contentWithTag,
                     content = bdetail['content']
                     contentHtml = bdetail['contentHtml']
                     if len(content) < 100:
                         continue
                     soup = BeautifulSoup(contentHtml, "html.parser")
                     # find every element that carries a class attribute
@@ -405,57 +471,62 @@ class GoogleSpider(object):
                     # loop over those elements and drop the class attribute
                     for element in elements_with_class:
                         del element.attrs["class"]
                     contentHtml = str(soup)
                     bdetail['content'] = content
                     bdetail['contentHtml'] = contentHtml
                 except Exception as e:
                     self.logger.info('详情解析失败')
                     continue
                 processitem = self.getProcessitem(bdetail)
+                # uniqueCode = self.baseCore.getUniqueCode('GG', '195', self.threadId)
+                # processitem['uniqueCode'] = uniqueCode
                 try:
-                    self.sendkafka(processitem)
-                    self.r.sadd('pygoogle_' + self.wordsCode, processitem['sourceAddress'])
-                    # insert into the database
-                    try:
-                        items = []
-                        items.append(bdetail)
-                        self.itemInsertToTable(items)
-                    except Exception as e:
-                        self.logger.info(f"插入数据库失败!{bdetail['kword']}===={detailUrl}")
-                    self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
+                    # flg = self.sendkafka(processitem)
+                    flg = True
+                    if flg:
+                        self.r.sadd('pygoogle_' + self.wordsCode, processitem['sourceAddress'])
+                        # insert into the database
+                        try:
+                            items = []
+                            items.append(bdetail)
+                            self.itemInsertToTable(items)
+                        except Exception as e:
+                            self.logger.info(f"插入数据库失败!{bdetail['kword']}===={e}")
+                        # self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
+                        # self.sendMonitor(processitem)
                 except Exception as e:
-                    self.logger.info(f"放入kafka失败!{bdetail['kword']}===={detailUrl}")
+                    self.logger.info(f"{e}{bdetail['kword']}===={detailUrl}")
                 # close the current window
                 # self.driver.close()
                 time.sleep(1)
             except Exception as e:
                 time.sleep(5)
                 self.logger.info("详情页解析异常!" + detailUrl)
         else:
             break
         # time.sleep(5)

     def rmTagattr(self, html, url):
         # parse the page with BeautifulSoup
         # soup = BeautifulSoup(html, 'html.parser')
         soup = self.paserUrl(html, url)
         # walk every tag and strip its attributes
         for tag in soup.find_all(True):
             if tag.name == 'img':
                 tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
             elif tag.name != 'img':
                 tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
             else:
                 tag.attrs = {key: value for key, value in tag.attrs.items()}
         # print the stripped page
         # print(soup.prettify())
         html = soup.prettify()
         return html

     # convert relative addresses in the html to absolute addresses
     def paserUrl(self, html, listurl):
         soup = BeautifulSoup(html, 'html.parser')
         # grab all <a> and <img> tags
         links = soup.find_all(['a', 'img'])
 ...
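paserUrl rewrites relative hrefs and srcs against the page URL before rmTagattr strips the tag attributes. The core of that transformation, using the urljoin import the file already carries (the sample markup is illustrative):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<a href="/news/1.html">n1</a><img src="img/pic.png">'
base = 'https://example.com/list/'

soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(['a', 'img']):
    attr = 'href' if tag.name == 'a' else 'src'
    if tag.get(attr):
        tag[attr] = urljoin(base, tag[attr])  # '/news/1.html' -> 'https://example.com/news/1.html'
print(soup)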
@@ -468,73 +539,76 @@ class GoogleSpider(object):
         return soup

     # get the article content
     def getDetailmsg(self, detailmsg):
         try:
             detailurl = detailmsg['detailUrl']
             title = detailmsg['title']
             content, contentWithTag = self.extractorMsg(detailurl, title)
             contentWithTag = self.rmTagattr(contentWithTag, detailurl)
         except Exception as e:
             content = ''
             contentWithTag = ''

         currentdate = self.getNowDate()
         kword = self.searchkw
         publishDate = detailmsg['publishTag']
         publishDate = publishDate + ''
         # publishtime = self.paserTime(publishtime)
         # publishDate = publishtime.strftime("%Y-%m-%d %H:%M:%S")
         detailmsg = {
             'title': detailmsg['title'],
             'source': detailmsg['sourceTag'],
             'detailurl': detailurl,
             'content': content,
             'contentHtml': contentWithTag,
             'publishtime': publishDate,
             'currentdate': currentdate,
             'kword': kword
         }
         return detailmsg

     def getProcessitem(self, bdetail):
         nowDate = self.getNowDate()
         content = bdetail['content']
         if content != '':
             processitem = {
                 "sid": self.sid,
                 "source": "4",
                 "title": bdetail['title'],
                 "content": bdetail['content'],
                 "contentWithtag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
                 "createDate": nowDate
             }
         return processitem

     def sendkafka(self, processitem):
         try:
             producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
             content = processitem['content']
             publishDate = str(processitem['publishDate'])
             title = processitem['title']
             if title == '':
                 return
             if content == '':
                 return
             if publishDate == '':
                 return
             kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
             # self.logger.info("数据发送kafka成功")
             self.logger.info(kafka_result.get(timeout=10))
+            flg = True
         except Exception as e:
+            flg = False
             pass
             # self.logger.info('发送kafka异常')
         finally:
             producer.close()
+        return flg

     def run(self):
         # fetch each page of URLs
@@ -545,38 +619,37 @@ class GoogleSpider(object):
         t = threading.Thread(target=self.get_detail_html)
         t.start()

     def detect_language(self, html):
         soup = BeautifulSoup(html, "html.parser")
         text = soup.get_text()
         # classify the text's language with langid
         lang, confidence = langid.classify(text)
         return lang


 if __name__ == '__main__':
     searchkw = 'kw'
     wordsCode = 'wordsCode'
     sid = 'sid'
     zhuce = GoogleSpider(searchkw, wordsCode, sid)
     # zhuce.run()
     url = 'https://vostok.today/46962-fesco-i-rzhd-rasshirjat-propusknuju-sposobnost-vladivostokskogo-morskogo-torgovogo-porta.html'
     zhuce.driver.get(url)
     time.sleep(20)
     html = zhuce.driver.page_source
     print(html)
     lang = zhuce.detect_language(html)
     print(lang)
     print('++++++++++++++++++')
     sm = SmartExtractor(lang)
     article = sm.extract_by_html(html)
     # article=sm.extract_by_url(url)
     content = article.cleaned_text
     text = article.text
     print(content)
     print(text)
     # raw_html = article.raw_html
     # html=zhuce.getRequest(url)
     # article_content=zhuce.extract_article(html,url)
     # print(article_content)
google_comm/googletaskJob_loc.py

@@ -27,6 +27,9 @@ class GoogleTaskJob(object):
         self.r = redis.Redis(host=self.config.get('redis', 'host'),
                              port=self.config.get('redis', 'port'),
                              password=self.config.get('redis', 'pass'), db=0)
+        self.r_6 = redis.Redis(host=self.config.get('redis', 'host'),
+                               port=self.config.get('redis', 'port'),
+                               password=self.config.get('redis', 'pass'), db=6)

     def getkafka(self):
         # Kafka cluster address
 ...
@@ -108,35 +111,36 @@ class GoogleTaskJob(object):
     def paserKeyMsg(self, keymsg):
+        num = 1
         logger.info('----------')
         wordsCode = keymsg['wordsCode']
         id = keymsg['id']
-        try:
-            searchEngines = keymsg['searchEngines']
-            if 'java.util.ArrayList' in searchEngines:
-                searchEngines = searchEngines[1]
-        except Exception as e:
-            searchEngines = []
-        kwList = []
-        if searchEngines:
-            if '4' in searchEngines:
-                keyword = keymsg['keyWord']
-                keymsglist = self.getkeywords(keyword)
-                for kw in keymsglist:
-                    kwmsg = {
-                        'kw': kw,
-                        'wordsCode': wordsCode,
-                        'sid': id
-                    }
-                    kwList.append(kwmsg)
+        keyword = keymsg['keyWord']
+        kwList = []
+        keymsglist = self.getkeywords(keyword)
+        for kw in keymsglist:
+            kwmsg = {
+                'kw': kw,
+                'wordsCode': wordsCode,
+                'sid': id
+            }
+            kwList.append((num, kwmsg))
+            num += 1
         return kwList

-    def runSpider(self, kwmsg):
-        searchkw = kwmsg['kw']
+    def runSpider(self, threadId, kwmsg, item, bangdan_name):
+        if 'lay' in kwmsg['kw']:
+            com_name = item.split('|')[2]
+        else:
+            com_name = item.split('|')[1]
+        searchkw = com_name + ' ' + kwmsg['kw']
+        print(f'======拼接的关键词是{searchkw}=={com_name}====')
         wordsCode = kwmsg['wordsCode']
         sid = kwmsg['sid']
-        googleSpider = GoogleSpider(searchkw, wordsCode, sid)
+        googleSpider = GoogleSpider(threadId, searchkw, wordsCode, sid, item, bangdan_name)
         try:
             googleSpider.get_page_html()
 ...
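paserKeyMsg now returns numbered (num, kwmsg) tuples, and __main__ unpacks the number as the thread id that runSpider passes into GoogleSpider. The numbering in isolation (keyword values are illustrative):

keymsglist = ['裁员 layoffs', '裁员 job cuts']   # illustrative keywords
wordsCode, sid = 'KW-20240516-0002', 'sid-001'

kwList = []
num = 1
for kw in keymsglist:
    kwmsg = {'kw': kw, 'wordsCode': wordsCode, 'sid': sid}
    kwList.append((num, kwmsg))   # the position doubles as a thread id
    num += 1

for num, data in kwList:          # matches how __main__ unpacks it below
    print(num, data['kw'])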
@@ -151,7 +155,28 @@ class GoogleTaskJob(object):
         finally:
             googleSpider.driver.quit()
         logger.info("关键词采集结束!" + searchkw)

+    import random
+
+    def get_comname(self):
+        # todo: read company names from Redis and append them to the keywords
+        # ZZSN22080900000001|沃尔玛|WMT|1
+        item = baseCore.redicPullData('GOOGLE_KEYWORDS:COMPANY_NAME:2023_500')
+        # item = 'ZZSN22080900000001|沃尔玛|WMT|1'
+        if item:
+            return item
+        else:
+            logger.info('====已无企业===')
+            return None
+
+# pop (and remove) one element from a Redis list
+def redicPullData(key, r):
+    try:
+        r.ping()
+    except:
+        r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
+    item = r.lpop(key)
+    return item.decode() if item else None

 if __name__ == '__main__':
     # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
     # keymsglist=getkeywords(ss)
 ...
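Company descriptors are queued once in Redis under GOOGLE_KEYWORDS:COMPANY_NAME:2023_500, and each worker pops one per loop until the list drains, so several crawler processes can share a single work queue without extra coordination. A sketch of seeding and draining such a queue (connection details as in the diff; the sample member is the one quoted in get_comname):

import redis

r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
key = 'GOOGLE_KEYWORDS:COMPANY_NAME:2023_500'

# seed: one pipe-delimited descriptor per company
r.rpush(key, 'ZZSN22080900000001|沃尔玛|WMT|1')

# drain: lpop returns None once the queue is empty, which ends the loop
while True:
    item = r.lpop(key)
    if not item:
        break
    print(item.decode())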
@@ -164,14 +189,28 @@ if __name__ == '__main__':
     print('---------------')
     while True:
         try:
-            codeids = []
-            # try:
-            #     codeid='KW-20230727-0001'
-            #     googleTaskJob.r.ping()
-            codeids.append('KW-20240318-0001')
-            # except:
-            #     googleTaskJob.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
-            for codeid in codeids:
+            # all_keys = 'GOOGLE_KEYWORDS:COMPANY_NAME*'
+            # keys = googleTaskJob.r.scan_iter(f"{key}*")
+            # for key in keys:
+            item = googleTaskJob.get_comname()
+            bangdan_name = '2023年世界500强'
+            if item:
+                pass
+            else:
+                break
+            codeList = ['KW-20240516-0002']
+            for codeid in codeList:
                 try:
                     # keymsg=baiduTaskJob.getkafka()
                     keymsg = googleTaskJob.getkeyFromredis(codeid)
                     kwList = googleTaskJob.paserKeyMsg(keymsg)
                     # kwList=reversed(kwList)
                     # pick 5 random entries from the list
                     # kwList = random.sample(kwList, 4)
@@ -182,9 +221,9 @@ if __name__ == '__main__':
                     continue
                 if kwList:
                     # create a thread pool with 4 worker threads
-                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                         # submit the tasks to the pool, one entry per task
-                        results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
+                        results = [executor.submit(googleTaskJob.runSpider, num, data, item, bangdan_name) for num, data in kwList]
                         # collect the task results
                         for future in concurrent.futures.as_completed(results):
                             try:
@@ -195,5 +234,5 @@ if __name__ == '__main__':
                                 # handle exceptions raised while a task ran
                                 logger.info(f"任务执行exception: {e}")
             except Exception as e:
-                logger.info('采集异常')
+                logger.info(f'采集异常{e}')
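The pool size rises from 1 to 4, and each submission now carries the tuple number, the keyword message, the company item, and the list name. A self-contained sketch of that fan-out with runSpider stubbed out (values are illustrative):

import concurrent.futures

def runSpider(threadId, kwmsg, item, bangdan_name):   # stub for the real method
    return f"[{threadId}] {kwmsg['kw']} / {item.split('|')[1]} / {bangdan_name}"

kwList = [(1, {'kw': '裁员'}), (2, {'kw': 'layoffs'})]
item, bangdan_name = 'ZZSN22080900000001|沃尔玛|WMT|1', '2023年世界500强'

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = [executor.submit(runSpider, num, data, item, bangdan_name) for num, data in kwList]
    for future in concurrent.futures.as_completed(results):
        print(future.result())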