刘伟刚 / meta_crawler / Commits / cc9aa52f

Commit cc9aa52f — "更新" — authored Jul 20, 2022 by 张文库 (parent: f314a48b)

Showing 22 changed files with 424 additions and 340 deletions (+424 −340)
comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java (+11 −3)
...ler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java (+10 −5)
...ler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java (+65 −85)
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java (+9 −3)
...c/main/java/com/zzsn/crawler/paser/PaserCommDownload.java (+1 −1)
...ain/java/com/zzsn/crawler/paser/WebContentPaserByCss.java (+10 −12)
...va/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java (+39 −14)
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java (+0 −0)
...n/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java (+29 −7)
...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java (+46 −50)
...in/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java (+3 −3)
...main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java (+1 −1)
...n/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java (+1 −1)
...m/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java (+3 −3)
...com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java (+2 −2)
...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java (+3 −3)
...ler/src/main/java/com/zzsn/download/PageConnectioner.java (+103 −67)
...awler/src/main/java/com/zzsn/download/PageDownloader.java (+52 −47)
...wler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java (+2 −3)
..._crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java (+16 −11)
comm_crawler/src/main/resources/constants.properties (+6 −6)
...rch/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java (+12 −13)
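Taken together, the hunks below apply a few recurring patterns across the crawler: PaserSiteDownload.getCharSet now declares throws IOException and every call site wraps it in try/catch, keeping a default charset on failure; a setContentWithtag typo becomes setContentWithTag throughout the entity mapping; and processitem.setSource switches from the labels 动态爬取/静态爬取 to the codes "2"/"1". A minimal sketch of the charset-fallback pattern as the call sites now apply it — the class and stub below are illustrative stand-ins, not code from this commit:

import java.io.IOException;

public class CharsetFallbackSketch {
    // Hypothetical stand-in for PaserSiteDownload.getCharSet(String),
    // which now declares "throws IOException" in this commit.
    static String getCharSet(String url) throws IOException {
        throw new IOException("request failed"); // placeholder behavior
    }

    // Mirrors the updated call sites: try the lookup, keep "utf-8" on failure.
    static String resolveCharset(String url) {
        String charset = "utf-8";
        try {
            charset = getCharSet(url);
        } catch (IOException e) {
            // swallowed, as in the updated call sites
        }
        return charset;
    }

    public static void main(String[] args) {
        System.out.println(resolveCharset("http://example.com")); // prints: utf-8
    }
}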
comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java

@@ -29,7 +29,11 @@ public class SiteInfoVerify{
         List<String> urlList = getPageListUrl(siteMsgTemple);
         String charset = "utf-8";
         if (siteMsgTemple.getYnDynamicCrawl()!=1){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException e) {
+                //
+            }
         }

@@ -82,7 +86,11 @@ public class SiteInfoVerify{
             PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
             charset = paserSiteDownload.locateCharSet(urlList.get(0));
         } catch (Exception e){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException ex) {
+                //
+            }
         }
         //判断解析表达式类型
         if (siteMsgTemple.getListExpressionType().equals("3")) { //css表达式

@@ -165,7 +173,7 @@ public class SiteInfoVerify{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java

@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
         crawler();
     }
-    @Async("asyncexecutorService")
+    // @Async("asyncexecutorService")
     public void crawler(){
         //获取栏目链接以及翻页的链接

@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
         String charset = "";
         try {
             charset = paserSiteDownload.locateCharSet(urlList.get(0));
         } catch (Exception e){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException ex) {
+                //
+            }
         }
         //获取列表url等信息通过匹配url过滤
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
             WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
             metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
         }
-        // log.info("本次获取列表url: "+metaSearchList.size()+"个");
+        //资讯类容抽取
         siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
         //判断解析详情表达式类型
         if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java

@@ -406,47 +406,41 @@ public class PaserSiteDownload {
         return HttpClients.createDefault();
     }
-    public static String getCharSet(String url) {
+    public static String getCharSet(String url) throws IOException {
         String html = "";
-        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
-        HttpGet httpgeturl = new HttpGet(url); // Get请求
+        HttpResponse httprespse = null;
+        HttpGet httpgeturl = new HttpGet(url); // Get请求
+        HttpEntity entitydata = null;
+        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
         httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
         httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
         // 伪装成浏览器
         httpgeturl.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
         httpgeturl.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
         httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
         //httpgeturl.setHeader("Accept-Language", "en");
         //httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
-        HttpResponse httprespse = null;
         try {
-            Thread.sleep(500L);
+            // Thread.sleep(500L);
             httprespse = httpClient.execute(httpgeturl);
+            entitydata = httprespse.getEntity(); // 获取返回数据
+            httpgeturl.releaseConnection();
         } catch (Exception e2) {
             // TODO Auto-generated catch block
             // e2.printStackTrace();
             log.info("请求访问失败!");
             return "utf-8";
-        } // 发送请求
-        HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-        Header lastModify = httprespse.getFirstHeader("Last-Modified");
+        } finally {
+            httpClient.close();
+        }
         String charset = "utf-8";
         String infodata = "";
         try {
-            Thread.sleep(500L);
             infodata = EntityUtils.toString(entitydata, charset);
         } catch (Exception e1) {
-            // TODO Auto-generated catch block
             e1.printStackTrace();
         }
-        httpgeturl.releaseConnection();
         Pattern p1 = Pattern.compile("<meta[^>]*>",
                 Pattern.CASE_INSENSITIVE);

@@ -465,27 +459,24 @@ public class PaserSiteDownload {
                     charset = m3.group().substring(9);
                 }
                 if (charset.trim().length() == 0) {
-                    // encoding = DetectCharSet.detectCharSet(fileName);
-                    // if(encoding == null){
                     charset = "gbk";
-                    // }
                 }
                 return charset;
             }
         }
         return charset;
     }
     public static String getHtml(String url, String charset) {
         String html = "";
         CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
         HttpGet httpgeturl = new HttpGet(url); // Get请求
-        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
-        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
+        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
+        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 20000);
         // 伪装成浏览器
         httpgeturl.setHeader("Content-Type",
                 "application/x-www-form-urlencoded;charset=utf-8");

@@ -499,16 +490,14 @@ public class PaserSiteDownload {
             httprespse = httpClient.execute(httpgeturl);
         } catch (Exception e2) {
             httpgeturl.releaseConnection();
-            // TODO Auto-generated catch block
-            // e2.printStackTrace();
             return "";
         } // 发送请求
         HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-        Header lastModify = httprespse
-                .getFirstHeader("Last-Modified");
-        if (lastModify == null) {
-            lastModify = httprespse.getLastHeader("Last-Modified");
-        }
+        // Header lastModify = httprespse
+        //         .getFirstHeader("Last-Modified");
+        // if (lastModify == null) {
+        //     lastModify = httprespse.getLastHeader("Last-Modified");
+        // }
         if (charset == null) {
             String charstype = EntityUtils
                     .getContentCharSet(entitydata);

@@ -524,61 +513,52 @@ public class PaserSiteDownload {
         try {
             Thread.sleep(500L);
             infodata = EntityUtils.toString(entitydata, charset);
+            httpgeturl.releaseConnection();
+            httpClient.close();
         } catch (Exception e1) {
-            // TODO Auto-generated catch block
-            // e1.printStackTrace();
             log.info("内容解析异常");
         } finally {
             httpgeturl.releaseConnection();
         }
         return infodata;
     }
     // 获取所要抓取网页的编码方式
     public static String locateCharSet(String url) {
         String encoding = "utf-8";
         try {
             Connection conn = Jsoup.connect(url);
             conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
             // 伪装成浏览器
-            Document doc = conn.ignoreContentType(true).timeout(10000).get();
+            Document doc = conn.ignoreContentType(true).timeout(5000).get();
             Pattern p1 = Pattern.compile("<meta[^>]*>",
                     Pattern.CASE_INSENSITIVE);
             Matcher m1 = p1.matcher(doc.toString());
             while (m1.find()) {
                 String str = m1.group();
                 Pattern p2 = Pattern.compile("charset[^\\s||\"||;||'||>]*");
                 Matcher m2 = p2.matcher(str);
                 if (m2.find()) {
                     encoding = m2.group().substring(8);
                     if (encoding.trim().length() == 0) {
                         Pattern p3 = Pattern
                                 .compile("charset=\"[^\\s||\"||;||>]*");
                         Matcher m3 = p3.matcher(str);
                         if (m3.find()) {
                             encoding = m3.group().substring(9);
                         }
                         if (encoding.trim().length() == 0) {
-                            // encoding = DetectCharSet.detectCharSet(fileName);
-                            // if(encoding == null){
                             encoding = "gbk";
-                            // }
                         }
+                        return encoding;
                     }
                 }
             }
         } catch (IOException e) {
-            // e.printStackTrace();
             log.error("获取编码方式出错");
-            System.out.println("获取编码方式出错");
             return encoding;
         }
         return encoding;
     }
     public static Properties getConfig() {

@@ -608,7 +588,7 @@ public class PaserSiteDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java

@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
             urlList.addAll(hisUrlList);
         }
         //获取编码
-        String charset = paserSiteDownload.getCharSet(urlList.get(0));
+        String charset = null;
+        try {
+            charset = paserSiteDownload.getCharSet(urlList.get(0));
+        } catch (IOException e) {
+            //
+        }
         //获取列表url等信息通过匹配url过滤
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
             WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
             metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
         }
-        // log.info("本次获取列表url: "+metaSearchList.size()+"个");
+        //获取文章详情
         siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
         //判断解析详情表达式类型
         if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/PaserCommDownload.java

@@ -356,7 +356,7 @@ public class PaserCommDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java

@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
                 TimeUnit.SECONDS.sleep(2);
             }
         }
-        if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) { //当body为空和动态时调用
-            sentBadSiteMsg(siteMsgTemple, "动态请求异常", "0");
-        } else {
-            sentBadSiteMsg(siteMsgTemple, "静态网络请求异常", "0");
-        }
+        // if (StringUtils.isEmpty(body)) {
+        //     sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
+        // }
         if (StringUtils.isNotEmpty(body)) {
             Document doc = Jsoup.parse(body);
             //抽取资讯url

@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
             // catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
             // catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
             // }
-            if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) { //提取不到信息时再次调用
-                sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
-            }
+            // if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
+            //     sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
+            // }
         }
         if (StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
             String imagUrl = "";

@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
             if (StringUtils.isNotEmpty(content)) {
                 docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
             } else {
-                sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+                // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
                 log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
             }
         } catch (Exception e){
-            sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+            // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
             log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
         }

@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
         docInfo.setId(count+"");
         ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
         if (siteMsgTemple.getYnDynamicCrawl()==1) {
-            processitem.setSource("动态爬取");
+            processitem.setSource("2");
         } else {
-            processitem.setSource("静态爬取");
+            processitem.setSource("1");
         }
         String docjson = mapper.writeValueAsString(processitem);
         // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java

@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.crawler.uriparser.HtmlPageParser;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.ContentUtility;

@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
                 }
             }
         }
         if (StringUtils.isNotEmpty(body)) {
             //抽取资讯url
             List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
             catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
+        } else {
+            // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
         }
     } catch (Exception e) {
         log.info("列表下载异常 对应的链接:" + uri_code);

@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
-            int partition = 0;
-            try {
-                partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
-            } catch (Exception e){
-                log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
-            }
+            // int partition=0;
+            // try {
+            //     partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
+            // }catch (Exception e){
+            //     log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
+            // }
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
             docInfoList.add(docInfo);
             log.info("发送到kafka成功。");

@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());

@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
         return encoding;
     }
+    /**
+     *
+     * @param siteMsgTemple
+     * @param msg 异常信息
+     * @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常
+     */
+    public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(siteMsgTemple.getId());
+            badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+            badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+            badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+            badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+            badSiteMsg.setErrorType(msg);
+            badSiteMsg.setProblemType(problemType);
+            String crawlerType = siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
+            badSiteMsg.setCrawlerType(crawlerType);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("badSiteTopic", docjson);
+            log.info("信息源问题:"+msg);
+        } catch (Exception e){
+        }
+    }
 }
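The sentBadSiteMsg helper added above serializes a BadSiteMsg to JSON with Jackson and publishes it to the badSiteTopic Kafka topic, swallowing any reporting failure. A minimal sketch of that serialize-and-send step, assuming a Spring Kafka KafkaTemplate is injected as in the surrounding classes; the reporter class and its generic payload parameter are illustrative stand-ins:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.kafka.core.KafkaTemplate;

public class BadSiteReporterSketch {
    private final KafkaTemplate<String, String> kafkaTemplate;
    private final ObjectMapper mapper = new ObjectMapper();

    public BadSiteReporterSketch(KafkaTemplate<String, String> kafkaTemplate) {
        this.kafkaTemplate = kafkaTemplate;
    }

    // Serialize any report object to JSON and publish it, mirroring sentBadSiteMsg.
    public void report(Object badSiteMsg) {
        try {
            String json = mapper.writeValueAsString(badSiteMsg);
            kafkaTemplate.send("badSiteTopic", json);
        } catch (Exception e) {
            // reporting failures are swallowed, as in the commit
        }
    }
}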
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java

(diff collapsed in the original view; +0 −0)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java

@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
 import com.zzsn.crawler.uriparser.WebPageScreenShot;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.*;

@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
                 body = SeleniumTime.getScopehtml(uri_code);
             }
         }
+        // if(StringUtils.isEmpty(body)){
+        //     sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
+        // }
         //抽取资讯url
         List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
         catchWebByMetaSearchList.addAll(catchWebByMetaSearches);

@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
         return catchWebByMetaSearchList;
     }
+    public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(siteMsgTemple.getId());
+            badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+            badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+            badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+            badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+            badSiteMsg.setErrorType(msg);
+            badSiteMsg.setProblemType(problemType);
+            String crawlerType = siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
+            badSiteMsg.setCrawlerType(crawlerType);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("badSiteTopic", docjson);
+            log.info("信息源问题:"+msg);
+        } catch (Exception e){
+        }
+    }
     //提取列表信息
     public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple, String body) throws Exception {
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();

@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java

@@ -70,42 +70,38 @@ public class SeleniumTime {
         ChromeDriverService service = new ChromeDriverService.Builder().
                 usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
         try {
+            System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
             service.start();
             if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
                 chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
                 chromeOptions.addArguments("headless"); //无界面参数
                 chromeOptions.addArguments("no-sandbox"); //禁用沙盒 就是被这个参数搞了一天
             }
-            // chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
-            // chromeOptions.addArguments("headless");//无界面参数
-            // chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
             driver = new ChromeDriver(chromeOptions); //生成实例
             try {
-                Duration duration = Duration.of(60, ChronoUnit.SECONDS);
+                Duration duration = Duration.of(100, ChronoUnit.SECONDS);
                 driver.manage().timeouts().pageLoadTimeout(duration);
                 driver.get(url);
-                Thread.sleep(1000l);
+                Thread.sleep(10002);
                 try {
                     WebElement webElement = driver.findElement(By.xpath("/html"));
                     html = webElement.getAttribute("outerHTML");
                     System.out.println("browser will be close");
                 } catch (Exception e) {
                     log.info("chromedriver 出现异常:"+e.getMessage());
+                } finally {
+                    driver.quit();
                 }
             } catch (Exception e) {
                 log.info("chromedriver 出现异常:"+e.getMessage());
             } finally {
-                try {
                     driver.quit();
                     service.stop();
-                    Thread.sleep(3000l);
-                } catch (InterruptedException e) {
-                }
             }
         } catch (Exception e) {
+            log.info("chromedriver 驱动访问出现异常:"+e.getMessage());
+            return "";
+        } finally {
+            service.stop();
         }
         return html;
     }

@@ -281,18 +277,18 @@ public class SeleniumTime {
    (whitespace-only changes in main(): the robot.keyPress lines, the comment block about stripping div content by string splitting, the getScopehtml("http://www.flw.ph/thread-869016-1-1.html") test call, and the String a / String b fixtures are unchanged)

@@ -303,7 +299,7 @@ public class SeleniumTime {
    (whitespace-only changes around the System.out.println trace lines and scopehtml.split(a))

@@ -312,31 +308,31 @@ public class SeleniumTime {
    (whitespace-only changes; the commented-out regex-based tag-removal experiment and the File("D:/123.txt") PrintStream output block are unchanged)

@@ -345,30 +341,30 @@ public class SeleniumTime {
    (whitespace-only changes around the trailing e.printStackTrace() and closing braces)
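The SeleniumTime change above is mostly about cleanup: driver.quit() and service.stop() move into finally blocks so the browser and driver service are released even when navigation throws. A minimal sketch of that try/finally pattern with Selenium's ChromeDriver — the flags match the ones the diff sets on non-Windows hosts, while the method and URL are illustrative:

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class DriverCleanupSketch {
    public static String fetchHtml(String url) {
        ChromeOptions options = new ChromeOptions();
        // same flags the diff passes on non-Windows hosts
        options.addArguments("--disable-gpu", "--window-size=1290,1080");
        options.addArguments("headless");
        options.addArguments("no-sandbox");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get(url);
            return driver.getPageSource();
        } finally {
            driver.quit(); // always release the browser, like the new finally blocks
        }
    }
}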
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java

@@ -41,8 +41,8 @@ public class WebPageScreenShot {
         // driver.manage().window().maximize();
         String js1 = "return document.body.clientHeight.toString()";
-        String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
-        int height = Integer.parseInt(js1_result);
+        // String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
+        // int height = Integer.parseInt(js1_result);
         List<String> files = new ArrayList<String>();
         int last_t = 0;
         // for (int i = 0; i < 20; ) {

@@ -80,7 +80,7 @@ public class WebPageScreenShot {
         CustomScreenshot customScreenshot = new CustomScreenshot();
         files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
         driver.quit(); //退出浏览器
-        boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
+        // boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
         // if(flag){
         // InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
         // HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
comm_crawler/src/main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java

@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java

@@ -361,7 +361,7 @@ public class PaserCommDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java

@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java

@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
         try {
             ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             if (StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
                     || StringUtils.isEmpty(processitem.getPublishDate())){
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java

@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/download/PageConnectioner.java

(diff collapsed in the original view; +103 −67)
comm_crawler/src/main/java/com/zzsn/download/PageDownloader.java
浏览文件 @
cc9aa52f
...
@@ -34,7 +34,7 @@ public class PageDownloader {
...
@@ -34,7 +34,7 @@ public class PageDownloader {
this
.
bDownloadUseFrame
=
b
;
this
.
bDownloadUseFrame
=
b
;
}
}
public
PageDownloader
(){
public
PageDownloader
(){
}
}
Timer
timer
;
Timer
timer
;
public
PageDownloader
(
long
sec
)
{
public
PageDownloader
(
long
sec
)
{
...
@@ -49,39 +49,46 @@ public class PageDownloader {
...
@@ -49,39 +49,46 @@ public class PageDownloader {
// 如果页面编码格式未知,则从页面中获取该页面编码格式
// 如果页面编码格式未知,则从页面中获取该页面编码格式
public
String
getEncodingFromHtmlFile
(
String
urlstr
,
HttpURLConnection
connection
)
throws
IOException
{
public
String
getEncodingFromHtmlFile
(
String
urlstr
,
HttpURLConnection
connection
)
throws
IOException
{
connection
.
setRequestMethod
(
"GET"
);
connection
.
setRequestProperty
(
"User-Agent"
,
"Mozilla/5.0 "
+
"(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+
"Gecko/20080404 Firefox/2.0.0.14"
);
connection
.
setRequestProperty
(
"referer"
,
urlstr
);
connection
.
setRequestProperty
(
"Cookie"
,
"auth=token"
);
String
contentType
=
connection
.
getHeaderField
(
"Content-Type"
);
String
encoding
=
null
;
String
encoding
=
null
;
if
(
contentType
!=
null
)
{
try
{
String
temp
=
"charset="
;
connection
.
setRequestMethod
(
"GET"
);
int
m
=
contentType
.
indexOf
(
temp
);
connection
.
setRequestProperty
(
"User-Agent"
,
"Mozilla/5.0 "
+
"(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
if
(
m
!=
-
1
)
{
+
"Gecko/20080404 Firefox/2.0.0.14"
);
encoding
=
contentType
.
substring
(
m
+
temp
.
length
()).
replace
(
"]"
,
""
);
connection
.
setRequestProperty
(
"referer"
,
urlstr
);
connection
.
setRequestProperty
(
"Cookie"
,
"auth=token"
);
String
contentType
=
connection
.
getHeaderField
(
"Content-Type"
);
if
(
contentType
!=
null
)
{
String
temp
=
"charset="
;
int
m
=
contentType
.
indexOf
(
temp
);
if
(
m
!=
-
1
)
{
encoding
=
contentType
.
substring
(
m
+
temp
.
length
()).
replace
(
"]"
,
""
);
}
}
}
}
if
(
encoding
==
null
)
{
if
(
encoding
==
null
)
{
try
{
InputStream
is
=
null
;
InputStream
is
=
null
;
is
=
connection
.
getInputStream
();
try
{
BufferedInputStream
bufferedInputStream
=
new
BufferedInputStream
(
is
);
is
=
connection
.
getInputStream
();
encoding
=
EncodeDetector
.
getEncoding
(
bufferedInputStream
);
BufferedInputStream
bufferedInputStream
=
new
BufferedInputStream
(
is
);
is
.
close
();
encoding
=
EncodeDetector
.
getEncoding
(
bufferedInputStream
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
//
}
finally
{
assert
is
!=
null
;
is
.
close
();
}
}
}
}
catch
(
Exception
e
)
{
//
}
finally
{
connection
.
disconnect
();
}
}
connection
.
disconnect
();
return
encoding
;
return
encoding
;
}
}
// Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接
// Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接
public
Document
downloadWithDoc
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
)
{
public
Document
downloadWithDoc
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
)
{
Document
doc
=
null
;
Document
doc
=
null
;
String
docBody
=
""
;
String
docBody
=
""
;
if
(
false
)
{
if
(
false
)
{
...
@@ -117,7 +124,7 @@ public class PageDownloader {
...
@@ -117,7 +124,7 @@ public class PageDownloader {
}
}
return
doc
;
return
doc
;
}
}
// Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接
// Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接
public
Document
downloadWithJsonHtml
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
,
public
Document
downloadWithJsonHtml
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
,
String
bodyPath
)
{
String
bodyPath
)
{
...
@@ -154,30 +161,24 @@ public class PageDownloader {
 }
 /** String interface, mainly for html pages; fetched via GET, dynamic or static links; when bFrame is false it is usually parsing JSON-format books */
 public String downloadWithStr(String url, String encoding, boolean bDynamic, boolean bFrame) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
     if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
         new PageDownloader(dis + 2000);
     }
-    long startDtime = System.currentTimeMillis();
     PageConnectioner pConn = new PageConnectioner();
-    HttpURLConnection connection = null;
     try {
-        connection = pConn.connection(url);
         if (encoding == null || encoding.isEmpty()) { // detect the site encoding
-            // encoding = getEncodingFromHtmlFile(url, connection);
             PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
             encoding = paserSiteDownload.locateCharSet(url);
         }
     } catch (Exception e1) {
-        // e1.printStackTrace();
         log.info("获取编码失败");
     }
     String docBody = null;
     if (bDynamic) {
         docBody = pConn.dynamicConnectByGet(url, encoding);
     } else {
-        // this.bDownloadUseFrame=true;
         if (bFrame && this.bDownloadUseFrame) {
             String body = null;
             try {
...
@@ -196,12 +197,11 @@ public class PageDownloader {
     }
     if (url.contains("https:")) {
         try {
-            connection = pConn.httpsconnection(url);
             if (encoding == null || encoding.isEmpty()) {
                 encoding = "utf-8";
             }
         } catch (Exception e1) {
-            // e1.printStackTrace();
+            //
         }
         docBody = pConn.staticHttpsConnectByGet(url, encoding, false);
     } else {
...
@@ -211,7 +211,7 @@ public class PageDownloader {
     this.lastDownloadTime = System.currentTimeMillis();
     return docBody;
 }
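The interval check at the top of this method throttles successive downloads; curiously, it does so by constructing a new PageDownloader with a delay argument. A plainer sketch of the same politeness-delay idea, assuming fields like PageDownloader's interval and lastDownloadTime (the Throttle class itself is illustrative, not part of the repository):

    // Illustrative throttle: enforce a minimum gap between downloads.
    class Throttle {
        private final long interval;    // minimum gap in ms; 0 disables throttling
        private long lastDownloadTime;  // 0 until the first download completes
        Throttle(long interval) { this.interval = interval; }
        synchronized void awaitTurn() {
            long dis = System.currentTimeMillis() - lastDownloadTime;
            if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
                try {
                    Thread.sleep(interval - dis);  // wait out the remainder of the gap
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            lastDownloadTime = System.currentTimeMillis();
        }
    }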
 public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic, boolean bFrame, String headerParams) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
...
@@ -221,7 +221,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -237,6 +237,9 @@ public class PageDownloader {
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
+    } finally {
+        assert connection != null;
+        connection.disconnect();
     }
     String docBody = null;
     if (bDynamic) {
...
@@ -264,7 +267,7 @@ public class PageDownloader {
     }
     if (url.contains("https:")) {
         try {
-            connection = pConn.httpsconnection(url);
+            // connection = pConn.httpsconnection(url);
             if (encoding == null || encoding.isEmpty()) {
                 encoding = "utf-8";
             }
...
@@ -306,7 +309,7 @@ public class PageDownloader {
         if (encoding == null || encoding.isEmpty()) {
             encoding = getEncodingFromHtmlFile(url, connection);
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
     }
     String docBody = null;
...
@@ -334,7 +337,7 @@ public class PageDownloader {
         }
     }
     docBody = pConn.staticConnectByGet(url, encoding);
     if (isBadDownloadPage(docBody) && this.badPage) {
         return docBody;
     }
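The headerParams argument is what distinguishes this variant from downloadWithStr, but its wire format is not visible in these hunks. Assuming a simple "Key:Value;Key:Value" string, a hypothetical helper could apply it like this (both the format and the helper are assumptions, not the repository's actual parsing):

    // Hypothetical: apply "Key:Value;Key:Value" pairs as request headers.
    static void applyHeaders(java.net.HttpURLConnection connection, String headerParams) {
        if (headerParams == null || headerParams.isEmpty()) {
            return;
        }
        for (String pair : headerParams.split(";")) {
            int i = pair.indexOf(':');
            if (i > 0) {
                connection.setRequestProperty(pair.substring(0, i).trim(), pair.substring(i + 1).trim());
            }
        }
    }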
...
@@ -344,7 +347,7 @@ public class PageDownloader {
 }
 /** String interface, currently used for crawling books from the Douban API */
 public String downloadPoxyWithStrAPI(String url, String encoding, boolean bDynamic, boolean bFrame) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
     if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
         /*try {
...
@@ -366,8 +369,11 @@ public class PageDownloader {
         if (encoding == null || encoding.isEmpty()) {
             encoding = getEncodingFromHtmlFile(url, connection);
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
+    } finally {
+        assert connection != null;
+        connection.disconnect();
     }
     String docBody = null;
     if (bDynamic) {
...
@@ -399,7 +405,7 @@ public class PageDownloader {
     this.lastDownloadTime = System.currentTimeMillis();
     return docBody;
 }
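The "Poxy" in the method name suggests the request is routed through a proxy. A hedged sketch of pointing an HttpURLConnection at an HTTP proxy (the proxy host, port, and URL are placeholders; the repository's actual proxy wiring is not visible in this hunk):

    import java.net.HttpURLConnection;
    import java.net.InetSocketAddress;
    import java.net.Proxy;
    import java.net.URL;

    // Placeholder proxy endpoint; real values would come from configuration.
    Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8080));
    HttpURLConnection conn =
            (HttpURLConnection) new URL("https://example.com/api").openConnection(proxy);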
 // String interface, mainly for html or json pages; fetched via POST, static links by default
 public String downloadWithStr(String url, String encoding, String param) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
...
@@ -409,7 +415,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -444,7 +450,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -493,7 +499,6 @@ public class PageDownloader {
             return true;
         }
     } catch (Exception e) {
-        // TODO Auto-generated catch block
         return true;
     }
     return false;
...
@@ -501,7 +506,7 @@ public class PageDownloader {
 /**
  * Send a GET request to the given URL
  *
  * @param url the URL to send the request to
  *            (only used for the Tadu (塔读) APP)
...
@@ -550,5 +555,5 @@ public class PageDownloader {
             }
         }
         return result;
     }
 }
comm_crawler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java
View file @ cc9aa52f
...
@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
     /** body text */
     private String content;
-    private String contentWithtag;
+    private String contentWithTag;
     /** unknown */
...
@@ -94,4 +94,4 @@ public class ClbAnsProcessitem {
     /** (temporary workaround) ids of the associated subjects */
     private List<String> subjectIds;
 }
\ No newline at end of file
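The only change in this entity is the field rename contentWithtag -> contentWithTag. If accessors are generated from the field name (e.g. via Lombok, which this diff does not show), the rename is visible to callers and to JSON mapping; a hedged usage sketch:

    ClbAnsProcessitem item = new ClbAnsProcessitem();
    item.setContentWithTag("<p>正文</p>");   // before this commit: setContentWithtag(...)
    String html = item.getContentWithTag(); // serializers would now emit a "contentWithTag" key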
comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
View file @ cc9aa52f
...
@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
     // latest earliest
     // poll interval set to 1h
     // properties.put("max.poll.interval.ms", 60*60*1000);
-    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60 * 60 * 1000);
+    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000);
     properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 25000);
     properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
     properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
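Together these settings make the consumer deliberately slow and safe: at most one record per poll, and up to two hours of processing time before the broker treats the consumer as stalled. A minimal sketch of a consumer built with the same values (the bootstrap address, group id, and deserializers are placeholder assumptions, since those lines sit outside this hunk):

    import java.util.Properties;
    import org.apache.kafka.clients.consumer.ConsumerConfig;
    import org.apache.kafka.clients.consumer.KafkaConsumer;
    import org.apache.kafka.common.serialization.StringDeserializer;

    Properties properties = new Properties();
    properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");      // placeholder
    properties.put(ConsumerConfig.GROUP_ID_CONFIG, "example-group");                 // placeholder
    properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000); // 2 h, as set above
    properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 25000);
    properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
    properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);                      // one record per poll
    KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);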
...
@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
     // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
-    @Scheduled(cron = "0 0/5 * * * ?")
+    @Scheduled(cron = "0 0/2 * * * ?")
-    @Async("asyncTaskExecutor")
+    // @Async("asyncTaskExecutor")
     public void consumer() {
-        ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
+        // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
-        log.info("进入定时获取mq消息");
+        log.info("进入定时获取topic消息");
         // 1. create the consumer
         KafkaConsumer<String, String> consumer = createConsumer();
         // consume data from specific partitions of the topic
...
@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
     // wait for the broker to return data; the timeout specifies how long poll may block before returning, whether or not data is available
     ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
     // manually commit the offsets of the consumed records
-    // consumer.commitAsync();
     consumer.commitSync();
     if (records != null && records.count() > 0) {
         for (ConsumerRecord record : records) {
...
@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
                 }
             }
         }
     } catch (Exception e) {
-        // consumer.commitSync();
-        log.info(e.getMessage());
-        // consumer = createConsumer();
-        // consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
+        // Close the consumer before continuing; its network connection and sockets close with it, and a rebalance is triggered immediately
+        consumer.close();
         System.out.println("error!!!!!!!!!!!");
+        consumer = createConsumer();
+        // consume data from specific partitions of the topic
+        kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
+        String[] partitions1 = kafkaConsumerPartition.split(",");
+        for (int i = 0; i < partitions1.length; i++) {
+            topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
+        }
+        consumer.assign(topicPartitions);
     }
 }
...
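The rebuilt catch block recovers by recreating the consumer and assigning its partitions by hand instead of subscribing, which bypasses consumer-group rebalancing. A standalone sketch of that assign pattern, reusing the consumer from the sketch above (the topic name and partition list mirror the constants.properties values below):

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.kafka.common.TopicPartition;

    List<TopicPartition> topicPartitions = new ArrayList<>();
    for (String p : "0,1,2,3".split(",")) {  // e.g. the KAFKA_CONSUMER_PARTITION property
        topicPartitions.add(new TopicPartition("clb-infosource-handler-dynamin", Integer.parseInt(p)));
    }
    consumer.assign(topicPartitions);        // manual assignment: no group rebalance for these partitions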
comm_crawler/src/main/resources/constants.properties
View file @ cc9aa52f
...
@@ -35,8 +35,8 @@ PROXYID=1
 #thread pool size
 THREAD_SIZE=1
 #
-CHROMEDRIVE=E:\\chrome\\chromedriver.exe
+CHROMEDRIVE=D:\\chrome\\chromedriver.exe
-CHROMEBIN=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
+CHROMEBIN=C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
 USER_DATA_DIR=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
 #mysql connection
...
@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
 #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
 KAFKA_CONSUMER_TOPIC=clb-infosource-handler-dynamin
 #
-KAFKA_CONSUMER_GROUP_ID=dynamin-sync
+KAFKA_CONSUMER_GROUP_ID=test-zs1
 #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
 KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
 KAFKA_PRODUCT_TOPIC=crawlerInfo
...
@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
 META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
 #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
 #partitions to use, comma separated
-KAFKA_CONSUMER_PARTITION=0
+KAFKA_CONSUMER_PARTITION=0,1,2,3
 #KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 KAFKA_PRODUCT_PARTITION=0
 # Redis settings
-redis.host=127.0.0.1
+redis.host=114.116.26.150
 redis.port=6379
-redis.pass=xxxxxx
+redis.pass=zzsn9988
 #redis.host=8.130.30.33
 #redis.port=9010
 #redis.pass=wxadS&jklim
sina_search/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java
View file @ cc9aa52f
...
@@ -17,19 +17,19 @@ public class ThreadExecutorConfig {
     @Bean(value = "asyncTaskExecutor")
     public Executor executor() {
         ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
-        executor.setCorePoolSize(1); // minimum number of threads the pool keeps alive
+        executor.setCorePoolSize(2); // minimum number of threads the pool keeps alive
-        executor.setMaxPoolSize(1); // maximum number of threads the pool may grow to
+        executor.setMaxPoolSize(5); // maximum number of threads the pool may grow to
         executor.setQueueCapacity(5000); // task queue capacity
         executor.setThreadNamePrefix("ssmsExecutor-");
         /**
          * Rejection policy: what to do with new tasks once the pool has reached max size.
          * CALLER_RUNS: run the task on the calling thread instead of a pool thread.
          */
         executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
         executor.setKeepAliveSeconds(60); // allowed idle time
         executor.initialize();
         return executor;
     }
     @Bean(value = "asyncTaskExecutorSelenium")
...
@@ -139,4 +139,4 @@ public class ThreadExecutorConfig {
         executor.initialize();
         return executor;
     }
 }
\ No newline at end of file
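With core size 2 and max size 5, @Async methods bound to this bean can now actually overlap; the previous single-thread pool serialized them. A short usage sketch (the service class and method names are illustrative):

    import org.springframework.scheduling.annotation.Async;
    import org.springframework.stereotype.Service;

    @Service
    public class CrawlTaskService {
        @Async("asyncTaskExecutor")  // runs on the ssmsExecutor- pool configured above
        public void crawlAsync(String url) {
            // long-running crawl work; up to 5 of these can run concurrently now
        }
    }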