刘伟刚 / meta_crawler · Commits

Commit cc9aa52f, authored Jul 20, 2022 by 张文库
Parent: f314a48b

Showing 22 changed files with 256 additions and 170 deletions (+256 -170)
SiteInfoVerify.java  comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java  (+9 -1)
DynaminSiteThread.java  ...ler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java  (+9 -4)
PaserSiteDownload.java  ...ler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java  (+23 -43)
SiteThread.java  comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java  (+9 -3)
PaserCommDownload.java  ...c/main/java/com/zzsn/crawler/paser/PaserCommDownload.java  (+1 -1)
WebContentPaserByCss.java  ...ain/java/com/zzsn/crawler/paser/WebContentPaserByCss.java  (+10 -12)
WebContentPaserByJsonXpath.java  ...va/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java  (+39 -14)
WebContentPaserByRegular.java  ...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java  (+0 -0)
WebContentPaserByXpath.java  ...n/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java  (+29 -7)
SeleniumTime.java  ...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java  (+8 -12)
WebPageScreenShot.java  ...in/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java  (+3 -3)
ArticleCrawlerThread.java  ...main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java  (+1 -1)
PaserCommDownload.java  ...n/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java  (+1 -1)
WebContentPaserByJsonXpath.java  ...m/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java  (+3 -3)
WebContentPaserByRegular.java  ...com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java  (+2 -2)
WebContentPaserByXpath.java  ...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java  (+3 -3)
PageConnectioner.java  ...ler/src/main/java/com/zzsn/download/PageConnectioner.java  (+62 -26)
PageDownloader.java  ...awler/src/main/java/com/zzsn/download/PageDownloader.java  (+19 -14)
ClbAnsProcessitem.java  ...wler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java  (+1 -1)
KafkaConsumerJob.java  ..._crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java  (+16 -11)
constants.properties  comm_crawler/src/main/resources/constants.properties  (+6 -6)
ThreadExecutorConfig.java  ...rch/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java  (+2 -2)
comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java

@@ -29,7 +29,11 @@ public class SiteInfoVerify{
     List<String> urlList = getPageListUrl(siteMsgTemple);
     String charset = "utf-8";
+    if (siteMsgTemple.getYnDynamicCrawl() != 1) {
+        try {
+            charset = paserSiteDownload.getCharSet(urlList.get(0));
+        } catch (IOException e) {
+            //
+        }
+    }

@@ -82,7 +86,11 @@ public class SiteInfoVerify{
         PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
         charset = paserSiteDownload.locateCharSet(urlList.get(0));
     } catch (Exception e) {
+        try {
+            charset = paserSiteDownload.getCharSet(urlList.get(0));
+        } catch (IOException ex) {
+            //
+        }
     }
     //判断解析表达式类型
     if (siteMsgTemple.getListExpressionType().equals("3")) { //css表达式

@@ -165,7 +173,7 @@ public class SiteInfoVerify{
     clbAnsProcessitem.setSid(docInfo.getSid() + "");
     clbAnsProcessitem.setTitle(docInfo.getTitle());
     clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-    clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+    clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
     clbAnsProcessitem.setSummary(docInfo.getSummary());
     clbAnsProcessitem.setAuthor(docInfo.getAuthor());
     clbAnsProcessitem.setOrigin(docInfo.getOrigin());
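Note on the pattern above: getCharSet now declares IOException, so each call site wraps the lookup and keeps a default encoding when it fails. A minimal, self-contained sketch of that fallback, where probeCharset is a hypothetical stand-in for PaserSiteDownload.getCharSet:

    import java.io.IOException;

    public class CharsetFallbackSketch {
        // Stand-in for an HTTP fetch that can fail; always failing keeps the demo offline.
        static String probeCharset(String url) throws IOException {
            throw new IOException("unreachable: " + url);
        }

        public static void main(String[] args) {
            String charset = "utf-8"; // default, as in SiteInfoVerify
            try {
                charset = probeCharset("http://example.com/list");
            } catch (IOException e) {
                // swallow and keep the default, mirroring the empty catch in the diff
            }
            System.out.println("charset = " + charset); // prints utf-8
        }
    }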
comm_crawler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java

@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
         crawler();
     }
-    @Async("asyncexecutorService")
+    // @Async("asyncexecutorService")
     public void crawler() {
         //获取栏目链接以及翻页的链接

@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
     String charset = "";
     try {
         charset = paserSiteDownload.locateCharSet(urlList.get(0));
-    } catch (Exception e) {
-    }
+    } catch (Exception e) {
+        try {
+            charset = paserSiteDownload.getCharSet(urlList.get(0));
+        } catch (IOException ex) {
+            //
+        }
+    }
     //获取列表url等信息通过匹配url过滤
     List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
     WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
     metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
     }
     // log.info("本次获取列表url: "+metaSearchList.size()+"个");
     //资讯类容抽取
     siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType() == null ? "0" : siteMsgTemple.getDetailExpressionType());
     //判断解析详情表达式类型
     if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
     clbAnsProcessitem.setSid(docInfo.getSid() + "");
     clbAnsProcessitem.setTitle(docInfo.getTitle());
     clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-    clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+    clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
     clbAnsProcessitem.setSummary(docInfo.getSummary());
     clbAnsProcessitem.setAuthor(docInfo.getAuthor());
     clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java

@@ -406,11 +406,14 @@ public class PaserSiteDownload {
     return HttpClients.createDefault();
 }
-public static String getCharSet(String url) {
-    String html = "";
+public static String getCharSet(String url) throws IOException {
+    String html = "";
+    HttpResponse httprespse = null;
+    HttpEntity entitydata = null;
     CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
     try {
         // Thread.sleep(500L);
         HttpGet httpgeturl = new HttpGet(url); // Get请求
         httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
         httpgeturl.getParams().setParameter(

@@ -422,31 +425,22 @@ public class PaserSiteDownload {
     httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
     //httpgeturl.setHeader("Accept-Language", "en");
     //httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
-    HttpResponse httprespse = null;
     try {
         Thread.sleep(500L);
         httprespse = httpClient.execute(httpgeturl);
+        entitydata = httprespse.getEntity(); // 获取返回数据
+        httpgeturl.releaseConnection();
     } catch (Exception e2) {
         // TODO Auto-generated catch block
         // e2.printStackTrace();
         log.info("请求访问失败!");
         return "utf-8";
-    }
-    // 发送请求
-    HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-    Header lastModify = httprespse.getFirstHeader("Last-Modified");
+    } finally {
+        httpClient.close();
+    }
     String charset = "utf-8";
     String infodata = "";
     try {
         Thread.sleep(500L);
         infodata = EntityUtils.toString(entitydata, charset);
     } catch (Exception e1) {
         // TODO Auto-generated catch block
         e1.printStackTrace();
     }
     httpgeturl.releaseConnection();
     Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);

@@ -465,27 +459,24 @@ public class PaserSiteDownload {
             charset = m3.group().substring(9);
         }
         if (charset.trim().length() == 0) {
             // encoding = DetectCharSet.detectCharSet(fileName);
             // if(encoding == null){
             charset = "gbk";
             // }
         }
     }
     return charset;
 }
 }
 return charset;
 }
 public static String getHtml(String url, String charset) {
     String html = "";
     CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
     HttpGet httpgeturl = new HttpGet(url); // Get请求
-    httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
-    httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
+    httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
+    httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 20000);
     // 伪装成浏览器
     httpgeturl.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");

@@ -499,16 +490,14 @@ public class PaserSiteDownload {
     httprespse = httpClient.execute(httpgeturl);
 } catch (Exception e2) {
     httpgeturl.releaseConnection();
     // TODO Auto-generated catch block
     // e2.printStackTrace();
     return "";
 }
 // 发送请求
 HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-Header lastModify = httprespse.getFirstHeader("Last-Modified");
-if (lastModify == null) {
-    lastModify = httprespse.getLastHeader("Last-Modified");
-}
+// Header lastModify = httprespse
+//         .getFirstHeader("Last-Modified");
+// if (lastModify == null) {
+//     lastModify = httprespse.getLastHeader("Last-Modified");
+// }
 if (charset == null) {
     String charstype = EntityUtils.getContentCharSet(entitydata);

@@ -524,15 +513,13 @@ public class PaserSiteDownload {
 try {
     Thread.sleep(500L);
     infodata = EntityUtils.toString(entitydata, charset);
-    httpgeturl.releaseConnection();
-    httpClient.close();
 } catch (Exception e1) {
     // TODO Auto-generated catch block
     // e1.printStackTrace();
     log.info("内容解析异常");
+} finally {
+    httpgeturl.releaseConnection();
 }
 return infodata;
 }
 // 获取所要抓取网页的编码方式

@@ -542,7 +529,7 @@ public class PaserSiteDownload {
 Connection conn = Jsoup.connect(url);
 conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"); // 伪装成浏览器
-Document doc = conn.ignoreContentType(true).timeout(10000).get();
+Document doc = conn.ignoreContentType(true).timeout(5000).get();
 Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);

@@ -561,23 +548,16 @@ public class PaserSiteDownload {
             encoding = m3.group().substring(9);
         }
         if (encoding.trim().length() == 0) {
             // encoding = DetectCharSet.detectCharSet(fileName);
             // if(encoding == null){
             encoding = "gbk";
             // }
         }
     }
     return encoding;
 }
 }
 } catch (IOException e) {
     // e.printStackTrace();
     log.error("获取编码方式出错");
     System.out.println("获取编码方式出错");
     return encoding;
 }
 return encoding;
 }

@@ -608,7 +588,7 @@ public class PaserSiteDownload {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
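Both getCharSet and locateCharSet decide the encoding by scanning <meta ...> tags with a regex and defaulting to gbk when nothing usable is found. A runnable sketch of that sniffing step, using the same <meta[^>]*> pattern visible in the diff (the HTML sample and the exact charset= regex are illustrative):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MetaCharsetSketch {
        public static void main(String[] args) {
            String html = "<html><head><meta http-equiv=\"Content-Type\""
                    + " content=\"text/html; charset=gb2312\"></head></html>";
            String charset = "";
            // First pass: pull out whole <meta ...> tags, as the diff does.
            Pattern metaTag = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);
            Matcher m1 = metaTag.matcher(html);
            while (m1.find()) {
                // Second pass: look for a charset attribute inside the tag.
                Pattern charsetAttr = Pattern.compile("charset=[^\"'>; ]+", Pattern.CASE_INSENSITIVE);
                Matcher m3 = charsetAttr.matcher(m1.group());
                if (m3.find()) {
                    charset = m3.group().substring("charset=".length());
                }
            }
            if (charset.trim().length() == 0) {
                charset = "gbk"; // same default as the diff
            }
            System.out.println(charset); // gb2312
        }
    }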
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java

@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
     urlList.addAll(hisUrlList);
 }
 //获取编码
-String charset = paserSiteDownload.getCharSet(urlList.get(0));
+String charset = null;
+try {
+    charset = paserSiteDownload.getCharSet(urlList.get(0));
+} catch (IOException e) {
+    //
+}
 //获取列表url等信息通过匹配url过滤
 List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
 WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
 metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
 }
 // log.info("本次获取列表url: "+metaSearchList.size()+"个");
 //获取文章详情
 siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType() == null ? "0" : siteMsgTemple.getDetailExpressionType());
 //判断解析详情表达式类型
 if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/PaserCommDownload.java

@@ -356,7 +356,7 @@ public class PaserCommDownload {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java

@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
     TimeUnit.SECONDS.sleep(2);
 }
-if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) { //当body为空和动态时调用
-    sentBadSiteMsg(siteMsgTemple, "动态请求异常", "0");
-} else {
-    sentBadSiteMsg(siteMsgTemple, "静态网络请求异常", "0");
-}
+// if (StringUtils.isEmpty(body)) {
+//     sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
+// }
 if (StringUtils.isNotEmpty(body)) {
     Document doc = Jsoup.parse(body);
     //抽取资讯url

@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
     // catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
     // catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
     // }
-    if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) { //提取不到信息时再次调用
-        sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
-    }
+    // if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
+    //     sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
+    // }
 }
 if (StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")) {
     String imagUrl = "";

@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
 if (StringUtils.isNotEmpty(content)) {
     docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
 } else {
-    sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+    // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
     log.info("栏目名称:" + siteMsgTemple.getSiteName() + " 链接请求:" + cwbm.getSourceaddress() + " 内容为空:" + content);
 }
 } catch (Exception e) {
-    sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+    // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
     log.info("详情内容解析出现异常:" + cwbm.getSourceaddress());
 }

@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
 docInfo.setId(count + "");
 ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
 if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-    processitem.setSource("动态爬取");
+    processitem.setSource("2");
 } else {
-    processitem.setSource("静态爬取");
+    processitem.setSource("1");
 }
 String docjson = mapper.writeValueAsString(processitem);
 // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java

@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.crawler.uriparser.HtmlPageParser;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.ContentUtility;

@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
         }
     }
 }
 if (StringUtils.isNotEmpty(body)) {
     //抽取资讯url
     List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
     catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
+} else {
+    // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
 }
 } catch (Exception e) {
     log.info("列表下载异常 对应的链接:" + uri_code);

@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
 try {
     ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
     if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-        processitem.setSource("动态爬取");
+        processitem.setSource("2");
     } else {
-        processitem.setSource("静态爬取");
+        processitem.setSource("1");
     }
     String docjson = mapper.writeValueAsString(processitem);
     // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
-    int partition = 0;
-    try {
-        partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
-    } catch (Exception e) {
-        log.info("分区配置异常:" + Constants.KAFKA_PRODUCT_PARTITION);
-    }
+    // int partition=0;
+    // try {
+    //     partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
+    // }catch (Exception e){
+    //     log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
+    // }
     kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
     docInfoList.add(docInfo);
     log.info("发送到kafka成功。");

@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());

@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
     return encoding;
 }
+/**
+ * @param siteMsgTemple
+ * @param msg 异常信息
+ * @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常
+ */
+public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType) {
+    try {
+        BadSiteMsg badSiteMsg = new BadSiteMsg();
+        badSiteMsg.setId(siteMsgTemple.getId());
+        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+        badSiteMsg.setErrorType(msg);
+        badSiteMsg.setProblemType(problemType);
+        String crawlerType = siteMsgTemple.getYnDynamicCrawl() != 1 ? "0" : siteMsgTemple.getYnDynamicCrawl() + "";
+        badSiteMsg.setCrawlerType(crawlerType);
+        ObjectMapper mapper = new ObjectMapper();
+        String docjson = mapper.writeValueAsString(badSiteMsg);
+        kafkaTemplate.send("badSiteTopic", docjson);
+        log.info("信息源问题:" + msg);
+    } catch (Exception e) {
+    }
+}
+}
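The new sentBadSiteMsg builds a BadSiteMsg, serializes it with Jackson, and publishes the JSON to the badSiteTopic topic. A cut-down sketch of the serialization half, where BadSiteStub is a stand-in for the real BadSiteMsg entity:

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class BadSiteJsonSketch {
        public static class BadSiteStub {
            private String siteName = "示例栏目";
            private String errorType = "列表解析配置异常";
            private String problemType = "1";
            public String getSiteName() { return siteName; }
            public String getErrorType() { return errorType; }
            public String getProblemType() { return problemType; }
        }

        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            // This string is what kafkaTemplate.send("badSiteTopic", docjson) would carry.
            String docjson = mapper.writeValueAsString(new BadSiteStub());
            System.out.println(docjson);
            // {"siteName":"示例栏目","errorType":"列表解析配置异常","problemType":"1"}
        }
    }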
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java

(Diff collapsed in the original view; +0 -0.)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java

@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
 import com.zzsn.crawler.uriparser.WebPageScreenShot;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.*;

@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
         body = SeleniumTime.getScopehtml(uri_code);
     }
 }
+// if(StringUtils.isEmpty(body)){
+//     sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
+// }
 //抽取资讯url
 List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
 catchWebByMetaSearchList.addAll(catchWebByMetaSearches);

@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
     return catchWebByMetaSearchList;
 }
+public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType) {
+    try {
+        BadSiteMsg badSiteMsg = new BadSiteMsg();
+        badSiteMsg.setId(siteMsgTemple.getId());
+        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+        badSiteMsg.setErrorType(msg);
+        badSiteMsg.setProblemType(problemType);
+        String crawlerType = siteMsgTemple.getYnDynamicCrawl() != 1 ? "0" : siteMsgTemple.getYnDynamicCrawl() + "";
+        badSiteMsg.setCrawlerType(crawlerType);
+        ObjectMapper mapper = new ObjectMapper();
+        String docjson = mapper.writeValueAsString(badSiteMsg);
+        kafkaTemplate.send("badSiteTopic", docjson);
+        log.info("信息源问题:" + msg);
+    } catch (Exception e) {
+    }
+}
 //提取列表信息
 public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple, String body) throws Exception {
     List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();

@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
 try {
     ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
     if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-        processitem.setSource("动态爬取");
+        processitem.setSource("2");
     } else {
-        processitem.setSource("静态爬取");
+        processitem.setSource("1");
     }
     String docjson = mapper.writeValueAsString(processitem);
     // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java

@@ -70,42 +70,38 @@ public class SeleniumTime {
 ChromeDriverService service = new ChromeDriverService.Builder().usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
 try {
     System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
     service.start();
-    if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
-        chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
-        chromeOptions.addArguments("headless"); //无界面参数
-        chromeOptions.addArguments("no-sandbox"); //禁用沙盒 就是被这个参数搞了一天
-    }
+    // chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
+    // chromeOptions.addArguments("headless");//无界面参数
+    // chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
     driver = new ChromeDriver(chromeOptions); //生成实例
     try {
-        Duration duration = Duration.of(60, ChronoUnit.SECONDS);
+        Duration duration = Duration.of(100, ChronoUnit.SECONDS);
         driver.manage().timeouts().pageLoadTimeout(duration);
         driver.get(url);
-        Thread.sleep(1000l);
+        Thread.sleep(10002);
        try {
            WebElement webElement = driver.findElement(By.xpath("/html"));
            html = webElement.getAttribute("outerHTML");
            System.out.println("browser will be close");
        } catch (Exception e) {
            log.info("chromedriver 出现异常:" + e.getMessage());
        } finally {
            driver.quit();
        }
     } catch (Exception e) {
         log.info("chromedriver 出现异常:" + e.getMessage());
     } finally {
         try {
             driver.quit();
             service.stop();
             Thread.sleep(3000l);
         } catch (InterruptedException e) {
         }
     }
 } catch (Exception e) {
-    return "";
+    log.info("chromedriver 驱动访问出现异常:" + e.getMessage());
 } finally {
     service.stop();
 }
 return html;
 }
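Condensed, the fetch path in getScopehtml is: start chromedriver, apply a page-load timeout, read the outerHTML of the root element, and quit the driver in a finally block. A sketch under the assumption of the Selenium 4 style Duration-based timeout API; the driver path and URL are placeholders:

    import java.time.Duration;
    import org.openqa.selenium.By;
    import org.openqa.selenium.WebDriver;
    import org.openqa.selenium.chrome.ChromeDriver;
    import org.openqa.selenium.chrome.ChromeOptions;

    public class HeadlessFetchSketch {
        public static void main(String[] args) {
            System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
            // headless/no-sandbox arguments are commented out in the diff, so none here either
            ChromeOptions options = new ChromeOptions();
            WebDriver driver = new ChromeDriver(options);
            String html = "";
            try {
                driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(100));
                driver.get("https://example.com/");
                html = driver.findElement(By.xpath("/html")).getAttribute("outerHTML");
            } finally {
                driver.quit(); // always release the browser, as the new finally blocks do
            }
            System.out.println(html.length());
        }
    }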
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java

@@ -41,8 +41,8 @@ public class WebPageScreenShot {
 // driver.manage().window().maximize();
 String js1 = "return document.body.clientHeight.toString()";
-String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
-int height = Integer.parseInt(js1_result);
+// String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
+// int height = Integer.parseInt(js1_result);
 List<String> files = new ArrayList<String>();
 int last_t = 0;
 // for (int i = 0; i < 20; ) {

@@ -80,7 +80,7 @@ public class WebPageScreenShot {
 CustomScreenshot customScreenshot = new CustomScreenshot();
 files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
 driver.quit(); //退出浏览器
-boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
+// boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
 // if(flag){
 //     InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
 //     HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
comm_crawler/src/main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java

@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java

@@ -361,7 +361,7 @@ public class PaserCommDownload {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java

@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
 try {
     ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
     if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-        processitem.setSource("动态爬取");
+        processitem.setSource("2");
     } else {
-        processitem.setSource("静态爬取");
+        processitem.setSource("1");
     }
     String docjson = mapper.writeValueAsString(processitem);
     kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java

@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
 try {
     ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
     if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-        processitem.setSource("动态爬取");
+        processitem.setSource("2");
     } else {
-        processitem.setSource("静态爬取");
+        processitem.setSource("1");
     }
     if (StringUtils.isEmpty(processitem.getTitle()) || StringUtils.isEmpty(processitem.getContent())
             || StringUtils.isEmpty(processitem.getPublishDate())) {
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java

@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
 try {
     ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
     if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-        processitem.setSource("动态爬取");
+        processitem.setSource("2");
     } else {
-        processitem.setSource("静态爬取");
+        processitem.setSource("1");
     }
     String docjson = mapper.writeValueAsString(processitem);
     kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
 clbAnsProcessitem.setSid(docInfo.getSid() + "");
 clbAnsProcessitem.setTitle(docInfo.getTitle());
 clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
 clbAnsProcessitem.setSummary(docInfo.getSummary());
 clbAnsProcessitem.setAuthor(docInfo.getAuthor());
 clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/download/PageConnectioner.java

@@ -86,9 +86,8 @@ public class PageConnectioner {
     //参数类型是json字符串用到
     connection.setRequestProperty("Content-Type", "application/json");
 } catch (Exception e) {
     //
 }
 return connection;
 }

@@ -157,6 +156,7 @@ public class PageConnectioner {
 URL url = null;
 Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
+HttpsURLConnection connection = null;
 try {
     trustAllHttpsCertificates();
     HostnameVerifier hv = new HostnameVerifier() {
         @Override

@@ -166,8 +166,6 @@ public class PageConnectioner {
 };
 HttpsURLConnection.setDefaultHostnameVerifier(hv);
 try {
     url = new URL(urlstr);
     if (false) {
         connection = (HttpsURLConnection) url.openConnection(proxy);

@@ -180,14 +178,12 @@ public class PageConnectioner {
     connection.setRequestProperty("connection", "Keep-Alive");
     connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
     connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
-} catch (Exception e) {
-}
+} catch (Exception e) {
+    //
+}
 return connection;
 }
 /**构造下载使用的{@link HttpsURLConnection}
  * @param urlstr 下载url
  * @return

@@ -252,9 +248,9 @@ public class PageConnectioner {
     break;
 } catch (Exception e1) {
     try {
-        Thread.sleep(10000);
+        Thread.sleep(2000);
     } catch (InterruptedException e2) {
         // logUtil.getLogger().error(String.format("ORMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e2)));
         //
     }
 }

@@ -313,10 +309,18 @@ public class PageConnectioner {
 long startDownTime = System.currentTimeMillis();
 PageGet pg = null;
 String docBody = null;
+HttpURLConnection connection = null;
 try {
-    pg = new PageGet(url, encoding, this.connection(url, headerParams));
-} catch (Exception e3) {
+    connection = this.connection(url, headerParams);
+    pg = new PageGet(url, encoding, connection);
+} catch (Exception e1) {
+    assert connection != null;
+    connection.disconnect();
+    return docBody;
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 try {

@@ -356,12 +360,18 @@ public class PageConnectioner {
 long startDownTime = System.currentTimeMillis();
 PageGet pg = null;
 String docBody = null;
+HttpURLConnection connection = null;
 try {
-    pg = new PageGet(url, encoding, this.connection(url));
+    connection = this.connection(url);
+    pg = new PageGet(url, encoding, connection);
 } catch (Exception e3) {
+    assert connection != null;
+    connection.disconnect();
+    return docBody;
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 try {
     pg.urlConnectionGet();
     docBody = pg.getPageStr();

@@ -393,12 +403,18 @@ public class PageConnectioner {
  * @return
  */
 protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
-    long exitTimeDis = 3000;
+    long exitTimeDis = 10000;
     long startDownTime = System.currentTimeMillis();
     PageGet pg = null;
+    HttpsURLConnection connection = null;
     try {
-        pg = new PageGet(url, encoding, this.httpsconnection(url));
+        connection = this.httpsconnection(url);
+        pg = new PageGet(url, encoding, connection);
     } catch (Exception e3) {
         //
+    } finally {
+        assert connection != null;
+        connection.disconnect();
     }
     String docBody = null;
     try {

@@ -542,15 +558,23 @@ public class PageConnectioner {
 long startDownTime = System.currentTimeMillis();
 PagePost pp = null;
 String docBody = null;
+HttpURLConnection connection = null;
 try {
     if (postParam != null && postParam.contains("[Content-type]")) {
         // 仅用于 鹏云课堂
         String param = postParam.replace("[Content-type]", "");
-        pp = new PagePost(url, encoding, this.connection(url, param), param);
+        connection = this.connection(url, param);
+        pp = new PagePost(url, encoding, connection, param);
     } else {
-        pp = new PagePost(url, encoding, this.connection(url), postParam);
+        connection = this.connection(url);
+        pp = new PagePost(url, encoding, connection, postParam);
     }
 } catch (Exception e3) {
+    assert connection != null;
+    connection.disconnect();
+    return docBody;
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 try {

@@ -589,15 +613,23 @@ public class PageConnectioner {
 long startDownTime = System.currentTimeMillis();
 PagePost pp = null;
 String docBody = null;
+HttpURLConnection connection = null;
 try {
     if (postParam != null && postParam.contains("{") && postParam.contains(":")) {
         // 仅用于 鹏云课堂
         String param = postParam.replace("[Content-type]", "");
-        pp = new PagePost(url, encoding, this.connection(url, param), param);
+        connection = this.connection(url, param);
+        pp = new PagePost(url, encoding, connection, param);
     } else {
-        pp = new PagePost(url, encoding, this.connection(url), postParam);
+        connection = this.connection(url);
+        pp = new PagePost(url, encoding, connection, postParam);
     }
 } catch (Exception e3) {
+    assert connection != null;
+    connection.disconnect();
+    return docBody;
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 try {

@@ -634,13 +666,18 @@ public class PageConnectioner {
 long exitTimeDis = 30000;
 long startDownTime = System.currentTimeMillis();
+HttpsURLConnection connection = null;
 PagePost pp = null;
 try {
-    pp = new PagePost(url, encoding, this.httpsconnection(url), param);
+    connection = this.httpsconnection(url);
+    pp = new PagePost(url, encoding, connection, param);
 } catch (Exception e3) {
     // TODO Auto-generated catch block
-    e3.printStackTrace();
+    //
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 String docBody = null;
 try {
     pp.urlHttpsConnectionPost();

@@ -693,7 +730,7 @@ public class PageConnectioner {
 String pageStr = "";
 try {
     HtmlPage htmlPage = webClient.getPage(urlstr);
-    webClient.waitForBackgroundJavaScript(600000);
+    webClient.waitForBackgroundJavaScript(300000);
     pageStr = htmlPage.asXml();
 } catch (Exception e) {

@@ -740,7 +777,6 @@ public class PageConnectioner {
 // JavaScriptPage scriptPage = (JavaScriptPage) page;
 //     pageStr = scriptPage.getContent();
 // }
 } catch (Exception e) {
 } finally {
     webClient.close();
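Several hunks in this file share one retry shape: break out of the loop on success, sleep on failure, then try again. A self-contained sketch of that loop with the shortened 2000 ms back-off; attemptDownload is a hypothetical stand-in for the real connection call:

    public class RetrySleepSketch {
        static int calls = 0;

        // Simulated flaky download: fails twice, then succeeds.
        static String attemptDownload() throws Exception {
            if (++calls < 3) throw new Exception("transient failure " + calls);
            return "<html>ok</html>";
        }

        public static void main(String[] args) {
            String body = null;
            for (int i = 0; i < 5; i++) {
                try {
                    body = attemptDownload();
                    break; // success, stop retrying
                } catch (Exception e1) {
                    try {
                        Thread.sleep(2000); // the commit shortens this back-off from 10 s to 2 s
                    } catch (InterruptedException e2) {
                        Thread.currentThread().interrupt();
                    }
                }
            }
            System.out.println(body); // "<html>ok</html>" on the third attempt
        }
    }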
comm_crawler/src/main/java/com/zzsn/download/PageDownloader.java

@@ -49,6 +49,8 @@ public class PageDownloader {
 // 如果页面编码格式未知,则从页面中获取该页面编码格式
 public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException {
+    String encoding = null;
+    try {
     connection.setRequestMethod("GET");
     connection.setRequestProperty("User-Agent", "Mozilla/5.0 " +
             "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) " +
             "Gecko/20080404 Firefox/2.0.0.14");

@@ -56,7 +58,6 @@ public class PageDownloader {
     connection.setRequestProperty("Cookie", "auth=token");
     String contentType = connection.getHeaderField("Content-Type");
-    String encoding = null;
     if (contentType != null) {
         String temp = "charset=";
         int m = contentType.indexOf(temp);

@@ -65,17 +66,23 @@ public class PageDownloader {
         }
     }
     if (encoding == null) {
+        InputStream is = null;
         try {
-            InputStream is = connection.getInputStream();
+            is = connection.getInputStream();
             BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
             encoding = EncodeDetector.getEncoding(bufferedInputStream);
-            is.close();
         } catch (Exception e) {
             //
+        } finally {
+            assert is != null;
+            is.close();
         }
     }
+    } catch (Exception e) {
+        //
+    } finally {
+        connection.disconnect();
+    }
     return encoding;
 }

@@ -159,25 +166,19 @@ public class PageDownloader {
 if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
     new PageDownloader(dis + 2000);
 }
 long startDtime = System.currentTimeMillis();
 PageConnectioner pConn = new PageConnectioner();
 HttpURLConnection connection = null;
 try {
     connection = pConn.connection(url);
     if (encoding == null || encoding.isEmpty()) { //获取网站编码
         // encoding = getEncodingFromHtmlFile(url, connection);
         PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
         encoding = paserSiteDownload.locateCharSet(url);
     }
 } catch (Exception e1) {
     // e1.printStackTrace();
     log.info("获取编码失败");
 }
 String docBody = null;
 if (bDynamic) {
     docBody = pConn.dynamicConnectByGet(url, encoding);
 } else {
     // this.bDownloadUseFrame=true;
     if (bFrame && this.bDownloadUseFrame) {
         String body = null;
         try {

@@ -196,12 +197,11 @@ public class PageDownloader {
 }
 if (url.contains("https:")) {
     try {
-        connection = pConn.httpsconnection(url);
         if (encoding == null || encoding.isEmpty()) {
             encoding = "utf-8";
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
         //
     }
     docBody = pConn.staticHttpsConnectByGet(url, encoding, false);
 } else {

@@ -237,6 +237,9 @@ public class PageDownloader {
 } catch (Exception e1) {
     // e1.printStackTrace();
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 String docBody = null;
 if (bDynamic) {

@@ -264,7 +267,7 @@ public class PageDownloader {
 }
 if (url.contains("https:")) {
     try {
-        connection = pConn.httpsconnection(url);
+        // connection = pConn.httpsconnection(url);
         if (encoding == null || encoding.isEmpty()) {
             encoding = "utf-8";
         }

@@ -368,6 +371,9 @@ public class PageDownloader {
 } catch (Exception e1) {
     // e1.printStackTrace();
+} finally {
+    assert connection != null;
+    connection.disconnect();
 }
 String docBody = null;
 if (bDynamic) {

@@ -493,7 +499,6 @@ public class PageDownloader {
     return true;
 }
 } catch (Exception e) {
     // TODO Auto-generated catch block
     return true;
 }
 return false;
comm_crawler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java

@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
 /**正文*/
 private String content;
-private String contentWithtag;
+private String contentWithTag;
 /**未知*/
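The one-character rename matters downstream: Jackson derives JSON keys from bean property names, so messages produced from ClbAnsProcessitem now carry contentWithTag instead of contentWithtag. A sketch of that effect, where ItemStub stands in for the real entity:

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class RenameEffectSketch {
        public static class ItemStub {
            private String contentWithTag = "<p>正文</p>";
            public String getContentWithTag() { return contentWithTag; }
            public void setContentWithTag(String v) { contentWithTag = v; }
        }

        public static void main(String[] args) throws Exception {
            // The getter name determines the JSON key consumers see on the Kafka topic.
            String json = new ObjectMapper().writeValueAsString(new ItemStub());
            System.out.println(json); // {"contentWithTag":"<p>正文</p>"} (previously contentWithtag)
        }
    }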
comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java

@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
 // latest earliest
 //时间间隔设置为1h
 // properties.put("max.poll.interval.ms", 60*60*1000);
-properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60 * 60 * 1000);
+properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000);
 properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 25000);
 properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
 properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);

@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
 // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
-@Scheduled(cron = "0 0/5 * * * ?")
-@Async("asyncTaskExecutor")
+@Scheduled(cron = "0 0/2 * * * ?")
+// @Async("asyncTaskExecutor")
 public void consumer() {
-    ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
-    log.info("进入定时获取mq消息");
+    // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
+    log.info("进入定时获取topic消息");
     //1.创建消费者
     KafkaConsumer<String, String> consumer = createConsumer();
     // 消费某个主题的某个分区数据

@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
 ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
 //手动提交已消费数据的offset
 // consumer.commitAsync();
 consumer.commitSync();
 if (records != null && records.count() > 0) {
     for (ConsumerRecord record : records) {

@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
 }
 }
 }
 }
 } catch (Exception e) {
     // consumer.commitSync();
     log.info(e.getMessage());
-    // consumer = createConsumer();
-    // consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
+    //退出应用程序前使用close方法关闭消费者,网络连接和socket也会随之关闭,并立即触发一次再均衡
+    consumer.close();
+    System.out.println("error!!!!!!!!!!!");
+    consumer = createConsumer();
+    // 消费某个主题的某个分区数据
+    kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
+    String[] partitions1 = kafkaConsumerPartition.split(",");
+    for (int i = 0; i < partitions1.length; i++) {
+        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
+    }
+    consumer.assign(topicPartitions);
 }
 }
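The enlarged catch block rebuilds the consumer rather than letting the scheduled job die: close the broken instance, create a fresh one, re-assign the configured partitions, and resume. A sketch of that recovery flow with the post-commit settings (the broker address is a placeholder):

    import java.time.Duration;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Properties;
    import org.apache.kafka.clients.consumer.ConsumerConfig;
    import org.apache.kafka.clients.consumer.ConsumerRecords;
    import org.apache.kafka.clients.consumer.KafkaConsumer;
    import org.apache.kafka.common.TopicPartition;

    public class ConsumerRecoverySketch {
        static KafkaConsumer<String, String> createConsumer() {
            Properties p = new Properties();
            p.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
            p.put(ConsumerConfig.GROUP_ID_CONFIG, "test-zs1");
            p.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                    "org.apache.kafka.common.serialization.StringDeserializer");
            p.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                    "org.apache.kafka.common.serialization.StringDeserializer");
            p.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000);
            return new KafkaConsumer<>(p);
        }

        public static void main(String[] args) {
            KafkaConsumer<String, String> consumer = createConsumer();
            // Manual assignment of the configured partitions ("0,1,2,3" after this commit).
            List<TopicPartition> topicPartitions = new ArrayList<>();
            for (String part : "0,1,2,3".split(",")) {
                topicPartitions.add(new TopicPartition("clb-infosource-handler-dynamin",
                        Integer.parseInt(part)));
            }
            consumer.assign(topicPartitions); // no group rebalancing with assign()
            try {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
                consumer.commitSync(); // commit offsets for whatever was fetched
                System.out.println("fetched " + records.count() + " records");
            } catch (Exception e) {
                consumer.close();            // tear down the broken consumer and its connections
                consumer = createConsumer(); // then rebuild and re-assign, as the diff does
                consumer.assign(topicPartitions);
            }
        }
    }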
comm_crawler/src/main/resources/constants.properties

@@ -35,8 +35,8 @@ PROXYID=1
 #线程池大小
 THREAD_SIZE=1
 #
-CHROMEDRIVE=E:\\chrome\\chromedriver.exe
-CHROMEBIN=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
+CHROMEDRIVE=D:\\chrome\\chromedriver.exe
+CHROMEBIN=C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
 USER_DATA_DIR=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
 #mysql connection

@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
 #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
 KAFKA_CONSUMER_TOPIC=clb-infosource-handler-dynamin
 #
-KAFKA_CONSUMER_GROUP_ID=dynamin-sync
+KAFKA_CONSUMER_GROUP_ID=test-zs1
 #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
 KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
 KAFKA_PRODUCT_TOPIC=crawlerInfo

@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
 META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
 #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
 #指定分区使用逗号分割
-KAFKA_CONSUMER_PARTITION=0
+KAFKA_CONSUMER_PARTITION=0,1,2,3
 #KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 KAFKA_PRODUCT_PARTITION=0
 # Redis settings
-redis.host=127.0.0.1
+redis.host=114.116.26.150
 redis.port=6379
-redis.pass=xxxxxx
+redis.pass=zzsn9988
 #redis.host=8.130.30.33
 #redis.port=9010
 #redis.pass=wxadS&jklim
sina_search/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java

@@ -17,8 +17,8 @@ public class ThreadExecutorConfig {
 @Bean(value = "asyncTaskExecutor")
 public Executor executor() {
     ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
-    executor.setCorePoolSize(1); //线程池维护线程的最少数量
-    executor.setMaxPoolSize(1); //线程池维护线程的最大数量
+    executor.setCorePoolSize(2); //线程池维护线程的最少数量
+    executor.setMaxPoolSize(5); //线程池维护线程的最大数量
     executor.setQueueCapacity(5000); //缓存队列
     executor.setThreadNamePrefix("ssmsExecutor-");
     /**
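For reference, the executor this bean now builds: two core threads, growing to five only once the 5000-slot queue fills. A standalone sketch; initialize() is needed when constructing it outside a Spring container:

    import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;

    public class ExecutorConfigSketch {
        public static ThreadPoolTaskExecutor executor() {
            ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
            executor.setCorePoolSize(2);      // minimum threads kept alive
            executor.setMaxPoolSize(5);       // upper bound, used only after the queue is full
            executor.setQueueCapacity(5000);  // tasks buffered before the pool grows past core size
            executor.setThreadNamePrefix("ssmsExecutor-");
            executor.initialize();
            return executor;
        }

        public static void main(String[] args) {
            ThreadPoolTaskExecutor ex = executor();
            ex.execute(() -> System.out.println(Thread.currentThread().getName())); // ssmsExecutor-1
            ex.shutdown();
        }
    }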