Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
M
meta_crawler
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
刘伟刚
meta_crawler
Commits
83f00b0f
提交
83f00b0f
authored
9月 24, 2022
作者:
liuweigang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
采集代码更新8
上级
344f8a7b
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
84 行增加
和
33 行删除
+84
-33
WebContentPaserByCss.java
...ain/java/com/zzsn/crawler/paser/WebContentPaserByCss.java
+23
-12
WebContentPaserByIntellige.java
...va/com/zzsn/crawler/paser/WebContentPaserByIntellige.java
+19
-10
WebContentPaserByRegular.java
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
+16
-6
WebContentPaserByXpath.java
...n/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java
+26
-5
没有找到文件。
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java
浏览文件 @
83f00b0f
...
@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
...
@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageDownload
;
import
com.zzsn.download.PageDownload
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.RequestUtil
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.job.JedisUtil
;
import
com.zzsn.job.JedisUtil
;
...
@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document;
...
@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
import
org.springframework.kafka.core.KafkaTemplate
;
import
org.springframework.kafka.core.KafkaTemplate
;
import
org.springframework.web.bind.annotation.RequestBody
;
import
java.io.InputStream
;
import
java.io.InputStream
;
import
java.net.URI
;
import
java.net.URI
;
...
@@ -64,11 +66,16 @@ public class WebContentPaserByCss {
...
@@ -64,11 +66,16 @@ public class WebContentPaserByCss {
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
//添加header
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
//添加header
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
true
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
true
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
body
=
RequestUtil
.
httpGetRequest
(
uri_code
);
if
(
StringUtils
.
isEmpty
(
body
))
{
try
{
//正常请求
try
{
//正常请求
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
}
}
}
if
(
StringUtils
.
isEmpty
(
body
))
{
//为空时调用
if
(
StringUtils
.
isEmpty
(
body
))
{
//为空时调用
try
{
try
{
if
(
StringUtils
.
isEmpty
(
body
)){
if
(
StringUtils
.
isEmpty
(
body
)){
...
@@ -506,28 +513,32 @@ public class WebContentPaserByCss {
...
@@ -506,28 +513,32 @@ public class WebContentPaserByCss {
// 请求下载内容
// 请求下载内容
String
content
=
""
;
String
content
=
""
;
try
{
try
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content
=
RequestUtil
.
httpGetRequest
(
cwbm
.
getSourceaddress
());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if
(
StringUtils
.
isEmpty
(
content
))
{
try
{
//正常请求
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
}
}
if
(
StringUtils
.
isEmpty
(
content
)
)
{
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if
(
StringUtils
.
isEmpty
(
content
)){
if
(
StringUtils
.
isEmpty
(
content
)){
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
content
=
seleniumVerify
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
seleniumVerify
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
}
// if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
// }else {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// }
}
else
{
}
else
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
if
(
StringUtils
.
isEmpty
(
content
)){
content
=
paserSiteDownload
.
getContent
(
cwbm
);
content
=
paserSiteDownload
.
getContent
(
cwbm
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
}
if
(
StringUtils
.
isEmpty
(
content
))
{
continue
;
}
DocInfo
docInfo
=
new
DocInfo
();
DocInfo
docInfo
=
new
DocInfo
();
docInfo
.
setContentType
(
"HTML"
);
docInfo
.
setContentType
(
"HTML"
);
...
...
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByIntellige.java
浏览文件 @
83f00b0f
...
@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
...
@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import
com.zzsn.crawler.uriparser.HtmlPageParser
;
import
com.zzsn.crawler.uriparser.HtmlPageParser
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.crawler.uriparser.SeleniumVerify
;
import
com.zzsn.crawler.uriparser.SeleniumVerify
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.*
;
import
com.zzsn.download.PageConnectioner
;
import
com.zzsn.download.PageDownload
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.job.JedisUtil
;
import
com.zzsn.job.JedisUtil
;
...
@@ -151,11 +148,14 @@ public class WebContentPaserByIntellige {
...
@@ -151,11 +148,14 @@ public class WebContentPaserByIntellige {
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
//添加header
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
//添加header
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
true
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
true
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
body
=
RequestUtil
.
httpGetRequest
(
uri_code
);
if
(
StringUtils
.
isEmpty
(
body
))
{
try
{
//正常请求
try
{
//正常请求
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
}
}
}
if
(
StringUtils
.
isEmpty
(
body
))
{
//为空时调用
if
(
StringUtils
.
isEmpty
(
body
))
{
//为空时调用
try
{
try
{
if
(
StringUtils
.
isEmpty
(
body
)){
if
(
StringUtils
.
isEmpty
(
body
)){
...
@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige {
...
@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige {
// 请求下载内容
// 请求下载内容
String
content
=
""
;
String
content
=
""
;
try
{
try
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content
=
RequestUtil
.
httpGetRequest
(
cwbm
.
getSourceaddress
());
if
(
StringUtils
.
isEmpty
(
content
))
{
try
{
//正常请求
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
}
}
if
(
StringUtils
.
isEmpty
(
content
)
)
{
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if
(
StringUtils
.
isEmpty
(
content
)){
if
(
StringUtils
.
isEmpty
(
content
)){
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
content
=
seleniumVerify
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
seleniumVerify
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
}
}
else
{
}
else
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
if
(
StringUtils
.
isEmpty
(
content
)){
content
=
paserSiteDownload
.
getContent
(
cwbm
);
content
=
paserSiteDownload
.
getContent
(
cwbm
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
if
(
StringUtils
.
isEmpty
(
content
)){
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
}
if
(
StringUtils
.
isEmpty
(
content
))
{
continue
;
}
}
DocInfo
docInfo
=
new
DocInfo
();
DocInfo
docInfo
=
new
DocInfo
();
docInfo
.
setContentType
(
"HTML"
);
docInfo
.
setContentType
(
"HTML"
);
...
...
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
浏览文件 @
83f00b0f
...
@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
...
@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageDownload
;
import
com.zzsn.download.PageDownload
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.RequestUtil
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.job.JedisUtil
;
import
com.zzsn.job.JedisUtil
;
...
@@ -62,12 +63,15 @@ public class WebContentPaserByRegular {
...
@@ -62,12 +63,15 @@ public class WebContentPaserByRegular {
if
(
StringUtils
.
isNotEmpty
(
siteMsgTemple
.
getHeaders
())){
if
(
StringUtils
.
isNotEmpty
(
siteMsgTemple
.
getHeaders
())){
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
try
{
//先使用静态网络请求获取列表内容
body
=
RequestUtil
.
httpGetRequest
(
uri_code
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
if
(
StringUtils
.
isEmpty
(
body
))
{
}
catch
(
Exception
e
){
try
{
//正常请求
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
}
}
}
if
(
StringUtils
.
isEmpty
(
body
)){
if
(
StringUtils
.
isEmpty
(
body
)){
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
body
=
seleniumVerify
.
getScopehtml
(
uri_code
);
body
=
seleniumVerify
.
getScopehtml
(
uri_code
);
...
@@ -473,8 +477,14 @@ public class WebContentPaserByRegular {
...
@@ -473,8 +477,14 @@ public class WebContentPaserByRegular {
String
content
=
""
;
String
content
=
""
;
try
{
try
{
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
null
,
true
,
false
);
content
=
RequestUtil
.
httpGetRequest
(
cwbm
.
getSourceaddress
());
// StringUtils.isEmpty(content) && siteMsgTemple.getYnDynamicCrawl()==1
if
(
StringUtils
.
isEmpty
(
content
))
{
try
{
//正常请求
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
}
}
if
(
StringUtils
.
isEmpty
(
content
)
)
{
if
(
StringUtils
.
isEmpty
(
content
)
)
{
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
...
...
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java
浏览文件 @
83f00b0f
...
@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot;
...
@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot;
import
com.zzsn.crawler.uriparser.obs.ObsUpload
;
import
com.zzsn.crawler.uriparser.obs.ObsUpload
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.RequestUtil
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.job.JedisUtil
;
import
com.zzsn.job.JedisUtil
;
...
@@ -89,11 +90,14 @@ public class WebContentPaserByXpath {
...
@@ -89,11 +90,14 @@ public class WebContentPaserByXpath {
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
if
(
siteMsgTemple
.
getHeaders
()!=
null
){
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
try
{
body
=
RequestUtil
.
httpGetRequest
(
uri_code
);
if
(
StringUtils
.
isEmpty
(
body
))
{
try
{
//正常请求
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
true
,
false
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
}
}
}
if
(
StringUtils
.
isEmpty
(
body
))
{
if
(
StringUtils
.
isEmpty
(
body
))
{
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
if
(
StringUtils
.
isEmpty
(
body
))
{
if
(
StringUtils
.
isEmpty
(
body
))
{
...
@@ -534,13 +538,30 @@ public class WebContentPaserByXpath {
...
@@ -534,13 +538,30 @@ public class WebContentPaserByXpath {
String
content
=
""
;
String
content
=
""
;
try
{
try
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
RequestUtil
.
httpGetRequest
(
cwbm
.
getSourceaddress
());
if
(
StringUtils
.
isEmpty
(
content
))
{
try
{
//正常请求
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
""
,
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
}
}
if
(
StringUtils
.
isEmpty
(
content
)){
if
(
StringUtils
.
isEmpty
(
content
)){
SeleniumVerify
seleniumVerify
=
new
SeleniumVerify
();
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
seleniumVerify
.
getScopehtml
(
cwbm
.
getSourceaddress
());
// SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
}
}
}
else
{
}
else
{
content
=
RequestUtil
.
httpGetRequest
(
cwbm
.
getSourceaddress
());
if
(
StringUtils
.
isEmpty
(
content
))
{
try
{
//正常请求
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
}
}
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
cwbm
.
getCharset
(),
true
,
false
);
if
(
StringUtils
.
isEmpty
(
content
)){
if
(
StringUtils
.
isEmpty
(
content
)){
content
=
paserSiteDownload
.
getContent
(
cwbm
);
content
=
paserSiteDownload
.
getContent
(
cwbm
);
if
(
StringUtils
.
isEmpty
(
content
))
{
if
(
StringUtils
.
isEmpty
(
content
))
{
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论