提交 83f00b0f 作者: liuweigang

采集代码更新8

上级 344f8a7b
...@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload; ...@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload; import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
...@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document; ...@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate; import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.RequestBody;
import java.io.InputStream; import java.io.InputStream;
import java.net.URI; import java.net.URI;
...@@ -64,11 +66,16 @@ public class WebContentPaserByCss { ...@@ -64,11 +66,16 @@ public class WebContentPaserByCss {
if(siteMsgTemple.getHeaders()!=null){//添加header if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
try {//正常请求 body = RequestUtil.httpGetRequest(uri_code);
body = pageDownload.downloadWithStr(uri_code, charset, true, false); if(StringUtils.isEmpty(body)) {
}catch (Exception e){ try {//正常请求
log.info(e.getMessage()); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
} }
if (StringUtils.isEmpty(body)) {//为空时调用 if (StringUtils.isEmpty(body)) {//为空时调用
try { try {
if (StringUtils.isEmpty(body)){ if (StringUtils.isEmpty(body)){
...@@ -506,28 +513,32 @@ public class WebContentPaserByCss { ...@@ -506,28 +513,32 @@ public class WebContentPaserByCss {
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress()); if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress()); content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
// if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
// }else {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// }
}else{ }else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm);
}
} }
}catch (Exception e) { }catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
if (StringUtils.isEmpty(content)) {
continue;
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
......
...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload; ...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.SeleniumVerify; import com.zzsn.crawler.uriparser.SeleniumVerify;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.*;
import com.zzsn.download.PageConnectioner;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
...@@ -151,10 +148,13 @@ public class WebContentPaserByIntellige { ...@@ -151,10 +148,13 @@ public class WebContentPaserByIntellige {
if(siteMsgTemple.getHeaders()!=null){//添加header if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
try {//正常请求 body = RequestUtil.httpGetRequest(uri_code);
body = pageDownload.downloadWithStr(uri_code, charset, true, false); if(StringUtils.isEmpty(body)) {
}catch (Exception e){ try {//正常请求
log.info(e.getMessage()); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
} }
if (StringUtils.isEmpty(body)) {//为空时调用 if (StringUtils.isEmpty(body)) {//为空时调用
try { try {
...@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige { ...@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige {
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress()); content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{ }else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm);
}
} }
}catch (Exception e) { }catch (Exception e) {
if(StringUtils.isEmpty(content)){ content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); }
} if (StringUtils.isEmpty(content)) {
continue;
} }
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
......
...@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload; ...@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload; import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
...@@ -62,12 +63,15 @@ public class WebContentPaserByRegular { ...@@ -62,12 +63,15 @@ public class WebContentPaserByRegular {
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try {//先使用静态网络请求获取列表内容 body = RequestUtil.httpGetRequest(uri_code);
body = pageDownload.downloadWithStr(uri_code, charset, false, false); if(StringUtils.isEmpty(body)) {
}catch (Exception e){ try {//正常请求
log.info(e.getMessage()); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
body = pageDownload.downloadWithStr(uri_code, charset, false, false); } catch (Exception e) {
log.info(e.getMessage());
}
} }
if (StringUtils.isEmpty(body)){ if (StringUtils.isEmpty(body)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); SeleniumVerify seleniumVerify=new SeleniumVerify();
body = seleniumVerify.getScopehtml(uri_code); body = seleniumVerify.getScopehtml(uri_code);
...@@ -473,8 +477,14 @@ public class WebContentPaserByRegular { ...@@ -473,8 +477,14 @@ public class WebContentPaserByRegular {
String content=""; String content="";
try { try {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问 //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),null,true,false); content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
// StringUtils.isEmpty(content) && siteMsgTemple.getYnDynamicCrawl()==1 if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) { if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress()); // content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
......
...@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot; ...@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
...@@ -89,10 +90,13 @@ public class WebContentPaserByXpath { ...@@ -89,10 +90,13 @@ public class WebContentPaserByXpath {
if(siteMsgTemple.getHeaders()!=null){ if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try { body = RequestUtil.httpGetRequest(uri_code);
body = pageDownload.downloadWithStr(uri_code, charset, true, false); if(StringUtils.isEmpty(body)) {
}catch (Exception e){ try {//正常请求
log.info(e.getMessage()); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body)) {
body = pageDownload.downloadWithStr(uri_code, charset, false, false); body = pageDownload.downloadWithStr(uri_code, charset, false, false);
...@@ -534,13 +538,30 @@ public class WebContentPaserByXpath { ...@@ -534,13 +538,30 @@ public class WebContentPaserByXpath {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress()); // SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{ }else{
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论