提交 83f00b0f 作者: liuweigang

采集代码更新8

上级 344f8a7b
......@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
......@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.RequestBody;
import java.io.InputStream;
import java.net.URI;
......@@ -64,11 +66,16 @@ public class WebContentPaserByCss {
if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){
} catch (Exception e) {
log.info(e.getMessage());
}
}
if (StringUtils.isEmpty(body)) {//为空时调用
try {
if (StringUtils.isEmpty(body)){
......@@ -506,28 +513,32 @@ public class WebContentPaserByCss {
// 请求下载内容
String content="";
try {
if(siteMsgTemple.getYnDynamicCrawl()==1) {
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
}
// if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
// }else {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// }
}else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm);
}
}
}catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
if (StringUtils.isEmpty(content)) {
continue;
}
DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML");
......
......@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.SeleniumVerify;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageConnectioner;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader;
import com.zzsn.download.*;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
......@@ -151,11 +148,14 @@ public class WebContentPaserByIntellige {
if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){
} catch (Exception e) {
log.info(e.getMessage());
}
}
if (StringUtils.isEmpty(body)) {//为空时调用
try {
if (StringUtils.isEmpty(body)){
......@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige {
// 请求下载内容
String content="";
try {
if(siteMsgTemple.getYnDynamicCrawl()==1) {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
}
}else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm);
}
}
}catch (Exception e) {
if(StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
if (StringUtils.isEmpty(content)) {
continue;
}
DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML");
......
......@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
......@@ -62,12 +63,15 @@ public class WebContentPaserByRegular {
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else {
try {//先使用静态网络请求获取列表内容
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e) {
log.info(e.getMessage());
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}
}
if (StringUtils.isEmpty(body)){
SeleniumVerify seleniumVerify=new SeleniumVerify();
body = seleniumVerify.getScopehtml(uri_code);
......@@ -473,8 +477,14 @@ public class WebContentPaserByRegular {
String content="";
try {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),null,true,false);
// StringUtils.isEmpty(content) && siteMsgTemple.getYnDynamicCrawl()==1
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
......
......@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.download.RequestUtil;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
......@@ -89,11 +90,14 @@ public class WebContentPaserByXpath {
if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else {
try {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){
} catch (Exception e) {
log.info(e.getMessage());
}
}
if (StringUtils.isEmpty(body)) {
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) {
......@@ -534,13 +538,30 @@ public class WebContentPaserByXpath {
String content="";
try {
if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
}
}else{
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)) {
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论