采集代码更新8

83f00b0f · liuweigang · 344f8a7b · 83f00b0f · 83f00b0f · 83f00b0f
--- a/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java
@@ -12,6 +12,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownload;
 import com.zzsn.download.PageDownloader;
+import com.zzsn.download.RequestUtil;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
@@ -24,6 +25,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.springframework.kafka.core.KafkaTemplate;
+import org.springframework.web.bind.annotation.RequestBody;
 import java.io.InputStream;
 import java.net.URI;
@@ -64,11 +66,16 @@ public class WebContentPaserByCss {
                        if(siteMsgTemple.getHeaders()!=null){//添加header
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
                        }else {
+                            body = RequestUtil.httpGetRequest(uri_code);
+                            if(StringUtils.isEmpty(body)) {
                                try {//正常请求
                                    body = pageDownload.downloadWithStr(uri_code, charset, true, false);
-                            }catch (Exception e){
+                                } catch (Exception e) {
                                    log.info(e.getMessage());
                                }
+                            }
                            if (StringUtils.isEmpty(body)) {//为空时调用
                                    try {
                                        if (StringUtils.isEmpty(body)){
@@ -506,28 +513,32 @@ public class WebContentPaserByCss {
 //                    请求下载内容
                    String content="";
                    try {
-                        if(siteMsgTemple.getYnDynamicCrawl()==1) {
+                        //首先使用静态访问，当访问异常使用动态访问，当解析内容为空再次调用动态访问
+                        content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
-//                            content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
+                        if(StringUtils.isEmpty(content)) {
+                            try {//正常请求
+                                content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
+                            } catch (Exception e) {
+                                log.info(e.getMessage());
+                            }
+                        }
+                        if(StringUtils.isEmpty(content) ) {
                            content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
+//                            content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
                            if(StringUtils.isEmpty(content)){
                                SeleniumVerify seleniumVerify=new SeleniumVerify();
                                content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
                            }
-//                            if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
-//                                content = SeleniumTime.getVerifyScopehtml(cwbm.getSourceaddress());
-//                            }else {
-//                                content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
-//                            }
                        }else{
-                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
-                            if(StringUtils.isEmpty(content)){
                            content = paserSiteDownload.getContent(cwbm);
                        }
-                        }
                    }catch (Exception e) {
                        content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
                    }
+                    if (StringUtils.isEmpty(content)) {
+                        continue;
+                    }
                    DocInfo docInfo = new DocInfo();
                    docInfo.setContentType("HTML");

--- a/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByIntellige.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByIntellige.java
@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.crawler.uriparser.HtmlPageParser;
 import com.zzsn.crawler.uriparser.SeleniumTime;
 import com.zzsn.crawler.uriparser.SeleniumVerify;
-import com.zzsn.download.PageBuilderParser;
+import com.zzsn.download.*;
-import com.zzsn.download.PageConnectioner;
-import com.zzsn.download.PageDownload;
-import com.zzsn.download.PageDownloader;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
@@ -151,11 +148,14 @@ public class WebContentPaserByIntellige {
                        if(siteMsgTemple.getHeaders()!=null){//添加header
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
                        }else {
+                            body = RequestUtil.httpGetRequest(uri_code);
+                            if(StringUtils.isEmpty(body)) {
                                try {//正常请求
                                    body = pageDownload.downloadWithStr(uri_code, charset, true, false);
-                            }catch (Exception e){
+                                } catch (Exception e) {
                                    log.info(e.getMessage());
                                }
+                            }
                            if (StringUtils.isEmpty(body)) {//为空时调用
                                    try {
                                        if (StringUtils.isEmpty(body)){
@@ -527,22 +527,31 @@ public class WebContentPaserByIntellige {
 //                    请求下载内容
                    String content="";
                    try {
-                        if(siteMsgTemple.getYnDynamicCrawl()==1) {
+                        //首先使用静态访问，当访问异常使用动态访问，当解析内容为空再次调用动态访问
+                        content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
+                        if(StringUtils.isEmpty(content)) {
+                            try {//正常请求
+                                content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
+                            } catch (Exception e) {
+                                log.info(e.getMessage());
+                            }
+                        }
+                        if(StringUtils.isEmpty(content) ) {
                            content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
+//                            content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
                            if(StringUtils.isEmpty(content)){
                                SeleniumVerify seleniumVerify=new SeleniumVerify();
                                content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
                            }
                        }else{
-                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
-                            if(StringUtils.isEmpty(content)){
                            content = paserSiteDownload.getContent(cwbm);
                        }
-                        }
                    }catch (Exception e) {
-                        if(StringUtils.isEmpty(content)){
                        content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
                    }
+                    if (StringUtils.isEmpty(content)) {
+                        continue;
                    }
                    DocInfo docInfo = new DocInfo();
                    docInfo.setContentType("HTML");

--- a/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
@@ -10,6 +10,7 @@ import com.zzsn.crawler.uriparser.obs.ObsUpload;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownload;
 import com.zzsn.download.PageDownloader;
+import com.zzsn.download.RequestUtil;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
@@ -62,12 +63,15 @@ public class WebContentPaserByRegular {
                        if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
                        }else {
-                            try {//先使用静态网络请求获取列表内容
+                            body = RequestUtil.httpGetRequest(uri_code);
-                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
+                            if(StringUtils.isEmpty(body)) {
-                            }catch (Exception e){
+                                try {//正常请求
+                                    body = pageDownload.downloadWithStr(uri_code, charset, true, false);
+                                } catch (Exception e) {
                                    log.info(e.getMessage());
-                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
                                }
+                            }
                            if (StringUtils.isEmpty(body)){
                                SeleniumVerify seleniumVerify=new SeleniumVerify();
                                body = seleniumVerify.getScopehtml(uri_code);
@@ -473,8 +477,14 @@ public class WebContentPaserByRegular {
                    String content="";
                    try {
                        //首先使用静态访问，当访问异常使用动态访问，当解析内容为空再次调用动态访问
-                        content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),null,true,false);
+                        content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
-//                        StringUtils.isEmpty(content) && siteMsgTemple.getYnDynamicCrawl()==1
+                        if(StringUtils.isEmpty(content)) {
+                            try {//正常请求
+                                content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
+                            } catch (Exception e) {
+                                log.info(e.getMessage());
+                            }
+                        }
                        if(StringUtils.isEmpty(content) ) {
                            content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
 //                            content = JSUtil.getParseredHtml(cwbm.getSourceaddress());

--- a/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java
@@ -11,6 +11,7 @@ import com.zzsn.crawler.uriparser.WebPageScreenShot;
 import com.zzsn.crawler.uriparser.obs.ObsUpload;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
+import com.zzsn.download.RequestUtil;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
@@ -89,11 +90,14 @@ public class WebContentPaserByXpath {
                        if(siteMsgTemple.getHeaders()!=null){
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
                        }else {
-                            try {
+                            body = RequestUtil.httpGetRequest(uri_code);
+                            if(StringUtils.isEmpty(body)) {
+                                try {//正常请求
                                    body = pageDownload.downloadWithStr(uri_code, charset, true, false);
-                            }catch (Exception e){
+                                } catch (Exception e) {
                                    log.info(e.getMessage());
                                }
+                            }
                            if (StringUtils.isEmpty(body)) {
                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
                                if (StringUtils.isEmpty(body)) {
@@ -534,13 +538,30 @@ public class WebContentPaserByXpath {
                    String content="";
                    try {
                        if(siteMsgTemple.getYnDynamicCrawl()==1) {
-                            content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
+                            content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
+                            if(StringUtils.isEmpty(content)) {
+                                try {//正常请求
+                                    content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false);
+                                } catch (Exception e) {
+                                    log.info(e.getMessage());
+                                }
+                            }
                            if(StringUtils.isEmpty(content)){
-                                SeleniumVerify seleniumVerify=new SeleniumVerify();
+                                content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
-                                content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
+//                                SeleniumVerify seleniumVerify=new SeleniumVerify();
+//                                content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
                            }
                        }else{
+                            content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
+                            if(StringUtils.isEmpty(content)) {
+                                try {//正常请求
+                                    content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
+                                } catch (Exception e) {
+                                    log.info(e.getMessage());
+                                }
+                            }
                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
                            if(StringUtils.isEmpty(content)){
                                content = paserSiteDownload.getContent(cwbm);
                                if(StringUtils.isEmpty(content)) {