刘伟刚 / meta_crawler / Commits / cc9aa52f

Commit cc9aa52f — "更新" — authored Jul 20, 2022 by 张文库 (parent: f314a48b)

Showing 22 changed files with 424 additions and 340 deletions (+424 −340)
comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java (+11 −3)
...ler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java (+10 −5)
...ler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java (+65 −85)
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java (+9 −3)
...c/main/java/com/zzsn/crawler/paser/PaserCommDownload.java (+1 −1)
...ain/java/com/zzsn/crawler/paser/WebContentPaserByCss.java (+10 −12)
...va/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java (+39 −14)
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java (+0 −0)
...n/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java (+29 −7)
...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java (+46 −50)
...in/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java (+3 −3)
...main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java (+1 −1)
...n/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java (+1 −1)
...m/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java (+3 −3)
...com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java (+2 −2)
...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java (+3 −3)
...ler/src/main/java/com/zzsn/download/PageConnectioner.java (+103 −67)
...awler/src/main/java/com/zzsn/download/PageDownloader.java (+52 −47)
...wler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java (+2 −3)
..._crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java (+16 −11)
comm_crawler/src/main/resources/constants.properties (+6 −6)
...rch/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java (+12 −13)
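Taken together, the hunks below apply a few recurring patterns across the crawler: PaserSiteDownload.getCharSet now declares throws IOException and every call site wraps it in try/catch, keeping a default charset on failure; a setContentWithtag typo becomes setContentWithTag throughout the entity mapping; and processitem.setSource switches from the labels 动态爬取/静态爬取 to the codes "2"/"1". A minimal sketch of the charset-fallback pattern as the call sites now apply it — the class and stub below are illustrative stand-ins, not code from this commit:

import java.io.IOException;

public class CharsetFallbackSketch {
    // Hypothetical stand-in for PaserSiteDownload.getCharSet(String),
    // which now declares "throws IOException" in this commit.
    static String getCharSet(String url) throws IOException {
        throw new IOException("request failed"); // placeholder behavior
    }

    // Mirrors the updated call sites: try the lookup, keep "utf-8" on failure.
    static String resolveCharset(String url) {
        String charset = "utf-8";
        try {
            charset = getCharSet(url);
        } catch (IOException e) {
            // swallowed, as in the updated call sites
        }
        return charset;
    }

    public static void main(String[] args) {
        System.out.println(resolveCharset("http://example.com")); // prints: utf-8
    }
}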
comm_crawler/src/main/java/com/zzsn/api/SiteInfoVerify.java

@@ -29,7 +29,11 @@ public class SiteInfoVerify{
         List<String> urlList = getPageListUrl(siteMsgTemple);
         String charset = "utf-8";
         if (siteMsgTemple.getYnDynamicCrawl()!=1){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException e) {
+                //
+            }
         }

@@ -82,7 +86,11 @@ public class SiteInfoVerify{
             PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
             charset = paserSiteDownload.locateCharSet(urlList.get(0));
         } catch (Exception e){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException ex) {
+                //
+            }
         }
         //判断解析表达式类型
         if (siteMsgTemple.getListExpressionType().equals("3")) { //css表达式

@@ -165,7 +173,7 @@ public class SiteInfoVerify{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java

@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
         crawler();
     }
-    @Async("asyncexecutorService")
+    // @Async("asyncexecutorService")
     public void crawler(){
         //获取栏目链接以及翻页的链接

@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
         String charset = "";
         try {
             charset = paserSiteDownload.locateCharSet(urlList.get(0));
         } catch (Exception e){
-            charset = paserSiteDownload.getCharSet(urlList.get(0));
+            try {
+                charset = paserSiteDownload.getCharSet(urlList.get(0));
+            } catch (IOException ex) {
+                //
+            }
         }
         //获取列表url等信息通过匹配url过滤
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
             WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
             metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
         }
-        // log.info("本次获取列表url: "+metaSearchList.size()+"个");
+        //资讯类容抽取
         siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
         //判断解析详情表达式类型
         if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java

@@ -406,47 +406,41 @@ public class PaserSiteDownload {
         return HttpClients.createDefault();
     }
-    public static String getCharSet(String url) {
+    public static String getCharSet(String url) throws IOException {
         String html = "";
-        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
-        HttpGet httpgeturl = new HttpGet(url); // Get请求
+        HttpResponse httprespse = null;
+        HttpGet httpgeturl = new HttpGet(url); // Get请求
+        HttpEntity entitydata = null;
+        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
         httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
         httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
         // 伪装成浏览器
         httpgeturl.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
         httpgeturl.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
         httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
         //httpgeturl.setHeader("Accept-Language", "en");
         //httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
-        HttpResponse httprespse = null;
         try {
-            Thread.sleep(500L);
+            // Thread.sleep(500L);
             httprespse = httpClient.execute(httpgeturl);
+            entitydata = httprespse.getEntity(); // 获取返回数据
+            httpgeturl.releaseConnection();
         } catch (Exception e2) {
             // TODO Auto-generated catch block
             // e2.printStackTrace();
             log.info("请求访问失败!");
             return "utf-8";
-        } // 发送请求
-        HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-        Header lastModify = httprespse.getFirstHeader("Last-Modified");
+        } finally {
+            httpClient.close();
+        }
         String charset = "utf-8";
         String infodata = "";
         try {
-            Thread.sleep(500L);
             infodata = EntityUtils.toString(entitydata, charset);
         } catch (Exception e1) {
-            // TODO Auto-generated catch block
             e1.printStackTrace();
         }
-        httpgeturl.releaseConnection();
         Pattern p1 = Pattern.compile("<meta[^>]*>",
                 Pattern.CASE_INSENSITIVE);

@@ -465,27 +459,24 @@ public class PaserSiteDownload {
                     charset = m3.group().substring(9);
                 }
                 if (charset.trim().length() == 0) {
-                    // encoding = DetectCharSet.detectCharSet(fileName);
-                    // if(encoding == null){
                     charset = "gbk";
-                    // }
                 }
                 return charset;
             }
         }
         return charset;
     }
     public static String getHtml(String url, String charset) {
         String html = "";
         CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
         HttpGet httpgeturl = new HttpGet(url); // Get请求
-        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
-        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
+        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
+        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 20000);
         // 伪装成浏览器
         httpgeturl.setHeader("Content-Type",
                 "application/x-www-form-urlencoded;charset=utf-8");

@@ -499,16 +490,14 @@ public class PaserSiteDownload {
             httprespse = httpClient.execute(httpgeturl);
         } catch (Exception e2) {
             httpgeturl.releaseConnection();
-            // TODO Auto-generated catch block
-            // e2.printStackTrace();
             return "";
         } // 发送请求
         HttpEntity entitydata = httprespse.getEntity(); // 获取返回数据
-        Header lastModify = httprespse
-                .getFirstHeader("Last-Modified");
-        if (lastModify == null) {
-            lastModify = httprespse.getLastHeader("Last-Modified");
-        }
+        // Header lastModify = httprespse
+        //         .getFirstHeader("Last-Modified");
+        // if (lastModify == null) {
+        //     lastModify = httprespse.getLastHeader("Last-Modified");
+        // }
         if (charset == null) {
             String charstype = EntityUtils
                     .getContentCharSet(entitydata);

@@ -524,61 +513,52 @@ public class PaserSiteDownload {
         try {
             Thread.sleep(500L);
             infodata = EntityUtils.toString(entitydata, charset);
+            httpgeturl.releaseConnection();
+            httpClient.close();
         } catch (Exception e1) {
-            // TODO Auto-generated catch block
-            // e1.printStackTrace();
             log.info("内容解析异常");
         } finally {
             httpgeturl.releaseConnection();
         }
         return infodata;
     }
     // 获取所要抓取网页的编码方式
     public static String locateCharSet(String url) {
         String encoding = "utf-8";
         try {
             Connection conn = Jsoup.connect(url);
             conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
             // 伪装成浏览器
-            Document doc = conn.ignoreContentType(true).timeout(10000).get();
+            Document doc = conn.ignoreContentType(true).timeout(5000).get();
             Pattern p1 = Pattern.compile("<meta[^>]*>",
                     Pattern.CASE_INSENSITIVE);
             Matcher m1 = p1.matcher(doc.toString());
             while (m1.find()) {
                 String str = m1.group();
                 Pattern p2 = Pattern.compile("charset[^\\s||\"||;||'||>]*");
                 Matcher m2 = p2.matcher(str);
                 if (m2.find()) {
                     encoding = m2.group().substring(8);
                     if (encoding.trim().length() == 0) {
                         Pattern p3 = Pattern
                                 .compile("charset=\"[^\\s||\"||;||>]*");
                         Matcher m3 = p3.matcher(str);
                         if (m3.find()) {
                             encoding = m3.group().substring(9);
                         }
                         if (encoding.trim().length() == 0) {
-                            // encoding = DetectCharSet.detectCharSet(fileName);
-                            // if(encoding == null){
                             encoding = "gbk";
-                            // }
                         }
+                        return encoding;
                     }
                 }
             }
         } catch (IOException e) {
-            // e.printStackTrace();
             log.error("获取编码方式出错");
-            System.out.println("获取编码方式出错");
             return encoding;
         }
         return encoding;
     }
     public static Properties getConfig() {

@@ -608,7 +588,7 @@ public class PaserSiteDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java

@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Component;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;

@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
             urlList.addAll(hisUrlList);
         }
         //获取编码
-        String charset = paserSiteDownload.getCharSet(urlList.get(0));
+        String charset = null;
+        try {
+            charset = paserSiteDownload.getCharSet(urlList.get(0));
+        } catch (IOException e) {
+            //
+        }
         //获取列表url等信息通过匹配url过滤
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();

@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
             WebContentPaserByRegular webContentPaserByRegular = new WebContentPaserByRegular();
             metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
         }
-        // log.info("本次获取列表url: "+metaSearchList.size()+"个");
+        //获取文章详情
         siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
         //判断解析详情表达式类型
         if (siteMsgTemple.getDetailExpressionType().equals("3")) { //css表达式

@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/PaserCommDownload.java

@@ -356,7 +356,7 @@ public class PaserCommDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java

@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
                 TimeUnit.SECONDS.sleep(2);
             }
         }
-        if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) { //当body为空和动态时调用
-            sentBadSiteMsg(siteMsgTemple, "动态请求异常", "0");
-        } else {
-            sentBadSiteMsg(siteMsgTemple, "静态网络请求异常", "0");
-        }
+        // if (StringUtils.isEmpty(body)) {
+        //     sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
+        // }
         if (StringUtils.isNotEmpty(body)) {
             Document doc = Jsoup.parse(body);
             //抽取资讯url

@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
             // catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
             // catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
             // }
-            if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) { //提取不到信息时再次调用
-                sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
-            }
+            // if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
+            //     sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
+            // }
         }
         if (StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
             String imagUrl = "";

@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
             if (StringUtils.isNotEmpty(content)) {
                 docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
             } else {
-                sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+                // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
                 log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
             }
         } catch (Exception e){
-            sentBadSiteMsg(siteMsgTemple, "解析配置异常", "1");
+            // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
             log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
         }

@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
         docInfo.setId(count+"");
         ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
         if (siteMsgTemple.getYnDynamicCrawl()==1) {
-            processitem.setSource("动态爬取");
+            processitem.setSource("2");
         } else {
-            processitem.setSource("静态爬取");
+            processitem.setSource("1");
         }
         String docjson = mapper.writeValueAsString(processitem);
         // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java

@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.crawler.uriparser.HtmlPageParser;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.ContentUtility;

@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
                 }
             }
         }
         if (StringUtils.isNotEmpty(body)) {
             //抽取资讯url
             List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
             catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
+        } else {
+            // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
         }
     } catch (Exception e) {
         log.info("列表下载异常 对应的链接:" + uri_code);

@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
-            int partition = 0;
-            try {
-                partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
-            } catch (Exception e){
-                log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
-            }
+            // int partition=0;
+            // try {
+            //     partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
+            // }catch (Exception e){
+            //     log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
+            // }
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
             docInfoList.add(docInfo);
             log.info("发送到kafka成功。");

@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());

@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
         return encoding;
     }
+    /**
+     *
+     * @param siteMsgTemple
+     * @param msg 异常信息
+     * @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常
+     */
+    public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(siteMsgTemple.getId());
+            badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+            badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+            badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+            badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+            badSiteMsg.setErrorType(msg);
+            badSiteMsg.setProblemType(problemType);
+            String crawlerType = siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
+            badSiteMsg.setCrawlerType(crawlerType);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("badSiteTopic", docjson);
+            log.info("信息源问题:"+msg);
+        } catch (Exception e){
+        }
+    }
 }
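The sentBadSiteMsg helper added above serializes a BadSiteMsg to JSON with Jackson and publishes it to the badSiteTopic Kafka topic, swallowing any reporting failure. A minimal sketch of that serialize-and-send step, assuming a Spring Kafka KafkaTemplate is injected as in the surrounding classes; the reporter class and its generic payload parameter are illustrative stand-ins:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.kafka.core.KafkaTemplate;

public class BadSiteReporterSketch {
    private final KafkaTemplate<String, String> kafkaTemplate;
    private final ObjectMapper mapper = new ObjectMapper();

    public BadSiteReporterSketch(KafkaTemplate<String, String> kafkaTemplate) {
        this.kafkaTemplate = kafkaTemplate;
    }

    // Serialize any report object to JSON and publish it, mirroring sentBadSiteMsg.
    public void report(Object badSiteMsg) {
        try {
            String json = mapper.writeValueAsString(badSiteMsg);
            kafkaTemplate.send("badSiteTopic", json);
        } catch (Exception e) {
            // reporting failures are swallowed, as in the commit
        }
    }
}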
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java

(diff collapsed in the original view; +0 −0)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java

@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
 import com.zzsn.crawler.uriparser.WebPageScreenShot;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
-import com.zzsn.entity.CatchWebByMetaSearch;
-import com.zzsn.entity.ClbAnsProcessitem;
-import com.zzsn.entity.DocInfo;
-import com.zzsn.entity.SiteMsgTemple;
+import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import com.zzsn.job.JedisUtil;
 import com.zzsn.util.*;

@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
                 body = SeleniumTime.getScopehtml(uri_code);
             }
         }
+        // if(StringUtils.isEmpty(body)){
+        //     sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
+        // }
         //抽取资讯url
         List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
         catchWebByMetaSearchList.addAll(catchWebByMetaSearches);

@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
         return catchWebByMetaSearchList;
     }
+    public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(siteMsgTemple.getId());
+            badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+            badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
+            badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
+            badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
+            badSiteMsg.setErrorType(msg);
+            badSiteMsg.setProblemType(problemType);
+            String crawlerType = siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
+            badSiteMsg.setCrawlerType(crawlerType);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("badSiteTopic", docjson);
+            log.info("信息源问题:"+msg);
+        } catch (Exception e){
+        }
+    }
     //提取列表信息
     public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple, String body) throws Exception {
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();

@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java

@@ -70,42 +70,38 @@ public class SeleniumTime {
         ChromeDriverService service = new ChromeDriverService.Builder().
                 usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
         try {
+            System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
             service.start();
             if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
                 chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
                 chromeOptions.addArguments("headless"); //无界面参数
                 chromeOptions.addArguments("no-sandbox"); //禁用沙盒 就是被这个参数搞了一天
             }
-            // chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
-            // chromeOptions.addArguments("headless");//无界面参数
-            // chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
             driver = new ChromeDriver(chromeOptions); //生成实例
             try {
-                Duration duration = Duration.of(60, ChronoUnit.SECONDS);
+                Duration duration = Duration.of(100, ChronoUnit.SECONDS);
                 driver.manage().timeouts().pageLoadTimeout(duration);
                 driver.get(url);
-                Thread.sleep(1000l);
+                Thread.sleep(10002);
                 try {
                     WebElement webElement = driver.findElement(By.xpath("/html"));
                     html = webElement.getAttribute("outerHTML");
                     System.out.println("browser will be close");
                 } catch (Exception e) {
                     log.info("chromedriver 出现异常:"+e.getMessage());
+                } finally {
+                    driver.quit();
                 }
             } catch (Exception e) {
                 log.info("chromedriver 出现异常:"+e.getMessage());
             } finally {
-                try {
                     driver.quit();
                     service.stop();
-                    Thread.sleep(3000l);
-                } catch (InterruptedException e) {
-                }
             }
         } catch (Exception e) {
+            log.info("chromedriver 驱动访问出现异常:"+e.getMessage());
+            return "";
+        } finally {
+            service.stop();
         }
         return html;
     }

@@ -281,18 +277,18 @@ public class SeleniumTime {
    (whitespace-only changes in main(): the robot.keyPress lines, the comment block about stripping div content by string splitting, the getScopehtml("http://www.flw.ph/thread-869016-1-1.html") test call, and the String a / String b fixtures are unchanged)

@@ -303,7 +299,7 @@ public class SeleniumTime {
    (whitespace-only changes around the System.out.println trace lines and scopehtml.split(a))

@@ -312,31 +308,31 @@ public class SeleniumTime {
    (whitespace-only changes; the commented-out regex-based tag-removal experiment and the File("D:/123.txt") PrintStream output block are unchanged)

@@ -345,30 +341,30 @@ public class SeleniumTime {
    (whitespace-only changes around the trailing e.printStackTrace() and closing braces)
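The SeleniumTime change above is mostly about cleanup: driver.quit() and service.stop() move into finally blocks so the browser and driver service are released even when navigation throws. A minimal sketch of that try/finally pattern with Selenium's ChromeDriver — the flags match the ones the diff sets on non-Windows hosts, while the method and URL are illustrative:

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class DriverCleanupSketch {
    public static String fetchHtml(String url) {
        ChromeOptions options = new ChromeOptions();
        // same flags the diff passes on non-Windows hosts
        options.addArguments("--disable-gpu", "--window-size=1290,1080");
        options.addArguments("headless");
        options.addArguments("no-sandbox");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get(url);
            return driver.getPageSource();
        } finally {
            driver.quit(); // always release the browser, like the new finally blocks
        }
    }
}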
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/WebPageScreenShot.java

@@ -41,8 +41,8 @@ public class WebPageScreenShot {
         // driver.manage().window().maximize();
         String js1 = "return document.body.clientHeight.toString()";
-        String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
-        int height = Integer.parseInt(js1_result);
+        // String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
+        // int height = Integer.parseInt(js1_result);
         List<String> files = new ArrayList<String>();
         int last_t = 0;
         // for (int i = 0; i < 20; ) {

@@ -80,7 +80,7 @@ public class WebPageScreenShot {
         CustomScreenshot customScreenshot = new CustomScreenshot();
         files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
         driver.quit(); //退出浏览器
-        boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
+        // boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
         // if(flag){
         // InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
         // HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
comm_crawler/src/main/java/com/zzsn/crawlerOther/ArticleCrawlerThread.java

@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/PaserCommDownload.java

@@ -361,7 +361,7 @@ public class PaserCommDownload {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByJsonXpath.java

@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java

@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
         try {
             ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             if (StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
                     || StringUtils.isEmpty(processitem.getPublishDate())){
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java

@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
         try {
             ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
             if (siteMsgTemple.getYnDynamicCrawl()==1) {
-                processitem.setSource("动态爬取");
+                processitem.setSource("2");
             } else {
-                processitem.setSource("静态爬取");
+                processitem.setSource("1");
             }
             String docjson = mapper.writeValueAsString(processitem);
             kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);

@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
         clbAnsProcessitem.setSid(docInfo.getSid()+"");
         clbAnsProcessitem.setTitle(docInfo.getTitle());
         clbAnsProcessitem.setContent(docInfo.getContentNoTag());
-        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
+        clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
         clbAnsProcessitem.setSummary(docInfo.getSummary());
         clbAnsProcessitem.setAuthor(docInfo.getAuthor());
         clbAnsProcessitem.setOrigin(docInfo.getOrigin());
comm_crawler/src/main/java/com/zzsn/download/PageConnectioner.java

(diff collapsed in the original view; +103 −67)
comm_crawler/src/main/java/com/zzsn/download/PageDownloader.java
浏览文件 @
cc9aa52f
...
@@ -34,7 +34,7 @@ public class PageDownloader {
...
@@ -34,7 +34,7 @@ public class PageDownloader {
this
.
bDownloadUseFrame
=
b
;
this
.
bDownloadUseFrame
=
b
;
}
}
public
PageDownloader
(){
public
PageDownloader
(){
}
}
Timer
timer
;
Timer
timer
;
public
PageDownloader
(
long
sec
)
{
public
PageDownloader
(
long
sec
)
{
...
@@ -49,39 +49,46 @@ public class PageDownloader {
...
@@ -49,39 +49,46 @@ public class PageDownloader {
// 如果页面编码格式未知,则从页面中获取该页面编码格式
// 如果页面编码格式未知,则从页面中获取该页面编码格式
public
String
getEncodingFromHtmlFile
(
String
urlstr
,
HttpURLConnection
connection
)
throws
IOException
{
public
String
getEncodingFromHtmlFile
(
String
urlstr
,
HttpURLConnection
connection
)
throws
IOException
{
connection
.
setRequestMethod
(
"GET"
);
connection
.
setRequestProperty
(
"User-Agent"
,
"Mozilla/5.0 "
+
"(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+
"Gecko/20080404 Firefox/2.0.0.14"
);
connection
.
setRequestProperty
(
"referer"
,
urlstr
);
connection
.
setRequestProperty
(
"Cookie"
,
"auth=token"
);
String
contentType
=
connection
.
getHeaderField
(
"Content-Type"
);
String
encoding
=
null
;
String
encoding
=
null
;
if
(
contentType
!=
null
)
{
try
{
String
temp
=
"charset="
;
connection
.
setRequestMethod
(
"GET"
);
int
m
=
contentType
.
indexOf
(
temp
);
connection
.
setRequestProperty
(
"User-Agent"
,
"Mozilla/5.0 "
+
"(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
if
(
m
!=
-
1
)
{
+
"Gecko/20080404 Firefox/2.0.0.14"
);
encoding
=
contentType
.
substring
(
m
+
temp
.
length
()).
replace
(
"]"
,
""
);
connection
.
setRequestProperty
(
"referer"
,
urlstr
);
connection
.
setRequestProperty
(
"Cookie"
,
"auth=token"
);
String
contentType
=
connection
.
getHeaderField
(
"Content-Type"
);
if
(
contentType
!=
null
)
{
String
temp
=
"charset="
;
int
m
=
contentType
.
indexOf
(
temp
);
if
(
m
!=
-
1
)
{
encoding
=
contentType
.
substring
(
m
+
temp
.
length
()).
replace
(
"]"
,
""
);
}
}
}
}
if
(
encoding
==
null
)
{
if
(
encoding
==
null
)
{
try
{
InputStream
is
=
null
;
InputStream
is
=
null
;
is
=
connection
.
getInputStream
();
try
{
BufferedInputStream
bufferedInputStream
=
new
BufferedInputStream
(
is
);
is
=
connection
.
getInputStream
();
encoding
=
EncodeDetector
.
getEncoding
(
bufferedInputStream
);
BufferedInputStream
bufferedInputStream
=
new
BufferedInputStream
(
is
);
is
.
close
();
encoding
=
EncodeDetector
.
getEncoding
(
bufferedInputStream
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
//
}
finally
{
assert
is
!=
null
;
is
.
close
();
}
}
}
}
catch
(
Exception
e
)
{
//
}
finally
{
connection
.
disconnect
();
}
}
connection
.
disconnect
();
return
encoding
;
return
encoding
;
}
}
// Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接
// Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接
public
Document
downloadWithDoc
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
)
{
public
Document
downloadWithDoc
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
)
{
Document
doc
=
null
;
Document
doc
=
null
;
String
docBody
=
""
;
String
docBody
=
""
;
if
(
false
)
{
if
(
false
)
{
...
@@ -117,7 +124,7 @@ public class PageDownloader {
...
@@ -117,7 +124,7 @@ public class PageDownloader {
}
}
return
doc
;
return
doc
;
}
}
// Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接
// Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接
public
Document
downloadWithJsonHtml
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
,
public
Document
downloadWithJsonHtml
(
String
url
,
String
encoding
,
boolean
bDynamic
,
boolean
bFrame
,
String
bodyPath
)
{
String
bodyPath
)
{
...
@@ -154,30 +161,24 @@ public class PageDownloader {
 }
 /** String interface, mainly for html pages; fetched via GET, dynamic or static links; when bFrame is false it is usually parsing JSON-format books */
 public String downloadWithStr(String url, String encoding, boolean bDynamic, boolean bFrame) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
     if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
         new PageDownloader(dis + 2000);
     }
-    long startDtime = System.currentTimeMillis();
     PageConnectioner pConn = new PageConnectioner();
-    HttpURLConnection connection = null;
     try {
-        connection = pConn.connection(url);
         if (encoding == null || encoding.isEmpty()) { // detect the site encoding
-            // encoding = getEncodingFromHtmlFile(url, connection);
             PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
             encoding = paserSiteDownload.locateCharSet(url);
         }
     } catch (Exception e1) {
-        // e1.printStackTrace();
         log.info("获取编码失败");
     }
     String docBody = null;
     if (bDynamic) {
         docBody = pConn.dynamicConnectByGet(url, encoding);
     } else {
-        // this.bDownloadUseFrame=true;
         if (bFrame && this.bDownloadUseFrame) {
             String body = null;
             try {
...
@@ -196,12 +197,11 @@ public class PageDownloader {
     }
     if (url.contains("https:")) {
         try {
-            connection = pConn.httpsconnection(url);
             if (encoding == null || encoding.isEmpty()) {
                 encoding = "utf-8";
             }
         } catch (Exception e1) {
-            // e1.printStackTrace();
+            //
         }
         docBody = pConn.staticHttpsConnectByGet(url, encoding, false);
     } else {
...
@@ -211,7 +211,7 @@ public class PageDownloader {
     this.lastDownloadTime = System.currentTimeMillis();
     return docBody;
 }
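The interval check at the top of this method throttles successive downloads; curiously, it does so by constructing a new PageDownloader with a delay argument. A plainer sketch of the same politeness-delay idea, assuming fields like PageDownloader's interval and lastDownloadTime (the Throttle class itself is illustrative, not part of the repository):

    // Illustrative throttle: enforce a minimum gap between downloads.
    class Throttle {
        private final long interval;    // minimum gap in ms; 0 disables throttling
        private long lastDownloadTime;  // 0 until the first download completes
        Throttle(long interval) { this.interval = interval; }
        synchronized void awaitTurn() {
            long dis = System.currentTimeMillis() - lastDownloadTime;
            if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
                try {
                    Thread.sleep(interval - dis);  // wait out the remainder of the gap
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            lastDownloadTime = System.currentTimeMillis();
        }
    }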
 public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic, boolean bFrame, String headerParams) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
...
@@ -221,7 +221,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -237,6 +237,9 @@ public class PageDownloader {
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
+    } finally {
+        assert connection != null;
+        connection.disconnect();
     }
     String docBody = null;
     if (bDynamic) {
...
@@ -264,7 +267,7 @@ public class PageDownloader {
     }
     if (url.contains("https:")) {
         try {
-            connection = pConn.httpsconnection(url);
+            // connection = pConn.httpsconnection(url);
             if (encoding == null || encoding.isEmpty()) {
                 encoding = "utf-8";
             }
...
@@ -306,7 +309,7 @@ public class PageDownloader {
         if (encoding == null || encoding.isEmpty()) {
             encoding = getEncodingFromHtmlFile(url, connection);
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
     }
     String docBody = null;
...
@@ -334,7 +337,7 @@ public class PageDownloader {
         }
     }
     docBody = pConn.staticConnectByGet(url, encoding);
     if (isBadDownloadPage(docBody) && this.badPage) {
         return docBody;
     }
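The headerParams argument is what distinguishes this variant from downloadWithStr, but its wire format is not visible in these hunks. Assuming a simple "Key:Value;Key:Value" string, a hypothetical helper could apply it like this (both the format and the helper are assumptions, not the repository's actual parsing):

    // Hypothetical: apply "Key:Value;Key:Value" pairs as request headers.
    static void applyHeaders(java.net.HttpURLConnection connection, String headerParams) {
        if (headerParams == null || headerParams.isEmpty()) {
            return;
        }
        for (String pair : headerParams.split(";")) {
            int i = pair.indexOf(':');
            if (i > 0) {
                connection.setRequestProperty(pair.substring(0, i).trim(), pair.substring(i + 1).trim());
            }
        }
    }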
...
@@ -344,7 +347,7 @@ public class PageDownloader {
 }
 /** String interface, currently used for crawling books from the Douban API */
 public String downloadPoxyWithStrAPI(String url, String encoding, boolean bDynamic, boolean bFrame) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
     if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
         /*try {
...
@@ -366,8 +369,11 @@ public class PageDownloader {
         if (encoding == null || encoding.isEmpty()) {
             encoding = getEncodingFromHtmlFile(url, connection);
         }
     } catch (Exception e1) {
         // e1.printStackTrace();
+    } finally {
+        assert connection != null;
+        connection.disconnect();
     }
     String docBody = null;
     if (bDynamic) {
...
@@ -399,7 +405,7 @@ public class PageDownloader {
     this.lastDownloadTime = System.currentTimeMillis();
     return docBody;
 }
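The "Poxy" in the method name suggests the request is routed through a proxy. A hedged sketch of pointing an HttpURLConnection at an HTTP proxy (the proxy host, port, and URL are placeholders; the repository's actual proxy wiring is not visible in this hunk):

    import java.net.HttpURLConnection;
    import java.net.InetSocketAddress;
    import java.net.Proxy;
    import java.net.URL;

    // Placeholder proxy endpoint; real values would come from configuration.
    Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8080));
    HttpURLConnection conn =
            (HttpURLConnection) new URL("https://example.com/api").openConnection(proxy);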
 // String interface, mainly for html or json pages; fetched via POST, static links by default
 public String downloadWithStr(String url, String encoding, String param) {
     long dis = System.currentTimeMillis() - lastDownloadTime;
...
@@ -409,7 +415,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -444,7 +450,7 @@ public class PageDownloader {
     /*try {
         String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
         logUtil.getLogger().info(info);
         Thread.sleep(dis+2000);
     } catch (InterruptedException e) {
         logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
     }*/
     String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
...
@@ -493,7 +499,6 @@ public class PageDownloader {
             return true;
         }
     } catch (Exception e) {
-        // TODO Auto-generated catch block
         return true;
     }
     return false;
...
@@ -501,7 +506,7 @@ public class PageDownloader {
 /**
  * Send a GET request to the given URL
  *
  * @param url the URL to send the request to
  *            (only used for the Tadu (塔读) APP)
...
@@ -550,5 +555,5 @@ public class PageDownloader {
             }
         }
         return result;
     }
 }
comm_crawler/src/main/java/com/zzsn/entity/ClbAnsProcessitem.java
View file @ cc9aa52f
...
@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
     /** body text */
     private String content;
-    private String contentWithtag;
+    private String contentWithTag;
     /** unknown */
...
@@ -94,4 +94,4 @@ public class ClbAnsProcessitem {
     /** (temporary workaround) ids of the associated subjects */
     private List<String> subjectIds;
 }
\ No newline at end of file
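The only change in this entity is the field rename contentWithtag -> contentWithTag. If accessors are generated from the field name (e.g. via Lombok, which this diff does not show), the rename is visible to callers and to JSON mapping; a hedged usage sketch:

    ClbAnsProcessitem item = new ClbAnsProcessitem();
    item.setContentWithTag("<p>正文</p>");   // before this commit: setContentWithtag(...)
    String html = item.getContentWithTag(); // serializers would now emit a "contentWithTag" key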
comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
View file @ cc9aa52f
...
@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
     // latest earliest
     // poll interval set to 1h
     // properties.put("max.poll.interval.ms", 60*60*1000);
-    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60 * 60 * 1000);
+    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000);
     properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 25000);
     properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
     properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
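Together these settings make the consumer deliberately slow and safe: at most one record per poll, and up to two hours of processing time before the broker treats the consumer as stalled. A minimal sketch of a consumer built with the same values (the bootstrap address, group id, and deserializers are placeholder assumptions, since those lines sit outside this hunk):

    import java.util.Properties;
    import org.apache.kafka.clients.consumer.ConsumerConfig;
    import org.apache.kafka.clients.consumer.KafkaConsumer;
    import org.apache.kafka.common.serialization.StringDeserializer;

    Properties properties = new Properties();
    properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");      // placeholder
    properties.put(ConsumerConfig.GROUP_ID_CONFIG, "example-group");                 // placeholder
    properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2 * 60 * 60 * 1000); // 2 h, as set above
    properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 25000);
    properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
    properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);                      // one record per poll
    KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);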
...
@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
     // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
-    @Scheduled(cron = "0 0/5 * * * ?")
+    @Scheduled(cron = "0 0/2 * * * ?")
-    @Async("asyncTaskExecutor")
+    // @Async("asyncTaskExecutor")
     public void consumer() {
-        ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
+        // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
-        log.info("进入定时获取mq消息");
+        log.info("进入定时获取topic消息");
         // 1. create the consumer
         KafkaConsumer<String, String> consumer = createConsumer();
         // consume data from specific partitions of the topic
...
@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
     // wait for the broker to return data; the timeout specifies how long poll may block before returning, whether or not data is available
     ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
     // manually commit the offsets of the consumed records
-    // consumer.commitAsync();
     consumer.commitSync();
     if (records != null && records.count() > 0) {
         for (ConsumerRecord record : records) {
...
@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
                 }
             }
         }
     } catch (Exception e) {
-        // consumer.commitSync();
-        log.info(e.getMessage());
-        // consumer = createConsumer();
-        // consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
+        // Close the consumer before continuing; its network connection and sockets close with it, and a rebalance is triggered immediately
+        consumer.close();
         System.out.println("error!!!!!!!!!!!");
+        consumer = createConsumer();
+        // consume data from specific partitions of the topic
+        kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
+        String[] partitions1 = kafkaConsumerPartition.split(",");
+        for (int i = 0; i < partitions1.length; i++) {
+            topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
+        }
+        consumer.assign(topicPartitions);
     }
 }
...
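The rebuilt catch block recovers by recreating the consumer and assigning its partitions by hand instead of subscribing, which bypasses consumer-group rebalancing. A standalone sketch of that assign pattern, reusing the consumer from the sketch above (the topic name and partition list mirror the constants.properties values below):

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.kafka.common.TopicPartition;

    List<TopicPartition> topicPartitions = new ArrayList<>();
    for (String p : "0,1,2,3".split(",")) {  // e.g. the KAFKA_CONSUMER_PARTITION property
        topicPartitions.add(new TopicPartition("clb-infosource-handler-dynamin", Integer.parseInt(p)));
    }
    consumer.assign(topicPartitions);        // manual assignment: no group rebalance for these partitions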
comm_crawler/src/main/resources/constants.properties
View file @ cc9aa52f
...
@@ -35,8 +35,8 @@ PROXYID=1
 #thread pool size
 THREAD_SIZE=1
 #
-CHROMEDRIVE=E:\\chrome\\chromedriver.exe
+CHROMEDRIVE=D:\\chrome\\chromedriver.exe
-CHROMEBIN=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
+CHROMEBIN=C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
 USER_DATA_DIR=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
 #mysql connection
...
@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
 #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
 KAFKA_CONSUMER_TOPIC=clb-infosource-handler-dynamin
 #
-KAFKA_CONSUMER_GROUP_ID=dynamin-sync
+KAFKA_CONSUMER_GROUP_ID=test-zs1
 #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
 KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
 KAFKA_PRODUCT_TOPIC=crawlerInfo
...
@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
 META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
 #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
 #partitions to use, comma separated
-KAFKA_CONSUMER_PARTITION=0
+KAFKA_CONSUMER_PARTITION=0,1,2,3
 #KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 KAFKA_PRODUCT_PARTITION=0
 # Redis settings
-redis.host=127.0.0.1
+redis.host=114.116.26.150
 redis.port=6379
-redis.pass=xxxxxx
+redis.pass=zzsn9988
 #redis.host=8.130.30.33
 #redis.port=9010
 #redis.pass=wxadS&jklim
sina_search/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java
View file @ cc9aa52f
...
@@ -17,19 +17,19 @@ public class ThreadExecutorConfig {
     @Bean(value = "asyncTaskExecutor")
     public Executor executor() {
         ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
-        executor.setCorePoolSize(1); // minimum number of threads the pool keeps alive
+        executor.setCorePoolSize(2); // minimum number of threads the pool keeps alive
-        executor.setMaxPoolSize(1); // maximum number of threads the pool may grow to
+        executor.setMaxPoolSize(5); // maximum number of threads the pool may grow to
         executor.setQueueCapacity(5000); // task queue capacity
         executor.setThreadNamePrefix("ssmsExecutor-");
         /**
          * Rejection policy: what to do with new tasks once the pool has reached max size.
          * CALLER_RUNS: run the task on the calling thread instead of a pool thread.
          */
         executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
         executor.setKeepAliveSeconds(60); // allowed idle time
         executor.initialize();
         return executor;
     }
     @Bean(value = "asyncTaskExecutorSelenium")
...
@@ -139,4 +139,4 @@ public class ThreadExecutorConfig {
         executor.initialize();
         return executor;
     }
 }
\ No newline at end of file
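With core size 2 and max size 5, @Async methods bound to this bean can now actually overlap; the previous single-thread pool serialized them. A short usage sketch (the service class and method names are illustrative):

    import org.springframework.scheduling.annotation.Async;
    import org.springframework.stereotype.Service;

    @Service
    public class CrawlTaskService {
        @Async("asyncTaskExecutor")  // runs on the ssmsExecutor- pool configured above
        public void crawlAsync(String url) {
            // long-running crawl work; up to 5 of these can run concurrently now
        }
    }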