通用采集代码更新

af30a040 · liuweigang · 649ac47c · af30a040 · af30a040 · af30a040
--- a/comm_crawler/doc.txt
+++ b/comm_crawler/doc.txt
@@ -11,3 +11,5 @@



+
+
--- a/comm_crawler/src/main/java/com/zzsn/CrawlerStaticApplication.java
+++ b/comm_crawler/src/main/java/com/zzsn/CrawlerStaticApplication.java
--- a/comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
@@ -469,6 +469,7 @@ public class PaserSiteDownload {
    }

    public static String getHtml(String url,String charset) {
+        java.security.Security.setProperty("networkaddress.cache.ttl" , "0");
        String html="";
        CloseableHttpClient  httpClient = CreateSSLClientDefault.createSSLClientDefault();
        HttpGet httpgeturl = new HttpGet(url);// Get请求

--- a/comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
@@ -2,6 +2,7 @@ package com.zzsn.crawler;

 import cn.hutool.core.date.DateTime;
 import cn.hutool.core.date.DateUtil;
+import cn.hutool.core.io.FileUtil;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.zzsn.configuration.SpringContextUtil;
@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
 import com.zzsn.crawler.paser.WebContentPaserByXpath;
 import com.zzsn.crawler.uriparser.HisURIConfig;
 import com.zzsn.crawler.uriparser.HisURIParser;
+import com.zzsn.crawler.uriparser.HttpgetUtil;
+import com.zzsn.crawler.uriparser.SeleniumTime;
+import com.zzsn.download.PageDownloader;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import lombok.extern.slf4j.Slf4j;
@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
    public PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
    public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();

-    public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
+//    public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);

    @Override
    public void run() {
        crawler();
    }
-
+    public static PageDownloader pageDownload=new PageDownloader();
    public   void crawler(){

        //获取栏目链接以及翻页的链接
+//        List<String> urlList=getPageListUrl(siteMsgTemple);
+        List<String> urlList=new ArrayList<>();
+        urlList.add(siteMsgTemple.getSiteUri());
+        //兼容就平台的历史链接方法
+        String charset="utf-8";
+        //获取列表url等信息通过匹配url过滤
+        List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
+        List<DocInfo> docInfoList=new ArrayList<>();
+
+        log.info("信息源名称："+siteMsgTemple.getSiteName()+" 信息源采集开始时间："+DateTime.now());
+//        Date collectTime=DateTime.now();
+//        String infoSourceId=siteMsgTemple.getId();
+//        //默认表达式类型
+//        siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
+//
+//        //判断列表解析表达式类型
+//        if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
+//            WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
+//            metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
+//
+//        }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
+//            WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
+//            metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
+//
+//        }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
+//            WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
+//            metaSearchList = webContentPaserByJsonXpath.catchWebOfStaticmsgByJsonPath(urlList, charset, siteMsgTemple);
+//
+//        }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
+//            WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
+//            metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
+//        }
+        String body = "";
+        if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
+            body = pageDownload.downloadWithStrAddHeader(urlList.get(0),charset,false,false, siteMsgTemple.getHeaders());
+        }else {
+            try {//先使用静态网络请求获取列表内容
+                body = HttpgetUtil.getHtml(urlList.get(0));
+//              body = pageDownload.downloadWithStr(uri_code, charset, false, false);
+            } catch (Exception e) {
+                log.info(e.getMessage());
+                body = pageDownload.downloadWithStr(urlList.get(0), charset, false, false);
+            }
+            //请求返回为空时判断为动态请求使用模拟浏览器的方式
+            if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
+                body = SeleniumTime.getScopehtml(urlList.get(0));
+            }
+        }
+
+       if(body.length()< 1000){
+           FileUtil.appendString(siteMsgTemple.getInfoSourceCode()+"\n\r","D:\\jingwai.txt","utf-8");
+       }
+
+
+    }
+    public   void crawler2(){
+
+        //获取栏目链接以及翻页的链接
        List<String> urlList=getPageListUrl(siteMsgTemple);
        //兼容就平台的历史链接方法
        HisURIParser hisURIParser = new HisURIParser();
@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
                siteMsgRecord.setCollectTime(collectTime);

                String docjson = mapper.writeValueAsString(siteMsgRecord);
-                kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
+//                kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
                log.info("发送到kafka成功。");
            } catch (JsonProcessingException e) {
 //                    e.printStackTrace();

--- a/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.zzsn.configuration.SpringContextUtil;
 import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
-import com.zzsn.crawler.uriparser.HtmlPageParser;
-import com.zzsn.crawler.uriparser.SeleniumTime;
-import com.zzsn.crawler.uriparser.WebPageScreenShot;
+import com.zzsn.crawler.uriparser.*;
 import com.zzsn.download.PageBuilderParser;
 import com.zzsn.download.PageDownloader;
 import com.zzsn.entity.*;
@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
                        }else {
                            try {//先使用静态网络请求获取列表内容
-                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
+                                body =HttpgetUtil.getHtml(uri_code);
+//                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
                            }catch (Exception e){
                                log.info(e.getMessage());
-                                body = paserSiteDownload.getHtml(uri_code, charset);
+                                body = pageDownload.downloadWithStr(uri_code, charset, false, false);
+//                                body = paserSiteDownload.getHtml(uri_code, charset);
                            }
                            //请求返回为空时判断为动态请求使用模拟浏览器的方式
                            if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
+//                                SeleniumTime seleniumTime=new SeleniumTime();
+//                                body = seleniumTime.getScopehtml(uri_code);
                                body = SeleniumTime.getScopehtml(uri_code);
                            }
                            if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) {
@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
                String content = "";
                try {
                    if (siteMsgTemple.getYnDynamicCrawl() == 1) {
+//                        SeleniumTime seleniumTime=new SeleniumTime();
+//                        content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
                        content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
                    } else {
                        try {
-                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
+                            content =HttpgetUtil.getHtml(cwbm.getSourceaddress());
+//                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
                        } catch (Exception e) {
+                            content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
                            log.info(e.getMessage());
-                            content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
+//                            content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
                        }
                    }
                    //超时，失效连接反馈，selenium驱动关闭不掉，信息源分类
@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
                docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
                docInfo.setAuthor(cwbm.getAuthor());
                docInfo.setPublishDate(cwbm.getPublishDate());
-                if (cwbm.getSourceaddress() != null) {
-                    docInfo.setOrigin(cwbm.getSourcesite());
-                } else {
-                    docInfo.setOrigin(siteMsgTemple.getSiteName());
-                }
+//                if (cwbm.getSourceaddress() != null) {
+//                    docInfo.setOrigin(cwbm.getSourcesite());
+//                } else {
+//                    docInfo.setOrigin(siteMsgTemple.getSiteName());
+//                }
+                docInfo.setOrigin(siteMsgTemple.getSiteName());
                docInfo.setSummary(cwbm.getSummary());
                //封装解析的docinfo对象
                try {
@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {

            }
            docInfo.setContentWithTag(contentWithTag);
-            docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
+            docInfo.setContentNoTag(ContentUtility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));

    }
        //作者
@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
           origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource());
            if(StringUtils.isNotEmpty(origin)) {
                docInfo.setOrigin(origin);
+            }else{
+                docInfo.setOrigin(siteTemplate.getSiteName());
            }
+        }else{
+            docInfo.setOrigin(siteTemplate.getSiteName());
        }
+
        return docInfo;
    }


--- a/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
+package com.zzsn.crawler.uriparser;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.zzsn.download.CreateSSLClientDefault;
+import com.zzsn.util.Utility;
+import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.ParseException;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.params.CoreConnectionPNames;
+import org.apache.http.util.EntityUtils;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+
+/**
+ * Static helpers for downloading a page with an HTTP GET and working out
+ * which character encoding to decode it with.
+ */
+public class HttpgetUtil {
+
+	/** Browser-like User-Agent sent with every request made by this class. */
+	private static final String USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);";
+
+	/** Connect timeout and socket read timeout for page downloads, in milliseconds. */
+	private static final int REQUEST_TIMEOUT_MS = 60000;
+
+	/** Pause inserted before each network step, in milliseconds. */
+	private static final long STEP_DELAY_MS = 500L;
+
+	/** Encoding reported when the page declares none, or when detection fails. */
+	private static final String DEFAULT_PAGE_CHARSET = "gb2312";
+
+	/** Matches a complete {@code <meta ...>} tag, ignoring case. */
+	private static final Pattern META_TAG = Pattern.compile("<meta[^>]*>",
+			Pattern.CASE_INSENSITIVE);
+
+	/** Matches {@code charset=value} up to the first whitespace, quote, ';', '|' or '>'. */
+	private static final Pattern CHARSET_BARE = Pattern.compile("charset[^\\s||\"||;||'||>]*");
+
+	/** Matches {@code charset="value} for tags that wrap the value in double quotes. */
+	private static final Pattern CHARSET_QUOTED = Pattern.compile("charset=\"[^\\s||\"||;||>]*");
+
+	/**
+	 * Downloads {@code url} with an HTTP GET and returns the body as text.
+	 * <p>
+	 * The charset comes from the response's Content-Type header when present,
+	 * otherwise from {@link #LocateCharSet(String)}; either way it is passed
+	 * through {@code Utility.charsetcheck} before decoding.
+	 *
+	 * @param url address of the page to fetch
+	 * @return the decoded body, or an empty string when the body could not be read
+	 * @throws IllegalStateException if the request itself fails or the response
+	 *         carries no body. The previous code surfaced these cases as an
+	 *         accidental NullPointerException / IllegalArgumentException, so
+	 *         callers that catch {@code Exception} to fall back to another
+	 *         downloader behave as before.
+	 */
+	public static String getHtml(String url) {
+		// NOTE(review): the client is never closed here. Whether it is safe to close
+		// depends on CreateSSLClientDefault returning a fresh instance per call — confirm.
+		CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
+		HttpGet request = new HttpGet(url);
+
+		// Both timeouts use the HttpClient 4.x parameter names and the typed int setter.
+		request.getParams().setIntParameter(
+				CoreConnectionPNames.CONNECTION_TIMEOUT, REQUEST_TIMEOUT_MS);
+		request.getParams().setIntParameter(
+				CoreConnectionPNames.SO_TIMEOUT, REQUEST_TIMEOUT_MS);
+		// Present the request as coming from a browser.
+		request.setHeader("Content-Type",
+				"application/x-www-form-urlencoded;charset=utf-8");
+		request.setHeader("User-Agent", USER_AGENT);
+		request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
+
+		try {
+			pause();
+			HttpResponse response;
+			try {
+				response = httpClient.execute(request);
+			} catch (IOException e) {
+				// Fail loudly with the cause instead of dereferencing a null response later.
+				throw new IllegalStateException("GET request failed: " + url, e);
+			}
+
+			HttpEntity entity = response.getEntity();
+			if (entity == null) {
+				throw new IllegalStateException("Response has no body: " + url);
+			}
+
+			String charset = EntityUtils.getContentCharSet(entity);
+			if (charset == null) {
+				charset = LocateCharSet(url);
+			}
+			charset = Utility.charsetcheck(charset);
+
+			pause();
+			try {
+				return EntityUtils.toString(entity, charset);
+			} catch (IOException | RuntimeException e) {
+				// A body that cannot be read or decoded yields an empty page, as before.
+				e.printStackTrace();
+				return "";
+			}
+		} finally {
+			// Runs on every path so a failed request does not keep its connection.
+			request.releaseConnection();
+		}
+	}
+
+	/**
+	 * Fetches {@code url} with Jsoup and returns the charset declared in the
+	 * first {@code <meta>} tag that mentions one.
+	 *
+	 * @param url address of the page to inspect
+	 * @return the declared charset; {@code "gb2312"} when no tag declares one or
+	 *         the page cannot be fetched; {@code "GB2312"} when a declaration is
+	 *         found but its value is blank
+	 */
+	public static String LocateCharSet(String url) {
+		String encoding = DEFAULT_PAGE_CHARSET;
+		try {
+			Thread.sleep(STEP_DELAY_MS);
+			Connection conn = Jsoup.connect(url);
+			// Present the request as coming from a browser.
+			conn.header("User-Agent", USER_AGENT);
+			Document doc = conn.ignoreContentType(true).timeout(10000).get();
+
+			Matcher metaTags = META_TAG.matcher(doc.toString());
+			while (metaTags.find()) {
+				String tag = metaTags.group();
+				Matcher bare = CHARSET_BARE.matcher(tag);
+				if (!bare.find()) {
+					continue;
+				}
+				// Drop the 8-character "charset=" prefix.
+				encoding = bare.group().substring(8);
+				if (encoding.trim().length() == 0) {
+					// The value was quoted, so the bare pattern stopped at the opening quote.
+					Matcher quoted = CHARSET_QUOTED.matcher(tag);
+					if (quoted.find()) {
+						// Drop the 9-character charset=" prefix.
+						encoding = quoted.group().substring(9);
+					}
+					if (encoding.trim().length() == 0) {
+						encoding = "GB2312";
+					}
+				}
+				return encoding;
+			}
+		} catch (InterruptedException e) {
+			// Keep the interrupt visible to the caller's thread pool.
+			Thread.currentThread().interrupt();
+			return encoding;
+		} catch (Exception e) {
+			e.printStackTrace();
+			System.out.println("获取出错编码方式");
+			return encoding;
+		}
+
+		return encoding;
+	}
+
+	/** Sleeps for {@link #STEP_DELAY_MS}, restoring the interrupt flag if interrupted. */
+	private static void pause() {
+		try {
+			Thread.sleep(STEP_DELAY_MS);
+		} catch (InterruptedException e) {
+			Thread.currentThread().interrupt();
+		}
+	}
+}
--- a/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
@@ -32,6 +32,7 @@ public class SeleniumTime {
 	public  static String getScopehtml(String url) {
 		String html = "";
 		try {
+
 			ReuseWebDriver driver = DriverUtil.getChromeDriver();
 			try {
 				Duration duration=Duration.of(100, ChronoUnit.SECONDS);

--- a/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
+package com.zzsn.crawler.uriparser;
+
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+
+import com.zzsn.generation.Constants;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+
+/**
+ * Fetches the rendered HTML of a page through a dedicated Chrome instance.
+ * <p>
+ * The constructor starts a ChromeDriver and {@link #getScopehtml(String)}
+ * quits it before returning, so each instance serves a single call.
+ */
+public class SeleniumTime4 {
+
+	/** Pages under this address get the "no permission" attachment tip removed. */
+	private static final String FLW_HOST = "http://www.flw.ph";
+
+	/** Opening tag of the attachment tip block. */
+	private static final String TIP_OPEN = "<div class=\"attach_nopermission attach_tips\">";
+
+	/** Close button that sits inside the attachment tip block. */
+	private static final String TIP_CLOSE_BUTTON = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
+
+	/**
+	 * Characters skipped after the close button.
+	 * NOTE(review): presumably the remainder of the tip block's markup — confirm against a live page.
+	 */
+	private static final int TIP_TRAILER_LENGTH = 7;
+
+	public ChromeOptions chromeOptions = new ChromeOptions();
+	public ChromeDriver driver;
+
+	/**
+	 * Starts Chrome with image loading disabled, using the driver binary
+	 * configured in {@code Constants.CHROMEDRIVE}.
+	 */
+	public SeleniumTime4() {
+		System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
+		chromeOptions.addArguments("blink-settings=imagesEnabled=false");
+		driver = new ChromeDriver(chromeOptions);
+	}
+
+	/**
+	 * Loads {@code url} in the browser and returns the page's HTML.
+	 * <p>
+	 * The {@code outerHTML} of the root element is read first; if that fails,
+	 * the driver's page source is used instead. The browser is always quit
+	 * before this method returns.
+	 *
+	 * @param url address of the page to load
+	 * @return the page HTML, or {@code null} when the page could not be loaded
+	 */
+	public String getScopehtml(String url) {
+		try {
+			driver.get(url);
+			WebElement root = driver.findElement(By.xpath("/html"));
+			String html;
+			try {
+				// Wait before reading the DOM.
+				Thread.sleep(3000L);
+				html = root.getAttribute("outerHTML");
+			} catch (RuntimeException e) {
+				System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
+						+"可能原因为过快的执行没有找到指定的页面元素");
+				System.out.println("=============执行方法二==============");
+				Thread.sleep(3000L);
+				html = driver.getPageSource();
+			}
+			if (html != null && url.contains(FLW_HOST)) {
+				html = stripPermissionTip(html);
+			}
+			return html;
+		} catch (InterruptedException e) {
+			// Keep the interrupt visible to whoever owns this thread.
+			Thread.currentThread().interrupt();
+		} catch (Exception e) {
+			e.printStackTrace();
+		} finally {
+			// Single shutdown point: previously the failure path called quit() twice.
+			quitDriver();
+		}
+		return null;
+	}
+
+	/**
+	 * Removes the first attachment tip from {@code html}: everything from the
+	 * tip's opening tag through {@link #TIP_TRAILER_LENGTH} characters past its
+	 * close button. Text before and after the tip is kept. The page is returned
+	 * unchanged when either marker is missing or the page ends inside the tip.
+	 */
+	private static String stripPermissionTip(String html) {
+		int tipStart = html.indexOf(TIP_OPEN);
+		if (tipStart < 0) {
+			return html;
+		}
+		int buttonStart = html.indexOf(TIP_CLOSE_BUTTON, tipStart + TIP_OPEN.length());
+		if (buttonStart < 0) {
+			return html;
+		}
+		int resumeAt = buttonStart + TIP_CLOSE_BUTTON.length() + TIP_TRAILER_LENGTH;
+		if (resumeAt > html.length()) {
+			return html;
+		}
+		return html.substring(0, tipStart) + html.substring(resumeAt);
+	}
+
+	/** Waits five seconds, then quits the browser; a failing quit is reported, not rethrown. */
+	private void quitDriver() {
+		try {
+			Thread.sleep(5000L);
+		} catch (InterruptedException e) {
+			Thread.currentThread().interrupt();
+		}
+		try {
+			driver.quit();
+		} catch (RuntimeException e) {
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Manual check: downloads one flw.ph thread, reports which tip markers are
+	 * still present, and writes the stripped page to {@code D:/123.txt}.
+	 */
+	public static void main(String[] args) {
+		SeleniumTime4 s = new SeleniumTime4();
+		String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
+		if (scopehtml == null) {
+			System.out.println("未获取到页面内容");
+			return;
+		}
+
+		System.out.println("开始");
+		if (scopehtml.contains(TIP_OPEN)) {
+			System.out.println("包含a");
+		}
+		// Previously this tested the opening tag a second time.
+		if (scopehtml.contains(TIP_CLOSE_BUTTON)) {
+			System.out.println("包含b");
+		}
+		System.out.println("结束");
+
+		String sab = stripPermissionTip(scopehtml);
+
+		File file = new File("D:/123.txt");
+		// try-with-resources so the output is flushed and the file handle released.
+		try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
+			ps.println(sab);
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+	}
+
+}
--- a/comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
+++ b/comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
                            body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
                        }else {
                            if (siteMsgTemple.getYnDynamicCrawl() == 1) {
-                                seleniumTime=new SeleniumTime();
-                                body = seleniumTime.getScopehtml(uri_code);
+//                                seleniumTime=new SeleniumTime();
+                                body = SeleniumTime.getScopehtml(uri_code);
                                TimeUnit.SECONDS.sleep(5);
                                seleniumTime.close();
                            } else {

--- a/comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
+++ b/comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
@@ -165,7 +165,8 @@ public class JedisUtil {
                throw new Exception("key is null");
            }
            jedis = getDefaultJedis();
-            value = jedis.get(PREFIX + key);
+//            value = jedis.get(PREFIX + key);
+            value = jedis.get(key);
        }catch (Exception e){

        }finally {

--- a/comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
+++ b/comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
 //            , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));


-    @Scheduled(cron = "0 0/2 * * * ?")
+//    @Scheduled(cron = "0 0/2 * * * ?")
 //    @Async("asyncTaskExecutor")
    public void consumer (){
 //        ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);

--- a/comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
+++ b/comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
@@ -98,6 +98,7 @@ public class ChromeTest {

    // 可复用驱动使用Demo
    public static void main(String[] args) throws Exception {
+
        ReuseWebDriver driver = DriverUtil.getChromeDriver();
        if (driver == null) {
            // 从缓存取出SessionId为空才时，驱动会返回null，可参考工具类重新设置缓存

--- a/comm_crawler/src/main/java/com/zzsn/test/HttpClientTester.java
+++ b/comm_crawler/src/main/java/com/zzsn/test/HttpClientTester.java
 package com.zzsn.test;


+import com.zzsn.crawler.uriparser.HttpgetUtil;
 import com.zzsn.download.PageBuilderParser;
 import org.apache.http.HttpEntity;
 import org.apache.http.NameValuePair;
@@ -41,7 +42,9 @@ import java.util.List;
 public class HttpClientTester {
 	private static PageBuilderParser builderParser = null;
 	public static void  main(String[] args) {
-		get("https://www.cas.cn/zjs/");
+//		get("https://edition.cnn.com/world");
+		String html = HttpgetUtil.getHtml("https://edition.cnn.com/world");
+		System.out.println(html);
 //		post();
 	}


--- a/comm_crawler/src/main/java/com/zzsn/test/WebTest.java
+++ b/comm_crawler/src/main/java/com/zzsn/test/WebTest.java
 package com.zzsn.test;

+import com.zzsn.crawler.PaserSiteDownload;
 import com.zzsn.download.PageDownloader;

 import java.io.IOException;
@@ -17,21 +18,12 @@ import java.io.InputStream;
 public class WebTest {

    public static void main(String[] args) {
-//        String url="https://www.teriin.org/opinion";
+        String url="https://www.teriin.org/opinion";
 //        PageDownloader pageDownload=new PageDownloader();
 //       String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
 //        System.out.println(body);
-        try {
-            Runtime mt = Runtime.getRuntime();
-            String cmd = "taskkill /F /im chrome.exe";
-            Process pro = mt.exec(cmd);
-            InputStream ers= pro.getErrorStream();
-            pro.waitFor();
-            System.out.println("++++++++ taskkill /F /im chromedriver.exe");
-        } catch (IOException ioe) {
-            ioe.printStackTrace();
-        } catch (InterruptedException e) {
-            // TODO Auto-generated catch block
-        }
+        PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
+        String html = paserSiteDownload.getHtml("https://edition.cnn.com/world", "utf-8");
+        System.out.println(html);
    }
 }
--- a/comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
+++ b/comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
@@ -287,15 +287,17 @@ public class ContentUtility {
        if(htmlText==null){
            return null;
        }
-        String text = ContentUtility.HTMLDecode(ContentUtility.RemoveHTMLCode(ContentUtility.RemoveStyleCode(ContentUtility.RemoveHTMLReturnCode(htmlText))));
-		text = text.replaceAll(" 　　", "\r\n");
+        String text = Utility.HTMLDecode(Utility.RemoveHTMLCode(Utility.RemoveStyleCode(Utility.RemoveHTMLReturnCode(htmlText))));
+        text = text.replaceAll(" 　　", "\r\n");

-		text = text.replaceAll(" +\r\n", "\r\n");
-		text = text.replaceAll(" +", " ");
-		text = text.replaceAll("[\\u00A0\\u3000]", "");
-		text = text.replaceAll("　", "");
+        text = text.replaceAll(" +\r\n", "\r\n");
+        text = text.replaceAll(" +", " ");
+        text = text.replaceAll("[\\u00A0\\u3000]", "");
+        text = text.replaceAll("　", "");
+        text = text.replaceAll(" \n", "\n");
+        text = text.replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n");

-		return text;
+        return text;

    }


--- a/comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
+++ b/comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
@@ -58,10 +58,10 @@ public class DriverUtil {
    }

    public static ReuseWebDriver connectChrome(String sessionId, String serverUrl) throws Exception {
-        if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
-            log.error("未获取到驱动服务地址、sessionId");
-            return null;
-        }
+//        if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
+//            log.error("未获取到驱动服务地址、sessionId");
+//            return null;
+//        }

        ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId);
        if (driver.connectTestFail()) {
@@ -89,10 +89,21 @@ public class DriverUtil {
     * @date 2022/7/25 15:07
     */
    public static ReuseWebDriver getChromeDriver() throws Exception {
-        String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
-        Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
+        Map<String, String> map =getSessionInfo();
        String sessionId = map.get("sessionId");
        String serverUrl = map.get("serverUrl");
        return connectChrome(sessionId, serverUrl);
    }
+    public static Map<String, String> getSessionInfo() throws Exception{
+        String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
+        Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
+        if(map==null || map.size()<1) {
+            map = new HashMap<>(2);
+            map.put("sessionId", "sessionId");
+            map.put("serverUrl", "https://www.baidu.com/");
+            // 缓存浏览器驱动信息
+            JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
+        }
+        return map;
+    }
 }
--- a/comm_crawler/src/main/java/com/zzsn/util/Utility.java
+++ b/comm_crawler/src/main/java/com/zzsn/util/Utility.java
--- a/comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
+++ b/comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
@@ -2,6 +2,8 @@ package com.zzsn.util;

 import com.zzsn.crawler.ReuseWebDriver;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.scheduling.annotation.EnableScheduling;
+import org.springframework.scheduling.annotation.Scheduled;

 import java.io.BufferedReader;
 import java.io.InputStreamReader;
@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
 */
 @Slf4j
 @SuppressWarnings("all")
+@EnableScheduling
 public class WindowsProcess {

    private static Pattern TASK_LIST_PATTERN = Pattern.compile("^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$");
-    private static String DRIVER_NAME = "chrome.exe";
+    private static String CHROME_NAME = "chrome.exe";
+    private static String DRIVER_NAME = "chromedriver.exe";

    public static void main(String[] args) {
        WindowsProcess process = new WindowsProcess();
@@ -28,6 +32,7 @@ public class WindowsProcess {
     * @author andylau
     * @date 2022/7/26 11:23
     */
+//    @Scheduled(cron = "0 0 1 * * ?")
    private void killProcess() {
        try {
            String line;
@@ -35,14 +40,14 @@ public class WindowsProcess {
            BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream()));

            while ((line = input.readLine()) != null) {
-                if (line.contains(DRIVER_NAME)) {
+                if (line.contains(CHROME_NAME)|| line.contains(DRIVER_NAME) ) {
                    Matcher matcher = TASK_LIST_PATTERN.matcher(line);
                    if (matcher.find()) {
-                        // String serviceName = matcher.group(1);
-                        String pid = matcher.group(2);
-                        // String sessionName = matcher.group(3);
-                        // String size = matcher.group(4).replace(",", "") + "K";
-                        // log.info("正在关闭服务：\n服务名：{}\nPid：{}\n会话名：{}\n内存使用：{}\n", serviceName, pid, sessionName, size);
+//                         String serviceName = matcher.group(1);
+                         String pid = matcher.group(2);
+//                         String sessionName = matcher.group(3);
+//                         String size = matcher.group(4).replace(",", "") + "K";
+//                         log.info("正在关闭服务：\n服务名：{}\nPid：{}\n会话名：{}\n内存使用：{}\n", serviceName, pid, sessionName, size);
                        Runtime.getRuntime().exec("taskkill /pid " + pid);
                    }
                }
@@ -51,11 +56,11 @@ public class WindowsProcess {
            log.error("浏览器驱动关闭异常...");
        } finally {
            // 定时任务关闭驱动后，重新打开驱动
-            try {
-                reopenChromeDriver();
-            } catch (Exception e) {
-                log.error("驱动打开异常...");
-            }
+//            try {
+//                reopenChromeDriver();
+//            } catch (Exception e) {
+//                log.error("驱动打开异常...");
+//            }
        }
    }


--- a/comm_crawler/src/main/resources/aa.txt
+++ b/comm_crawler/src/main/resources/aa.txt
--- a/comm_crawler/src/main/resources/application.properties
+++ b/comm_crawler/src/main/resources/application.properties
@@ -5,9 +5,9 @@ spring.profiles.active:=dev
 server.port=8081


-spring.http.encoding.force=true
-spring.http.encoding.charset=UTF-8
-spring.http.encoding.enabled=true
+#spring.http.encoding.force=true
+#spring.http.encoding.charset=UTF-8
+#spring.http.encoding.enabled=true

 spring.thymeleaf.cache=false
 spring.thymeleaf.enabled=false
@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000

 logging.level.root=info
 logging.level.org.springframework.web=info
-logginglevelorghibernate=info
+logging.level.org.hibernate=info
 logging.config=classpath:logback-spring.xml

 kafka.consumer.task=0 0/2 * * * ?

--- a/comm_crawler/src/main/resources/constants.properties
+++ b/comm_crawler/src/main/resources/constants.properties
@@ -35,7 +35,7 @@ PROXYID=1
 #线程池大小
 THREAD_SIZE=1
 #
-CHROMEDRIVE= D:\\chrome\\chromedriver.exe
+CHROMEDRIVE= E:\\chrome\\chromedriver.exe
 CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
 USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default

@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
 redis.host=114.116.26.150
 redis.port=6379
 redis.pass=zzsn9988
+#redis.host=114.115.236.206
+#redis.port=6379
+#redis.pass=clbzzsn
 #redis.host=8.130.30.33
 #redis.port=9010
 #redis.pass=wxadS&jklim
@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
 HUAWEICLOUD_AK= VEHN7D0TJ9316H8AHCAV
 HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY

-IMGPATH= E:\\chrome\\img\\shot.png
-
+#IMGPATH= E:\\chrome\\img\\shot.png
+IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\aa.txt

+selenium.driver.cache=comm_selenium_driver_cache_1




--- a/comm_crawler/src/main/resources/redis.properties
+++ b/comm_crawler/src/main/resources/redis.properties
 # Redis settings  
-redis.host=127.0.0.1
+redis.host=114.115.236.206
 redis.port=6379
-redis.pass=xxxxxx
+redis.pass=clbzzsn
 redis.timeout=10000
+#redis.host=127.0.0.1
+#redis.port=6379
+#redis.pass=xxxxxx
+#redis.timeout=10000
 
 redis.maxIdle=300
 redis.maxTotal=600