提交 af30a040 作者: liuweigang

通用采集代码更新

上级 649ac47c
...@@ -11,3 +11,5 @@ ...@@ -11,3 +11,5 @@
...@@ -469,6 +469,7 @@ public class PaserSiteDownload { ...@@ -469,6 +469,7 @@ public class PaserSiteDownload {
} }
public static String getHtml(String url,String charset) { public static String getHtml(String url,String charset) {
java.security.Security.setProperty("networkaddress.cache.ttl" , "0");
String html=""; String html="";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpGet httpgeturl = new HttpGet(url);// Get请求
......
...@@ -2,6 +2,7 @@ package com.zzsn.crawler; ...@@ -2,6 +2,7 @@ package com.zzsn.crawler;
import cn.hutool.core.date.DateTime; import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil; import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
...@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular; ...@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath; import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.HisURIConfig; import com.zzsn.crawler.uriparser.HisURIConfig;
import com.zzsn.crawler.uriparser.HisURIParser; import com.zzsn.crawler.uriparser.HisURIParser;
import com.zzsn.crawler.uriparser.HttpgetUtil;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{ ...@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
public PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple(); public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class); // public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override @Override
public void run() { public void run() {
crawler(); crawler();
} }
public static PageDownloader pageDownload=new PageDownloader();
public void crawler(){ public void crawler(){
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
// List<String> urlList=getPageListUrl(siteMsgTemple);
List<String> urlList=new ArrayList<>();
urlList.add(siteMsgTemple.getSiteUri());
//兼容就平台的历史链接方法
String charset="utf-8";
//获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
List<DocInfo> docInfoList=new ArrayList<>();
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
// Date collectTime=DateTime.now();
// String infoSourceId=siteMsgTemple.getId();
// //默认表达式类型
// siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
//
// //判断列表解析表达式类型
// if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
// WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
// metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
// WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
// metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
// WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
// metaSearchList = webContentPaserByJsonXpath.catchWebOfStaticmsgByJsonPath(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
// WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
// metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
// }
String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(urlList.get(0),charset,false,false, siteMsgTemple.getHeaders());
}else {
try {//先使用静态网络请求获取列表内容
body = HttpgetUtil.getHtml(urlList.get(0));
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} catch (Exception e) {
log.info(e.getMessage());
body = pageDownload.downloadWithStr(urlList.get(0), charset, false, false);
}
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(urlList.get(0));
}
}
if(body.length()< 1000){
FileUtil.appendString(siteMsgTemple.getInfoSourceCode()+"\n\r","D:\\jingwai.txt","utf-8");
}
}
public void crawler2(){
//获取栏目链接以及翻页的链接
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
//兼容就平台的历史链接方法 //兼容就平台的历史链接方法
HisURIParser hisURIParser = new HisURIParser(); HisURIParser hisURIParser = new HisURIParser();
...@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{ ...@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
siteMsgRecord.setCollectTime(collectTime); siteMsgRecord.setCollectTime(collectTime);
String docjson = mapper.writeValueAsString(siteMsgRecord); String docjson = mapper.writeValueAsString(siteMsgRecord);
kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace(); // e.printStackTrace();
......
...@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; ...@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.*;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -61,13 +59,17 @@ public class WebContentPaserByRegular { ...@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try {//先使用静态网络请求获取列表内容 try {//先使用静态网络请求获取列表内容
body = pageDownload.downloadWithStr(uri_code, charset, false, false); body =HttpgetUtil.getHtml(uri_code);
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
body = paserSiteDownload.getHtml(uri_code, charset); body = pageDownload.downloadWithStr(uri_code, charset, false, false);
// body = paserSiteDownload.getHtml(uri_code, charset);
} }
//请求返回为空时判断为动态请求使用模拟浏览器的方式 //请求返回为空时判断为动态请求使用模拟浏览器的方式
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) { if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
// SeleniumTime seleniumTime=new SeleniumTime();
// body = seleniumTime.getScopehtml(uri_code);
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) { if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) {
...@@ -270,13 +272,17 @@ public class WebContentPaserByRegular { ...@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
String content = ""; String content = "";
try { try {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
// SeleniumTime seleniumTime=new SeleniumTime();
// content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} else { } else {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false); content =HttpgetUtil.getHtml(cwbm.getSourceaddress());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
} catch (Exception e) { } catch (Exception e) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
log.info(e.getMessage()); log.info(e.getMessage());
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null); // content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类 //超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
...@@ -307,11 +313,12 @@ public class WebContentPaserByRegular { ...@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", "")); docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor()); docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate()); docInfo.setPublishDate(cwbm.getPublishDate());
if (cwbm.getSourceaddress() != null) { // if (cwbm.getSourceaddress() != null) {
docInfo.setOrigin(cwbm.getSourcesite()); // docInfo.setOrigin(cwbm.getSourcesite());
} else { // } else {
docInfo.setOrigin(siteMsgTemple.getSiteName()); // docInfo.setOrigin(siteMsgTemple.getSiteName());
} // }
docInfo.setOrigin(siteMsgTemple.getSiteName());
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
//封装解析的docinfo对象 //封装解析的docinfo对象
try { try {
...@@ -533,7 +540,7 @@ public class WebContentPaserByRegular { ...@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {
} }
docInfo.setContentWithTag(contentWithTag); docInfo.setContentWithTag(contentWithTag);
docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n","")); docInfo.setContentNoTag(ContentUtility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
} }
//作者 //作者
...@@ -567,8 +574,13 @@ public class WebContentPaserByRegular { ...@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource()); origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource());
if(StringUtils.isNotEmpty(origin)) { if(StringUtils.isNotEmpty(origin)) {
docInfo.setOrigin(origin); docInfo.setOrigin(origin);
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
return docInfo; return docInfo;
} }
......
package com.zzsn.crawler.uriparser;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zzsn.download.CreateSSLClientDefault;
import com.zzsn.util.Utility;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Static helpers for downloading a page with an HTTP GET request and for guessing the
 * character encoding of a page from its {@code <meta>} tags.
 */
public class HttpgetUtil {

    /** Connect and socket timeout applied to each GET request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 60000;

    /** Pause inserted before each network operation, in milliseconds. */
    private static final long PAUSE_MILLIS = 500L;

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);";

    /** Encoding reported when the page cannot be fetched or inspected. */
    private static final String DEFAULT_ENCODING = "gb2312";

    // The patterns are compiled once instead of on every loop iteration. Their text is
    // unchanged on purpose (including the redundant '|' inside the character classes),
    // so matching behaves exactly as before.
    private static final Pattern META_TAG =
            Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern CHARSET_ATTR =
            Pattern.compile("charset[^\\s||\"||;||'||>]*");
    private static final Pattern QUOTED_CHARSET_ATTR =
            Pattern.compile("charset=\"[^\\s||\"||;||>]*");

    /**
     * Downloads {@code url} with a GET request and returns the response body as text.
     *
     * <p>The charset is taken from the response entity when it declares one; otherwise it
     * is guessed with {@link #LocateCharSet(String)} (which issues a second request). The
     * result is then passed through {@code Utility.charsetcheck}.
     *
     * @param url address of the page to download
     * @return the decoded body, or an empty string when the body could not be read
     * @throws IllegalStateException if the request could not be executed, was interrupted,
     *         or produced no response entity. Callers that fall back to another downloader
     *         in a {@code catch (Exception e)} block keep working; previously this case
     *         surfaced as an accidental {@code NullPointerException}.
     */
    public static String getHtml(String url) {
        HttpGet request = new HttpGet(url);
        request.getParams().setIntParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT, TIMEOUT_MILLIS);
        request.getParams().setParameter(
                HttpMethodParams.SO_TIMEOUT, TIMEOUT_MILLIS);
        // Disguise the request as coming from a browser.
        request.setHeader("Content-Type",
                "application/x-www-form-urlencoded;charset=utf-8");
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

        // NOTE(review): closing the client assumes createSSLClientDefault() builds a new
        // client on every call — confirm; a shared instance must not be closed here.
        try (CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault()) {
            try {
                Thread.sleep(PAUSE_MILLIS);
                HttpResponse response = httpClient.execute(request);
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IllegalStateException("Response has no entity: " + url);
                }
                return readBody(entity, resolveCharset(entity, url));
            } finally {
                // Always hand the connection back, also when the request or decoding fails.
                request.releaseConnection();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted while requesting " + url, e);
        } catch (IOException e) {
            throw new IllegalStateException("GET request failed: " + url, e);
        }
    }

    /** Picks the charset declared by the entity, else the one guessed from the page markup. */
    private static String resolveCharset(HttpEntity entity, String url) {
        String declared = EntityUtils.getContentCharSet(entity);
        String charset = declared != null ? declared : LocateCharSet(url);
        return Utility.charsetcheck(charset);
    }

    /** Decodes the entity; returns an empty string (after reporting) if reading fails. */
    private static String readBody(HttpEntity entity, String charset) {
        try {
            Thread.sleep(PAUSE_MILLIS);
            return EntityUtils.toString(entity, charset);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return "";
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Fetches {@code url} with Jsoup and extracts the charset named in the first
     * {@code <meta>} tag that contains a {@code charset} attribute.
     *
     * @param url address of the page to inspect
     * @return the charset found in the markup; {@code "GB2312"} when a charset attribute
     *         is present but empty; {@code "gb2312"} when no attribute is found or the
     *         page cannot be fetched
     */
    public static String LocateCharSet(String url) {
        try {
            Thread.sleep(PAUSE_MILLIS);
            Connection conn = Jsoup.connect(url);
            // Disguise the request as coming from a browser.
            conn.header("User-Agent", USER_AGENT);
            Document doc = conn.ignoreContentType(true).timeout(10000).get();
            Matcher metaTags = META_TAG.matcher(doc.toString());
            while (metaTags.find()) {
                String tag = metaTags.group();
                Matcher attribute = CHARSET_ATTR.matcher(tag);
                if (!attribute.find()) {
                    continue;
                }
                // Skip the 8 characters of "charset="; a bare "charset" match is too short
                // for that and is treated as an empty value instead of overrunning.
                String match = attribute.group();
                String encoding = match.length() >= 8 ? match.substring(8) : "";
                if (encoding.trim().isEmpty()) {
                    // The value may be double-quoted: charset="utf-8"
                    Matcher quoted = QUOTED_CHARSET_ATTR.matcher(tag);
                    if (quoted.find()) {
                        encoding = quoted.group().substring(9);
                    }
                    if (encoding.trim().isEmpty()) {
                        encoding = "GB2312";
                    }
                }
                return encoding;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
            System.out.println("获取出错编码方式");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("获取出错编码方式");
        }
        return DEFAULT_ENCODING;
    }
}
...@@ -32,6 +32,7 @@ public class SeleniumTime { ...@@ -32,6 +32,7 @@ public class SeleniumTime {
public static String getScopehtml(String url) { public static String getScopehtml(String url) {
String html = ""; String html = "";
try { try {
ReuseWebDriver driver = DriverUtil.getChromeDriver(); ReuseWebDriver driver = DriverUtil.getChromeDriver();
try { try {
Duration duration=Duration.of(100, ChronoUnit.SECONDS); Duration duration=Duration.of(100, ChronoUnit.SECONDS);
......
package com.zzsn.crawler.uriparser;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import com.zzsn.generation.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
/**
 * Fetches the rendered HTML of a page through a locally started Chrome browser.
 *
 * <p>Each instance starts its own {@link ChromeDriver} in the constructor, and
 * {@link #getScopehtml(String)} quits that driver before returning, so an instance can
 * serve a single page request only.
 */
public class SeleniumTime4 {

    /** Pages from this site get the "no permission" attachment notice removed. */
    private static final String FLW_SITE_PREFIX = "http://www.flw.ph";

    /** Opening markup of the attachment notice. */
    private static final String NOTICE_OPEN = "<div class=\"attach_nopermission attach_tips\">";

    /** Close-button markup that sits at the end of the attachment notice. */
    private static final String NOTICE_CLOSE = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";

    /** Number of characters dropped right after {@link #NOTICE_CLOSE}. */
    private static final int NOTICE_TAIL_LENGTH = 7;

    public ChromeOptions chromeOptions =new ChromeOptions() ;
    public ChromeDriver driver;

    /**
     * Starts a Chrome instance using the driver binary configured in
     * {@code Constants.CHROMEDRIVE}, with image loading disabled.
     */
    public SeleniumTime4(){
        System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
        chromeOptions.addArguments("blink-settings=imagesEnabled=false");
        driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Loads {@code url} in the browser and returns the page's HTML.
     *
     * <p>The HTML is first read from the {@code outerHTML} of the root element; if that
     * fails, the driver's page source is used instead. For pages of
     * {@code http://www.flw.ph} the attachment notice is cut out of the result. The
     * driver is quit exactly once, whatever the outcome.
     *
     * @param url address of the page to load
     * @return the page HTML, or {@code null} if the page could not be loaded or read
     */
    public String getScopehtml(String url){
        try {
            driver.get(url);
            WebElement root = driver.findElement(By.xpath("/html"));
            String html;
            try {
                Thread.sleep(3000L);
                html = root.getAttribute("outerHTML");
            } catch (InterruptedException e) {
                throw e;
            } catch (Exception e) {
                System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
                        +"可能原因为过快的执行没有找到指定的页面元素");
                System.out.println("=============执行方法二==============");
                Thread.sleep(3000L);
                html = driver.getPageSource();
            }
            Thread.sleep(5000L);
            // Post-processing never throws, so HTML that was fetched successfully is no
            // longer lost when the notice markup has an unexpected shape.
            return url.contains(FLW_SITE_PREFIX) ? stripPermissionNotice(html) : html;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return null;
        } catch (Exception e) {
            pause(5000L);
            e.printStackTrace();
            return null;
        } finally {
            driver.quit();
        }
    }

    /**
     * Removes the attachment notice from {@code html}.
     *
     * <p>Keeps the text before the first {@link #NOTICE_OPEN}, then continues
     * {@link #NOTICE_TAIL_LENGTH} characters after the first {@link #NOTICE_CLOSE} that
     * follows it. The input is returned unchanged when it is {@code null}, when either
     * marker is absent, or when the text around the markers is too short to cut.
     */
    private static String stripPermissionNotice(String html) {
        if (html == null || !html.contains(NOTICE_OPEN) || !html.contains(NOTICE_CLOSE)) {
            return html;
        }
        String[] aroundOpen = html.split(NOTICE_OPEN);
        if (aroundOpen.length < 2) {
            return html;
        }
        String[] aroundClose = aroundOpen[1].split(NOTICE_CLOSE);
        if (aroundClose.length < 2 || aroundClose[1].length() < NOTICE_TAIL_LENGTH) {
            return html;
        }
        return aroundOpen[0] + aroundClose[1].substring(NOTICE_TAIL_LENGTH);
    }

    /** Sleeps for {@code millis}; restores the interrupt flag if the sleep is interrupted. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Manual check: fetches one forum thread, reports which notice markers are present,
     * removes the notice and writes the result to {@code D:/123.txt}.
     *
     * <p>The notice is removed by string splitting rather than by a regular expression.
     */
    public static void main(String[] args) {
        SeleniumTime4 s = new SeleniumTime4();
        String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
        if (scopehtml == null) {
            System.err.println("No HTML was retrieved; nothing to write.");
            return;
        }

        System.out.println("开始");
        if(scopehtml.contains(NOTICE_OPEN)){
            System.out.println("包含a");
        }
        // This check previously tested the opening marker a second time.
        if(scopehtml.contains(NOTICE_CLOSE)){
            System.out.println("包含b");
        }
        System.out.println("结束");

        String[] split = scopehtml.split(NOTICE_OPEN);
        System.out.println("首次截取的长度"+split.length);
        if (split.length > 1) {
            String[] split2 = split[1].split(NOTICE_CLOSE);
            System.out.println("再次截取的长度"+split2.length);
        }
        // getScopehtml has usually removed the notice already; in that case this is a no-op.
        String sab = stripPermissionNotice(scopehtml);

        File file = new File("D:/123.txt");
        try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
            ps.println(sab);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}
...@@ -89,8 +89,8 @@ public class WebContentPaserByXpath { ...@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
seleniumTime=new SeleniumTime(); // seleniumTime=new SeleniumTime();
body = seleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
TimeUnit.SECONDS.sleep(5); TimeUnit.SECONDS.sleep(5);
seleniumTime.close(); seleniumTime.close();
} else { } else {
......
...@@ -165,7 +165,8 @@ public class JedisUtil { ...@@ -165,7 +165,8 @@ public class JedisUtil {
throw new Exception("key is null"); throw new Exception("key is null");
} }
jedis = getDefaultJedis(); jedis = getDefaultJedis();
value = jedis.get(PREFIX + key); // value = jedis.get(PREFIX + key);
value = jedis.get(key);
}catch (Exception e){ }catch (Exception e){
}finally { }finally {
......
...@@ -62,7 +62,7 @@ public class KafkaConsumerJob { ...@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
// @Async("asyncTaskExecutor") // @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE); // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
......
...@@ -98,6 +98,7 @@ public class ChromeTest { ...@@ -98,6 +98,7 @@ public class ChromeTest {
// 可复用驱动使用Demo // 可复用驱动使用Demo
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
ReuseWebDriver driver = DriverUtil.getChromeDriver(); ReuseWebDriver driver = DriverUtil.getChromeDriver();
if (driver == null) { if (driver == null) {
// 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存 // 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存
......
package com.zzsn.test; package com.zzsn.test;
import com.zzsn.crawler.uriparser.HttpgetUtil;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
...@@ -41,7 +42,9 @@ import java.util.List; ...@@ -41,7 +42,9 @@ import java.util.List;
public class HttpClientTester { public class HttpClientTester {
private static PageBuilderParser builderParser = null; private static PageBuilderParser builderParser = null;
public static void main(String[] args) { public static void main(String[] args) {
get("https://www.cas.cn/zjs/"); // get("https://edition.cnn.com/world");
String html = HttpgetUtil.getHtml("https://edition.cnn.com/world");
System.out.println(html);
// post(); // post();
} }
......
package com.zzsn.test; package com.zzsn.test;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import java.io.IOException; import java.io.IOException;
...@@ -17,21 +18,12 @@ import java.io.InputStream; ...@@ -17,21 +18,12 @@ import java.io.InputStream;
public class WebTest { public class WebTest {
public static void main(String[] args) { public static void main(String[] args) {
// String url="https://www.teriin.org/opinion"; String url="https://www.teriin.org/opinion";
// PageDownloader pageDownload=new PageDownloader(); // PageDownloader pageDownload=new PageDownloader();
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false); // String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// System.out.println(body); // System.out.println(body);
try { PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
Runtime mt = Runtime.getRuntime(); String html = paserSiteDownload.getHtml("https://edition.cnn.com/world", "utf-8");
String cmd = "taskkill /F /im chrome.exe"; System.out.println(html);
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
} }
} }
...@@ -287,15 +287,17 @@ public class ContentUtility { ...@@ -287,15 +287,17 @@ public class ContentUtility {
if(htmlText==null){ if(htmlText==null){
return null; return null;
} }
String text = ContentUtility.HTMLDecode(ContentUtility.RemoveHTMLCode(ContentUtility.RemoveStyleCode(ContentUtility.RemoveHTMLReturnCode(htmlText)))); String text = Utility.HTMLDecode(Utility.RemoveHTMLCode(Utility.RemoveStyleCode(Utility.RemoveHTMLReturnCode(htmlText))));
text = text.replaceAll("   ", "\r\n"); text = text.replaceAll("   ", "\r\n");
text = text.replaceAll(" +\r\n", "\r\n"); text = text.replaceAll(" +\r\n", "\r\n");
text = text.replaceAll(" +", " "); text = text.replaceAll(" +", " ");
text = text.replaceAll("[\\u00A0\\u3000]", ""); text = text.replaceAll("[\\u00A0\\u3000]", "");
text = text.replaceAll(" ", ""); text = text.replaceAll(" ", "");
text = text.replaceAll(" \n", "\n");
text = text.replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n");
return text; return text;
} }
......
...@@ -58,10 +58,10 @@ public class DriverUtil { ...@@ -58,10 +58,10 @@ public class DriverUtil {
} }
public static ReuseWebDriver connectChrome(String sessionId, String serverUrl) throws Exception { public static ReuseWebDriver connectChrome(String sessionId, String serverUrl) throws Exception {
if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) { // if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
log.error("未获取到驱动服务地址、sessionId"); // log.error("未获取到驱动服务地址、sessionId");
return null; // return null;
} // }
ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId); ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId);
if (driver.connectTestFail()) { if (driver.connectTestFail()) {
...@@ -89,10 +89,21 @@ public class DriverUtil { ...@@ -89,10 +89,21 @@ public class DriverUtil {
* @date 2022/7/25 15:07 * @date 2022/7/25 15:07
*/ */
public static ReuseWebDriver getChromeDriver() throws Exception { public static ReuseWebDriver getChromeDriver() throws Exception {
String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE); Map<String, String> map =getSessionInfo();
Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
String sessionId = map.get("sessionId"); String sessionId = map.get("sessionId");
String serverUrl = map.get("serverUrl"); String serverUrl = map.get("serverUrl");
return connectChrome(sessionId, serverUrl); return connectChrome(sessionId, serverUrl);
} }
/**
 * Reads the browser-driver session info stored in Redis under
 * {@code Constants.SELENIUM_DRIVER_CACHE}.
 *
 * <p>When nothing usable is cached (missing or empty map), a placeholder entry is
 * written back to the cache with expiry argument {@code -1} and returned.
 *
 * @return a map holding the {@code sessionId} and {@code serverUrl} entries
 * @throws Exception if reading from or writing to the cache fails
 */
public static Map<String, String> getSessionInfo() throws Exception{
    Map<String, String> cached =
            JSON.parseObject(JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE), Map.class);
    boolean nothingCached = cached == null || cached.size() < 1;
    if (!nothingCached) {
        return cached;
    }

    Map<String, String> placeholder = new HashMap<>(2);
    placeholder.put("sessionId", "sessionId");
    placeholder.put("serverUrl", "https://www.baidu.com/");
    // Cache the browser driver information.
    JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(placeholder), -1);
    return placeholder;
}
} }
...@@ -2,6 +2,8 @@ package com.zzsn.util; ...@@ -2,6 +2,8 @@ package com.zzsn.util;
import com.zzsn.crawler.ReuseWebDriver; import com.zzsn.crawler.ReuseWebDriver;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.InputStreamReader; import java.io.InputStreamReader;
...@@ -13,10 +15,12 @@ import java.util.regex.Pattern; ...@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
*/ */
@Slf4j @Slf4j
@SuppressWarnings("all") @SuppressWarnings("all")
@EnableScheduling
public class WindowsProcess { public class WindowsProcess {
private static Pattern TASK_LIST_PATTERN = Pattern.compile("^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$"); private static Pattern TASK_LIST_PATTERN = Pattern.compile("^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$");
private static String DRIVER_NAME = "chrome.exe"; private static String CHROME_NAME = "chrome.exe";
private static String DRIVER_NAME = "chromedriver.exe";
public static void main(String[] args) { public static void main(String[] args) {
WindowsProcess process = new WindowsProcess(); WindowsProcess process = new WindowsProcess();
...@@ -28,6 +32,7 @@ public class WindowsProcess { ...@@ -28,6 +32,7 @@ public class WindowsProcess {
* @author andylau * @author andylau
* @date 2022/7/26 11:23 * @date 2022/7/26 11:23
*/ */
// @Scheduled(cron = "0 0 1 * * ?")
private void killProcess() { private void killProcess() {
try { try {
String line; String line;
...@@ -35,14 +40,14 @@ public class WindowsProcess { ...@@ -35,14 +40,14 @@ public class WindowsProcess {
BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream())); BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream()));
while ((line = input.readLine()) != null) { while ((line = input.readLine()) != null) {
if (line.contains(DRIVER_NAME)) { if (line.contains(CHROME_NAME)|| line.contains(DRIVER_NAME) ) {
Matcher matcher = TASK_LIST_PATTERN.matcher(line); Matcher matcher = TASK_LIST_PATTERN.matcher(line);
if (matcher.find()) { if (matcher.find()) {
// String serviceName = matcher.group(1); // String serviceName = matcher.group(1);
String pid = matcher.group(2); String pid = matcher.group(2);
// String sessionName = matcher.group(3); // String sessionName = matcher.group(3);
// String size = matcher.group(4).replace(",", "") + "K"; // String size = matcher.group(4).replace(",", "") + "K";
// log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size); // log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
Runtime.getRuntime().exec("taskkill /pid " + pid); Runtime.getRuntime().exec("taskkill /pid " + pid);
} }
} }
...@@ -51,11 +56,11 @@ public class WindowsProcess { ...@@ -51,11 +56,11 @@ public class WindowsProcess {
log.error("浏览器驱动关闭异常..."); log.error("浏览器驱动关闭异常...");
} finally { } finally {
// 定时任务关闭驱动后,重新打开驱动 // 定时任务关闭驱动后,重新打开驱动
try { // try {
reopenChromeDriver(); // reopenChromeDriver();
} catch (Exception e) { // } catch (Exception e) {
log.error("驱动打开异常..."); // log.error("驱动打开异常...");
} // }
} }
} }
......
...@@ -5,9 +5,9 @@ spring.profiles.active:=dev ...@@ -5,9 +5,9 @@ spring.profiles.active:=dev
server.port=8081 server.port=8081
spring.http.encoding.force=true #spring.http.encoding.force=true
spring.http.encoding.charset=UTF-8 #spring.http.encoding.charset=UTF-8
spring.http.encoding.enabled=true #spring.http.encoding.enabled=true
spring.thymeleaf.cache=false spring.thymeleaf.cache=false
spring.thymeleaf.enabled=false spring.thymeleaf.enabled=false
...@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000 ...@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000
logging.level.root=info logging.level.root=info
logging.level.org.springframework.web=info logging.level.org.springframework.web=info
logginglevelorghibernate=info logging.level.org.hibernate=info
logging.config=classpath:logback-spring.xml logging.config=classpath:logback-spring.xml
kafka.consumer.task=0 0/2 * * * ? kafka.consumer.task=0 0/2 * * * ?
......
...@@ -35,7 +35,7 @@ PROXYID=1 ...@@ -35,7 +35,7 @@ PROXYID=1
#线程池大小 #线程池大小
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= D:\\chrome\\chromedriver.exe CHROMEDRIVE= E:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
...@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0 ...@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
redis.host=114.116.26.150 redis.host=114.116.26.150
redis.port=6379 redis.port=6379
redis.pass=zzsn9988 redis.pass=zzsn9988
#redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
...@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn ...@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
HUAWEICLOUD_AK= VEHN7D0TJ9316H8AHCAV HUAWEICLOUD_AK= VEHN7D0TJ9316H8AHCAV
HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
IMGPATH= E:\\chrome\\img\\shot.png #IMGPATH= E:\\chrome\\img\\shot.png
IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\aa.txt
selenium.driver.cache=comm_selenium_driver_cache_1
......
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=114.115.236.206
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=clbzzsn
redis.timeout=10000 redis.timeout=10000
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
#redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论