提交 f314a48b 作者: liuweigang

采集代码更新

上级 076c1840
1.八万多个信息源中,无效信息源占比大,占用很大一部分资源;
2.动态、静态参数设置不准确,导致调用的采集器可能采集不到信息;
3.爬虫程序不完善:采用selenium驱动操作浏览器,发生异常时没有关闭相应的资源,导致资源占用越来越多,最终引起爬虫中断。
1.爬虫向上反馈,根据采集情况,向上游反馈该条信息源是否有效;若无效则关闭该条信息源;减少无效调度,降低资源浪费;
2.向上反馈,动态、静态参数设置是否正确;根据实际情况重新设置参数;这样不用爬虫每次都要采取两种方式采集。
3.修改爬虫业务逻辑,抛出异常时,关闭相应的资源,释放服务器资源,防止服务器资源占用过多,导致程序异常。
4.降低爬虫程序内部相应的等待时间,加快效率。
......@@ -8,7 +8,6 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
......
......@@ -71,14 +71,13 @@ public class DynaminSiteThread implements Runnable{
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
String infoSourceId=siteMsgTemple.getId();
String infoSourceId=siteMsgTemple.getId();//获取信息源id
//默认表达式类型
siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
//判断列表解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
......
package com.zzsn.crawler;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.CreateSSLClientDefault;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.util.*;
import com.zzsn.web.ExtType;
import com.zzsn.web.JsoupTagProcessor;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.httpclient.params.HttpMethodParams;
......@@ -42,11 +32,9 @@ import org.jsoup.Jsoup;
//import org.jsoup.helper.W3CDom;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.kafka.core.KafkaTemplate;
import javax.net.ssl.SSLContext;
import java.io.*;
......
......@@ -4,17 +4,12 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.db.DBManager;
import com.zzsn.crawler.db.SnowIdUtils;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.*;
......@@ -28,11 +23,8 @@ import org.springframework.kafka.core.KafkaTemplate;
import java.net.URI;
import java.net.URL;
import java.sql.SQLException;
import java.sql.Types;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -73,30 +65,37 @@ public class WebContentPaserByCss {
log.info(e.getMessage());
}
if (StringUtils.isEmpty(body)) {//为空时调用
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) {
try {
body = paserSiteDownload.getHtml(uri_code, charset);
} catch (Exception e) {
log.info("静态请求失败:"+uri_code);
}
}
}
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
body = SeleniumTime.getScopehtml(uri_code);
}
TimeUnit.SECONDS.sleep(2);
}
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
sentBadSiteMsg(siteMsgTemple,"动态请求异常","0");
}else{
sentBadSiteMsg(siteMsgTemple,"静态网络请求异常","0");
}
if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body);
//抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
// body = SeleniumTime.getScopehtml(uri_code);
// doc = Jsoup.parse(body);
// catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
// catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
body = SeleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
}
}
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
......@@ -117,7 +116,33 @@ public class WebContentPaserByCss {
return catchWebByMetaSearchList;
}
/**
 * Reports a problematic information source upstream by serializing a {@link BadSiteMsg}
 * to JSON and sending it to the {@code badSiteTopic} Kafka topic.
 *
 * <p>Reporting is best-effort: a failure while building, serializing or handing the message
 * to Kafka is logged and then suppressed, so it never interrupts the crawl that triggered it.
 *
 * @param siteMsgTemple source whose id, source code, names, URI and dynamic-crawl flag are
 *                      copied into the report
 * @param msg           description of the problem; stored as the report's error type
 * @param problemType   problem category code, forwarded unchanged.
 *                      NOTE(review): the previous comment documented 1 = source abnormal and
 *                      2 = crawl-type setting abnormal, while callers in this file pass "0"
 *                      and "1" — confirm the mapping with the consumer of the topic.
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
    try {
        BadSiteMsg badSiteMsg = new BadSiteMsg();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Crawler type is "1" only when the source is flagged for dynamic crawling, otherwise "0".
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() == 1 ? "1" : "0";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    }catch (Exception e){
        // Keep the report best-effort, but leave a trace instead of hiding the failure.
        log.warn("信息源问题反馈发送失败:{}", msg, e);
    }
}
//提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByCss(SiteMsgTemple siteMsgTemple,Document doc)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
......@@ -244,10 +269,6 @@ public class WebContentPaserByCss {
if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}else{
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
// if(StringUtils.isEmpty(content)){
// content = paserSiteDownload.getContent(cwbm);
// }
try {
content = paserSiteDownload.getContent(cwbm);
}catch (Exception e){
......@@ -265,17 +286,7 @@ public class WebContentPaserByCss {
}
}
}catch (Exception e) {
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if (StringUtils.isEmpty(content)) {
content = paserSiteDownload.getContent(cwbm);
}
}
}
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
}
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
......@@ -304,9 +315,11 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else {
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
}
}catch (Exception e){
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
}
......
......@@ -6,7 +6,6 @@ import com.jayway.jsonpath.JsonPath;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch;
......@@ -20,17 +19,11 @@ import com.zzsn.util.DateUtil;
import com.zzsn.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......
......@@ -53,8 +53,6 @@ import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......
......@@ -4,25 +4,17 @@ package com.zzsn.crawler.uriparser;
import java.awt.*;
import java.awt.event.KeyEvent;
import java.io.*;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.List;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zzsn.crawler.ChromeDriverPool;
import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.interactions.Actions;
import org.springframework.scheduling.annotation.Async;
@Slf4j
public class SeleniumTime {
......@@ -72,69 +64,115 @@ public class SeleniumTime {
// @Async("asyncTaskExecutorSelenium")
public static String getScopehtml(String url){
ChromeOptions chromeOptions = new ChromeOptions();
ChromeDriver driver;
ChromeDriverService service;
service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try {
service.start();
} catch (Exception e) {
service.stop();
return "";
// e.printStackTrace();
}
String html = "";
ChromeOptions chromeOptions = new ChromeOptions();
ChromeDriver driver;
ChromeDriverService service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try {
service.start();
if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
}
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(service, chromeOptions);//生成实例
String html = "";
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(chromeOptions);//生成实例
try {
driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS);
Duration duration=Duration.of(60, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url);
Thread.sleep(1000l);
try {
// byte[] screenshotAs = driver.getScreenshotAs(OutputType.BYTES);
// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
// SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
System.out.println("browser will be close");
WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML");
System.out.println("browser will be close");
} catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage());
try {
Thread.sleep(1000l);
driver.quit();
service.stop();
Thread.sleep(1000l);
} catch (InterruptedException e2) {
service.stop();
}
}
} catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage());
} finally {
try {
Thread.sleep(1000l);
driver.quit();
service.stop();
Thread.sleep(1000l);
Thread.sleep(3000l);
} catch (InterruptedException e) {
}
}
} catch (Exception e) {
return "";
}
return html;
}
// public static String getScopehtml(String url){
//
// ChromeOptions chromeOptions = new ChromeOptions();
// ChromeDriver driver;
// ChromeDriverService service;
// service = new ChromeDriverService.Builder().
// usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
// try {
// service.start();
// } catch (Exception e) {
// service.stop();
// return "";
//// e.printStackTrace();
// }
// if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
// }
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
// driver = new ChromeDriver(service, chromeOptions);//生成实例
// String html = "";
// try {
// driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS);
// driver.get(url);
// Thread.sleep(1000l);
// try {
//// byte[] screenshotAs = driver.getScreenshotAs(OutputType.BYTES);
//// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
//// SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
//// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
//// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
//
// System.out.println("browser will be close");
// WebElement webElement = driver.findElement(By.xpath("/html"));
// html = webElement.getAttribute("outerHTML");
// } catch (Exception e) {
// log.info("chromedriver 出现异常:" + e.getMessage());
// try {
// Thread.sleep(1000l);
// driver.quit();
// service.stop();
// Thread.sleep(1000l);
// } catch (InterruptedException e2) {
// service.stop();
// }
// }
// } catch (Exception e) {
// log.info("chromedriver 出现异常:" + e.getMessage());
// } finally {
// try {
// Thread.sleep(1000l);
// driver.quit();
// service.stop();
// Thread.sleep(1000l);
// } catch (InterruptedException e) {
//
// }
// }
//
// return html;
// }
public static InputStream getScreenshot(String url){
ChromeOptions chromeOptions =new ChromeOptions() ;
ChromeDriver driver;
......
......@@ -2,115 +2,131 @@ package com.zzsn.crawler.uriparser;
import java.io.*;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.TimeUnit;
import com.zzsn.generation.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions;
@Slf4j
public class SeleniumTime2 {
public ChromeOptions chromeOptions =new ChromeOptions() ;
public ChromeDriver driver;
public SeleniumTime2(){
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// chromeOptions.addArguments("blink-settings=imagesEnabled=false");
// chromeOptions.addArguments("user-data-dir=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default");
// chromeOptions.addArguments("user-data-dir="+Constants.USER_DATA_DIR);
// chromeOptions.addArguments("--headless");
driver = new ChromeDriver(chromeOptions);
}
public String getChromeDoc(String url) {
// ChromeOptions chromeOptions =new ChromeOptions() ;
// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// ChromeDriver driver = new ChromeDriver(chromeOptions);
String doc="";
try {
Thread.sleep(3000l);
driver.get(url);
Thread.sleep(3000l);
doc = driver.getPageSource();
} catch (Exception e) {
try {
Runtime.getRuntime().exec("taskkill /F /im " + "chromedriver.exe");
} catch (IOException e2) {
e2.printStackTrace();
}
return null;
}finally {
driver.quit();
}
return doc;
}
/**
* 根据网址获取网页html信息
* @param url
* @return
*/
public String getScopehtml(String url){
// @Async("asyncTaskExecutorSelenium")
/**
 * Loads {@code url} in a Chrome browser and returns the markup of the loaded document.
 *
 * <p>A dedicated chromedriver service is started on a free port for this call and the browser
 * is attached to that service. The browser and the service are released independently in the
 * {@code finally} block, so a failure while quitting the browser can neither leave the service
 * running nor discard HTML that was already read.
 *
 * @param url address of the page to load
 * @return the {@code outerHTML} of the {@code /html} element, or an empty string when the
 *         service or browser could not be started or the page could not be read
 */
public static String getScopehtml(String url){
    String html = "";
    ChromeOptions chromeOptions = new ChromeOptions();
    ChromeDriver driver = null;
    ChromeDriverService service = new ChromeDriverService.Builder().
            usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
    try {
        service.start();
        if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
            chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
            chromeOptions.addArguments("headless");// run without a visible window
            chromeOptions.addArguments("no-sandbox");// disable the Chrome sandbox
        }
        // Attach the browser to the service started above instead of leaving that service idle.
        driver = new ChromeDriver(service, chromeOptions);
        Duration duration = Duration.of(60, ChronoUnit.SECONDS);
        driver.manage().timeouts().pageLoadTimeout(duration);
        driver.get(url);
        Thread.sleep(1000L);
        WebElement webElement = driver.findElement(By.xpath("/html"));
        html = webElement.getAttribute("outerHTML");
        System.out.println("browser will be close");
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the calling thread can still observe the interruption.
        Thread.currentThread().interrupt();
        log.info("chromedriver 出现异常:" + e.getMessage());
    } catch (Exception e) {
        log.info("chromedriver 出现异常:" + e.getMessage());
    } finally {
        boolean browserStarted = driver != null;
        if (browserStarted) {
            try {
                driver.quit();
            } catch (Exception e) {
                log.info("chromedriver 出现异常:" + e.getMessage());
            }
        }
        try {
            service.stop();
        } catch (Exception e) {
            log.info("chromedriver 出现异常:" + e.getMessage());
        }
        if (browserStarted) {
            try {
                // Same pause the original applied after shutting the browser down.
                Thread.sleep(3000L);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
    return html;
}
public static InputStream getScreenshot(String url){
ChromeOptions chromeOptions =new ChromeOptions() ;
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
ChromeDriver driver = new ChromeDriver(chromeOptions);
ChromeDriver driver;
ChromeDriverService service;
service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try {
service.start();
} catch (Exception e) {
e.printStackTrace();
}
if(!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
}
driver = new ChromeDriver(service, chromeOptions);//生成实例
InputStream inStream = null;
try{
driver.manage ().timeouts().pageLoadTimeout (100 , TimeUnit.SECONDS ) ;
driver.get(url);
Thread.sleep(2000l);
WebElement webElement = driver.findElement(By.xpath("/html"));
try{
String html = webElement.getAttribute("outerHTML");
Thread.sleep(500l);
return html;
Thread.sleep(3000l);
try {
byte[] screenshotBytes = driver.getScreenshotAs(OutputType.BYTES);
inStream = new ByteArrayInputStream(screenshotBytes);
// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
// SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
}catch(Exception e){
System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
+"可能原因为过快的执行没有找到指定的页面元素");
System.out.println("=============执行方法二==============");
Thread.sleep(1000l);
String html = driver.getPageSource();
Thread.sleep(2000l);
driver.quit();
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
log.info("chromedriver 出现异常:"+e.getMessage());
}finally {
}
}catch(Exception e){
try {
Thread.sleep(5000l);
} catch (InterruptedException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
driver.quit();
e.printStackTrace();
log.info("chromedriver 出现异常:"+e.getMessage());
}finally {
try {
Thread.sleep(2000l);
driver.quit();
service.stop();
} catch (InterruptedException e) {
e.printStackTrace();
}
driver.quit();
}
return null;
return inStream;
}
public void close(){
// driver.close();
// driver.quit();
// service.stop();
}
public static void main(String[] args) {
......@@ -121,12 +137,57 @@ public class SeleniumTime2 {
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/
SeleniumTime2 s = new SeleniumTime2();
String scopehtml = s.getScopehtml("https://www.baidu.com/");
System.out.println(scopehtml);
SeleniumTime s = new SeleniumTime();
String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
System.out.println("开始");
if(scopehtml.contains(a)){
System.out.println("包含a");
}
if(scopehtml.contains(a)){
System.out.println("包含b");
}
System.out.println("结束");
String[] split = scopehtml.split(a);
String sa = split[0];
System.out.println("首次截取的长度"+split.length);
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
System.out.println("再次截取的长度"+split2.length);
String sab = sa + substring ;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File file = new File("D:/123.txt");
try {
PrintStream ps = new PrintStream(new FileOutputStream(file));
ps.println(sab);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
......@@ -135,4 +196,26 @@ public class SeleniumTime2 {
}
}
package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.db.DBManager;
import com.zzsn.crawler.db.SnowIdUtils;
......@@ -13,7 +10,6 @@ import com.zzsn.crawlerOther.StandardWebExtractorHandler;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants;
......@@ -28,7 +24,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import java.net.URI;
import java.net.URL;
......
......@@ -2,7 +2,6 @@ package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser;
......
......@@ -394,7 +394,6 @@ public class PageConnectioner {
*/
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 3000;
long startDownTime = System.currentTimeMillis();
PageGet pg = null;
try {
......
......@@ -5,6 +5,7 @@ import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.jayway.jsonpath.JsonPath;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import lombok.extern.slf4j.Slf4j;
import org.w3c.dom.Document;
import javax.net.ssl.HttpsURLConnection;
......@@ -18,6 +19,7 @@ import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
@Slf4j
public class PageDownloader {
private int interval = 5000;
private long lastDownloadTime = -1;
......@@ -154,9 +156,7 @@ public class PageDownloader {
public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval)
{
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
if (interval > 0 && lastDownloadTime > 0 && dis < interval){
new PageDownloader(dis+2000);
}
long startDtime = System.currentTimeMillis();
......@@ -164,13 +164,14 @@ public class PageDownloader {
HttpURLConnection connection = null;
try {
connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) {
if (encoding == null || encoding.isEmpty()) {//获取网站编码
// encoding = getEncodingFromHtmlFile(url, connection);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
encoding = paserSiteDownload.locateCharSet(url);
}
} catch (Exception e1) {
// e1.printStackTrace();
log.info("获取编码失败");
}
String docBody = null;
if (bDynamic) {
......
package com.zzsn.entity;
import lombok.Data;
/**
 * Report describing a problem found on an information source.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class BadSiteMsg {
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Information source name. */
private String webSiteName;
/** Column (section) name. */
private String siteName;
/** Column (section) address. */
private String siteUri;
/** Kind of problem that was found. */
private String errorType;
/** Problem type (1: information source abnormal, 2: crawl category setting abnormal). */
private String problemType;
/** Crawler type (0: static crawl, 1: dynamic crawl). */
private String crawlerType;
}
......@@ -109,7 +109,7 @@ public class KafkaConsumerJob {
}
@Scheduled(cron = "0 0/58 * * * ?")
// @Scheduled(cron = "0 0/30 * * * ?")
@Async("asyncTaskExecutor")
public void runtimeTask (){
try {
......@@ -118,19 +118,38 @@ public class KafkaConsumerJob {
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
// ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
// try {
// Runtime mt = Runtime.getRuntime();
// String cmd = "taskkill /F /im chrome.exe";
// Process pro = mt.exec(cmd);
// InputStream ers= pro.getErrorStream();
// pro.waitFor();
// } catch (IOException ioe) {
// ioe.printStackTrace();
// } catch (InterruptedException e) {
// // TODO Auto-generated catch block
// }
}
// @Scheduled(cron = "0 0/25 * * * ?")
@Async("asyncTaskExecutor")
public void runtimeTask2 (){
try {
Runtime mt = Runtime.getRuntime();
String cmd = "taskkill /F /im chrome.exe";
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chrome.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
// ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
......
package com.zzsn.test;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
public class UrlConnecttest {
......
......@@ -2,6 +2,9 @@ package com.zzsn.test;
import com.zzsn.download.PageDownloader;
import java.io.IOException;
import java.io.InputStream;
/**
* 网站请求测试,
* 在不调用浏览器情况下获取请求访问的网站内容
......@@ -14,10 +17,21 @@ import com.zzsn.download.PageDownloader;
public class WebTest {
public static void main(String[] args) {
String url="https://www.teriin.org/opinion";
PageDownloader pageDownload=new PageDownloader();
String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
System.out.println(body);
// String url="https://www.teriin.org/opinion";
// PageDownloader pageDownload=new PageDownloader();
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// System.out.println(body);
try {
Runtime mt = Runtime.getRuntime();
String cmd = "taskkill /F /im chrome.exe";
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
}
}
......@@ -261,13 +261,13 @@ public class DetailGoogleSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
// intsertData(docInfo);
intsertData(docInfo);
//信息转换
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(processitem);
System.out.println(docjson);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
// ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
// ObjectMapper mapper = new ObjectMapper();
// String docjson = mapper.writeValueAsString(processitem);
// System.out.println(docjson);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
}else {
log.info("资讯发布时间:"+docInfo.getPublishDate());
}
......
......@@ -161,28 +161,49 @@ public class GoogleRecorderUtil {
System.out.println("列表页内容"+docstr.length());
System.out.println("关键词请求:"+keyword+"第"+i+"页");
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
// Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
Elements firstElementsLink = doc.select("div[class=\"xuvV6b BGxR7d\"]");
//若果没有结果则不循环
if(firstElementsLink.size()==0){
break;
}
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
//标题
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//链接
Elements a=firstElementsLink.get(j).select("a");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//来源
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
try {
catchWebByMetaSearch = new CatchWebByMetaSearch();
//标题
Elements e = firstElementsLink.get(j).select("div[class=\"iRPxbe\"]");
//链接
Elements a = firstElementsLink.get(j).select("a");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//来源
String origin = firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
// //标题
// Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
// //链接
// Elements a=firstElementsLink.get(j).select("a");
// System.out.println(e.get(0).text());
// System.out.println(a.get(0).attr("href"));
// catchWebByMetaSearch.setTid(tid);
// catchWebByMetaSearch.setSid(tid);
// catchWebByMetaSearch.setSummary(urlList.get(i));
// catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
// catchWebByMetaSearch.setTitle(e.get(0).text());
// //来源
// String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
// catchWebByMetaSearch.setSourcesite(origin);
// metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){
System.out.println(e.getMessage());
}
}
DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread();
......
......@@ -25,8 +25,8 @@ public class WebGoogleSearch {
// String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2021-01-01";
String endTime="2022-05-23";
String startTime="2021-09-01";
String endTime="2022-07-01";
startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime);
File f = new File(filepath);
......@@ -60,7 +60,7 @@ public class WebGoogleSearch {
webGoogleSearchThread.setStartTime(startTime);
webGoogleSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2020052301");
keywordMsg.setId("2020070101");
keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime));
......
package com.zzsn.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
......@@ -11,11 +14,16 @@ public class TimePaser {
// String aa="2022-04-18";
// String s = dateToStamp(aa);
// System.out.println(s);
Date date = new Date();
String nowTime="";
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
// String format = simpleDateFormat.format("1650384000");
System.out.println(stampToTime("1650384000"));
// Date date = new Date();
// String nowTime="";
// SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
//// String format = simpleDateFormat.format("1650384000");
// System.out.println(stampToTime("1650384000"));
String realUrl="<html><head><meta name=\"referrer\" content=\"unsafe-url\"><script>window.opener=null;window.location.replace(\"https://www.ky3.com/prnewswire/2022/07/01/infinite-reality-launches-global-metaverse-hub-luxembourg/\");</script><noscript><META http-equiv=\"refresh\" content=\"0;URL='https://www.ky3.com/prnewswire/2022/07/01/infinite-reality-launches-global-metaverse-hub-luxembourg/'\"></noscript></head></html>";
Document parse = Jsoup.parse(realUrl);
String attr = parse.select("META").get(1).attr("content");
String attrurl=attr.substring(attr.indexOf("URL='")+5,attr.length()-2);
System.out.println(attrurl);
}
public static String dateToStamp(String s) throws ParseException {
String res;
......
......@@ -44,7 +44,7 @@ META_SEARCH_URL=https://www.google.com/search?q=[keyword]&newwindow=1&tbs=cdr:1,
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak2.txt
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\google_crawler\\data\\project.txt
# Redis settings
redis.host=127.0.0.1
......
......@@ -25,8 +25,8 @@ public class WebYahooSearch {
// String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2018-04-18";
String endTime="2019-04-18";
String startTime="2021-09-01";
String endTime="2022-07-01";
startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime);
File f = new File(filepath);
......@@ -61,7 +61,7 @@ public class WebYahooSearch {
webYahooSearchThread.setStartTime(startTime);
webYahooSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("123456");
keywordMsg.setId("2022070304");
keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime));
......
......@@ -2,6 +2,7 @@ package com.zzsn.search.yaooThread;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.paser.SeleniumTime;
import com.zzsn.search.util.GetCookies;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
......@@ -36,7 +37,7 @@ public class YahooRecorderUtil {
// 提取新闻列表URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfYahooList(
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate) {
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate,String kWord) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
......@@ -68,41 +69,160 @@ public class YahooRecorderUtil {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
CatchWebByMetaSearch catchWebByMetaSearch = null;
for (int m=0;m<firstElementsLink.size();m++) {
catchWebByMetaSearch = new CatchWebByMetaSearch();
String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text();
//发布时间
String publishDate = DateUtil.getPublishDate(orainAndDatestr);
catchWebByMetaSearch.setPublishDate(publishDate);
//来源
String orin = firstElementsLink.get(m).select("span.s-source").text().trim();
catchWebByMetaSearch.setSourcesite(orin);
Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb");
if (titleAndUrl.size()>0) {
//标题
String title = titleAndUrl.get(0).attr("title");
try {
catchWebByMetaSearch = new CatchWebByMetaSearch();
String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text();
//发布时间
String publishDate = DateUtil.getPublishDate(orainAndDatestr);
catchWebByMetaSearch.setPublishDate(publishDate);
//来源
String orin = firstElementsLink.get(m).select("span.s-source").text().trim();
catchWebByMetaSearch.setSourcesite(orin);
// Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb");
Elements titleAndUrl = firstElementsLink.get(m).select("a[referrerpolicy=\"origin\"]");
String title = titleAndUrl.get(0).text();
catchWebByMetaSearch.setTitle(title);
//源网址
Element element = titleAndUrl.get(0);
element.setBaseUri(uri_code);
String addressurl = titleAndUrl.get(0).absUrl("href");
String realUrl = sendGet(addressurl);
catchWebByMetaSearch.setSourceaddress(realUrl);
Document parse = Jsoup.parse(realUrl);
String attr = parse.select("META").get(1).attr("content");
String attrurl = attr.substring(attr.indexOf("URL='") + 5, attr.length() - 2);
catchWebByMetaSearch.setSourceaddress(attrurl);
// if (titleAndUrl.size()>0) {
// //标题
// String title = titleAndUrl.get(0).attr("title");
// catchWebByMetaSearch.setTitle(title);
// //源网址
// Element element = titleAndUrl.get(0);
// element.setBaseUri(uri_code);
// String addressurl = titleAndUrl.get(0).absUrl("href");
// String realUrl = sendGet(addressurl);
// catchWebByMetaSearch.setSourceaddress(realUrl);
//
// }
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){
}
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
}
for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
ObjectMapper mapper = new ObjectMapper();
WebYahooSearchThread webYahooSearch =new WebYahooSearchThread();
webYahooSearch.CatchWebNews(metaSearchList,kWord);
// for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
// ObjectMapper mapper = new ObjectMapper();
// try {
// String docjson = mapper.writeValueAsString(catchMetaSearch);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
// log.info("发送到kafka成功。");
// }catch (Exception e){
// log.info(e.getMessage());
// }
// }
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
// 提取新闻列表URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfWebList(
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate,String kWord) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
URL url = new URL(urlList.get(i));
URI uri = null;
String uri_code = "";
try {
uri = new URI(url.getProtocol(), url.getHost(),
url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+");
} catch (URISyntaxException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Connection conn = Jsoup.connect(uri_code);
conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50");
Document doc = null;
try {
doc = conn.timeout(10000).get();
} catch (Exception ex) {
// ex.printStackTrace();
System.out.println("Yahoo搜索中该关键词搜索没有相关新闻!");
// continue;
SeleniumTime seleniumTime=new SeleniumTime();
String docstr=seleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(docstr);
}
System.out.println("----Yahoo搜索----" + uri);
Elements firstElementsLink = doc.select("div[class=\"gsc-webResult gsc-result\"]");
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
if (firstElementsLink.size()<1){
SeleniumTime seleniumTime=new SeleniumTime();
String docstr=seleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(docstr);
firstElementsLink = doc.select("div[class=\"gsc-webResult gsc-result\"]");
}
CatchWebByMetaSearch catchWebByMetaSearch = null;
for (int m=0;m<firstElementsLink.size();m++) {
try {
String docjson = mapper.writeValueAsString(catchMetaSearch);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
log.info("发送到kafka成功。");
catchWebByMetaSearch = new CatchWebByMetaSearch();
// String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text();
// //发布时间
// String publishDate = DateUtil.getPublishDate(orainAndDatestr);
// catchWebByMetaSearch.setPublishDate(publishDate);
//来源
String orin = firstElementsLink.get(m).select("div[dir=\"ltr\"]").text().trim();
catchWebByMetaSearch.setSourcesite(orin);
// Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb");
Elements titleAndUrl = firstElementsLink.get(m).select("a[class=\"gs-title\"]");
String title = titleAndUrl.get(0).text();
catchWebByMetaSearch.setTitle(title);
Element element = titleAndUrl.get(0);
element.setBaseUri(uri_code);
String addressurl = titleAndUrl.get(0).absUrl("href");
catchWebByMetaSearch.setSourceaddress(addressurl);
// if (titleAndUrl.size()>0) {
// //标题
// String title = titleAndUrl.get(0).attr("title");
// catchWebByMetaSearch.setTitle(title);
// //源网址
// Element element = titleAndUrl.get(0);
// element.setBaseUri(uri_code);
// String addressurl = titleAndUrl.get(0).absUrl("href");
// String realUrl = sendGet(addressurl);
// catchWebByMetaSearch.setSourceaddress(realUrl);
//
// }
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){
log.info(e.getMessage());
}
}
WebYahooSearchThread webYahooSearch =new WebYahooSearchThread();
webYahooSearch.CatchWebNews(metaSearchList,kWord);
// for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
// ObjectMapper mapper = new ObjectMapper();
// try {
// String docjson = mapper.writeValueAsString(catchMetaSearch);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
// log.info("发送到kafka成功。");
// }catch (Exception e){
// log.info(e.getMessage());
// }
// }
catchWebByMetaSearchList.addAll(metaSearchList);
}
......
......@@ -43,7 +43,7 @@ META_SEARCH_URL=https://news.search.yahoo.com/search?p=[keyword]&ei=UTF-8&b=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak.txt
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\yahoo\\data\\project.txt
# Redis settings
redis.host=127.0.0.1
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论