Commit cc9aa52f Author: 张文库

Update

Parent f314a48b
......@@ -29,7 +29,11 @@ public class SiteInfoVerify{
List<String> urlList=getPageListUrl(siteMsgTemple);
String charset="utf-8";
if(siteMsgTemple.getYnDynamicCrawl()!=1){
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
// ignore; keep the "utf-8" default set above
}
}
......@@ -82,7 +86,11 @@ public class SiteInfoVerify{
PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
// give up; keep the current charset value
}
}
//determine the list-expression parser type
if(siteMsgTemple.getListExpressionType().equals("3")) {//CSS expression
......@@ -165,7 +173,7 @@ public class SiteInfoVerify{
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
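
The charset hunks above (and the matching ones in DynaminSiteThread and SiteThread below) repeat the same two-stage detection: try locateCharSet first, fall back to getCharSet. A minimal sketch of a helper that would centralize the pattern (the helper name and the "utf-8" default are assumptions, not part of this commit):

// Hypothetical helper; the PaserSiteDownload methods are the ones shown in this diff.
private String resolveCharset(PaserSiteDownload downloader, String url) {
    try {
        return downloader.locateCharSet(url);  // Jsoup-based detection
    } catch (Exception e) {
        try {
            return downloader.getCharSet(url);  // HttpClient-based fallback
        } catch (IOException ex) {
            return "utf-8";  // last-resort default, as in SiteInfoVerify
        }
    }
}
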
......@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
crawler();
}
@Async("asyncexecutorService")
// @Async("asyncexecutorService")
public void crawler(){
//collect the column URL and its pagination URLs
......@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
String charset = "";
try {
charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){
} catch (Exception e) {
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
// charset stays "" if both detection paths fail
}
}
//fetch list URLs etc., filtered by URL matching
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
......@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
}
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//extract article content
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//determine the detail-expression parser type
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//CSS expression
......@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -406,11 +406,14 @@ public class PaserSiteDownload {
return HttpClients.createDefault();
}
public static String getCharSet(String url) {
String html="";
public static String getCharSet(String url) throws IOException {
String html = "";
HttpResponse httprespse = null;
HttpEntity entitydata = null;
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
try {
// Thread.sleep(500L);
HttpGet httpgeturl = new HttpGet(url);// GET request
httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
httpgeturl.getParams().setParameter(
......@@ -422,31 +425,22 @@ public class PaserSiteDownload {
httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
//httpgeturl.setHeader("Accept-Language", "en");
//httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
HttpResponse httprespse=null;
try {
Thread.sleep(500L);
httprespse = httpClient.execute(httpgeturl);
entitydata = httprespse.getEntity();// read the response entity
httpgeturl.releaseConnection();
} catch (Exception e2) {
// TODO Auto-generated catch block
// e2.printStackTrace();
log.info("请求访问失败!");
return "utf-8";
} // send the request
HttpEntity entitydata = httprespse.getEntity();// read the response entity
Header lastModify = httprespse.getFirstHeader("Last-Modified");
} finally {
httpClient.close();
}
String charset="utf-8";
String infodata="";
try {
Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset);
} catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
httpgeturl.releaseConnection();
Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE);
......@@ -465,27 +459,24 @@ public class PaserSiteDownload {
charset = m3.group().substring(9);
}
if (charset.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName);
// if(encoding == null){
charset = "gbk";
// }
}
}
return charset;
}
}
return charset;
}
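
The substring(9) extraction above relies on a fixed-width prefix in the matched group. A hedged, offset-free alternative using a capture group (a sketch, not what the commit does; Pattern and Matcher are assumed already imported in this class):

// Hypothetical capture-group variant of the meta-charset extraction.
Pattern metaCharset = Pattern.compile("charset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);
Matcher m = metaCharset.matcher(infodata);
String detected = m.find() ? m.group(1) : "gbk";  // "gbk" mirrors the fallback above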
public static String getHtml(String url,String charset) {
String html="";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
HttpGet httpgeturl = new HttpGet(url);// Get请求
httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
httpgeturl.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000);
HttpMethodParams.SO_TIMEOUT, 20000);
// masquerade as a browser
httpgeturl.setHeader("Content-Type",
"application/x-www-form-urlencoded;charset=utf-8");
......@@ -499,16 +490,14 @@ public class PaserSiteDownload {
httprespse = httpClient.execute(httpgeturl);
} catch (Exception e2) {
httpgeturl.releaseConnection();
// TODO Auto-generated catch block
// e2.printStackTrace();
return "";
} // 发送请求
HttpEntity entitydata = httprespse.getEntity();// 获取返回数据
Header lastModify = httprespse
.getFirstHeader("Last-Modified");
if (lastModify == null) {
lastModify = httprespse.getLastHeader("Last-Modified");
}
// Header lastModify = httprespse
// .getFirstHeader("Last-Modified");
// if (lastModify == null) {
// lastModify = httprespse.getLastHeader("Last-Modified");
// }
if(charset==null) {
String charstype = EntityUtils
.getContentCharSet(entitydata);
......@@ -524,15 +513,13 @@ public class PaserSiteDownload {
try {
Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset);
httpgeturl.releaseConnection();
httpClient.close();
} catch (Exception e1) {
// TODO Auto-generated catch block
// e1.printStackTrace();
log.info("内容解析异常");
}finally {
httpgeturl.releaseConnection();
}
return infodata;
}
// detect the encoding of the page to be crawled
......@@ -542,7 +529,7 @@ public class PaserSiteDownload {
Connection conn = Jsoup.connect(url);
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
// masquerade as a browser
Document doc = conn.ignoreContentType(true).timeout(10000).get();
Document doc = conn.ignoreContentType(true).timeout(5000).get();
Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE);
......@@ -561,23 +548,16 @@ public class PaserSiteDownload {
encoding = m3.group().substring(9);
}
if (encoding.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName);
// if(encoding == null){
encoding = "gbk";
// }
}
}
return encoding;
}
}
} catch (IOException e) {
// e.printStackTrace();
log.error("获取编码方式出错");
System.out.println("获取编码方式出错");
return encoding;
}
return encoding;
}
......@@ -608,7 +588,7 @@ public class PaserSiteDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
urlList.addAll(hisUrlList);
}
//detect the encoding
String charset = paserSiteDownload.getCharSet(urlList.get(0));
String charset = null;
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
// charset stays null if detection fails
}
//fetch list URLs etc., filtered by URL matching
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
......@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
}
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//fetch article details
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//determine the detail-expression parser type
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//CSS expression
......@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -356,7 +356,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
TimeUnit.SECONDS.sleep(2);
}
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//invoked when body is empty and the site is dynamic
sentBadSiteMsg(siteMsgTemple,"动态请求异常","0");
}else{
sentBadSiteMsg(siteMsgTemple,"静态网络请求异常","0");
}
// if (StringUtils.isEmpty(body)) {
// sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
// }
if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body);
//extract article URLs
......@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
// catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
// catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//called again when nothing could be extracted
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
}
// if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//called again when nothing could be extracted
// sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
// }
}
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl="";
......@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else {
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
}
}catch (Exception e){
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
}
......@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
docInfo.setId(count+"");
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......
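
The 动态爬取/静态爬取 source labels are replaced by the codes "2"/"1" here and in every parser class below. A hypothetical enum that would centralize the mapping (the names are illustrative, not part of the commit):

public enum CrawlSource {
    STATIC("1"), DYNAMIC("2");
    private final String code;
    CrawlSource(String code) { this.code = code; }
    public String code() { return code; }
}
// usage: processitem.setSource(siteMsgTemple.getYnDynamicCrawl() == 1
//        ? CrawlSource.DYNAMIC.code() : CrawlSource.STATIC.code());
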
......@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.ContentUtility;
......@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
}
}
}
if(StringUtils.isNotEmpty(body)) {
if (StringUtils.isNotEmpty(body)) {
//extract article URLs
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} else {
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
}
} catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code);
......@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
int partition=0;
try {
partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
}catch (Exception e){
log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
}
// int partition=0;
// try {
// partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
// }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo);
log.info("发送到kafka成功。");
......@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
return encoding;
}
/**
 * Report a problematic site to the bad-site Kafka topic.
 * @param siteMsgTemple the site template being crawled
 * @param msg error message
 * @param problemType problem type (1: info-source error; 2: crawl-category configuration error)
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
badSiteMsg.setErrorType(msg);
badSiteMsg.setProblemType(problemType);
String crawlerType=siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
badSiteMsg.setCrawlerType(crawlerType);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("badSiteTopic", docjson);
log.info("信息源问题:"+msg);
}catch (Exception e){
// swallow reporting failures; crawling continues
}
}
}
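
For reference, the call sites commented out elsewhere in this commit invoke the new reporter like this (problemType "1" is the info-source error code from the javadoc above):

sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");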
......@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.*;
......@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
body = SeleniumTime.getScopehtml(uri_code);
}
}
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//extract article URLs
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
......@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
return catchWebByMetaSearchList;
}
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
badSiteMsg.setErrorType(msg);
badSiteMsg.setProblemType(problemType);
String crawlerType=siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
badSiteMsg.setCrawlerType(crawlerType);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("badSiteTopic", docjson);
log.info("信息源问题:"+msg);
}catch (Exception e){
// swallow reporting failures; crawling continues
}
}
//extract the list information
public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
......@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -70,42 +70,38 @@ public class SeleniumTime {
ChromeDriverService service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try {
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
service.start();
if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
}
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(chromeOptions);//生成实例
try {
Duration duration=Duration.of(60, ChronoUnit.SECONDS);
Duration duration=Duration.of(100, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url);
Thread.sleep(1000l);
Thread.sleep(10002);
try {
WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML");
System.out.println("browser will be close");
} catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage());
}finally {
driver.quit();
}
} catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage());
} finally {
try {
driver.quit();
service.stop();
Thread.sleep(3000l);
} catch (InterruptedException e) {
// ignore; the driver is being shut down anyway
}
}
} catch (Exception e) {
return "";
log.info("chromedriver 驱动访问出现异常:" + e.getMessage());
} finally {
service.stop();
}
return html;
}
......
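A minimal sketch of the driver lifecycle this hunk converges on, with quit() and stop() in finally blocks so the browser is always released (assumes the same chromeOptions, service, and url as above):

ChromeDriver driver = new ChromeDriver(chromeOptions);
try {
    driver.manage().timeouts().pageLoadTimeout(Duration.of(100, ChronoUnit.SECONDS));
    driver.get(url);
    return driver.findElement(By.xpath("/html")).getAttribute("outerHTML");
} finally {
    driver.quit();   // always release the browser
    service.stop();  // and the chromedriver service
}
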
......@@ -41,8 +41,8 @@ public class WebPageScreenShot {
// driver.manage().window().maximize();
String js1 = "return document.body.clientHeight.toString()";
String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
int height = Integer.parseInt(js1_result);
// String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
// int height = Integer.parseInt(js1_result);
List<String> files = new ArrayList<String>();
int last_t = 0;
// for (int i = 0; i < 20; ) {
......@@ -80,7 +80,7 @@ public class WebPageScreenShot {
CustomScreenshot customScreenshot=new CustomScreenshot();
files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
driver.quit();//quit the browser
boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
// boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
// if(flag){
// InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
// HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
......
......@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -361,7 +361,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
try {
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
||StringUtils.isEmpty(processitem.getPublishDate())){
......
......@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取");
processitem.setSource("2");
}else{
processitem.setSource("静态爬取");
processitem.setSource("1");
}
String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
......@@ -86,9 +86,8 @@ public class PageConnectioner {
//used when the request body is a JSON string
connection.setRequestProperty("Content-Type","application/json");
} catch (Exception e) {
// ignore; return the connection as-is
}
return connection;
}
......@@ -157,6 +156,7 @@ public class PageConnectioner {
URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpsURLConnection connection = null;
try {
trustAllHttpsCertificates();
HostnameVerifier hv = new HostnameVerifier() {
@Override
......@@ -166,8 +166,6 @@ public class PageConnectioner {
};
HttpsURLConnection.setDefaultHostnameVerifier(hv);
try{
url = new URL(urlstr);
if (false) {
connection = (HttpsURLConnection) url.openConnection(proxy);
......@@ -180,14 +178,12 @@ public class PageConnectioner {
connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
}
catch(Exception e){
} catch (Exception e) {
// ignore; return whatever connection was built
}
return connection;
}
/**构造下载使用的{@link HttpsURLConnection}
* @param urlstr 下载url
* @return
......@@ -252,9 +248,9 @@ public class PageConnectioner {
break;
} catch (Exception e1) {
try {
Thread.sleep(10000);
Thread.sleep(2000);
} catch (InterruptedException e2) {
// logUtil.getLogger().error(String.format("ORMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e2)));
// ignore interruption during the retry backoff
}
}
}
......@@ -313,10 +309,18 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis();
PageGet pg = null;
String docBody = null;
HttpURLConnection connection = null;
try {
pg = new PageGet(url, encoding, this.connection(url,headerParams));
} catch (Exception e3) {
connection = this.connection(url,headerParams);
pg = new PageGet(url, encoding, connection);
} catch (Exception e1) {
assert connection != null;
connection.disconnect();
return docBody;
}finally {
assert connection != null;
connection.disconnect();
}
try {
......@@ -356,12 +360,18 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis();
PageGet pg = null;
String docBody = null;
HttpURLConnection connection = null;
try {
pg = new PageGet(url, encoding, this.connection(url));
connection = this.connection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody;
}finally {
assert connection != null;
connection.disconnect();
}
try {
pg.urlConnectionGet();
docBody = pg.getPageStr();
......@@ -393,12 +403,18 @@ public class PageConnectioner {
* @return
*/
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 3000;
long exitTimeDis = 10000;
long startDownTime = System.currentTimeMillis();
PageGet pg = null;
HttpsURLConnection connection = null;
try {
pg = new PageGet(url, encoding, this.httpsconnection(url));
connection = this.httpsconnection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) {
// pg stays null if the connection fails
} finally {
assert connection != null;
connection.disconnect();
}
String docBody = null;
try {
......@@ -542,15 +558,23 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis();
PagePost pp = null;
String docBody = null;
HttpURLConnection connection = null;
try {
if (postParam != null && postParam.contains("[Content-type]")) { // only used for 鹏云课堂
String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param);
connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{
pp = new PagePost(url, encoding, this.connection(url), postParam);
connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
}
} catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody;
}finally {
assert connection != null;
connection.disconnect();
}
try {
......@@ -589,15 +613,23 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis();
PagePost pp = null;
String docBody = null;
HttpURLConnection connection = null;
try {
if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // only used for 鹏云课堂
String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param);
connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{
pp = new PagePost(url, encoding, this.connection(url), postParam);
connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
}
} catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody;
}finally {
assert connection != null;
connection.disconnect();
}
try {
......@@ -634,13 +666,18 @@ public class PageConnectioner {
long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis();
HttpsURLConnection connection = null;
PagePost pp = null;
try {
pp = new PagePost(url, encoding, this.httpsconnection(url),param);
connection = this.httpsconnection(url);
pp = new PagePost(url, encoding, connection, param);
} catch (Exception e3) {
// TODO Auto-generated catch block
e3.printStackTrace();
// pp stays null if the connection fails
} finally {
assert connection != null;
connection.disconnect();
}
String docBody = null;
try {
pp.urlHttpsConnectionPost();
......@@ -693,7 +730,7 @@ public class PageConnectioner {
String pageStr="";
try {
HtmlPage htmlPage = webClient.getPage(urlstr);
webClient.waitForBackgroundJavaScript(600000);
webClient.waitForBackgroundJavaScript(300000);
pageStr = htmlPage.asXml();
}catch (Exception e){
......@@ -740,7 +777,6 @@ public class PageConnectioner {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// pageStr = scriptPage.getContent();
// }
} catch (Exception e) {
}finally {
webClient.close();
......
......@@ -49,6 +49,8 @@ public class PageDownloader {
// if the page encoding is unknown, detect it from the page itself
public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException {
String encoding = null;
try {
connection.setRequestMethod("GET");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+ "Gecko/20080404 Firefox/2.0.0.14");
......@@ -56,7 +58,6 @@ public class PageDownloader {
connection.setRequestProperty("Cookie", "auth=token");
String contentType = connection.getHeaderField("Content-Type");
String encoding = null;
if (contentType != null) {
String temp = "charset=";
int m = contentType.indexOf(temp);
......@@ -65,17 +66,23 @@ public class PageDownloader {
}
}
if (encoding == null) {
try {
InputStream is = null;
try {
is = connection.getInputStream();
BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
encoding = EncodeDetector.getEncoding(bufferedInputStream);
is.close();
} catch (Exception e) {
// detection failed; encoding stays null
}finally {
assert is != null;
is.close();
}
}
} catch (Exception e) {
// ignore; return whatever was detected
} finally {
connection.disconnect();
}
return encoding;
}
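
A hypothetical try-with-resources variant of the stream sniffing above, which drops the assert-guarded manual close (EncodeDetector is the class already used in this hunk):

if (encoding == null) {
    try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
        encoding = EncodeDetector.getEncoding(in);
    } catch (Exception e) {
        // leave encoding null; callers fall back to a default
    }
}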
......@@ -159,25 +166,19 @@ public class PageDownloader {
if (interval > 0 && lastDownloadTime > 0 && dis < interval){
new PageDownloader(dis+2000);
}
long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null;
try {
connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) {//detect the site encoding
// encoding = getEncodingFromHtmlFile(url, connection);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
encoding = paserSiteDownload.locateCharSet(url);
}
} catch (Exception e1) {
// e1.printStackTrace();
log.info("获取编码失败");
}
String docBody = null;
if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding);
} else {
// this.bDownloadUseFrame=true;
if (bFrame && this.bDownloadUseFrame) {
String body = null;
try {
......@@ -196,12 +197,11 @@ public class PageDownloader {
}
if(url.contains("https:")){
try {
connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8";
}
} catch (Exception e1) {
// e1.printStackTrace();
// ignore; proceed to the https GET below
}
docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
}else{
......@@ -237,6 +237,9 @@ public class PageDownloader {
}
} catch (Exception e1) {
// e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
}
String docBody = null;
if (bDynamic) {
......@@ -264,7 +267,7 @@ public class PageDownloader {
}
if(url.contains("https:")){
try {
connection = pConn.httpsconnection(url);
// connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8";
}
......@@ -368,6 +371,9 @@ public class PageDownloader {
}
} catch (Exception e1) {
// e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
}
String docBody = null;
if (bDynamic) {
......@@ -493,7 +499,6 @@ public class PageDownloader {
return true;
}
} catch (Exception e) {
// TODO Auto-generated catch block
return true;
}
return false;
......
......@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
/** body text */
private String content;
private String contentWithtag;
private String contentWithTag;
/** unknown */
......
......@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
// latest earliest
//max poll interval (was 1h, now 2h)
// properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2*60*60*1000);
properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,25000);
properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
......@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled(cron = "0 0/5 * * * ?")
@Async("asyncTaskExecutor")
@Scheduled(cron = "0 0/2 * * * ?")
// @Async("asyncTaskExecutor")
public void consumer (){
ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
log.info("进入定时获取mq消息");
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
log.info("进入定时获取topic消息");
//1. create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
// consume specific partitions of the topic
......@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
//poll blocks up to the timeout waiting for broker data, then returns whether or not records are available
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
//manually commit the offsets of consumed records
// consumer.commitAsync();
consumer.commitSync();
if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) {
......@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
}
}
}
}
}catch (Exception e){
// consumer.commitSync();
log.info(e.getMessage());
// consumer = createConsumer();
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
//close the consumer before recreating it; its network connections and sockets are released and a rebalance is triggered immediately
consumer.close();
System.out.println("error!!!!!!!!!!!");
consumer = createConsumer();
// reassign the topic partitions to the fresh consumer
kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions1 = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions1.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
}
consumer.assign(topicPartitions);
}
}
......
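The new error branch closes the failed consumer and rebuilds the manual partition assignment. A condensed sketch of that recover-by-recreate step (the method name is illustrative; createConsumer and the Constants are the ones used above):

private KafkaConsumer<String, String> recoverConsumer() {
    KafkaConsumer<String, String> consumer = createConsumer();
    List<TopicPartition> topicPartitions = new ArrayList<>();
    for (String p : Constants.KAFKA_CONSUMER_PARTITION.split(",")) {
        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(p)));
    }
    consumer.assign(topicPartitions);  // manual assignment; no subscribe(), so no group rebalance
    return consumer;
}
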
......@@ -35,8 +35,8 @@ PROXYID=1
# thread pool size
THREAD_SIZE=1
#
CHROMEDRIVE= E:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
CHROMEDRIVE= D:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
#mysql connection
......@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#KAFKA_CONSUMER_TOPIC = staticCrawlTopic
KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin
#
KAFKA_CONSUMER_GROUP_ID=dynamin-sync
KAFKA_CONSUMER_GROUP_ID=test-zs1
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
KAFKA_PRODUCT_TOPIC=crawlerInfo
......@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
# partitions to consume, comma-separated
KAFKA_CONSUMER_PARTITION=0
KAFKA_CONSUMER_PARTITION=0,1,2,3
#KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
KAFKA_PRODUCT_PARTITION=0
# Redis settings
redis.host=127.0.0.1
redis.host=114.116.26.150
redis.port=6379
redis.pass=xxxxxx
redis.pass=zzsn9988
#redis.host=8.130.30.33
#redis.port=9010
#redis.pass=wxadS&jklim
......
......@@ -17,8 +17,8 @@ public class ThreadExecutorConfig {
@Bean(value = "asyncTaskExecutor")
public Executor executor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//minimum number of threads kept in the pool
executor.setMaxPoolSize(1);//maximum number of threads in the pool
executor.setCorePoolSize(2);//minimum number of threads kept in the pool
executor.setMaxPoolSize(5);//maximum number of threads in the pool
executor.setQueueCapacity(5000);//buffer queue
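// Note: ThreadPoolTaskExecutor only grows past corePoolSize once this queue is full,
// so with a 5000-slot queue the executor effectively runs at its core size.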
executor.setThreadNamePrefix("ssmsExecutor-");
/**
......