提交 cc9aa52f 作者: 张文库

更新

上级 f314a48b
...@@ -29,7 +29,11 @@ public class SiteInfoVerify{ ...@@ -29,7 +29,11 @@ public class SiteInfoVerify{
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
String charset="utf-8"; String charset="utf-8";
if(siteMsgTemple.getYnDynamicCrawl()!=1){ if(siteMsgTemple.getYnDynamicCrawl()!=1){
try {
charset = paserSiteDownload.getCharSet(urlList.get(0)); charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
//
}
} }
...@@ -82,7 +86,11 @@ public class SiteInfoVerify{ ...@@ -82,7 +86,11 @@ public class SiteInfoVerify{
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
charset = paserSiteDownload.locateCharSet(urlList.get(0)); charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){ }catch (Exception e){
try {
charset = paserSiteDownload.getCharSet(urlList.get(0)); charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
//
}
} }
//判断解析表达式类型 //判断解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
...@@ -165,7 +173,7 @@ public class SiteInfoVerify{ ...@@ -165,7 +173,7 @@ public class SiteInfoVerify{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async; import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
crawler(); crawler();
} }
@Async("asyncexecutorService") // @Async("asyncexecutorService")
public void crawler(){ public void crawler(){
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
...@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{ ...@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
String charset = ""; String charset = "";
try { try {
charset = paserSiteDownload.locateCharSet(urlList.get(0)); charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){ } catch (Exception e) {
try {
charset = paserSiteDownload.getCharSet(urlList.get(0)); charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
//
}
} }
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
...@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{ ...@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
} }
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//资讯类容抽取
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//判断解析详情表达式类型 //判断解析详情表达式类型
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式
...@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -406,11 +406,14 @@ public class PaserSiteDownload { ...@@ -406,11 +406,14 @@ public class PaserSiteDownload {
return HttpClients.createDefault(); return HttpClients.createDefault();
} }
public static String getCharSet(String url) { public static String getCharSet(String url) throws IOException {
String html=""; String html = "";
HttpResponse httprespse = null;
HttpEntity entitydata = null;
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
try {
// Thread.sleep(500L);
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpGet httpgeturl = new HttpGet(url);// Get请求
httpgeturl.getParams().setIntParameter( httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000); CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
httpgeturl.getParams().setParameter( httpgeturl.getParams().setParameter(
...@@ -422,31 +425,22 @@ public class PaserSiteDownload { ...@@ -422,31 +425,22 @@ public class PaserSiteDownload {
httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
//httpgeturl.setHeader("Accept-Language", "en"); //httpgeturl.setHeader("Accept-Language", "en");
//httpgeturl.setHeader("Accept-Encoding", "gzip, deflate"); //httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
HttpResponse httprespse=null;
try {
Thread.sleep(500L);
httprespse = httpClient.execute(httpgeturl); httprespse = httpClient.execute(httpgeturl);
entitydata = httprespse.getEntity();// 获取返回数据
httpgeturl.releaseConnection();
} catch (Exception e2) { } catch (Exception e2) {
// TODO Auto-generated catch block
// e2.printStackTrace();
log.info("请求访问失败!"); log.info("请求访问失败!");
return "utf-8"; return "utf-8";
} // 发送请求 } finally {
HttpEntity entitydata = httprespse.getEntity();// 获取返回数据 httpClient.close();
}
Header lastModify = httprespse.getFirstHeader("Last-Modified");
String charset="utf-8"; String charset="utf-8";
String infodata=""; String infodata="";
try { try {
Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset); infodata = EntityUtils.toString(entitydata, charset);
} catch (Exception e1) { } catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace(); e1.printStackTrace();
} }
httpgeturl.releaseConnection();
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
...@@ -465,27 +459,24 @@ public class PaserSiteDownload { ...@@ -465,27 +459,24 @@ public class PaserSiteDownload {
charset = m3.group().substring(9); charset = m3.group().substring(9);
} }
if (charset.trim().length() == 0) { if (charset.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName);
// if(encoding == null){
charset = "gbk"; charset = "gbk";
// }
} }
} }
return charset; return charset;
} }
} }
return charset; return charset;
} }
public static String getHtml(String url,String charset) { public static String getHtml(String url,String charset) {
String html=""; String html="";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpGet httpgeturl = new HttpGet(url);// Get请求
httpgeturl.getParams().setIntParameter( httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000); CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
httpgeturl.getParams().setParameter( httpgeturl.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000); HttpMethodParams.SO_TIMEOUT, 20000);
// 伪装成浏览器 // 伪装成浏览器
httpgeturl.setHeader("Content-Type", httpgeturl.setHeader("Content-Type",
"application/x-www-form-urlencoded;charset=utf-8"); "application/x-www-form-urlencoded;charset=utf-8");
...@@ -499,16 +490,14 @@ public class PaserSiteDownload { ...@@ -499,16 +490,14 @@ public class PaserSiteDownload {
httprespse = httpClient.execute(httpgeturl); httprespse = httpClient.execute(httpgeturl);
} catch (Exception e2) { } catch (Exception e2) {
httpgeturl.releaseConnection(); httpgeturl.releaseConnection();
// TODO Auto-generated catch block
// e2.printStackTrace();
return ""; return "";
} // 发送请求 } // 发送请求
HttpEntity entitydata = httprespse.getEntity();// 获取返回数据 HttpEntity entitydata = httprespse.getEntity();// 获取返回数据
Header lastModify = httprespse // Header lastModify = httprespse
.getFirstHeader("Last-Modified"); // .getFirstHeader("Last-Modified");
if (lastModify == null) { // if (lastModify == null) {
lastModify = httprespse.getLastHeader("Last-Modified"); // lastModify = httprespse.getLastHeader("Last-Modified");
} // }
if(charset==null) { if(charset==null) {
String charstype = EntityUtils String charstype = EntityUtils
.getContentCharSet(entitydata); .getContentCharSet(entitydata);
...@@ -524,15 +513,13 @@ public class PaserSiteDownload { ...@@ -524,15 +513,13 @@ public class PaserSiteDownload {
try { try {
Thread.sleep(500L); Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset); infodata = EntityUtils.toString(entitydata, charset);
httpgeturl.releaseConnection();
httpClient.close();
} catch (Exception e1) { } catch (Exception e1) {
// TODO Auto-generated catch block
// e1.printStackTrace();
log.info("内容解析异常"); log.info("内容解析异常");
}finally { }finally {
httpgeturl.releaseConnection(); httpgeturl.releaseConnection();
} }
return infodata; return infodata;
} }
// 获取所要抓取网页的编码方式 // 获取所要抓取网页的编码方式
...@@ -542,7 +529,7 @@ public class PaserSiteDownload { ...@@ -542,7 +529,7 @@ public class PaserSiteDownload {
Connection conn = Jsoup.connect(url); Connection conn = Jsoup.connect(url);
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"); conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
// 伪装成浏览器 // 伪装成浏览器
Document doc = conn.ignoreContentType(true).timeout(10000).get(); Document doc = conn.ignoreContentType(true).timeout(5000).get();
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
...@@ -561,23 +548,16 @@ public class PaserSiteDownload { ...@@ -561,23 +548,16 @@ public class PaserSiteDownload {
encoding = m3.group().substring(9); encoding = m3.group().substring(9);
} }
if (encoding.trim().length() == 0) { if (encoding.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName);
// if(encoding == null){
encoding = "gbk"; encoding = "gbk";
// }
} }
} }
return encoding; return encoding;
} }
} }
} catch (IOException e) { } catch (IOException e) {
// e.printStackTrace();
log.error("获取编码方式出错"); log.error("获取编码方式出错");
System.out.println("获取编码方式出错");
return encoding; return encoding;
} }
return encoding; return encoding;
} }
...@@ -608,7 +588,7 @@ public class PaserSiteDownload { ...@@ -608,7 +588,7 @@ public class PaserSiteDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async; import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{ ...@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
urlList.addAll(hisUrlList); urlList.addAll(hisUrlList);
} }
//获取编码 //获取编码
String charset = paserSiteDownload.getCharSet(urlList.get(0)); String charset = null;
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
//
}
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
...@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{ ...@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
} }
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//获取文章详情
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//判断解析详情表达式类型 //判断解析详情表达式类型
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式
...@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{ ...@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -356,7 +356,7 @@ public class PaserCommDownload { ...@@ -356,7 +356,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -78,11 +78,9 @@ public class WebContentPaserByCss { ...@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
TimeUnit.SECONDS.sleep(2); TimeUnit.SECONDS.sleep(2);
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用 // if (StringUtils.isEmpty(body)) {
sentBadSiteMsg(siteMsgTemple,"动态请求异常","0"); // sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
}else{ // }
sentBadSiteMsg(siteMsgTemple,"静态网络请求异常","0");
}
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
...@@ -94,9 +92,9 @@ public class WebContentPaserByCss { ...@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
// catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); // catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
// catchWebByMetaSearchList.addAll(catchWebByMetaSearches); // catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// } // }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 // if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1"); // sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
} // }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl=""; String imagUrl="";
...@@ -315,11 +313,11 @@ public class WebContentPaserByCss { ...@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) { if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { }else {
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1"); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
} }
}catch (Exception e){ }catch (Exception e){
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1"); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
} }
...@@ -329,9 +327,9 @@ public class WebContentPaserByCss { ...@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
docInfo.setId(count+""); docInfo.setId(count+"");
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......
...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload; ...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.ContentUtility; import com.zzsn.util.ContentUtility;
...@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath { ...@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
} }
} }
} }
if(StringUtils.isNotEmpty(body)) { if (StringUtils.isNotEmpty(body)) {
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} else {
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
} }
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
...@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath { ...@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
int partition=0; // int partition=0;
try { // try {
partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION); // partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
}catch (Exception e){ // }catch (Exception e){
log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION); // log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
} // }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo); docInfoList.add(docInfo);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
...@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath { ...@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
...@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath { ...@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
return encoding; return encoding;
} }
/**
 * Reports a problematic information source by publishing a {@code BadSiteMsg} to Kafka.
 *
 * <p>The message copies the id, info-source code, web-site name, site name and site URI
 * from the given template, is serialized to JSON and sent to the {@code badSiteTopic}
 * topic. Reporting is best-effort: a failure is logged and never propagated to the caller.
 *
 * @param siteMsgTemple template describing the information source being reported
 * @param msg           error description, stored as the message's error type
 * @param problemType   problem category (1: information source error,
 *                      2: crawl category misconfigured)
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType) {
    try {
        BadSiteMsg badSiteMsg = new BadSiteMsg();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Crawler type is "0" unless the template's dynamic-crawl flag is 1,
        // in which case the flag value itself is used.
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() != 1
                ? "0"
                : siteMsgTemple.getYnDynamicCrawl() + "";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    } catch (Exception e) {
        // Reporting must never break crawling, but the failure is still recorded
        // instead of being silently discarded.
        log.error("发送信息源异常消息失败:" + msg, e);
    }
}
} }
...@@ -43,9 +43,7 @@ public class WebContentPaserByRegular { ...@@ -43,9 +43,7 @@ public class WebContentPaserByRegular {
// 提取站点新闻列表URL // 提取站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular(List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
...@@ -57,7 +55,7 @@ public class WebContentPaserByRegular { ...@@ -57,7 +55,7 @@ public class WebContentPaserByRegular {
uri_code = Utility.encodURI(uri.toString()) uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); .replaceAll("%20", "+");
Thread.sleep(1000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
...@@ -72,12 +70,8 @@ public class WebContentPaserByRegular { ...@@ -72,12 +70,8 @@ public class WebContentPaserByRegular {
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) { if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
}
if( pageDownload.isBadDownloadPage(body)){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue; continue;
} }
} }
...@@ -96,7 +90,7 @@ public class WebContentPaserByRegular { ...@@ -96,7 +90,7 @@ public class WebContentPaserByRegular {
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){ if(catchWebByMetaSearches.size()<1){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue; continue;
} }
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
...@@ -105,7 +99,6 @@ public class WebContentPaserByRegular { ...@@ -105,7 +99,6 @@ public class WebContentPaserByRegular {
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
log.info("异常信息"+e.getMessage()); log.info("异常信息"+e.getMessage());
// return catchWebByMetaSearchList;
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -122,7 +115,6 @@ public class WebContentPaserByRegular { ...@@ -122,7 +115,6 @@ public class WebContentPaserByRegular {
* @param siteMsgTemple * @param siteMsgTemple
* @param msg 异常信息 * @param msg 异常信息
* @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常 * @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常
* @param 爬虫类型(0:静态爬取 1:动态爬取)
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
...@@ -252,61 +244,58 @@ public class WebContentPaserByRegular { ...@@ -252,61 +244,58 @@ public class WebContentPaserByRegular {
} }
return eleText; return eleText;
} }
// 抓取新闻内容 // 抓取新闻内容
public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) { public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) {
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList = new ArrayList<>();
try {
int count = 0; int count = 0;
int mark=0;
for (int i = 0; i < catchWebList.size(); i++) { for (int i = 0; i < catchWebList.size(); i++) {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) { if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length() == 0 || cwbm.getSourceaddress().contains(".PDF") || cwbm.getSourceaddress().contains("download")) {
continue; continue;
} }
log.info("解析内容的URL:"+cwbm.getSourceaddress()); log.info("解析内容的URL:" + cwbm.getSourceaddress());
String rediskey=siteMsgTemple.getInfoSourceCode(); String rediskey = siteMsgTemple.getInfoSourceCode();
try { try {
boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress()); boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
if (sismember) { if (sismember) {
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress()); log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue; continue;
} }
}catch (Exception e){ } catch (Exception e) {
log.info("缓存出问题"); log.info("缓存出问题");
} }
// 请求下载内容 先使用静态访问若内容为空调用动态请求若内容还为空则跳过 // 请求下载内容
String content=""; String content = "";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}else{ } else {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}catch (Exception e){ } catch (Exception e) {
log.info(e.getMessage()); log.info(e.getMessage());
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null); content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类 //超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
}catch (Exception e) { } catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue; continue;
} }
//使用浏览器截取图片 //使用浏览器截取图片
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if (StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")) {
String imagUrl=""; String imagUrl = "";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot(); WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH); webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress()); // InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString(); // imagUrl=map.get("objectUrl").toString();
} }
if(StringUtils.isEmpty(content) ) { if (StringUtils.isEmpty(content)) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue; continue;
} }
log.info("详情内容的长度:"+content.length()); log.info("详情内容的长度:" + content.length());
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -315,71 +304,54 @@ public class WebContentPaserByRegular { ...@@ -315,71 +304,54 @@ public class WebContentPaserByRegular {
docInfo.setLastModified(cwbm.getLastModify()); docInfo.setLastModified(cwbm.getLastModify());
docInfo.setCharset("utf-8"); docInfo.setCharset("utf-8");
docInfo.setSourceaddress(cwbm.getSourceaddress()); docInfo.setSourceaddress(cwbm.getSourceaddress());
docInfo.setTitle(cwbm.getTitle()==null?"":cwbm.getTitle().replace("...", "")); docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor()); docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate()); docInfo.setPublishDate(cwbm.getPublishDate());
if(cwbm.getSourceaddress()!=null) { if (cwbm.getSourceaddress() != null) {
docInfo.setOrigin(cwbm.getSourcesite()); docInfo.setOrigin(cwbm.getSourcesite());
}else{ } else {
docInfo.setOrigin(siteMsgTemple.getSiteName()); docInfo.setOrigin(siteMsgTemple.getSiteName());
} }
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
// 封装解析的docinfo对象 //封装解析的docinfo对象
try { try {
if(StringUtils.isNotEmpty(content)) { if (StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { } else {
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:" + siteMsgTemple.getSiteName() + " 链接请求:" + cwbm.getSourceaddress() + " 内容为空:" + content);
} }
}catch (Exception e){ } catch (Exception e) {
log.info("文本内容解析不正确!"); log.info("文本内容解析不正确!");
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
continue; continue;
} }
count++;
docInfo.setId(count+"");
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try { try {
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
processitem.setSource("2"); processitem.setSource("2");
}else{ } else {
processitem.setSource("1"); processitem.setSource("1");
} }
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent()) if (StringUtils.isEmpty(processitem.getTitle()) || StringUtils.isEmpty(processitem.getContent())
||StringUtils.isEmpty(processitem.getPublishDate())) { || StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress()); log.info("资讯的信息不全缺少标题、时间或内容!:" + cwbm.getSourceaddress());
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
continue; continue;
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC,Constants.KAFKA_CONSUMER_PARTITION , docjson);
// int partition=0;
// try {
// partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
// }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress()); JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
count++;
docInfo.setId(count + "");
docInfoList.add(docInfo); docInfoList.add(docInfo);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace();
log.info("发送到kafka失败。"); log.info("发送到kafka失败。");
continue;
}
} catch (Exception e){
continue;
} }
}
log.info("本次成功件数:" + count);
} catch (Exception e) { } catch (Exception e) {
log.info("内容解析部分出现异常!"); log.info("内容解析部分出现异常!");
} }
}
log.info("本次成功件数:" + count);
return docInfoList; return docInfoList;
} }
......
...@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime; ...@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
...@@ -105,6 +102,9 @@ public class WebContentPaserByXpath { ...@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
} }
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
...@@ -131,6 +131,28 @@ public class WebContentPaserByXpath { ...@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
/**
 * Reports a problematic information source by publishing a {@code BadSiteMsg},
 * serialized as JSON, to the {@code badSiteTopic} Kafka topic.
 *
 * <p>Reporting is best-effort: any failure while building, serializing or
 * sending the report is logged here and never propagated to the caller, so a
 * reporting problem cannot interrupt the crawl that triggered it.
 *
 * @param siteMsgTemple source configuration whose id, source code, site names
 *                      and site URI are copied into the report
 * @param msg           description of the problem; stored as the report's
 *                      error type and written to the log
 * @param problemType   problem category code, passed through unchanged
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
    try {
        BadSiteMsg badSiteMsg = new BadSiteMsg();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // "1" when the source is crawled dynamically (ynDynamicCrawl == 1), "0" otherwise.
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() == 1 ? "1" : "0";
        badSiteMsg.setCrawlerType(crawlerType);

        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    } catch (Exception e) {
        // Keep the best-effort contract (never throw), but do not hide the failure:
        // a silently dropped report makes a broken reporting path undetectable.
        log.error("信息源问题上报失败:" + msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -361,9 +383,9 @@ public class WebContentPaserByXpath { ...@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -489,7 +511,7 @@ public class WebContentPaserByXpath { ...@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -70,42 +70,38 @@ public class SeleniumTime { ...@@ -70,42 +70,38 @@ public class SeleniumTime {
ChromeDriverService service = new ChromeDriverService.Builder(). ChromeDriverService service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build(); usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try { try {
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
service.start(); service.start();
if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) { if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080"); chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数 chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
} }
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(chromeOptions);//生成实例 driver = new ChromeDriver(chromeOptions);//生成实例
try { try {
Duration duration=Duration.of(60, ChronoUnit.SECONDS); Duration duration=Duration.of(100, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
Thread.sleep(1000l); Thread.sleep(10002);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
System.out.println("browser will be close"); System.out.println("browser will be close");
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
}finally {
driver.quit();
} }
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
} finally { } finally {
try {
driver.quit(); driver.quit();
service.stop(); service.stop();
Thread.sleep(3000l);
} catch (InterruptedException e) {
}
} }
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 驱动访问出现异常:" + e.getMessage());
return ""; } finally {
service.stop();
} }
return html; return html;
} }
......
...@@ -41,8 +41,8 @@ public class WebPageScreenShot { ...@@ -41,8 +41,8 @@ public class WebPageScreenShot {
// driver.manage().window().maximize(); // driver.manage().window().maximize();
String js1 = "return document.body.clientHeight.toString()"; String js1 = "return document.body.clientHeight.toString()";
String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + ""; // String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
int height = Integer.parseInt(js1_result); // int height = Integer.parseInt(js1_result);
List<String> files = new ArrayList<String>(); List<String> files = new ArrayList<String>();
int last_t = 0; int last_t = 0;
// for (int i = 0; i < 20; ) { // for (int i = 0; i < 20; ) {
...@@ -80,7 +80,7 @@ public class WebPageScreenShot { ...@@ -80,7 +80,7 @@ public class WebPageScreenShot {
CustomScreenshot customScreenshot=new CustomScreenshot(); CustomScreenshot customScreenshot=new CustomScreenshot();
files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath()); files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
driver.quit();//退出浏览器 driver.quit();//退出浏览器
boolean flag = merge(files.toArray(new String[]{}), type, resultPath); // boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
// if(flag){ // if(flag){
// InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath)); // InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
// HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
......
...@@ -133,7 +133,7 @@ public class ArticleCrawlerThread { ...@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -361,7 +361,7 @@ public class PaserCommDownload { ...@@ -361,7 +361,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath { ...@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath { ...@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -321,9 +321,9 @@ public class WebContentPaserByRegular { ...@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
try { try {
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent()) if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
||StringUtils.isEmpty(processitem.getPublishDate())){ ||StringUtils.isEmpty(processitem.getPublishDate())){
......
...@@ -364,9 +364,9 @@ public class WebContentPaserByXpath { ...@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -483,7 +483,7 @@ public class WebContentPaserByXpath { ...@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -86,9 +86,8 @@ public class PageConnectioner { ...@@ -86,9 +86,8 @@ public class PageConnectioner {
//参数类型是json字符串用到 //参数类型是json字符串用到
connection.setRequestProperty("Content-Type","application/json"); connection.setRequestProperty("Content-Type","application/json");
} catch (Exception e) { } catch (Exception e) {
//
} }
return connection; return connection;
} }
...@@ -157,6 +156,7 @@ public class PageConnectioner { ...@@ -157,6 +156,7 @@ public class PageConnectioner {
URL url = null; URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT)); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpsURLConnection connection = null; HttpsURLConnection connection = null;
try {
trustAllHttpsCertificates(); trustAllHttpsCertificates();
HostnameVerifier hv = new HostnameVerifier() { HostnameVerifier hv = new HostnameVerifier() {
@Override @Override
...@@ -166,8 +166,6 @@ public class PageConnectioner { ...@@ -166,8 +166,6 @@ public class PageConnectioner {
}; };
HttpsURLConnection.setDefaultHostnameVerifier(hv); HttpsURLConnection.setDefaultHostnameVerifier(hv);
try{
url = new URL(urlstr); url = new URL(urlstr);
if (false) { if (false) {
connection = (HttpsURLConnection) url.openConnection(proxy); connection = (HttpsURLConnection) url.openConnection(proxy);
...@@ -180,14 +178,12 @@ public class PageConnectioner { ...@@ -180,14 +178,12 @@ public class PageConnectioner {
connection.setRequestProperty("connection", "Keep-Alive"); connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"); connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
} catch (Exception e) {
} //
catch(Exception e){
} }
return connection; return connection;
} }
/**构造下载使用的{@link HttpsURLConnection} /**构造下载使用的{@link HttpsURLConnection}
* @param urlstr 下载url * @param urlstr 下载url
* @return * @return
...@@ -252,9 +248,9 @@ public class PageConnectioner { ...@@ -252,9 +248,9 @@ public class PageConnectioner {
break; break;
} catch (Exception e1) { } catch (Exception e1) {
try { try {
Thread.sleep(10000); Thread.sleep(2000);
} catch (InterruptedException e2) { } catch (InterruptedException e2) {
// logUtil.getLogger().error(String.format("ORMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e2))); //
} }
} }
} }
...@@ -313,10 +309,18 @@ public class PageConnectioner { ...@@ -313,10 +309,18 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.connection(url,headerParams)); connection = this.connection(url,headerParams);
} catch (Exception e3) { pg = new PageGet(url, encoding, connection);
} catch (Exception e1) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -356,12 +360,18 @@ public class PageConnectioner { ...@@ -356,12 +360,18 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.connection(url)); connection = this.connection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pg.urlConnectionGet(); pg.urlConnectionGet();
docBody = pg.getPageStr(); docBody = pg.getPageStr();
...@@ -393,12 +403,18 @@ public class PageConnectioner { ...@@ -393,12 +403,18 @@ public class PageConnectioner {
* @return * @return
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 3000; long exitTimeDis = 10000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
HttpsURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.httpsconnection(url)); connection = this.httpsconnection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
//
} finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
...@@ -542,15 +558,23 @@ public class PageConnectioner { ...@@ -542,15 +558,23 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂 if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param); connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
pp = new PagePost(url, encoding, this.connection(url), postParam); connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -589,15 +613,23 @@ public class PageConnectioner { ...@@ -589,15 +613,23 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂 if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param); connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
pp = new PagePost(url, encoding, this.connection(url), postParam); connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -634,13 +666,18 @@ public class PageConnectioner { ...@@ -634,13 +666,18 @@ public class PageConnectioner {
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
HttpsURLConnection connection = null;
PagePost pp = null; PagePost pp = null;
try { try {
pp = new PagePost(url, encoding, this.httpsconnection(url),param); connection = this.httpsconnection(url);
pp = new PagePost(url, encoding, connection, param);
} catch (Exception e3) { } catch (Exception e3) {
// TODO Auto-generated catch block //
e3.printStackTrace(); } finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
pp.urlHttpsConnectionPost(); pp.urlHttpsConnectionPost();
...@@ -693,7 +730,7 @@ public class PageConnectioner { ...@@ -693,7 +730,7 @@ public class PageConnectioner {
String pageStr=""; String pageStr="";
try { try {
HtmlPage htmlPage = webClient.getPage(urlstr); HtmlPage htmlPage = webClient.getPage(urlstr);
webClient.waitForBackgroundJavaScript(600000); webClient.waitForBackgroundJavaScript(300000);
pageStr = htmlPage.asXml(); pageStr = htmlPage.asXml();
}catch (Exception e){ }catch (Exception e){
...@@ -740,7 +777,6 @@ public class PageConnectioner { ...@@ -740,7 +777,6 @@ public class PageConnectioner {
// JavaScriptPage scriptPage = (JavaScriptPage) page; // JavaScriptPage scriptPage = (JavaScriptPage) page;
// pageStr = scriptPage.getContent(); // pageStr = scriptPage.getContent();
// } // }
} catch (Exception e) { } catch (Exception e) {
}finally { }finally {
webClient.close(); webClient.close();
......
...@@ -49,6 +49,8 @@ public class PageDownloader { ...@@ -49,6 +49,8 @@ public class PageDownloader {
// 如果页面编码格式未知,则从页面中获取该页面编码格式 // 如果页面编码格式未知,则从页面中获取该页面编码格式
public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException { public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException {
String encoding = null;
try {
connection.setRequestMethod("GET"); connection.setRequestMethod("GET");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) " connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+ "Gecko/20080404 Firefox/2.0.0.14"); + "Gecko/20080404 Firefox/2.0.0.14");
...@@ -56,7 +58,6 @@ public class PageDownloader { ...@@ -56,7 +58,6 @@ public class PageDownloader {
connection.setRequestProperty("Cookie", "auth=token"); connection.setRequestProperty("Cookie", "auth=token");
String contentType = connection.getHeaderField("Content-Type"); String contentType = connection.getHeaderField("Content-Type");
String encoding = null;
if (contentType != null) { if (contentType != null) {
String temp = "charset="; String temp = "charset=";
int m = contentType.indexOf(temp); int m = contentType.indexOf(temp);
...@@ -65,17 +66,23 @@ public class PageDownloader { ...@@ -65,17 +66,23 @@ public class PageDownloader {
} }
} }
if (encoding == null) { if (encoding == null) {
try {
InputStream is = null; InputStream is = null;
try {
is = connection.getInputStream(); is = connection.getInputStream();
BufferedInputStream bufferedInputStream = new BufferedInputStream(is); BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
encoding = EncodeDetector.getEncoding(bufferedInputStream); encoding = EncodeDetector.getEncoding(bufferedInputStream);
is.close();
} catch (Exception e) { } catch (Exception e) {
//
}finally {
assert is != null;
is.close();
} }
} }
} catch (Exception e) {
//
} finally {
connection.disconnect(); connection.disconnect();
}
return encoding; return encoding;
} }
...@@ -159,25 +166,19 @@ public class PageDownloader { ...@@ -159,25 +166,19 @@ public class PageDownloader {
if (interval > 0 && lastDownloadTime > 0 && dis < interval){ if (interval > 0 && lastDownloadTime > 0 && dis < interval){
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner(); PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null;
try { try {
connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) {//获取网站编码 if (encoding == null || encoding.isEmpty()) {//获取网站编码
// encoding = getEncodingFromHtmlFile(url, connection);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
encoding = paserSiteDownload.locateCharSet(url); encoding = paserSiteDownload.locateCharSet(url);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace();
log.info("获取编码失败"); log.info("获取编码失败");
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding); docBody = pConn.dynamicConnectByGet(url, encoding);
} else { } else {
// this.bDownloadUseFrame=true;
if (bFrame && this.bDownloadUseFrame) { if (bFrame && this.bDownloadUseFrame) {
String body = null; String body = null;
try { try {
...@@ -196,12 +197,11 @@ public class PageDownloader { ...@@ -196,12 +197,11 @@ public class PageDownloader {
} }
if(url.contains("https:")){ if(url.contains("https:")){
try { try {
connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8"; encoding = "utf-8";
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); //
} }
docBody = pConn.staticHttpsConnectByGet(url, encoding,false); docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
}else{ }else{
...@@ -237,6 +237,9 @@ public class PageDownloader { ...@@ -237,6 +237,9 @@ public class PageDownloader {
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
...@@ -264,7 +267,7 @@ public class PageDownloader { ...@@ -264,7 +267,7 @@ public class PageDownloader {
} }
if(url.contains("https:")){ if(url.contains("https:")){
try { try {
connection = pConn.httpsconnection(url); // connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8"; encoding = "utf-8";
} }
...@@ -368,6 +371,9 @@ public class PageDownloader { ...@@ -368,6 +371,9 @@ public class PageDownloader {
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
...@@ -493,7 +499,6 @@ public class PageDownloader { ...@@ -493,7 +499,6 @@ public class PageDownloader {
return true; return true;
} }
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block
return true; return true;
} }
return false; return false;
......
...@@ -21,7 +21,7 @@ public class ClbAnsProcessitem { ...@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
/**正文*/ /**正文*/
private String content; private String content;
private String contentWithtag; private String contentWithTag;
/**未知*/ /**未知*/
......
...@@ -50,7 +50,7 @@ public class KafkaConsumerJob { ...@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
// latest earliest // latest earliest
//时间间隔设置为1h //时间间隔设置为1h
// properties.put("max.poll.interval.ms", 60*60*1000); // properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60*60*1000); properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2*60*60*1000);
properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,25000); properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,25000);
properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000); properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1); properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
...@@ -62,11 +62,11 @@ public class KafkaConsumerJob { ...@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled(cron = "0 0/5 * * * ?") @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") // @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE); // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
log.info("进入定时获取mq消息"); log.info("进入定时获取topic消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
// 消费某个主题的某个分区数据 // 消费某个主题的某个分区数据
...@@ -83,7 +83,6 @@ public class KafkaConsumerJob { ...@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000)); ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
//手动提交已消费数据的offset //手动提交已消费数据的offset
// consumer.commitAsync();
consumer.commitSync(); consumer.commitSync();
if (records != null && records.count() > 0) { if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) { for (ConsumerRecord record : records) {
...@@ -98,13 +97,19 @@ public class KafkaConsumerJob { ...@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
} }
} }
} }
} }
}catch (Exception e){ }catch (Exception e){
// consumer.commitSync(); //退出应用程序前使用close方法关闭消费者,网络连接和socket也会随之关闭,并立即触发一次再均衡
log.info(e.getMessage()); consumer.close();
// consumer = createConsumer(); System.out.println("error!!!!!!!!!!!");
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC)); consumer = createConsumer();
// 消费某个主题的某个分区数据
kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions1 = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions1.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
}
consumer.assign(topicPartitions);
} }
} }
......
...@@ -35,8 +35,8 @@ PROXYID=1 ...@@ -35,8 +35,8 @@ PROXYID=1
#线程池大小 #线程池大小
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= E:\\chrome\\chromedriver.exe CHROMEDRIVE= D:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
#mysql connection #mysql connection
...@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#KAFKA_CONSUMER_TOPIC = staticCrawlTopic #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin
# #
KAFKA_CONSUMER_GROUP_ID=dynamin-sync KAFKA_CONSUMER_GROUP_ID=test-zs1
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
KAFKA_PRODUCT_TOPIC=crawlerInfo KAFKA_PRODUCT_TOPIC=crawlerInfo
...@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo ...@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q= META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word= #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#指定分区使用逗号分割 #指定分区使用逗号分割
KAFKA_CONSUMER_PARTITION=0 KAFKA_CONSUMER_PARTITION=0,1,2,3
#KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 #KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
KAFKA_PRODUCT_PARTITION=0 KAFKA_PRODUCT_PARTITION=0
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=114.116.26.150
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=zzsn9988
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
......
...@@ -17,8 +17,8 @@ public class ThreadExecutorConfig { ...@@ -17,8 +17,8 @@ public class ThreadExecutorConfig {
@Bean(value = "asyncTaskExecutor") @Bean(value = "asyncTaskExecutor")
public Executor executor() { public Executor executor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量 executor.setCorePoolSize(2);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量 executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(5000);//缓存队列 executor.setQueueCapacity(5000);//缓存队列
executor.setThreadNamePrefix("ssmsExecutor-"); executor.setThreadNamePrefix("ssmsExecutor-");
/** /**
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论