提交 cc9aa52f 作者: 张文库

更新

上级 f314a48b
...@@ -29,7 +29,11 @@ public class SiteInfoVerify{ ...@@ -29,7 +29,11 @@ public class SiteInfoVerify{
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
String charset="utf-8"; String charset="utf-8";
if(siteMsgTemple.getYnDynamicCrawl()!=1){ if(siteMsgTemple.getYnDynamicCrawl()!=1){
charset = paserSiteDownload.getCharSet(urlList.get(0)); try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
//
}
} }
...@@ -82,7 +86,11 @@ public class SiteInfoVerify{ ...@@ -82,7 +86,11 @@ public class SiteInfoVerify{
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
charset = paserSiteDownload.locateCharSet(urlList.get(0)); charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){ }catch (Exception e){
charset = paserSiteDownload.getCharSet(urlList.get(0)); try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
//
}
} }
//判断解析表达式类型 //判断解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
...@@ -165,7 +173,7 @@ public class SiteInfoVerify{ ...@@ -165,7 +173,7 @@ public class SiteInfoVerify{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -18,6 +18,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async; import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -36,7 +37,7 @@ public class DynaminSiteThread implements Runnable{
crawler(); crawler();
} }
@Async("asyncexecutorService") // @Async("asyncexecutorService")
public void crawler(){ public void crawler(){
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
...@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{ ...@@ -62,8 +63,12 @@ public class DynaminSiteThread implements Runnable{
String charset = ""; String charset = "";
try { try {
charset = paserSiteDownload.locateCharSet(urlList.get(0)); charset = paserSiteDownload.locateCharSet(urlList.get(0));
}catch (Exception e){ } catch (Exception e) {
charset = paserSiteDownload.getCharSet(urlList.get(0)); try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) {
//
}
} }
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
...@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{ ...@@ -90,8 +95,8 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
} }
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//资讯类容抽取
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//判断解析详情表达式类型 //判断解析详情表达式类型
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式
...@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -145,7 +150,7 @@ public class DynaminSiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -406,47 +406,41 @@ public class PaserSiteDownload { ...@@ -406,47 +406,41 @@ public class PaserSiteDownload {
return HttpClients.createDefault(); return HttpClients.createDefault();
} }
public static String getCharSet(String url) { public static String getCharSet(String url) throws IOException {
String html=""; String html = "";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); HttpResponse httprespse = null;
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpEntity entitydata = null;
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
httpgeturl.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000);
// 伪装成浏览器
httpgeturl.setHeader("Content-Type",
"application/x-www-form-urlencoded;charset=utf-8");
httpgeturl.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
//httpgeturl.setHeader("Accept-Language", "en");
//httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
HttpResponse httprespse=null;
try { try {
Thread.sleep(500L); // Thread.sleep(500L);
HttpGet httpgeturl = new HttpGet(url);// Get请求
httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
httpgeturl.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000);
// 伪装成浏览器
httpgeturl.setHeader("Content-Type",
"application/x-www-form-urlencoded;charset=utf-8");
httpgeturl.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
//httpgeturl.setHeader("Accept-Language", "en");
//httpgeturl.setHeader("Accept-Encoding", "gzip, deflate");
httprespse = httpClient.execute(httpgeturl); httprespse = httpClient.execute(httpgeturl);
entitydata = httprespse.getEntity();// 获取返回数据
httpgeturl.releaseConnection();
} catch (Exception e2) { } catch (Exception e2) {
// TODO Auto-generated catch block
// e2.printStackTrace();
log.info("请求访问失败!"); log.info("请求访问失败!");
return "utf-8"; return "utf-8";
} // 发送请求 } finally {
HttpEntity entitydata = httprespse.getEntity();// 获取返回数据 httpClient.close();
}
Header lastModify = httprespse.getFirstHeader("Last-Modified");
String charset="utf-8"; String charset="utf-8";
String infodata=""; String infodata="";
try { try {
Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset); infodata = EntityUtils.toString(entitydata, charset);
} catch (Exception e1) { } catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace(); e1.printStackTrace();
} }
httpgeturl.releaseConnection();
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
...@@ -465,27 +459,24 @@ public class PaserSiteDownload { ...@@ -465,27 +459,24 @@ public class PaserSiteDownload {
charset = m3.group().substring(9); charset = m3.group().substring(9);
} }
if (charset.trim().length() == 0) { if (charset.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName);
// if(encoding == null){
charset = "gbk"; charset = "gbk";
// }
} }
} }
return charset; return charset;
} }
} }
return charset; return charset;
} }
public static String getHtml(String url,String charset) { public static String getHtml(String url,String charset) {
String html=""; String html="";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpGet httpgeturl = new HttpGet(url);// Get请求
httpgeturl.getParams().setIntParameter( httpgeturl.getParams().setIntParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 60000); CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
httpgeturl.getParams().setParameter( httpgeturl.getParams().setParameter(
HttpMethodParams.SO_TIMEOUT, 60000); HttpMethodParams.SO_TIMEOUT, 20000);
// 伪装成浏览器 // 伪装成浏览器
httpgeturl.setHeader("Content-Type", httpgeturl.setHeader("Content-Type",
"application/x-www-form-urlencoded;charset=utf-8"); "application/x-www-form-urlencoded;charset=utf-8");
...@@ -499,16 +490,14 @@ public class PaserSiteDownload { ...@@ -499,16 +490,14 @@ public class PaserSiteDownload {
httprespse = httpClient.execute(httpgeturl); httprespse = httpClient.execute(httpgeturl);
} catch (Exception e2) { } catch (Exception e2) {
httpgeturl.releaseConnection(); httpgeturl.releaseConnection();
// TODO Auto-generated catch block
// e2.printStackTrace();
return ""; return "";
} // 发送请求 } // 发送请求
HttpEntity entitydata = httprespse.getEntity();// 获取返回数据 HttpEntity entitydata = httprespse.getEntity();// 获取返回数据
Header lastModify = httprespse // Header lastModify = httprespse
.getFirstHeader("Last-Modified"); // .getFirstHeader("Last-Modified");
if (lastModify == null) { // if (lastModify == null) {
lastModify = httprespse.getLastHeader("Last-Modified"); // lastModify = httprespse.getLastHeader("Last-Modified");
} // }
if(charset==null) { if(charset==null) {
String charstype = EntityUtils String charstype = EntityUtils
.getContentCharSet(entitydata); .getContentCharSet(entitydata);
...@@ -524,61 +513,52 @@ public class PaserSiteDownload { ...@@ -524,61 +513,52 @@ public class PaserSiteDownload {
try { try {
Thread.sleep(500L); Thread.sleep(500L);
infodata = EntityUtils.toString(entitydata, charset); infodata = EntityUtils.toString(entitydata, charset);
httpgeturl.releaseConnection();
httpClient.close();
} catch (Exception e1) { } catch (Exception e1) {
// TODO Auto-generated catch block
// e1.printStackTrace();
log.info("内容解析异常"); log.info("内容解析异常");
}finally { }finally {
httpgeturl.releaseConnection(); httpgeturl.releaseConnection();
} }
return infodata; return infodata;
} }
// 获取所要抓取网页的编码方式 // 获取所要抓取网页的编码方式
public static String locateCharSet(String url) { public static String locateCharSet(String url) {
String encoding = "utf-8"; String encoding = "utf-8";
try { try {
Connection conn = Jsoup.connect(url); Connection conn = Jsoup.connect(url);
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"); conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
// 伪装成浏览器 // 伪装成浏览器
Document doc = conn.ignoreContentType(true).timeout(10000).get(); Document doc = conn.ignoreContentType(true).timeout(5000).get();
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
Matcher m1 = p1.matcher(doc.toString()); Matcher m1 = p1.matcher(doc.toString());
while (m1.find()) { while (m1.find()) {
String str = m1.group(); String str = m1.group();
Pattern p2 = Pattern.compile("charset[^\\s||\"||;||'||>]*"); Pattern p2 = Pattern.compile("charset[^\\s||\"||;||'||>]*");
Matcher m2 = p2.matcher(str); Matcher m2 = p2.matcher(str);
if (m2.find()) { if (m2.find()) {
encoding = m2.group().substring(8); encoding = m2.group().substring(8);
if (encoding.trim().length() == 0) { if (encoding.trim().length() == 0) {
Pattern p3 = Pattern Pattern p3 = Pattern
.compile("charset=\"[^\\s||\"||;||>]*"); .compile("charset=\"[^\\s||\"||;||>]*");
Matcher m3 = p3.matcher(str); Matcher m3 = p3.matcher(str);
if (m3.find()) { if (m3.find()) {
encoding = m3.group().substring(9); encoding = m3.group().substring(9);
} }
if (encoding.trim().length() == 0) { if (encoding.trim().length() == 0) {
// encoding = DetectCharSet.detectCharSet(fileName); encoding = "gbk";
// if(encoding == null){ }
encoding = "gbk"; }
// } return encoding;
} }
} }
} catch (IOException e) {
log.error("获取编码方式出错");
return encoding; return encoding;
}
} }
} catch (IOException e) {
// e.printStackTrace();
log.error("获取编码方式出错");
System.out.println("获取编码方式出错");
return encoding; return encoding;
}
return encoding;
} }
public static Properties getConfig() { public static Properties getConfig() {
...@@ -608,7 +588,7 @@ public class PaserSiteDownload { ...@@ -608,7 +588,7 @@ public class PaserSiteDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -19,6 +19,7 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async; import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{ ...@@ -56,7 +57,12 @@ public class SiteThread implements Runnable{
urlList.addAll(hisUrlList); urlList.addAll(hisUrlList);
} }
//获取编码 //获取编码
String charset = paserSiteDownload.getCharSet(urlList.get(0)); String charset = null;
try {
charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException e) {
//
}
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
...@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{ ...@@ -85,8 +91,8 @@ public class SiteThread implements Runnable{
WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular(); WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
} }
// log.info("本次获取列表url: "+metaSearchList.size()+"个");
//获取文章详情
siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType()); siteMsgTemple.setDetailExpressionType(siteMsgTemple.getDetailExpressionType()==null?"0":siteMsgTemple.getDetailExpressionType());
//判断解析详情表达式类型 //判断解析详情表达式类型
if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getDetailExpressionType().equals("3")) {//css表达式
...@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{ ...@@ -138,7 +144,7 @@ public class SiteThread implements Runnable{
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -356,7 +356,7 @@ public class PaserCommDownload { ...@@ -356,7 +356,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -78,11 +78,9 @@ public class WebContentPaserByCss { ...@@ -78,11 +78,9 @@ public class WebContentPaserByCss {
TimeUnit.SECONDS.sleep(2); TimeUnit.SECONDS.sleep(2);
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用 // if (StringUtils.isEmpty(body)) {
sentBadSiteMsg(siteMsgTemple,"动态请求异常","0"); // sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
}else{ // }
sentBadSiteMsg(siteMsgTemple,"静态网络请求异常","0");
}
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
...@@ -94,9 +92,9 @@ public class WebContentPaserByCss { ...@@ -94,9 +92,9 @@ public class WebContentPaserByCss {
// catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); // catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
// catchWebByMetaSearchList.addAll(catchWebByMetaSearches); // catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// } // }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 // if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1"); // sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
} // }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl=""; String imagUrl="";
...@@ -315,11 +313,11 @@ public class WebContentPaserByCss { ...@@ -315,11 +313,11 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) { if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { }else {
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1"); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
} }
}catch (Exception e){ }catch (Exception e){
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1"); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
} }
...@@ -329,9 +327,9 @@ public class WebContentPaserByCss { ...@@ -329,9 +327,9 @@ public class WebContentPaserByCss {
docInfo.setId(count+""); docInfo.setId(count+"");
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
......
...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload; ...@@ -8,10 +8,7 @@ import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.ContentUtility; import com.zzsn.util.ContentUtility;
...@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath { ...@@ -84,10 +81,12 @@ public class WebContentPaserByJsonXpath {
} }
} }
} }
if(StringUtils.isNotEmpty(body)) { if (StringUtils.isNotEmpty(body)) {
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByJsonpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} else {
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
} }
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
...@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath { ...@@ -239,18 +238,18 @@ public class WebContentPaserByJsonXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
int partition=0; // int partition=0;
try { // try {
partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION); // partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
}catch (Exception e){ // }catch (Exception e){
log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION); // log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
} // }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo); docInfoList.add(docInfo);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
...@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath { ...@@ -343,7 +342,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
...@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath { ...@@ -580,4 +579,30 @@ public class WebContentPaserByJsonXpath {
return encoding; return encoding;
} }
/**
 * Reports a problematic information source by publishing a {@code BadSiteMsg},
 * serialized as JSON, to the "badSiteTopic" Kafka topic.
 *
 * <p>Reporting is best-effort: any exception thrown while building, serializing
 * or handing the message to Kafka is logged and never propagated to the caller,
 * so a reporting failure cannot interrupt the crawl itself.
 *
 * @param siteMsgTemple template of the site being crawled; its id, source code,
 *                      names and URI are copied into the report
 * @param msg           description of the error, stored as the report's error type
 * @param problemType   problem category (1: information source error,
 *                      2: crawl category configuration error)
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
badSiteMsg.setErrorType(msg);
badSiteMsg.setProblemType(problemType);
// Crawler type is "1" only when the dynamic-crawl flag equals 1; every other value maps to "0".
String crawlerType = siteMsgTemple.getYnDynamicCrawl() != 1 ? "0" : "1";
badSiteMsg.setCrawlerType(crawlerType);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("badSiteTopic", docjson);
log.info("信息源问题:"+msg);
} catch (Exception e) {
// Still best-effort, but leave a trace: previously this catch was empty and
// serialization/Kafka failures vanished without any diagnostic.
log.error("发送信息源异常消息失败:" + msg, e);
}
}
} }
...@@ -43,9 +43,7 @@ public class WebContentPaserByRegular { ...@@ -43,9 +43,7 @@ public class WebContentPaserByRegular {
// 提取站点新闻列表URL // 提取站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular(List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
...@@ -57,7 +55,7 @@ public class WebContentPaserByRegular { ...@@ -57,7 +55,7 @@ public class WebContentPaserByRegular {
uri_code = Utility.encodURI(uri.toString()) uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); .replaceAll("%20", "+");
Thread.sleep(1000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
...@@ -72,14 +70,10 @@ public class WebContentPaserByRegular { ...@@ -72,14 +70,10 @@ public class WebContentPaserByRegular {
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) { if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue; continue;
} }
if( pageDownload.isBadDownloadPage(body)){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue;
}
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl=""; String imagUrl="";
...@@ -96,7 +90,7 @@ public class WebContentPaserByRegular { ...@@ -96,7 +90,7 @@ public class WebContentPaserByRegular {
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){ if(catchWebByMetaSearches.size()<1){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); // sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue; continue;
} }
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
...@@ -105,7 +99,6 @@ public class WebContentPaserByRegular { ...@@ -105,7 +99,6 @@ public class WebContentPaserByRegular {
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
log.info("异常信息"+e.getMessage()); log.info("异常信息"+e.getMessage());
// return catchWebByMetaSearchList;
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -122,7 +115,6 @@ public class WebContentPaserByRegular { ...@@ -122,7 +115,6 @@ public class WebContentPaserByRegular {
* @param siteMsgTemple * @param siteMsgTemple
* @param msg 异常信息 * @param msg 异常信息
* @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常 * @param problemType 问题类型(1:信息源异常 2:爬取类别设置异常
* @param 爬虫类型(0:静态爬取 1:动态爬取)
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
...@@ -252,134 +244,114 @@ public class WebContentPaserByRegular { ...@@ -252,134 +244,114 @@ public class WebContentPaserByRegular {
} }
return eleText; return eleText;
} }
// 抓取新闻内容 // 抓取新闻内容
public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) { public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) {
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList = new ArrayList<>();
try { int count = 0;
int count = 0; for (int i = 0; i < catchWebList.size(); i++) {
int mark=0; try {
for (int i = 0; i < catchWebList.size(); i++) { CatchWebByMetaSearch cwbm = catchWebList.get(i);
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length() == 0 || cwbm.getSourceaddress().contains(".PDF") || cwbm.getSourceaddress().contains("download")) {
continue;
}
log.info("解析内容的URL:" + cwbm.getSourceaddress());
String rediskey = siteMsgTemple.getInfoSourceCode();
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) { if (sismember) {
log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue; continue;
} }
log.info("解析内容的URL:"+cwbm.getSourceaddress()); } catch (Exception e) {
String rediskey=siteMsgTemple.getInfoSourceCode(); log.info("缓存出问题");
try { }
boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress()); // 请求下载内容
if (sismember) { String content = "";
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress()); try {
continue; if (siteMsgTemple.getYnDynamicCrawl() == 1) {
} content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}catch (Exception e){ } else {
log.info("缓存出问题"); try {
} content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
// 请求下载内容 先使用静态访问若内容为空调用动态请求若内容还为空则跳过 } catch (Exception e) {
String content=""; log.info(e.getMessage());
try { content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}else{
try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}catch (Exception e){
log.info(e.getMessage());
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
}
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
}catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue;
} }
//使用浏览器截取图片 //超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ } catch (Exception e) {
String imagUrl=""; continue;
WebPageScreenShot webPageScreenShot=new WebPageScreenShot(); }
webPageScreenShot.loadPage(cwbm.getSourceaddress(),Constants.IMGPATH); //使用浏览器截取图片
if (StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")) {
String imagUrl = "";
WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress()); // InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString(); // imagUrl=map.get("objectUrl").toString();
}
if (StringUtils.isEmpty(content)) {
continue;
}
log.info("详情内容的长度:" + content.length());
DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId());
docInfo.setSid(Long.parseLong(siteMsgTemple.getId()));
docInfo.setSourceType("News");
docInfo.setLastModified(cwbm.getLastModify());
docInfo.setCharset("utf-8");
docInfo.setSourceaddress(cwbm.getSourceaddress());
docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate());
if (cwbm.getSourceaddress() != null) {
docInfo.setOrigin(cwbm.getSourcesite());
} else {
docInfo.setOrigin(siteMsgTemple.getSiteName());
}
docInfo.setSummary(cwbm.getSummary());
//封装解析的docinfo对象
try {
if (StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
} else {
log.info("栏目名称:" + siteMsgTemple.getSiteName() + " 链接请求:" + cwbm.getSourceaddress() + " 内容为空:" + content);
} }
if(StringUtils.isEmpty(content) ) { } catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); log.info("文本内容解析不正确!");
continue; continue;
} }
log.info("详情内容的长度:"+content.length()); ObjectMapper mapper = new ObjectMapper();
DocInfo docInfo = new DocInfo(); try {
docInfo.setContentType("HTML"); ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
docInfo.setOrgId(cwbm.getOrgId()); if (siteMsgTemple.getYnDynamicCrawl() == 1) {
docInfo.setSid(Long.parseLong(siteMsgTemple.getId())); processitem.setSource("2");
docInfo.setSourceType("News"); } else {
docInfo.setLastModified(cwbm.getLastModify()); processitem.setSource("1");
docInfo.setCharset("utf-8");
docInfo.setSourceaddress(cwbm.getSourceaddress());
docInfo.setTitle(cwbm.getTitle()==null?"":cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate());
if(cwbm.getSourceaddress()!=null) {
docInfo.setOrigin(cwbm.getSourcesite());
}else{
docInfo.setOrigin(siteMsgTemple.getSiteName());
} }
docInfo.setSummary(cwbm.getSummary()); if (StringUtils.isEmpty(processitem.getTitle()) || StringUtils.isEmpty(processitem.getContent())
// 封装解析的docinfo对象 || StringUtils.isEmpty(processitem.getPublishDate())) {
try { log.info("资讯的信息不全缺少标题、时间或内容!:" + cwbm.getSourceaddress());
if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else {
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
}
}catch (Exception e){
log.info("文本内容解析不正确!");
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
continue; continue;
} }
String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
count++; count++;
docInfo.setId(count+""); docInfo.setId(count + "");
ObjectMapper mapper = new ObjectMapper(); docInfoList.add(docInfo);
try { log.info("发送到kafka成功。");
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); } catch (JsonProcessingException e) {
if(siteMsgTemple.getYnDynamicCrawl()==1) { log.info("发送到kafka失败。");
processitem.setSource("2");
}else{
processitem.setSource("1");
}
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
||StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress());
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
continue;
}
String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC,Constants.KAFKA_CONSUMER_PARTITION , docjson);
// int partition=0;
// try {
// partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
// }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// }
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
docInfoList.add(docInfo);
log.info("发送到kafka成功。");
} catch (JsonProcessingException e) {
// e.printStackTrace();
log.info("发送到kafka失败。");
continue;
}
} catch (Exception e){
continue;
} }
} catch (Exception e) {
log.info("内容解析部分出现异常!");
} }
log.info("本次成功件数:" + count);
} catch (Exception e) {
log.info("内容解析部分出现异常!");
} }
log.info("本次成功件数:" + count);
return docInfoList; return docInfoList;
} }
......
...@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime; ...@@ -9,10 +9,7 @@ import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
...@@ -105,6 +102,9 @@ public class WebContentPaserByXpath { ...@@ -105,6 +102,9 @@ public class WebContentPaserByXpath {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
} }
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
...@@ -131,6 +131,28 @@ public class WebContentPaserByXpath { ...@@ -131,6 +131,28 @@ public class WebContentPaserByXpath {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
badSiteMsg.setErrorType(msg);
badSiteMsg.setProblemType(problemType);
String crawlerType=siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
badSiteMsg.setCrawlerType(crawlerType);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("badSiteTopic", docjson);
log.info("信息源问题:"+msg);
}catch (Exception e){
}
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -361,9 +383,9 @@ public class WebContentPaserByXpath { ...@@ -361,9 +383,9 @@ public class WebContentPaserByXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -489,7 +511,7 @@ public class WebContentPaserByXpath { ...@@ -489,7 +511,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -70,42 +70,38 @@ public class SeleniumTime { ...@@ -70,42 +70,38 @@ public class SeleniumTime {
ChromeDriverService service = new ChromeDriverService.Builder(). ChromeDriverService service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build(); usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try { try {
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
service.start(); service.start();
if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) { if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080"); chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数 chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
} }
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(chromeOptions);//生成实例 driver = new ChromeDriver(chromeOptions);//生成实例
try { try {
Duration duration=Duration.of(60, ChronoUnit.SECONDS); Duration duration=Duration.of(100, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
Thread.sleep(1000l); Thread.sleep(10002);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
System.out.println("browser will be close"); System.out.println("browser will be close");
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
}finally {
driver.quit();
} }
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
} finally { } finally {
try { driver.quit();
driver.quit(); service.stop();
service.stop();
Thread.sleep(3000l);
} catch (InterruptedException e) {
}
} }
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 驱动访问出现异常:" + e.getMessage());
return ""; } finally {
service.stop();
} }
return html; return html;
} }
...@@ -281,18 +277,18 @@ public class SeleniumTime { ...@@ -281,18 +277,18 @@ public class SeleniumTime {
// robot.keyPress(KeyEvent.VK_ENTER);//按下enter键 // robot.keyPress(KeyEvent.VK_ENTER);//按下enter键
robot.keyPress(keycode); robot.keyPress(keycode);
} }
public static void main(String[] args) { public static void main(String[] args) {
//去除html中的相关标签 //去除html中的相关标签
/** /**
* 网上大多是说明直接使用正则表达式不能很好的适用于html * 网上大多是说明直接使用正则表达式不能很好的适用于html
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取 * 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/ */
SeleniumTime s = new SeleniumTime(); SeleniumTime s = new SeleniumTime();
String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html"); String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
String a = "<div class=\"attach_nopermission attach_tips\">"; String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"; String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
System.out.println("开始"); System.out.println("开始");
...@@ -303,7 +299,7 @@ public class SeleniumTime { ...@@ -303,7 +299,7 @@ public class SeleniumTime {
System.out.println("包含b"); System.out.println("包含b");
} }
System.out.println("结束"); System.out.println("结束");
String[] split = scopehtml.split(a); String[] split = scopehtml.split(a);
String sa = split[0]; String sa = split[0];
System.out.println("首次截取的长度"+split.length); System.out.println("首次截取的长度"+split.length);
...@@ -312,31 +308,31 @@ public class SeleniumTime { ...@@ -312,31 +308,31 @@ public class SeleniumTime {
String substring = sb.substring(7); String substring = sb.substring(7);
System.out.println("再次截取的长度"+split2.length); System.out.println("再次截取的长度"+split2.length);
String sab = sa + substring ; String sab = sa + substring ;
// //解决方式 正则匹配删除标签 // //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"] // // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>"; // String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>"; // //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]"; // //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
// //
//// boolean isMatch = regex.matches(scopehtml); //// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch); //// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
//// ////
// // 创建 Pattern 对象 // // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex); // Pattern r = Pattern.compile(regex);
// //
// // 现在创建 matcher 对象 // // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml); // Matcher m = r.matcher(scopehtml);
// if (m.find( )) { // if (m.find( )) {
// System.out.println("Found value: " + m.group(0) ); // System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) ); // System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) ); // System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) ); // System.out.println("Found value: " + m.group(3) );
// } else { // } else {
// System.out.println("NO MATCH"); // System.out.println("NO MATCH");
// } // }
// //
// //
File file = new File("D:/123.txt"); File file = new File("D:/123.txt");
try { try {
PrintStream ps = new PrintStream(new FileOutputStream(file)); PrintStream ps = new PrintStream(new FileOutputStream(file));
...@@ -345,30 +341,30 @@ public class SeleniumTime { ...@@ -345,30 +341,30 @@ public class SeleniumTime {
// TODO Auto-generated catch block // TODO Auto-generated catch block
e.printStackTrace(); e.printStackTrace();
} }
} }
} }
...@@ -41,8 +41,8 @@ public class WebPageScreenShot { ...@@ -41,8 +41,8 @@ public class WebPageScreenShot {
// driver.manage().window().maximize(); // driver.manage().window().maximize();
String js1 = "return document.body.clientHeight.toString()"; String js1 = "return document.body.clientHeight.toString()";
String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + ""; // String js1_result = ((JavascriptExecutor) driver).executeScript(js1) + "";
int height = Integer.parseInt(js1_result); // int height = Integer.parseInt(js1_result);
List<String> files = new ArrayList<String>(); List<String> files = new ArrayList<String>();
int last_t = 0; int last_t = 0;
// for (int i = 0; i < 20; ) { // for (int i = 0; i < 20; ) {
...@@ -80,7 +80,7 @@ public class WebPageScreenShot { ...@@ -80,7 +80,7 @@ public class WebPageScreenShot {
CustomScreenshot customScreenshot=new CustomScreenshot(); CustomScreenshot customScreenshot=new CustomScreenshot();
files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath()); files.add(customScreenshot.fullScreenshotLong(driver).getAbsolutePath());
driver.quit();//退出浏览器 driver.quit();//退出浏览器
boolean flag = merge(files.toArray(new String[]{}), type, resultPath); // boolean flag = merge(files.toArray(new String[]{}), type, resultPath);
// if(flag){ // if(flag){
// InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath)); // InputStream inputStream =new BufferedInputStream(new FileInputStream(resultPath));
// HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadShotInputStream(inputStream, "png");
......
...@@ -133,7 +133,7 @@ public class ArticleCrawlerThread { ...@@ -133,7 +133,7 @@ public class ArticleCrawlerThread {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -361,7 +361,7 @@ public class PaserCommDownload { ...@@ -361,7 +361,7 @@ public class PaserCommDownload {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath { ...@@ -237,9 +237,9 @@ public class WebContentPaserByJsonXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath { ...@@ -332,7 +332,7 @@ public class WebContentPaserByJsonXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -321,9 +321,9 @@ public class WebContentPaserByRegular { ...@@ -321,9 +321,9 @@ public class WebContentPaserByRegular {
try { try {
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent()) if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
||StringUtils.isEmpty(processitem.getPublishDate())){ ||StringUtils.isEmpty(processitem.getPublishDate())){
......
...@@ -364,9 +364,9 @@ public class WebContentPaserByXpath { ...@@ -364,9 +364,9 @@ public class WebContentPaserByXpath {
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("动态爬取"); processitem.setSource("2");
}else{ }else{
processitem.setSource("静态爬取"); processitem.setSource("1");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
...@@ -483,7 +483,7 @@ public class WebContentPaserByXpath { ...@@ -483,7 +483,7 @@ public class WebContentPaserByXpath {
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle()); clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag()); clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag()); clbAnsProcessitem.setContentWithTag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary()); clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor()); clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin()); clbAnsProcessitem.setOrigin(docInfo.getOrigin());
......
...@@ -39,9 +39,9 @@ public class PageConnectioner { ...@@ -39,9 +39,9 @@ public class PageConnectioner {
connection.setRequestProperty("connection", "Keep-Alive"); connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"); connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36");
connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
connection.setRequestProperty("referer", urlstr); connection.setRequestProperty("referer", urlstr);
} catch (Exception e) { } catch (Exception e) {
...@@ -50,7 +50,7 @@ public class PageConnectioner { ...@@ -50,7 +50,7 @@ public class PageConnectioner {
return connection; return connection;
} }
/**构造下载使用的{@link HttpURLConnection} /**构造下载使用的{@link HttpURLConnection}
* @param urlstr 下载url (当参数类型是json字符串时调用) * @param urlstr 下载url (当参数类型是json字符串时调用)
* */ * */
...@@ -60,7 +60,7 @@ public class PageConnectioner { ...@@ -60,7 +60,7 @@ public class PageConnectioner {
HttpURLConnection connection = null; HttpURLConnection connection = null;
String[] headerParam=params.split("\\|"); String[] headerParam=params.split("\\|");
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) { if (false) {
connection = (HttpURLConnection) url.openConnection(proxy); connection = (HttpURLConnection) url.openConnection(proxy);
...@@ -84,14 +84,13 @@ public class PageConnectioner { ...@@ -84,14 +84,13 @@ public class PageConnectioner {
} }
} }
//参数类型是json字符串用到 //参数类型是json字符串用到
connection.setRequestProperty("Content-Type","application/json"); connection.setRequestProperty("Content-Type","application/json");
} catch (Exception e) { } catch (Exception e) {
//
} }
return connection; return connection;
} }
/** /**
* 该方法为代理IP * 该方法为代理IP
*/ */
...@@ -102,7 +101,7 @@ public class PageConnectioner { ...@@ -102,7 +101,7 @@ public class PageConnectioner {
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) { if (false) {
connection = (HttpURLConnection) url.openConnection(proxy); connection = (HttpURLConnection) url.openConnection(proxy);
} else { } else {
...@@ -126,7 +125,7 @@ public class PageConnectioner { ...@@ -126,7 +125,7 @@ public class PageConnectioner {
URL url = null; URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ipadd, Integer.parseInt(prot))); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ipadd, Integer.parseInt(prot)));
HttpsURLConnection connection = null; HttpsURLConnection connection = null;
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) { if (false) {
...@@ -142,7 +141,7 @@ public class PageConnectioner { ...@@ -142,7 +141,7 @@ public class PageConnectioner {
connection.setRequestProperty("User-Agent", "Mozilla/5.0 " connection.setRequestProperty("User-Agent", "Mozilla/5.0 "
+ "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) " + "Gecko/20080404 Chrome/54.0.2840.99"); + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) " + "Gecko/20080404 Chrome/54.0.2840.99");
connection.setRequestProperty("X-Terminal-Type", "pc"); connection.setRequestProperty("X-Terminal-Type", "pc");
} catch (Exception e) { } catch (Exception e) {
} }
...@@ -157,17 +156,16 @@ public class PageConnectioner { ...@@ -157,17 +156,16 @@ public class PageConnectioner {
URL url = null; URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT)); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpsURLConnection connection = null; HttpsURLConnection connection = null;
trustAllHttpsCertificates(); try {
HostnameVerifier hv = new HostnameVerifier() { trustAllHttpsCertificates();
@Override HostnameVerifier hv = new HostnameVerifier() {
public boolean verify(String urlHostName, SSLSession session) { @Override
return true; public boolean verify(String urlHostName, SSLSession session) {
} return true;
}
};
HttpsURLConnection.setDefaultHostnameVerifier(hv); };
try{ HttpsURLConnection.setDefaultHostnameVerifier(hv);
url = new URL(urlstr); url = new URL(urlstr);
if (false) { if (false) {
connection = (HttpsURLConnection) url.openConnection(proxy); connection = (HttpsURLConnection) url.openConnection(proxy);
...@@ -180,14 +178,12 @@ public class PageConnectioner { ...@@ -180,14 +178,12 @@ public class PageConnectioner {
connection.setRequestProperty("connection", "Keep-Alive"); connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"); connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
} catch (Exception e) {
} //
catch(Exception e){
} }
return connection; return connection;
} }
/**构造下载使用的{@link HttpsURLConnection} /**构造下载使用的{@link HttpsURLConnection}
* @param urlstr 下载url * @param urlstr 下载url
* @return * @return
...@@ -203,7 +199,7 @@ public class PageConnectioner { ...@@ -203,7 +199,7 @@ public class PageConnectioner {
public boolean verify(String urlHostName, SSLSession session) { public boolean verify(String urlHostName, SSLSession session) {
return true; return true;
} }
}; };
HttpsURLConnection.setDefaultHostnameVerifier(hv); HttpsURLConnection.setDefaultHostnameVerifier(hv);
try{ try{
...@@ -232,7 +228,7 @@ public class PageConnectioner { ...@@ -232,7 +228,7 @@ public class PageConnectioner {
catch(Exception e){ catch(Exception e){
} }
return connection; return connection;
} }
/** /**
...@@ -252,16 +248,16 @@ public class PageConnectioner { ...@@ -252,16 +248,16 @@ public class PageConnectioner {
break; break;
} catch (Exception e1) { } catch (Exception e1) {
try { try {
Thread.sleep(10000); Thread.sleep(2000);
} catch (InterruptedException e2) { } catch (InterruptedException e2) {
// logUtil.getLogger().error(String.format("ORMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e2))); //
} }
} }
} }
} }
return docBody; return docBody;
} }
/** /**
* @param url * @param url
* @param encoding * @param encoding
...@@ -298,7 +294,7 @@ public class PageConnectioner { ...@@ -298,7 +294,7 @@ public class PageConnectioner {
} }
protected String dynamicHttpsConnectByPost(String url, String encoding) { protected String dynamicHttpsConnectByPost(String url, String encoding) {
return null; return null;
} }
/** /**
* http get方法下载 static链接网页 * http get方法下载 static链接网页
...@@ -309,16 +305,24 @@ public class PageConnectioner { ...@@ -309,16 +305,24 @@ public class PageConnectioner {
protected String staticConnectByGet(String url, String encoding,String headerParams) { protected String staticConnectByGet(String url, String encoding,String headerParams) {
//循环访问的超时时间 //循环访问的超时时间
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.connection(url,headerParams)); connection = this.connection(url,headerParams);
} catch (Exception e3) { pg = new PageGet(url, encoding, connection);
} catch (Exception e1) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pg.urlConnectionGet(); pg.urlConnectionGet();
docBody = pg.getPageStr(); docBody = pg.getPageStr();
...@@ -356,12 +360,18 @@ public class PageConnectioner { ...@@ -356,12 +360,18 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.connection(url)); connection = this.connection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pg.urlConnectionGet(); pg.urlConnectionGet();
docBody = pg.getPageStr(); docBody = pg.getPageStr();
...@@ -393,12 +403,18 @@ public class PageConnectioner { ...@@ -393,12 +403,18 @@ public class PageConnectioner {
* @return * @return
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 3000; long exitTimeDis = 10000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
HttpsURLConnection connection = null;
try { try {
pg = new PageGet(url, encoding, this.httpsconnection(url)); connection = this.httpsconnection(url);
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
//
} finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
...@@ -406,15 +422,15 @@ public class PageConnectioner { ...@@ -406,15 +422,15 @@ public class PageConnectioner {
docBody = pg.getPageStr(); docBody = pg.getPageStr();
// 测试导出文件 // 测试导出文件
// byte[] buff=new byte[]{}; // byte[] buff=new byte[]{};
// buff=docBody.getBytes(); // buff=docBody.getBytes();
// FileOutputStream out=null; // FileOutputStream out=null;
// try { // try {
// out = new FileOutputStream("D://out.txt"); // out = new FileOutputStream("D://out.txt");
// out.write(buff,0,buff.length); // out.write(buff,0,buff.length);
// } catch (IOException e1) { // } catch (IOException e1) {
// e1.printStackTrace(); // e1.printStackTrace();
// } // }
return docBody; return docBody;
} catch (Exception e) { } catch (Exception e) {
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
...@@ -444,7 +460,7 @@ public class PageConnectioner { ...@@ -444,7 +460,7 @@ public class PageConnectioner {
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame,String params) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame,String params) {
long exitTimeDis = 3000; long exitTimeDis = 3000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
try { try {
...@@ -526,15 +542,15 @@ public class PageConnectioner { ...@@ -526,15 +542,15 @@ public class PageConnectioner {
// return docBody; // return docBody;
// } // }
/** /**
* http post方法下载 static链接网页 * http post方法下载 static链接网页
* @param url 下载链接 * @param url 下载链接
* @param encoding 页面编码 * @param encoding 页面编码
* @param postParam post参数,格式为raw(A=a&B=b) * @param postParam post参数,格式为raw(A=a&B=b)
* @return 下载的内容 * @return 下载的内容
*/ */
protected String staticConnectByPost(String url, String encoding, String postParam) { protected String staticConnectByPost(String url, String encoding, String postParam) {
long exitTimeDis = 30000; long exitTimeDis = 30000;
...@@ -542,17 +558,25 @@ public class PageConnectioner { ...@@ -542,17 +558,25 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
try { HttpURLConnection connection = null;
try {
if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂 if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param); connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
pp = new PagePost(url, encoding, this.connection(url), postParam); connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pp.urlConnectionPost(); pp.urlConnectionPost();
docBody = pp.getPageStr(); docBody = pp.getPageStr();
...@@ -581,7 +605,7 @@ public class PageConnectioner { ...@@ -581,7 +605,7 @@ public class PageConnectioner {
* http post方法下载 static链接网页 * http post方法下载 static链接网页
* @param url 下载链接 * @param url 下载链接
* @param postParam post参数,格式为raw(A=a&B=b) * @param postParam post参数,格式为raw(A=a&B=b)
* @return 下载的内容 * @return 下载的内容
*/ */
protected String staticConnectByPost(String url, String postParam) { protected String staticConnectByPost(String url, String postParam) {
long exitTimeDis = 30000; long exitTimeDis = 30000;
...@@ -589,17 +613,25 @@ public class PageConnectioner { ...@@ -589,17 +613,25 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
try { HttpURLConnection connection = null;
try {
if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂 if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
pp = new PagePost(url, encoding, this.connection(url,param),param); connection = this.connection(url,param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
pp = new PagePost(url, encoding, this.connection(url), postParam); connection = this.connection(url);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pp.urlConnectionPost(); pp.urlConnectionPost();
docBody = pp.getPageStr(); docBody = pp.getPageStr();
...@@ -632,15 +664,20 @@ public class PageConnectioner { ...@@ -632,15 +664,20 @@ public class PageConnectioner {
*/ */
protected String staticHttpsConnectByPost(String url, String encoding, String param) { protected String staticHttpsConnectByPost(String url, String encoding, String param) {
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
HttpsURLConnection connection = null;
PagePost pp = null; PagePost pp = null;
try { try {
pp = new PagePost(url, encoding, this.httpsconnection(url),param); connection = this.httpsconnection(url);
pp = new PagePost(url, encoding, connection, param);
} catch (Exception e3) { } catch (Exception e3) {
// TODO Auto-generated catch block //
e3.printStackTrace(); } finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
pp.urlHttpsConnectionPost(); pp.urlHttpsConnectionPost();
...@@ -667,7 +704,7 @@ public class PageConnectioner { ...@@ -667,7 +704,7 @@ public class PageConnectioner {
return docBody; return docBody;
} }
/** /**
* *
* @param urlstr * @param urlstr
* @param bSave * @param bSave
* @return * @return
...@@ -693,7 +730,7 @@ public class PageConnectioner { ...@@ -693,7 +730,7 @@ public class PageConnectioner {
String pageStr=""; String pageStr="";
try { try {
HtmlPage htmlPage = webClient.getPage(urlstr); HtmlPage htmlPage = webClient.getPage(urlstr);
webClient.waitForBackgroundJavaScript(600000); webClient.waitForBackgroundJavaScript(300000);
pageStr = htmlPage.asXml(); pageStr = htmlPage.asXml();
}catch (Exception e){ }catch (Exception e){
...@@ -740,15 +777,14 @@ public class PageConnectioner { ...@@ -740,15 +777,14 @@ public class PageConnectioner {
// JavaScriptPage scriptPage = (JavaScriptPage) page; // JavaScriptPage scriptPage = (JavaScriptPage) page;
// pageStr = scriptPage.getContent(); // pageStr = scriptPage.getContent();
// } // }
} catch (Exception e) { } catch (Exception e) {
}finally { }finally {
webClient.close(); webClient.close();
} }
return pageStr; return pageStr;
} }
private static void trustAllHttpsCertificates() throws Exception { private static void trustAllHttpsCertificates() throws Exception {
javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1]; javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
javax.net.ssl.TrustManager tm = new miTM(); javax.net.ssl.TrustManager tm = new miTM();
......
...@@ -34,7 +34,7 @@ public class PageDownloader { ...@@ -34,7 +34,7 @@ public class PageDownloader {
this.bDownloadUseFrame = b; this.bDownloadUseFrame = b;
} }
public PageDownloader(){ public PageDownloader(){
} }
Timer timer; Timer timer;
public PageDownloader(long sec) { public PageDownloader(long sec) {
...@@ -49,39 +49,46 @@ public class PageDownloader { ...@@ -49,39 +49,46 @@ public class PageDownloader {
// 如果页面编码格式未知,则从页面中获取该页面编码格式 // 如果页面编码格式未知,则从页面中获取该页面编码格式
public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException { public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException {
connection.setRequestMethod("GET");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+ "Gecko/20080404 Firefox/2.0.0.14");
connection.setRequestProperty("referer", urlstr);
connection.setRequestProperty("Cookie", "auth=token");
String contentType = connection.getHeaderField("Content-Type");
String encoding = null; String encoding = null;
if (contentType != null) { try {
String temp = "charset="; connection.setRequestMethod("GET");
int m = contentType.indexOf(temp); connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
if (m != -1) { + "Gecko/20080404 Firefox/2.0.0.14");
encoding = contentType.substring(m + temp.length()).replace("]", ""); connection.setRequestProperty("referer", urlstr);
connection.setRequestProperty("Cookie", "auth=token");
String contentType = connection.getHeaderField("Content-Type");
if (contentType != null) {
String temp = "charset=";
int m = contentType.indexOf(temp);
if (m != -1) {
encoding = contentType.substring(m + temp.length()).replace("]", "");
}
} }
} if (encoding == null) {
if (encoding == null) {
try {
InputStream is = null; InputStream is = null;
is = connection.getInputStream(); try {
BufferedInputStream bufferedInputStream = new BufferedInputStream(is); is = connection.getInputStream();
encoding = EncodeDetector.getEncoding(bufferedInputStream); BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
is.close(); encoding = EncodeDetector.getEncoding(bufferedInputStream);
} catch (Exception e) { } catch (Exception e) {
//
}finally {
assert is != null;
is.close();
}
} }
} catch (Exception e) {
//
} finally {
connection.disconnect();
} }
connection.disconnect();
return encoding; return encoding;
} }
// Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接 // Document接口,主要针对html,txt,deng网页,通过get方式获取,动态或者静态链接
public Document downloadWithDoc(String url, String encoding, boolean bDynamic,boolean bFrame) { public Document downloadWithDoc(String url, String encoding, boolean bDynamic,boolean bFrame) {
Document doc = null; Document doc = null;
String docBody=""; String docBody="";
if (false) { if (false) {
...@@ -117,7 +124,7 @@ public class PageDownloader { ...@@ -117,7 +124,7 @@ public class PageDownloader {
} }
return doc; return doc;
} }
// Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接 // Document接口,主要针对jsonHtml类型配置文件,通过get方式获取,动态或者静态链接
public Document downloadWithJsonHtml(String url, String encoding, boolean bDynamic, boolean bFrame, public Document downloadWithJsonHtml(String url, String encoding, boolean bDynamic, boolean bFrame,
String bodyPath) { String bodyPath) {
...@@ -154,30 +161,24 @@ public class PageDownloader { ...@@ -154,30 +161,24 @@ public class PageDownloader {
} }
/** String接口,主要针对html网页,通过get方式获取,动态或者静态链接,bFrame为false时一般是解析json格式书籍*/ /** String接口,主要针对html网页,通过get方式获取,动态或者静态链接,bFrame为false时一般是解析json格式书籍*/
public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) { public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval){ if (interval > 0 && lastDownloadTime > 0 && dis < interval){
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner(); PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null;
try { try {
connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) {//获取网站编码 if (encoding == null || encoding.isEmpty()) {//获取网站编码
// encoding = getEncodingFromHtmlFile(url, connection);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
encoding = paserSiteDownload.locateCharSet(url); encoding = paserSiteDownload.locateCharSet(url);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace();
log.info("获取编码失败"); log.info("获取编码失败");
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding); docBody = pConn.dynamicConnectByGet(url, encoding);
} else { } else {
// this.bDownloadUseFrame=true;
if (bFrame && this.bDownloadUseFrame) { if (bFrame && this.bDownloadUseFrame) {
String body = null; String body = null;
try { try {
...@@ -196,12 +197,11 @@ public class PageDownloader { ...@@ -196,12 +197,11 @@ public class PageDownloader {
} }
if(url.contains("https:")){ if(url.contains("https:")){
try { try {
connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8"; encoding = "utf-8";
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); //
} }
docBody = pConn.staticHttpsConnectByGet(url, encoding,false); docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
}else{ }else{
...@@ -211,7 +211,7 @@ public class PageDownloader { ...@@ -211,7 +211,7 @@ public class PageDownloader {
this.lastDownloadTime = System.currentTimeMillis(); this.lastDownloadTime = System.currentTimeMillis();
return docBody; return docBody;
} }
public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic,boolean bFrame,String headerParams) { public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic,boolean bFrame,String headerParams) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
...@@ -221,7 +221,7 @@ public class PageDownloader { ...@@ -221,7 +221,7 @@ public class PageDownloader {
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
logUtil.getLogger().info(info); logUtil.getLogger().info(info);
Thread.sleep(dis+2000); Thread.sleep(dis+2000);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e ))); logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e )));
}*/ }*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
...@@ -237,6 +237,9 @@ public class PageDownloader { ...@@ -237,6 +237,9 @@ public class PageDownloader {
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
...@@ -264,7 +267,7 @@ public class PageDownloader { ...@@ -264,7 +267,7 @@ public class PageDownloader {
} }
if(url.contains("https:")){ if(url.contains("https:")){
try { try {
connection = pConn.httpsconnection(url); // connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8"; encoding = "utf-8";
} }
...@@ -306,7 +309,7 @@ public class PageDownloader { ...@@ -306,7 +309,7 @@ public class PageDownloader {
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = getEncodingFromHtmlFile(url, connection); encoding = getEncodingFromHtmlFile(url, connection);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
} }
String docBody = null; String docBody = null;
...@@ -334,7 +337,7 @@ public class PageDownloader { ...@@ -334,7 +337,7 @@ public class PageDownloader {
} }
} }
docBody = pConn.staticConnectByGet(url, encoding); docBody = pConn.staticConnectByGet(url, encoding);
if (isBadDownloadPage(docBody) && this.badPage) { if (isBadDownloadPage(docBody) && this.badPage) {
return docBody; return docBody;
} }
...@@ -344,7 +347,7 @@ public class PageDownloader { ...@@ -344,7 +347,7 @@ public class PageDownloader {
} }
/** String接口,目前用于豆瓣API图书的爬取 */ /** String接口,目前用于豆瓣API图书的爬取 */
public String downloadPoxyWithStrAPI(String url, String encoding, boolean bDynamic, boolean bFrame) { public String downloadPoxyWithStrAPI(String url, String encoding, boolean bDynamic, boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval) { if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
/*try { /*try {
...@@ -366,8 +369,11 @@ public class PageDownloader { ...@@ -366,8 +369,11 @@ public class PageDownloader {
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = getEncodingFromHtmlFile(url, connection); encoding = getEncodingFromHtmlFile(url, connection);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
...@@ -399,7 +405,7 @@ public class PageDownloader { ...@@ -399,7 +405,7 @@ public class PageDownloader {
this.lastDownloadTime = System.currentTimeMillis(); this.lastDownloadTime = System.currentTimeMillis();
return docBody; return docBody;
} }
// String接口,主要针对html网页或者json网页,通过post方式获取,默认静态链接 // String接口,主要针对html网页或者json网页,通过post方式获取,默认静态链接
public String downloadWithStr(String url, String encoding, String param) { public String downloadWithStr(String url, String encoding, String param) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
...@@ -409,7 +415,7 @@ public class PageDownloader { ...@@ -409,7 +415,7 @@ public class PageDownloader {
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
logUtil.getLogger().info(info); logUtil.getLogger().info(info);
Thread.sleep(dis+2000); Thread.sleep(dis+2000);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e))); logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
}*/ }*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
...@@ -444,7 +450,7 @@ public class PageDownloader { ...@@ -444,7 +450,7 @@ public class PageDownloader {
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
logUtil.getLogger().info(info); logUtil.getLogger().info(info);
Thread.sleep(dis+2000); Thread.sleep(dis+2000);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e))); logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s", ExceptionUtil.getExceptionStr(e)));
}*/ }*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
...@@ -493,7 +499,6 @@ public class PageDownloader { ...@@ -493,7 +499,6 @@ public class PageDownloader {
return true; return true;
} }
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block
return true; return true;
} }
return false; return false;
...@@ -501,7 +506,7 @@ public class PageDownloader { ...@@ -501,7 +506,7 @@ public class PageDownloader {
/** /**
* 向指定URL发送GET方法的请求 * 向指定URL发送GET方法的请求
* *
* @param url * @param url
* 发送请求的URL * 发送请求的URL
* 只用于塔读APP * 只用于塔读APP
...@@ -550,5 +555,5 @@ public class PageDownloader { ...@@ -550,5 +555,5 @@ public class PageDownloader {
} }
} }
return result; return result;
} }
} }
...@@ -21,7 +21,7 @@ public class ClbAnsProcessitem { ...@@ -21,7 +21,7 @@ public class ClbAnsProcessitem {
/**正文*/ /**正文*/
private String content; private String content;
private String contentWithtag; private String contentWithTag;
/**未知*/ /**未知*/
...@@ -94,4 +94,4 @@ public class ClbAnsProcessitem { ...@@ -94,4 +94,4 @@ public class ClbAnsProcessitem {
/**(临时处理)关联的专题id*/ /**(临时处理)关联的专题id*/
private List<String> subjectIds; private List<String> subjectIds;
} }
\ No newline at end of file
...@@ -50,7 +50,7 @@ public class KafkaConsumerJob { ...@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
// latest earliest // latest earliest
//时间间隔设置为1h //时间间隔设置为1h
// properties.put("max.poll.interval.ms", 60*60*1000); // properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 60*60*1000); properties.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 2*60*60*1000);
properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,25000); properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,25000);
properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000); properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1); properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
...@@ -62,11 +62,11 @@ public class KafkaConsumerJob { ...@@ -62,11 +62,11 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled(cron = "0 0/5 * * * ?") @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") // @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE); // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
log.info("进入定时获取mq消息"); log.info("进入定时获取topic消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
// 消费某个主题的某个分区数据 // 消费某个主题的某个分区数据
...@@ -83,7 +83,6 @@ public class KafkaConsumerJob { ...@@ -83,7 +83,6 @@ public class KafkaConsumerJob {
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000)); ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(3000));
//手动提交已消费数据的offset //手动提交已消费数据的offset
// consumer.commitAsync();
consumer.commitSync(); consumer.commitSync();
if (records != null && records.count() > 0) { if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) { for (ConsumerRecord record : records) {
...@@ -98,13 +97,19 @@ public class KafkaConsumerJob { ...@@ -98,13 +97,19 @@ public class KafkaConsumerJob {
} }
} }
} }
} }
}catch (Exception e){ }catch (Exception e){
// consumer.commitSync(); //退出应用程序前使用close方法关闭消费者,网络连接和socket也会随之关闭,并立即触发一次再均衡
log.info(e.getMessage()); consumer.close();
// consumer = createConsumer(); System.out.println("error!!!!!!!!!!!");
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC)); consumer = createConsumer();
// 消费某个主题的某个分区数据
kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions1 = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions1.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions1[i])));
}
consumer.assign(topicPartitions);
} }
} }
......
...@@ -35,8 +35,8 @@ PROXYID=1 ...@@ -35,8 +35,8 @@ PROXYID=1
#线程池大小 #线程池大小
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= E:\\chrome\\chromedriver.exe CHROMEDRIVE= D:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
#mysql connection #mysql connection
...@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -52,7 +52,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#KAFKA_CONSUMER_TOPIC = staticCrawlTopic #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin
# #
KAFKA_CONSUMER_GROUP_ID=dynamin-sync KAFKA_CONSUMER_GROUP_ID=test-zs1
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
KAFKA_PRODUCT_TOPIC=crawlerInfo KAFKA_PRODUCT_TOPIC=crawlerInfo
...@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo ...@@ -62,16 +62,16 @@ KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q= META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word= #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#指定分区使用逗号分割 #指定分区使用逗号分割
KAFKA_CONSUMER_PARTITION=0 KAFKA_CONSUMER_PARTITION=0,1,2,3
#KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 #KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
KAFKA_PRODUCT_PARTITION=0 KAFKA_PRODUCT_PARTITION=0
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=114.116.26.150
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=zzsn9988
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
......
...@@ -17,19 +17,19 @@ public class ThreadExecutorConfig { ...@@ -17,19 +17,19 @@ public class ThreadExecutorConfig {
@Bean(value = "asyncTaskExecutor") @Bean(value = "asyncTaskExecutor")
public Executor executor() { public Executor executor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量 executor.setCorePoolSize(2);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量 executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(5000);//缓存队列 executor.setQueueCapacity(5000);//缓存队列
executor.setThreadNamePrefix("ssmsExecutor-"); executor.setThreadNamePrefix("ssmsExecutor-");
/** /**
* 对拒绝task的处理策略 * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//允许的空闲时间 executor.setKeepAliveSeconds(60);//允许的空闲时间
executor.initialize(); executor.initialize();
return executor; return executor;
} }
@Bean(value = "asyncTaskExecutorSelenium") @Bean(value = "asyncTaskExecutorSelenium")
...@@ -139,4 +139,4 @@ public class ThreadExecutorConfig { ...@@ -139,4 +139,4 @@ public class ThreadExecutorConfig {
executor.initialize(); executor.initialize();
return executor; return executor;
} }
} }
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论