提交 af30a040 作者: liuweigang

通用采集代码更新

上级 649ac47c
...@@ -11,3 +11,5 @@ ...@@ -11,3 +11,5 @@
...@@ -3,9 +3,12 @@ package com.zzsn; ...@@ -3,9 +3,12 @@ package com.zzsn;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.DynaminSiteThread; import com.zzsn.crawler.DynaminSiteThread;
import com.zzsn.crawler.SiteThread;
import com.zzsn.crawlerOther.ArticleCrawler; import com.zzsn.crawlerOther.ArticleCrawler;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.generation.FileUtil;
import com.zzsn.job.JedisUtil;
import com.zzsn.job.KafkaConsumerJob; import com.zzsn.job.KafkaConsumerJob;
import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.ConsumerRecords;
...@@ -20,7 +23,9 @@ import org.springframework.boot.web.servlet.ServletComponentScan; ...@@ -20,7 +23,9 @@ import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import org.springframework.context.ConfigurableApplicationContext; import org.springframework.context.ConfigurableApplicationContext;
import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
@SpringBootApplication(scanBasePackages="com.zzsn") @SpringBootApplication(scanBasePackages="com.zzsn")
//@ServletComponentScan //@ServletComponentScan
...@@ -39,17 +44,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -39,17 +44,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
} }
// Entry point invoked with the program arguments (presumably the Spring Boot CommandLineRunner
// callback; the class's implements clause is truncated in this view - TODO confirm).
// This span is a side-by-side diff rendering: a row shown twice carries the old text followed by
// the new text, a row shown once exists on only one side of the change.
// Visible behavior: call loadSiteMsg(); if that call throws, call loadSiteMsg() one more time.
// The second call is not guarded, so a repeated failure propagates out of run().
// Every loadSiteMsgLoc*() call below is commented out.
// NOTE(review): the catch block retries without logging e, so the first failure is never reported.
@Override @Override
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
// loadSiteMsg();
try {
loadSiteMsg();
} catch (Exception e) {
loadSiteMsg();
}
// loadSiteMsgLoc(); // loadSiteMsgLoc();
// loadSiteMsgLoc2(); // loadSiteMsgLoc2();
// loadSiteMsgLoc3(); // loadSiteMsgLoc3();
// loadSiteMsgLoc4();
// loadSiteMsgLoc5();
// loadSiteMsgLoc6();
// loadSiteMsgLoc7();
// loadSiteMsgLoc8();
// loadSiteMsgLoc9();
// loadSiteMsgLoc10();
} }
public void loadSiteMsg(){ public void loadSiteMsg(){
try{ try{
...@@ -90,88 +93,31 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -90,88 +93,31 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
loadSiteMsg(); loadSiteMsg();
} }
} }
// loadSiteMsgLoc, rendered as a side-by-side diff: paired rows hold the old text followed by the
// new text; rows shown once belong to one side only (here: the longer old body).
// Old text (left): builds one hard-coded InfoSourceDTO JSON document (list page
//   https://www.audit.gov.cn/n6/n36/index.html, infoSourceCode IN-20220628-0001), deserializes it
//   with Gson into a SiteMsgTemple, assigns it to a new DynaminSiteThread and calls crawler().
// New text (right): takes a file path from Constants.IMGPATH, prints it, reads the file's lines
//   with FileUtil.getFileLines(f, "utf-8"), prints the line count, and for each line:
//     - fetches JedisUtil.getString("INFO_SOURCE_TO_REDIS::" + line)
//       (presumably a Redis lookup keyed by the line's content - TODO confirm),
//     - deserializes the result with Gson into a SiteMsgTemple and calls setYnDynamicCrawl(1),
//     - assigns it to a new DynaminSiteThread and calls crawler().
//   An exception raised for one line is caught and the loop moves on to the next line;
//   an exception raised outside the loop is caught as well.
// NOTE(review): both catch blocks discard the exception (the outer one only evaluates
//   e.getMessage() and drops the result), so a missing file or a failed lookup leaves no trace.
// NOTE(review): Gson.fromJson returns null for a null JSON string, so an absent key would
//   presumably surface as a NullPointerException at setYnDynamicCrawl(1) and be skipped
//   silently - confirm that this is intended.
public void loadSiteMsgLoc(){ public void loadSiteMsgLoc() {
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class); String filepath= Constants.IMGPATH;
// kafkaConsumerJob.consumer(); System.out.println(filepath);
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class); // String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
// articleCrawler.consumer(); try {
System.out.println("——————++++++++++++——————==="); File f = new File(filepath);
String value="{\n" + List<String> allLines = FileUtil.getFileLines(f, "utf-8");
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + System.out.println(allLines.size());
" \"id\": \"1541605392350359554\",\n" + for (String keysite:allLines) {
" \"infoSourceCode\": \"IN-20220628-0001\",\n" + try {
" \"webSiteName\": \"审计署\",\n" + String value = JedisUtil.getString("INFO_SOURCE_TO_REDIS::"+keysite);
" \"siteName\": \"审计署-法律法规\",\n" + System.out.println("——————++++++++++++——————===");
" \"siteUri\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n" + SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
" \"infoSourceTypeId\": \"1\",\n" + DynaminSiteThread siteThread = new DynaminSiteThread();
" \"siteLevel\": null,\n" + siteMsgTemple.setYnDynamicCrawl(1);
" \"language\": null,\n" + // siteMsgTemple.getYnDynamicCrawl()
" \"checkedList\": null,\n" + siteThread.siteMsgTemple = siteMsgTemple;
" \"hisUriExp\": null,\n" + siteThread.crawler();
" \"hisDateStartTime\": null,\n" + }catch (Exception e){
" \"hisDateEndTime\": null,\n" + continue;
" \"ynHisDataAll\": \"0\",\n" + }
" \"status\": null,\n" + }
" \"listUrl\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n" + }catch (Exception e){
" \"listExpressionType\": \"3\",\n" + e.getMessage();
" \"informationUrl\": null,\n" + }
" \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"list-box-dl\\\"]>span>dl\",\n" +
" \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": null,\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>dd[class=\\\"fb-time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>dd[class=\\\"ly-name\\\"]</exp></origin>\",\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"textSize\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"21 12 10 1/1 * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
} }
public void loadSiteMsgLoc2(){ public void loadSiteMsgLoc2(){
...@@ -182,59 +128,59 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -182,59 +128,59 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1541618011601838081\",\n" + " \"id\": \"1534691893595426818\",\n" +
" \"infoSourceCode\": \"IN-20220628-0002\",\n" + " \"infoSourceCode\": \"IN-20220609-50867\",\n" +
" \"webSiteName\": \"北京市审计局\",\n" + " \"webSiteName\": \"美国CNN有线电视新闻网\",\n" +
" \"siteName\": \"北京市审计局-法律法规\",\n" + " \"siteName\": \"美国CNN有线电视新闻网-world\",\n" +
" \"siteUri\": \"http://sjj.beijing.gov.cn/zwxx/flfg/\",\n" + " \"siteUri\": \"https://edition.cnn.com/world\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": \"2\",\n" +
" \"language\": null,\n" + " \"language\": \"en\",\n" +
" \"checkedList\": null,\n" + " \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" + " \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": null,\n" +
" \"status\": null,\n" + " \"status\": \"1\",\n" +
" \"listUrl\": \"http://sjj.beijing.gov.cn/zwxx/flfg/\",\n" + " \"listUrl\": null,\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"0\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": \"https://edition\\\\.cnn\\\\.com/[\\\\d]{1,}/[\\\\d]{1,}/[\\\\d]{1,}/.*\",\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": null,\n" +
" \"informationPublishDate\": null,\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[class=\\\"list\\\"]>li\",\n" + " \"infoBlockPosition\": null,\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": null,\n" +
" \"extractInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\": \\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": 3,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": null,\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"0\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"title\\\"]>h1</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>*.h1[class=\\\"pg-headline\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"pubdate\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>*.p[class=\\\"update-time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>*.div[class=\\\"l-container\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\": \\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": null,\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\": 0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"mapping\\\": \\\"\\\", \\\"expression\\\": \\\"\\\", \\\"primaryKey\\\": \\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" + " \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" + " \"ynDataPageAll\": null,\n" +
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" + " \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 1,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
...@@ -249,8 +195,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -249,8 +195,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlType\": 1,\n" + " \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": null,\n" +
" \"cron\": \"30 02 11 1/1 * ?\"\n" + " \"cron\": \"12 5 0/10 * * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -266,211 +212,43 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -266,211 +212,43 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1541670628478623746\",\n" + " \"id\": \"1534691893595426818\",\n" +
" \"infoSourceCode\": \"IN-20220628-0003\",\n" + " \"infoSourceCode\": \"IN-20220609-50867\",\n" +
" \"webSiteName\": \"上海市审计厅\",\n" + " \"webSiteName\": \"美国CNN有线电视新闻网\",\n" +
" \"siteName\": \"上海市审计厅-规范性文件\",\n" + " \"siteName\": \"美国CNN有线电视新闻网-world\",\n" +
" \"siteUri\": \"https://sjj.sh.gov.cn/zcwj_gfxwj/index.html\",\n" + " \"siteUri\": \"https://edition.cnn.com/world\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": \"2\",\n" +
" \"language\": null,\n" + " \"language\": \"en\",\n" +
" \"checkedList\": null,\n" + " \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" + " \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": \"1\",\n" +
" \"listUrl\": \"https://sjj.sh.gov.cn/zcwj_gfxwj/index.html\",\n" + " \"listUrl\": null,\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"0\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": \"https://edition.cnn.com/[\\\\d]{1,}/[\\\\d]{1,}/[\\\\d]{1,}/.*\",\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": null,\n" +
" \"informationPublishDate\": \"span\",\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[class=\\\"zfgk_area_list\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"ivs_content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"35 31 14 1/1 * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
}
// Runs one crawl from a hard-coded configuration: the JSON document below (list page
// https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test, infoSourceCode
// IN-20220628-0004) is deserialized with Gson into a SiteMsgTemple, attached to a new
// DynaminSiteThread, and crawler() is called on it.
// In this diff view every row of the method appears once (no paired text), i.e. it exists on
// only one side of the change - presumably it was removed by the commit; TODO confirm.
public void loadSiteMsgLoc4(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System.out.println("——————++++++++++++——————===");
String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1541705220539490306\",\n" +
" \"infoSourceCode\": \"IN-20220628-0004\",\n" +
" \"webSiteName\": \"湖北省审计厅\",\n" +
" \"siteName\": \"湖北省审计厅-规范性文件\",\n" +
" \"siteUri\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" +
" \"listUrl\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n" +
" \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" +
" \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": \"span\",\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[id=\\\"ulList\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"article\\\"]>h2</exp></title>\",\n" +
" \"detailExpressionPublishDate\": null,\n" +
" \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article-box\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"02 49 16 1/1 * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
}
public void loadSiteMsgLoc5(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System.out.println("——————++++++++++++——————===");
String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1541722286336188418\",\n" +
" \"infoSourceCode\": \"IN-20220628-0005\",\n" +
" \"webSiteName\": \"审计署\",\n" +
" \"siteName\": \"审计署-审计要闻\",\n" +
" \"siteUri\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" +
" \"listUrl\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n" +
" \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" +
" \"informationTitle\": \"dt[class=\\\"fl\\\"]>a\",\n" +
" \"informationPublishDate\": null,\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"span[id=\\\"comp_10044770\\\"]>dl\",\n" + " \"infoBlockPosition\": null,\n" +
" \"linkLocation\": \"dt[class=\\\"fl\\\"]>a\",\n" + " \"linkLocation\": null,\n" +
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": 3,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"0\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"con-article-title\\\"]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>*.h1[class=\\\"pg-headline\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>dd[class=\\\"fb-time\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>*.p[class=\\\"update-time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"textSize\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>*.div[class=\\\"l-container\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
...@@ -486,7 +264,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -486,7 +264,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" + " \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 1,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
...@@ -502,256 +280,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -502,256 +280,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"51 56 1/2 * * ?\"\n" + " \"cron\": \"12 5 0/10 * * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
}
// Runs one crawl from a hard-coded configuration: the JSON document below (list page
// https://sjj.sh.gov.cn/n388/index.html, infoSourceCode IN-20220628-0006) is deserialized with
// Gson into a SiteMsgTemple, attached to a new DynaminSiteThread, and crawler() is called on it.
// Unlike the sibling fixtures visible in this view, listExpressionType, detailExpressionType and
// all detailExpression* fields are null here.
// In this diff view every row of the method appears once (no paired text), i.e. it exists on
// only one side of the change - presumably it was removed by the commit; TODO confirm.
public void loadSiteMsgLoc6(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System.out.println("——————++++++++++++——————===");
String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1541723496678105090\",\n" +
" \"infoSourceCode\": \"IN-20220628-0006\",\n" +
" \"webSiteName\": \"上海市审计局\",\n" +
" \"siteName\": \"上海市审计局-审计要闻\",\n" +
" \"siteUri\": \"https://sjj.sh.gov.cn/n388/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" +
" \"listUrl\": \"https://sjj.sh.gov.cn/n388/index.html\",\n" +
" \"listExpressionType\": null,\n" +
" \"informationUrl\": null,\n" +
" \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" +
// NOTE(review): the selector below starts with "u1[" (letter u, digit one); the sibling
// fixtures use "ul[...]", so this is presumably a typo for "ul" - verify against the page.
" \"infoBlockPosition\": \"u1[class=\\\"dtul dtul1\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": null,\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": null,\n" +
" \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": null,\n" +
" \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"40 01 1/2 * * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
}
// Runs one crawl from a hard-coded configuration: the JSON document below (list page
// https://www.imsilkroad.com/news/category/touzizixun, infoSourceCode IN-20220622-0007) is
// deserialized with Gson into a SiteMsgTemple, attached to a new DynaminSiteThread, and
// crawler() is called on it.
// The method that follows it in this view (loadSiteMsgLoc8) starts with the same id,
// infoSourceCode and URLs.
// In this diff view every row of the method appears once (no paired text), i.e. it exists on
// only one side of the change - presumably it was removed by the commit; TODO confirm.
public void loadSiteMsgLoc7(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System.out.println("——————++++++++++++——————===");
String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539588694743097346\",\n" +
" \"infoSourceCode\": \"IN-20220622-0007\",\n" +
" \"webSiteName\": \"新华丝路\",\n" +
" \"siteName\": \"新华丝路-投资资讯\",\n" +
" \"siteUri\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" +
" \"listUrl\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n" +
" \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" +
" \"informationTitle\": \"h5[class=\\\"text-xl\\\"]>a\",\n" +
" \"informationPublishDate\": \"\",\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"mb-3\\\"]>ul>li\",\n" +
" \"linkLocation\": \"h5[class=\\\"text-xl\\\"]>a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"text-2xl md:text-4xl mb-4 font-song\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>span:contains(来源)</exp></origin>\",\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"43 38 20 1/1 * ?\"\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
siteThread.crawler();
}
public void loadSiteMsgLoc8(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System.out.println("——————++++++++++++——————===");
String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539588694743097346\",\n" +
" \"infoSourceCode\": \"IN-20220622-0007\",\n" +
" \"webSiteName\": \"新华丝路\",\n" +
" \"siteName\": \"新华丝路-投资资讯\",\n" +
" \"siteUri\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" +
" \"language\": null,\n" +
" \"checkedList\": null,\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" +
" \"listUrl\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n" +
" \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" +
" \"informationTitle\": \"h5[class=\\\"text-xl\\\"]>a\",\n" +
" \"informationPublishDate\": \"\",\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"mb-3\\\"]>ul>li\",\n" +
" \"linkLocation\": \"h5[class=\\\"text-xl\\\"]>a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"text-2xl md:text-4xl mb-4 font-song\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>span:contains(来源)</exp></origin>\",\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
" \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"43 38 20 1/1 * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -759,171 +288,5 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -759,171 +288,5 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
siteThread.crawler(); siteThread.crawler();
} }
/**
 * Local debugging hook: runs one dynamic crawl against a hard-coded info-source
 * template (the Sohu profile "走出去情报-最新") instead of a template received at runtime.
 *
 * <p>The JSON below is deserialized with Gson into a {@code SiteMsgTemple}, handed to a
 * fresh {@code DynaminSiteThread}, and crawled synchronously on the calling thread.
 */
public void loadSiteMsgLoc9() {
    System.out.println("——————++++++++++++——————===");

    // Fixed info-source definition: CSS selectors (expression type "3") for list and detail pages.
    final String siteJson = "{\n" +
            " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
            " \"id\": \"1539590279724441602\",\n" +
            " \"infoSourceCode\": \"IN-20220622-0012\",\n" +
            " \"webSiteName\": \"走出去情报\",\n" +
            " \"siteName\": \"走出去情报-最新\",\n" +
            " \"siteUri\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" +
            " \"infoSourceTypeId\": \"1\",\n" +
            " \"siteLevel\": null,\n" +
            " \"language\": null,\n" +
            " \"checkedList\": null,\n" +
            " \"hisUriExp\": null,\n" +
            " \"hisDateStartTime\": null,\n" +
            " \"hisDateEndTime\": null,\n" +
            " \"ynHisDataAll\": \"0\",\n" +
            " \"status\": null,\n" +
            " \"listUrl\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" +
            " \"listExpressionType\": \"3\",\n" +
            " \"informationUrl\": null,\n" +
            " \"informationTitle\": \"div[class=\\\"item-text-content-title\\\"]\",\n" +
            " \"informationPublishDate\": null,\n" +
            " \"informationSource\": null,\n" +
            " \"infoBlockPosition\": \"div[class=\\\"FeedList\\\"]\",\n" +
            " \"linkLocation\": \"a\",\n" +
            " \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
            " \"crawlDepth\": null,\n" +
            " \"pageUrl\": null,\n" +
            " \"matchPage\": null,\n" +
            " \"pageStart\": 0,\n" +
            " \"pageEnd\": 0,\n" +
            " \"ynPageAll\": \"0\",\n" +
            " \"detailExpressionType\": \"3\",\n" +
            " \"detailUrl\": null,\n" +
            " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"text-title\\\"]>h1</exp></title>\",\n" +
            " \"detailExpressionPublishDate\": \"<publish_date><exp>span[id=\\\"news-time\\\"]</exp></publish_date>\",\n" +
            " \"detailExpressionSource\": null,\n" +
            " \"detailExpressionAuthor\": null,\n" +
            " \"detailExpressionSummary\": null,\n" +
            " \"detailExpressionContent\": \"<content><exp>article[class=\\\"article-info\\\"]</exp></content>\",\n" +
            " \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
            " \"ynDownload\": \"0\",\n" +
            " \"formUrl\": null,\n" +
            " \"formTitle\": null,\n" +
            " \"formType\": null,\n" +
            " \"dataFormExpression\": null,\n" +
            " \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
            " \"dataPageUrl\": null,\n" +
            " \"dataPageRule\": null,\n" +
            " \"dataPageStart\": 0,\n" +
            " \"dataPageEnd\": 0,\n" +
            " \"ynDataPageAll\": \"0\",\n" +
            " \"dataType\": 0,\n" +
            " \"dataFormat\": 0,\n" +
            " \"dataStorageMode\": 0,\n" +
            " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
            " \"ynDynamicCrawl\": 1,\n" +
            " \"ynLogin\": 0,\n" +
            " \"domainName\": null,\n" +
            " \"link\": null,\n" +
            " \"account\": null,\n" +
            " \"password\": null,\n" +
            " \"userAgent\": null,\n" +
            " \"referer\": null,\n" +
            " \"cookies\": null,\n" +
            " \"headers\": null,\n" +
            " \"otherInfo\": null,\n" +
            " \"crawlType\": 1,\n" +
            " \"crawlName\": null,\n" +
            " \"crawlAddress\": null,\n" +
            " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
            " \"cron\": \"01 45 20 1/1 * ?\"\n" +
            "}";

    // Parse first, then build the worker, so a malformed template fails before any crawler state exists.
    final SiteMsgTemple template = new Gson().fromJson(siteJson, SiteMsgTemple.class);
    final DynaminSiteThread worker = new DynaminSiteThread();
    worker.siteMsgTemple = template;
    worker.crawler();
}
/**
 * Local debugging hook that crawls a single hard-coded info source.
 *
 * <p>Deserializes the inline JSON template into a {@code SiteMsgTemple} via Gson,
 * attaches it to a new {@code DynaminSiteThread}, and invokes {@code crawler()}
 * directly on the current thread (no thread is started).
 */
public void loadSiteMsgLoc10() {
    System.out.println("——————++++++++++++——————===");

    // Inline template for the info source "IN-20220622-0012"; every field is a literal.
    final String templateJson = "{\n" +
            " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
            " \"id\": \"1539590279724441602\",\n" +
            " \"infoSourceCode\": \"IN-20220622-0012\",\n" +
            " \"webSiteName\": \"走出去情报\",\n" +
            " \"siteName\": \"走出去情报-最新\",\n" +
            " \"siteUri\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" +
            " \"infoSourceTypeId\": \"1\",\n" +
            " \"siteLevel\": null,\n" +
            " \"language\": null,\n" +
            " \"checkedList\": null,\n" +
            " \"hisUriExp\": null,\n" +
            " \"hisDateStartTime\": null,\n" +
            " \"hisDateEndTime\": null,\n" +
            " \"ynHisDataAll\": \"0\",\n" +
            " \"status\": null,\n" +
            " \"listUrl\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" +
            " \"listExpressionType\": \"3\",\n" +
            " \"informationUrl\": null,\n" +
            " \"informationTitle\": \"div[class=\\\"item-text-content-title\\\"]\",\n" +
            " \"informationPublishDate\": null,\n" +
            " \"informationSource\": null,\n" +
            " \"infoBlockPosition\": \"div[class=\\\"FeedList\\\"]\",\n" +
            " \"linkLocation\": \"a\",\n" +
            " \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
            " \"crawlDepth\": null,\n" +
            " \"pageUrl\": null,\n" +
            " \"matchPage\": null,\n" +
            " \"pageStart\": 0,\n" +
            " \"pageEnd\": 0,\n" +
            " \"ynPageAll\": \"0\",\n" +
            " \"detailExpressionType\": \"3\",\n" +
            " \"detailUrl\": null,\n" +
            " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"text-title\\\"]>h1</exp></title>\",\n" +
            " \"detailExpressionPublishDate\": \"<publish_date><exp>span[id=\\\"news-time\\\"]</exp></publish_date>\",\n" +
            " \"detailExpressionSource\": null,\n" +
            " \"detailExpressionAuthor\": null,\n" +
            " \"detailExpressionSummary\": null,\n" +
            " \"detailExpressionContent\": \"<content><exp>article[class=\\\"article-info\\\"]</exp></content>\",\n" +
            " \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
            " \"ynDownload\": \"0\",\n" +
            " \"formUrl\": null,\n" +
            " \"formTitle\": null,\n" +
            " \"formType\": null,\n" +
            " \"dataFormExpression\": null,\n" +
            " \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
            " \"dataPageUrl\": null,\n" +
            " \"dataPageRule\": null,\n" +
            " \"dataPageStart\": 0,\n" +
            " \"dataPageEnd\": 0,\n" +
            " \"ynDataPageAll\": \"0\",\n" +
            " \"dataType\": 0,\n" +
            " \"dataFormat\": 0,\n" +
            " \"dataStorageMode\": 0,\n" +
            " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
            " \"ynDynamicCrawl\": 1,\n" +
            " \"ynLogin\": 0,\n" +
            " \"domainName\": null,\n" +
            " \"link\": null,\n" +
            " \"account\": null,\n" +
            " \"password\": null,\n" +
            " \"userAgent\": null,\n" +
            " \"referer\": null,\n" +
            " \"cookies\": null,\n" +
            " \"headers\": null,\n" +
            " \"otherInfo\": null,\n" +
            " \"crawlType\": 1,\n" +
            " \"crawlName\": null,\n" +
            " \"crawlAddress\": null,\n" +
            " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
            " \"cron\": \"01 45 20 1/1 * ?\"\n" +
            "}";

    final Gson gson = new Gson();
    final SiteMsgTemple parsedTemplate = gson.fromJson(templateJson, SiteMsgTemple.class);

    final DynaminSiteThread crawlTask = new DynaminSiteThread();
    crawlTask.siteMsgTemple = parsedTemplate;
    crawlTask.crawler();
}
} }
...@@ -469,6 +469,7 @@ public class PaserSiteDownload { ...@@ -469,6 +469,7 @@ public class PaserSiteDownload {
} }
public static String getHtml(String url,String charset) { public static String getHtml(String url,String charset) {
java.security.Security.setProperty("networkaddress.cache.ttl" , "0");
String html=""; String html="";
CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault(); CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
HttpGet httpgeturl = new HttpGet(url);// Get请求 HttpGet httpgeturl = new HttpGet(url);// Get请求
......
...@@ -2,6 +2,7 @@ package com.zzsn.crawler; ...@@ -2,6 +2,7 @@ package com.zzsn.crawler;
import cn.hutool.core.date.DateTime; import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil; import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
...@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular; ...@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath; import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.HisURIConfig; import com.zzsn.crawler.uriparser.HisURIConfig;
import com.zzsn.crawler.uriparser.HisURIParser; import com.zzsn.crawler.uriparser.HisURIParser;
import com.zzsn.crawler.uriparser.HttpgetUtil;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{ ...@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
public PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple(); public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class); // public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override @Override
public void run() { public void run() {
crawler(); crawler();
} }
public static PageDownloader pageDownload=new PageDownloader();
public void crawler(){ public void crawler(){
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
// List<String> urlList=getPageListUrl(siteMsgTemple);
List<String> urlList=new ArrayList<>();
urlList.add(siteMsgTemple.getSiteUri());
//兼容就平台的历史链接方法
String charset="utf-8";
//获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
List<DocInfo> docInfoList=new ArrayList<>();
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
// Date collectTime=DateTime.now();
// String infoSourceId=siteMsgTemple.getId();
// //默认表达式类型
// siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
//
// //判断列表解析表达式类型
// if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
// WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
// metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
// WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
// metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
// WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
// metaSearchList = webContentPaserByJsonXpath.catchWebOfStaticmsgByJsonPath(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
// WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
// metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
// }
String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(urlList.get(0),charset,false,false, siteMsgTemple.getHeaders());
}else {
try {//先使用静态网络请求获取列表内容
body = HttpgetUtil.getHtml(urlList.get(0));
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} catch (Exception e) {
log.info(e.getMessage());
body = pageDownload.downloadWithStr(urlList.get(0), charset, false, false);
}
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(urlList.get(0));
}
}
if(body.length()< 1000){
FileUtil.appendString(siteMsgTemple.getInfoSourceCode()+"\n\r","D:\\jingwai.txt","utf-8");
}
}
public void crawler2(){
//获取栏目链接以及翻页的链接
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
//兼容就平台的历史链接方法 //兼容就平台的历史链接方法
HisURIParser hisURIParser = new HisURIParser(); HisURIParser hisURIParser = new HisURIParser();
...@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{ ...@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
siteMsgRecord.setCollectTime(collectTime); siteMsgRecord.setCollectTime(collectTime);
String docjson = mapper.writeValueAsString(siteMsgRecord); String docjson = mapper.writeValueAsString(siteMsgRecord);
kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace(); // e.printStackTrace();
......
...@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; ...@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.*;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
...@@ -61,13 +59,17 @@ public class WebContentPaserByRegular { ...@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try {//先使用静态网络请求获取列表内容 try {//先使用静态网络请求获取列表内容
body = pageDownload.downloadWithStr(uri_code, charset, false, false); body =HttpgetUtil.getHtml(uri_code);
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
body = paserSiteDownload.getHtml(uri_code, charset); body = pageDownload.downloadWithStr(uri_code, charset, false, false);
// body = paserSiteDownload.getHtml(uri_code, charset);
} }
//请求返回为空时判断为动态请求使用模拟浏览器的方式 //请求返回为空时判断为动态请求使用模拟浏览器的方式
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) { if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
// SeleniumTime seleniumTime=new SeleniumTime();
// body = seleniumTime.getScopehtml(uri_code);
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) { if (StringUtils.isEmpty(body) || pageDownload.isBadDownloadPage(body)) {
...@@ -270,13 +272,17 @@ public class WebContentPaserByRegular { ...@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
String content = ""; String content = "";
try { try {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
// SeleniumTime seleniumTime=new SeleniumTime();
// content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} else { } else {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false); content =HttpgetUtil.getHtml(cwbm.getSourceaddress());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
} catch (Exception e) { } catch (Exception e) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
log.info(e.getMessage()); log.info(e.getMessage());
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null); // content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类 //超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
...@@ -307,11 +313,12 @@ public class WebContentPaserByRegular { ...@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", "")); docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor()); docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate()); docInfo.setPublishDate(cwbm.getPublishDate());
if (cwbm.getSourceaddress() != null) { // if (cwbm.getSourceaddress() != null) {
docInfo.setOrigin(cwbm.getSourcesite()); // docInfo.setOrigin(cwbm.getSourcesite());
} else { // } else {
docInfo.setOrigin(siteMsgTemple.getSiteName()); // docInfo.setOrigin(siteMsgTemple.getSiteName());
} // }
docInfo.setOrigin(siteMsgTemple.getSiteName());
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
//封装解析的docinfo对象 //封装解析的docinfo对象
try { try {
...@@ -533,7 +540,7 @@ public class WebContentPaserByRegular { ...@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {
} }
docInfo.setContentWithTag(contentWithTag); docInfo.setContentWithTag(contentWithTag);
docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n","")); docInfo.setContentNoTag(ContentUtility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
} }
//作者 //作者
...@@ -567,8 +574,13 @@ public class WebContentPaserByRegular { ...@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource()); origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource());
if(StringUtils.isNotEmpty(origin)) { if(StringUtils.isNotEmpty(origin)) {
docInfo.setOrigin(origin); docInfo.setOrigin(origin);
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
return docInfo; return docInfo;
} }
......
package com.zzsn.crawler.uriparser;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zzsn.download.CreateSSLClientDefault;
import com.zzsn.util.Utility;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Static helpers for fetching a page over HTTP(S) and for guessing its character set.
 *
 * <p>Both methods pause briefly before touching the network, so they are slow by design
 * and should not be called in tight loops.
 */
public class HttpgetUtil {

    /** User-Agent sent with every request so the crawler looks like a browser. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);";
    /** Connect and socket timeout for {@link #getHtml(String)}, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 60000;
    /** Pause inserted before each outgoing request, in milliseconds. */
    private static final long THROTTLE_MILLIS = 500L;
    /** Character set reported by {@link #LocateCharSet(String)} when nothing can be detected. */
    private static final String FALLBACK_CHARSET = "gb2312";
    /** Length of the literal prefix {@code charset=}. */
    private static final int CHARSET_EQ_LENGTH = 8;
    /** Length of the literal prefix {@code charset="}. */
    private static final int CHARSET_EQ_QUOTE_LENGTH = 9;

    // Compiled once; the expressions themselves are unchanged from the per-call versions.
    private static final Pattern META_TAG = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern CHARSET_UNQUOTED = Pattern.compile("charset[^\\s||\"||;||'||>]*");
    private static final Pattern CHARSET_QUOTED = Pattern.compile("charset=\"[^\\s||\"||;||>]*");

    /**
     * Downloads {@code url} with a GET request and returns the response body as text.
     *
     * <p>The body is decoded with the charset declared in the response's Content-Type;
     * when none is declared, {@link #LocateCharSet(String)} is consulted (which issues a
     * second request). The chosen name is then passed through {@code Utility.charsetcheck}.
     *
     * @param url absolute URL to fetch
     * @return the decoded body, or an empty string when the body could not be read
     * @throws IllegalStateException if the request could not be executed, the calling
     *         thread was interrupted, or the response carried no body; the original
     *         failure is attached as the cause. Callers that fall back to another
     *         downloader on any exception keep working.
     */
    public static String getHtml(String url) {
        // NOTE(review): the client is not closed here because it is unclear from this file
        // whether createSSLClientDefault() hands out a shared instance — confirm.
        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
        HttpGet request = new HttpGet(url);
        request.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIMEOUT_MILLIS);
        // Same parameter key ("http.socket.timeout") as the legacy HttpMethodParams.SO_TIMEOUT constant.
        request.getParams().setIntParameter(CoreConnectionPNames.SO_TIMEOUT, TIMEOUT_MILLIS);
        // Pose as a browser.
        request.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

        try {
            HttpResponse response;
            try {
                Thread.sleep(THROTTLE_MILLIS);
                response = httpClient.execute(request);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Interrupted before requesting " + url, e);
            } catch (IOException e) {
                // Surface the real cause instead of a later NullPointerException on the response.
                throw new IllegalStateException("HTTP GET failed for " + url, e);
            }

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IllegalStateException("HTTP response without a body for " + url);
            }

            String charset = EntityUtils.getContentCharSet(entity);
            if (charset == null) {
                charset = LocateCharSet(url);
            }
            charset = Utility.charsetcheck(charset);

            try {
                return EntityUtils.toString(entity, charset);
            } catch (Exception e) {
                // Best effort, as before: an unreadable body yields an empty page.
                e.printStackTrace();
                return "";
            }
        } finally {
            // Always hand the connection back, including on the failure paths above.
            request.releaseConnection();
        }
    }

    /**
     * Guesses the character set of the page at {@code url} by fetching it with Jsoup and
     * scanning its {@code <meta>} tags for a {@code charset} declaration.
     *
     * <p>The first meta tag that mentions {@code charset} decides the result; later tags
     * are not inspected.
     *
     * @param url absolute URL of the page to inspect
     * @return the declared charset name; {@code "GB2312"} when a declaration is present
     *         but empty; {@code "gb2312"} when no declaration is found or the page could
     *         not be fetched
     */
    public static String LocateCharSet(String url) {
        try {
            Thread.sleep(THROTTLE_MILLIS);
            Connection conn = Jsoup.connect(url);
            // Pose as a browser.
            conn.header("User-Agent", USER_AGENT);
            Document doc = conn.ignoreContentType(true).timeout(10000).get();

            Matcher metaTags = META_TAG.matcher(doc.toString());
            while (metaTags.find()) {
                String tag = metaTags.group();
                Matcher unquoted = CHARSET_UNQUOTED.matcher(tag);
                if (!unquoted.find()) {
                    continue;
                }
                String declaration = unquoted.group();
                if (declaration.length() < CHARSET_EQ_LENGTH) {
                    // Bare "charset" with no '=' directly after it: nothing to cut out.
                    return FALLBACK_CHARSET;
                }
                String encoding = declaration.substring(CHARSET_EQ_LENGTH);
                if (encoding.trim().length() == 0) {
                    // Value was quoted (charset="..."), so the unquoted pattern stopped at the quote.
                    Matcher quoted = CHARSET_QUOTED.matcher(tag);
                    if (quoted.find()) {
                        encoding = quoted.group().substring(CHARSET_EQ_QUOTE_LENGTH);
                    }
                    if (encoding.trim().length() == 0) {
                        encoding = "GB2312";
                    }
                }
                return encoding;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("获取出错编码方式");
        }
        return FALLBACK_CHARSET;
    }
}
...@@ -32,6 +32,7 @@ public class SeleniumTime { ...@@ -32,6 +32,7 @@ public class SeleniumTime {
public static String getScopehtml(String url) { public static String getScopehtml(String url) {
String html = ""; String html = "";
try { try {
ReuseWebDriver driver = DriverUtil.getChromeDriver(); ReuseWebDriver driver = DriverUtil.getChromeDriver();
try { try {
Duration duration=Duration.of(100, ChronoUnit.SECONDS); Duration duration=Duration.of(100, ChronoUnit.SECONDS);
......
package com.zzsn.crawler.uriparser;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import com.zzsn.generation.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
/**
 * One-shot Chrome-backed page fetcher: each instance starts its own {@link ChromeDriver}
 * in the constructor and shuts it down at the end of {@link #getScopehtml(String)}.
 * An instance therefore serves exactly one call.
 */
public class SeleniumTime4 {

    /** Pages on this site get their "no permission" attachment banner removed. */
    private static final String FLW_SITE_PREFIX = "http://www.flw.ph";
    /** Opening tag of the attachment banner. */
    private static final String ATTACH_TIPS_OPEN = "<div class=\"attach_nopermission attach_tips\">";
    /** Close button that ends the part of the banner being cut out. */
    private static final String ATTACH_TIPS_CLOSE = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
    /**
     * Characters dropped right after the close button.
     * Presumably the banner's closing tag plus a line break — TODO confirm.
     */
    private static final int CHARS_SKIPPED_AFTER_CLOSE = 7;
    /** Wait that lets page scripts run before the DOM is read, in milliseconds. */
    private static final long RENDER_WAIT_MILLIS = 3000L;
    /** Wait before the browser is shut down, in milliseconds. */
    private static final long QUIT_WAIT_MILLIS = 5000L;

    public ChromeOptions chromeOptions = new ChromeOptions();
    public ChromeDriver driver;

    /**
     * Starts a Chrome session using the driver binary configured in
     * {@code Constants.CHROMEDRIVE}, with image loading disabled.
     */
    public SeleniumTime4() {
        System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
        chromeOptions.addArguments("blink-settings=imagesEnabled=false");
        driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Loads {@code url} in the browser and returns the rendered HTML.
     *
     * <p>The outer HTML of the root element is read first; if that fails, the driver's
     * page source is used instead. For URLs containing {@code http://www.flw.ph} the
     * attachment banner is cut out of the result. The browser is always quit before
     * this method returns, exactly once.
     *
     * @param url page address to load
     * @return the rendered HTML, or {@code null} if the page could not be loaded or the
     *         calling thread was interrupted
     */
    public String getScopehtml(String url) {
        try {
            driver.get(url);
            WebElement root = driver.findElement(By.xpath("/html"));

            String html = null;
            try {
                Thread.sleep(RENDER_WAIT_MILLIS);
                html = root.getAttribute("outerHTML");
            } catch (RuntimeException e) {
                System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
                        +"可能原因为过快的执行没有找到指定的页面元素");
                System.out.println("=============执行方法二==============");
            }
            if (html == null) {
                // Second strategy: ask the driver for the whole page source.
                Thread.sleep(RENDER_WAIT_MILLIS);
                html = driver.getPageSource();
            }
            return url.contains(FLW_SITE_PREFIX) ? stripAttachTips(html) : html;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            quitDriver();
        }
    }

    /** Waits briefly, then shuts the browser down; a failing quit must not mask the result. */
    private void quitDriver() {
        try {
            Thread.sleep(QUIT_WAIT_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        try {
            driver.quit();
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes the attachment banner from {@code html}: everything from the banner's opening
     * tag through its close button, plus the few characters that follow the button.
     * Plain substring search is used so the markers are never interpreted as regular
     * expressions. Input without both markers (in that order) is returned unchanged.
     */
    private static String stripAttachTips(String html) {
        if (html == null) {
            return null;
        }
        int bannerStart = html.indexOf(ATTACH_TIPS_OPEN);
        if (bannerStart < 0) {
            return html;
        }
        int closeStart = html.indexOf(ATTACH_TIPS_CLOSE, bannerStart + ATTACH_TIPS_OPEN.length());
        if (closeStart < 0) {
            return html;
        }
        int resumeAt = Math.min(html.length(),
                closeStart + ATTACH_TIPS_CLOSE.length() + CHARS_SKIPPED_AFTER_CLOSE);
        return html.substring(0, bannerStart) + html.substring(resumeAt);
    }

    /**
     * Manual check: fetches one flw.ph thread, reports whether either banner marker is
     * still present, and writes the cleaned HTML to {@code D:/123.txt}.
     *
     * <p>Regular expressions proved unreliable for removing the banner, which is why the
     * HTML is cut by plain string positions instead.
     */
    public static void main(String[] args) {
        SeleniumTime4 s = new SeleniumTime4();
        String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");

        System.out.println("开始");
        if (scopehtml != null && scopehtml.contains(ATTACH_TIPS_OPEN)) {
            System.out.println("包含a");
        }
        if (scopehtml != null && scopehtml.contains(ATTACH_TIPS_CLOSE)) {
            System.out.println("包含b");
        }
        System.out.println("结束");
        if (scopehtml == null) {
            // Nothing was fetched, so there is nothing to write.
            return;
        }

        String sab = stripAttachTips(scopehtml);
        File file = new File("D:/123.txt");
        try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
            ps.println(sab);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}
...@@ -89,8 +89,8 @@ public class WebContentPaserByXpath { ...@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
seleniumTime=new SeleniumTime(); // seleniumTime=new SeleniumTime();
body = seleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
TimeUnit.SECONDS.sleep(5); TimeUnit.SECONDS.sleep(5);
seleniumTime.close(); seleniumTime.close();
} else { } else {
......
...@@ -165,7 +165,8 @@ public class JedisUtil { ...@@ -165,7 +165,8 @@ public class JedisUtil {
throw new Exception("key is null"); throw new Exception("key is null");
} }
jedis = getDefaultJedis(); jedis = getDefaultJedis();
value = jedis.get(PREFIX + key); // value = jedis.get(PREFIX + key);
value = jedis.get(key);
}catch (Exception e){ }catch (Exception e){
}finally { }finally {
......
...@@ -62,7 +62,7 @@ public class KafkaConsumerJob { ...@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); // , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
// @Async("asyncTaskExecutor") // @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE); // ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
......
...@@ -98,6 +98,7 @@ public class ChromeTest { ...@@ -98,6 +98,7 @@ public class ChromeTest {
// 可复用驱动使用Demo // 可复用驱动使用Demo
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
ReuseWebDriver driver = DriverUtil.getChromeDriver(); ReuseWebDriver driver = DriverUtil.getChromeDriver();
if (driver == null) { if (driver == null) {
// 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存 // 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存
......
package com.zzsn.test; package com.zzsn.test;
import com.zzsn.crawler.uriparser.HttpgetUtil;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
...@@ -41,7 +42,9 @@ import java.util.List; ...@@ -41,7 +42,9 @@ import java.util.List;
public class HttpClientTester { public class HttpClientTester {
private static PageBuilderParser builderParser = null; private static PageBuilderParser builderParser = null;
public static void main(String[] args) { public static void main(String[] args) {
get("https://www.cas.cn/zjs/"); // get("https://edition.cnn.com/world");
String html = HttpgetUtil.getHtml("https://edition.cnn.com/world");
System.out.println(html);
// post(); // post();
} }
......
package com.zzsn.test; package com.zzsn.test;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import java.io.IOException; import java.io.IOException;
...@@ -17,21 +18,12 @@ import java.io.InputStream; ...@@ -17,21 +18,12 @@ import java.io.InputStream;
public class WebTest { public class WebTest {
public static void main(String[] args) { public static void main(String[] args) {
// String url="https://www.teriin.org/opinion"; String url="https://www.teriin.org/opinion";
// PageDownloader pageDownload=new PageDownloader(); // PageDownloader pageDownload=new PageDownloader();
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false); // String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// System.out.println(body); // System.out.println(body);
try { PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
Runtime mt = Runtime.getRuntime(); String html = paserSiteDownload.getHtml("https://edition.cnn.com/world", "utf-8");
String cmd = "taskkill /F /im chrome.exe"; System.out.println(html);
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
} }
} }
...@@ -287,15 +287,17 @@ public class ContentUtility { ...@@ -287,15 +287,17 @@ public class ContentUtility {
if(htmlText==null){ if(htmlText==null){
return null; return null;
} }
String text = ContentUtility.HTMLDecode(ContentUtility.RemoveHTMLCode(ContentUtility.RemoveStyleCode(ContentUtility.RemoveHTMLReturnCode(htmlText)))); String text = Utility.HTMLDecode(Utility.RemoveHTMLCode(Utility.RemoveStyleCode(Utility.RemoveHTMLReturnCode(htmlText))));
text = text.replaceAll("   ", "\r\n"); text = text.replaceAll("   ", "\r\n");
text = text.replaceAll(" +\r\n", "\r\n"); text = text.replaceAll(" +\r\n", "\r\n");
text = text.replaceAll(" +", " "); text = text.replaceAll(" +", " ");
text = text.replaceAll("[\\u00A0\\u3000]", ""); text = text.replaceAll("[\\u00A0\\u3000]", "");
text = text.replaceAll(" ", ""); text = text.replaceAll(" ", "");
text = text.replaceAll(" \n", "\n");
text = text.replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n");
return text; return text;
} }
......
...@@ -58,10 +58,10 @@ public class DriverUtil { ...@@ -58,10 +58,10 @@ public class DriverUtil {
} }
public static ReuseWebDriver connectChrome(String sessionId, String serverUrl) throws Exception { public static ReuseWebDriver connectChrome(String sessionId, String serverUrl) throws Exception {
if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) { // if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
log.error("未获取到驱动服务地址、sessionId"); // log.error("未获取到驱动服务地址、sessionId");
return null; // return null;
} // }
ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId); ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId);
if (driver.connectTestFail()) { if (driver.connectTestFail()) {
...@@ -89,10 +89,21 @@ public class DriverUtil { ...@@ -89,10 +89,21 @@ public class DriverUtil {
* @date 2022/7/25 15:07 * @date 2022/7/25 15:07
*/ */
public static ReuseWebDriver getChromeDriver() throws Exception { public static ReuseWebDriver getChromeDriver() throws Exception {
String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE); Map<String, String> map =getSessionInfo();
Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
String sessionId = map.get("sessionId"); String sessionId = map.get("sessionId");
String serverUrl = map.get("serverUrl"); String serverUrl = map.get("serverUrl");
return connectChrome(sessionId, serverUrl); return connectChrome(sessionId, serverUrl);
} }
/**
 * Reads the cached Selenium driver session info.
 *
 * When the cache entry is missing or empty, a placeholder entry (keys "sessionId" and
 * "serverUrl") is written back to the cache and returned instead.
 *
 * @return map holding the "sessionId" and "serverUrl" values
 * @throws Exception propagated from the cache / JSON helpers
 */
public static Map<String, String> getSessionInfo() throws Exception{
    String cached = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
    Map<String, String> info = JSON.parseObject(cached, Map.class);
    if (info != null && info.size() >= 1) {
        return info;
    }
    Map<String, String> placeholder = new HashMap<>(2);
    placeholder.put("sessionId", "sessionId");
    placeholder.put("serverUrl", "https://www.baidu.com/");
    // Cache the browser driver info (the -1 argument presumably means "no expiry" - TODO confirm).
    JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(placeholder), -1);
    return placeholder;
}
} }
package com.zzsn.util; package com.zzsn.util;
import org.jsoup.Jsoup; import java.io.BufferedReader;
import org.jsoup.nodes.Document; import java.io.BufferedWriter;
import org.jsoup.nodes.Element; import java.io.File;
import org.jsoup.select.Elements; import java.io.FileInputStream;
import org.mozilla.universalchardet.UniversalDetector; import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.*; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.nio.MappedByteBuffer; import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.TimeZone;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.mozilla.universalchardet.UniversalDetector;
//import com.zzsn.worker.db.model.ImgData;
//import com.zzsn.worker.index.Constants;
/* /*
* *
...@@ -39,6 +62,8 @@ import java.util.regex.PatternSyntaxException; ...@@ -39,6 +62,8 @@ import java.util.regex.PatternSyntaxException;
*/ */
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public class Utility { public class Utility {
//定时器控制flg
public static int flg = 0;
//任务执行状态flg //任务执行状态flg
public static int status_flg = 0; public static int status_flg = 0;
static String regEx = "[\\u4e00-\\u9fa5]"; static String regEx = "[\\u4e00-\\u9fa5]";
...@@ -58,6 +83,7 @@ public class Utility { ...@@ -58,6 +83,7 @@ public class Utility {
static Pattern divP = Pattern.compile("<div>",Pattern.CASE_INSENSITIVE); static Pattern divP = Pattern.compile("<div>",Pattern.CASE_INSENSITIVE);
static Pattern divRP = Pattern.compile("</div>",Pattern.CASE_INSENSITIVE); static Pattern divRP = Pattern.compile("</div>",Pattern.CASE_INSENSITIVE);
static Pattern brP = Pattern.compile("<br />",Pattern.CASE_INSENSITIVE); static Pattern brP = Pattern.compile("<br />",Pattern.CASE_INSENSITIVE);
static Pattern brP2 = Pattern.compile("<br/>",Pattern.CASE_INSENSITIVE);
static Pattern br2P = Pattern.compile("<br>",Pattern.CASE_INSENSITIVE); static Pattern br2P = Pattern.compile("<br>",Pattern.CASE_INSENSITIVE);
static Pattern spaceP = Pattern.compile("&nbsp;",Pattern.CASE_INSENSITIVE); static Pattern spaceP = Pattern.compile("&nbsp;",Pattern.CASE_INSENSITIVE);
static Pattern strongP = Pattern.compile("<strong>",Pattern.CASE_INSENSITIVE); static Pattern strongP = Pattern.compile("<strong>",Pattern.CASE_INSENSITIVE);
...@@ -110,6 +136,8 @@ public class Utility { ...@@ -110,6 +136,8 @@ public class Utility {
private static Pattern patDate4 = Pattern.compile("\\d+年\\d+月\\d+日"); private static Pattern patDate4 = Pattern.compile("\\d+年\\d+月\\d+日");
private static Pattern patDate5 = Pattern.compile("\\d+/\\d{1,2}/\\d+"); private static Pattern patDate5 = Pattern.compile("\\d+/\\d{1,2}/\\d+");
private static Pattern patDate6 = Pattern.compile("\\d+\\.\\d+\\.\\d+"); private static Pattern patDate6 = Pattern.compile("\\d+\\.\\d+\\.\\d+");
private static Pattern patDate7 = Pattern.compile("\\d{1,2}-\\d{1,2}");
private static Pattern patDate8 = Pattern.compile("\\d+月\\d+日");
private static SimpleDateFormat formatter0 = new SimpleDateFormat("yyyy-MM-dd"); private static SimpleDateFormat formatter0 = new SimpleDateFormat("yyyy-MM-dd");
private static SimpleDateFormat formatter0_1 = new SimpleDateFormat("yy-MM-dd"); private static SimpleDateFormat formatter0_1 = new SimpleDateFormat("yy-MM-dd");
...@@ -122,15 +150,103 @@ public class Utility { ...@@ -122,15 +150,103 @@ public class Utility {
private static SimpleDateFormat formatter5_4 = new SimpleDateFormat("yy/MM/dd"); private static SimpleDateFormat formatter5_4 = new SimpleDateFormat("yy/MM/dd");
private static SimpleDateFormat formatter5_3 = new SimpleDateFormat("dd/MM/yy"); private static SimpleDateFormat formatter5_3 = new SimpleDateFormat("dd/MM/yy");
private static SimpleDateFormat formatter6 = new SimpleDateFormat("yyyy.MM.dd"); private static SimpleDateFormat formatter6 = new SimpleDateFormat("yyyy.MM.dd");
private static SimpleDateFormat formatter7 = new SimpleDateFormat("MM-dd");
private static SimpleDateFormat formatter8 = new SimpleDateFormat("MM月dd");
private static Date thresholdDate = null; private static Date thresholdDate = null;
/**
 * Percent-encodes a URI so that it can be requested safely.
 *
 * Processing steps:
 * 1. If the URI has a non-empty query string, every parameter name and value is
 *    URL-encoded as UTF-8 (an encoded '=' inside a value is turned back into '=').
 * 2. Every run of CJK characters (U+4E00 to U+9FA5) left anywhere in the URI is
 *    URL-encoded as UTF-8.
 * 3. A run of trailing slashes is collapsed to a single slash and every space is
 *    replaced by %20.
 *
 * Author: Liu Xiaopeng
 * Created: 2016-4-14 14:46:50
 * @version 1.0
 * @param uri the raw URI, may be null
 * @return the encoded URI, or null when uri is null
 */
public static String encodURI(String uri) {
    if (uri == null) {
        return null;
    }
    if (uri.contains("?") && !uri.endsWith("?")) {
        // Split on the FIRST '?' only: a '?' inside a parameter value used to cut the
        // query short and silently drop everything after it.
        String[] parts = uri.split("\\?", 2);
        String[] params = parts[1].split("\\&");
        StringBuilder query = new StringBuilder("?");
        for (int i = 0; i < params.length; i++) {
            if (i > 0) {
                query.append("&");
            }
            String param = params[i];
            int eq = param.indexOf("=");
            if (eq != -1) {
                String name = param.substring(0, eq);
                // Only the first '=' separates name and value; later ones stay literal.
                String value = encodeUtf8Component(param.substring(eq + 1)).replaceAll("%3D", "=");
                query.append(encodeUtf8Component(name)).append("=").append(value);
            } else {
                query.append(encodeUtf8Component(param));
            }
        }
        uri = parts[0] + query;
    }
    // Encode CJK characters that remain, e.g. in the path part.
    Matcher matcher = Pattern.compile("([\u4e00-\u9fa5]+)").matcher(uri);
    while (matcher.find()) {
        String find = matcher.group();
        uri = uri.replace(find, encodeUtf8Component(find));
    }
    uri = uri.replaceAll("/+$", "/");
    uri = uri.replaceAll(" ", "%20");
    return uri;
}

/** URL-encodes one component as UTF-8; UTF-8 support is mandatory on every Java platform. */
private static String encodeUtf8Component(String text) {
    try {
        return URLEncoder.encode(text, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        throw new IllegalStateException("UTF-8 is not supported", e);
    }
}
/* /*
* 判断网页文件的编码 * 判断网页文件的编码
*/ */
public static String getWebEncodingByStr(String content) { public static String getWebEncodingByStr(String content) {
String encoding = "GB2312"; String encoding = null;
Pattern p1 = Pattern.compile("<meta[^>]*>", Pattern p1 = Pattern.compile("<meta[^>]*>",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
Matcher m1 = p1.matcher(content); Matcher m1 = p1.matcher(content);
...@@ -219,20 +335,14 @@ public class Utility { ...@@ -219,20 +335,14 @@ public class Utility {
e.printStackTrace(); e.printStackTrace();
} }
if (encoding == null) { if (encoding == null) {
if (encoding == null) { encoding = detectCharSet(fileName);
encoding = "UTF-8"; //encoding = "GB2312"; // if (encoding == null) {
} // encoding = null; //encoding = "GB2312";
// }
} }
return encoding; return encoding;
} }
/**
 * Detects the language of the given text via LangTypeDetector.
 *
 * @param content text to analyse
 * @return the detected language type, or "error" when the detector yields null or ""
 */
public static String getLanguageType(String content) {
    String detected = LangTypeDetector.DetectLang(content);
    boolean nothingDetected = detected == null || "".equals(detected);
    return nothingDetected ? "error" : detected;
}
public static String detectCharSet(String fileName) { public static String detectCharSet(String fileName) {
try try
...@@ -242,13 +352,13 @@ public class Utility { ...@@ -242,13 +352,13 @@ public class Utility {
return null; return null;
} }
byte[] buf = new byte[4096]; byte[] buf = new byte[4096];
FileInputStream fis = null; FileInputStream fis = null;
UniversalDetector detector = null; UniversalDetector detector = null;
try { try {
fis = new FileInputStream(fileName); fis = new FileInputStream(fileName);
detector = new UniversalDetector(null); detector = new UniversalDetector(null);
int nread; int nread;
while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
detector.handleData(buf, 0, nread); detector.handleData(buf, 0, nread);
...@@ -259,7 +369,7 @@ public class Utility { ...@@ -259,7 +369,7 @@ public class Utility {
//e.printStackTrace(); //e.printStackTrace();
} }
detector.dataEnd(); detector.dataEnd();
String encoding = detector.getDetectedCharset(); String encoding = detector.getDetectedCharset();
detector.reset(); detector.reset();
if (encoding == null) { if (encoding == null) {
...@@ -270,10 +380,11 @@ public class Utility { ...@@ -270,10 +380,11 @@ public class Utility {
} }
catch(Exception e) catch(Exception e)
{ {
// e.printStackTrace(); // e.printStackTrace();
return null; return null;
} }
} }
/* /*
* 判断文件的编码格式 * 判断文件的编码格式
*/ */
...@@ -331,6 +442,7 @@ public class Utility { ...@@ -331,6 +442,7 @@ public class Utility {
src = divP.matcher(src).replaceAll("\n\n"); src = divP.matcher(src).replaceAll("\n\n");
src = divRP.matcher(src).replaceAll("\n\n"); src = divRP.matcher(src).replaceAll("\n\n");
src = brP.matcher(src).replaceAll("\n\n"); src = brP.matcher(src).replaceAll("\n\n");
src = brP2.matcher(src).replaceAll("\n\n");
src = br2P.matcher(src).replaceAll("\n\n"); src = br2P.matcher(src).replaceAll("\n\n");
src = spaceP.matcher(src).replaceAll(" "); src = spaceP.matcher(src).replaceAll(" ");
src = src.replaceAll("&#8226;", "??"); src = src.replaceAll("&#8226;", "??");
...@@ -748,6 +860,14 @@ public class Utility { ...@@ -748,6 +860,14 @@ public class Utility {
} }
return true; return true;
} }
/**
 * Detects the language of the given text via LangTypeDetector.
 *
 * @param content text to analyse
 * @return the detected language type, or "error" when the detector yields null or ""
 */
public static String getLanguageType(String content) {
    String detected = LangTypeDetector.DetectLang(content);
    boolean nothingDetected = detected == null || "".equals(detected);
    return nothingDetected ? "error" : detected;
}
public static List<String> getFiles(List<String> l, String directory,boolean bIncludeSubDir) { public static List<String> getFiles(List<String> l, String directory,boolean bIncludeSubDir) {
if (l == null) { if (l == null) {
...@@ -964,6 +1084,8 @@ public class Utility { ...@@ -964,6 +1084,8 @@ public class Utility {
text = text.replaceAll(" +", " "); text = text.replaceAll(" +", " ");
text = text.replaceAll("[\\u00A0\\u3000]", ""); text = text.replaceAll("[\\u00A0\\u3000]", "");
text = text.replaceAll(" ", ""); text = text.replaceAll(" ", "");
text = text.replaceAll(" \n", "\n");
text = text.replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n").replaceAll("\n\n", "\n");
return text; return text;
} }
...@@ -1092,9 +1214,9 @@ public class Utility { ...@@ -1092,9 +1214,9 @@ public class Utility {
htmlText = formRP.matcher(htmlText).replaceAll(""); htmlText = formRP.matcher(htmlText).replaceAll("");
// htmlText = imgReplaceP.matcher(htmlText).replaceAll("<_img$1>"); htmlText = imgReplaceP.matcher(htmlText).replaceAll("<_img$1>");
// htmlText = removeAttrP.matcher(htmlText).replaceAll("<$1>"); htmlText = removeAttrP.matcher(htmlText).replaceAll("<$1>");
// htmlText = imgRevReplaceP.matcher(htmlText).replaceAll("<img$1>"); htmlText = imgRevReplaceP.matcher(htmlText).replaceAll("<img$1>");
htmlText = commentP.matcher(htmlText).replaceAll(""); htmlText = commentP.matcher(htmlText).replaceAll("");
htmlText = legendRemoveP.matcher(htmlText).replaceAll(""); htmlText = legendRemoveP.matcher(htmlText).replaceAll("");
...@@ -1187,42 +1309,7 @@ public class Utility { ...@@ -1187,42 +1309,7 @@ public class Utility {
} }
/**
 * Removes every {@code <a>} element from an HTML fragment and then prunes each
 * ancestor that is left without any text.
 *
 * @param contentWithTag HTML markup
 * @return the input itself when it contains no links, otherwise the outer HTML of the
 *         parsed and cleaned document
 */
public static String RemoveAllLink(String contentWithTag)
{
    Document doc = Jsoup.parse(contentWithTag);
    Elements links = doc.select("a");
    if ((links == null) || (links.size() == 0))
    {
        return contentWithTag;
    }

    for (Element link : links)
    {
        try
        {
            Element ancestor = link.parent();
            link.remove();
            // Walk upwards while the ancestors hold no text. The explicit null check ends
            // the walk at the root instead of relying on an exception being thrown there.
            while (ancestor != null && ancestor.text().trim().isEmpty())
            {
                Element emptied = ancestor;
                ancestor = ancestor.parent();
                if (ancestor != null)
                {
                    emptied.remove();
                }
            }
        }
        catch (Exception ignored)
        {
            // Best effort: a link that cannot be detached is skipped.
            continue;
        }
    }

    return doc.outerHtml();
}
...@@ -1238,6 +1325,8 @@ public class Utility { ...@@ -1238,6 +1325,8 @@ public class Utility {
|| (dateMatcher = patDate4.matcher(content)).find() || (dateMatcher = patDate4.matcher(content)).find()
|| (dateMatcher = patDate5.matcher(content)).find() || (dateMatcher = patDate5.matcher(content)).find()
|| (dateMatcher = patDate6.matcher(content)).find() || (dateMatcher = patDate6.matcher(content)).find()
|| (dateMatcher = patDate7.matcher(content)).find()
|| (dateMatcher = patDate8.matcher(content)).find()
) )
{ {
return true; return true;
...@@ -1263,6 +1352,8 @@ public class Utility { ...@@ -1263,6 +1352,8 @@ public class Utility {
|| ((dateMatcher = patDate4.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),4)!= null)) || ((dateMatcher = patDate4.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),4)!= null))
|| ((dateMatcher = patDate5.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),5)!= null)) || ((dateMatcher = patDate5.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),5)!= null))
|| ((dateMatcher = patDate6.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),6)!= null)) || ((dateMatcher = patDate6.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),6)!= null))
|| ((dateMatcher = patDate7.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),7)!= null))
|| ((dateMatcher = patDate8.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),8)!= null))
) )
{ {
return true; return true;
...@@ -1290,6 +1381,8 @@ public class Utility { ...@@ -1290,6 +1381,8 @@ public class Utility {
|| (dateMatcher = patDate4.matcher(content)).find() || (dateMatcher = patDate4.matcher(content)).find()
|| (dateMatcher = patDate5.matcher(content)).find() || (dateMatcher = patDate5.matcher(content)).find()
|| (dateMatcher = patDate6.matcher(content)).find() || (dateMatcher = patDate6.matcher(content)).find()
|| (dateMatcher = patDate7.matcher(content)).find()
|| (dateMatcher = patDate8.matcher(content)).find()
) )
{ {
return dateMatcher; return dateMatcher;
...@@ -1316,6 +1409,8 @@ public class Utility { ...@@ -1316,6 +1409,8 @@ public class Utility {
|| ((dateMatcher = patDate4.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),4)!= null)) || ((dateMatcher = patDate4.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),4)!= null))
|| ((dateMatcher = patDate5.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),5)!= null)) || ((dateMatcher = patDate5.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),5)!= null))
|| ((dateMatcher = patDate6.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),6)!= null)) || ((dateMatcher = patDate6.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),6)!= null))
|| ((dateMatcher = patDate7.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),7)!= null))
|| ((dateMatcher = patDate8.matcher(content)).find()&& (Utility.transDate(dateMatcher.group(),8)!= null))
) )
{ {
return dateMatcher; return dateMatcher;
...@@ -1402,6 +1497,12 @@ public class Utility { ...@@ -1402,6 +1497,12 @@ public class Utility {
case 6: case 6:
date = formatter6.parse(source); date = formatter6.parse(source);
break; break;
case 7:
date = formatter7.parse(source);
break;
case 8:
date = formatter8.parse(source);
break;
} }
if((date != null) && (date.before(thresholdDate))) if((date != null) && (date.before(thresholdDate)))
...@@ -1431,6 +1532,8 @@ public class Utility { ...@@ -1431,6 +1532,8 @@ public class Utility {
|| ((dateMatcher = patDate4.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),4))!= null)) || ((dateMatcher = patDate4.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),4))!= null))
|| ((dateMatcher = patDate5.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),5))!= null)) || ((dateMatcher = patDate5.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),5))!= null))
|| ((dateMatcher = patDate6.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),6))!= null)) || ((dateMatcher = patDate6.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),6))!= null))
|| ((dateMatcher = patDate7.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),7))!= null))
|| ((dateMatcher = patDate8.matcher(content)).find()&& ((date = Utility.transDate(dateMatcher.group(),8))!= null))
) )
{ {
return date; return date;
...@@ -1468,8 +1571,180 @@ public class Utility { ...@@ -1468,8 +1571,180 @@ public class Utility {
return null; return null;
} }
} }
/**
 * Builds a normalised local timestamp ("yyyy-MM-dd HH:mm:ss") from raw date text.
 *
 * The date part comes from Utility.transDate(raw). Each "H:mm" or "H:mm:ss" occurrence
 * found in the raw text then overwrites hour, minute and (when present) second on an
 * Asia/Shanghai calendar, so the last occurrence wins.
 *
 * Author: Li Dongliang
 * Created: 2015-7-2 10:32:25
 * @version 1.0
 * @param raw raw text containing a date and optionally a time of day
 * @return the formatted timestamp, or null when no date could be parsed
 */
public static String transLocalTime(String raw){
    Date parsed = Utility.transDate(raw);
    if (parsed == null) {
        return null;
    }
    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("Asia/Shanghai"));
    calendar.setTime(parsed);
    Matcher clock = Pattern.compile("(\\d{1,2})[:|:](\\d{1,2})([:|:]\\d{1,2}){0,1}").matcher(raw);
    while (clock.find()) {
        String hourText = clock.group(1);
        if (hourText != null) {
            calendar.set(Calendar.HOUR_OF_DAY, Integer.valueOf(hourText));
        }
        String minuteText = clock.group(2);
        if (minuteText != null) {
            calendar.set(Calendar.MINUTE, Integer.valueOf(minuteText));
        }
        // Group 3 still carries its leading separator, which is stripped before parsing.
        String secondText = clock.group(3);
        if (secondText != null) {
            calendar.set(Calendar.SECOND, Integer.valueOf(secondText.replaceAll(":|:", "")));
        }
    }
    return DateUtil.format(calendar.getTime(), "yyyy-MM-dd HH:mm:ss");
}
/** /**
* 获取正文中的图片路径
* 创建人: 李东亮
* 创建时间: 2015-11-13 下午5:27:27
* @version 1.0
* @param text
* @param uri
* @return
*/
// Scans the text for <img ... src="..."> tags.
// NOTE(review): this is an unfinished stub - the src values are read but never added to
// the result and the uri argument is unused, so the returned list is always empty.
public static List<String> getContentImgPath(String text,String uri){
    List<String> paths = new ArrayList<String>();
    Matcher imgTags = Pattern
            .compile("(<img.+?src=)(\"|')(.+?)(\"|')(.*?/?>)", Pattern.CASE_INSENSITIVE)
            .matcher(text);
    while (imgTags.find()) {
        String src = imgTags.group(3);
        boolean plainHttp = src.startsWith("http://") && !src.startsWith("https://");
        if (plainHttp) {
            // no handling implemented for this case
        }
    }
    return paths;
}
/**
* 获取图片链接
* 创建人: 李东亮
* 创建时间: 2015-6-3 下午1:36:19
* @version 1.0
* @param contentStr
* @return
*/
// public static Map<String,ImgData> getContentImg(String contentStr,String uri){
// Map<String,ImgData> imgMap = new HashMap<String,ImgData>();
// if(contentStr==null||contentStr.length()==0){
// return imgMap;
// }
// Pattern p = Pattern.compile( "(<img.+?src=)(\"|')(.+?)(\"|')(.*?/?>)",Pattern.CASE_INSENSITIVE);
// Matcher m = p.matcher(contentStr);
// String imgPath;
// ImgData imgData;
// StringBuffer replacePath;
// while(m.find()){
// imgData = new ImgData();
// imgPath = m.group(3);
// if(!imgPath.startsWith("http://")&&!imgPath.startsWith("https://")&&uri!=null){
// String puriDir = getDirPath(uri.toString());
// imgPath = formatPath(puriDir,imgPath);
// }
// //图片完整路径
// imgData.setFormatTag(imgPath);
// //图片保存路径
// imgData.setLocalImgPath(genImgFileName());
// replacePath = new StringBuffer("");
// replacePath.append(m.group(1)).append(m.group(2)).append("IMG_SERVER/").append(imgData.getLocalImgPath()).append(m.group(4)).append(m.group(5));
// imgData.setReplaceTag(replacePath.toString());
// imgMap.put(m.group(), imgData);
// }
// return imgMap;
// }
/**
 * Generates the storage path for an image file: today's date formatted as
 * "yyyy-MM-dd", a slash, then a random UUID.
 *
 * Author: Li Dongliang
 * Created: 2016-3-23 14:50:33
 * @version 1.0
 * @return relative path of the form date/uuid
 */
private static String genImgFileName(){
    String dayFolder = DateUtil.format(new Date(), "yyyy-MM-dd");
    String uniqueName = UUID.randomUUID().toString();
    return dayFolder + "/" + uniqueName;
}
/**
 * Strips the characters : ? * " | from the last segment of a path.
 *
 * Only the part starting at the last '/' is cleaned, so a scheme such as "http://"
 * earlier in the path keeps its colon. A path without any '/' is treated as a bare
 * file name and cleaned as a whole (the original threw
 * StringIndexOutOfBoundsException for such input).
 *
 * Author: Li Dongliang
 * Created: 2015-7-6 17:13:46
 * @version 1.0
 * @param path path or URL, may be null
 * @return the cleaned path, or null when path is null
 */
public static String removeInvalidFileChar(String path){
    if (path == null) {
        return null;
    }
    String invalidChars = ":|\\?|\\*|\"|\\|";
    int split = path.lastIndexOf("/");
    if (split < 0) {
        return path.replaceAll(invalidChars, "");
    }
    String after = path.substring(split).replaceAll(invalidChars, "");
    return path.substring(0, split) + after;
}
/**
 * Resolves an image path against the URL of the page that references it, removing
 * "." and ".." segments.
 *
 * A path starting with '/' is attached directly to the scheme and host of the page
 * URL. Any other path is appended to the page URL and normalised segment by segment.
 * The normalisation is done on the string itself; the original routed it through
 * java.io.File with a "D:/" prefix, which only worked on Windows and corrupted the
 * path when the page URL had no http/https scheme.
 *
 * Author: Li Dongliang
 * Created: 2015-7-6 15:43:00
 * @version 1.0
 * @param currentPageURL URL (directory) of the current page
 * @param imgPath image path as written in the page
 * @return the resolved image URL
 */
public static String formatPath(String currentPageURL,String imgPath) {
    String start = "";
    if (currentPageURL.indexOf("http://") != -1) {
        start = "http://";
    } else if (currentPageURL.indexOf("https://") != -1) {
        start = "https://";
    }
    String location = currentPageURL.replace(start, "");

    // Host-absolute path: keep scheme and host only.
    if (imgPath.startsWith("/")) {
        int subIndex = location.indexOf("/");
        if (subIndex == -1) {
            subIndex = location.length();
        }
        return start + location.substring(0, subIndex) + imgPath;
    }

    // Relative path: drop empty and "." segments, let ".." pop the previous segment.
    // With a scheme present the first segment is the host and is never popped.
    int floor = start.isEmpty() ? 0 : 1;
    List<String> kept = new ArrayList<String>();
    for (String segment : (location + "/" + imgPath).split("/")) {
        if (segment.isEmpty() || ".".equals(segment)) {
            continue;
        }
        if ("..".equals(segment)) {
            if (kept.size() > floor) {
                kept.remove(kept.size() - 1);
            }
            continue;
        }
        kept.add(segment);
    }

    StringBuilder result = new StringBuilder(start);
    if (start.isEmpty() && location.startsWith("/")) {
        result.append("/");
    }
    for (int i = 0; i < kept.size(); i++) {
        if (i > 0) {
            result.append("/");
        }
        result.append(kept.get(i));
    }
    return result.toString();
}
/**
* 获取父路径 * 获取父路径
* 创建人: 李东亮 * 创建人: 李东亮
* 创建时间: 2015-7-6 下午3:17:44 * 创建时间: 2015-7-6 下午3:17:44
...@@ -1484,7 +1759,7 @@ public class Utility { ...@@ -1484,7 +1759,7 @@ public class Utility {
} }
/** /**
* 去除特殊字符 * 去除特殊字符
* 创建人: 刘小鹏 * 创建人: 李东亮
* 创建时间: 2015-6-4 下午6:40:19 * 创建时间: 2015-6-4 下午6:40:19
* @version 1.0 * @version 1.0
* @param str * @param str
...@@ -1513,9 +1788,11 @@ public class Utility { ...@@ -1513,9 +1788,11 @@ public class Utility {
String result = new String(newtemp, targetCharset); String result = new String(newtemp, targetCharset);
return result; return result;
} }
/**
 * Manual entry point: prints one freshly generated random UUID to standard output.
 *
 * @param args ignored
 * @throws IOException declared by the signature; nothing in the body throws it
 */
public static void main(String args[]) throws IOException {
    UUID generated = UUID.randomUUID();
    String text = generated.toString();
    System.out.println(text);
}
/** /**
* 根据 * 获取请求路径后缀
* 创建人: 杨海龙 * 创建人: 杨海龙
* 创建时间: 2015年7月10日 上午10:14:52 * 创建时间: 2015年7月10日 上午10:14:52
* @version 1.0 * @version 1.0
...@@ -1540,12 +1817,27 @@ public class Utility { ...@@ -1540,12 +1817,27 @@ public class Utility {
return null; return null;
} }
/** /**
* 编码匹配 * 格式化URI
* 创建人: 李东亮
* 创建时间: 2015-8-20 下午3:26:00
* @version 1.0 * @version 1.0
* @param * @param uri
* @return * @return
*/ */
public static String formatURI(String uri){
    // Normalises a URI: surrounding whitespace is trimmed first, then any run of
    // trailing slashes is removed. The pattern is end-anchored, so it can match at
    // most once and a single replacement is enough.
    return uri.trim().replaceFirst("/+$", "");
}
/**
* 编码匹配
* @version 1.0
* @param
* @return
*/
public static String charsetcheck(String charset) { public static String charsetcheck(String charset) {
String charreset = "GB2312"; String charreset = "GB2312";
String[] charsetall = {"GB2312","GBK","UTF-8","ISO-8859-1", String[] charsetall = {"GB2312","GBK","UTF-8","ISO-8859-1",
...@@ -1582,7 +1874,7 @@ public class Utility { ...@@ -1582,7 +1874,7 @@ public class Utility {
"x-mswin-936","x-PCK","x-SJIS_0213","x-UTF-16LE-BOM","X-UTF-32BE-BOM", "x-mswin-936","x-PCK","x-SJIS_0213","x-UTF-16LE-BOM","X-UTF-32BE-BOM",
"X-UTF-32LE-BOM","x-windows-50220","x-windows-50221","x-windows-874", "X-UTF-32LE-BOM","x-windows-50220","x-windows-50221","x-windows-874",
"x-windows-949","x-windows-950","x-windows-iso2022jp" "x-windows-949","x-windows-950","x-windows-iso2022jp"
}; };
for (int i=0;i<charsetall.length;i++) { for (int i=0;i<charsetall.length;i++) {
if (charset.toLowerCase().contains(charsetall[i].toLowerCase())) { if (charset.toLowerCase().contains(charsetall[i].toLowerCase())) {
charreset = charsetall[i]; charreset = charsetall[i];
...@@ -1591,104 +1883,43 @@ public class Utility { ...@@ -1591,104 +1883,43 @@ public class Utility {
} }
return charreset; return charreset;
} }
public static String RemoveAllLink(String contentWithTag)
{
/** Document doc = Jsoup.parse(contentWithTag);
* 对参数中的中文进行编码 Elements contentElems = doc.select("a");
* 创建人: 刘小鹏 if((contentElems == null) || (contentElems.size() == 0))
* 创建时间: 2016-4-14 下午2:46:50 {
* @version 1.0 return contentWithTag;
* @param uri }
* @return for(Element aElement : contentElems)
*/ {
public static String encodURI(String uri) { try
if (uri == null) { {
return null; String elementText = aElement.text().trim();
} Element parentElement = aElement.parent();
//只对中文参数进行转码 String parentText = parentElement.text().trim();
if (uri.contains("?")&&!uri.endsWith("?")) { elementText = elementText.replaceAll(" ", "").trim();
try { parentText = parentText.replaceAll(" ", "").trim();
StringBuffer sb = new StringBuffer(); aElement.remove();
sb.append("?"); while(parentElement.text().trim().isEmpty())
String[] array = uri.split("\\?"); {
String uriPart = array[0]; Element tempElement = parentElement;
String paramStr = array[1]; parentElement = parentElement.parent();
String[] params = paramStr.split("\\&"); tempElement.remove();
for (int i = 0 ;i <params.length;i++) { }
if(i>0){ }
sb.append("&"); catch(Exception e)
} {
String param = params[i]; continue;
Integer indexFlag = param.indexOf("="); }
if(indexFlag!=-1){ }
String name = param.substring(0,indexFlag);
String value = param.substring(indexFlag+1);
value = URLEncoder.encode(value, "UTF-8");
value = value.replaceAll("%3D", "="); return doc.outerHtml();
sb.append(URLEncoder.encode(name, "UTF-8") + "=" +value);
}else{ }
sb.append(URLEncoder.encode(param, "UTF-8") );
}
/* String name = param.substring(0,param.indexOf("="));
String value = nameAndValue[1];
if (nameAndValue.length == 1) {
sb.append(URLEncoder.encode(param, "UTF-8"));
} else if(nameAndValue.length == 2) {
String name = nameAndValue[0];
String value = nameAndValue[1];
sb.append(URLEncoder.encode(name, "UTF-8") + "=" +URLEncoder.encode(value, "UTF-8"));
}else
{
String name = nameAndValue[0];
String value = nameAndValue[1];
sb.append(URLEncoder.encode(name, "UTF-8") + "=" +URLEncoder.encode(value, "UTF-8"));
}*/
}
uri = uriPart+sb.toString();
/* Matcher matcher = Pattern.compile("[\\u4e00-\\u9fa5]").matcher(uri);
while (matcher.find()) {
String tmp = matcher.group();
uri = uri.replaceAll(tmp, java.net.URLEncoder.encode(tmp, "UTF-8"));
}*/
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
//对路径中的中文也进行转码
String regex="([\u4e00-\u9fa5]+)";
Matcher matcher = Pattern.compile(regex).matcher(uri);
String find;
String replace = null;
while(matcher.find()){
find = matcher.group();
try {
replace = URLEncoder.encode(find, "UTF-8");
} catch (UnsupportedEncodingException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
while(uri.contains(find)&&!find.equals(replace)){
uri = uri.replace(find, replace);
}
}
uri = uri.replaceAll("/+$", "/");
uri = uri.replaceAll(" ", "%20");
return uri;
}
/**
* 对正则获取url进行统一格式化
* 创建人: 李东亮
* 创建时间: 2016-5-11 上午11:40:56
* @version 1.0
* @param url
* @return
*/
public static String formatURL(String url) { public static String formatURL(String url) {
if (url == null) { if (url == null) {
return null; return null;
...@@ -1704,19 +1935,4 @@ public class Utility { ...@@ -1704,19 +1935,4 @@ public class Utility {
url = url.replaceAll("/\\$$", ""); url = url.replaceAll("/\\$$", "");
return url; return url;
} }
/**
 * Removes a leading http/https URL from the given text.
 * <p>
 * The pattern is anchored at the start of the input and requires one whitespace
 * character before the scheme, so text that does not begin with
 * "whitespace + URL" is returned unchanged. The path part of the pattern accepts
 * spaces and ASCII word characters, so the match may extend past the URL into
 * following ASCII text. Every occurrence of the matched text is removed from the input.
 *
 * @param content text to clean
 * @return {@code content} with the matched leading URL text removed
 */
public static String removeHttp(String content){
    // An earlier, unanchored variant of the pattern was:
    // "https://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?|(\\s)?"
    Matcher leadingUrl = Pattern
            .compile("^(\\s)(https://|http://)([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)(\\s)?")
            .matcher(content);
    String cleaned = content;
    while (leadingUrl.find()) {
        String matched = leadingUrl.group();
        cleaned = cleaned.replace(matched, "");
    }
    return cleaned;
}
/**
 * Manual check for {@code removeHttp}: runs it on a fixed sample sentence that
 * contains an mp3 URL and prints the result to standard output.
 *
 * @param args ignored
 * @throws IOException declared by the signature; nothing in the body throws it
 */
public static void main(String args[]) throws IOException {
    final String sample = "聚焦跑道 塑造变革 加快建设共同富裕美好社会 https://stc-new.8531.cn/assets/20220216/1644994342533_620c9f264dc8891126854d30.mp3 2月15日下午,湖州市委书记王纲主持召开会议,专题听取湖州共同富裕";
    System.out.println(removeHttp(sample));
}
} }
\ No newline at end of file
...@@ -2,6 +2,8 @@ package com.zzsn.util; ...@@ -2,6 +2,8 @@ package com.zzsn.util;
import com.zzsn.crawler.ReuseWebDriver; import com.zzsn.crawler.ReuseWebDriver;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.InputStreamReader; import java.io.InputStreamReader;
...@@ -13,10 +15,12 @@ import java.util.regex.Pattern; ...@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
*/ */
@Slf4j @Slf4j
@SuppressWarnings("all") @SuppressWarnings("all")
@EnableScheduling
public class WindowsProcess { public class WindowsProcess {
private static Pattern TASK_LIST_PATTERN = Pattern.compile("^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$"); private static Pattern TASK_LIST_PATTERN = Pattern.compile("^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$");
private static String DRIVER_NAME = "chrome.exe"; private static String CHROME_NAME = "chrome.exe";
private static String DRIVER_NAME = "chromedriver.exe";
public static void main(String[] args) { public static void main(String[] args) {
WindowsProcess process = new WindowsProcess(); WindowsProcess process = new WindowsProcess();
...@@ -28,6 +32,7 @@ public class WindowsProcess { ...@@ -28,6 +32,7 @@ public class WindowsProcess {
* @author andylau * @author andylau
* @date 2022/7/26 11:23 * @date 2022/7/26 11:23
*/ */
// @Scheduled(cron = "0 0 1 * * ?")
private void killProcess() { private void killProcess() {
try { try {
String line; String line;
...@@ -35,14 +40,14 @@ public class WindowsProcess { ...@@ -35,14 +40,14 @@ public class WindowsProcess {
BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream())); BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream()));
while ((line = input.readLine()) != null) { while ((line = input.readLine()) != null) {
if (line.contains(DRIVER_NAME)) { if (line.contains(CHROME_NAME)|| line.contains(DRIVER_NAME) ) {
Matcher matcher = TASK_LIST_PATTERN.matcher(line); Matcher matcher = TASK_LIST_PATTERN.matcher(line);
if (matcher.find()) { if (matcher.find()) {
// String serviceName = matcher.group(1); // String serviceName = matcher.group(1);
String pid = matcher.group(2); String pid = matcher.group(2);
// String sessionName = matcher.group(3); // String sessionName = matcher.group(3);
// String size = matcher.group(4).replace(",", "") + "K"; // String size = matcher.group(4).replace(",", "") + "K";
// log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size); // log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
Runtime.getRuntime().exec("taskkill /pid " + pid); Runtime.getRuntime().exec("taskkill /pid " + pid);
} }
} }
...@@ -51,11 +56,11 @@ public class WindowsProcess { ...@@ -51,11 +56,11 @@ public class WindowsProcess {
log.error("浏览器驱动关闭异常..."); log.error("浏览器驱动关闭异常...");
} finally { } finally {
// 定时任务关闭驱动后,重新打开驱动 // 定时任务关闭驱动后,重新打开驱动
try { // try {
reopenChromeDriver(); // reopenChromeDriver();
} catch (Exception e) { // } catch (Exception e) {
log.error("驱动打开异常..."); // log.error("驱动打开异常...");
} // }
} }
} }
......
IN-20220609-15205
IN-20220609-45135
IN-20220609-50717
IN-20220609-52785
IN-20220609-3804
IN-20220609-4071
IN-20220609-15069
IN-20220609-45126
IN-20220609-50722
IN-20220609-52787
IN-20220609-3817
IN-20220609-4084
IN-20220609-15090
IN-20220609-45128
IN-20220609-50721
IN-20220609-52786
IN-20220609-58494
IN-20220609-58502
IN-20220609-58503
IN-20220609-58504
IN-20220609-58505
IN-20220609-58506
IN-20220609-58508
IN-20220609-3840
IN-20220609-4107
IN-20220609-12827
IN-20220609-15126
IN-20220609-44997
IN-20220609-45132
IN-20220609-50720
IN-20220609-52779
IN-20220609-3874
IN-20220609-4141
IN-20220609-15147
IN-20220609-45133
IN-20220609-50719
IN-20220609-52778
IN-20220609-56218
IN-20220609-56227
IN-20220609-56229
IN-20220609-57566
IN-20220609-15041
IN-20220609-45120
IN-20220609-50724
IN-20220609-52789
IN-20220609-56185
IN-20220609-56187
IN-20220609-56188
IN-20220609-56189
IN-20220609-56190
IN-20220609-56191
IN-20220609-56192
IN-20220609-57726
IN-20220609-57739
IN-20220609-57740
IN-20220609-57741
IN-20220609-57742
IN-20220609-57771
IN-20220609-57772
IN-20220609-12932
IN-20220609-15043
IN-20220609-45005
IN-20220609-45122
IN-20220609-50723
IN-20220609-52788
IN-20220609-57997
IN-20220609-57998
IN-20220609-57999
IN-20220609-13241
IN-20220609-45062
IN-20220609-54133
IN-20220609-57070
IN-20220609-14979
IN-20220609-14981
IN-20220609-45097
IN-20220609-45098
IN-20220609-50737
IN-20220609-50738
IN-20220609-52766
IN-20220609-52767
IN-20220609-14977
IN-20220609-14978
IN-20220609-45095
IN-20220609-45096
IN-20220609-50739
IN-20220609-50744
IN-20220609-52768
IN-20220609-52769
IN-20220609-50867
IN-20220609-52521
IN-20220609-52773
IN-20220609-52840
IN-20220609-54294
IN-20220609-3947
IN-20220609-3970
IN-20220609-3971
IN-20220609-3972
IN-20220609-4214
IN-20220609-4237
IN-20220609-4238
IN-20220609-4239
IN-20220609-11340
IN-20220609-13247
IN-20220609-13248
IN-20220609-45068
IN-20220609-45069
IN-20220609-50746
IN-20220609-52783
IN-20220609-55032
IN-20220609-57345
IN-20220609-13060
IN-20220609-45024
IN-20220609-58325
IN-20220609-58469
IN-20220609-58471
IN-20220609-58473
IN-20220609-58477
IN-20220609-52580
IN-20220609-52796
IN-20220609-52582
IN-20220609-52798
IN-20220609-52583
IN-20220609-52799
IN-20220609-52632
IN-20220609-13092
IN-20220609-13162
IN-20220609-22258
IN-20220609-45054
IN-20220609-53041
IN-20220609-53044
IN-20220609-53051
IN-20220609-53052
IN-20220609-53055
IN-20220609-53056
IN-20220609-53099
IN-20220609-53101
IN-20220609-53111
IN-20220609-53112
IN-20220609-53139
IN-20220609-53140
IN-20220609-57072
IN-20220609-57074
IN-20220609-57076
IN-20220609-57077
IN-20220609-57084
IN-20220609-57085
IN-20220609-52723
IN-20220609-52728
IN-20220609-52803
IN-20220609-52814
IN-20220609-53102
IN-20220609-53103
IN-20220609-53162
IN-20220609-53180
IN-20220609-52611
IN-20220609-52806
IN-20220609-50475
IN-20220609-50476
IN-20220609-50477
IN-20220609-52568
IN-20220609-52569
IN-20220609-52570
IN-20220609-52810
IN-20220609-52811
IN-20220609-52812
IN-20220609-53602
IN-20220609-53603
IN-20220609-53604
IN-20220609-39173
IN-20220609-52571
IN-20220609-52574
IN-20220609-52813
IN-20220609-52838
IN-20220609-6007
IN-20220609-13080
IN-20220609-13081
IN-20220609-13082
IN-20220609-13083
IN-20220609-13084
IN-20220609-13085
IN-20220609-13086
IN-20220609-13219
IN-20220609-13220
IN-20220609-13221
IN-20220609-13222
IN-20220609-13223
IN-20220609-13224
IN-20220609-45043
IN-20220609-45044
IN-20220609-45045
IN-20220609-45046
IN-20220609-45047
IN-20220609-45048
IN-20220609-45049
IN-20220609-52436
IN-20220609-52524
IN-20220609-52525
IN-20220609-52526
IN-20220609-52533
IN-20220609-52536
IN-20220609-52815
IN-20220609-52830
IN-20220609-52857
IN-20220609-52858
IN-20220609-52866
IN-20220609-52867
IN-20220609-54292
IN-20220608-36685
IN-20220608-36686
IN-20220608-36687
IN-20220608-36688
IN-20220608-36689
IN-20220608-36690
IN-20220608-36691
IN-20220608-36692
IN-20220608-36693
IN-20220608-37141
IN-20220608-37142
IN-20220608-37143
IN-20220608-37144
IN-20220608-37463
IN-20220608-37464
IN-20220608-37465
IN-20220608-37466
IN-20220608-37467
IN-20220608-57178
IN-20220609-3773
IN-20220609-3774
IN-20220609-4040
IN-20220609-4041
IN-20220609-4353
IN-20220609-4354
IN-20220609-4363
IN-20220609-4372
IN-20220609-15012
IN-20220609-15013
IN-20220609-45203
IN-20220609-49938
IN-20220609-49939
IN-20220609-52553
IN-20220609-52554
IN-20220609-52556
IN-20220609-52557
IN-20220609-52558
IN-20220609-52559
IN-20220609-52560
IN-20220609-52816
IN-20220609-52817
IN-20220609-52841
IN-20220609-52842
IN-20220609-52843
IN-20220609-52923
IN-20220609-52939
IN-20220609-52974
IN-20220609-52975
IN-20220609-53107
IN-20220609-53108
IN-20220609-53049
IN-20220609-53054
IN-20220609-53057
IN-20220609-53059
IN-20220609-53060
IN-20220609-53062
IN-20220609-53110
IN-20220609-53113
IN-20220609-53141
IN-20220609-53158
IN-20220609-53159
IN-20220609-53177
IN-20220609-57075
IN-20220609-57078
IN-20220609-57086
IN-20220609-57094
IN-20220609-57095
IN-20220609-57102
IN-20220609-52417
IN-20220609-52657
IN-20220609-52660
IN-20220609-52661
IN-20220609-52665
IN-20220609-52666
IN-20220609-52826
IN-20220609-52827
IN-20220609-52828
IN-20220609-52829
IN-20220609-52832
IN-20220609-53114
IN-20220609-53115
IN-20220609-53116
IN-20220609-53117
IN-20220609-53130
IN-20220609-53190
IN-20220609-53191
IN-20220609-53192
IN-20220609-53193
IN-20220609-53194
IN-20220609-56428
IN-20220609-60657
IN-20220609-52942
IN-20220609-52943
IN-20220609-52944
IN-20220609-52946
IN-20220609-52947
IN-20220609-52948
IN-20220609-52950
IN-20220609-52952
IN-20220609-52953
IN-20220609-52954
IN-20220609-52955
IN-20220609-52957
IN-20220609-52958
IN-20220609-52960
IN-20220609-52961
IN-20220609-52977
IN-20220609-52978
IN-20220609-52979
IN-20220609-52980
IN-20220609-52981
IN-20220609-52982
IN-20220609-52983
IN-20220609-52984
IN-20220609-52985
IN-20220609-52986
IN-20220609-52987
IN-20220609-52988
IN-20220609-52990
IN-20220609-52991
IN-20220609-52998
IN-20220609-53118
IN-20220609-53119
IN-20220609-53120
IN-20220609-53121
IN-20220609-53122
IN-20220609-53123
IN-20220609-53124
IN-20220609-53133
IN-20220609-53134
IN-20220609-53135
IN-20220609-53136
IN-20220609-53137
IN-20220609-53147
IN-20220609-53148
IN-20220609-53155
IN-20220609-53073
IN-20220609-53126
IN-20220609-57080
IN-20220609-53089
IN-20220609-53092
IN-20220609-53128
IN-20220609-53129
IN-20220609-57082
IN-20220609-57083
IN-20220609-52669
IN-20220609-52833
IN-20220609-53131
IN-20220609-53195
IN-20220609-11536
IN-20220609-14969
IN-20220609-52572
IN-20220609-52835
IN-20220609-52585
IN-20220609-52836
IN-20220609-52550
IN-20220609-52573
IN-20220609-52837
IN-20220609-4526
IN-20220609-4527
IN-20220609-4528
IN-20220609-4529
IN-20220609-44926
IN-20220609-44927
IN-20220609-44928
IN-20220609-44929
IN-20220609-53000
IN-20220609-53001
IN-20220609-53006
IN-20220609-53008
IN-20220609-53010
IN-20220609-53143
IN-20220609-53163
IN-20220609-53166
IN-20220609-53167
IN-20220609-53168
IN-20220609-57088
IN-20220609-57096
IN-20220609-57099
IN-20220609-57100
IN-20220609-57101
IN-20220609-4530
IN-20220609-4531
IN-20220609-4532
IN-20220609-4533
IN-20220609-4534
IN-20220609-13094
IN-20220609-44930
IN-20220609-44931
IN-20220609-44932
IN-20220609-45056
IN-20220609-52522
IN-20220609-52860
IN-20220609-53079
IN-20220609-53144
IN-20220609-53782
IN-20220609-53785
IN-20220609-57089
IN-20220609-57114
IN-20220609-52962
IN-20220609-52992
IN-20220609-53149
IN-20220609-53094
IN-20220609-53157
IN-20220609-57093
IN-20220609-52970
IN-20220609-53002
IN-20220609-53005
IN-20220609-53164
IN-20220609-53165
IN-20220609-53243
IN-20220609-57097
IN-20220609-57098
IN-20220609-57112
IN-20220609-52653
IN-20220609-52852
IN-20220609-53173
IN-20220609-53216
IN-20220609-52654
IN-20220609-52853
IN-20220609-53174
IN-20220609-53217
IN-20220609-52655
IN-20220609-52854
IN-20220609-53175
IN-20220609-53218
IN-20220609-52656
IN-20220609-52855
IN-20220609-53176
IN-20220609-53219
IN-20220609-52523
IN-20220609-52856
IN-20220609-59064
IN-20220609-52680
IN-20220609-52682
IN-20220609-52684
IN-20220609-52862
IN-20220609-52863
IN-20220609-52864
IN-20220609-53186
IN-20220609-53187
IN-20220609-53188
IN-20220609-53231
IN-20220609-53232
IN-20220609-53233
IN-20220609-52686
IN-20220609-52687
IN-20220609-52690
IN-20220609-52865
IN-20220609-52895
IN-20220609-52896
IN-20220609-53189
IN-20220609-53229
IN-20220609-53230
IN-20220609-53234
IN-20220609-53261
IN-20220609-53262
IN-20220609-52636
IN-20220609-52869
IN-20220609-53197
IN-20220609-53235
IN-20220609-52694
IN-20220609-52870
IN-20220609-53198
IN-20220609-53236
IN-20220609-52633
IN-20220609-52873
IN-20220609-53201
IN-20220609-53237
IN-20220609-52634
IN-20220609-52874
IN-20220609-53202
IN-20220609-53246
IN-20220609-52640
IN-20220609-52878
IN-20220609-53205
IN-20220609-53249
IN-20220609-52642
IN-20220609-52879
IN-20220609-53206
IN-20220609-53250
IN-20220609-52646
IN-20220609-52881
IN-20220609-53208
IN-20220609-53252
IN-20220609-52670
IN-20220609-52671
IN-20220609-52672
IN-20220609-52890
IN-20220609-52891
IN-20220609-52892
IN-20220609-53224
IN-20220609-53225
IN-20220609-53226
IN-20220609-53256
IN-20220609-53257
IN-20220609-53258
IN-20220609-52673
IN-20220609-52893
IN-20220609-53227
IN-20220609-53259
IN-20220609-52677
IN-20220609-52894
IN-20220609-53228
IN-20220609-53260
IN-20220609-53031
IN-20220609-53238
IN-20220609-57107
IN-20220609-52700
IN-20220609-52897
IN-20220609-53244
IN-20220609-53263
IN-20220609-52705
IN-20220609-52898
IN-20220609-53095
IN-20220609-53245
IN-20220609-52578
IN-20220609-52899
IN-20220609-52586
IN-20220609-52908
IN-20220609-53775
IN-20220609-53778
IN-20220609-53781
IN-20220609-53783
IN-20220609-53786
IN-20220609-57113
IN-20220609-57115
IN-20220609-57143
IN-20220609-57144
IN-20220609-57146
IN-20220609-53789
IN-20220609-53792
IN-20220609-53793
IN-20220609-53799
IN-20220609-53806
IN-20220609-57116
IN-20220609-57117
IN-20220609-57130
IN-20220609-57131
IN-20220609-57132
IN-20220609-52464
IN-20220609-52465
IN-20220609-52466
IN-20220609-52467
IN-20220609-52468
IN-20220609-52469
IN-20220609-52470
IN-20220609-53373
IN-20220609-53374
IN-20220609-53375
IN-20220609-53376
IN-20220609-53384
IN-20220609-53385
IN-20220609-53386
IN-20220609-53810
IN-20220609-57120
IN-20220609-52476
IN-20220609-52477
IN-20220609-52478
IN-20220609-52479
IN-20220609-52480
IN-20220609-52481
IN-20220609-52482
IN-20220609-53298
IN-20220609-53299
IN-20220609-53300
IN-20220609-53301
IN-20220609-53365
IN-20220609-53366
IN-20220609-53392
IN-20220609-53811
IN-20220609-53813
IN-20220609-53815
IN-20220609-53816
IN-20220609-53817
IN-20220609-53819
IN-20220609-53821
IN-20220609-57121
IN-20220609-57122
IN-20220609-57123
IN-20220609-57320
IN-20220609-57321
IN-20220609-57322
IN-20220609-57323
IN-20220609-53832
IN-20220609-53838
IN-20220609-53843
IN-20220609-53845
IN-20220609-57124
IN-20220609-57125
IN-20220609-57126
IN-20220609-57133
IN-20220609-53851
IN-20220609-53859
IN-20220609-53861
IN-20220609-57127
IN-20220609-57140
IN-20220609-57141
IN-20220609-53873
IN-20220609-57128
IN-20220609-53876
IN-20220609-57129
IN-20220609-53864
IN-20220609-53867
IN-20220609-53871
IN-20220609-57134
IN-20220609-57139
IN-20220609-57142
IN-20220609-53755
IN-20220609-53758
IN-20220609-53760
IN-20220609-57135
IN-20220609-57227
IN-20220609-57228
IN-20220609-52471
IN-20220609-52472
IN-20220609-52473
IN-20220609-52474
IN-20220609-52475
IN-20220609-53387
IN-20220609-53388
IN-20220609-53389
IN-20220609-53390
IN-20220609-53391
IN-20220609-53764
IN-20220609-53767
IN-20220609-53768
IN-20220609-53772
IN-20220609-57136
IN-20220609-57137
IN-20220609-57231
IN-20220609-57254
IN-20220609-54215
IN-20220609-57147
IN-20220609-54225
IN-20220609-54226
IN-20220609-54242
IN-20220609-54244
IN-20220609-54246
IN-20220609-57148
IN-20220609-57149
IN-20220609-57263
IN-20220609-57264
IN-20220609-57265
IN-20220609-54158
IN-20220609-54159
IN-20220609-57150
IN-20220609-57339
IN-20220609-54160
IN-20220609-54161
IN-20220609-54162
IN-20220609-54163
IN-20220609-54164
IN-20220609-57151
IN-20220609-57152
IN-20220609-57153
IN-20220609-57291
IN-20220609-57292
IN-20220609-54165
IN-20220609-54167
IN-20220609-54168
IN-20220609-57154
IN-20220609-57247
IN-20220609-57248
IN-20220609-54175
IN-20220609-54178
IN-20220609-57155
IN-20220609-57177
IN-20220609-11997
IN-20220609-13154
IN-20220609-54222
IN-20220609-54227
IN-20220609-54228
IN-20220609-54229
IN-20220609-57156
IN-20220609-57157
IN-20220609-57158
IN-20220609-57324
IN-20220609-54253
IN-20220609-54257
IN-20220609-54262
IN-20220609-54267
IN-20220609-54269
IN-20220609-54270
IN-20220609-54277
IN-20220609-54278
IN-20220609-57159
IN-20220609-57276
IN-20220609-57277
IN-20220609-57279
IN-20220609-57280
IN-20220609-57281
IN-20220609-57295
IN-20220609-57328
IN-20220609-54280
IN-20220609-54290
IN-20220609-54295
IN-20220609-54297
IN-20220609-54299
IN-20220609-54302
IN-20220609-54303
IN-20220609-57160
IN-20220609-57167
IN-20220609-57169
IN-20220609-57297
IN-20220609-57304
IN-20220609-57331
IN-20220609-13051
IN-20220609-45017
IN-20220609-54281
IN-20220609-54282
IN-20220609-54283
IN-20220609-54284
IN-20220609-57161
IN-20220609-57162
IN-20220609-57163
IN-20220609-57164
IN-20220609-12030
IN-20220609-12033
IN-20220609-12037
IN-20220609-28374
IN-20220609-28375
IN-20220609-54285
IN-20220609-54289
IN-20220609-54291
IN-20220609-54296
IN-20220609-54298
IN-20220609-54300
IN-20220609-54301
IN-20220609-56421
IN-20220609-57165
IN-20220609-57166
IN-20220609-57168
IN-20220609-57296
IN-20220609-57302
IN-20220609-57303
IN-20220609-57332
IN-20220609-3938
IN-20220609-4205
IN-20220609-13074
IN-20220609-45037
IN-20220609-54305
IN-20220609-57170
IN-20220609-3790
IN-20220609-4057
IN-20220609-4717
IN-20220609-4718
IN-20220609-44969
IN-20220609-44970
IN-20220609-54304
IN-20220609-54306
IN-20220609-54307
IN-20220609-54308
IN-20220609-54309
IN-20220609-54310
IN-20220609-57171
IN-20220609-57172
IN-20220609-57173
IN-20220609-57174
IN-20220609-57175
IN-20220609-57305
IN-20220609-13075
IN-20220609-45038
IN-20220609-54311
IN-20220609-54312
IN-20220609-57176
IN-20220609-57187
IN-20220609-54179
IN-20220609-54180
IN-20220609-57178
IN-20220609-57179
IN-20220609-54181
IN-20220609-54182
IN-20220609-54183
IN-20220609-57180
IN-20220609-57181
IN-20220609-57182
IN-20220609-54216
IN-20220609-54218
IN-20220609-54219
IN-20220609-57183
IN-20220609-57184
IN-20220609-57185
IN-20220609-54220
IN-20220609-54221
IN-20220609-54223
IN-20220609-54224
IN-20220609-57186
IN-20220609-57325
IN-20220609-57329
IN-20220609-57330
IN-20220609-13076
IN-20220609-45039
IN-20220609-52539
IN-20220609-52804
IN-20220609-54313
IN-20220609-57188
IN-20220609-54314
IN-20220609-56620
IN-20220609-56622
IN-20220609-56623
IN-20220609-57189
IN-20220609-57705
IN-20220609-57727
IN-20220609-57728
IN-20220609-11542
IN-20220609-14999
IN-20220609-44990
IN-20220609-45106
IN-20220609-50733
IN-20220609-52549
IN-20220609-52764
IN-20220609-52847
IN-20220609-54315
IN-20220609-57190
IN-20220609-54184
IN-20220609-54185
IN-20220609-54186
IN-20220609-57191
IN-20220609-57192
IN-20220609-57193
IN-20220609-54197
IN-20220609-54199
IN-20220609-54200
IN-20220609-54202
IN-20220609-54203
IN-20220609-54204
IN-20220609-54206
IN-20220609-54207
IN-20220609-54208
IN-20220609-54209
IN-20220609-54210
IN-20220609-54211
IN-20220609-54212
IN-20220609-54213
IN-20220609-57194
IN-20220609-57195
IN-20220609-57196
IN-20220609-57197
IN-20220609-57198
IN-20220609-57298
IN-20220609-57299
IN-20220609-57300
IN-20220609-57301
IN-20220609-57306
IN-20220609-57318
IN-20220609-57319
IN-20220609-57326
IN-20220609-57327
IN-20220609-54061
IN-20220609-54062
IN-20220609-54063
IN-20220609-57199
IN-20220609-57200
IN-20220609-57315
IN-20220609-54064
IN-20220609-54065
IN-20220609-57201
IN-20220609-57202
IN-20220609-54066
IN-20220609-54067
IN-20220609-54068
IN-20220609-54069
IN-20220609-54070
IN-20220609-57203
IN-20220609-57204
IN-20220609-57205
IN-20220609-57206
IN-20220609-57207
IN-20220609-54021
IN-20220609-54022
IN-20220609-54023
IN-20220609-54024
IN-20220609-57208
IN-20220609-57209
IN-20220609-57310
IN-20220609-57311
IN-20220609-49671
IN-20220609-54025
IN-20220609-57210
IN-20220609-54026
IN-20220609-54032
IN-20220609-54040
IN-20220609-57211
IN-20220609-57212
IN-20220609-57213
IN-20220609-54052
IN-20220609-57216
IN-20220609-54071
IN-20220609-54072
IN-20220609-54073
IN-20220609-54074
IN-20220609-57217
IN-20220609-57218
IN-20220609-57219
IN-20220609-57220
IN-20220609-54075
IN-20220609-54076
IN-20220609-54077
IN-20220609-57221
IN-20220609-57222
IN-20220609-57223
IN-20220609-54078
IN-20220609-54090
IN-20220609-57224
IN-20220609-57225
IN-20220609-53753
IN-20220609-57226
IN-20220609-53762
IN-20220609-53763
IN-20220609-57229
IN-20220609-57230
IN-20220609-53888
IN-20220609-53890
IN-20220609-57232
IN-20220609-57317
IN-20220609-53891
IN-20220609-53892
IN-20220609-57233
IN-20220609-57234
IN-20220609-53893
IN-20220609-53894
IN-20220609-53895
IN-20220609-57235
IN-20220609-57236
IN-20220609-57237
IN-20220609-53896
IN-20220609-53897
IN-20220609-57238
IN-20220609-57239
IN-20220609-54091
IN-20220609-54094
IN-20220609-54097
IN-20220609-57240
IN-20220609-57268
IN-20220609-57269
IN-20220609-54110
IN-20220609-54111
IN-20220609-57241
IN-20220609-57275
IN-20220609-54112
IN-20220609-54113
IN-20220609-54114
IN-20220609-54115
IN-20220609-57242
IN-20220609-57243
IN-20220609-57244
IN-20220609-57245
IN-20220609-54116
IN-20220609-57246
IN-20220609-52483
IN-20220609-53367
IN-20220609-54169
IN-20220609-54170
IN-20220609-54171
IN-20220609-57249
IN-20220609-57258
IN-20220609-57259
IN-20220609-53880
IN-20220609-53884
IN-20220609-57250
IN-20220609-57316
IN-20220609-54120
IN-20220609-54124
IN-20220609-54125
IN-20220609-54126
IN-20220609-57251
IN-20220609-57333
IN-20220609-57334
IN-20220609-57335
IN-20220609-54130
IN-20220609-54131
IN-20220609-57252
IN-20220609-57253
IN-20220609-54172
IN-20220609-57260
IN-20220609-54173
IN-20220609-54174
IN-20220609-57261
IN-20220609-57262
IN-20220609-54146
IN-20220609-54148
IN-20220609-54149
IN-20220609-54150
IN-20220609-57266
IN-20220609-57267
IN-20220609-57285
IN-20220609-57336
IN-20220609-54098
IN-20220609-54102
IN-20220609-57270
IN-20220609-57271
IN-20220609-54106
IN-20220609-54108
IN-20220609-54109
IN-20220609-57272
IN-20220609-57273
IN-20220609-57274
IN-20220609-13050
IN-20220609-15122
IN-20220609-45016
IN-20220609-54261
IN-20220609-54271
IN-20220609-54272
IN-20220609-54274
IN-20220609-54275
IN-20220609-54276
IN-20220609-57278
IN-20220609-57282
IN-20220609-57283
IN-20220609-57284
IN-20220609-57293
IN-20220609-57294
IN-20220609-54151
IN-20220609-54152
IN-20220609-54153
IN-20220609-54154
IN-20220609-57286
IN-20220609-57287
IN-20220609-57288
IN-20220609-57337
IN-20220609-54155
IN-20220609-57289
IN-20220609-54156
IN-20220609-54157
IN-20220609-57290
IN-20220609-57338
IN-20220609-54019
IN-20220609-54020
IN-20220609-57308
IN-20220609-57309
IN-20220609-54055
IN-20220609-54059
IN-20220609-54060
IN-20220609-57312
IN-20220609-57313
IN-20220609-57314
IN-20220609-11551
IN-20220609-13079
IN-20220609-44995
IN-20220609-45042
IN-20220609-52538
IN-20220609-52900
IN-20220609-55170
IN-20220609-55171
IN-20220609-55172
IN-20220609-55174
IN-20220609-55177
IN-20220609-57354
IN-20220609-57355
IN-20220609-57356
IN-20220609-57357
IN-20220609-57358
IN-20220609-12820
IN-20220609-55233
IN-20220609-57366
IN-20220609-55234
IN-20220609-55235
IN-20220609-55236
IN-20220609-55238
IN-20220609-55244
IN-20220609-55245
IN-20220609-56094
IN-20220609-56116
IN-20220609-56117
IN-20220609-57367
IN-20220609-57368
IN-20220609-57369
IN-20220609-57370
IN-20220609-57371
IN-20220609-57372
IN-20220609-57812
IN-20220609-57813
IN-20220609-57835
IN-20220609-54686
IN-20220609-57374
IN-20220609-13077
IN-20220609-45040
IN-20220609-54687
IN-20220609-55165
IN-20220609-55166
IN-20220609-55167
IN-20220609-55168
IN-20220609-57375
IN-20220609-57380
IN-20220609-57381
IN-20220609-57382
IN-20220609-57383
IN-20220609-13078
IN-20220609-45041
IN-20220609-55169
IN-20220609-56679
IN-20220609-57384
IN-20220609-57658
IN-20220609-12767
IN-20220609-12768
IN-20220609-49929
IN-20220609-55055
IN-20220609-56423
IN-20220609-57396
IN-20220609-56886
IN-20220609-56887
IN-20220609-56888
IN-20220609-57413
IN-20220609-57650
IN-20220609-57651
IN-20220609-56895
IN-20220609-56899
IN-20220609-56900
IN-20220609-57415
IN-20220609-57416
IN-20220609-57417
IN-20220609-56902
IN-20220609-56904
IN-20220609-57418
IN-20220609-57419
IN-20220609-56775
IN-20220609-56776
IN-20220609-56779
IN-20220609-56782
IN-20220609-56784
IN-20220609-56786
IN-20220609-56788
IN-20220609-57455
IN-20220609-57457
IN-20220609-57458
IN-20220609-57460
IN-20220609-57806
IN-20220609-57807
IN-20220609-57817
IN-20220609-13053
IN-20220609-45019
IN-20220609-56758
IN-20220609-56763
IN-20220609-56765
IN-20220609-56767
IN-20220609-56769
IN-20220609-56771
IN-20220609-56772
IN-20220609-56773
IN-20220609-56777
IN-20220609-56778
IN-20220609-56780
IN-20220609-56781
IN-20220609-56783
IN-20220609-56785
IN-20220609-56787
IN-20220609-56789
IN-20220609-56792
IN-20220609-56794
IN-20220609-56795
IN-20220609-57456
IN-20220609-57459
IN-20220609-57461
IN-20220609-57464
IN-20220609-57618
IN-20220609-57628
IN-20220609-57630
IN-20220609-57632
IN-20220609-57634
IN-20220609-57635
IN-20220609-57636
IN-20220609-57814
IN-20220609-57815
IN-20220609-57816
IN-20220609-57818
IN-20220609-57819
IN-20220609-57821
IN-20220609-57822
IN-20220609-57860
IN-20220609-56790
IN-20220609-56791
IN-20220609-57462
IN-20220609-57463
IN-20220609-56798
IN-20220609-56801
IN-20220609-56803
IN-20220609-56804
IN-20220609-56805
IN-20220609-57478
IN-20220609-57479
IN-20220609-57846
IN-20220609-57847
IN-20220609-57848
IN-20220609-56807
IN-20220609-56810
IN-20220609-57480
IN-20220609-57483
IN-20220609-56806
IN-20220609-56808
IN-20220609-56809
IN-20220609-57481
IN-20220609-57482
IN-20220609-57849
IN-20220609-56811
IN-20220609-56819
IN-20220609-56841
IN-20220609-57484
IN-20220609-57502
IN-20220609-57714
IN-20220609-58152
IN-20220609-58153
IN-20220609-58154
IN-20220609-58155
IN-20220609-15032
IN-20220609-45115
IN-20220609-56812
IN-20220609-56813
IN-20220609-56814
IN-20220609-56815
IN-20220609-56816
IN-20220609-56817
IN-20220609-56818
IN-20220609-57485
IN-20220609-57486
IN-20220609-57487
IN-20220609-57488
IN-20220609-57850
IN-20220609-57851
IN-20220609-57852
IN-20220608-36694
IN-20220608-37468
IN-20220608-42005
IN-20220608-42009
IN-20220608-42011
IN-20220608-42013
IN-20220608-42016
IN-20220608-47267
IN-20220608-47268
IN-20220608-47269
IN-20220608-47270
IN-20220608-47271
IN-20220608-47272
IN-20220608-47273
IN-20220608-47274
IN-20220608-47275
IN-20220608-47276
IN-20220608-47277
IN-20220608-47279
IN-20220608-47280
IN-20220608-47281
IN-20220608-47282
IN-20220608-47283
IN-20220608-47284
IN-20220608-47286
IN-20220608-47287
IN-20220608-47288
IN-20220608-47289
IN-20220608-47290
IN-20220608-47291
IN-20220608-47292
IN-20220608-47293
IN-20220608-47294
IN-20220608-47295
IN-20220608-47296
IN-20220608-47297
IN-20220608-47299
IN-20220608-47300
IN-20220608-47301
IN-20220608-47302
IN-20220608-47304
IN-20220608-47351
IN-20220608-47352
IN-20220608-47353
IN-20220608-47354
IN-20220608-47355
IN-20220608-47356
IN-20220608-47357
IN-20220608-47358
IN-20220608-47359
IN-20220608-47360
IN-20220608-47361
IN-20220608-47362
IN-20220608-47363
IN-20220608-47364
IN-20220608-47365
IN-20220608-47366
IN-20220608-47367
IN-20220608-47368
IN-20220608-47369
IN-20220608-47370
IN-20220608-47371
IN-20220608-47372
IN-20220608-47373
IN-20220608-47374
IN-20220608-47375
IN-20220608-47376
IN-20220608-47377
IN-20220608-47378
IN-20220608-47379
IN-20220608-47380
IN-20220608-47381
IN-20220608-47382
IN-20220608-47383
IN-20220609-4339
IN-20220609-4385
IN-20220609-5165
IN-20220609-13416
IN-20220609-14589
IN-20220609-28984
IN-20220609-30929
IN-20220609-32870
IN-20220609-53909
IN-20220609-56488
IN-20220609-57500
IN-20220609-56152
IN-20220609-56153
IN-20220609-56154
IN-20220609-56155
IN-20220609-56158
IN-20220609-56159
IN-20220609-56162
IN-20220609-57505
IN-20220609-57506
IN-20220609-57507
IN-20220609-57524
IN-20220609-57525
IN-20220609-57526
IN-20220609-57527
IN-20220609-56264
IN-20220609-56267
IN-20220609-56270
IN-20220609-56274
IN-20220609-57522
IN-20220609-57523
IN-20220609-57600
IN-20220609-57601
IN-20220609-56166
IN-20220609-56172
IN-20220609-56173
IN-20220609-56174
IN-20220609-57528
IN-20220609-57542
IN-20220609-57543
IN-20220609-57544
IN-20220609-56193
IN-20220609-56197
IN-20220609-56201
IN-20220609-56203
IN-20220609-56204
IN-20220609-56206
IN-20220609-56207
IN-20220609-57560
IN-20220609-57743
IN-20220609-57744
IN-20220609-57757
IN-20220609-57758
IN-20220609-57773
IN-20220609-57774
IN-20220609-56209
IN-20220609-56210
IN-20220609-56211
IN-20220609-56212
IN-20220609-57562
IN-20220609-57563
IN-20220609-57564
IN-20220609-57759
IN-20220609-56231
IN-20220609-56238
IN-20220609-56241
IN-20220609-56249
IN-20220609-57580
IN-20220609-57795
IN-20220609-57797
IN-20220609-57836
IN-20220609-3878
IN-20220609-4145
IN-20220609-12832
IN-20220609-13155
IN-20220609-15037
IN-20220609-44998
IN-20220609-45118
IN-20220609-50725
IN-20220609-52790
IN-20220609-56250
IN-20220609-56255
IN-20220609-56258
IN-20220609-56259
IN-20220609-56260
IN-20220609-57581
IN-20220609-57582
IN-20220609-57583
IN-20220609-57598
IN-20220609-57599
IN-20220609-56629
IN-20220609-57597
IN-20220609-56320
IN-20220609-56325
IN-20220609-56334
IN-20220609-56342
IN-20220609-56343
IN-20220609-56345
IN-20220609-57607
IN-20220609-57608
IN-20220609-57654
IN-20220609-57656
IN-20220609-57755
IN-20220609-57756
IN-20220609-56349
IN-20220609-56352
IN-20220609-56353
IN-20220609-56356
IN-20220609-56359
IN-20220609-57609
IN-20220609-57611
IN-20220609-57612
IN-20220609-57613
IN-20220609-57764
IN-20220609-56351
IN-20220609-56371
IN-20220609-56373
IN-20220609-56376
IN-20220609-57610
IN-20220609-57625
IN-20220609-57670
IN-20220609-57671
IN-20220609-56357
IN-20220609-56363
IN-20220609-57614
IN-20220609-57623
IN-20220609-56754
IN-20220609-56755
IN-20220609-56756
IN-20220609-57617
IN-20220609-57832
IN-20220609-57833
IN-20220609-15114
IN-20220609-15116
IN-20220609-45130
IN-20220609-45131
IN-20220609-56277
IN-20220609-56282
IN-20220609-56285
IN-20220609-56288
IN-20220609-56291
IN-20220609-56294
IN-20220609-57620
IN-20220609-57761
IN-20220609-57810
IN-20220609-57811
IN-20220609-57824
IN-20220609-57854
IN-20220609-13242
IN-20220609-45063
IN-20220609-56312
IN-20220609-57621
IN-20220609-56365
IN-20220609-56366
IN-20220609-56367
IN-20220609-56369
IN-20220609-56370
IN-20220609-57624
IN-20220609-57668
IN-20220609-57669
IN-20220609-57678
IN-20220609-57679
IN-20220609-56757
IN-20220609-56759
IN-20220609-56760
IN-20220609-56761
IN-20220609-56762
IN-20220609-56764
IN-20220609-56766
IN-20220609-56768
IN-20220609-56770
IN-20220609-57619
IN-20220609-57629
IN-20220609-57631
IN-20220609-57633
IN-20220609-57834
IN-20220609-57857
IN-20220609-57858
IN-20220609-57859
IN-20220609-57861
IN-20220609-56302
IN-20220609-56305
IN-20220609-56306
IN-20220609-56308
IN-20220609-56309
IN-20220609-57639
IN-20220609-57640
IN-20220609-57871
IN-20220609-57872
IN-20220609-57873
IN-20220609-56727
IN-20220609-56732
IN-20220609-56736
IN-20220609-56737
IN-20220609-56739
IN-20220609-56740
IN-20220609-57644
IN-20220609-57647
IN-20220609-57648
IN-20220609-57782
IN-20220609-57801
IN-20220609-57802
IN-20220609-28346
IN-20220609-28347
IN-20220609-28348
IN-20220609-28349
IN-20220609-28350
IN-20220609-28351
IN-20220609-28352
IN-20220609-28353
IN-20220609-28354
IN-20220609-28355
IN-20220609-28356
IN-20220609-28357
IN-20220609-28358
IN-20220609-56702
IN-20220609-56706
IN-20220609-56707
IN-20220609-56714
IN-20220609-56718
IN-20220609-56724
IN-20220609-56729
IN-20220609-56733
IN-20220609-56738
IN-20220609-57645
IN-20220609-57649
IN-20220609-57776
IN-20220609-57778
IN-20220609-57783
IN-20220609-57862
IN-20220609-57863
IN-20220609-57864
IN-20220609-57869
IN-20220609-60141
IN-20220609-56722
IN-20220609-56725
IN-20220609-56735
IN-20220609-57646
IN-20220609-57779
IN-20220609-57781
IN-20220609-56348
IN-20220609-57657
IN-20220609-56675
IN-20220609-56682
IN-20220609-56685
IN-20220609-56688
IN-20220609-56693
IN-20220609-57661
IN-20220609-57663
IN-20220609-57665
IN-20220609-57667
IN-20220609-57738
IN-20220609-56687
IN-20220609-56691
IN-20220609-57664
IN-20220609-57746
IN-20220609-11385
IN-20220609-14789
IN-20220609-56690
IN-20220609-57666
IN-20220609-56624
IN-20220609-56625
IN-20220609-56626
IN-20220609-57673
IN-20220609-57729
IN-20220609-57730
IN-20220609-56634
IN-20220609-56635
IN-20220609-56636
IN-20220609-56637
IN-20220609-56638
IN-20220609-57674
IN-20220609-57789
IN-20220609-57790
IN-20220609-57791
IN-20220609-57792
IN-20220609-56639
IN-20220609-56642
IN-20220609-56643
IN-20220609-56644
IN-20220609-56645
IN-20220609-57615
IN-20220609-57675
IN-20220609-57677
IN-20220609-57793
IN-20220609-57794
IN-20220609-56640
IN-20220609-57676
IN-20220609-13243
IN-20220609-45064
IN-20220609-50747
IN-20220609-52784
IN-20220609-56378
IN-20220609-56379
IN-20220609-56380
IN-20220609-56381
IN-20220609-57680
IN-20220609-57681
IN-20220609-57682
IN-20220609-57683
IN-20220609-13244
IN-20220609-45065
IN-20220609-56594
IN-20220609-56596
IN-20220609-56610
IN-20220609-56611
IN-20220609-56614
IN-20220609-57684
IN-20220609-57702
IN-20220609-57703
IN-20220609-57722
IN-20220609-57724
IN-20220609-56646
IN-20220609-56648
IN-20220609-56649
IN-20220609-56650
IN-20220609-57687
IN-20220609-57688
IN-20220609-57689
IN-20220609-57798
IN-20220609-56651
IN-20220609-56652
IN-20220609-56653
IN-20220609-57690
IN-20220609-57691
IN-20220609-57692
IN-20220609-56654
IN-20220609-56657
IN-20220609-56659
IN-20220609-57693
IN-20220609-57734
IN-20220609-57799
IN-20220609-12747
IN-20220609-12748
IN-20220609-56662
IN-20220609-57698
IN-20220609-57901
IN-20220609-57906
IN-20220609-57909
IN-20220609-13285
IN-20220609-13286
IN-20220609-13287
IN-20220609-15046
IN-20220609-45124
IN-20220609-57918
IN-20220609-57923
IN-20220609-57924
IN-20220609-57925
IN-20220609-57926
IN-20220609-57928
IN-20220609-57929
IN-20220609-57930
IN-20220609-57931
IN-20220609-57943
IN-20220609-57945
IN-20220609-57946
IN-20220609-57948
IN-20220609-57949
IN-20220609-57950
IN-20220609-56600
IN-20220609-56605
IN-20220609-56606
IN-20220609-56608
IN-20220609-57699
IN-20220609-57700
IN-20220609-57701
IN-20220609-57723
IN-20220609-3883
IN-20220609-4150
IN-20220609-15016
IN-20220609-45110
IN-20220609-57028
IN-20220609-57031
IN-20220609-57033
IN-20220609-57034
IN-20220609-57036
IN-20220609-57707
IN-20220609-57715
IN-20220609-57716
IN-20220609-57717
IN-20220609-57718
IN-20220609-56664
IN-20220609-56665
IN-20220609-56666
IN-20220609-56667
IN-20220609-57709
IN-20220609-57710
IN-20220609-57711
IN-20220609-57712
IN-20220609-57037
IN-20220609-57719
IN-20220609-57039
IN-20220609-57041
IN-20220609-57720
IN-20220609-57721
IN-20220609-56186
IN-20220609-56198
IN-20220609-57725
IN-20220609-58529
IN-20220609-58532
IN-20220609-58534
IN-20220609-58537
IN-20220609-56627
IN-20220609-57731
IN-20220609-56628
IN-20220609-57733
IN-20220609-56676
IN-20220609-57745
IN-20220609-56699
IN-20220609-57749
IN-20220609-3799
IN-20220609-4066
IN-20220609-15061
IN-20220609-45125
IN-20220609-57958
IN-20220609-57959
IN-20220609-57960
IN-20220609-57961
IN-20220609-57962
IN-20220609-57963
IN-20220609-57964
IN-20220609-57965
IN-20220609-56347
IN-20220609-57762
IN-20220609-13301
IN-20220609-56709
IN-20220609-56716
IN-20220609-57767
IN-20220609-57777
IN-20220609-56712
IN-20220609-57768
IN-20220609-14729
IN-20220609-56713
IN-20220609-57769
IN-20220609-15042
IN-20220609-45121
IN-20220609-56175
IN-20220609-56184
IN-20220609-57545
IN-20220609-57770
IN-20220609-56723
IN-20220609-57780
IN-20220609-58000
IN-20220609-58001
IN-20220609-58002
IN-20220609-58003
IN-20220609-58004
IN-20220609-58005
IN-20220609-58006
IN-20220609-58007
IN-20220609-58008
IN-20220609-58009
IN-20220609-58010
IN-20220609-58011
IN-20220609-58012
IN-20220609-58013
IN-20220609-12936
IN-20220609-45006
IN-20220609-58014
IN-20220609-58015
IN-20220609-58016
IN-20220609-58017
IN-20220609-58018
IN-20220609-58019
IN-20220609-56630
IN-20220609-56631
IN-20220609-56632
IN-20220609-56633
IN-20220609-57785
IN-20220609-57786
IN-20220609-57787
IN-20220609-57788
IN-20220609-58021
IN-20220609-58022
IN-20220609-58023
IN-20220609-58024
IN-20220609-56741
IN-20220609-57803
IN-20220609-56745
IN-20220609-57804
IN-20220609-13052
IN-20220609-28361
IN-20220609-28362
IN-20220609-28363
IN-20220609-28364
IN-20220609-28365
IN-20220609-45018
IN-20220609-52544
IN-20220609-52902
IN-20220609-56743
IN-20220609-56746
IN-20220609-56747
IN-20220609-56748
IN-20220609-56749
IN-20220609-56750
IN-20220609-56751
IN-20220609-56752
IN-20220609-56753
IN-20220609-57616
IN-20220609-57805
IN-20220609-57825
IN-20220609-57826
IN-20220609-57827
IN-20220609-57828
IN-20220609-57829
IN-20220609-57830
IN-20220609-57831
IN-20220609-13054
IN-20220609-56796
IN-20220609-56797
IN-20220609-56799
IN-20220609-56800
IN-20220609-56802
IN-20220609-57823
IN-20220609-57842
IN-20220609-57843
IN-20220609-57844
IN-20220609-57845
IN-20220609-56671
IN-20220609-57840
IN-20220609-12716
IN-20220609-56298
IN-20220609-57856
IN-20220609-56710
IN-20220609-57865
IN-20220609-56720
IN-20220609-57868
IN-20220609-12769
IN-20220609-58140
IN-20220609-15036
IN-20220609-45117
IN-20220609-58141
IN-20220609-13057
IN-20220609-45022
IN-20220609-52542
IN-20220609-52901
IN-20220609-58163
IN-20220609-58165
IN-20220609-58166
IN-20220609-58167
IN-20220609-58168
IN-20220609-58172
IN-20220609-58213
IN-20220609-58215
IN-20220609-58217
IN-20220609-58222
IN-20220609-58232
IN-20220609-58238
IN-20220609-58240
IN-20220609-58242
IN-20220609-58243
IN-20220609-58248
IN-20220609-58249
IN-20220609-11974
IN-20220609-11977
IN-20220609-11981
IN-20220609-11983
IN-20220609-11985
IN-20220609-11986
IN-20220609-11987
IN-20220609-44996
IN-20220609-58320
IN-20220609-58321
IN-20220609-58322
IN-20220609-58301
IN-20220609-58306
IN-20220609-58309
IN-20220609-58311
IN-20220609-58313
IN-20220609-58314
IN-20220609-58315
IN-20220609-13058
IN-20220609-45023
IN-20220609-58251
IN-20220609-58255
IN-20220609-58257
IN-20220609-58258
IN-20220609-58262
IN-20220609-58265
IN-20220609-58266
IN-20220609-58268
IN-20220609-58272
IN-20220609-58274
IN-20220609-58277
IN-20220609-58278
IN-20220609-58279
IN-20220609-58286
IN-20220609-58289
IN-20220609-58293
IN-20220609-58294
IN-20220609-58299
IN-20220609-58142
IN-20220609-15033
IN-20220609-45116
IN-20220609-50726
IN-20220609-52761
IN-20220609-58143
IN-20220609-58144
IN-20220609-12823
IN-20220609-58145
IN-20220609-58146
IN-20220609-58147
IN-20220609-58148
IN-20220609-13047
IN-20220609-45014
IN-20220609-57879
IN-20220609-57880
IN-20220609-57882
IN-20220609-57889
IN-20220609-57890
IN-20220609-57891
IN-20220609-57892
IN-20220609-57894
IN-20220609-57895
IN-20220609-57897
IN-20220609-57967
IN-20220609-12720
IN-20220609-57968
IN-20220609-57969
IN-20220609-57970
IN-20220609-57971
IN-20220609-57972
IN-20220609-57973
IN-20220609-57974
IN-20220609-57975
IN-20220609-57976
IN-20220609-57977
IN-20220609-58053
IN-20220609-58851
IN-20220609-13068
IN-20220609-13069
IN-20220609-45031
IN-20220609-45032
IN-20220609-58626
IN-20220609-58632
IN-20220609-58635
IN-20220609-58639
IN-20220609-58642
IN-20220609-58645
IN-20220609-58647
IN-20220609-58650
IN-20220609-13067
IN-20220609-45030
IN-20220609-58653
IN-20220609-58668
IN-20220609-58672
IN-20220609-58673
IN-20220609-15101
IN-20220609-45129
IN-20220609-58618
IN-20220609-58619
IN-20220609-58623
IN-20220609-13063
IN-20220609-45027
IN-20220609-58538
IN-20220609-58539
IN-20220609-58540
IN-20220609-58541
IN-20220609-58543
IN-20220609-58544
IN-20220609-58545
IN-20220609-58546
IN-20220609-13064
IN-20220609-45028
IN-20220609-58551
IN-20220609-58566
IN-20220609-58570
IN-20220609-58572
IN-20220609-58582
IN-20220609-58586
IN-20220609-58590
IN-20220609-58511
IN-20220609-58513
IN-20220609-58514
IN-20220609-58515
IN-20220609-58516
IN-20220609-58518
IN-20220609-58519
IN-20220609-58520
IN-20220609-58523
IN-20220609-58525
IN-20220609-58527
IN-20220609-58711
IN-20220609-58714
IN-20220609-58716
IN-20220609-58718
IN-20220609-58720
IN-20220609-58723
IN-20220609-13071
IN-20220609-45034
IN-20220609-52540
IN-20220609-52905
IN-20220609-58730
IN-20220609-58738
IN-20220609-58739
IN-20220609-58741
IN-20220609-58743
IN-20220609-58744
IN-20220609-58745
IN-20220609-58748
IN-20220609-58749
IN-20220609-58752
IN-20220609-58754
IN-20220609-58757
IN-20220609-3836
IN-20220609-4103
IN-20220609-13048
IN-20220609-15120
IN-20220609-57875
IN-20220609-57877
IN-20220609-58149
IN-20220609-58150
IN-20220609-58151
IN-20220609-58171
IN-20220609-58173
IN-20220609-58174
IN-20220609-13061
IN-20220609-45025
IN-20220609-52541
IN-20220609-52906
IN-20220609-58510
IN-20220609-58512
IN-20220609-58517
IN-20220609-58522
IN-20220609-58524
IN-20220609-58536
IN-20220609-58929
IN-20220609-58932
IN-20220609-58934
IN-20220609-58936
IN-20220609-58942
IN-20220609-58948
IN-20220609-58950
IN-20220609-12712
IN-20220609-12713
IN-20220609-12714
IN-20220609-58782
IN-20220609-58784
IN-20220609-58927
IN-20220609-58928
IN-20220609-58780
IN-20220609-14984
IN-20220609-45099
IN-20220609-58790
IN-20220609-58792
IN-20220609-58793
IN-20220609-58799
IN-20220609-58806
IN-20220609-58825
IN-20220609-58827
IN-20220609-58834
IN-20220609-13066
IN-20220609-58598
IN-20220609-13070
IN-20220609-45033
IN-20220609-58726
IN-20220609-58727
IN-20220609-58728
IN-20220609-58729
IN-20220609-58607
IN-20220609-58610
IN-20220609-58613
IN-20220609-58615
IN-20220609-3816
IN-20220609-4083
IN-20220609-58478
IN-20220609-58479
IN-20220609-58480
IN-20220609-58481
IN-20220609-58482
IN-20220609-58483
IN-20220609-58485
IN-20220609-58487
IN-20220609-58490
IN-20220609-12703
IN-20220609-58839
IN-20220609-58843
IN-20220609-58845
IN-20220609-58847
IN-20220609-58850
IN-20220609-58853
IN-20220609-58492
IN-20220609-13055
IN-20220609-45020
IN-20220609-52543
IN-20220609-52907
IN-20220609-58156
IN-20220609-58157
IN-20220609-13056
IN-20220609-45021
IN-20220609-58158
IN-20220609-58493
IN-20220609-3819
IN-20220609-4086
IN-20220609-58159
IN-20220609-58160
IN-20220609-58161
IN-20220609-58162
IN-20220609-57966
IN-20220609-59206
IN-20220609-59207
IN-20220609-59208
IN-20220609-59210
IN-20220609-59211
IN-20220609-59212
IN-20220609-59214
IN-20220609-59215
IN-20220609-59216
IN-20220609-59217
IN-20220609-59282
IN-20220609-59283
IN-20220609-59285
IN-20220609-58994
IN-20220609-58995
IN-20220609-58996
IN-20220609-58997
IN-20220609-58998
IN-20220609-14971
IN-20220609-14973
IN-20220609-14975
IN-20220609-45091
IN-20220609-45092
IN-20220609-45093
IN-20220609-58999
IN-20220609-59000
IN-20220609-59001
IN-20220609-59002
IN-20220609-59003
IN-20220609-59004
IN-20220609-59005
IN-20220609-14970
IN-20220609-59009
IN-20220609-59010
IN-20220609-59011
IN-20220609-59012
IN-20220609-59013
IN-20220609-59014
IN-20220609-59017
IN-20220609-13185
IN-20220609-13186
IN-20220609-13187
IN-20220609-59088
IN-20220609-59024
IN-20220609-59034
IN-20220609-59045
IN-20220609-58980
IN-20220609-58981
IN-20220609-58982
IN-20220609-58983
IN-20220609-58984
IN-20220609-58985
IN-20220609-58986
IN-20220609-58987
IN-20220609-58988
IN-20220609-58989
IN-20220609-58990
IN-20220609-58991
IN-20220609-58992
IN-20220609-58993
IN-20220609-59223
IN-20220609-59225
IN-20220609-59227
IN-20220609-59228
IN-20220609-59229
IN-20220609-59230
IN-20220609-59231
IN-20220609-59232
IN-20220609-3871
IN-20220609-4138
IN-20220609-4614
IN-20220609-4615
IN-20220609-4616
IN-20220609-4617
IN-20220609-4618
IN-20220609-4619
IN-20220609-4620
IN-20220609-4621
IN-20220609-5702
IN-20220609-5703
IN-20220609-15212
IN-20220609-29518
IN-20220609-29519
IN-20220609-31463
IN-20220609-31464
IN-20220609-33404
IN-20220609-33405
IN-20220609-44955
IN-20220609-44956
IN-20220609-44957
IN-20220609-44958
IN-20220609-44959
IN-20220609-44960
IN-20220609-44961
IN-20220609-44962
IN-20220609-45137
IN-20220609-58964
IN-20220609-58966
IN-20220609-58968
IN-20220609-58970
IN-20220609-58971
IN-20220609-58972
IN-20220609-58973
IN-20220609-58974
IN-20220609-58976
IN-20220609-59293
IN-20220609-59020
IN-20220609-59023
IN-20220609-59026
IN-20220609-59027
IN-20220609-59028
IN-20220609-59029
IN-20220609-59030
IN-20220609-59280
IN-20220609-59281
IN-20220609-59292
IN-20220609-59294
IN-20220609-59295
IN-20220609-59296
IN-20220609-59297
IN-20220609-59274
IN-20220609-59244
IN-20220609-59246
IN-20220609-59248
IN-20220609-59250
IN-20220609-59253
IN-20220609-59254
IN-20220609-59256
IN-20220609-59257
IN-20220609-59258
IN-20220609-59260
IN-20220609-59261
IN-20220609-59262
IN-20220609-59265
IN-20220609-59031
IN-20220609-59032
IN-20220609-59033
IN-20220609-59035
IN-20220609-59036
IN-20220609-59037
IN-20220609-12904
IN-20220609-59038
IN-20220609-59039
IN-20220609-59040
IN-20220609-59041
IN-20220609-59042
IN-20220609-59043
IN-20220609-59044
IN-20220609-4493
IN-20220609-4494
IN-20220609-4495
IN-20220609-13240
IN-20220609-28366
IN-20220609-28367
IN-20220609-28368
IN-20220609-28369
IN-20220609-28370
IN-20220609-44921
IN-20220609-44922
IN-20220609-45061
IN-20220609-58977
IN-20220609-59298
IN-20220609-59299
IN-20220609-59300
IN-20220609-59301
IN-20220609-59302
IN-20220609-14990
IN-20220609-45102
IN-20220609-50735
IN-20220609-52765
IN-20220609-58951
IN-20220609-58960
IN-20220609-58961
IN-20220609-58962
IN-20220609-59006
IN-20220609-59007
IN-20220609-59008
IN-20220609-4496
IN-20220609-4497
IN-20220609-4498
IN-20220609-4499
IN-20220609-13127
IN-20220609-13128
IN-20220609-13129
IN-20220609-13130
IN-20220609-44923
IN-20220609-44924
IN-20220609-44925
IN-20220609-50869
IN-20220609-52652
IN-20220609-52791
IN-20220609-52851
IN-20220609-53172
IN-20220609-53215
IN-20220609-59057
IN-20220609-59059
IN-20220609-59060
IN-20220609-59061
IN-20220609-59062
IN-20220609-59063
IN-20220609-59289
IN-20220609-59290
IN-20220609-59233
IN-20220609-59234
IN-20220609-59235
IN-20220609-59236
IN-20220609-59237
IN-20220609-59238
IN-20220609-59239
IN-20220609-59291
IN-20220609-59279
\ No newline at end of file
...@@ -5,9 +5,9 @@ spring.profiles.active:=dev ...@@ -5,9 +5,9 @@ spring.profiles.active:=dev
server.port=8081 server.port=8081
spring.http.encoding.force=true #spring.http.encoding.force=true
spring.http.encoding.charset=UTF-8 #spring.http.encoding.charset=UTF-8
spring.http.encoding.enabled=true #spring.http.encoding.enabled=true
spring.thymeleaf.cache=false spring.thymeleaf.cache=false
spring.thymeleaf.enabled=false spring.thymeleaf.enabled=false
...@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000 ...@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000
logging.level.root=info logging.level.root=info
logging.level.org.springframework.web=info logging.level.org.springframework.web=info
logginglevelorghibernate=info logging.level.org.hibernate=info
logging.config=classpath:logback-spring.xml logging.config=classpath:logback-spring.xml
kafka.consumer.task=0 0/2 * * * ? kafka.consumer.task=0 0/2 * * * ?
......
...@@ -35,7 +35,7 @@ PROXYID=1 ...@@ -35,7 +35,7 @@ PROXYID=1
#线程池大小 #线程池大小
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= D:\\chrome\\chromedriver.exe CHROMEDRIVE= E:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
...@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0 ...@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
redis.host=114.116.26.150 redis.host=114.116.26.150
redis.port=6379 redis.port=6379
redis.pass=zzsn9988 redis.pass=zzsn9988
#redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
...@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn ...@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
HUAWEICLOUD_AK= VEHN7D0TJ9316H8AHCAV HUAWEICLOUD_AK= VEHN7D0TJ9316H8AHCAV
HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY HUAWEICLOUD_SK= heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
IMGPATH= E:\\chrome\\img\\shot.png #IMGPATH= E:\\chrome\\img\\shot.png
IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\aa.txt
selenium.driver.cache=comm_selenium_driver_cache_1
......
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=114.115.236.206
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=clbzzsn
redis.timeout=10000 redis.timeout=10000
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
#redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论