提交 f314a48b 作者: liuweigang

采集代码更新

上级 076c1840
1.八万多个信息源中,无效信息源占比大,占用很大一部分资源;
2.动态、静态参数设置不准确,导致调用的采集器可能采集不到信息;
3.爬虫程序不完善,采用Selenium驱动操作浏览器,发生异常时没有关闭相应的资源,导致资源占用越来越多,最终引起爬虫中断。
1.爬虫向上反馈,根据采集情况,向上游反馈该条信息源是否有效;若无效则关闭该条信息源;减少无效调度,降低资源浪费;
2.向上反馈动态、静态参数设置是否正确,并根据实际情况重新设置参数;这样爬虫就不必每次都采用两种方式采集。
3.修改爬虫业务逻辑,抛出异常时,关闭相应的资源,释放服务器资源,防止服务器资源占用过多,导致程序异常。
4.降低爬虫程序内部相应的等待时间,加快效率。
...@@ -41,7 +41,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -41,7 +41,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
// loadSiteMsg(); // loadSiteMsg();
// loadSiteMsgLoc(); // loadSiteMsgLoc();
loadSiteMsgLoc2(); // loadSiteMsgLoc2();
// loadSiteMsgLoc3(); // loadSiteMsgLoc3();
// loadSiteMsgLoc4(); // loadSiteMsgLoc4();
// loadSiteMsgLoc5(); // loadSiteMsgLoc5();
...@@ -98,11 +98,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -98,11 +98,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539590279724441602\",\n" + " \"id\": \"1541605392350359554\",\n" +
" \"infoSourceCode\": \"IN-20220622-0012\",\n" + " \"infoSourceCode\": \"IN-20220628-0001\",\n" +
" \"webSiteName\": \"走出去情报\",\n" + " \"webSiteName\": \"审计署\",\n" +
" \"siteName\": \"走出去情报-最新\",\n" + " \"siteName\": \"审计署-法律法规\",\n" +
" \"siteUri\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" + " \"siteUri\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -112,36 +112,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -112,36 +112,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n" + " \"listUrl\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"div[class=\\\"item-text-content-title\\\"]\",\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": null,\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"FeedList\\\"]\",\n" + " \"infoBlockPosition\": \"div[class=\\\"list-box-dl\\\"]>span>dl\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": null,\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"text-title\\\"]>h1</exp></title>\",\n" + " \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>span[class=\\\"time\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>dd[class=\\\"fb-time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": \"<origin><exp>dd[class=\\\"ly-name\\\"]</exp></origin>\",\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[data-spm=\\\"content\\\"]</exp><subtraction>div[class=\\\"text-title\\\"]</subtraction></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[id=\\\"textSize\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -166,7 +166,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -166,7 +166,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"01 45 20 1/1 * ?\"\n" + " \"cron\": \"21 12 10 1/1 * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -182,11 +182,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -182,11 +182,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539588694743097346\",\n" + " \"id\": \"1541618011601838081\",\n" +
" \"infoSourceCode\": \"IN-20220622-0007\",\n" + " \"infoSourceCode\": \"IN-20220628-0002\",\n" +
" \"webSiteName\": \"新华丝路\",\n" + " \"webSiteName\": \"北京市审计局\",\n" +
" \"siteName\": \"新华丝路-投资资讯\",\n" + " \"siteName\": \"北京市审计局-法律法规\",\n" +
" \"siteUri\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n" + " \"siteUri\": \"http://sjj.beijing.gov.cn/zwxx/flfg/\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -196,36 +196,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -196,36 +196,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"https://www.imsilkroad.com/news/category/touzizixun?page=[page_num]\",\n" + " \"listUrl\": \"http://sjj.beijing.gov.cn/zwxx/flfg/\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"h5[class=\\\"text-xl\\\"]>a\",\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": \"\",\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"mb-3\\\"]>ul>li\",\n" + " \"infoBlockPosition\": \"ul[class=\\\"list\\\"]>li\",\n" +
" \"linkLocation\": \"h5[class=\\\"text-xl\\\"]>a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": \"https://www.imsilkroad.com/news/category/touzizixun?page=[page_num]\",\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": \"https://www.imsilkroad.com/news/category/touzizixun?page=[page_num]\",\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 1,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 1000,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"text-2xl md:text-4xl mb-4 font-song\\\"]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"title\\\"]>h1</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"pubdate\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>span:contains(来源)</exp></origin>\",\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -250,7 +250,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -250,7 +250,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"43 38 20 1/1 * ?\"\n" + " \"cron\": \"30 02 11 1/1 * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -266,11 +266,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -266,11 +266,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539586364907888642\",\n" + " \"id\": \"1541670628478623746\",\n" +
" \"infoSourceCode\": \"IN-20220622-0005\",\n" + " \"infoSourceCode\": \"IN-20220628-0003\",\n" +
" \"webSiteName\": \"环球网\",\n" + " \"webSiteName\": \"上海市审计厅\",\n" +
" \"siteName\": \"环球网- 国际新闻\",\n" + " \"siteName\": \"上海市审计厅-规范性文件\",\n" +
" \"siteUri\": \"https://world.huanqiu.com/\",\n" + " \"siteUri\": \"https://sjj.sh.gov.cn/zcwj_gfxwj/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -280,15 +280,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -280,15 +280,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"https://world.huanqiu.com/\",\n" + " \"listUrl\": \"https://sjj.sh.gov.cn/zcwj_gfxwj/index.html\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"div[class=\\\"con-txt\\\"]>h4\",\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": \"span[class=\\\"time\\\"]\",\n" + " \"informationPublishDate\": \"span\",\n" +
" \"informationSource\": \"span[class=\\\"original\\\"]\",\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[id=\\\"recommend\\\"]>li[class=\\\"list-item-txt\\\"]\",\n" + " \"infoBlockPosition\": \"ul[class=\\\"zfgk_area_list\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
...@@ -297,19 +297,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -297,19 +297,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"t-container-title\\\"]>h3</exp></title>\",\n" + " \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>p[class=\\\"time\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"l-con clear\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[id=\\\"ivs_content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -334,7 +334,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -334,7 +334,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"28 29 20 1/1 * ?\"\n" + " \"cron\": \"35 31 14 1/1 * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -350,11 +350,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -350,11 +350,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539586364907888642\",\n" + " \"id\": \"1541705220539490306\",\n" +
" \"infoSourceCode\": \"IN-20220622-0005\",\n" + " \"infoSourceCode\": \"IN-20220628-0004\",\n" +
" \"webSiteName\": \"环球网\",\n" + " \"webSiteName\": \"湖北省审计厅\",\n" +
" \"siteName\": \"环球网- 国际新闻\",\n" + " \"siteName\": \"湖北省审计厅-规范性文件\",\n" +
" \"siteUri\": \"https://world.huanqiu.com/\",\n" + " \"siteUri\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -364,15 +364,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -364,15 +364,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"https://world.huanqiu.com/\",\n" + " \"listUrl\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"div[class=\\\"con-txt\\\"]>h4\",\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": \"span[class=\\\"time\\\"]\",\n" + " \"informationPublishDate\": \"span\",\n" +
" \"informationSource\": \"span[class=\\\"original\\\"]\",\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[id=\\\"recommend\\\"]>li[class=\\\"list-item-txt\\\"]\",\n" + " \"infoBlockPosition\": \"ul[id=\\\"ulList\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
...@@ -381,19 +381,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -381,19 +381,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"t-container-title\\\"]>h3</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"article\\\"]>h2</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>p[class=\\\"time\\\"]</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": null,\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"l-con clear\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[id=\\\"article-box\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -418,7 +418,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -418,7 +418,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"28 29 20 1/1 * ?\"\n" + " \"cron\": \"02 49 16 1/1 * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -434,11 +434,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -434,11 +434,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539587536314081282\",\n" + " \"id\": \"1541722286336188418\",\n" +
" \"infoSourceCode\": \"IN-20220622-0006\",\n" + " \"infoSourceCode\": \"IN-20220628-0005\",\n" +
" \"webSiteName\": \"人民网\",\n" + " \"webSiteName\": \"审计署\",\n" +
" \"siteName\": \"人民网-滚动新闻\",\n" + " \"siteName\": \"审计署-审计要闻\",\n" +
" \"siteUri\": \"http://world.people.com.cn/GB/157278/index.html\",\n" + " \"siteUri\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -448,15 +448,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -448,15 +448,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"http://world.people.com.cn/GB/157278/index.html\",\n" + " \"listUrl\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": \"dt[class=\\\"fl\\\"]>a\",\n" +
" \"informationPublishDate\": \"i\",\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"ej_bor\\\"]>ul>li\",\n" + " \"infoBlockPosition\": \"span[id=\\\"comp_10044770\\\"]>dl\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"dt[class=\\\"fl\\\"]>a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
...@@ -465,19 +465,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -465,19 +465,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"3\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"layout rm_txt cf\\\"]>div[class=\\\"col col-1 fl\\\"]>h1</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"con-article-title\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": null,\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>dd[class=\\\"fb-time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"rm_txt_con cf\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>div[id=\\\"textSize\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -502,7 +502,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -502,7 +502,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"07 34 20 1/1 * ?\"\n" + " \"cron\": \"51 56 1/2 * * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
...@@ -517,11 +517,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -517,11 +517,11 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1539587536314081282\",\n" + " \"id\": \"1541723496678105090\",\n" +
" \"infoSourceCode\": \"IN-20220622-0006\",\n" + " \"infoSourceCode\": \"IN-20220628-0006\",\n" +
" \"webSiteName\": \"人民网\",\n" + " \"webSiteName\": \"上海市审计局\",\n" +
" \"siteName\": \"人民网-滚动新闻\",\n" + " \"siteName\": \"上海市审计局-审计要闻\",\n" +
" \"siteUri\": \"http://world.people.com.cn/GB/157278/index.html\",\n" + " \"siteUri\": \"https://sjj.sh.gov.cn/n388/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": null,\n" +
" \"language\": null,\n" + " \"language\": null,\n" +
...@@ -531,36 +531,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -531,36 +531,36 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": null,\n" +
" \"listUrl\": \"http://world.people.com.cn/GB/157278/index.html\",\n" + " \"listUrl\": \"https://sjj.sh.gov.cn/n388/index.html\",\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": null,\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": null,\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": \"a\",\n" +
" \"informationPublishDate\": \"i\",\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"div[class=\\\"ej_bor\\\"]>ul>li\",\n" + " \"infoBlockPosition\": \"u1[class=\\\"dtul dtul1\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": null,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": null,\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"layout rm_txt cf\\\"]>div[class=\\\"col col-1 fl\\\"]>h1</exp></title>\",\n" + " \"detailExpressionTitle\": null,\n" +
" \"detailExpressionPublishDate\": null,\n" + " \"detailExpressionPublishDate\": null,\n" +
" \"detailExpressionSource\": null,\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"rm_txt_con cf\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": null,\n" +
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -585,7 +585,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -585,7 +585,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"07 34 20 1/1 * ?\"\n" + " \"cron\": \"40 01 1/2 * * ?\"\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
......
...@@ -8,7 +8,6 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular; ...@@ -8,7 +8,6 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import com.zzsn.crawler.paser.WebContentPaserByXpath; import com.zzsn.crawler.paser.WebContentPaserByXpath;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
......
...@@ -71,14 +71,13 @@ public class DynaminSiteThread implements Runnable{ ...@@ -71,14 +71,13 @@ public class DynaminSiteThread implements Runnable{
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now()); log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
String infoSourceId=siteMsgTemple.getId(); String infoSourceId=siteMsgTemple.getId();//获取信息源id
//默认表达式类型 //默认表达式类型
siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType()); siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
//判断列表解析表达式类型 //判断列表解析表达式类型
if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式 if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
}else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple); metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
......
package com.zzsn.crawler; package com.zzsn.crawler;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.CreateSSLClientDefault; import com.zzsn.download.CreateSSLClientDefault;
import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants;
import com.zzsn.util.*; import com.zzsn.util.*;
import com.zzsn.web.ExtType;
import com.zzsn.web.JsoupTagProcessor;
import lombok.Data; import lombok.Data;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.params.HttpMethodParams;
...@@ -42,11 +32,9 @@ import org.jsoup.Jsoup; ...@@ -42,11 +32,9 @@ import org.jsoup.Jsoup;
//import org.jsoup.helper.W3CDom; //import org.jsoup.helper.W3CDom;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource; import org.springframework.core.io.Resource;
import org.springframework.kafka.core.KafkaTemplate;
import javax.net.ssl.SSLContext; import javax.net.ssl.SSLContext;
import java.io.*; import java.io.*;
......
...@@ -4,17 +4,12 @@ import com.fasterxml.jackson.core.JsonProcessingException; ...@@ -4,17 +4,12 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.db.DBManager;
import com.zzsn.crawler.db.SnowIdUtils;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
...@@ -28,11 +23,8 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -28,11 +23,8 @@ import org.springframework.kafka.core.KafkaTemplate;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.sql.SQLException;
import java.sql.Types;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -73,30 +65,37 @@ public class WebContentPaserByCss { ...@@ -73,30 +65,37 @@ public class WebContentPaserByCss {
log.info(e.getMessage()); log.info(e.getMessage());
} }
if (StringUtils.isEmpty(body)) {//为空时调用 if (StringUtils.isEmpty(body)) {//为空时调用
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) {
try { try {
body = paserSiteDownload.getHtml(uri_code, charset); body = paserSiteDownload.getHtml(uri_code, charset);
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("静态请求失败:"+uri_code);
} }
}
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用 if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
TimeUnit.SECONDS.sleep(2); TimeUnit.SECONDS.sleep(2);
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
sentBadSiteMsg(siteMsgTemple,"动态请求异常","0");
}else{
sentBadSiteMsg(siteMsgTemple,"静态网络请求异常","0");
}
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
// body = SeleniumTime.getScopehtml(uri_code);
// doc = Jsoup.parse(body);
// catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
// catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
// }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
body = SeleniumTime.getScopehtml(uri_code); sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} }
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
...@@ -117,7 +116,33 @@ public class WebContentPaserByCss { ...@@ -117,7 +116,33 @@ public class WebContentPaserByCss {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
/**
 * Publishes a problem report for an information source to the {@code badSiteTopic}
 * Kafka topic as a JSON-serialized {@code BadSiteMsg}.
 *
 * <p>The report copies the source's id, code, web-site name, site name and site URI,
 * and records a crawler type of {@code "1"} when the source is flagged for dynamic
 * crawling ({@code getYnDynamicCrawl() == 1}) and {@code "0"} (static) otherwise.
 *
 * <p>Reporting is best-effort: a failure to build, serialize or send the report is
 * logged and never propagated, so it cannot interrupt the crawl that triggered it.
 *
 * @param siteMsgTemple the information source the problem was observed on
 * @param msg           description of the problem; stored as the report's error type
 * @param problemType   problem category code, stored unchanged. Callers in this class
 *                      pass {@code "0"} for request failures and {@code "1"} for
 *                      parsing-configuration failures; the earlier note listed
 *                      1 = information-source problem, 2 = crawl-type setting problem
 *                      -- TODO confirm the mapping expected by the topic's consumer
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
    try {
        BadSiteMsg badSiteMsg = new BadSiteMsg();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Crawler type: "1" = dynamic crawl, anything else is reported as "0" = static crawl.
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() == 1 ? "1" : "0";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    } catch (Exception e) {
        // Best-effort feedback: keep crawling, but leave a trace instead of hiding the failure.
        log.error("信息源问题反馈发送失败:" + msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByCss(SiteMsgTemple siteMsgTemple,Document doc)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByCss(SiteMsgTemple siteMsgTemple,Document doc)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -244,10 +269,6 @@ public class WebContentPaserByCss { ...@@ -244,10 +269,6 @@ public class WebContentPaserByCss {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}else{ }else{
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
// if(StringUtils.isEmpty(content)){
// content = paserSiteDownload.getContent(cwbm);
// }
try { try {
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
}catch (Exception e){ }catch (Exception e){
...@@ -265,17 +286,7 @@ public class WebContentPaserByCss { ...@@ -265,17 +286,7 @@ public class WebContentPaserByCss {
} }
} }
}catch (Exception e) { }catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if (StringUtils.isEmpty(content)) {
content = paserSiteDownload.getContent(cwbm);
}
}
}
} }
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
String imagUrl=""; String imagUrl="";
...@@ -304,9 +315,11 @@ public class WebContentPaserByCss { ...@@ -304,9 +315,11 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) { if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { }else {
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
} }
}catch (Exception e){ }catch (Exception e){
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
} }
......
...@@ -6,7 +6,6 @@ import com.jayway.jsonpath.JsonPath; ...@@ -6,7 +6,6 @@ import com.jayway.jsonpath.JsonPath;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.CatchWebByMetaSearch;
...@@ -20,17 +19,11 @@ import com.zzsn.util.DateUtil; ...@@ -20,17 +19,11 @@ import com.zzsn.util.DateUtil;
import com.zzsn.util.Utility; import com.zzsn.util.Utility;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy; import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection; import org.jsoup.Connection;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
......
...@@ -8,13 +8,9 @@ import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; ...@@ -8,13 +8,9 @@ import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.*;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
...@@ -28,12 +24,9 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -28,12 +24,9 @@ import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import javax.annotation.Resource; import javax.annotation.Resource;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.util.*; import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -69,25 +62,22 @@ public class WebContentPaserByRegular { ...@@ -69,25 +62,22 @@ public class WebContentPaserByRegular {
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try { try {//先使用静态网络请求获取列表内容
body = pageDownload.downloadWithStr(uri_code, charset, false, false); body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
body = paserSiteDownload.getHtml(uri_code, charset);
} }
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) { if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl()==1) {
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = SeleniumTime.getScopehtml(uri_code);
if (StringUtils.isEmpty(body)) {
try {
body = paserSiteDownload.getHtml(uri_code, charset);
} catch (Exception e) {
log.info("静态请求失败:"+uri_code);
}
}
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body)) {
body = SeleniumTime.getScopehtml(uri_code); sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue;
} }
if( pageDownload.isBadDownloadPage(body)){ if( pageDownload.isBadDownloadPage(body)){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue; continue;
} }
} }
...@@ -95,24 +85,26 @@ public class WebContentPaserByRegular { ...@@ -95,24 +85,26 @@ public class WebContentPaserByRegular {
String imagUrl=""; String imagUrl="";
WebPageScreenShot webPageScreenShot=new WebPageScreenShot(); WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
webPageScreenShot.loadPage(uri_code,Constants.IMGPATH); webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
} }
//抽取资讯url //抽取资讯url
log.info("body的长度:"+body.length()); log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 ) {
// if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) { if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue;
}
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个"); log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个");
} }
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
log.info("异常信息"+e.getMessage()); log.info("异常信息"+e.getMessage());
// return catchWebByMetaSearchList; // return catchWebByMetaSearchList;
continue; continue;
} }
...@@ -125,6 +117,33 @@ public class WebContentPaserByRegular { ...@@ -125,6 +117,33 @@ public class WebContentPaserByRegular {
} }
/**
 * Publishes a problem report for an information source to the {@code badSiteTopic}
 * Kafka topic as a JSON-serialized {@code BadSiteMsg}.
 *
 * <p>The report copies the source's id, code, web-site name, site name and site URI,
 * and records a crawler type of {@code "1"} when the source is flagged for dynamic
 * crawling ({@code getYnDynamicCrawl() == 1}) and {@code "0"} (static) otherwise.
 *
 * <p>Reporting is best-effort: a failure to build, serialize or send the report is
 * logged and never propagated, so it cannot interrupt the crawl that triggered it.
 *
 * @param siteMsgTemple the information source the problem was observed on
 * @param msg           description of the problem; stored as the report's error type
 * @param problemType   problem category code, stored unchanged. Callers in this class
 *                      pass {@code "0"} for request failures and {@code "1"} for
 *                      parsing-configuration failures; the earlier note listed
 *                      1 = information-source problem, 2 = crawl-type setting problem
 *                      -- TODO confirm the mapping expected by the topic's consumer
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
    try {
        BadSiteMsg badSiteMsg = new BadSiteMsg();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Crawler type: "1" = dynamic crawl, anything else is reported as "0" = static crawl.
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() == 1 ? "1" : "0";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    } catch (Exception e) {
        // Best-effort feedback: keep crawling, but leave a trace instead of hiding the failure.
        log.error("信息源问题反馈发送失败:" + msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByRegular(SiteMsgTemple siteMsgTemple,String doc)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByRegular(SiteMsgTemple siteMsgTemple,String doc)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -259,27 +278,20 @@ public class WebContentPaserByRegular { ...@@ -259,27 +278,20 @@ public class WebContentPaserByRegular {
// 请求下载内容 先使用静态访问若内容为空调用动态请求若内容还为空则跳过 // 请求下载内容 先使用静态访问若内容为空调用动态请求若内容还为空则跳过
String content=""; String content="";
try { try {
try { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}catch (Exception e){
log.info(e.getMessage());
}
// StringUtils.isEmpty(content) && siteMsgTemple.getYnDynamicCrawl()==1
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }else{
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类 try {
}catch (Exception e) { content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
if (StringUtils.isEmpty(content)) { }catch (Exception e){
if (siteMsgTemple.getHeaders() != null) { log.info(e.getMessage());
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), null, true, false, siteMsgTemple.getHeaders()); content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false);
if (content == null||content =="") {
content = paserSiteDownload.getContent(cwbm);
}
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
}catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue;
} }
//使用浏览器截取图片 //使用浏览器截取图片
if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){ if(StringUtils.isNotEmpty(siteMsgTemple.getIsScreenshot()) && siteMsgTemple.getIsScreenshot().contains("1")){
...@@ -289,9 +301,9 @@ public class WebContentPaserByRegular { ...@@ -289,9 +301,9 @@ public class WebContentPaserByRegular {
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress()); // InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png"); // HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString(); // imagUrl=map.get("objectUrl").toString();
} }
if(StringUtils.isEmpty(content) ) { if(StringUtils.isEmpty(content) ) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
continue; continue;
} }
log.info("详情内容的长度:"+content.length()); log.info("详情内容的长度:"+content.length());
...@@ -321,6 +333,7 @@ public class WebContentPaserByRegular { ...@@ -321,6 +333,7 @@ public class WebContentPaserByRegular {
} }
}catch (Exception e){ }catch (Exception e){
log.info("文本内容解析不正确!"); log.info("文本内容解析不正确!");
sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
continue; continue;
} }
count++; count++;
...@@ -333,22 +346,12 @@ public class WebContentPaserByRegular { ...@@ -333,22 +346,12 @@ public class WebContentPaserByRegular {
}else{ }else{
processitem.setSource("1"); processitem.setSource("1");
} }
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())
if(StringUtils.isEmpty(docInfo.getContentNoTag())){ //内容为空则再次调用动态请求 ||StringUtils.isEmpty(processitem.getPublishDate())) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress());
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple ); sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent()) continue;
||StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress());
mark++;
if(mark>3){
break;
}
continue;
}
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC,Constants.KAFKA_CONSUMER_PARTITION , docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC,Constants.KAFKA_CONSUMER_PARTITION , docjson);
// int partition=0; // int partition=0;
......
...@@ -53,8 +53,6 @@ import java.security.NoSuchAlgorithmException; ...@@ -53,8 +53,6 @@ import java.security.NoSuchAlgorithmException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
......
...@@ -4,25 +4,17 @@ package com.zzsn.crawler.uriparser; ...@@ -4,25 +4,17 @@ package com.zzsn.crawler.uriparser;
import java.awt.*; import java.awt.*;
import java.awt.event.KeyEvent; import java.awt.event.KeyEvent;
import java.io.*; import java.io.*;
import java.net.URL; import java.time.Duration;
import java.text.SimpleDateFormat; import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zzsn.crawler.ChromeDriverPool;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.*; import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService; import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.interactions.Actions; import org.openqa.selenium.interactions.Actions;
import org.springframework.scheduling.annotation.Async;
@Slf4j @Slf4j
public class SeleniumTime { public class SeleniumTime {
...@@ -72,69 +64,115 @@ public class SeleniumTime { ...@@ -72,69 +64,115 @@ public class SeleniumTime {
// @Async("asyncTaskExecutorSelenium") // @Async("asyncTaskExecutorSelenium")
public static String getScopehtml(String url){ public static String getScopehtml(String url){
String html = "";
ChromeOptions chromeOptions = new ChromeOptions(); ChromeOptions chromeOptions = new ChromeOptions();
ChromeDriver driver; ChromeDriver driver;
ChromeDriverService service; ChromeDriverService service = new ChromeDriverService.Builder().
service = new ChromeDriverService.Builder(). usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build(); try {
try { service.start();
service.start();
} catch (Exception e) {
service.stop();
return "";
// e.printStackTrace();
}
if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) { if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080"); chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数 chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
} }
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080"); // chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数 // chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天 // chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
driver = new ChromeDriver(service, chromeOptions);//生成实例 driver = new ChromeDriver(chromeOptions);//生成实例
String html = "";
try { try {
driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS); Duration duration=Duration.of(60, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
Thread.sleep(1000l); Thread.sleep(1000l);
try { try {
// byte[] screenshotAs = driver.getScreenshotAs(OutputType.BYTES);
// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
// SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
System.out.println("browser will be close");
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
System.out.println("browser will be close");
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
try {
Thread.sleep(1000l);
driver.quit();
service.stop();
Thread.sleep(1000l);
} catch (InterruptedException e2) {
service.stop();
}
} }
} catch (Exception e) { } catch (Exception e) {
log.info("chromedriver 出现异常:" + e.getMessage()); log.info("chromedriver 出现异常:" + e.getMessage());
} finally { } finally {
try { try {
Thread.sleep(1000l);
driver.quit(); driver.quit();
service.stop(); service.stop();
Thread.sleep(1000l); Thread.sleep(3000l);
} catch (InterruptedException e) { } catch (InterruptedException e) {
} }
} }
} catch (Exception e) {
return "";
}
return html; return html;
} }
// public static String getScopehtml(String url){
//
// ChromeOptions chromeOptions = new ChromeOptions();
// ChromeDriver driver;
// ChromeDriverService service;
// service = new ChromeDriverService.Builder().
// usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
// try {
// service.start();
// } catch (Exception e) {
// service.stop();
// return "";
//// e.printStackTrace();
// }
// if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
// }
// chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
// chromeOptions.addArguments("headless");//无界面参数
// chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
// driver = new ChromeDriver(service, chromeOptions);//生成实例
// String html = "";
// try {
// driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS);
// driver.get(url);
// Thread.sleep(1000l);
// try {
//// byte[] screenshotAs = driver.getScreenshotAs(OutputType.BYTES);
//// File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
//// SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
//// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
//// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
//
// System.out.println("browser will be close");
// WebElement webElement = driver.findElement(By.xpath("/html"));
// html = webElement.getAttribute("outerHTML");
// } catch (Exception e) {
// log.info("chromedriver 出现异常:" + e.getMessage());
// try {
// Thread.sleep(1000l);
// driver.quit();
// service.stop();
// Thread.sleep(1000l);
// } catch (InterruptedException e2) {
// service.stop();
// }
// }
// } catch (Exception e) {
// log.info("chromedriver 出现异常:" + e.getMessage());
// } finally {
// try {
// Thread.sleep(1000l);
// driver.quit();
// service.stop();
// Thread.sleep(1000l);
// } catch (InterruptedException e) {
//
// }
// }
//
// return html;
// }
public static InputStream getScreenshot(String url){ public static InputStream getScreenshot(String url){
ChromeOptions chromeOptions =new ChromeOptions() ; ChromeOptions chromeOptions =new ChromeOptions() ;
ChromeDriver driver; ChromeDriver driver;
......
...@@ -2,115 +2,131 @@ package com.zzsn.crawler.uriparser; ...@@ -2,115 +2,131 @@ package com.zzsn.crawler.uriparser;
import java.io.*; import java.io.*;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.TimeUnit;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import org.openqa.selenium.By; import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.WebElement; import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.chrome.ChromeOptions;
@Slf4j
public class SeleniumTime2 { public class SeleniumTime2 {
public ChromeOptions chromeOptions =new ChromeOptions() ;
public ChromeDriver driver;
public SeleniumTime2(){ public SeleniumTime2(){
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// chromeOptions.addArguments("blink-settings=imagesEnabled=false");
// chromeOptions.addArguments("user-data-dir=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default");
// chromeOptions.addArguments("user-data-dir="+Constants.USER_DATA_DIR);
// chromeOptions.addArguments("--headless");
driver = new ChromeDriver(chromeOptions);
}
public String getChromeDoc(String url) {
// ChromeOptions chromeOptions =new ChromeOptions() ;
// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// ChromeDriver driver = new ChromeDriver(chromeOptions);
String doc="";
try {
Thread.sleep(3000l);
driver.get(url);
Thread.sleep(3000l);
doc = driver.getPageSource();
} catch (Exception e) {
try {
Runtime.getRuntime().exec("taskkill /F /im " + "chromedriver.exe");
} catch (IOException e2) {
e2.printStackTrace();
}
return null;
}finally {
driver.quit();
}
return doc;
} }
/** /**
* 根据网址获取网页html信息 * 根据网址获取网页html信息
* @param url * @param url
* @return * @return
*/ */
public String getScopehtml(String url){
// @Async("asyncTaskExecutorSelenium")
/**
 * Loads {@code url} in a newly launched Chrome instance and returns the rendered markup.
 *
 * <p>A {@link ChromeDriverService} is built from the executable at
 * {@code Constants.CHROMEDRIVE} on any free port and started. On non-Windows hosts Chrome
 * is run headless with the sandbox disabled. The page-load timeout is 60 seconds, and the
 * method waits one second after navigation before reading the {@code outerHTML} of the
 * root {@code /html} element.
 *
 * <p>The browser and the driver service are always released, whichever step fails, and
 * each is released independently so that a failure to quit the browser cannot leave the
 * service process running.
 *
 * @param url address of the page to load
 * @return the page's outer HTML, or an empty string when the driver could not be started
 *         or the page could not be read
 */
public static String getScopehtml(String url){
    String html = "";
    ChromeOptions chromeOptions = new ChromeOptions();
    ChromeDriver driver = null;
    ChromeDriverService service = new ChromeDriverService.Builder().
            usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
    try {
        service.start();
        if (!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
            chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
            chromeOptions.addArguments("headless"); // run without a visible window
            chromeOptions.addArguments("no-sandbox"); // disable the sandbox (this flag cost a day of debugging)
        }
        // NOTE(review): the driver is created without the service started above, so that
        // service is only started and stopped here; getScreenshot passes the service in.
        // Confirm whether this is intentional.
        driver = new ChromeDriver(chromeOptions);
        driver.manage().timeouts().pageLoadTimeout(Duration.of(60, ChronoUnit.SECONDS));
        driver.get(url);
        // Short pause after navigation before the markup is read.
        Thread.sleep(1000L);
        WebElement webElement = driver.findElement(By.xpath("/html"));
        html = webElement.getAttribute("outerHTML");
        System.out.println("browser will be close");
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of silently dropping it.
        Thread.currentThread().interrupt();
        log.info("chromedriver 出现异常:" + e.getMessage());
    } catch (Exception e) {
        log.info("chromedriver 出现异常:" + e.getMessage());
    } finally {
        boolean browserStarted = driver != null;
        if (browserStarted) {
            try {
                driver.quit();
            } catch (Exception e) {
                log.info("chromedriver 出现异常:" + e.getMessage());
            }
        }
        // Stop the service even when the browser never started or failed to quit.
        try {
            service.stop();
        } catch (Exception e) {
            log.info("chromedriver 出现异常:" + e.getMessage());
        }
        if (browserStarted) {
            try {
                // Same post-shutdown delay as before, only after a browser was actually launched.
                Thread.sleep(3000L);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
    return html;
}
public static InputStream getScreenshot(String url){
ChromeOptions chromeOptions =new ChromeOptions() ; ChromeOptions chromeOptions =new ChromeOptions() ;
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE); ChromeDriver driver;
ChromeDriver driver = new ChromeDriver(chromeOptions); ChromeDriverService service;
service = new ChromeDriverService.Builder().
usingDriverExecutable(new File(Constants.CHROMEDRIVE)).usingAnyFreePort().build();
try {
service.start();
} catch (Exception e) {
e.printStackTrace();
}
if(!System.getProperty("os.name").toUpperCase().contains("WINDOWS")) {
chromeOptions.addArguments("--disable-gpu", "--window-size=1290,1080");
chromeOptions.addArguments("headless");//无界面参数
chromeOptions.addArguments("no-sandbox");//禁用沙盒 就是被这个参数搞了一天
}
driver = new ChromeDriver(service, chromeOptions);//生成实例
InputStream inStream = null;
try{ try{
driver.manage ().timeouts().pageLoadTimeout (100 , TimeUnit.SECONDS ) ;
driver.get(url); driver.get(url);
Thread.sleep(2000l); Thread.sleep(3000l);
WebElement webElement = driver.findElement(By.xpath("/html")); try {
try{ byte[] screenshotBytes = driver.getScreenshotAs(OutputType.BYTES);
String html = webElement.getAttribute("outerHTML"); inStream = new ByteArrayInputStream(screenshotBytes);
Thread.sleep(500l); // File src = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
return html; // SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); //转换时间格式
// String time = dateFormat.format(Calendar.getInstance().getTime()); //获取当前时间
// FileUtils.copyFile(src, new File("Screenshots", time + ".png"));// 拷贝截图文件到我们项目./Screenshots
}catch(Exception e){ }catch(Exception e){
System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常" log.info("chromedriver 出现异常:"+e.getMessage());
+"可能原因为过快的执行没有找到指定的页面元素"); }finally {
System.out.println("=============执行方法二==============");
Thread.sleep(1000l);
String html = driver.getPageSource();
Thread.sleep(2000l);
driver.quit();
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
} }
}catch(Exception e){ }catch(Exception e){
try { log.info("chromedriver 出现异常:"+e.getMessage());
Thread.sleep(5000l);
} catch (InterruptedException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
driver.quit();
e.printStackTrace();
}finally { }finally {
try { try {
Thread.sleep(2000l); Thread.sleep(2000l);
driver.quit();
service.stop();
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace();
} }
driver.quit();
} }
return null; return inStream;
} }
public void close(){
// driver.close();
// driver.quit();
// service.stop();
}
public static void main(String[] args) { public static void main(String[] args) {
...@@ -121,12 +137,57 @@ public class SeleniumTime2 { ...@@ -121,12 +137,57 @@ public class SeleniumTime2 {
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取 * 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/ */
SeleniumTime2 s = new SeleniumTime2(); SeleniumTime s = new SeleniumTime();
String scopehtml = s.getScopehtml("https://www.baidu.com/"); String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
System.out.println(scopehtml);
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
System.out.println("开始");
if(scopehtml.contains(a)){
System.out.println("包含a");
}
if(scopehtml.contains(a)){
System.out.println("包含b");
}
System.out.println("结束");
String[] split = scopehtml.split(a);
String sa = split[0];
System.out.println("首次截取的长度"+split.length);
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
System.out.println("再次截取的长度"+split2.length);
String sab = sa + substring ;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File file = new File("D:/123.txt"); File file = new File("D:/123.txt");
try { try {
PrintStream ps = new PrintStream(new FileOutputStream(file)); PrintStream ps = new PrintStream(new FileOutputStream(file));
ps.println(sab);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
e.printStackTrace(); e.printStackTrace();
...@@ -135,4 +196,26 @@ public class SeleniumTime2 { ...@@ -135,4 +196,26 @@ public class SeleniumTime2 {
} }
} }
package com.zzsn.crawlerOther.paser; package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.db.DBManager; import com.zzsn.crawler.db.DBManager;
import com.zzsn.crawler.db.SnowIdUtils; import com.zzsn.crawler.db.SnowIdUtils;
...@@ -13,7 +10,6 @@ import com.zzsn.crawlerOther.StandardWebExtractorHandler; ...@@ -13,7 +10,6 @@ import com.zzsn.crawlerOther.StandardWebExtractorHandler;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo; import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
...@@ -28,7 +24,6 @@ import org.jsoup.Jsoup; ...@@ -28,7 +24,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
......
...@@ -2,7 +2,6 @@ package com.zzsn.crawlerOther.paser; ...@@ -2,7 +2,6 @@ package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
......
...@@ -394,7 +394,6 @@ public class PageConnectioner { ...@@ -394,7 +394,6 @@ public class PageConnectioner {
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 3000; long exitTimeDis = 3000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
try { try {
......
...@@ -5,6 +5,7 @@ import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; ...@@ -5,6 +5,7 @@ import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import lombok.extern.slf4j.Slf4j;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.HttpsURLConnection;
...@@ -18,6 +19,7 @@ import java.util.Map; ...@@ -18,6 +19,7 @@ import java.util.Map;
import java.util.Timer; import java.util.Timer;
import java.util.TimerTask; import java.util.TimerTask;
@Slf4j
public class PageDownloader { public class PageDownloader {
private int interval = 5000; private int interval = 5000;
private long lastDownloadTime = -1; private long lastDownloadTime = -1;
...@@ -154,9 +156,7 @@ public class PageDownloader { ...@@ -154,9 +156,7 @@ public class PageDownloader {
public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) { public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval) if (interval > 0 && lastDownloadTime > 0 && dis < interval){
{
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
long startDtime = System.currentTimeMillis(); long startDtime = System.currentTimeMillis();
...@@ -164,13 +164,14 @@ public class PageDownloader { ...@@ -164,13 +164,14 @@ public class PageDownloader {
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
connection = pConn.connection(url); connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {//获取网站编码
// encoding = getEncodingFromHtmlFile(url, connection); // encoding = getEncodingFromHtmlFile(url, connection);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
encoding = paserSiteDownload.locateCharSet(url); encoding = paserSiteDownload.locateCharSet(url);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
log.info("获取编码失败");
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
......
package com.zzsn.entity;
import lombok.Data;
/**
 * Record describing an information source (site column) that was found to be problematic.
 * It carries the source's identity plus the kind of problem and the crawl mode involved.
 * NOTE(review): presumably produced by the crawler to report bad sources upstream; confirm against callers.
 */
@Data
public class BadSiteMsg {
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Information source (website) name. */
private String webSiteName;
/** Column (section) name. */
private String siteName;
/** Column (section) address. */
private String siteUri;
/** Type of the error that was found. */
private String errorType;
/** Problem type (1: information source abnormal, 2: crawl type configured incorrectly). */
private String problemType;
/** Crawler type (0: static crawling, 1: dynamic crawling). */
private String crawlerType;
}
...@@ -109,7 +109,7 @@ public class KafkaConsumerJob { ...@@ -109,7 +109,7 @@ public class KafkaConsumerJob {
} }
@Scheduled(cron = "0 0/58 * * * ?") // @Scheduled(cron = "0 0/30 * * * ?")
@Async("asyncTaskExecutor") @Async("asyncTaskExecutor")
public void runtimeTask (){ public void runtimeTask (){
try { try {
...@@ -118,19 +118,38 @@ public class KafkaConsumerJob { ...@@ -118,19 +118,38 @@ public class KafkaConsumerJob {
Process pro = mt.exec(cmd); Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream(); InputStream ers= pro.getErrorStream();
pro.waitFor(); pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) { } catch (IOException ioe) {
ioe.printStackTrace(); // ioe.printStackTrace();
} catch (InterruptedException e) { } catch (InterruptedException e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
} }
// try {
// Runtime mt = Runtime.getRuntime();
// String cmd = "taskkill /F /im chrome.exe";
// Process pro = mt.exec(cmd);
// InputStream ers= pro.getErrorStream();
// pro.waitFor();
// } catch (IOException ioe) {
// ioe.printStackTrace();
// } catch (InterruptedException e) {
// // TODO Auto-generated catch block
// }
}
// @Scheduled(cron = "0 0/25 * * * ?")
@Async("asyncTaskExecutor")
public void runtimeTask2 (){
try { try {
Runtime mt = Runtime.getRuntime(); Runtime mt = Runtime.getRuntime();
String cmd = "taskkill /F /im chrome.exe"; String cmd = "taskkill /F /im chrome.exe";
Process pro = mt.exec(cmd); Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream(); InputStream ers= pro.getErrorStream();
pro.waitFor(); pro.waitFor();
System.out.println("++++++++ taskkill /F /im chrome.exe");
} catch (IOException ioe) { } catch (IOException ioe) {
ioe.printStackTrace(); // ioe.printStackTrace();
} catch (InterruptedException e) { } catch (InterruptedException e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
} }
......
package com.zzsn.test; package com.zzsn.test;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
public class UrlConnecttest { public class UrlConnecttest {
......
...@@ -2,6 +2,9 @@ package com.zzsn.test; ...@@ -2,6 +2,9 @@ package com.zzsn.test;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import java.io.IOException;
import java.io.InputStream;
/** /**
* 网站请求测试, * 网站请求测试,
* 在不调用浏览器情况下获取请求访问的网站内容 * 在不调用浏览器情况下获取请求访问的网站内容
...@@ -14,10 +17,21 @@ import com.zzsn.download.PageDownloader; ...@@ -14,10 +17,21 @@ import com.zzsn.download.PageDownloader;
public class WebTest { public class WebTest {
public static void main(String[] args) { public static void main(String[] args) {
String url="https://www.teriin.org/opinion"; // String url="https://www.teriin.org/opinion";
PageDownloader pageDownload=new PageDownloader(); // PageDownloader pageDownload=new PageDownloader();
String body = pageDownload.downloadWithStr(url, "utf-8", false, false); // String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
System.out.println(body); // System.out.println(body);
try {
Runtime mt = Runtime.getRuntime();
String cmd = "taskkill /F /im chrome.exe";
Process pro = mt.exec(cmd);
InputStream ers= pro.getErrorStream();
pro.waitFor();
System.out.println("++++++++ taskkill /F /im chromedriver.exe");
} catch (IOException ioe) {
ioe.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
}
} }
} }
英特尔+重大战略 Global Development Initiative
英特尔+科技资源 GDI
英特尔+科技攻关 \ No newline at end of file
英特尔+科技创新
英特尔+技术创新
英特尔+国家战略
英特尔+创新发展
英特尔+协同创新
英特尔+人工智能
英特尔+自主创新
英特尔+知识产权
英特尔+关键核心技术
英特尔+科技创新能力
英特尔+竞争力
英特尔+高新技术
英特尔+科技成果
英特尔+创新能力
英特尔+产业创新
英特尔+创新驱动
英特尔+核心技术
英特尔+创新引领
英特尔+关键技术
英特尔+制度创新
英特尔+研发投入
英特尔+技术研发
英特尔+人才队伍
英特尔+人才创新
英特尔+科技人才
超微半导体+重大战略
超微半导体+国家战略
超微半导体+科技资源
超微半导体+科技攻关
超微半导体+科技创新
超微半导体+技术创新
超微半导体+创新发展
超微半导体+协同创新
超微半导体+人工智能
超微半导体+自主创新
超微半导体+知识产权
超微半导体+关键核心技术
超微半导体+科技创新能力
超微半导体+竞争力
超微半导体+高新技术
超微半导体+科技成果
超微半导体+创新能力
超微半导体+产业创新
超微半导体+创新驱动
超微半导体+核心技术
超微半导体+创新引领
超微半导体+关键技术
超微半导体+制度创新
超微半导体+研发投入
超微半导体+技术研发
超微半导体+人才队伍
超微半导体+人才创新
超微半导体+科技人才
高通公司+重大战略
高通公司+国家战略
高通公司+科技资源
高通公司+科技攻关
高通公司+科技创新
高通公司+技术创新
高通公司+创新发展
高通公司+协同创新
高通公司+人工智能
高通公司+自主创新
高通公司+知识产权
高通公司+关键核心技术
高通公司+科技创新能力
高通公司+竞争力
高通公司+高新技术
高通公司+科技成果
高通公司+创新能力
高通公司+产业创新
高通公司+创新驱动
高通公司+核心技术
高通公司+创新引领
高通公司+关键技术
高通公司+制度创新
高通公司+研发投入
高通公司+技术研发
高通公司+人才队伍
高通公司+人才创新
高通公司+科技人才
日本电信电话株式会社+重大战略
日本电信电话株式会社+国家战略
日本电信电话株式会社+科技资源
日本电信电话株式会社+科技攻关
日本电信电话株式会社+科技创新
日本电信电话株式会社+技术创新
日本电信电话株式会社+创新发展
日本电信电话株式会社+协同创新
日本电信电话株式会社+人工智能
日本电信电话株式会社+自主创新
日本电信电话株式会社+知识产权
日本电信电话株式会社+关键核心技术
日本电信电话株式会社+科技创新能力
日本电信电话株式会社+竞争力
日本电信电话株式会社+高新技术
日本电信电话株式会社+科技成果
日本电信电话株式会社+创新能力
日本电信电话株式会社+产业创新
日本电信电话株式会社+创新驱动
日本电信电话株式会社+核心技术
日本电信电话株式会社+创新引领
日本电信电话株式会社+关键技术
日本电信电话株式会社+制度创新
日本电信电话株式会社+研发投入
日本电信电话株式会社+技术研发
日本电信电话株式会社+人才队伍
日本电信电话株式会社+人才创新
日本电信电话株式会社+科技人才
爱立信+重大战略
爱立信+国家战略
爱立信+科技资源
爱立信+科技攻关
爱立信+科技创新
爱立信+技术创新
爱立信+创新发展
爱立信+协同创新
爱立信+人工智能
爱立信+自主创新
爱立信+知识产权
爱立信+关键核心技术
爱立信+科技创新能力
爱立信+竞争力
爱立信+高新技术
爱立信+科技成果
爱立信+创新能力
爱立信+产业创新
爱立信+创新驱动
爱立信+核心技术
爱立信+创新引领
爱立信+关键技术
爱立信+制度创新
爱立信+研发投入
爱立信+技术研发
爱立信+人才队伍
爱立信+人才创新
爱立信+科技人才
东芝+重大战略
东芝+国家战略
东芝+科技资源
东芝+科技攻关
东芝+科技创新
东芝+技术创新
东芝+创新发展
东芝+协同创新
东芝+人工智能
东芝+自主创新
东芝+知识产权
东芝+关键核心技术
东芝+科技创新能力
东芝+竞争力
东芝+高新技术
东芝+科技成果
东芝+创新能力
东芝+产业创新
东芝+创新驱动
东芝+核心技术
东芝+创新引领
东芝+关键技术
东芝+制度创新
东芝+研发投入
东芝+技术研发
东芝+人才队伍
东芝+人才创新
东芝+科技人才
LG电子+重大战略
LG电子+国家战略
LG电子+科技资源
LG电子+科技攻关
LG电子+科技创新
LG电子+技术创新
LG电子+创新发展
LG电子+协同创新
LG电子+人工智能
LG电子+自主创新
LG电子+知识产权
LG电子+关键核心技术
LG电子+科技创新能力
LG电子+竞争力
LG电子+高新技术
LG电子+科技成果
LG电子+创新能力
LG电子+产业创新
LG电子+创新驱动
LG电子+核心技术
LG电子+创新引领
LG电子+关键技术
LG电子+制度创新
LG电子+研发投入
LG电子+技术研发
LG电子+人才队伍
LG电子+人才创新
LG电子+科技人才
三星+重大战略
三星+国家战略
三星+科技资源
三星+科技攻关
三星+科技创新
三星+技术创新
三星+创新发展
三星+协同创新
三星+人工智能
三星+自主创新
三星+知识产权
三星+关键核心技术
三星+科技创新能力
三星+竞争力
三星+高新技术
三星+科技成果
三星+创新能力
三星+产业创新
三星+创新驱动
三星+核心技术
三星+创新引领
三星+关键技术
三星+制度创新
三星+研发投入
三星+技术研发
三星+人才队伍
三星+人才创新
三星+科技人才
泰科电子+重大战略
泰科电子+国家战略
泰科电子+科技资源
泰科电子+科技攻关
泰科电子+科技创新
泰科电子+技术创新
泰科电子+创新发展
泰科电子+协同创新
泰科电子+人工智能
泰科电子+自主创新
泰科电子+知识产权
泰科电子+关键核心技术
泰科电子+科技创新能力
泰科电子+竞争力
泰科电子+高新技术
泰科电子+科技成果
泰科电子+创新能力
泰科电子+产业创新
泰科电子+创新驱动
泰科电子+核心技术
泰科电子+创新引领
泰科电子+关键技术
泰科电子+制度创新
泰科电子+研发投入
泰科电子+技术研发
泰科电子+人才队伍
泰科电子+人才创新
泰科电子+科技人才
苹果+重大战略
苹果+国家战略
苹果+科技资源
苹果+科技攻关
苹果+科技创新
苹果+技术创新
苹果+创新发展
苹果+协同创新
苹果+人工智能
苹果+自主创新
苹果+知识产权
苹果+关键核心技术
苹果+科技创新能力
苹果+竞争力
苹果+高新技术
苹果+科技成果
苹果+创新能力
苹果+产业创新
苹果+创新驱动
苹果+核心技术
苹果+创新引领
苹果+关键技术
苹果+制度创新
苹果+研发投入
苹果+技术研发
苹果+人才队伍
苹果+人才创新
苹果+科技人才
富士通+重大战略
富士通+国家战略
富士通+科技资源
富士通+科技攻关
富士通+科技创新
富士通+技术创新
富士通+创新发展
富士通+协同创新
富士通+人工智能
富士通+自主创新
富士通+知识产权
富士通+关键核心技术
富士通+科技创新能力
富士通+竞争力
富士通+高新技术
富士通+科技成果
富士通+创新能力
富士通+产业创新
富士通+创新驱动
富士通+核心技术
富士通+创新引领
富士通+关键技术
富士通+制度创新
富士通+研发投入
富士通+技术研发
富士通+人才队伍
富士通+人才创新
富士通+科技人才
日本电气+重大战略
日本电气+国家战略
日本电气+科技资源
日本电气+科技攻关
日本电气+科技创新
日本电气+技术创新
日本电气+创新发展
日本电气+协同创新
日本电气+人工智能
日本电气+自主创新
日本电气+知识产权
日本电气+关键核心技术
日本电气+科技创新能力
日本电气+竞争力
日本电气+高新技术
日本电气+科技成果
日本电气+创新能力
日本电气+产业创新
日本电气+创新驱动
日本电气+核心技术
日本电气+创新引领
日本电气+关键技术
日本电气+制度创新
日本电气+研发投入
日本电气+技术研发
日本电气+人才队伍
日本电气+人才创新
日本电气+科技人才
奥林巴斯+重大战略
奥林巴斯+国家战略
奥林巴斯+科技资源
奥林巴斯+科技攻关
奥林巴斯+科技创新
奥林巴斯+技术创新
奥林巴斯+创新发展
奥林巴斯+协同创新
奥林巴斯+人工智能
奥林巴斯+自主创新
奥林巴斯+知识产权
奥林巴斯+关键核心技术
奥林巴斯+科技创新能力
奥林巴斯+竞争力
奥林巴斯+高新技术
奥林巴斯+科技成果
奥林巴斯+创新能力
奥林巴斯+产业创新
奥林巴斯+创新驱动
奥林巴斯+核心技术
奥林巴斯+创新引领
奥林巴斯+关键技术
奥林巴斯+制度创新
奥林巴斯+研发投入
奥林巴斯+技术研发
奥林巴斯+人才队伍
奥林巴斯+人才创新
奥林巴斯+科技人才
索尼+重大战略
索尼+国家战略
索尼+科技资源
索尼+科技攻关
索尼+科技创新
索尼+技术创新
索尼+创新发展
索尼+协同创新
索尼+人工智能
索尼+自主创新
索尼+知识产权
索尼+关键核心技术
索尼+科技创新能力
索尼+竞争力
索尼+高新技术
索尼+科技成果
索尼+创新能力
索尼+产业创新
索尼+创新驱动
索尼+核心技术
索尼+创新引领
索尼+关键技术
索尼+制度创新
索尼+研发投入
索尼+技术研发
索尼+人才队伍
索尼+人才创新
索尼+科技人才
通用电气+重大战略
通用电气+国家战略
通用电气+科技资源
通用电气+科技攻关
通用电气+科技创新
通用电气+技术创新
通用电气+创新发展
通用电气+协同创新
通用电气+人工智能
通用电气+自主创新
通用电气+知识产权
通用电气+关键核心技术
通用电气+科技创新能力
通用电气+竞争力
通用电气+高新技术
通用电气+科技成果
通用电气+创新能力
通用电气+产业创新
通用电气+创新驱动
通用电气+核心技术
通用电气+创新引领
通用电气+关键技术
通用电气+制度创新
通用电气+研发投入
通用电气+技术研发
通用电气+人才队伍
通用电气+人才创新
通用电气+科技人才
\ No newline at end of file
...@@ -261,13 +261,13 @@ public class DetailGoogleSearchThread implements Runnable { ...@@ -261,13 +261,13 @@ public class DetailGoogleSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+ log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+"")); "|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
// intsertData(docInfo); intsertData(docInfo);
//信息转换 //信息转换
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); // ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper(); // ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(processitem); // String docjson = mapper.writeValueAsString(processitem);
System.out.println(docjson); // System.out.println(docjson);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
}else { }else {
log.info("资讯发布时间:"+docInfo.getPublishDate()); log.info("资讯发布时间:"+docInfo.getPublishDate());
} }
......
...@@ -161,28 +161,49 @@ public class GoogleRecorderUtil { ...@@ -161,28 +161,49 @@ public class GoogleRecorderUtil {
System.out.println("列表页内容"+docstr.length()); System.out.println("列表页内容"+docstr.length());
System.out.println("关键词请求:"+keyword+"第"+i+"页"); System.out.println("关键词请求:"+keyword+"第"+i+"页");
doc=Jsoup.parse(docstr); doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]"); // Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
Elements firstElementsLink = doc.select("div[class=\"xuvV6b BGxR7d\"]");
//若果没有结果则不循环 //若果没有结果则不循环
if(firstElementsLink.size()==0){ if(firstElementsLink.size()==0){
break; break;
} }
for (int j = 0; j < firstElementsLink.size(); j++) { for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch(); try {
//标题 catchWebByMetaSearch = new CatchWebByMetaSearch();
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]"); //标题
//链接 Elements e = firstElementsLink.get(j).select("div[class=\"iRPxbe\"]");
Elements a=firstElementsLink.get(j).select("a"); //链接
System.out.println(e.get(0).text()); Elements a = firstElementsLink.get(j).select("a");
System.out.println(a.get(0).attr("href")); System.out.println(e.get(0).text());
catchWebByMetaSearch.setTid(tid); System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setSid(tid); catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i)); catchWebByMetaSearch.setSid(tid);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href")); catchWebByMetaSearch.setSummary(urlList.get(i));
catchWebByMetaSearch.setTitle(e.get(0).text()); catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
//来源 catchWebByMetaSearch.setTitle(e.get(0).text());
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text(); //来源
catchWebByMetaSearch.setSourcesite(origin); String origin = firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
metaSearchList.add(catchWebByMetaSearch); catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
// //标题
// Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
// //链接
// Elements a=firstElementsLink.get(j).select("a");
// System.out.println(e.get(0).text());
// System.out.println(a.get(0).attr("href"));
// catchWebByMetaSearch.setTid(tid);
// catchWebByMetaSearch.setSid(tid);
// catchWebByMetaSearch.setSummary(urlList.get(i));
// catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
// catchWebByMetaSearch.setTitle(e.get(0).text());
// //来源
// String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
// catchWebByMetaSearch.setSourcesite(origin);
// metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){
System.out.println(e.getMessage());
}
} }
DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread(); DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread();
......
...@@ -25,8 +25,8 @@ public class WebGoogleSearch { ...@@ -25,8 +25,8 @@ public class WebGoogleSearch {
// String filepath=args[0]; // String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH; String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2021-01-01"; String startTime="2021-09-01";
String endTime="2022-05-23"; String endTime="2022-07-01";
startTime=dateToStamp(startTime); startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime); endTime=dateToStamp(endTime);
File f = new File(filepath); File f = new File(filepath);
...@@ -60,7 +60,7 @@ public class WebGoogleSearch { ...@@ -60,7 +60,7 @@ public class WebGoogleSearch {
webGoogleSearchThread.setStartTime(startTime); webGoogleSearchThread.setStartTime(startTime);
webGoogleSearchThread.setEndTime(endTime); webGoogleSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg(); KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2020052301"); keywordMsg.setId("2020070101");
keywordMsg.setStartTime(Long.parseLong(startTime)); keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime)); keywordMsg.setEndTime(Long.parseLong(endTime));
......
package com.zzsn.test; package com.zzsn.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
...@@ -11,11 +14,16 @@ public class TimePaser { ...@@ -11,11 +14,16 @@ public class TimePaser {
// String aa="2022-04-18"; // String aa="2022-04-18";
// String s = dateToStamp(aa); // String s = dateToStamp(aa);
// System.out.println(s); // System.out.println(s);
Date date = new Date(); // Date date = new Date();
String nowTime=""; // String nowTime="";
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy"); // SimpleDateFormat simpleDateFormat = new SimpleDateFormat("M/dd/yyyy");
// String format = simpleDateFormat.format("1650384000"); //// String format = simpleDateFormat.format("1650384000");
System.out.println(stampToTime("1650384000")); // System.out.println(stampToTime("1650384000"));
String realUrl="<html><head><meta name=\"referrer\" content=\"unsafe-url\"><script>window.opener=null;window.location.replace(\"https://www.ky3.com/prnewswire/2022/07/01/infinite-reality-launches-global-metaverse-hub-luxembourg/\");</script><noscript><META http-equiv=\"refresh\" content=\"0;URL='https://www.ky3.com/prnewswire/2022/07/01/infinite-reality-launches-global-metaverse-hub-luxembourg/'\"></noscript></head></html>";
Document parse = Jsoup.parse(realUrl);
String attr = parse.select("META").get(1).attr("content");
String attrurl=attr.substring(attr.indexOf("URL='")+5,attr.length()-2);
System.out.println(attrurl);
} }
public static String dateToStamp(String s) throws ParseException { public static String dateToStamp(String s) throws ParseException {
String res; String res;
......
...@@ -44,7 +44,7 @@ META_SEARCH_URL=https://www.google.com/search?q=[keyword]&newwindow=1&tbs=cdr:1, ...@@ -44,7 +44,7 @@ META_SEARCH_URL=https://www.google.com/search?q=[keyword]&newwindow=1&tbs=cdr:1,
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word= #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50 #META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak2.txt META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\google_crawler\\data\\project.txt
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=127.0.0.1
......
英特尔+重大战略 Global Development Initiative
英特尔+科技资源 \ No newline at end of file
英特尔+科技攻关
英特尔+科技创新
英特尔+技术创新
英特尔+国家战略
英特尔+创新发展
英特尔+协同创新
英特尔+人工智能
英特尔+自主创新
英特尔+知识产权
英特尔+关键核心技术
英特尔+科技创新能力
英特尔+竞争力
英特尔+高新技术
英特尔+科技成果
英特尔+创新能力
英特尔+产业创新
英特尔+创新驱动
英特尔+核心技术
英特尔+创新引领
英特尔+关键技术
英特尔+制度创新
英特尔+研发投入
英特尔+技术研发
英特尔+人才队伍
英特尔+人才创新
英特尔+科技人才
超微半导体+重大战略
超微半导体+国家战略
超微半导体+科技资源
超微半导体+科技攻关
超微半导体+科技创新
超微半导体+技术创新
超微半导体+创新发展
超微半导体+协同创新
超微半导体+人工智能
超微半导体+自主创新
超微半导体+知识产权
超微半导体+关键核心技术
超微半导体+科技创新能力
超微半导体+竞争力
超微半导体+高新技术
超微半导体+科技成果
超微半导体+创新能力
超微半导体+产业创新
超微半导体+创新驱动
超微半导体+核心技术
超微半导体+创新引领
超微半导体+关键技术
超微半导体+制度创新
超微半导体+研发投入
超微半导体+技术研发
超微半导体+人才队伍
超微半导体+人才创新
超微半导体+科技人才
高通公司+重大战略
高通公司+国家战略
高通公司+科技资源
高通公司+科技攻关
高通公司+科技创新
高通公司+技术创新
高通公司+创新发展
高通公司+协同创新
高通公司+人工智能
高通公司+自主创新
高通公司+知识产权
高通公司+关键核心技术
高通公司+科技创新能力
高通公司+竞争力
高通公司+高新技术
高通公司+科技成果
高通公司+创新能力
高通公司+产业创新
高通公司+创新驱动
高通公司+核心技术
高通公司+创新引领
高通公司+关键技术
高通公司+制度创新
高通公司+研发投入
高通公司+技术研发
高通公司+人才队伍
高通公司+人才创新
高通公司+科技人才
日本电信电话株式会社+重大战略
日本电信电话株式会社+国家战略
日本电信电话株式会社+科技资源
日本电信电话株式会社+科技攻关
日本电信电话株式会社+科技创新
日本电信电话株式会社+技术创新
日本电信电话株式会社+创新发展
日本电信电话株式会社+协同创新
日本电信电话株式会社+人工智能
日本电信电话株式会社+自主创新
日本电信电话株式会社+知识产权
日本电信电话株式会社+关键核心技术
日本电信电话株式会社+科技创新能力
日本电信电话株式会社+竞争力
日本电信电话株式会社+高新技术
日本电信电话株式会社+科技成果
日本电信电话株式会社+创新能力
日本电信电话株式会社+产业创新
日本电信电话株式会社+创新驱动
日本电信电话株式会社+核心技术
日本电信电话株式会社+创新引领
日本电信电话株式会社+关键技术
日本电信电话株式会社+制度创新
日本电信电话株式会社+研发投入
日本电信电话株式会社+技术研发
日本电信电话株式会社+人才队伍
日本电信电话株式会社+人才创新
日本电信电话株式会社+科技人才
爱立信+重大战略
爱立信+国家战略
爱立信+科技资源
爱立信+科技攻关
爱立信+科技创新
爱立信+技术创新
爱立信+创新发展
爱立信+协同创新
爱立信+人工智能
爱立信+自主创新
爱立信+知识产权
爱立信+关键核心技术
爱立信+科技创新能力
爱立信+竞争力
爱立信+高新技术
爱立信+科技成果
爱立信+创新能力
爱立信+产业创新
爱立信+创新驱动
爱立信+核心技术
爱立信+创新引领
爱立信+关键技术
爱立信+制度创新
爱立信+研发投入
爱立信+技术研发
爱立信+人才队伍
爱立信+人才创新
爱立信+科技人才
东芝+重大战略
东芝+国家战略
东芝+科技资源
东芝+科技攻关
东芝+科技创新
东芝+技术创新
东芝+创新发展
东芝+协同创新
东芝+人工智能
东芝+自主创新
东芝+知识产权
东芝+关键核心技术
东芝+科技创新能力
东芝+竞争力
东芝+高新技术
东芝+科技成果
东芝+创新能力
东芝+产业创新
东芝+创新驱动
东芝+核心技术
东芝+创新引领
东芝+关键技术
东芝+制度创新
东芝+研发投入
东芝+技术研发
东芝+人才队伍
东芝+人才创新
东芝+科技人才
LG电子+重大战略
LG电子+国家战略
LG电子+科技资源
LG电子+科技攻关
LG电子+科技创新
LG电子+技术创新
LG电子+创新发展
LG电子+协同创新
LG电子+人工智能
LG电子+自主创新
LG电子+知识产权
LG电子+关键核心技术
LG电子+科技创新能力
LG电子+竞争力
LG电子+高新技术
LG电子+科技成果
LG电子+创新能力
LG电子+产业创新
LG电子+创新驱动
LG电子+核心技术
LG电子+创新引领
LG电子+关键技术
LG电子+制度创新
LG电子+研发投入
LG电子+技术研发
LG电子+人才队伍
LG电子+人才创新
LG电子+科技人才
三星+重大战略
三星+国家战略
三星+科技资源
三星+科技攻关
三星+科技创新
三星+技术创新
三星+创新发展
三星+协同创新
三星+人工智能
三星+自主创新
三星+知识产权
三星+关键核心技术
三星+科技创新能力
三星+竞争力
三星+高新技术
三星+科技成果
三星+创新能力
三星+产业创新
三星+创新驱动
三星+核心技术
三星+创新引领
三星+关键技术
三星+制度创新
三星+研发投入
三星+技术研发
三星+人才队伍
三星+人才创新
三星+科技人才
泰科电子+重大战略
泰科电子+国家战略
泰科电子+科技资源
泰科电子+科技攻关
泰科电子+科技创新
泰科电子+技术创新
泰科电子+创新发展
泰科电子+协同创新
泰科电子+人工智能
泰科电子+自主创新
泰科电子+知识产权
泰科电子+关键核心技术
泰科电子+科技创新能力
泰科电子+竞争力
泰科电子+高新技术
泰科电子+科技成果
泰科电子+创新能力
泰科电子+产业创新
泰科电子+创新驱动
泰科电子+核心技术
泰科电子+创新引领
泰科电子+关键技术
泰科电子+制度创新
泰科电子+研发投入
泰科电子+技术研发
泰科电子+人才队伍
泰科电子+人才创新
泰科电子+科技人才
苹果+重大战略
苹果+国家战略
苹果+科技资源
苹果+科技攻关
苹果+科技创新
苹果+技术创新
苹果+创新发展
苹果+协同创新
苹果+人工智能
苹果+自主创新
苹果+知识产权
苹果+关键核心技术
苹果+科技创新能力
苹果+竞争力
苹果+高新技术
苹果+科技成果
苹果+创新能力
苹果+产业创新
苹果+创新驱动
苹果+核心技术
苹果+创新引领
苹果+关键技术
苹果+制度创新
苹果+研发投入
苹果+技术研发
苹果+人才队伍
苹果+人才创新
苹果+科技人才
富士通+重大战略
富士通+国家战略
富士通+科技资源
富士通+科技攻关
富士通+科技创新
富士通+技术创新
富士通+创新发展
富士通+协同创新
富士通+人工智能
富士通+自主创新
富士通+知识产权
富士通+关键核心技术
富士通+科技创新能力
富士通+竞争力
富士通+高新技术
富士通+科技成果
富士通+创新能力
富士通+产业创新
富士通+创新驱动
富士通+核心技术
富士通+创新引领
富士通+关键技术
富士通+制度创新
富士通+研发投入
富士通+技术研发
富士通+人才队伍
富士通+人才创新
富士通+科技人才
日本电气+重大战略
日本电气+国家战略
日本电气+科技资源
日本电气+科技攻关
日本电气+科技创新
日本电气+技术创新
日本电气+创新发展
日本电气+协同创新
日本电气+人工智能
日本电气+自主创新
日本电气+知识产权
日本电气+关键核心技术
日本电气+科技创新能力
日本电气+竞争力
日本电气+高新技术
日本电气+科技成果
日本电气+创新能力
日本电气+产业创新
日本电气+创新驱动
日本电气+核心技术
日本电气+创新引领
日本电气+关键技术
日本电气+制度创新
日本电气+研发投入
日本电气+技术研发
日本电气+人才队伍
日本电气+人才创新
日本电气+科技人才
奥林巴斯+重大战略
奥林巴斯+国家战略
奥林巴斯+科技资源
奥林巴斯+科技攻关
奥林巴斯+科技创新
奥林巴斯+技术创新
奥林巴斯+创新发展
奥林巴斯+协同创新
奥林巴斯+人工智能
奥林巴斯+自主创新
奥林巴斯+知识产权
奥林巴斯+关键核心技术
奥林巴斯+科技创新能力
奥林巴斯+竞争力
奥林巴斯+高新技术
奥林巴斯+科技成果
奥林巴斯+创新能力
奥林巴斯+产业创新
奥林巴斯+创新驱动
奥林巴斯+核心技术
奥林巴斯+创新引领
奥林巴斯+关键技术
奥林巴斯+制度创新
奥林巴斯+研发投入
奥林巴斯+技术研发
奥林巴斯+人才队伍
奥林巴斯+人才创新
奥林巴斯+科技人才
索尼+重大战略
索尼+国家战略
索尼+科技资源
索尼+科技攻关
索尼+科技创新
索尼+技术创新
索尼+创新发展
索尼+协同创新
索尼+人工智能
索尼+自主创新
索尼+知识产权
索尼+关键核心技术
索尼+科技创新能力
索尼+竞争力
索尼+高新技术
索尼+科技成果
索尼+创新能力
索尼+产业创新
索尼+创新驱动
索尼+核心技术
索尼+创新引领
索尼+关键技术
索尼+制度创新
索尼+研发投入
索尼+技术研发
索尼+人才队伍
索尼+人才创新
索尼+科技人才
通用电气+重大战略
通用电气+国家战略
通用电气+科技资源
通用电气+科技攻关
通用电气+科技创新
通用电气+技术创新
通用电气+创新发展
通用电气+协同创新
通用电气+人工智能
通用电气+自主创新
通用电气+知识产权
通用电气+关键核心技术
通用电气+科技创新能力
通用电气+竞争力
通用电气+高新技术
通用电气+科技成果
通用电气+创新能力
通用电气+产业创新
通用电气+创新驱动
通用电气+核心技术
通用电气+创新引领
通用电气+关键技术
通用电气+制度创新
通用电气+研发投入
通用电气+技术研发
通用电气+人才队伍
通用电气+人才创新
通用电气+科技人才
\ No newline at end of file
...@@ -25,8 +25,8 @@ public class WebYahooSearch { ...@@ -25,8 +25,8 @@ public class WebYahooSearch {
// String filepath=args[0]; // String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH; String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2018-04-18"; String startTime="2021-09-01";
String endTime="2019-04-18"; String endTime="2022-07-01";
startTime=dateToStamp(startTime); startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime); endTime=dateToStamp(endTime);
File f = new File(filepath); File f = new File(filepath);
...@@ -61,7 +61,7 @@ public class WebYahooSearch { ...@@ -61,7 +61,7 @@ public class WebYahooSearch {
webYahooSearchThread.setStartTime(startTime); webYahooSearchThread.setStartTime(startTime);
webYahooSearchThread.setEndTime(endTime); webYahooSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg(); KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("123456"); keywordMsg.setId("2022070304");
keywordMsg.setStartTime(Long.parseLong(startTime)); keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime)); keywordMsg.setEndTime(Long.parseLong(endTime));
......
...@@ -20,6 +20,7 @@ import com.zzsn.utility.index.Constants; ...@@ -20,6 +20,7 @@ import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch; import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.model.ContentFileResult; import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.model.FileTag; import com.zzsn.utility.model.FileTag;
import com.zzsn.utility.util.ChromeUtil;
import com.zzsn.utility.util.RequestUtil; import com.zzsn.utility.util.RequestUtil;
import com.zzsn.utility.util.SeleniumTime; import com.zzsn.utility.util.SeleniumTime;
import com.zzsn.utility.util.Utility; import com.zzsn.utility.util.Utility;
...@@ -100,7 +101,10 @@ public class WebYahooSearchThread implements Runnable { ...@@ -100,7 +101,10 @@ public class WebYahooSearchThread implements Runnable {
}catch (Exception e){ }catch (Exception e){
log.info("缓存出问题"); log.info("缓存出问题");
} }
String url1= Constants.META_SEARCH_URL; // String url1= Constants.META_SEARCH_URL;
// String url1= "https://www.bloomberg.org/page/[pn]/?s=[keyword]";
// String url1= "https://www.bbc.co.uk/search?q=[keyword]&page=[pn]";
String url1= "https://www.thejakartapost.com/search?q=[keyword]#gsc.tab=0&gsc.q=GDI&gsc.page=[pn]";
// String[] kwords=kWord.split("\\+"); // String[] kwords=kWord.split("\\+");
String url=""; String url="";
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
...@@ -112,12 +116,13 @@ public class WebYahooSearchThread implements Runnable { ...@@ -112,12 +116,13 @@ public class WebYahooSearchThread implements Runnable {
if(proxyid.equals("1")) { if(proxyid.equals("1")) {
CatchWebNews(YahooRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid),kWord); CatchWebNews(YahooRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid),kWord);
}else { }else {
for (int i = 0; i < 100; i++) { for (int i = 0; i < 10; i++) {
String urla = url1.replace("[keyword]",kWord); String urla = url1.replace("[keyword]",kWord);
urla=urla.replace("[pn]",1+i*10+""); // urla=urla.replace("[pn]",1+i*10+"");
urla=urla.replace("[pn]",1+i+"");
urlList.add(urla); urlList.add(urla);
} }
List<CatchWebByMetaSearch> catchWebByMetaSearches = YahooRecorderUtil.catchWebOfYahooList(urlList, charset, orgId, tid,kafkaTemplate); List<CatchWebByMetaSearch> catchWebByMetaSearches = YahooRecorderUtil.catchWebOfWebList(urlList, charset, orgId, tid,kafkaTemplate,kWord);
log.info("关键词搜索到信息数:"+catchWebByMetaSearches.size()); log.info("关键词搜索到信息数:"+catchWebByMetaSearches.size());
} }
...@@ -185,14 +190,177 @@ public class WebYahooSearchThread implements Runnable { ...@@ -185,14 +190,177 @@ public class WebYahooSearchThread implements Runnable {
return encoding; return encoding;
} }
// 抓取新闻内容
/**
 * Fetches, parses and stores the article behind every search hit in {@code catchWebList}.
 *
 * <p>For each entry the method:
 * <ol>
 *   <li>skips it when Redis already holds the key
 *       {@code Constants.SOURCEADDRESS + "_" + sourceaddress} (a Redis failure is logged and
 *       the entry is still processed);</li>
 *   <li>skips it when the URL is null, blank, or contains {@code .pdf}, {@code .PDF} or
 *       {@code download};</li>
 *   <li>downloads the page with {@code getContentByUrl}; if that yields nothing it waits five
 *       seconds and retries through {@code SeleniumTime.getScopehtml}; toutiao.com pages with
 *       fewer than 50 characters are re-fetched through {@code RequestUtil.getTaotiaoData};</li>
 *   <li>runs {@code StandardWebExtractorHandler.doHandler} on the page and, when title, plain
 *       content and publish date are all non-blank, passes the result to
 *       {@code intsertData};</li>
 *   <li>records the URL in Redis so it is treated as a duplicate next time.</li>
 * </ol>
 *
 * <p>Failures on a single entry are logged and do not stop the remaining entries.
 *
 * @param catchWebList search hits to process; each carries the source address, title and ids
 * @param keyword      search keyword, stored on every generated {@code DocInfo}
 */
public void CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) {
    try {
        int count = 0;
        for (int i = 0; i < catchWebList.size(); i++) {
            try {
                CatchWebByMetaSearch cwbm = catchWebList.get(i);

                // Skip URLs that are already recorded in the Redis cache.
                try {
                    String urlflag = JedisUtil.getString(Constants.SOURCEADDRESS + "_" + cwbm.getSourceaddress());
                    if (!StringUtils.isEmpty(urlflag)) {
                        log.info(cwbm.getSourceaddress() + " 数据重复");
                        continue;
                    }
                } catch (Exception e) {
                    // Best effort: without Redis the entry is processed as if it were new.
                    log.info("redis获取信息失败");
                }

                String infourl = cwbm.getSourceaddress();
                String infodata = "";
                // Never reassigned below, so the charset stored on cwbm is always the empty string.
                String charset = "";
                System.out.println(cwbm.getTitle() + "==" + infourl);
                if (infourl == null || infourl.contains(".pdf") || infourl.trim().length() == 0
                        || infourl.contains(".PDF") || infourl.contains("download")) {
                    continue;
                }

                infodata = getContentByUrl(infourl);
                if (StringUtils.isEmpty(infodata)) {
                    // Plain fetch returned nothing: wait, then retry through the Selenium-based fetcher.
                    try {
                        Thread.sleep(1000 * 5);
                        SeleniumTime seleniumTime2 = new SeleniumTime();
                        infodata = seleniumTime2.getScopehtml(infourl);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                        // Restore the interrupt flag so the owner of this thread can observe the request.
                        Thread.currentThread().interrupt();
                    }
                }
                if (infourl.contains("toutiao.com") && (null == infodata || infodata.length() < 50)) {
                    infodata = RequestUtil.getTaotiaoData(infourl);
                }
                if (StringUtils.isEmpty(infodata)) {
                    System.out.println("122222222222222222222222/为空,则爬取下一个");
                    // Nothing could be downloaded; move on to the next hit.
                    continue;
                }

                String contentCharset = Utility.getWebEncodingByStr(infodata);
                String content = null;
                if (infodata != null && charset != null && contentCharset != null) {
                    // The page text is used as-is; no charset conversion is applied.
                    content = infodata;
                }
                if (content != null) {
                    cwbm.setCharset(charset);
                    cwbm.setLastModify("");
                    cwbm.setContent(content);
                }

                DocInfo docInfo = new DocInfo();
                docInfo.setContentType("HTML");
                docInfo.setOrgId(cwbm.getOrgId());
                docInfo.setSid(cwbm.getSid());
                docInfo.setSourceType("News");
                docInfo.setLastModified(cwbm.getLastModify());
                docInfo.setCharset("utf-8");
                docInfo.setSourceaddress(cwbm.getSourceaddress());
                docInfo.setTitle(cwbm.getTitle().replace("...", ""));
                docInfo.setAuthor(cwbm.getAuthor());
                docInfo.setPublishDate(cwbm.getPublishDate());
                docInfo.setOrigin(cwbm.getSourcesite());
                docInfo.setKeywords(keyword);
                docInfo.setSummary(cwbm.getSummary());

                StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
                try {
                    swe.doHandler(content, docInfo);
                } catch (Exception e1) {
                    // Extraction failure is tolerated; the checks below decide whether docInfo is usable.
                    log.info("模板解析异常" + e1.getMessage());
                }
                System.out.println(docInfo.getTitle() + "---" + docInfo.getSourceaddress());
                docInfo.setFileDownLoadPath(null);

                Map<String, String> params = new HashMap<String, String>();
                params.put("fromWhere", "Google元搜索");
                if (null != cwbm.getTid()) {
                    params.put("tid", String.valueOf(cwbm.getTid()));
                }
                docInfo.setOtherParams(params);

                boolean hasTitle = docInfo.getTitle() != null
                        && docInfo.getTitle().trim().length() > 0;
                boolean hasBody = docInfo.getContentNoTag() != null
                        && docInfo.getContentNoTag().trim().length() > 0;
                if (hasTitle && hasBody) {
                    if (docInfo.getPublishDate() != null && docInfo.getPublishDate().trim().length() > 0) {
                        try {
                            ContentFileResult contentFileResult =
                                    getContentFile(docInfo.getContentWithTag(), docInfo.getSourceaddress());
                            docInfo.setContentWithTag(
                                    ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                            docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
                            Map<String, FileTag> imgMap = contentFileResult.getFileMap();
                            for (FileTag fileTag : imgMap.values()) {
                                // Close every image stream; previously each one was left open.
                                // try-with-resources skips close() when getImg returns null.
                                try (InputStream is = getImg(fileTag.getAbsolutePath())) {
                                    if (is != null) {
                                        int size = is.available();
                                        if (size > 0) {
                                            // Image upload is currently disabled:
                                            // mqSender.sendFile(is, true, fileTag.getSavePath());
                                        }
                                    }
                                }
                            }
                        } catch (Exception e) {
                            log.info(e.getMessage());
                        }
                        System.out.println(docInfo.getTitle() + "---" + docInfo.getSourceaddress());
                        log.info("title:" + docInfo.getTitle() + "|address:" + docInfo.getSourceaddress() +
                                "|content:" + (docInfo.getContentNoTag() == null ? "" : docInfo.getContentNoTag().length() + ""));
                        // Store the document; publishing to Kafka is not performed in this method.
                        intsertData(docInfo);
                    } else {
                        log.info("资讯发布时间:" + docInfo.getPublishDate());
                    }
                    // Counted whenever title and body are present, even if the publish date was missing.
                    count++;
                } else {
                    log.info("资讯内容:" + docInfo.getContentNoTag());
                }

                // Record the URL in the cache so later runs treat it as a duplicate.
                // NOTE(review): the third argument (-1) presumably means "no expiry" - confirm in JedisUtil.
                System.out.println("加入缓存池");
                JedisUtil.setString(Constants.SOURCEADDRESS + "_" + cwbm.getSourceaddress(), "1", -1);
            } catch (Exception e) {
                // One failing hit must not abort the remaining ones.
                log.info("访问出错" + e.getMessage());
            }
        }
        System.out.println("本次成功件数:" + count);
        log.info("本次成功件数:" + count);
    } catch (Exception e) {
        log.info("访问出错" + e.getMessage());
    }
}
// 抓取新闻内容 // 抓取新闻内容
private void CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) { public void CatchWebNews2(List<CatchWebByMetaSearch> catchWebList,String keyword) {
try { try {
int count = 0; int count = 0;
int g=catchWebList.size()>5?5:catchWebList.size(); int g=catchWebList.size()>5?5:catchWebList.size();
for (int i = 0; i < g ; i++) { // for (int i = 0; i < g ; i++) {
// for (int i = 0; i < catchWebList.size(); i++) { for (int i = 0; i < catchWebList.size(); i++) {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
// 判断该网址是否存在于缓存池中 // 判断该网址是否存在于缓存池中
...@@ -395,7 +563,8 @@ public class WebYahooSearchThread implements Runnable { ...@@ -395,7 +563,8 @@ public class WebYahooSearchThread implements Runnable {
{ {
long snowID = SnowIdUtils.uniqueLong(); long snowID = SnowIdUtils.uniqueLong();
String id=snowID+""; String id=snowID+"";
String sid=docInfo.getSid()+""; String sid="2022070304";
// String sid=docInfo.getSid()+"";
String title=docInfo.getTitle(); String title=docInfo.getTitle();
String summ = docInfo.getSummary(); String summ = docInfo.getSummary();
if(summ!=null && summ.length()>5000){ if(summ!=null && summ.length()>5000){
......
...@@ -2,6 +2,7 @@ package com.zzsn.search.yaooThread; ...@@ -2,6 +2,7 @@ package com.zzsn.search.yaooThread;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.paser.SeleniumTime;
import com.zzsn.search.util.GetCookies; import com.zzsn.search.util.GetCookies;
import com.zzsn.utility.index.Constants; import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch; import com.zzsn.utility.model.CatchWebByMetaSearch;
...@@ -36,7 +37,7 @@ public class YahooRecorderUtil { ...@@ -36,7 +37,7 @@ public class YahooRecorderUtil {
// 提取新闻列表URL // 提取新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfYahooList( public static List<CatchWebByMetaSearch> catchWebOfYahooList(
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate) { List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate,String kWord) {
try { try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
...@@ -68,41 +69,160 @@ public class YahooRecorderUtil { ...@@ -68,41 +69,160 @@ public class YahooRecorderUtil {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
CatchWebByMetaSearch catchWebByMetaSearch = null; CatchWebByMetaSearch catchWebByMetaSearch = null;
for (int m=0;m<firstElementsLink.size();m++) { for (int m=0;m<firstElementsLink.size();m++) {
catchWebByMetaSearch = new CatchWebByMetaSearch(); try {
String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text(); catchWebByMetaSearch = new CatchWebByMetaSearch();
//发布时间 String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text();
String publishDate = DateUtil.getPublishDate(orainAndDatestr); //发布时间
catchWebByMetaSearch.setPublishDate(publishDate); String publishDate = DateUtil.getPublishDate(orainAndDatestr);
//来源 catchWebByMetaSearch.setPublishDate(publishDate);
String orin = firstElementsLink.get(m).select("span.s-source").text().trim(); //来源
catchWebByMetaSearch.setSourcesite(orin); String orin = firstElementsLink.get(m).select("span.s-source").text().trim();
Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb"); catchWebByMetaSearch.setSourcesite(orin);
if (titleAndUrl.size()>0) { // Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb");
//标题 Elements titleAndUrl = firstElementsLink.get(m).select("a[referrerpolicy=\"origin\"]");
String title = titleAndUrl.get(0).attr("title"); String title = titleAndUrl.get(0).text();
catchWebByMetaSearch.setTitle(title); catchWebByMetaSearch.setTitle(title);
//源网址
Element element = titleAndUrl.get(0); Element element = titleAndUrl.get(0);
element.setBaseUri(uri_code); element.setBaseUri(uri_code);
String addressurl = titleAndUrl.get(0).absUrl("href"); String addressurl = titleAndUrl.get(0).absUrl("href");
String realUrl = sendGet(addressurl); String realUrl = sendGet(addressurl);
catchWebByMetaSearch.setSourceaddress(realUrl); Document parse = Jsoup.parse(realUrl);
String attr = parse.select("META").get(1).attr("content");
String attrurl = attr.substring(attr.indexOf("URL='") + 5, attr.length() - 2);
catchWebByMetaSearch.setSourceaddress(attrurl);
// if (titleAndUrl.size()>0) {
// //标题
// String title = titleAndUrl.get(0).attr("title");
// catchWebByMetaSearch.setTitle(title);
// //源网址
// Element element = titleAndUrl.get(0);
// element.setBaseUri(uri_code);
// String addressurl = titleAndUrl.get(0).absUrl("href");
// String realUrl = sendGet(addressurl);
// catchWebByMetaSearch.setSourceaddress(realUrl);
//
// }
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){
} }
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
} }
for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){ WebYahooSearchThread webYahooSearch =new WebYahooSearchThread();
ObjectMapper mapper = new ObjectMapper(); webYahooSearch.CatchWebNews(metaSearchList,kWord);
// for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
// ObjectMapper mapper = new ObjectMapper();
// try {
// String docjson = mapper.writeValueAsString(catchMetaSearch);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
// log.info("发送到kafka成功。");
// }catch (Exception e){
// log.info(e.getMessage());
// }
// }
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
// 提取新闻列表URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfWebList(
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate,String kWord) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
URL url = new URL(urlList.get(i));
URI uri = null;
String uri_code = "";
try {
uri = new URI(url.getProtocol(), url.getHost(),
url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+");
} catch (URISyntaxException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Connection conn = Jsoup.connect(uri_code);
conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50");
Document doc = null;
try {
doc = conn.timeout(10000).get();
} catch (Exception ex) {
// ex.printStackTrace();
System.out.println("Yahoo搜索中该关键词搜索没有相关新闻!");
// continue;
SeleniumTime seleniumTime=new SeleniumTime();
String docstr=seleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(docstr);
}
System.out.println("----Yahoo搜索----" + uri);
Elements firstElementsLink = doc.select("div[class=\"gsc-webResult gsc-result\"]");
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
if (firstElementsLink.size()<1){
SeleniumTime seleniumTime=new SeleniumTime();
String docstr=seleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(docstr);
firstElementsLink = doc.select("div[class=\"gsc-webResult gsc-result\"]");
}
CatchWebByMetaSearch catchWebByMetaSearch = null;
for (int m=0;m<firstElementsLink.size();m++) {
try { try {
String docjson = mapper.writeValueAsString(catchMetaSearch); catchWebByMetaSearch = new CatchWebByMetaSearch();
kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson); // String orainAndDatestr = firstElementsLink.get(m).select("span.s-time").text();
log.info("发送到kafka成功。"); // //发布时间
// String publishDate = DateUtil.getPublishDate(orainAndDatestr);
// catchWebByMetaSearch.setPublishDate(publishDate);
//来源
String orin = firstElementsLink.get(m).select("div[dir=\"ltr\"]").text().trim();
catchWebByMetaSearch.setSourcesite(orin);
// Elements titleAndUrl = firstElementsLink.get(m).select("a.thmb");
Elements titleAndUrl = firstElementsLink.get(m).select("a[class=\"gs-title\"]");
String title = titleAndUrl.get(0).text();
catchWebByMetaSearch.setTitle(title);
Element element = titleAndUrl.get(0);
element.setBaseUri(uri_code);
String addressurl = titleAndUrl.get(0).absUrl("href");
catchWebByMetaSearch.setSourceaddress(addressurl);
// if (titleAndUrl.size()>0) {
// //标题
// String title = titleAndUrl.get(0).attr("title");
// catchWebByMetaSearch.setTitle(title);
// //源网址
// Element element = titleAndUrl.get(0);
// element.setBaseUri(uri_code);
// String addressurl = titleAndUrl.get(0).absUrl("href");
// String realUrl = sendGet(addressurl);
// catchWebByMetaSearch.setSourceaddress(realUrl);
//
// }
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage());
} }
} }
WebYahooSearchThread webYahooSearch =new WebYahooSearchThread();
webYahooSearch.CatchWebNews(metaSearchList,kWord);
// for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
// ObjectMapper mapper = new ObjectMapper();
// try {
// String docjson = mapper.writeValueAsString(catchMetaSearch);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
// log.info("发送到kafka成功。");
// }catch (Exception e){
// log.info(e.getMessage());
// }
// }
catchWebByMetaSearchList.addAll(metaSearchList); catchWebByMetaSearchList.addAll(metaSearchList);
} }
......
...@@ -43,7 +43,7 @@ META_SEARCH_URL=https://news.search.yahoo.com/search?p=[keyword]&ei=UTF-8&b=[pn] ...@@ -43,7 +43,7 @@ META_SEARCH_URL=https://news.search.yahoo.com/search?p=[keyword]&ei=UTF-8&b=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word= #META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50 #META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak.txt META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\yahoo\\data\\project.txt
# Redis settings # Redis settings
redis.host=127.0.0.1 redis.host=127.0.0.1
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论