Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
M
meta_crawler
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
刘伟刚
meta_crawler
Commits
af30a040
提交
af30a040
authored
7月 27, 2022
作者:
liuweigang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
通用采集代码更新
上级
649ac47c
隐藏空白字符变更
内嵌
并排
正在显示
22 个修改的文件
包含
3325 行增加
和
974 行删除
+3325
-974
doc.txt
comm_crawler/doc.txt
+2
-0
CrawlerStaticApplication.java
...wler/src/main/java/com/zzsn/CrawlerStaticApplication.java
+86
-723
PaserSiteDownload.java
...ler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
+1
-0
SiteThread.java
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
+65
-3
WebContentPaserByRegular.java
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
+25
-13
HttpgetUtil.java
...src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
+129
-0
SeleniumTime.java
...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
+1
-0
SeleniumTime4.java
...c/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
+231
-0
WebContentPaserByXpath.java
...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
+2
-2
JedisUtil.java
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
+2
-1
KafkaConsumerJob.java
..._crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
+1
-1
ChromeTest.java
comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
+1
-0
HttpClientTester.java
...crawler/src/main/java/com/zzsn/test/HttpClientTester.java
+4
-1
WebTest.java
comm_crawler/src/main/java/com/zzsn/test/WebTest.java
+5
-13
ContentUtility.java
comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
+9
-7
DriverUtil.java
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
+17
-6
Utility.java
comm_crawler/src/main/java/com/zzsn/util/Utility.java
+398
-183
WindowsProcess.java
comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
+17
-12
aa.txt
comm_crawler/src/main/resources/aa.txt
+2312
-0
application.properties
comm_crawler/src/main/resources/application.properties
+4
-4
constants.properties
comm_crawler/src/main/resources/constants.properties
+7
-3
redis.properties
comm_crawler/src/main/resources/redis.properties
+6
-2
没有找到文件。
comm_crawler/doc.txt
浏览文件 @
af30a040
...
@@ -11,3 +11,5 @@
...
@@ -11,3 +11,5 @@
comm_crawler/src/main/java/com/zzsn/CrawlerStaticApplication.java
浏览文件 @
af30a040
...
@@ -3,9 +3,12 @@ package com.zzsn;
...
@@ -3,9 +3,12 @@ package com.zzsn;
import
com.google.gson.Gson
;
import
com.google.gson.Gson
;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.crawler.DynaminSiteThread
;
import
com.zzsn.crawler.DynaminSiteThread
;
import
com.zzsn.crawler.SiteThread
;
import
com.zzsn.crawlerOther.ArticleCrawler
;
import
com.zzsn.crawlerOther.ArticleCrawler
;
import
com.zzsn.entity.SiteMsgTemple
;
import
com.zzsn.entity.SiteMsgTemple
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.FileUtil
;
import
com.zzsn.job.JedisUtil
;
import
com.zzsn.job.KafkaConsumerJob
;
import
com.zzsn.job.KafkaConsumerJob
;
import
org.apache.kafka.clients.consumer.ConsumerRecord
;
import
org.apache.kafka.clients.consumer.ConsumerRecord
;
import
org.apache.kafka.clients.consumer.ConsumerRecords
;
import
org.apache.kafka.clients.consumer.ConsumerRecords
;
...
@@ -20,7 +23,9 @@ import org.springframework.boot.web.servlet.ServletComponentScan;
...
@@ -20,7 +23,9 @@ import org.springframework.boot.web.servlet.ServletComponentScan;
import
org.springframework.boot.web.servlet.support.SpringBootServletInitializer
;
import
org.springframework.boot.web.servlet.support.SpringBootServletInitializer
;
import
org.springframework.context.ConfigurableApplicationContext
;
import
org.springframework.context.ConfigurableApplicationContext
;
import
java.io.File
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
@SpringBootApplication
(
scanBasePackages
=
"com.zzsn"
)
@SpringBootApplication
(
scanBasePackages
=
"com.zzsn"
)
//@ServletComponentScan
//@ServletComponentScan
...
@@ -39,17 +44,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -39,17 +44,15 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
}
}
@Override
@Override
public
void
run
(
String
...
args
)
throws
Exception
{
public
void
run
(
String
...
args
)
throws
Exception
{
// loadSiteMsg();
try
{
loadSiteMsg
();
}
catch
(
Exception
e
)
{
loadSiteMsg
();
}
// loadSiteMsgLoc();
// loadSiteMsgLoc();
// loadSiteMsgLoc2();
// loadSiteMsgLoc2();
// loadSiteMsgLoc3();
// loadSiteMsgLoc3();
// loadSiteMsgLoc4();
// loadSiteMsgLoc5();
// loadSiteMsgLoc6();
// loadSiteMsgLoc7();
// loadSiteMsgLoc8();
// loadSiteMsgLoc9();
// loadSiteMsgLoc10();
}
}
public
void
loadSiteMsg
(){
public
void
loadSiteMsg
(){
try
{
try
{
...
@@ -90,88 +93,31 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -90,88 +93,31 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
loadSiteMsg
();
loadSiteMsg
();
}
}
}
}
public
void
loadSiteMsgLoc
(){
public
void
loadSiteMsgLoc
()
{
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
String
filepath
=
Constants
.
IMGPATH
;
// kafkaConsumerJob.consumer();
System
.
out
.
println
(
filepath
);
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
// articleCrawler.consumer();
try
{
System
.
out
.
println
(
"——————++++++++++++——————==="
);
File
f
=
new
File
(
filepath
);
String
value
=
"{\n"
+
List
<
String
>
allLines
=
FileUtil
.
getFileLines
(
f
,
"utf-8"
);
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
System
.
out
.
println
(
allLines
.
size
());
" \"id\": \"1541605392350359554\",\n"
+
for
(
String
keysite:
allLines
)
{
" \"infoSourceCode\": \"IN-20220628-0001\",\n"
+
try
{
" \"webSiteName\": \"审计署\",\n"
+
String
value
=
JedisUtil
.
getString
(
"INFO_SOURCE_TO_REDIS::"
+
keysite
);
" \"siteName\": \"审计署-法律法规\",\n"
+
System
.
out
.
println
(
"——————++++++++++++——————==="
);
" \"siteUri\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n"
+
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
" \"infoSourceTypeId\": \"1\",\n"
+
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
" \"siteLevel\": null,\n"
+
siteMsgTemple
.
setYnDynamicCrawl
(
1
);
" \"language\": null,\n"
+
// siteMsgTemple.getYnDynamicCrawl()
" \"checkedList\": null,\n"
+
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
" \"hisUriExp\": null,\n"
+
siteThread
.
crawler
();
" \"hisDateStartTime\": null,\n"
+
}
catch
(
Exception
e
){
" \"hisDateEndTime\": null,\n"
+
continue
;
" \"ynHisDataAll\": \"0\",\n"
+
}
" \"status\": null,\n"
+
}
" \"listUrl\": \"https://www.audit.gov.cn/n6/n36/index.html\",\n"
+
}
catch
(
Exception
e
){
" \"listExpressionType\": \"3\",\n"
+
e
.
getMessage
();
" \"informationUrl\": null,\n"
+
}
" \"informationTitle\": \"a\",\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\": \"div[class=\\\"list-box-dl\\\"]>span>dl\",\n"
+
" \"linkLocation\": \"a\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": null,\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": null,\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>dd[class=\\\"fb-time\\\"]</exp></publish_date>\",\n"
+
" \"detailExpressionSource\": \"<origin><exp>dd[class=\\\"ly-name\\\"]</exp></origin>\",\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"textSize\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\": \"0\",\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"link\": null,\n"
+
" \"account\": null,\n"
+
" \"password\": null,\n"
+
" \"userAgent\": null,\n"
+
" \"referer\": null,\n"
+
" \"cookies\": null,\n"
+
" \"headers\": null,\n"
+
" \"otherInfo\": null,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"21 12 10 1/1 * ?\"\n"
+
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
siteThread
.
crawler
();
}
}
public
void
loadSiteMsgLoc2
(){
public
void
loadSiteMsgLoc2
(){
...
@@ -182,59 +128,59 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -182,59 +128,59 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System
.
out
.
println
(
"——————++++++++++++——————==="
);
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"15
41618011601838081
\",\n"
+
" \"id\": \"15
34691893595426818
\",\n"
+
" \"infoSourceCode\": \"IN-202206
28-0002
\",\n"
+
" \"infoSourceCode\": \"IN-202206
09-50867
\",\n"
+
" \"webSiteName\": \"
北京市审计局
\",\n"
+
" \"webSiteName\": \"
美国CNN有线电视新闻网
\",\n"
+
" \"siteName\": \"
北京市审计局-法律法规
\",\n"
+
" \"siteName\": \"
美国CNN有线电视新闻网-world
\",\n"
+
" \"siteUri\": \"http
://sjj.beijing.gov.cn/zwxx/flfg/
\",\n"
+
" \"siteUri\": \"http
s://edition.cnn.com/world
\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\":
null
,\n"
+
" \"siteLevel\":
\"2\"
,\n"
+
" \"language\":
null
,\n"
+
" \"language\":
\"en\"
,\n"
+
" \"checkedList\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\":
\"0\"
,\n"
+
" \"ynHisDataAll\":
null
,\n"
+
" \"status\":
null
,\n"
+
" \"status\":
\"1\"
,\n"
+
" \"listUrl\":
\"http://sjj.beijing.gov.cn/zwxx/flfg/\"
,\n"
+
" \"listUrl\":
null
,\n"
+
" \"listExpressionType\": \"
3
\",\n"
+
" \"listExpressionType\": \"
0
\",\n"
+
" \"informationUrl\":
null
,\n"
+
" \"informationUrl\":
\"https://edition\\\\.cnn\\\\.com/[\\\\d]{1,}/[\\\\d]{1,}/[\\\\d]{1,}/.*\"
,\n"
+
" \"informationTitle\":
\"a\"
,\n"
+
" \"informationTitle\":
null
,\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\":
\"ul[class=\\\"list\\\"]>li\"
,\n"
+
" \"infoBlockPosition\":
null
,\n"
+
" \"linkLocation\":
\"a\"
,\n"
+
" \"linkLocation\":
null
,\n"
+
" \"extractInfo\": \"[{\\\"id\\\":
4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":
\\\"\\\"}]\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":
0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\":
\\\"\\\"}]\",\n"
+
" \"crawlDepth\":
null
,\n"
+
" \"crawlDepth\":
3
,\n"
+
" \"pageUrl\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\":
\"0\"
,\n"
+
" \"ynPageAll\":
null
,\n"
+
" \"detailExpressionType\": \"
3
\",\n"
+
" \"detailExpressionType\": \"
0
\",\n"
+
" \"detailUrl\": null,\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": \"<title><exp>
div[class=\\\"title\\\"]>h1
</exp></title>\",\n"
+
" \"detailExpressionTitle\": \"<title><exp>
*.h1[class=\\\"pg-headline\\\"]
</exp></title>\",\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>
div[class=\\\"pubdat
e\\\"]</exp></publish_date>\",\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>
*.p[class=\\\"update-tim
e\\\"]</exp></publish_date>\",\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>
div[class=\\\"content
\\\"]</exp></content>\",\n"
+
" \"detailExpressionContent\": \"<content><exp>
*.div[class=\\\"l-container
\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":
4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":
\\\"\\\"}]\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":
0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"expression\\\":
\\\"\\\"}]\",\n"
+
" \"ynDownload\":
\"0\"
,\n"
+
" \"ynDownload\":
null
,\n"
+
" \"formUrl\": null,\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":
4,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":
\\\"\\\"}]\",\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":
0, \\\"name\\\": \\\"\\\", \\\"explain\\\": \\\"\\\", \\\"mapping\\\": \\\"\\\", \\\"expression\\\": \\\"\\\", \\\"primaryKey\\\":
\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\":
\"0\"
,\n"
+
" \"ynDataPageAll\":
null
,\n"
+
" \"dataType\": 0,\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{
\\\"accessMode\\\":\\\"FTP\\\"
}\",\n"
+
" \"dataStorageInfo\": \"{}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"domainName\": null,\n"
+
...
@@ -249,8 +195,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -249,8 +195,8 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlType\": 1,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\":
\"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\"
,\n"
+
" \"parameter\":
null
,\n"
+
" \"cron\": \"
30 02 11 1/1
* ?\"\n"
+
" \"cron\": \"
12 5 0/10 *
* ?\"\n"
+
"}"
;
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
...
@@ -266,211 +212,43 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -266,211 +212,43 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System
.
out
.
println
(
"——————++++++++++++——————==="
);
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"15
41670628478623746
\",\n"
+
" \"id\": \"15
34691893595426818
\",\n"
+
" \"infoSourceCode\": \"IN-202206
28-0003
\",\n"
+
" \"infoSourceCode\": \"IN-202206
09-50867
\",\n"
+
" \"webSiteName\": \"
上海市审计厅
\",\n"
+
" \"webSiteName\": \"
美国CNN有线电视新闻网
\",\n"
+
" \"siteName\": \"
上海市审计厅-规范性文件
\",\n"
+
" \"siteName\": \"
美国CNN有线电视新闻网-world
\",\n"
+
" \"siteUri\": \"https://
sjj.sh.gov.cn/zcwj_gfxwj/index.html
\",\n"
+
" \"siteUri\": \"https://
edition.cnn.com/world
\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\":
null
,\n"
+
" \"siteLevel\":
\"2\"
,\n"
+
" \"language\":
null
,\n"
+
" \"language\":
\"en\"
,\n"
+
" \"checkedList\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"status\": null,\n"
+
" \"status\": \"1\",\n"
+
" \"listUrl\": \"https://sjj.sh.gov.cn/zcwj_gfxwj/index.html\",\n"
+
" \"listUrl\": null,\n"
+
" \"listExpressionType\": \"3\",\n"
+
" \"listExpressionType\": \"0\",\n"
+
" \"informationUrl\": null,\n"
+
" \"informationUrl\": \"https://edition.cnn.com/[\\\\d]{1,}/[\\\\d]{1,}/[\\\\d]{1,}/.*\",\n"
+
" \"informationTitle\": \"a\",\n"
+
" \"informationTitle\": null,\n"
+
" \"informationPublishDate\": \"span\",\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\": \"ul[class=\\\"zfgk_area_list\\\"]>li\",\n"
+
" \"linkLocation\": \"a\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": \"3\",\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": null,\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"ivs_content\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\": \"0\",\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"link\": null,\n"
+
" \"account\": null,\n"
+
" \"password\": null,\n"
+
" \"userAgent\": null,\n"
+
" \"referer\": null,\n"
+
" \"cookies\": null,\n"
+
" \"headers\": null,\n"
+
" \"otherInfo\": null,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"35 31 14 1/1 * ?\"\n"
+
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
siteThread
.
crawler
();
}
public
void
loadSiteMsgLoc4
(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"1541705220539490306\",\n"
+
" \"infoSourceCode\": \"IN-20220628-0004\",\n"
+
" \"webSiteName\": \"湖北省审计厅\",\n"
+
" \"siteName\": \"湖北省审计厅-规范性文件\",\n"
+
" \"siteUri\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\": null,\n"
+
" \"language\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"status\": null,\n"
+
" \"listUrl\": \"https://sjt.hubei.gov.cn/zfxxgk_GK2020/zc_GK2020/gfxwj_GK2020/#test\",\n"
+
" \"listExpressionType\": \"3\",\n"
+
" \"informationUrl\": null,\n"
+
" \"informationTitle\": \"a\",\n"
+
" \"informationPublishDate\": \"span\",\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\": \"ul[id=\\\"ulList\\\"]>li\",\n"
+
" \"linkLocation\": \"a\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": \"3\",\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": \"<title><exp>div[class=\\\"article\\\"]>h2</exp></title>\",\n"
+
" \"detailExpressionPublishDate\": null,\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article-box\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\": \"0\",\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"link\": null,\n"
+
" \"account\": null,\n"
+
" \"password\": null,\n"
+
" \"userAgent\": null,\n"
+
" \"referer\": null,\n"
+
" \"cookies\": null,\n"
+
" \"headers\": null,\n"
+
" \"otherInfo\": null,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"02 49 16 1/1 * ?\"\n"
+
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
siteThread
.
crawler
();
}
public
void
loadSiteMsgLoc5
(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"1541722286336188418\",\n"
+
" \"infoSourceCode\": \"IN-20220628-0005\",\n"
+
" \"webSiteName\": \"审计署\",\n"
+
" \"siteName\": \"审计署-审计要闻\",\n"
+
" \"siteUri\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\": null,\n"
+
" \"language\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"status\": null,\n"
+
" \"listUrl\": \"https://www.audit.gov.cn/n4/n19/index.html\",\n"
+
" \"listExpressionType\": \"3\",\n"
+
" \"informationUrl\": null,\n"
+
" \"informationTitle\": \"dt[class=\\\"fl\\\"]>a\",\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\":
\"span[id=\\\"comp_10044770\\\"]>dl\"
,\n"
+
" \"infoBlockPosition\":
null
,\n"
+
" \"linkLocation\":
\"dt[class=\\\"fl\\\"]>a\"
,\n"
+
" \"linkLocation\":
null
,\n"
+
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\":
null
,\n"
+
" \"crawlDepth\":
3
,\n"
+
" \"pageUrl\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": \"
3
\",\n"
+
" \"detailExpressionType\": \"
0
\",\n"
+
" \"detailUrl\": null,\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": \"<title><exp>
div[class=\\\"con-article-titl
e\\\"]</exp></title>\",\n"
+
" \"detailExpressionTitle\": \"<title><exp>
*.h1[class=\\\"pg-headlin
e\\\"]</exp></title>\",\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>
dd[class=\\\"fb
-time\\\"]</exp></publish_date>\",\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>
*.p[class=\\\"update
-time\\\"]</exp></publish_date>\",\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>
div[id=\\\"textSize
\\\"]</exp></content>\",\n"
+
" \"detailExpressionContent\": \"<content><exp>
*.div[class=\\\"l-container
\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formUrl\": null,\n"
+
...
@@ -486,7 +264,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -486,7 +264,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"dataType\": 0,\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{
\\\"accessMode\\\":\\\"FTP\\\"
}\",\n"
+
" \"dataStorageInfo\": \"{}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"domainName\": null,\n"
+
...
@@ -502,256 +280,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -502,256 +280,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
" \"crawlName\": null,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"51 56 1/2 * * ?\"\n"
+
" \"cron\": \"12 5 0/10 * * ?\"\n"
+
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
siteThread
.
crawler
();
}
public
void
loadSiteMsgLoc6
(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"1541723496678105090\",\n"
+
" \"infoSourceCode\": \"IN-20220628-0006\",\n"
+
" \"webSiteName\": \"上海市审计局\",\n"
+
" \"siteName\": \"上海市审计局-审计要闻\",\n"
+
" \"siteUri\": \"https://sjj.sh.gov.cn/n388/index.html\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\": null,\n"
+
" \"language\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"status\": null,\n"
+
" \"listUrl\": \"https://sjj.sh.gov.cn/n388/index.html\",\n"
+
" \"listExpressionType\": null,\n"
+
" \"informationUrl\": null,\n"
+
" \"informationTitle\": \"a\",\n"
+
" \"informationPublishDate\": null,\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\": \"u1[class=\\\"dtul dtul1\\\"]>li\",\n"
+
" \"linkLocation\": \"a\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": null,\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": null,\n"
+
" \"detailExpressionPublishDate\": null,\n"
+
" \"detailExpressionSource\": null,\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": null,\n"
+
" \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\": \"0\",\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"link\": null,\n"
+
" \"account\": null,\n"
+
" \"password\": null,\n"
+
" \"userAgent\": null,\n"
+
" \"referer\": null,\n"
+
" \"cookies\": null,\n"
+
" \"headers\": null,\n"
+
" \"otherInfo\": null,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"40 01 1/2 * * ?\"\n"
+
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
siteThread
.
siteMsgTemple
=
siteMsgTemple
;
siteThread
.
crawler
();
}
/**
 * Local debugging entry point: crawls one hard-coded information source.
 *
 * <p>Prints a banner line, builds the InfoSourceDTO JSON for the source
 * "IN-20220622-0007" inline, deserializes it with Gson into a
 * {@link SiteMsgTemple}, hands that template to a new
 * {@link DynaminSiteThread} and invokes {@code crawler()} on it directly.
 *
 * <p>The list and detail expressions in the JSON are CSS-style selectors
 * (expression type "3").
 */
public void loadSiteMsgLoc7() {
    System.out.println("——————++++++++++++——————===");

    // Hard-coded source definition; every fragment is kept exactly as configured.
    String infoSourceJson = "{\n"
            + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
            + " \"id\": \"1539588694743097346\",\n"
            + " \"infoSourceCode\": \"IN-20220622-0007\",\n"
            + " \"webSiteName\": \"新华丝路\",\n"
            + " \"siteName\": \"新华丝路-投资资讯\",\n"
            + " \"siteUri\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n"
            + " \"infoSourceTypeId\": \"1\",\n"
            + " \"siteLevel\": null,\n"
            + " \"language\": null,\n"
            + " \"checkedList\": null,\n"
            + " \"hisUriExp\": null,\n"
            + " \"hisDateStartTime\": null,\n"
            + " \"hisDateEndTime\": null,\n"
            + " \"ynHisDataAll\": \"0\",\n"
            + " \"status\": null,\n"
            + " \"listUrl\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n"
            + " \"listExpressionType\": \"3\",\n"
            + " \"informationUrl\": null,\n"
            + " \"informationTitle\": \"h5[class=\\\"text-xl\\\"]>a\",\n"
            + " \"informationPublishDate\": \"\",\n"
            + " \"informationSource\": null,\n"
            + " \"infoBlockPosition\": \"div[class=\\\"mb-3\\\"]>ul>li\",\n"
            + " \"linkLocation\": \"h5[class=\\\"text-xl\\\"]>a\",\n"
            + " \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"crawlDepth\": null,\n"
            + " \"pageUrl\": null,\n"
            + " \"matchPage\": null,\n"
            + " \"pageStart\": 0,\n"
            + " \"pageEnd\": 0,\n"
            + " \"ynPageAll\": \"0\",\n"
            + " \"detailExpressionType\": \"3\",\n"
            + " \"detailUrl\": null,\n"
            + " \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"text-2xl md:text-4xl mb-4 font-song\\\"]</exp></title>\",\n"
            + " \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n"
            + " \"detailExpressionSource\": \"<origin><exp>span:contains(来源)</exp></origin>\",\n"
            + " \"detailExpressionAuthor\": null,\n"
            + " \"detailExpressionSummary\": null,\n"
            + " \"detailExpressionContent\": \"<content><exp>div[id=\\\"article\\\"]</exp></content>\",\n"
            + " \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"ynDownload\": \"0\",\n"
            + " \"formUrl\": null,\n"
            + " \"formTitle\": null,\n"
            + " \"formType\": null,\n"
            + " \"dataFormExpression\": null,\n"
            + " \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
            + " \"dataPageUrl\": null,\n"
            + " \"dataPageRule\": null,\n"
            + " \"dataPageStart\": 0,\n"
            + " \"dataPageEnd\": 0,\n"
            + " \"ynDataPageAll\": \"0\",\n"
            + " \"dataType\": 0,\n"
            + " \"dataFormat\": 0,\n"
            + " \"dataStorageMode\": 0,\n"
            + " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
            + " \"ynDynamicCrawl\": 1,\n"
            + " \"ynLogin\": 0,\n"
            + " \"domainName\": null,\n"
            + " \"link\": null,\n"
            + " \"account\": null,\n"
            + " \"password\": null,\n"
            + " \"userAgent\": null,\n"
            + " \"referer\": null,\n"
            + " \"cookies\": null,\n"
            + " \"headers\": null,\n"
            + " \"otherInfo\": null,\n"
            + " \"crawlType\": 1,\n"
            + " \"crawlName\": null,\n"
            + " \"crawlAddress\": null,\n"
            + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
            + " \"cron\": \"43 38 20 1/1 * ?\"\n"
            + "}";

    // Parse first, then create the worker, so a malformed definition fails before any crawler object exists.
    SiteMsgTemple template = new Gson().fromJson(infoSourceJson, SiteMsgTemple.class);
    DynaminSiteThread worker = new DynaminSiteThread();
    worker.siteMsgTemple = template;
    worker.crawler();
}
public
void
loadSiteMsgLoc8
(){
// KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
// kafkaConsumerJob.consumer();
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer();
System
.
out
.
println
(
"——————++++++++++++——————==="
);
String
value
=
"{\n"
+
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
+
" \"id\": \"1539588694743097346\",\n"
+
" \"infoSourceCode\": \"IN-20220622-0007\",\n"
+
" \"webSiteName\": \"新华丝路\",\n"
+
" \"siteName\": \"新华丝路-投资资讯\",\n"
+
" \"siteUri\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n"
+
" \"infoSourceTypeId\": \"1\",\n"
+
" \"siteLevel\": null,\n"
+
" \"language\": null,\n"
+
" \"checkedList\": null,\n"
+
" \"hisUriExp\": null,\n"
+
" \"hisDateStartTime\": null,\n"
+
" \"hisDateEndTime\": null,\n"
+
" \"ynHisDataAll\": \"0\",\n"
+
" \"status\": null,\n"
+
" \"listUrl\": \"https://www.imsilkroad.com/news/category/touzizixun\",\n"
+
" \"listExpressionType\": \"3\",\n"
+
" \"informationUrl\": null,\n"
+
" \"informationTitle\": \"h5[class=\\\"text-xl\\\"]>a\",\n"
+
" \"informationPublishDate\": \"\",\n"
+
" \"informationSource\": null,\n"
+
" \"infoBlockPosition\": \"div[class=\\\"mb-3\\\"]>ul>li\",\n"
+
" \"linkLocation\": \"h5[class=\\\"text-xl\\\"]>a\",\n"
+
" \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"crawlDepth\": null,\n"
+
" \"pageUrl\": null,\n"
+
" \"matchPage\": null,\n"
+
" \"pageStart\": 0,\n"
+
" \"pageEnd\": 0,\n"
+
" \"ynPageAll\": \"0\",\n"
+
" \"detailExpressionType\": \"3\",\n"
+
" \"detailUrl\": null,\n"
+
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"text-2xl md:text-4xl mb-4 font-song\\\"]</exp></title>\",\n"
+
" \"detailExpressionPublishDate\": \"<publish_date><exp>span:contains(时间)</exp></publish_date>\",\n"
+
" \"detailExpressionSource\": \"<origin><exp>span:contains(来源)</exp></origin>\",\n"
+
" \"detailExpressionAuthor\": null,\n"
+
" \"detailExpressionSummary\": null,\n"
+
" \"detailExpressionContent\": \"<content><exp>div[id=\\\"article\\\"]</exp></content>\",\n"
+
" \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
+
" \"ynDownload\": \"0\",\n"
+
" \"formUrl\": null,\n"
+
" \"formTitle\": null,\n"
+
" \"formType\": null,\n"
+
" \"dataFormExpression\": null,\n"
+
" \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
+
" \"dataPageUrl\": null,\n"
+
" \"dataPageRule\": null,\n"
+
" \"dataPageStart\": 0,\n"
+
" \"dataPageEnd\": 0,\n"
+
" \"ynDataPageAll\": \"0\",\n"
+
" \"dataType\": 0,\n"
+
" \"dataFormat\": 0,\n"
+
" \"dataStorageMode\": 0,\n"
+
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
+
" \"ynDynamicCrawl\": 1,\n"
+
" \"ynLogin\": 0,\n"
+
" \"domainName\": null,\n"
+
" \"link\": null,\n"
+
" \"account\": null,\n"
+
" \"password\": null,\n"
+
" \"userAgent\": null,\n"
+
" \"referer\": null,\n"
+
" \"cookies\": null,\n"
+
" \"headers\": null,\n"
+
" \"otherInfo\": null,\n"
+
" \"crawlType\": 1,\n"
+
" \"crawlName\": null,\n"
+
" \"crawlAddress\": null,\n"
+
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
+
" \"cron\": \"43 38 20 1/1 * ?\"\n"
+
"}"
;
"}"
;
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
SiteMsgTemple
siteMsgTemple
=
new
Gson
().
fromJson
(
value
,
SiteMsgTemple
.
class
);
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
DynaminSiteThread
siteThread
=
new
DynaminSiteThread
();
...
@@ -759,171 +288,5 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
...
@@ -759,171 +288,5 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
siteThread
.
crawler
();
siteThread
.
crawler
();
}
}
/**
 * Local debugging entry point: crawls one hard-coded information source.
 *
 * <p>Prints a banner line, builds the InfoSourceDTO JSON for the source
 * "IN-20220622-0012" inline, deserializes it with Gson into a
 * {@link SiteMsgTemple}, hands that template to a new
 * {@link DynaminSiteThread} and invokes {@code crawler()} on it directly.
 *
 * <p>The list and detail expressions in the JSON are CSS-style selectors
 * (expression type "3").
 */
public void loadSiteMsgLoc9() {
    System.out.println("——————++++++++++++——————===");

    // Hard-coded source definition; every fragment is kept exactly as configured.
    String infoSourceJson = "{\n"
            + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
            + " \"id\": \"1539590279724441602\",\n"
            + " \"infoSourceCode\": \"IN-20220622-0012\",\n"
            + " \"webSiteName\": \"走出去情报\",\n"
            + " \"siteName\": \"走出去情报-最新\",\n"
            + " \"siteUri\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n"
            + " \"infoSourceTypeId\": \"1\",\n"
            + " \"siteLevel\": null,\n"
            + " \"language\": null,\n"
            + " \"checkedList\": null,\n"
            + " \"hisUriExp\": null,\n"
            + " \"hisDateStartTime\": null,\n"
            + " \"hisDateEndTime\": null,\n"
            + " \"ynHisDataAll\": \"0\",\n"
            + " \"status\": null,\n"
            + " \"listUrl\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n"
            + " \"listExpressionType\": \"3\",\n"
            + " \"informationUrl\": null,\n"
            + " \"informationTitle\": \"div[class=\\\"item-text-content-title\\\"]\",\n"
            + " \"informationPublishDate\": null,\n"
            + " \"informationSource\": null,\n"
            + " \"infoBlockPosition\": \"div[class=\\\"FeedList\\\"]\",\n"
            + " \"linkLocation\": \"a\",\n"
            + " \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"crawlDepth\": null,\n"
            + " \"pageUrl\": null,\n"
            + " \"matchPage\": null,\n"
            + " \"pageStart\": 0,\n"
            + " \"pageEnd\": 0,\n"
            + " \"ynPageAll\": \"0\",\n"
            + " \"detailExpressionType\": \"3\",\n"
            + " \"detailUrl\": null,\n"
            + " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"text-title\\\"]>h1</exp></title>\",\n"
            + " \"detailExpressionPublishDate\": \"<publish_date><exp>span[id=\\\"news-time\\\"]</exp></publish_date>\",\n"
            + " \"detailExpressionSource\": null,\n"
            + " \"detailExpressionAuthor\": null,\n"
            + " \"detailExpressionSummary\": null,\n"
            + " \"detailExpressionContent\": \"<content><exp>article[class=\\\"article-info\\\"]</exp></content>\",\n"
            + " \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"ynDownload\": \"0\",\n"
            + " \"formUrl\": null,\n"
            + " \"formTitle\": null,\n"
            + " \"formType\": null,\n"
            + " \"dataFormExpression\": null,\n"
            + " \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
            + " \"dataPageUrl\": null,\n"
            + " \"dataPageRule\": null,\n"
            + " \"dataPageStart\": 0,\n"
            + " \"dataPageEnd\": 0,\n"
            + " \"ynDataPageAll\": \"0\",\n"
            + " \"dataType\": 0,\n"
            + " \"dataFormat\": 0,\n"
            + " \"dataStorageMode\": 0,\n"
            + " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
            + " \"ynDynamicCrawl\": 1,\n"
            + " \"ynLogin\": 0,\n"
            + " \"domainName\": null,\n"
            + " \"link\": null,\n"
            + " \"account\": null,\n"
            + " \"password\": null,\n"
            + " \"userAgent\": null,\n"
            + " \"referer\": null,\n"
            + " \"cookies\": null,\n"
            + " \"headers\": null,\n"
            + " \"otherInfo\": null,\n"
            + " \"crawlType\": 1,\n"
            + " \"crawlName\": null,\n"
            + " \"crawlAddress\": null,\n"
            + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
            + " \"cron\": \"01 45 20 1/1 * ?\"\n"
            + "}";

    // Parse first, then create the worker, so a malformed definition fails before any crawler object exists.
    SiteMsgTemple template = new Gson().fromJson(infoSourceJson, SiteMsgTemple.class);
    DynaminSiteThread worker = new DynaminSiteThread();
    worker.siteMsgTemple = template;
    worker.crawler();
}
/**
 * Local debugging entry point: crawls one hard-coded information source.
 *
 * <p>Prints a banner line, builds the InfoSourceDTO JSON for the source
 * "IN-20220622-0012" inline, deserializes it with Gson into a
 * {@link SiteMsgTemple}, hands that template to a new
 * {@link DynaminSiteThread} and invokes {@code crawler()} on it directly.
 *
 * <p>NOTE(review): the JSON and the steps here are the same as in
 * {@code loadSiteMsgLoc9}; the two are candidates for consolidation.
 */
public void loadSiteMsgLoc10() {
    System.out.println("——————++++++++++++——————===");

    // Hard-coded source definition; every fragment is kept exactly as configured.
    String sourceDefinition = "{\n"
            + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n"
            + " \"id\": \"1539590279724441602\",\n"
            + " \"infoSourceCode\": \"IN-20220622-0012\",\n"
            + " \"webSiteName\": \"走出去情报\",\n"
            + " \"siteName\": \"走出去情报-最新\",\n"
            + " \"siteUri\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n"
            + " \"infoSourceTypeId\": \"1\",\n"
            + " \"siteLevel\": null,\n"
            + " \"language\": null,\n"
            + " \"checkedList\": null,\n"
            + " \"hisUriExp\": null,\n"
            + " \"hisDateStartTime\": null,\n"
            + " \"hisDateEndTime\": null,\n"
            + " \"ynHisDataAll\": \"0\",\n"
            + " \"status\": null,\n"
            + " \"listUrl\": \"https://mp.sohu.com/profile?xpt=OTU4MzI0Nzc0Mzg2NjEwMTc2QHNvaHUuY29t&_f=index_pagemp_1&spm=smpc.content.author.2.1655886952826vbhbnCn\",\n"
            + " \"listExpressionType\": \"3\",\n"
            + " \"informationUrl\": null,\n"
            + " \"informationTitle\": \"div[class=\\\"item-text-content-title\\\"]\",\n"
            + " \"informationPublishDate\": null,\n"
            + " \"informationSource\": null,\n"
            + " \"infoBlockPosition\": \"div[class=\\\"FeedList\\\"]\",\n"
            + " \"linkLocation\": \"a\",\n"
            + " \"extractInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"crawlDepth\": null,\n"
            + " \"pageUrl\": null,\n"
            + " \"matchPage\": null,\n"
            + " \"pageStart\": 0,\n"
            + " \"pageEnd\": 0,\n"
            + " \"ynPageAll\": \"0\",\n"
            + " \"detailExpressionType\": \"3\",\n"
            + " \"detailUrl\": null,\n"
            + " \"detailExpressionTitle\": \"<title><exp>div[class=\\\"text-title\\\"]>h1</exp></title>\",\n"
            + " \"detailExpressionPublishDate\": \"<publish_date><exp>span[id=\\\"news-time\\\"]</exp></publish_date>\",\n"
            + " \"detailExpressionSource\": null,\n"
            + " \"detailExpressionAuthor\": null,\n"
            + " \"detailExpressionSummary\": null,\n"
            + " \"detailExpressionContent\": \"<content><exp>article[class=\\\"article-info\\\"]</exp></content>\",\n"
            + " \"detailInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n"
            + " \"ynDownload\": \"0\",\n"
            + " \"formUrl\": null,\n"
            + " \"formTitle\": null,\n"
            + " \"formType\": null,\n"
            + " \"dataFormExpression\": null,\n"
            + " \"dataFormInfo\": \"[{\\\"id\\\":1,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n"
            + " \"dataPageUrl\": null,\n"
            + " \"dataPageRule\": null,\n"
            + " \"dataPageStart\": 0,\n"
            + " \"dataPageEnd\": 0,\n"
            + " \"ynDataPageAll\": \"0\",\n"
            + " \"dataType\": 0,\n"
            + " \"dataFormat\": 0,\n"
            + " \"dataStorageMode\": 0,\n"
            + " \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n"
            + " \"ynDynamicCrawl\": 1,\n"
            + " \"ynLogin\": 0,\n"
            + " \"domainName\": null,\n"
            + " \"link\": null,\n"
            + " \"account\": null,\n"
            + " \"password\": null,\n"
            + " \"userAgent\": null,\n"
            + " \"referer\": null,\n"
            + " \"cookies\": null,\n"
            + " \"headers\": null,\n"
            + " \"otherInfo\": null,\n"
            + " \"crawlType\": 1,\n"
            + " \"crawlName\": null,\n"
            + " \"crawlAddress\": null,\n"
            + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n"
            + " \"cron\": \"01 45 20 1/1 * ?\"\n"
            + "}";

    // Same order as before: deserialize, create the worker, attach the template, crawl.
    SiteMsgTemple parsedTemplate = new Gson().fromJson(sourceDefinition, SiteMsgTemple.class);
    DynaminSiteThread crawlerThread = new DynaminSiteThread();
    crawlerThread.siteMsgTemple = parsedTemplate;
    crawlerThread.crawler();
}
}
}
comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
浏览文件 @
af30a040
...
@@ -469,6 +469,7 @@ public class PaserSiteDownload {
...
@@ -469,6 +469,7 @@ public class PaserSiteDownload {
}
}
public
static
String
getHtml
(
String
url
,
String
charset
)
{
public
static
String
getHtml
(
String
url
,
String
charset
)
{
java
.
security
.
Security
.
setProperty
(
"networkaddress.cache.ttl"
,
"0"
);
String
html
=
""
;
String
html
=
""
;
CloseableHttpClient
httpClient
=
CreateSSLClientDefault
.
createSSLClientDefault
();
CloseableHttpClient
httpClient
=
CreateSSLClientDefault
.
createSSLClientDefault
();
HttpGet
httpgeturl
=
new
HttpGet
(
url
);
// Get请求
HttpGet
httpgeturl
=
new
HttpGet
(
url
);
// Get请求
...
...
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
浏览文件 @
af30a040
...
@@ -2,6 +2,7 @@ package com.zzsn.crawler;
...
@@ -2,6 +2,7 @@ package com.zzsn.crawler;
import
cn.hutool.core.date.DateTime
;
import
cn.hutool.core.date.DateTime
;
import
cn.hutool.core.date.DateUtil
;
import
cn.hutool.core.date.DateUtil
;
import
cn.hutool.core.io.FileUtil
;
import
com.fasterxml.jackson.core.JsonProcessingException
;
import
com.fasterxml.jackson.core.JsonProcessingException
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.configuration.SpringContextUtil
;
...
@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
...
@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import
com.zzsn.crawler.paser.WebContentPaserByXpath
;
import
com.zzsn.crawler.paser.WebContentPaserByXpath
;
import
com.zzsn.crawler.uriparser.HisURIConfig
;
import
com.zzsn.crawler.uriparser.HisURIConfig
;
import
com.zzsn.crawler.uriparser.HisURIParser
;
import
com.zzsn.crawler.uriparser.HisURIParser
;
import
com.zzsn.crawler.uriparser.HttpgetUtil
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
com.zzsn.generation.Constants
;
import
lombok.extern.slf4j.Slf4j
;
import
lombok.extern.slf4j.Slf4j
;
...
@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
...
@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
public
PaserSiteDownload
paserSiteDownload
=
new
PaserSiteDownload
();
public
PaserSiteDownload
paserSiteDownload
=
new
PaserSiteDownload
();
public
SiteMsgTemple
siteMsgTemple
=
new
SiteMsgTemple
();
public
SiteMsgTemple
siteMsgTemple
=
new
SiteMsgTemple
();
public
KafkaTemplate
kafkaTemplate
=
SpringContextUtil
.
getBean
(
KafkaTemplate
.
class
);
//
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
crawler
();
crawler
();
}
}
public
static
PageDownloader
pageDownload
=
new
PageDownloader
();
public
void
crawler
(){
public
void
crawler
(){
//获取栏目链接以及翻页的链接
//获取栏目链接以及翻页的链接
// List<String> urlList=getPageListUrl(siteMsgTemple);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
siteMsgTemple
.
getSiteUri
());
//兼容就平台的历史链接方法
String
charset
=
"utf-8"
;
//获取列表url等信息通过匹配url过滤
List
<
CatchWebByMetaSearch
>
metaSearchList
=
new
ArrayList
<>();
List
<
DocInfo
>
docInfoList
=
new
ArrayList
<>();
log
.
info
(
"信息源名称:"
+
siteMsgTemple
.
getSiteName
()+
" 信息源采集开始时间:"
+
DateTime
.
now
());
// Date collectTime=DateTime.now();
// String infoSourceId=siteMsgTemple.getId();
// //默认表达式类型
// siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
//
// //判断列表解析表达式类型
// if(siteMsgTemple.getListExpressionType().equals("3")) {//css表达式
// WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
// metaSearchList = webContentPaserByCss.catchWebOfStaticmsgByCSS(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("2")){//xpath解析
// WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
// metaSearchList = webContentPaserByXpath.catchWebOfStaticmsgByXapth(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("1")){//jsonpath解析
// WebContentPaserByJsonXpath webContentPaserByJsonXpath=new WebContentPaserByJsonXpath();
// metaSearchList = webContentPaserByJsonXpath.catchWebOfStaticmsgByJsonPath(urlList, charset, siteMsgTemple);
//
// }else if(siteMsgTemple.getListExpressionType().equals("0")){//正则解析
// WebContentPaserByRegular webContentPaserByRegular=new WebContentPaserByRegular();
// metaSearchList = webContentPaserByRegular.catchWebOfStaticmsgByRegular(urlList, charset, siteMsgTemple);
// }
String
body
=
""
;
if
(
StringUtils
.
isNotEmpty
(
siteMsgTemple
.
getHeaders
())){
body
=
pageDownload
.
downloadWithStrAddHeader
(
urlList
.
get
(
0
),
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
try
{
//先使用静态网络请求获取列表内容
body
=
HttpgetUtil
.
getHtml
(
urlList
.
get
(
0
));
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}
catch
(
Exception
e
)
{
log
.
info
(
e
.
getMessage
());
body
=
pageDownload
.
downloadWithStr
(
urlList
.
get
(
0
),
charset
,
false
,
false
);
}
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if
(
StringUtils
.
isEmpty
(
body
)
&&
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
body
=
SeleniumTime
.
getScopehtml
(
urlList
.
get
(
0
));
}
}
if
(
body
.
length
()<
1000
){
FileUtil
.
appendString
(
siteMsgTemple
.
getInfoSourceCode
()+
"\n\r"
,
"D:\\jingwai.txt"
,
"utf-8"
);
}
}
public
void
crawler2
(){
//获取栏目链接以及翻页的链接
List
<
String
>
urlList
=
getPageListUrl
(
siteMsgTemple
);
List
<
String
>
urlList
=
getPageListUrl
(
siteMsgTemple
);
//兼容就平台的历史链接方法
//兼容就平台的历史链接方法
HisURIParser
hisURIParser
=
new
HisURIParser
();
HisURIParser
hisURIParser
=
new
HisURIParser
();
...
@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
...
@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
siteMsgRecord
.
setCollectTime
(
collectTime
);
siteMsgRecord
.
setCollectTime
(
collectTime
);
String
docjson
=
mapper
.
writeValueAsString
(
siteMsgRecord
);
String
docjson
=
mapper
.
writeValueAsString
(
siteMsgRecord
);
kafkaTemplate
.
send
(
Constants
.
KAFKA_COLLECT_TOPIC
,
"key"
,
docjson
);
//
kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
log
.
info
(
"发送到kafka成功。"
);
log
.
info
(
"发送到kafka成功。"
);
}
catch
(
JsonProcessingException
e
)
{
}
catch
(
JsonProcessingException
e
)
{
// e.printStackTrace();
// e.printStackTrace();
...
...
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
浏览文件 @
af30a040
...
@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
...
@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.crawler.PaserSiteDownload
;
import
com.zzsn.crawler.PaserSiteDownload
;
import
com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder
;
import
com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder
;
import
com.zzsn.crawler.uriparser.HtmlPageParser
;
import
com.zzsn.crawler.uriparser.*
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.crawler.uriparser.WebPageScreenShot
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.entity.*
;
import
com.zzsn.entity.*
;
...
@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
...
@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
try
{
//先使用静态网络请求获取列表内容
try
{
//先使用静态网络请求获取列表内容
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
body
=
HttpgetUtil
.
getHtml
(
uri_code
);
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
body
=
paserSiteDownload
.
getHtml
(
uri_code
,
charset
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
// body = paserSiteDownload.getHtml(uri_code, charset);
}
}
//请求返回为空时判断为动态请求使用模拟浏览器的方式
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if
(
StringUtils
.
isEmpty
(
body
)
&&
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
if
(
StringUtils
.
isEmpty
(
body
)
&&
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
// SeleniumTime seleniumTime=new SeleniumTime();
// body = seleniumTime.getScopehtml(uri_code);
body
=
SeleniumTime
.
getScopehtml
(
uri_code
);
body
=
SeleniumTime
.
getScopehtml
(
uri_code
);
}
}
if
(
StringUtils
.
isEmpty
(
body
)
||
pageDownload
.
isBadDownloadPage
(
body
))
{
if
(
StringUtils
.
isEmpty
(
body
)
||
pageDownload
.
isBadDownloadPage
(
body
))
{
...
@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
...
@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
String
content
=
""
;
String
content
=
""
;
try
{
try
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
// SeleniumTime seleniumTime=new SeleniumTime();
// content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
else
{
}
else
{
try
{
try
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
null
,
false
,
false
);
content
=
HttpgetUtil
.
getHtml
(
cwbm
.
getSourceaddress
());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
null
,
false
,
false
);
log
.
info
(
e
.
getMessage
());
log
.
info
(
e
.
getMessage
());
content
=
paserSiteDownload
.
getHtml
(
cwbm
.
getSourceaddress
(),
null
);
//
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
}
}
}
}
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
...
@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
...
@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
docInfo
.
setTitle
(
cwbm
.
getTitle
()
==
null
?
""
:
cwbm
.
getTitle
().
replace
(
"..."
,
""
));
docInfo
.
setTitle
(
cwbm
.
getTitle
()
==
null
?
""
:
cwbm
.
getTitle
().
replace
(
"..."
,
""
));
docInfo
.
setAuthor
(
cwbm
.
getAuthor
());
docInfo
.
setAuthor
(
cwbm
.
getAuthor
());
docInfo
.
setPublishDate
(
cwbm
.
getPublishDate
());
docInfo
.
setPublishDate
(
cwbm
.
getPublishDate
());
if
(
cwbm
.
getSourceaddress
()
!=
null
)
{
// if (cwbm.getSourceaddress() != null) {
docInfo
.
setOrigin
(
cwbm
.
getSourcesite
());
// docInfo.setOrigin(cwbm.getSourcesite());
}
else
{
// } else {
docInfo
.
setOrigin
(
siteMsgTemple
.
getSiteName
());
// docInfo.setOrigin(siteMsgTemple.getSiteName());
}
// }
docInfo
.
setOrigin
(
siteMsgTemple
.
getSiteName
());
docInfo
.
setSummary
(
cwbm
.
getSummary
());
docInfo
.
setSummary
(
cwbm
.
getSummary
());
//封装解析的docinfo对象
//封装解析的docinfo对象
try
{
try
{
...
@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {
...
@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {
}
}
docInfo
.
setContentWithTag
(
contentWithTag
);
docInfo
.
setContentWithTag
(
contentWithTag
);
docInfo
.
setContentNoTag
(
Utility
.
TransferHTML2Text
(
contentWithTag
).
replaceAll
(
"\\n"
,
""
));
docInfo
.
setContentNoTag
(
Content
Utility
.
TransferHTML2Text
(
contentWithTag
).
replaceAll
(
"\\n"
,
""
));
}
}
//作者
//作者
...
@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
...
@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
origin
=
paseElementByCSS
(
doc
,
siteTemplate
.
getDetailExpressionSource
());
origin
=
paseElementByCSS
(
doc
,
siteTemplate
.
getDetailExpressionSource
());
if
(
StringUtils
.
isNotEmpty
(
origin
))
{
if
(
StringUtils
.
isNotEmpty
(
origin
))
{
docInfo
.
setOrigin
(
origin
);
docInfo
.
setOrigin
(
origin
);
}
else
{
docInfo
.
setOrigin
(
siteTemplate
.
getSiteName
());
}
}
}
else
{
docInfo
.
setOrigin
(
siteTemplate
.
getSiteName
());
}
}
return
docInfo
;
return
docInfo
;
}
}
...
...
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
0 → 100644
浏览文件 @
af30a040
package
com
.
zzsn
.
crawler
.
uriparser
;
import
java.io.IOException
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
com.zzsn.download.CreateSSLClientDefault
;
import
com.zzsn.util.Utility
;
import
org.apache.commons.httpclient.params.HttpMethodParams
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.ParseException
;
import
org.apache.http.client.ClientProtocolException
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.params.CoreConnectionPNames
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Connection
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
/**
 * Static-request page fetcher: downloads a URL with Apache HttpClient and
 * decodes the body using the charset announced by the server or, failing
 * that, the charset declared in the page's {@code <meta>} tags.
 */
public class HttpgetUtil {

    /** Connect and socket-read timeout for the GET request, in milliseconds. */
    private static final int REQUEST_TIMEOUT_MILLIS = 60000;

    /** Pause applied before each network/read step, in milliseconds. */
    private static final long PAUSE_MILLIS = 500L;

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);";

    /** Matches a whole {@code <meta ...>} tag. */
    private static final Pattern META_TAG = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Matches {@code charset} plus everything up to the next delimiter, e.g. {@code charset=utf-8}. */
    private static final Pattern CHARSET_ATTR = Pattern.compile("charset[^\\s||\"||;||'||>]*");

    /** Matches the quoted form, e.g. {@code charset="utf-8}. */
    private static final Pattern QUOTED_CHARSET_ATTR = Pattern.compile("charset=\"[^\\s||\"||;||>]*");

    /**
     * Fetches {@code url} with a GET request and returns the decoded body.
     *
     * <p>The charset is taken from the response entity when present; otherwise
     * {@link #LocateCharSet(String)} is asked (which issues a second request),
     * and the result is passed through {@code Utility.charsetcheck}.
     *
     * <p>A failed request is reported by throwing, not by returning an empty
     * string: callers wrap this call in {@code try/catch (Exception)} and fall
     * back to another downloader, and previously only got that signal through
     * an accidental {@code NullPointerException}.
     *
     * @param url address of the page to download
     * @return the page body; an empty string when the response has no entity
     *         or the body cannot be read/decoded
     * @throws IllegalStateException if the request cannot be executed or the
     *         thread is interrupted before it is sent
     */
    public static String getHtml(String url) {
        // NOTE(review): the client is not closed here because it is not visible
        // whether createSSLClientDefault() returns a fresh or a shared instance.
        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
        HttpGet httpgeturl = new HttpGet(url);
        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, REQUEST_TIMEOUT_MILLIS);
        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.SO_TIMEOUT, REQUEST_TIMEOUT_MILLIS);
        // Present the request as coming from a browser.
        httpgeturl.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        httpgeturl.setHeader("User-Agent", USER_AGENT);
        httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

        try {
            HttpResponse httprespse;
            try {
                Thread.sleep(PAUSE_MILLIS);
                httprespse = httpClient.execute(httpgeturl);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Interrupted before requesting " + url, e);
            } catch (IOException e) {
                throw new IllegalStateException("GET request failed for " + url, e);
            }

            HttpEntity entitydata = httprespse.getEntity();
            if (entitydata == null) {
                // Nothing to decode (e.g. a response without a body).
                return "";
            }

            String charset = EntityUtils.getContentCharSet(entitydata);
            if (charset == null) {
                charset = LocateCharSet(url);
            }
            charset = Utility.charsetcheck(charset);

            try {
                Thread.sleep(PAUSE_MILLIS);
                return EntityUtils.toString(entitydata, charset);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return "";
            } catch (Exception e) {
                // Best effort: an unreadable body yields an empty page, as before.
                e.printStackTrace();
                return "";
            }
        } finally {
            // Always hand the connection back, including on failure paths.
            httpgeturl.releaseConnection();
        }
    }

    /**
     * Determines a page's charset from its {@code <meta>} tags.
     *
     * <p>Downloads {@code url} with Jsoup and returns the charset of the first
     * meta tag that mentions one. Any failure results in the default.
     *
     * @param url address of the page to inspect
     * @return the declared charset; {@code "gb2312"} when none is found or the
     *         page cannot be fetched; {@code "GB2312"} when a charset attribute
     *         is present but empty
     */
    public static String LocateCharSet(String url) {
        String encoding = "gb2312";
        try {
            Thread.sleep(PAUSE_MILLIS);
            Connection conn = Jsoup.connect(url);
            // Present the request as coming from a browser.
            conn.header("User-Agent", USER_AGENT);
            Document doc = conn.ignoreContentType(true).timeout(10000).get();

            Matcher metaTags = META_TAG.matcher(doc.toString());
            while (metaTags.find()) {
                String tag = metaTags.group();
                Matcher attr = CHARSET_ATTR.matcher(tag);
                if (!attr.find()) {
                    continue;
                }
                // Drop the leading "charset=" (8 chars); a bare "charset" leaves nothing.
                String matched = attr.group();
                encoding = matched.length() > 8 ? matched.substring(8) : "";
                if (encoding.trim().length() == 0) {
                    // HTML5 form: <meta charset="utf-8"> — the value sits behind a quote.
                    Matcher quoted = QUOTED_CHARSET_ATTR.matcher(tag);
                    if (quoted.find()) {
                        encoding = quoted.group().substring(9);
                    }
                    if (encoding.trim().length() == 0) {
                        encoding = "GB2312";
                    }
                }
                return encoding;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return encoding;
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("获取出错编码方式");
            return encoding;
        }
        return encoding;
    }
}
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
浏览文件 @
af30a040
...
@@ -32,6 +32,7 @@ public class SeleniumTime {
...
@@ -32,6 +32,7 @@ public class SeleniumTime {
public
static
String
getScopehtml
(
String
url
)
{
public
static
String
getScopehtml
(
String
url
)
{
String
html
=
""
;
String
html
=
""
;
try
{
try
{
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
try
{
try
{
Duration
duration
=
Duration
.
of
(
100
,
ChronoUnit
.
SECONDS
);
Duration
duration
=
Duration
.
of
(
100
,
ChronoUnit
.
SECONDS
);
...
...
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
0 → 100644
浏览文件 @
af30a040
package
com
.
zzsn
.
crawler
.
uriparser
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileOutputStream
;
import
java.io.PrintStream
;
import
com.zzsn.generation.Constants
;
import
org.openqa.selenium.By
;
import
org.openqa.selenium.WebElement
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.chrome.ChromeOptions
;
public
class
SeleniumTime4
{
public
ChromeOptions
chromeOptions
=
new
ChromeOptions
()
;
public
ChromeDriver
driver
;
public
SeleniumTime4
(){
// System.setProperty("webdriver.chrome.driver", "E:\\cmd\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "D:\\cmdvip\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "E:\\chrome\\chromedriver.exe");
System
.
setProperty
(
"webdriver.chrome.driver"
,
Constants
.
CHROMEDRIVE
);
// System.setProperty("webdriver.chrome.bin", "C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe");
chromeOptions
.
addArguments
(
"blink-settings=imagesEnabled=false"
);
// chromeOptions.addArguments("user-data-dir=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default");
// chromeOptions.addArguments("--start-maximized");
// chromeOptions.addArguments("--headless");
driver
=
new
ChromeDriver
(
chromeOptions
);
}
/**
* 根据网址获取网页html信息
* @param url
* @return
*/
public
String
getScopehtml
(
String
url
){
//=====================================================================================================
// ChromeOptions chromeOptions =new ChromeOptions();
//// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// System.setProperty("webdriver.chrome.driver", "D:\\project\\cmd\\chromedriver.exe");
// //System.setProperty("webdriver.chrome.bin", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //chromeOptions.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe
// //C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
// //chromeOptions.addArguments("--headless");
// ChromeDriver driver = new ChromeDriver(chromeOptions);
//=====================================================================================================
try
{
driver
.
get
(
url
);
WebElement
webElement
=
driver
.
findElement
(
By
.
xpath
(
"/html"
));
try
{
Thread
.
sleep
(
3000
l
);
String
html
=
webElement
.
getAttribute
(
"outerHTML"
);
Thread
.
sleep
(
5000
l
);
driver
.
quit
();
// System.out.println(html);
if
(
url
.
contains
(
"http://www.flw.ph"
)){
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
if
(
html
.
contains
(
a
)&&
html
.
contains
(
b
)){
String
[]
split
=
html
.
split
(
a
);
String
sa
=
split
[
0
];
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
String
sab
=
sa
+
substring
;
return
sab
;
}
}
return
html
;
}
catch
(
Exception
e
){
System
.
out
.
println
(
"动态爬取方式一出现+"
+
"org.openqa.selenium.StaleElementReferenceException异常"
+
"可能原因为过快的执行没有找到指定的页面元素"
);
System
.
out
.
println
(
"=============执行方法二=============="
);
Thread
.
sleep
(
3000
l
);
String
html
=
driver
.
getPageSource
();
Thread
.
sleep
(
5000
l
);
driver
.
quit
();
if
(
url
.
contains
(
"http://www.flw.ph"
)){
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
if
(
html
.
contains
(
a
)&&
html
.
contains
(
b
)){
String
[]
split
=
html
.
split
(
a
);
String
sa
=
split
[
0
];
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
String
sab
=
sa
+
substring
;
return
sab
;
}
}
return
html
;
}
// Thread.sleep(3000l);
// String source = driver.getPageSource();
// //if(source.length()!=0){
// driver.quit();
// return source;
//}
// String html = webElement.getAttribute("outerHTML");
// //System.out.println(html);
// driver.quit();
// return html;
//==========================================================================
// driver.get(url);
// // 休眠1s,为了让js执行完
// Thread.sleep(1000l);
// // 网页源码
// String source = driver.getPageSource();
// System.out.println("进入SeleniumTime中的getScopehtml方法获取相应的html");
// driver.quit();
// return source;
}
catch
(
Exception
e
){
try
{
Thread
.
sleep
(
5000
l
);
}
catch
(
InterruptedException
e1
)
{
// TODO Auto-generated catch block
e1
.
printStackTrace
();
}
driver
.
quit
();
e
.
printStackTrace
();
}
try
{
Thread
.
sleep
(
5000
l
);
}
catch
(
InterruptedException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
}
driver
.
quit
();
return
null
;
}
public
static
void
main
(
String
[]
args
)
{
//去除html中的相关标签
/**
* 网上大多是说明直接使用正则表达式不能很好的适用于html
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/
SeleniumTime4
s
=
new
SeleniumTime4
();
String
scopehtml
=
s
.
getScopehtml
(
"http://www.flw.ph/thread-869016-1-1.html"
);
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
System
.
out
.
println
(
"开始"
);
if
(
scopehtml
.
contains
(
a
)){
System
.
out
.
println
(
"包含a"
);
}
if
(
scopehtml
.
contains
(
a
)){
System
.
out
.
println
(
"包含b"
);
}
System
.
out
.
println
(
"结束"
);
String
[]
split
=
scopehtml
.
split
(
a
);
String
sa
=
split
[
0
];
System
.
out
.
println
(
"首次截取的长度"
+
split
.
length
);
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
System
.
out
.
println
(
"再次截取的长度"
+
split2
.
length
);
String
sab
=
sa
+
substring
;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File
file
=
new
File
(
"D:/123.txt"
);
try
{
PrintStream
ps
=
new
PrintStream
(
new
FileOutputStream
(
file
));
ps
.
println
(
sab
);
}
catch
(
FileNotFoundException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
}
}
}
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
浏览文件 @
af30a040
...
@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
...
@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
}
else
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
seleniumTime
=
new
SeleniumTime
();
//
seleniumTime=new SeleniumTime();
body
=
s
eleniumTime
.
getScopehtml
(
uri_code
);
body
=
S
eleniumTime
.
getScopehtml
(
uri_code
);
TimeUnit
.
SECONDS
.
sleep
(
5
);
TimeUnit
.
SECONDS
.
sleep
(
5
);
seleniumTime
.
close
();
seleniumTime
.
close
();
}
else
{
}
else
{
...
...
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
浏览文件 @
af30a040
...
@@ -165,7 +165,8 @@ public class JedisUtil {
...
@@ -165,7 +165,8 @@ public class JedisUtil {
throw
new
Exception
(
"key is null"
);
throw
new
Exception
(
"key is null"
);
}
}
jedis
=
getDefaultJedis
();
jedis
=
getDefaultJedis
();
value
=
jedis
.
get
(
PREFIX
+
key
);
// value = jedis.get(PREFIX + key);
value
=
jedis
.
get
(
key
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
}
finally
{
}
finally
{
...
...
comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
浏览文件 @
af30a040
...
@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
...
@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled
(
cron
=
"0 0/2 * * * ?"
)
//
@Scheduled(cron = "0 0/2 * * * ?")
// @Async("asyncTaskExecutor")
// @Async("asyncTaskExecutor")
public
void
consumer
(){
public
void
consumer
(){
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
...
...
comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
浏览文件 @
af30a040
...
@@ -98,6 +98,7 @@ public class ChromeTest {
...
@@ -98,6 +98,7 @@ public class ChromeTest {
// 可复用驱动使用Demo
// 可复用驱动使用Demo
public
static
void
main
(
String
[]
args
)
throws
Exception
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
if
(
driver
==
null
)
{
if
(
driver
==
null
)
{
// 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存
// 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存
...
...
comm_crawler/src/main/java/com/zzsn/test/HttpClientTester.java
浏览文件 @
af30a040
package
com
.
zzsn
.
test
;
package
com
.
zzsn
.
test
;
import
com.zzsn.crawler.uriparser.HttpgetUtil
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageBuilderParser
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.NameValuePair
;
...
@@ -41,7 +42,9 @@ import java.util.List;
...
@@ -41,7 +42,9 @@ import java.util.List;
public
class
HttpClientTester
{
public
class
HttpClientTester
{
private
static
PageBuilderParser
builderParser
=
null
;
private
static
PageBuilderParser
builderParser
=
null
;
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
get
(
"https://www.cas.cn/zjs/"
);
// get("https://edition.cnn.com/world");
String
html
=
HttpgetUtil
.
getHtml
(
"https://edition.cnn.com/world"
);
System
.
out
.
println
(
html
);
// post();
// post();
}
}
...
...
comm_crawler/src/main/java/com/zzsn/test/WebTest.java
浏览文件 @
af30a040
package
com
.
zzsn
.
test
;
package
com
.
zzsn
.
test
;
import
com.zzsn.crawler.PaserSiteDownload
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.download.PageDownloader
;
import
java.io.IOException
;
import
java.io.IOException
;
...
@@ -17,21 +18,12 @@ import java.io.InputStream;
...
@@ -17,21 +18,12 @@ import java.io.InputStream;
public
class
WebTest
{
public
class
WebTest
{
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
//
String url="https://www.teriin.org/opinion";
String
url
=
"https://www.teriin.org/opinion"
;
// PageDownloader pageDownload=new PageDownloader();
// PageDownloader pageDownload=new PageDownloader();
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// System.out.println(body);
// System.out.println(body);
try
{
PaserSiteDownload
paserSiteDownload
=
new
PaserSiteDownload
();
Runtime
mt
=
Runtime
.
getRuntime
();
String
html
=
paserSiteDownload
.
getHtml
(
"https://edition.cnn.com/world"
,
"utf-8"
);
String
cmd
=
"taskkill /F /im chrome.exe"
;
System
.
out
.
println
(
html
);
Process
pro
=
mt
.
exec
(
cmd
);
InputStream
ers
=
pro
.
getErrorStream
();
pro
.
waitFor
();
System
.
out
.
println
(
"++++++++ taskkill /F /im chromedriver.exe"
);
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
catch
(
InterruptedException
e
)
{
// TODO Auto-generated catch block
}
}
}
}
}
comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
浏览文件 @
af30a040
...
@@ -287,15 +287,17 @@ public class ContentUtility {
...
@@ -287,15 +287,17 @@ public class ContentUtility {
if
(
htmlText
==
null
){
if
(
htmlText
==
null
){
return
null
;
return
null
;
}
}
String
text
=
ContentUtility
.
HTMLDecode
(
ContentUtility
.
RemoveHTMLCode
(
ContentUtility
.
RemoveStyleCode
(
Content
Utility
.
RemoveHTMLReturnCode
(
htmlText
))));
String
text
=
Utility
.
HTMLDecode
(
Utility
.
RemoveHTMLCode
(
Utility
.
RemoveStyleCode
(
Utility
.
RemoveHTMLReturnCode
(
htmlText
))));
text
=
text
.
replaceAll
(
" "
,
"\r\n"
);
text
=
text
.
replaceAll
(
" "
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +\r\n"
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +\r\n"
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" \n"
,
"\n"
);
text
=
text
.
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
);
return
text
;
return
text
;
}
}
...
...
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
浏览文件 @
af30a040
...
@@ -58,10 +58,10 @@ public class DriverUtil {
...
@@ -58,10 +58,10 @@ public class DriverUtil {
}
}
public
static
ReuseWebDriver
connectChrome
(
String
sessionId
,
String
serverUrl
)
throws
Exception
{
public
static
ReuseWebDriver
connectChrome
(
String
sessionId
,
String
serverUrl
)
throws
Exception
{
if
(
serverUrl
==
null
||
""
.
equals
(
serverUrl
)
||
sessionId
==
null
||
""
.
equals
(
sessionId
))
{
//
if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
log
.
error
(
"未获取到驱动服务地址、sessionId"
);
//
log.error("未获取到驱动服务地址、sessionId");
return
null
;
//
return null;
}
//
}
ReuseWebDriver
driver
=
new
ReuseWebDriver
(
serverUrl
,
sessionId
);
ReuseWebDriver
driver
=
new
ReuseWebDriver
(
serverUrl
,
sessionId
);
if
(
driver
.
connectTestFail
())
{
if
(
driver
.
connectTestFail
())
{
...
@@ -89,10 +89,21 @@ public class DriverUtil {
...
@@ -89,10 +89,21 @@ public class DriverUtil {
* @date 2022/7/25 15:07
* @date 2022/7/25 15:07
*/
*/
public
static
ReuseWebDriver
getChromeDriver
()
throws
Exception
{
public
static
ReuseWebDriver
getChromeDriver
()
throws
Exception
{
String
cacheInfo
=
JedisUtil
.
getString
(
Constants
.
SELENIUM_DRIVER_CACHE
);
Map
<
String
,
String
>
map
=
getSessionInfo
();
Map
<
String
,
String
>
map
=
JSON
.
parseObject
(
cacheInfo
,
Map
.
class
);
String
sessionId
=
map
.
get
(
"sessionId"
);
String
sessionId
=
map
.
get
(
"sessionId"
);
String
serverUrl
=
map
.
get
(
"serverUrl"
);
String
serverUrl
=
map
.
get
(
"serverUrl"
);
return
connectChrome
(
sessionId
,
serverUrl
);
return
connectChrome
(
sessionId
,
serverUrl
);
}
}
/**
 * Reads the Selenium driver session info stored under
 * {@code Constants.SELENIUM_DRIVER_CACHE} and returns it as a map.
 *
 * <p>When the cached JSON parses to {@code null} or to an empty map, a
 * placeholder map with the keys {@code sessionId} and {@code serverUrl} is
 * created, written back to the cache and returned.
 *
 * @return the cached session info, or the freshly stored placeholder
 * @throws Exception propagated from the cache helper
 */
public static Map<String, String> getSessionInfo() throws Exception {
    Map<String, String> sessionInfo =
            JSON.parseObject(JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE), Map.class);
    boolean hasCachedInfo = sessionInfo != null && !sessionInfo.isEmpty();
    if (hasCachedInfo) {
        return sessionInfo;
    }

    Map<String, String> placeholder = new HashMap<>(2);
    placeholder.put("sessionId", "sessionId");
    placeholder.put("serverUrl", "https://www.baidu.com/");
    // Cache the browser driver info (third argument -1: presumably "no expiry" — confirm in JedisUtil).
    String serialized = JSON.toJSONString(placeholder);
    JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, serialized, -1);
    return placeholder;
}
}
}
comm_crawler/src/main/java/com/zzsn/util/Utility.java
浏览文件 @
af30a040
package
com
.
zzsn
.
util
;
package
com
.
zzsn
.
util
;
import
org.jsoup.Jsoup
;
import
java.io.BufferedReader
;
import
org.jsoup.nodes.Document
;
import
java.io.BufferedWriter
;
import
org.jsoup.nodes.Element
;
import
java.io.File
;
import
org.jsoup.select.Elements
;
import
java.io.FileInputStream
;
import
org.mozilla.universalchardet.UniversalDetector
;
import
java.io.FileNotFoundException
;
import
java.io.FileOutputStream
;
import
java.io.*
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.io.UnsupportedEncodingException
;
import
java.net.MalformedURLException
;
import
java.net.MalformedURLException
;
import
java.net.URL
;
import
java.net.URL
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.nio.MappedByteBuffer
;
import
java.nio.MappedByteBuffer
;
import
java.nio.channels.FileChannel
;
import
java.nio.channels.FileChannel
;
import
java.text.SimpleDateFormat
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.ArrayList
;
import
java.util.Calendar
;
import
java.util.Collections
;
import
java.util.Comparator
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.Iterator
;
import
java.util.LinkedHashMap
;
import
java.util.LinkedList
;
import
java.util.List
;
import
java.util.Locale
;
import
java.util.Map
;
import
java.util.UUID
;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
java.util.TimeZone
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
import
java.util.regex.PatternSyntaxException
;
import
java.util.regex.PatternSyntaxException
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.mozilla.universalchardet.UniversalDetector
;
//import com.zzsn.worker.db.model.ImgData;
//import com.zzsn.worker.index.Constants;
/*
/*
*
*
...
@@ -39,6 +62,8 @@ import java.util.regex.PatternSyntaxException;
...
@@ -39,6 +62,8 @@ import java.util.regex.PatternSyntaxException;
*/
*/
@SuppressWarnings
(
"deprecation"
)
@SuppressWarnings
(
"deprecation"
)
public
class
Utility
{
public
class
Utility
{
//定时器控制flg
public
static
int
flg
=
0
;
//任务执行状态flg
//任务执行状态flg
public
static
int
status_flg
=
0
;
public
static
int
status_flg
=
0
;
static
String
regEx
=
"[\\u4e00-\\u9fa5]"
;
static
String
regEx
=
"[\\u4e00-\\u9fa5]"
;
...
@@ -58,6 +83,7 @@ public class Utility {
...
@@ -58,6 +83,7 @@ public class Utility {
static
Pattern
divP
=
Pattern
.
compile
(
"<div>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
divP
=
Pattern
.
compile
(
"<div>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
divRP
=
Pattern
.
compile
(
"</div>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
divRP
=
Pattern
.
compile
(
"</div>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
brP
=
Pattern
.
compile
(
"<br />"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
brP
=
Pattern
.
compile
(
"<br />"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
brP2
=
Pattern
.
compile
(
"<br/>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
br2P
=
Pattern
.
compile
(
"<br>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
br2P
=
Pattern
.
compile
(
"<br>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
spaceP
=
Pattern
.
compile
(
" "
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
spaceP
=
Pattern
.
compile
(
" "
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
strongP
=
Pattern
.
compile
(
"<strong>"
,
Pattern
.
CASE_INSENSITIVE
);
static
Pattern
strongP
=
Pattern
.
compile
(
"<strong>"
,
Pattern
.
CASE_INSENSITIVE
);
...
@@ -110,6 +136,8 @@ public class Utility {
...
@@ -110,6 +136,8 @@ public class Utility {
private
static
Pattern
patDate4
=
Pattern
.
compile
(
"\\d+年\\d+月\\d+日"
);
private
static
Pattern
patDate4
=
Pattern
.
compile
(
"\\d+年\\d+月\\d+日"
);
private
static
Pattern
patDate5
=
Pattern
.
compile
(
"\\d+/\\d{1,2}/\\d+"
);
private
static
Pattern
patDate5
=
Pattern
.
compile
(
"\\d+/\\d{1,2}/\\d+"
);
private
static
Pattern
patDate6
=
Pattern
.
compile
(
"\\d+\\.\\d+\\.\\d+"
);
private
static
Pattern
patDate6
=
Pattern
.
compile
(
"\\d+\\.\\d+\\.\\d+"
);
private
static
Pattern
patDate7
=
Pattern
.
compile
(
"\\d{1,2}-\\d{1,2}"
);
private
static
Pattern
patDate8
=
Pattern
.
compile
(
"\\d+月\\d+日"
);
private
static
SimpleDateFormat
formatter0
=
new
SimpleDateFormat
(
"yyyy-MM-dd"
);
private
static
SimpleDateFormat
formatter0
=
new
SimpleDateFormat
(
"yyyy-MM-dd"
);
private
static
SimpleDateFormat
formatter0_1
=
new
SimpleDateFormat
(
"yy-MM-dd"
);
private
static
SimpleDateFormat
formatter0_1
=
new
SimpleDateFormat
(
"yy-MM-dd"
);
...
@@ -122,15 +150,103 @@ public class Utility {
...
@@ -122,15 +150,103 @@ public class Utility {
private
static
SimpleDateFormat
formatter5_4
=
new
SimpleDateFormat
(
"yy/MM/dd"
);
private
static
SimpleDateFormat
formatter5_4
=
new
SimpleDateFormat
(
"yy/MM/dd"
);
private
static
SimpleDateFormat
formatter5_3
=
new
SimpleDateFormat
(
"dd/MM/yy"
);
private
static
SimpleDateFormat
formatter5_3
=
new
SimpleDateFormat
(
"dd/MM/yy"
);
private
static
SimpleDateFormat
formatter6
=
new
SimpleDateFormat
(
"yyyy.MM.dd"
);
private
static
SimpleDateFormat
formatter6
=
new
SimpleDateFormat
(
"yyyy.MM.dd"
);
private
static
SimpleDateFormat
formatter7
=
new
SimpleDateFormat
(
"MM-dd"
);
private
static
SimpleDateFormat
formatter8
=
new
SimpleDateFormat
(
"MM月dd"
);
private
static
Date
thresholdDate
=
null
;
private
static
Date
thresholdDate
=
null
;
/**
 * Percent-encodes the parts of a URI that are not safe to send as-is.
 *
 * <ul>
 *   <li>If the URI has a query string (a {@code ?} that is not the last
 *       character), every parameter name and value is URL-encoded as UTF-8;
 *       {@code =} characters inside a value are kept literal. The query starts
 *       at the first {@code ?}, so later {@code ?} characters are encoded as
 *       part of the value instead of being dropped.</li>
 *   <li>Every run of characters in the range U+4E00..U+9FA5 that is still
 *       present afterwards (typically in the path) is URL-encoded in one
 *       pass.</li>
 *   <li>Trailing slashes are collapsed to a single {@code /} and spaces are
 *       replaced by {@code %20}.</li>
 * </ul>
 *
 * Author: 刘小鹏, created 2016-4-14.
 *
 * @param uri the URI to encode, may be {@code null}
 * @return the encoded URI, or {@code null} when {@code uri} is {@code null}
 */
public static String encodURI(String uri) {
    if (uri == null) {
        return null;
    }
    try {
        // Only the query parameters are re-encoded here.
        int queryStart = uri.indexOf('?');
        if (queryStart != -1 && !uri.endsWith("?")) {
            String uriPart = uri.substring(0, queryStart);
            String[] params = uri.substring(queryStart + 1).split("&");
            StringBuilder query = new StringBuilder("?");
            for (int i = 0; i < params.length; i++) {
                if (i > 0) {
                    query.append('&');
                }
                String param = params[i];
                int separator = param.indexOf('=');
                if (separator != -1) {
                    String name = URLEncoder.encode(param.substring(0, separator), "UTF-8");
                    // Values may legitimately contain '=' (e.g. base64 padding); keep it readable.
                    String value = URLEncoder.encode(param.substring(separator + 1), "UTF-8")
                            .replace("%3D", "=");
                    query.append(name).append('=').append(value);
                } else {
                    query.append(URLEncoder.encode(param, "UTF-8"));
                }
            }
            uri = uriPart + query;
        }

        // Encode Chinese characters that remain, e.g. in the path. A single
        // left-to-right pass encodes each run exactly where it occurs.
        Matcher chineseRun = Pattern.compile("[\\u4e00-\\u9fa5]+").matcher(uri);
        StringBuffer encoded = new StringBuffer(uri.length());
        while (chineseRun.find()) {
            String replacement = URLEncoder.encode(chineseRun.group(), "UTF-8");
            chineseRun.appendReplacement(encoded, Matcher.quoteReplacement(replacement));
        }
        chineseRun.appendTail(encoded);
        uri = encoded.toString();
    } catch (UnsupportedEncodingException e) {
        // UTF-8 is a mandatory charset, so this is not expected to happen.
        e.printStackTrace();
    }
    uri = uri.replaceAll("/+$", "/");
    uri = uri.replaceAll(" ", "%20");
    return uri;
}
/*
/*
* 判断网页文件的编码
* 判断网页文件的编码
*/
*/
public
static
String
getWebEncodingByStr
(
String
content
)
{
public
static
String
getWebEncodingByStr
(
String
content
)
{
String
encoding
=
"GB2312"
;
String
encoding
=
null
;
Pattern
p1
=
Pattern
.
compile
(
"<meta[^>]*>"
,
Pattern
p1
=
Pattern
.
compile
(
"<meta[^>]*>"
,
Pattern
.
CASE_INSENSITIVE
);
Pattern
.
CASE_INSENSITIVE
);
Matcher
m1
=
p1
.
matcher
(
content
);
Matcher
m1
=
p1
.
matcher
(
content
);
...
@@ -219,20 +335,14 @@ public class Utility {
...
@@ -219,20 +335,14 @@ public class Utility {
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
if
(
encoding
==
null
)
{
if
(
encoding
==
null
)
{
if
(
encoding
==
null
)
{
encoding
=
detectCharSet
(
fileName
);
encoding
=
"UTF-8"
;
//encoding = "GB2312";
// if (encoding == null) {
}
// encoding = null; //encoding = "GB2312";
// }
}
}
return
encoding
;
return
encoding
;
}
}
/**
 * Returns the language type that {@code LangTypeDetector.DetectLang} reports
 * for {@code content}.
 *
 * @param content text to classify
 * @return the detector's result, or {@code "error"} when the detector returns
 *         {@code null} or an empty string
 */
public static String getLanguageType(String content) {
    final String detected = LangTypeDetector.DetectLang(content);
    boolean hasResult = detected != null && !detected.isEmpty();
    return hasResult ? detected : "error";
}
public
static
String
detectCharSet
(
String
fileName
)
{
public
static
String
detectCharSet
(
String
fileName
)
{
try
try
...
@@ -242,13 +352,13 @@ public class Utility {
...
@@ -242,13 +352,13 @@ public class Utility {
return
null
;
return
null
;
}
}
byte
[]
buf
=
new
byte
[
4096
];
byte
[]
buf
=
new
byte
[
4096
];
FileInputStream
fis
=
null
;
FileInputStream
fis
=
null
;
UniversalDetector
detector
=
null
;
UniversalDetector
detector
=
null
;
try
{
try
{
fis
=
new
FileInputStream
(
fileName
);
fis
=
new
FileInputStream
(
fileName
);
detector
=
new
UniversalDetector
(
null
);
detector
=
new
UniversalDetector
(
null
);
int
nread
;
int
nread
;
while
((
nread
=
fis
.
read
(
buf
))
>
0
&&
!
detector
.
isDone
())
{
while
((
nread
=
fis
.
read
(
buf
))
>
0
&&
!
detector
.
isDone
())
{
detector
.
handleData
(
buf
,
0
,
nread
);
detector
.
handleData
(
buf
,
0
,
nread
);
...
@@ -259,7 +369,7 @@ public class Utility {
...
@@ -259,7 +369,7 @@ public class Utility {
//e.printStackTrace();
//e.printStackTrace();
}
}
detector
.
dataEnd
();
detector
.
dataEnd
();
String
encoding
=
detector
.
getDetectedCharset
();
String
encoding
=
detector
.
getDetectedCharset
();
detector
.
reset
();
detector
.
reset
();
if
(
encoding
==
null
)
{
if
(
encoding
==
null
)
{
...
@@ -270,10 +380,11 @@ public class Utility {
...
@@ -270,10 +380,11 @@ public class Utility {
}
}
catch
(
Exception
e
)
catch
(
Exception
e
)
{
{
// e.printStackTrace();
// e.printStackTrace();
return
null
;
return
null
;
}
}
}
}
/*
/*
* 判断文件的编码格式
* 判断文件的编码格式
*/
*/
...
@@ -331,6 +442,7 @@ public class Utility {
...
@@ -331,6 +442,7 @@ public class Utility {
src
=
divP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
divP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
divRP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
divRP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
brP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
brP
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
brP2
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
br2P
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
br2P
.
matcher
(
src
).
replaceAll
(
"\n\n"
);
src
=
spaceP
.
matcher
(
src
).
replaceAll
(
" "
);
src
=
spaceP
.
matcher
(
src
).
replaceAll
(
" "
);
src
=
src
.
replaceAll
(
"•"
,
"??"
);
src
=
src
.
replaceAll
(
"•"
,
"??"
);
...
@@ -748,6 +860,14 @@ public class Utility {
...
@@ -748,6 +860,14 @@ public class Utility {
}
}
return
true
;
return
true
;
}
}
/**
 * Returns the language type that {@code LangTypeDetector.DetectLang} reports
 * for {@code content}.
 *
 * @param content text to classify
 * @return the detector's result, or {@code "error"} when the detector returns
 *         {@code null} or an empty string
 */
public static String getLanguageType(String content) {
    final String detected = LangTypeDetector.DetectLang(content);
    boolean hasResult = detected != null && !detected.isEmpty();
    return hasResult ? detected : "error";
}
public
static
List
<
String
>
getFiles
(
List
<
String
>
l
,
String
directory
,
boolean
bIncludeSubDir
)
{
public
static
List
<
String
>
getFiles
(
List
<
String
>
l
,
String
directory
,
boolean
bIncludeSubDir
)
{
if
(
l
==
null
)
{
if
(
l
==
null
)
{
...
@@ -964,6 +1084,8 @@ public class Utility {
...
@@ -964,6 +1084,8 @@ public class Utility {
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" \n"
,
"\n"
);
text
=
text
.
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
);
return
text
;
return
text
;
}
}
...
@@ -1092,9 +1214,9 @@ public class Utility {
...
@@ -1092,9 +1214,9 @@ public class Utility {
htmlText
=
formRP
.
matcher
(
htmlText
).
replaceAll
(
""
);
htmlText
=
formRP
.
matcher
(
htmlText
).
replaceAll
(
""
);
//
htmlText = imgReplaceP.matcher(htmlText).replaceAll("<_img$1>");
htmlText
=
imgReplaceP
.
matcher
(
htmlText
).
replaceAll
(
"<_img$1>"
);
//
htmlText = removeAttrP.matcher(htmlText).replaceAll("<$1>");
htmlText
=
removeAttrP
.
matcher
(
htmlText
).
replaceAll
(
"<$1>"
);
//
htmlText = imgRevReplaceP.matcher(htmlText).replaceAll("<img$1>");
htmlText
=
imgRevReplaceP
.
matcher
(
htmlText
).
replaceAll
(
"<img$1>"
);
htmlText
=
commentP
.
matcher
(
htmlText
).
replaceAll
(
""
);
htmlText
=
commentP
.
matcher
(
htmlText
).
replaceAll
(
""
);
htmlText
=
legendRemoveP
.
matcher
(
htmlText
).
replaceAll
(
""
);
htmlText
=
legendRemoveP
.
matcher
(
htmlText
).
replaceAll
(
""
);
...
@@ -1187,42 +1309,7 @@ public class Utility {
...
@@ -1187,42 +1309,7 @@ public class Utility {
}
}
/**
 * Removes every {@code <a>} element from an HTML fragment, together with any
 * ancestor elements that are left without text once the link is gone.
 *
 * @param contentWithTag HTML to clean, may be {@code null}
 * @return {@code contentWithTag} itself when it is {@code null} or contains no
 *         links; otherwise the outer HTML of the parsed and pruned document
 */
public static String RemoveAllLink(String contentWithTag) {
    if (contentWithTag == null) {
        return null;
    }
    Document doc = Jsoup.parse(contentWithTag);
    Elements links = doc.select("a");
    if (links == null || links.isEmpty()) {
        return contentWithTag;
    }
    for (Element link : links) {
        try {
            Element ancestor = link.parent();
            if (ancestor == null) {
                // Nothing to detach the link from; leave it alone.
                continue;
            }
            link.remove();
            // Walk upwards, dropping containers that became empty. Stop at the
            // root, which has no parent and therefore cannot be detached.
            while (ancestor != null && ancestor.text().trim().isEmpty()) {
                Element emptied = ancestor;
                ancestor = emptied.parent();
                if (ancestor != null) {
                    emptied.remove();
                }
            }
        } catch (RuntimeException ignored) {
            // Best effort: a link that cannot be pruned is skipped, the rest are still processed.
        }
    }
    return doc.outerHtml();
}
...
@@ -1238,6 +1325,8 @@ public class Utility {
...
@@ -1238,6 +1325,8 @@ public class Utility {
||
(
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate7
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate8
.
matcher
(
content
)).
find
()
)
)
{
{
return
true
;
return
true
;
...
@@ -1263,6 +1352,8 @@ public class Utility {
...
@@ -1263,6 +1352,8 @@ public class Utility {
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
4
)!=
null
))
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
4
)!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
5
)!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
5
)!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
6
)!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
6
)!=
null
))
||
((
dateMatcher
=
patDate7
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
7
)!=
null
))
||
((
dateMatcher
=
patDate8
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
8
)!=
null
))
)
)
{
{
return
true
;
return
true
;
...
@@ -1290,6 +1381,8 @@ public class Utility {
...
@@ -1290,6 +1381,8 @@ public class Utility {
||
(
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate7
.
matcher
(
content
)).
find
()
||
(
dateMatcher
=
patDate8
.
matcher
(
content
)).
find
()
)
)
{
{
return
dateMatcher
;
return
dateMatcher
;
...
@@ -1316,6 +1409,8 @@ public class Utility {
...
@@ -1316,6 +1409,8 @@ public class Utility {
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
4
)!=
null
))
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
4
)!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
5
)!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
5
)!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
6
)!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
6
)!=
null
))
||
((
dateMatcher
=
patDate7
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
7
)!=
null
))
||
((
dateMatcher
=
patDate8
.
matcher
(
content
)).
find
()&&
(
Utility
.
transDate
(
dateMatcher
.
group
(),
8
)!=
null
))
)
)
{
{
return
dateMatcher
;
return
dateMatcher
;
...
@@ -1402,6 +1497,12 @@ public class Utility {
...
@@ -1402,6 +1497,12 @@ public class Utility {
case
6
:
case
6
:
date
=
formatter6
.
parse
(
source
);
date
=
formatter6
.
parse
(
source
);
break
;
break
;
case
7
:
date
=
formatter7
.
parse
(
source
);
break
;
case
8
:
date
=
formatter8
.
parse
(
source
);
break
;
}
}
if
((
date
!=
null
)
&&
(
date
.
before
(
thresholdDate
)))
if
((
date
!=
null
)
&&
(
date
.
before
(
thresholdDate
)))
...
@@ -1431,6 +1532,8 @@ public class Utility {
...
@@ -1431,6 +1532,8 @@ public class Utility {
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
4
))!=
null
))
||
((
dateMatcher
=
patDate4
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
4
))!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
5
))!=
null
))
||
((
dateMatcher
=
patDate5
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
5
))!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
6
))!=
null
))
||
((
dateMatcher
=
patDate6
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
6
))!=
null
))
||
((
dateMatcher
=
patDate7
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
7
))!=
null
))
||
((
dateMatcher
=
patDate8
.
matcher
(
content
)).
find
()&&
((
date
=
Utility
.
transDate
(
dateMatcher
.
group
(),
8
))!=
null
))
)
)
{
{
return
date
;
return
date
;
...
@@ -1468,8 +1571,180 @@ public class Utility {
...
@@ -1468,8 +1571,180 @@ public class Utility {
return
null
;
return
null
;
}
}
}
}
/**
 * Builds a normalized local timestamp string from raw date/time text.
 *
 * <p>The date part is obtained from {@code Utility.transDate(raw)}. Every
 * {@code H:mm} or {@code H:mm:ss} occurrence found in {@code raw} is then
 * written onto a calendar created for the {@code Asia/Shanghai} time zone;
 * when several occurrences exist, the last one wins. The result is rendered
 * through {@code DateUtil.format} with the pattern {@code yyyy-MM-dd HH:mm:ss}.
 *
 * <p>Author: Li Dongliang. Created: 2015-7-2 10:32:25.
 *
 * @version 1.0
 * @param raw raw text that contains a date and, optionally, a time of day
 * @return the formatted timestamp, or {@code null} when
 *         {@code Utility.transDate(raw)} returns {@code null}
 */
public static String transLocalTime(String raw) {
    Date parsed = Utility.transDate(raw);
    if (parsed == null) {
        return null;
    }

    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("Asia/Shanghai"));
    calendar.setTime(parsed);

    Matcher timeMatcher = Pattern
            .compile("(\\d{1,2})[:|:](\\d{1,2})([:|:]\\d{1,2}){0,1}")
            .matcher(raw);
    while (timeMatcher.find()) {
        // Groups 1 (hour) and 2 (minute) are mandatory parts of the pattern,
        // so they are always present once find() has succeeded.
        calendar.set(Calendar.HOUR_OF_DAY, Integer.parseInt(timeMatcher.group(1)));
        calendar.set(Calendar.MINUTE, Integer.parseInt(timeMatcher.group(2)));

        // Group 3 (separator + seconds) is optional.
        String secondPart = timeMatcher.group(3);
        if (secondPart != null) {
            calendar.set(Calendar.SECOND, Integer.parseInt(secondPart.replaceAll(":|:", "")));
        }
    }
    return DateUtil.format(calendar.getTime(), "yyyy-MM-dd HH:mm:ss");
}
/**
 * Collects the image paths referenced by {@code <img ... src="...">} tags in
 * an HTML fragment.
 *
 * <p>Paths that already start with {@code http://} or {@code https://} are
 * returned unchanged. Any other path is resolved against {@code uri} (the
 * address of the page the fragment came from) using RFC 3986 reference
 * resolution. When {@code uri} is {@code null}, or either value cannot be
 * parsed as a URI, the raw {@code src} value is returned as-is.
 *
 * <p>Author: Li Dongliang. Created: 2015-11-13 17:27:27.
 *
 * @version 1.0
 * @param text HTML text to scan; {@code null} or empty yields an empty list
 * @param uri  address of the page containing {@code text}; may be {@code null}
 * @return the image paths in document order (never {@code null})
 */
public static List<String> getContentImgPath(String text, String uri) {
    List<String> result = new ArrayList<String>();
    if (text == null || text.isEmpty()) {
        return result;
    }
    Pattern p = Pattern.compile("(<img.+?src=)(\"|')(.+?)(\"|')(.*?/?>)", Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(text);
    while (m.find()) {
        // Group 3 is the value between the quotes of the src attribute.
        String rawPath = m.group(3).trim();
        if (rawPath.isEmpty()) {
            continue;
        }
        result.add(resolveImgPath(rawPath, uri));
    }
    return result;
}

/**
 * Resolves one image path against the page address; falls back to the raw
 * path when no usable base exists or a value is not a syntactically valid URI.
 */
private static String resolveImgPath(String rawPath, String uri) {
    if (uri == null || rawPath.startsWith("http://") || rawPath.startsWith("https://")) {
        return rawPath;
    }
    try {
        return java.net.URI.create(uri.trim()).resolve(rawPath).toString();
    } catch (IllegalArgumentException e) {
        // Spaces or non-ASCII characters make URI.create reject the value;
        // keep the original text rather than dropping the image.
        return rawPath;
    }
}
/**
* 获取图片链接
* 创建人: 李东亮
* 创建时间: 2015-6-3 下午1:36:19
* @version 1.0
* @param contentStr
* @return
*/
// public static Map<String,ImgData> getContentImg(String contentStr,String uri){
// Map<String,ImgData> imgMap = new HashMap<String,ImgData>();
// if(contentStr==null||contentStr.length()==0){
// return imgMap;
// }
// Pattern p = Pattern.compile( "(<img.+?src=)(\"|')(.+?)(\"|')(.*?/?>)",Pattern.CASE_INSENSITIVE);
// Matcher m = p.matcher(contentStr);
// String imgPath;
// ImgData imgData;
// StringBuffer replacePath;
// while(m.find()){
// imgData = new ImgData();
// imgPath = m.group(3);
// if(!imgPath.startsWith("http://")&&!imgPath.startsWith("https://")&&uri!=null){
// String puriDir = getDirPath(uri.toString());
// imgPath = formatPath(puriDir,imgPath);
// }
// //图片完整路径
// imgData.setFormatTag(imgPath);
// //图片保存路径
// imgData.setLocalImgPath(genImgFileName());
// replacePath = new StringBuffer("");
// replacePath.append(m.group(1)).append(m.group(2)).append("IMG_SERVER/").append(imgData.getLocalImgPath()).append(m.group(4)).append(m.group(5));
// imgData.setReplaceTag(replacePath.toString());
// imgMap.put(m.group(), imgData);
// }
// return imgMap;
// }
/**
 * Generates the relative save path for an image file.
 *
 * <p>The path has the form {@code <date>/<uuid>}, where the date part is the
 * current date rendered by {@code DateUtil.format} with the pattern
 * {@code yyyy-MM-dd} and the second part is a freshly generated random UUID.
 *
 * <p>Author: Li Dongliang. Created: 2016-3-23 14:50:33.
 *
 * @version 1.0
 * @return the generated path, without a file extension
 */
private static String genImgFileName() {
    return DateUtil.format(new Date(), "yyyy-MM-dd") + "/" + UUID.randomUUID().toString();
}
/**
 * Strips characters that are not allowed in file names from the last segment
 * of a path.
 *
 * <p>Only the part after the final {@code '/'} is cleaned; the characters
 * {@code : ? * " |} are removed from it. Everything before the final slash is
 * left untouched, so a scheme such as {@code http://} survives. A path that
 * contains no slash at all is treated as a bare file name and cleaned as a
 * whole (previously this case failed with an index-out-of-bounds error).
 *
 * <p>Author: Li Dongliang. Created: 2015-7-6 17:13:46.
 *
 * @version 1.0
 * @param path path or URL whose file-name part should be cleaned; may be {@code null}
 * @return the cleaned path, or {@code null} when {@code path} is {@code null}
 */
public static String removeInvalidFileChar(String path) {
    if (path == null) {
        return null;
    }
    // With no slash, lastIndexOf yields -1; clamp to 0 so the whole string is the "file name" part.
    int split = Math.max(path.lastIndexOf("/"), 0);
    String after = path.substring(split).replaceAll(":|\\?|\\*|\"|\\|", "");
    return path.substring(0, split) + after;
}
/**
 * Resolves an image path against the directory URL of the current page and
 * removes {@code ./} and {@code ../} segments from the result.
 *
 * <p>{@code currentPageURL} is treated as a directory: a relative
 * {@code imgPath} is appended to it. An {@code imgPath} that starts with
 * {@code '/'} is attached directly to the scheme and host of
 * {@code currentPageURL}. Resolution is done purely on the text, so the
 * result does not depend on the operating system or the local file system.
 * {@code ..} segments never climb above the host. Empty segments and a
 * trailing slash are dropped from relative results.
 *
 * <p>Author: Li Dongliang. Created: 2015-7-6 15:43:00.
 *
 * @version 1.0
 * @param currentPageURL directory URL of the page, with or without an
 *                       {@code http://} / {@code https://} prefix; must not be {@code null}
 * @param imgPath        image path as written in the page; must not be {@code null}
 * @return the resolved, normalized path
 */
public static String formatPath(String currentPageURL, String imgPath) {
    String start = "";
    if (currentPageURL.startsWith("http://")) {
        start = "http://";
    } else if (currentPageURL.startsWith("https://")) {
        start = "https://";
    }

    // Split "host/dir/dir" into the host and the directory part.
    String rest = currentPageURL.substring(start.length());
    int hostEnd = rest.indexOf("/");
    if (hostEnd == -1) {
        hostEnd = rest.length();
    }
    String domain = rest.substring(0, hostEnd);

    // Host-absolute path: keep it verbatim under scheme + host.
    if (imgPath.startsWith("/")) {
        return start + domain + imgPath;
    }

    // Relative path: walk the combined segments, collapsing "." and "..".
    List<String> segments = new ArrayList<String>();
    String combined = rest.substring(hostEnd) + "/" + imgPath;
    for (String segment : combined.split("/")) {
        if (segment.isEmpty() || ".".equals(segment)) {
            continue;
        }
        if ("..".equals(segment)) {
            if (!segments.isEmpty()) {
                segments.remove(segments.size() - 1);
            }
            continue;
        }
        segments.add(segment);
    }

    StringBuilder result = new StringBuilder(start).append(domain);
    for (String segment : segments) {
        result.append('/').append(segment);
    }
    return result.toString();
}
/**
* 获取父路径
* 获取父路径
* 创建人: 李东亮
* 创建人: 李东亮
* 创建时间: 2015-7-6 下午3:17:44
* 创建时间: 2015-7-6 下午3:17:44
...
@@ -1484,7 +1759,7 @@ public class Utility {
...
@@ -1484,7 +1759,7 @@ public class Utility {
}
}
/**
/**
* 去除特殊字符
* 去除特殊字符
* 创建人:
刘小鹏
* 创建人:
李东亮
* 创建时间: 2015-6-4 下午6:40:19
* 创建时间: 2015-6-4 下午6:40:19
* @version 1.0
* @version 1.0
* @param str
* @param str
...
@@ -1513,9 +1788,11 @@ public class Utility {
...
@@ -1513,9 +1788,11 @@ public class Utility {
String
result
=
new
String
(
newtemp
,
targetCharset
);
String
result
=
new
String
(
newtemp
,
targetCharset
);
return
result
;
return
result
;
}
}
/**
 * Ad-hoc entry point: prints one randomly generated UUID to standard output.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; nothing in the body raises it
 */
public static void main(String args[]) throws IOException {
    String generatedId = UUID.randomUUID().toString();
    System.out.println(generatedId);
}
/**
/**
*
根据
*
获取请求路径后缀
* 创建人: 杨海龙
* 创建人: 杨海龙
* 创建时间: 2015年7月10日 上午10:14:52
* 创建时间: 2015年7月10日 上午10:14:52
* @version 1.0
* @version 1.0
...
@@ -1540,12 +1817,27 @@ public class Utility {
...
@@ -1540,12 +1817,27 @@ public class Utility {
return
null
;
return
null
;
}
}
/**
/**
* 编码匹配
* 格式化URI
* 创建人: 李东亮
* 创建时间: 2015-8-20 下午3:26:00
* @version 1.0
* @version 1.0
* @param
* @param
uri
* @return
* @return
*/
*/
/**
 * Normalizes a URI string by trimming surrounding whitespace and then
 * removing every trailing {@code '/'}.
 *
 * @param uri the URI text to normalize; must not be {@code null}
 * @return the trimmed URI without trailing slashes
 */
public static String formatURI(String uri) {
    return uri.trim().replaceAll("/+$", "");
}
/**
* 编码匹配
* @version 1.0
* @param
* @return
*/
public
static
String
charsetcheck
(
String
charset
)
{
public
static
String
charsetcheck
(
String
charset
)
{
String
charreset
=
"GB2312"
;
String
charreset
=
"GB2312"
;
String
[]
charsetall
=
{
"GB2312"
,
"GBK"
,
"UTF-8"
,
"ISO-8859-1"
,
String
[]
charsetall
=
{
"GB2312"
,
"GBK"
,
"UTF-8"
,
"ISO-8859-1"
,
...
@@ -1582,7 +1874,7 @@ public class Utility {
...
@@ -1582,7 +1874,7 @@ public class Utility {
"x-mswin-936"
,
"x-PCK"
,
"x-SJIS_0213"
,
"x-UTF-16LE-BOM"
,
"X-UTF-32BE-BOM"
,
"x-mswin-936"
,
"x-PCK"
,
"x-SJIS_0213"
,
"x-UTF-16LE-BOM"
,
"X-UTF-32BE-BOM"
,
"X-UTF-32LE-BOM"
,
"x-windows-50220"
,
"x-windows-50221"
,
"x-windows-874"
,
"X-UTF-32LE-BOM"
,
"x-windows-50220"
,
"x-windows-50221"
,
"x-windows-874"
,
"x-windows-949"
,
"x-windows-950"
,
"x-windows-iso2022jp"
"x-windows-949"
,
"x-windows-950"
,
"x-windows-iso2022jp"
};
};
for
(
int
i
=
0
;
i
<
charsetall
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
charsetall
.
length
;
i
++)
{
if
(
charset
.
toLowerCase
().
contains
(
charsetall
[
i
].
toLowerCase
()))
{
if
(
charset
.
toLowerCase
().
contains
(
charsetall
[
i
].
toLowerCase
()))
{
charreset
=
charsetall
[
i
];
charreset
=
charsetall
[
i
];
...
@@ -1591,104 +1883,43 @@ public class Utility {
...
@@ -1591,104 +1883,43 @@ public class Utility {
}
}
return
charreset
;
return
charreset
;
}
}
public
static
String
RemoveAllLink
(
String
contentWithTag
)
{
/**
Document
doc
=
Jsoup
.
parse
(
contentWithTag
);
* 对参数中的中文进行编码
Elements
contentElems
=
doc
.
select
(
"a"
);
* 创建人: 刘小鹏
if
((
contentElems
==
null
)
||
(
contentElems
.
size
()
==
0
))
* 创建时间: 2016-4-14 下午2:46:50
{
* @version 1.0
return
contentWithTag
;
* @param uri
}
* @return
for
(
Element
aElement
:
contentElems
)
*/
{
public
static
String
encodURI
(
String
uri
)
{
try
if
(
uri
==
null
)
{
{
return
null
;
String
elementText
=
aElement
.
text
().
trim
();
}
Element
parentElement
=
aElement
.
parent
();
//只对中文参数进行转码
String
parentText
=
parentElement
.
text
().
trim
();
if
(
uri
.
contains
(
"?"
)&&!
uri
.
endsWith
(
"?"
))
{
elementText
=
elementText
.
replaceAll
(
" "
,
""
).
trim
();
try
{
parentText
=
parentText
.
replaceAll
(
" "
,
""
).
trim
();
StringBuffer
sb
=
new
StringBuffer
();
aElement
.
remove
();
sb
.
append
(
"?"
);
while
(
parentElement
.
text
().
trim
().
isEmpty
())
String
[]
array
=
uri
.
split
(
"\\?"
);
{
String
uriPart
=
array
[
0
];
Element
tempElement
=
parentElement
;
String
paramStr
=
array
[
1
];
parentElement
=
parentElement
.
parent
();
String
[]
params
=
paramStr
.
split
(
"\\&"
);
tempElement
.
remove
();
for
(
int
i
=
0
;
i
<
params
.
length
;
i
++)
{
}
if
(
i
>
0
){
}
sb
.
append
(
"&"
);
catch
(
Exception
e
)
}
{
String
param
=
params
[
i
];
continue
;
Integer
indexFlag
=
param
.
indexOf
(
"="
);
}
if
(
indexFlag
!=-
1
){
}
String
name
=
param
.
substring
(
0
,
indexFlag
);
String
value
=
param
.
substring
(
indexFlag
+
1
);
value
=
URLEncoder
.
encode
(
value
,
"UTF-8"
);
value
=
value
.
replaceAll
(
"%3D"
,
"="
);
return
doc
.
outerHtml
();
sb
.
append
(
URLEncoder
.
encode
(
name
,
"UTF-8"
)
+
"="
+
value
);
}
else
{
}
sb
.
append
(
URLEncoder
.
encode
(
param
,
"UTF-8"
)
);
}
/* String name = param.substring(0,param.indexOf("="));
String value = nameAndValue[1];
if (nameAndValue.length == 1) {
sb.append(URLEncoder.encode(param, "UTF-8"));
} else if(nameAndValue.length == 2) {
String name = nameAndValue[0];
String value = nameAndValue[1];
sb.append(URLEncoder.encode(name, "UTF-8") + "=" +URLEncoder.encode(value, "UTF-8"));
}else
{
String name = nameAndValue[0];
String value = nameAndValue[1];
sb.append(URLEncoder.encode(name, "UTF-8") + "=" +URLEncoder.encode(value, "UTF-8"));
}*/
}
uri
=
uriPart
+
sb
.
toString
();
/* Matcher matcher = Pattern.compile("[\\u4e00-\\u9fa5]").matcher(uri);
while (matcher.find()) {
String tmp = matcher.group();
uri = uri.replaceAll(tmp, java.net.URLEncoder.encode(tmp, "UTF-8"));
}*/
}
catch
(
UnsupportedEncodingException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
}
}
//对路径中的中文也进行转码
String
regex
=
"([\u4e00-\u9fa5]+)"
;
Matcher
matcher
=
Pattern
.
compile
(
regex
).
matcher
(
uri
);
String
find
;
String
replace
=
null
;
while
(
matcher
.
find
()){
find
=
matcher
.
group
();
try
{
replace
=
URLEncoder
.
encode
(
find
,
"UTF-8"
);
}
catch
(
UnsupportedEncodingException
e1
)
{
// TODO Auto-generated catch block
e1
.
printStackTrace
();
}
while
(
uri
.
contains
(
find
)&&!
find
.
equals
(
replace
)){
uri
=
uri
.
replace
(
find
,
replace
);
}
}
uri
=
uri
.
replaceAll
(
"/+$"
,
"/"
);
uri
=
uri
.
replaceAll
(
" "
,
"%20"
);
return
uri
;
}
/**
* 对正则获取url进行统一格式化
* 创建人: 李东亮
* 创建时间: 2016-5-11 上午11:40:56
* @version 1.0
* @param url
* @return
*/
public
static
String
formatURL
(
String
url
)
{
public
static
String
formatURL
(
String
url
)
{
if
(
url
==
null
)
{
if
(
url
==
null
)
{
return
null
;
return
null
;
...
@@ -1704,19 +1935,4 @@ public class Utility {
...
@@ -1704,19 +1935,4 @@ public class Utility {
url
=
url
.
replaceAll
(
"/\\$$"
,
""
);
url
=
url
.
replaceAll
(
"/\\$$"
,
""
);
return
url
;
return
url
;
}
}
/**
 * Removes a leading URL from the given text.
 *
 * <p>The pattern is anchored at the start of the input and requires one
 * whitespace character immediately before the {@code http://} or
 * {@code https://} prefix. Each matched text is deleted from the content
 * wherever it occurs. Text that does not begin this way is returned unchanged.
 *
 * @param content the text to clean; must not be {@code null}
 * @return the text with the matched URL portion removed
 */
public static String removeHttp(String content) {
    // Earlier, unanchored variant kept for reference:
    // "https://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?|(\\s)?"
    Matcher urlMatcher = Pattern
            .compile("^(\\s)(https://|http://)([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)(\\s)?")
            .matcher(content);
    String cleaned = content;
    while (urlMatcher.find()) {
        String matchedUrl = urlMatcher.group();
        cleaned = cleaned.replace(matchedUrl, "");
    }
    return cleaned;
}
/**
 * Ad-hoc demo: runs {@code removeHttp} on a sample sentence that embeds an
 * audio URL and prints the result to standard output.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; nothing in the body raises it
 */
public static void main(String args[]) throws IOException {
    String url = "聚焦跑道 塑造变革 加快建设共同富裕美好社会 https://stc-new.8531.cn/assets/20220216/1644994342533_620c9f264dc8891126854d30.mp3 2月15日下午,湖州市委书记王纲主持召开会议,专题听取湖州共同富裕";
    System.out.println(removeHttp(url));
}
}
}
\ No newline at end of file
comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
浏览文件 @
af30a040
...
@@ -2,6 +2,8 @@ package com.zzsn.util;
...
@@ -2,6 +2,8 @@ package com.zzsn.util;
import
com.zzsn.crawler.ReuseWebDriver
;
import
com.zzsn.crawler.ReuseWebDriver
;
import
lombok.extern.slf4j.Slf4j
;
import
lombok.extern.slf4j.Slf4j
;
import
org.springframework.scheduling.annotation.EnableScheduling
;
import
org.springframework.scheduling.annotation.Scheduled
;
import
java.io.BufferedReader
;
import
java.io.BufferedReader
;
import
java.io.InputStreamReader
;
import
java.io.InputStreamReader
;
...
@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
...
@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
*/
*/
@Slf4j
@Slf4j
@SuppressWarnings
(
"all"
)
@SuppressWarnings
(
"all"
)
@EnableScheduling
public
class
WindowsProcess
{
public
class
WindowsProcess
{
private
static
Pattern
TASK_LIST_PATTERN
=
Pattern
.
compile
(
"^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$"
);
private
static
Pattern
TASK_LIST_PATTERN
=
Pattern
.
compile
(
"^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$"
);
private
static
String
DRIVER_NAME
=
"chrome.exe"
;
private
static
String
CHROME_NAME
=
"chrome.exe"
;
private
static
String
DRIVER_NAME
=
"chromedriver.exe"
;
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
WindowsProcess
process
=
new
WindowsProcess
();
WindowsProcess
process
=
new
WindowsProcess
();
...
@@ -28,6 +32,7 @@ public class WindowsProcess {
...
@@ -28,6 +32,7 @@ public class WindowsProcess {
* @author andylau
* @author andylau
* @date 2022/7/26 11:23
* @date 2022/7/26 11:23
*/
*/
// @Scheduled(cron = "0 0 1 * * ?")
private
void
killProcess
()
{
private
void
killProcess
()
{
try
{
try
{
String
line
;
String
line
;
...
@@ -35,14 +40,14 @@ public class WindowsProcess {
...
@@ -35,14 +40,14 @@ public class WindowsProcess {
BufferedReader
input
=
new
BufferedReader
(
new
InputStreamReader
(
p
.
getInputStream
()));
BufferedReader
input
=
new
BufferedReader
(
new
InputStreamReader
(
p
.
getInputStream
()));
while
((
line
=
input
.
readLine
())
!=
null
)
{
while
((
line
=
input
.
readLine
())
!=
null
)
{
if
(
line
.
contains
(
DRIVER_NAME
)
)
{
if
(
line
.
contains
(
CHROME_NAME
)||
line
.
contains
(
DRIVER_NAME
)
)
{
Matcher
matcher
=
TASK_LIST_PATTERN
.
matcher
(
line
);
Matcher
matcher
=
TASK_LIST_PATTERN
.
matcher
(
line
);
if
(
matcher
.
find
())
{
if
(
matcher
.
find
())
{
//
String serviceName = matcher.group(1);
//
String serviceName = matcher.group(1);
String
pid
=
matcher
.
group
(
2
);
String
pid
=
matcher
.
group
(
2
);
//
String sessionName = matcher.group(3);
//
String sessionName = matcher.group(3);
//
String size = matcher.group(4).replace(",", "") + "K";
//
String size = matcher.group(4).replace(",", "") + "K";
//
log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
//
log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
Runtime
.
getRuntime
().
exec
(
"taskkill /pid "
+
pid
);
Runtime
.
getRuntime
().
exec
(
"taskkill /pid "
+
pid
);
}
}
}
}
...
@@ -51,11 +56,11 @@ public class WindowsProcess {
...
@@ -51,11 +56,11 @@ public class WindowsProcess {
log
.
error
(
"浏览器驱动关闭异常..."
);
log
.
error
(
"浏览器驱动关闭异常..."
);
}
finally
{
}
finally
{
// 定时任务关闭驱动后,重新打开驱动
// 定时任务关闭驱动后,重新打开驱动
try
{
//
try {
reopenChromeDriver
();
//
reopenChromeDriver();
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
log
.
error
(
"驱动打开异常..."
);
//
log.error("驱动打开异常...");
}
//
}
}
}
}
}
...
...
comm_crawler/src/main/resources/aa.txt
0 → 100644
浏览文件 @
af30a040
IN-20220609-15205
IN-20220609-45135
IN-20220609-50717
IN-20220609-52785
IN-20220609-3804
IN-20220609-4071
IN-20220609-15069
IN-20220609-45126
IN-20220609-50722
IN-20220609-52787
IN-20220609-3817
IN-20220609-4084
IN-20220609-15090
IN-20220609-45128
IN-20220609-50721
IN-20220609-52786
IN-20220609-58494
IN-20220609-58502
IN-20220609-58503
IN-20220609-58504
IN-20220609-58505
IN-20220609-58506
IN-20220609-58508
IN-20220609-3840
IN-20220609-4107
IN-20220609-12827
IN-20220609-15126
IN-20220609-44997
IN-20220609-45132
IN-20220609-50720
IN-20220609-52779
IN-20220609-3874
IN-20220609-4141
IN-20220609-15147
IN-20220609-45133
IN-20220609-50719
IN-20220609-52778
IN-20220609-56218
IN-20220609-56227
IN-20220609-56229
IN-20220609-57566
IN-20220609-15041
IN-20220609-45120
IN-20220609-50724
IN-20220609-52789
IN-20220609-56185
IN-20220609-56187
IN-20220609-56188
IN-20220609-56189
IN-20220609-56190
IN-20220609-56191
IN-20220609-56192
IN-20220609-57726
IN-20220609-57739
IN-20220609-57740
IN-20220609-57741
IN-20220609-57742
IN-20220609-57771
IN-20220609-57772
IN-20220609-12932
IN-20220609-15043
IN-20220609-45005
IN-20220609-45122
IN-20220609-50723
IN-20220609-52788
IN-20220609-57997
IN-20220609-57998
IN-20220609-57999
IN-20220609-13241
IN-20220609-45062
IN-20220609-54133
IN-20220609-57070
IN-20220609-14979
IN-20220609-14981
IN-20220609-45097
IN-20220609-45098
IN-20220609-50737
IN-20220609-50738
IN-20220609-52766
IN-20220609-52767
IN-20220609-14977
IN-20220609-14978
IN-20220609-45095
IN-20220609-45096
IN-20220609-50739
IN-20220609-50744
IN-20220609-52768
IN-20220609-52769
IN-20220609-50867
IN-20220609-52521
IN-20220609-52773
IN-20220609-52840
IN-20220609-54294
IN-20220609-3947
IN-20220609-3970
IN-20220609-3971
IN-20220609-3972
IN-20220609-4214
IN-20220609-4237
IN-20220609-4238
IN-20220609-4239
IN-20220609-11340
IN-20220609-13247
IN-20220609-13248
IN-20220609-45068
IN-20220609-45069
IN-20220609-50746
IN-20220609-52783
IN-20220609-55032
IN-20220609-57345
IN-20220609-13060
IN-20220609-45024
IN-20220609-58325
IN-20220609-58469
IN-20220609-58471
IN-20220609-58473
IN-20220609-58477
IN-20220609-52580
IN-20220609-52796
IN-20220609-52582
IN-20220609-52798
IN-20220609-52583
IN-20220609-52799
IN-20220609-52632
IN-20220609-13092
IN-20220609-13162
IN-20220609-22258
IN-20220609-45054
IN-20220609-53041
IN-20220609-53044
IN-20220609-53051
IN-20220609-53052
IN-20220609-53055
IN-20220609-53056
IN-20220609-53099
IN-20220609-53101
IN-20220609-53111
IN-20220609-53112
IN-20220609-53139
IN-20220609-53140
IN-20220609-57072
IN-20220609-57074
IN-20220609-57076
IN-20220609-57077
IN-20220609-57084
IN-20220609-57085
IN-20220609-52723
IN-20220609-52728
IN-20220609-52803
IN-20220609-52814
IN-20220609-53102
IN-20220609-53103
IN-20220609-53162
IN-20220609-53180
IN-20220609-52611
IN-20220609-52806
IN-20220609-50475
IN-20220609-50476
IN-20220609-50477
IN-20220609-52568
IN-20220609-52569
IN-20220609-52570
IN-20220609-52810
IN-20220609-52811
IN-20220609-52812
IN-20220609-53602
IN-20220609-53603
IN-20220609-53604
IN-20220609-39173
IN-20220609-52571
IN-20220609-52574
IN-20220609-52813
IN-20220609-52838
IN-20220609-6007
IN-20220609-13080
IN-20220609-13081
IN-20220609-13082
IN-20220609-13083
IN-20220609-13084
IN-20220609-13085
IN-20220609-13086
IN-20220609-13219
IN-20220609-13220
IN-20220609-13221
IN-20220609-13222
IN-20220609-13223
IN-20220609-13224
IN-20220609-45043
IN-20220609-45044
IN-20220609-45045
IN-20220609-45046
IN-20220609-45047
IN-20220609-45048
IN-20220609-45049
IN-20220609-52436
IN-20220609-52524
IN-20220609-52525
IN-20220609-52526
IN-20220609-52533
IN-20220609-52536
IN-20220609-52815
IN-20220609-52830
IN-20220609-52857
IN-20220609-52858
IN-20220609-52866
IN-20220609-52867
IN-20220609-54292
IN-20220608-36685
IN-20220608-36686
IN-20220608-36687
IN-20220608-36688
IN-20220608-36689
IN-20220608-36690
IN-20220608-36691
IN-20220608-36692
IN-20220608-36693
IN-20220608-37141
IN-20220608-37142
IN-20220608-37143
IN-20220608-37144
IN-20220608-37463
IN-20220608-37464
IN-20220608-37465
IN-20220608-37466
IN-20220608-37467
IN-20220608-57178
IN-20220609-3773
IN-20220609-3774
IN-20220609-4040
IN-20220609-4041
IN-20220609-4353
IN-20220609-4354
IN-20220609-4363
IN-20220609-4372
IN-20220609-15012
IN-20220609-15013
IN-20220609-45203
IN-20220609-49938
IN-20220609-49939
IN-20220609-52553
IN-20220609-52554
IN-20220609-52556
IN-20220609-52557
IN-20220609-52558
IN-20220609-52559
IN-20220609-52560
IN-20220609-52816
IN-20220609-52817
IN-20220609-52841
IN-20220609-52842
IN-20220609-52843
IN-20220609-52923
IN-20220609-52939
IN-20220609-52974
IN-20220609-52975
IN-20220609-53107
IN-20220609-53108
IN-20220609-53049
IN-20220609-53054
IN-20220609-53057
IN-20220609-53059
IN-20220609-53060
IN-20220609-53062
IN-20220609-53110
IN-20220609-53113
IN-20220609-53141
IN-20220609-53158
IN-20220609-53159
IN-20220609-53177
IN-20220609-57075
IN-20220609-57078
IN-20220609-57086
IN-20220609-57094
IN-20220609-57095
IN-20220609-57102
IN-20220609-52417
IN-20220609-52657
IN-20220609-52660
IN-20220609-52661
IN-20220609-52665
IN-20220609-52666
IN-20220609-52826
IN-20220609-52827
IN-20220609-52828
IN-20220609-52829
IN-20220609-52832
IN-20220609-53114
IN-20220609-53115
IN-20220609-53116
IN-20220609-53117
IN-20220609-53130
IN-20220609-53190
IN-20220609-53191
IN-20220609-53192
IN-20220609-53193
IN-20220609-53194
IN-20220609-56428
IN-20220609-60657
IN-20220609-52942
IN-20220609-52943
IN-20220609-52944
IN-20220609-52946
IN-20220609-52947
IN-20220609-52948
IN-20220609-52950
IN-20220609-52952
IN-20220609-52953
IN-20220609-52954
IN-20220609-52955
IN-20220609-52957
IN-20220609-52958
IN-20220609-52960
IN-20220609-52961
IN-20220609-52977
IN-20220609-52978
IN-20220609-52979
IN-20220609-52980
IN-20220609-52981
IN-20220609-52982
IN-20220609-52983
IN-20220609-52984
IN-20220609-52985
IN-20220609-52986
IN-20220609-52987
IN-20220609-52988
IN-20220609-52990
IN-20220609-52991
IN-20220609-52998
IN-20220609-53118
IN-20220609-53119
IN-20220609-53120
IN-20220609-53121
IN-20220609-53122
IN-20220609-53123
IN-20220609-53124
IN-20220609-53133
IN-20220609-53134
IN-20220609-53135
IN-20220609-53136
IN-20220609-53137
IN-20220609-53147
IN-20220609-53148
IN-20220609-53155
IN-20220609-53073
IN-20220609-53126
IN-20220609-57080
IN-20220609-53089
IN-20220609-53092
IN-20220609-53128
IN-20220609-53129
IN-20220609-57082
IN-20220609-57083
IN-20220609-52669
IN-20220609-52833
IN-20220609-53131
IN-20220609-53195
IN-20220609-11536
IN-20220609-14969
IN-20220609-52572
IN-20220609-52835
IN-20220609-52585
IN-20220609-52836
IN-20220609-52550
IN-20220609-52573
IN-20220609-52837
IN-20220609-4526
IN-20220609-4527
IN-20220609-4528
IN-20220609-4529
IN-20220609-44926
IN-20220609-44927
IN-20220609-44928
IN-20220609-44929
IN-20220609-53000
IN-20220609-53001
IN-20220609-53006
IN-20220609-53008
IN-20220609-53010
IN-20220609-53143
IN-20220609-53163
IN-20220609-53166
IN-20220609-53167
IN-20220609-53168
IN-20220609-57088
IN-20220609-57096
IN-20220609-57099
IN-20220609-57100
IN-20220609-57101
IN-20220609-4530
IN-20220609-4531
IN-20220609-4532
IN-20220609-4533
IN-20220609-4534
IN-20220609-13094
IN-20220609-44930
IN-20220609-44931
IN-20220609-44932
IN-20220609-45056
IN-20220609-52522
IN-20220609-52860
IN-20220609-53079
IN-20220609-53144
IN-20220609-53782
IN-20220609-53785
IN-20220609-57089
IN-20220609-57114
IN-20220609-52962
IN-20220609-52992
IN-20220609-53149
IN-20220609-53094
IN-20220609-53157
IN-20220609-57093
IN-20220609-52970
IN-20220609-53002
IN-20220609-53005
IN-20220609-53164
IN-20220609-53165
IN-20220609-53243
IN-20220609-57097
IN-20220609-57098
IN-20220609-57112
IN-20220609-52653
IN-20220609-52852
IN-20220609-53173
IN-20220609-53216
IN-20220609-52654
IN-20220609-52853
IN-20220609-53174
IN-20220609-53217
IN-20220609-52655
IN-20220609-52854
IN-20220609-53175
IN-20220609-53218
IN-20220609-52656
IN-20220609-52855
IN-20220609-53176
IN-20220609-53219
IN-20220609-52523
IN-20220609-52856
IN-20220609-59064
IN-20220609-52680
IN-20220609-52682
IN-20220609-52684
IN-20220609-52862
IN-20220609-52863
IN-20220609-52864
IN-20220609-53186
IN-20220609-53187
IN-20220609-53188
IN-20220609-53231
IN-20220609-53232
IN-20220609-53233
IN-20220609-52686
IN-20220609-52687
IN-20220609-52690
IN-20220609-52865
IN-20220609-52895
IN-20220609-52896
IN-20220609-53189
IN-20220609-53229
IN-20220609-53230
IN-20220609-53234
IN-20220609-53261
IN-20220609-53262
IN-20220609-52636
IN-20220609-52869
IN-20220609-53197
IN-20220609-53235
IN-20220609-52694
IN-20220609-52870
IN-20220609-53198
IN-20220609-53236
IN-20220609-52633
IN-20220609-52873
IN-20220609-53201
IN-20220609-53237
IN-20220609-52634
IN-20220609-52874
IN-20220609-53202
IN-20220609-53246
IN-20220609-52640
IN-20220609-52878
IN-20220609-53205
IN-20220609-53249
IN-20220609-52642
IN-20220609-52879
IN-20220609-53206
IN-20220609-53250
IN-20220609-52646
IN-20220609-52881
IN-20220609-53208
IN-20220609-53252
IN-20220609-52670
IN-20220609-52671
IN-20220609-52672
IN-20220609-52890
IN-20220609-52891
IN-20220609-52892
IN-20220609-53224
IN-20220609-53225
IN-20220609-53226
IN-20220609-53256
IN-20220609-53257
IN-20220609-53258
IN-20220609-52673
IN-20220609-52893
IN-20220609-53227
IN-20220609-53259
IN-20220609-52677
IN-20220609-52894
IN-20220609-53228
IN-20220609-53260
IN-20220609-53031
IN-20220609-53238
IN-20220609-57107
IN-20220609-52700
IN-20220609-52897
IN-20220609-53244
IN-20220609-53263
IN-20220609-52705
IN-20220609-52898
IN-20220609-53095
IN-20220609-53245
IN-20220609-52578
IN-20220609-52899
IN-20220609-52586
IN-20220609-52908
IN-20220609-53775
IN-20220609-53778
IN-20220609-53781
IN-20220609-53783
IN-20220609-53786
IN-20220609-57113
IN-20220609-57115
IN-20220609-57143
IN-20220609-57144
IN-20220609-57146
IN-20220609-53789
IN-20220609-53792
IN-20220609-53793
IN-20220609-53799
IN-20220609-53806
IN-20220609-57116
IN-20220609-57117
IN-20220609-57130
IN-20220609-57131
IN-20220609-57132
IN-20220609-52464
IN-20220609-52465
IN-20220609-52466
IN-20220609-52467
IN-20220609-52468
IN-20220609-52469
IN-20220609-52470
IN-20220609-53373
IN-20220609-53374
IN-20220609-53375
IN-20220609-53376
IN-20220609-53384
IN-20220609-53385
IN-20220609-53386
IN-20220609-53810
IN-20220609-57120
IN-20220609-52476
IN-20220609-52477
IN-20220609-52478
IN-20220609-52479
IN-20220609-52480
IN-20220609-52481
IN-20220609-52482
IN-20220609-53298
IN-20220609-53299
IN-20220609-53300
IN-20220609-53301
IN-20220609-53365
IN-20220609-53366
IN-20220609-53392
IN-20220609-53811
IN-20220609-53813
IN-20220609-53815
IN-20220609-53816
IN-20220609-53817
IN-20220609-53819
IN-20220609-53821
IN-20220609-57121
IN-20220609-57122
IN-20220609-57123
IN-20220609-57320
IN-20220609-57321
IN-20220609-57322
IN-20220609-57323
IN-20220609-53832
IN-20220609-53838
IN-20220609-53843
IN-20220609-53845
IN-20220609-57124
IN-20220609-57125
IN-20220609-57126
IN-20220609-57133
IN-20220609-53851
IN-20220609-53859
IN-20220609-53861
IN-20220609-57127
IN-20220609-57140
IN-20220609-57141
IN-20220609-53873
IN-20220609-57128
IN-20220609-53876
IN-20220609-57129
IN-20220609-53864
IN-20220609-53867
IN-20220609-53871
IN-20220609-57134
IN-20220609-57139
IN-20220609-57142
IN-20220609-53755
IN-20220609-53758
IN-20220609-53760
IN-20220609-57135
IN-20220609-57227
IN-20220609-57228
IN-20220609-52471
IN-20220609-52472
IN-20220609-52473
IN-20220609-52474
IN-20220609-52475
IN-20220609-53387
IN-20220609-53388
IN-20220609-53389
IN-20220609-53390
IN-20220609-53391
IN-20220609-53764
IN-20220609-53767
IN-20220609-53768
IN-20220609-53772
IN-20220609-57136
IN-20220609-57137
IN-20220609-57231
IN-20220609-57254
IN-20220609-54215
IN-20220609-57147
IN-20220609-54225
IN-20220609-54226
IN-20220609-54242
IN-20220609-54244
IN-20220609-54246
IN-20220609-57148
IN-20220609-57149
IN-20220609-57263
IN-20220609-57264
IN-20220609-57265
IN-20220609-54158
IN-20220609-54159
IN-20220609-57150
IN-20220609-57339
IN-20220609-54160
IN-20220609-54161
IN-20220609-54162
IN-20220609-54163
IN-20220609-54164
IN-20220609-57151
IN-20220609-57152
IN-20220609-57153
IN-20220609-57291
IN-20220609-57292
IN-20220609-54165
IN-20220609-54167
IN-20220609-54168
IN-20220609-57154
IN-20220609-57247
IN-20220609-57248
IN-20220609-54175
IN-20220609-54178
IN-20220609-57155
IN-20220609-57177
IN-20220609-11997
IN-20220609-13154
IN-20220609-54222
IN-20220609-54227
IN-20220609-54228
IN-20220609-54229
IN-20220609-57156
IN-20220609-57157
IN-20220609-57158
IN-20220609-57324
IN-20220609-54253
IN-20220609-54257
IN-20220609-54262
IN-20220609-54267
IN-20220609-54269
IN-20220609-54270
IN-20220609-54277
IN-20220609-54278
IN-20220609-57159
IN-20220609-57276
IN-20220609-57277
IN-20220609-57279
IN-20220609-57280
IN-20220609-57281
IN-20220609-57295
IN-20220609-57328
IN-20220609-54280
IN-20220609-54290
IN-20220609-54295
IN-20220609-54297
IN-20220609-54299
IN-20220609-54302
IN-20220609-54303
IN-20220609-57160
IN-20220609-57167
IN-20220609-57169
IN-20220609-57297
IN-20220609-57304
IN-20220609-57331
IN-20220609-13051
IN-20220609-45017
IN-20220609-54281
IN-20220609-54282
IN-20220609-54283
IN-20220609-54284
IN-20220609-57161
IN-20220609-57162
IN-20220609-57163
IN-20220609-57164
IN-20220609-12030
IN-20220609-12033
IN-20220609-12037
IN-20220609-28374
IN-20220609-28375
IN-20220609-54285
IN-20220609-54289
IN-20220609-54291
IN-20220609-54296
IN-20220609-54298
IN-20220609-54300
IN-20220609-54301
IN-20220609-56421
IN-20220609-57165
IN-20220609-57166
IN-20220609-57168
IN-20220609-57296
IN-20220609-57302
IN-20220609-57303
IN-20220609-57332
IN-20220609-3938
IN-20220609-4205
IN-20220609-13074
IN-20220609-45037
IN-20220609-54305
IN-20220609-57170
IN-20220609-3790
IN-20220609-4057
IN-20220609-4717
IN-20220609-4718
IN-20220609-44969
IN-20220609-44970
IN-20220609-54304
IN-20220609-54306
IN-20220609-54307
IN-20220609-54308
IN-20220609-54309
IN-20220609-54310
IN-20220609-57171
IN-20220609-57172
IN-20220609-57173
IN-20220609-57174
IN-20220609-57175
IN-20220609-57305
IN-20220609-13075
IN-20220609-45038
IN-20220609-54311
IN-20220609-54312
IN-20220609-57176
IN-20220609-57187
IN-20220609-54179
IN-20220609-54180
IN-20220609-57178
IN-20220609-57179
IN-20220609-54181
IN-20220609-54182
IN-20220609-54183
IN-20220609-57180
IN-20220609-57181
IN-20220609-57182
IN-20220609-54216
IN-20220609-54218
IN-20220609-54219
IN-20220609-57183
IN-20220609-57184
IN-20220609-57185
IN-20220609-54220
IN-20220609-54221
IN-20220609-54223
IN-20220609-54224
IN-20220609-57186
IN-20220609-57325
IN-20220609-57329
IN-20220609-57330
IN-20220609-13076
IN-20220609-45039
IN-20220609-52539
IN-20220609-52804
IN-20220609-54313
IN-20220609-57188
IN-20220609-54314
IN-20220609-56620
IN-20220609-56622
IN-20220609-56623
IN-20220609-57189
IN-20220609-57705
IN-20220609-57727
IN-20220609-57728
IN-20220609-11542
IN-20220609-14999
IN-20220609-44990
IN-20220609-45106
IN-20220609-50733
IN-20220609-52549
IN-20220609-52764
IN-20220609-52847
IN-20220609-54315
IN-20220609-57190
IN-20220609-54184
IN-20220609-54185
IN-20220609-54186
IN-20220609-57191
IN-20220609-57192
IN-20220609-57193
IN-20220609-54197
IN-20220609-54199
IN-20220609-54200
IN-20220609-54202
IN-20220609-54203
IN-20220609-54204
IN-20220609-54206
IN-20220609-54207
IN-20220609-54208
IN-20220609-54209
IN-20220609-54210
IN-20220609-54211
IN-20220609-54212
IN-20220609-54213
IN-20220609-57194
IN-20220609-57195
IN-20220609-57196
IN-20220609-57197
IN-20220609-57198
IN-20220609-57298
IN-20220609-57299
IN-20220609-57300
IN-20220609-57301
IN-20220609-57306
IN-20220609-57318
IN-20220609-57319
IN-20220609-57326
IN-20220609-57327
IN-20220609-54061
IN-20220609-54062
IN-20220609-54063
IN-20220609-57199
IN-20220609-57200
IN-20220609-57315
IN-20220609-54064
IN-20220609-54065
IN-20220609-57201
IN-20220609-57202
IN-20220609-54066
IN-20220609-54067
IN-20220609-54068
IN-20220609-54069
IN-20220609-54070
IN-20220609-57203
IN-20220609-57204
IN-20220609-57205
IN-20220609-57206
IN-20220609-57207
IN-20220609-54021
IN-20220609-54022
IN-20220609-54023
IN-20220609-54024
IN-20220609-57208
IN-20220609-57209
IN-20220609-57310
IN-20220609-57311
IN-20220609-49671
IN-20220609-54025
IN-20220609-57210
IN-20220609-54026
IN-20220609-54032
IN-20220609-54040
IN-20220609-57211
IN-20220609-57212
IN-20220609-57213
IN-20220609-54052
IN-20220609-57216
IN-20220609-54071
IN-20220609-54072
IN-20220609-54073
IN-20220609-54074
IN-20220609-57217
IN-20220609-57218
IN-20220609-57219
IN-20220609-57220
IN-20220609-54075
IN-20220609-54076
IN-20220609-54077
IN-20220609-57221
IN-20220609-57222
IN-20220609-57223
IN-20220609-54078
IN-20220609-54090
IN-20220609-57224
IN-20220609-57225
IN-20220609-53753
IN-20220609-57226
IN-20220609-53762
IN-20220609-53763
IN-20220609-57229
IN-20220609-57230
IN-20220609-53888
IN-20220609-53890
IN-20220609-57232
IN-20220609-57317
IN-20220609-53891
IN-20220609-53892
IN-20220609-57233
IN-20220609-57234
IN-20220609-53893
IN-20220609-53894
IN-20220609-53895
IN-20220609-57235
IN-20220609-57236
IN-20220609-57237
IN-20220609-53896
IN-20220609-53897
IN-20220609-57238
IN-20220609-57239
IN-20220609-54091
IN-20220609-54094
IN-20220609-54097
IN-20220609-57240
IN-20220609-57268
IN-20220609-57269
IN-20220609-54110
IN-20220609-54111
IN-20220609-57241
IN-20220609-57275
IN-20220609-54112
IN-20220609-54113
IN-20220609-54114
IN-20220609-54115
IN-20220609-57242
IN-20220609-57243
IN-20220609-57244
IN-20220609-57245
IN-20220609-54116
IN-20220609-57246
IN-20220609-52483
IN-20220609-53367
IN-20220609-54169
IN-20220609-54170
IN-20220609-54171
IN-20220609-57249
IN-20220609-57258
IN-20220609-57259
IN-20220609-53880
IN-20220609-53884
IN-20220609-57250
IN-20220609-57316
IN-20220609-54120
IN-20220609-54124
IN-20220609-54125
IN-20220609-54126
IN-20220609-57251
IN-20220609-57333
IN-20220609-57334
IN-20220609-57335
IN-20220609-54130
IN-20220609-54131
IN-20220609-57252
IN-20220609-57253
IN-20220609-54172
IN-20220609-57260
IN-20220609-54173
IN-20220609-54174
IN-20220609-57261
IN-20220609-57262
IN-20220609-54146
IN-20220609-54148
IN-20220609-54149
IN-20220609-54150
IN-20220609-57266
IN-20220609-57267
IN-20220609-57285
IN-20220609-57336
IN-20220609-54098
IN-20220609-54102
IN-20220609-57270
IN-20220609-57271
IN-20220609-54106
IN-20220609-54108
IN-20220609-54109
IN-20220609-57272
IN-20220609-57273
IN-20220609-57274
IN-20220609-13050
IN-20220609-15122
IN-20220609-45016
IN-20220609-54261
IN-20220609-54271
IN-20220609-54272
IN-20220609-54274
IN-20220609-54275
IN-20220609-54276
IN-20220609-57278
IN-20220609-57282
IN-20220609-57283
IN-20220609-57284
IN-20220609-57293
IN-20220609-57294
IN-20220609-54151
IN-20220609-54152
IN-20220609-54153
IN-20220609-54154
IN-20220609-57286
IN-20220609-57287
IN-20220609-57288
IN-20220609-57337
IN-20220609-54155
IN-20220609-57289
IN-20220609-54156
IN-20220609-54157
IN-20220609-57290
IN-20220609-57338
IN-20220609-54019
IN-20220609-54020
IN-20220609-57308
IN-20220609-57309
IN-20220609-54055
IN-20220609-54059
IN-20220609-54060
IN-20220609-57312
IN-20220609-57313
IN-20220609-57314
IN-20220609-11551
IN-20220609-13079
IN-20220609-44995
IN-20220609-45042
IN-20220609-52538
IN-20220609-52900
IN-20220609-55170
IN-20220609-55171
IN-20220609-55172
IN-20220609-55174
IN-20220609-55177
IN-20220609-57354
IN-20220609-57355
IN-20220609-57356
IN-20220609-57357
IN-20220609-57358
IN-20220609-12820
IN-20220609-55233
IN-20220609-57366
IN-20220609-55234
IN-20220609-55235
IN-20220609-55236
IN-20220609-55238
IN-20220609-55244
IN-20220609-55245
IN-20220609-56094
IN-20220609-56116
IN-20220609-56117
IN-20220609-57367
IN-20220609-57368
IN-20220609-57369
IN-20220609-57370
IN-20220609-57371
IN-20220609-57372
IN-20220609-57812
IN-20220609-57813
IN-20220609-57835
IN-20220609-54686
IN-20220609-57374
IN-20220609-13077
IN-20220609-45040
IN-20220609-54687
IN-20220609-55165
IN-20220609-55166
IN-20220609-55167
IN-20220609-55168
IN-20220609-57375
IN-20220609-57380
IN-20220609-57381
IN-20220609-57382
IN-20220609-57383
IN-20220609-13078
IN-20220609-45041
IN-20220609-55169
IN-20220609-56679
IN-20220609-57384
IN-20220609-57658
IN-20220609-12767
IN-20220609-12768
IN-20220609-49929
IN-20220609-55055
IN-20220609-56423
IN-20220609-57396
IN-20220609-56886
IN-20220609-56887
IN-20220609-56888
IN-20220609-57413
IN-20220609-57650
IN-20220609-57651
IN-20220609-56895
IN-20220609-56899
IN-20220609-56900
IN-20220609-57415
IN-20220609-57416
IN-20220609-57417
IN-20220609-56902
IN-20220609-56904
IN-20220609-57418
IN-20220609-57419
IN-20220609-56775
IN-20220609-56776
IN-20220609-56779
IN-20220609-56782
IN-20220609-56784
IN-20220609-56786
IN-20220609-56788
IN-20220609-57455
IN-20220609-57457
IN-20220609-57458
IN-20220609-57460
IN-20220609-57806
IN-20220609-57807
IN-20220609-57817
IN-20220609-13053
IN-20220609-45019
IN-20220609-56758
IN-20220609-56763
IN-20220609-56765
IN-20220609-56767
IN-20220609-56769
IN-20220609-56771
IN-20220609-56772
IN-20220609-56773
IN-20220609-56777
IN-20220609-56778
IN-20220609-56780
IN-20220609-56781
IN-20220609-56783
IN-20220609-56785
IN-20220609-56787
IN-20220609-56789
IN-20220609-56792
IN-20220609-56794
IN-20220609-56795
IN-20220609-57456
IN-20220609-57459
IN-20220609-57461
IN-20220609-57464
IN-20220609-57618
IN-20220609-57628
IN-20220609-57630
IN-20220609-57632
IN-20220609-57634
IN-20220609-57635
IN-20220609-57636
IN-20220609-57814
IN-20220609-57815
IN-20220609-57816
IN-20220609-57818
IN-20220609-57819
IN-20220609-57821
IN-20220609-57822
IN-20220609-57860
IN-20220609-56790
IN-20220609-56791
IN-20220609-57462
IN-20220609-57463
IN-20220609-56798
IN-20220609-56801
IN-20220609-56803
IN-20220609-56804
IN-20220609-56805
IN-20220609-57478
IN-20220609-57479
IN-20220609-57846
IN-20220609-57847
IN-20220609-57848
IN-20220609-56807
IN-20220609-56810
IN-20220609-57480
IN-20220609-57483
IN-20220609-56806
IN-20220609-56808
IN-20220609-56809
IN-20220609-57481
IN-20220609-57482
IN-20220609-57849
IN-20220609-56811
IN-20220609-56819
IN-20220609-56841
IN-20220609-57484
IN-20220609-57502
IN-20220609-57714
IN-20220609-58152
IN-20220609-58153
IN-20220609-58154
IN-20220609-58155
IN-20220609-15032
IN-20220609-45115
IN-20220609-56812
IN-20220609-56813
IN-20220609-56814
IN-20220609-56815
IN-20220609-56816
IN-20220609-56817
IN-20220609-56818
IN-20220609-57485
IN-20220609-57486
IN-20220609-57487
IN-20220609-57488
IN-20220609-57850
IN-20220609-57851
IN-20220609-57852
IN-20220608-36694
IN-20220608-37468
IN-20220608-42005
IN-20220608-42009
IN-20220608-42011
IN-20220608-42013
IN-20220608-42016
IN-20220608-47267
IN-20220608-47268
IN-20220608-47269
IN-20220608-47270
IN-20220608-47271
IN-20220608-47272
IN-20220608-47273
IN-20220608-47274
IN-20220608-47275
IN-20220608-47276
IN-20220608-47277
IN-20220608-47279
IN-20220608-47280
IN-20220608-47281
IN-20220608-47282
IN-20220608-47283
IN-20220608-47284
IN-20220608-47286
IN-20220608-47287
IN-20220608-47288
IN-20220608-47289
IN-20220608-47290
IN-20220608-47291
IN-20220608-47292
IN-20220608-47293
IN-20220608-47294
IN-20220608-47295
IN-20220608-47296
IN-20220608-47297
IN-20220608-47299
IN-20220608-47300
IN-20220608-47301
IN-20220608-47302
IN-20220608-47304
IN-20220608-47351
IN-20220608-47352
IN-20220608-47353
IN-20220608-47354
IN-20220608-47355
IN-20220608-47356
IN-20220608-47357
IN-20220608-47358
IN-20220608-47359
IN-20220608-47360
IN-20220608-47361
IN-20220608-47362
IN-20220608-47363
IN-20220608-47364
IN-20220608-47365
IN-20220608-47366
IN-20220608-47367
IN-20220608-47368
IN-20220608-47369
IN-20220608-47370
IN-20220608-47371
IN-20220608-47372
IN-20220608-47373
IN-20220608-47374
IN-20220608-47375
IN-20220608-47376
IN-20220608-47377
IN-20220608-47378
IN-20220608-47379
IN-20220608-47380
IN-20220608-47381
IN-20220608-47382
IN-20220608-47383
IN-20220609-4339
IN-20220609-4385
IN-20220609-5165
IN-20220609-13416
IN-20220609-14589
IN-20220609-28984
IN-20220609-30929
IN-20220609-32870
IN-20220609-53909
IN-20220609-56488
IN-20220609-57500
IN-20220609-56152
IN-20220609-56153
IN-20220609-56154
IN-20220609-56155
IN-20220609-56158
IN-20220609-56159
IN-20220609-56162
IN-20220609-57505
IN-20220609-57506
IN-20220609-57507
IN-20220609-57524
IN-20220609-57525
IN-20220609-57526
IN-20220609-57527
IN-20220609-56264
IN-20220609-56267
IN-20220609-56270
IN-20220609-56274
IN-20220609-57522
IN-20220609-57523
IN-20220609-57600
IN-20220609-57601
IN-20220609-56166
IN-20220609-56172
IN-20220609-56173
IN-20220609-56174
IN-20220609-57528
IN-20220609-57542
IN-20220609-57543
IN-20220609-57544
IN-20220609-56193
IN-20220609-56197
IN-20220609-56201
IN-20220609-56203
IN-20220609-56204
IN-20220609-56206
IN-20220609-56207
IN-20220609-57560
IN-20220609-57743
IN-20220609-57744
IN-20220609-57757
IN-20220609-57758
IN-20220609-57773
IN-20220609-57774
IN-20220609-56209
IN-20220609-56210
IN-20220609-56211
IN-20220609-56212
IN-20220609-57562
IN-20220609-57563
IN-20220609-57564
IN-20220609-57759
IN-20220609-56231
IN-20220609-56238
IN-20220609-56241
IN-20220609-56249
IN-20220609-57580
IN-20220609-57795
IN-20220609-57797
IN-20220609-57836
IN-20220609-3878
IN-20220609-4145
IN-20220609-12832
IN-20220609-13155
IN-20220609-15037
IN-20220609-44998
IN-20220609-45118
IN-20220609-50725
IN-20220609-52790
IN-20220609-56250
IN-20220609-56255
IN-20220609-56258
IN-20220609-56259
IN-20220609-56260
IN-20220609-57581
IN-20220609-57582
IN-20220609-57583
IN-20220609-57598
IN-20220609-57599
IN-20220609-56629
IN-20220609-57597
IN-20220609-56320
IN-20220609-56325
IN-20220609-56334
IN-20220609-56342
IN-20220609-56343
IN-20220609-56345
IN-20220609-57607
IN-20220609-57608
IN-20220609-57654
IN-20220609-57656
IN-20220609-57755
IN-20220609-57756
IN-20220609-56349
IN-20220609-56352
IN-20220609-56353
IN-20220609-56356
IN-20220609-56359
IN-20220609-57609
IN-20220609-57611
IN-20220609-57612
IN-20220609-57613
IN-20220609-57764
IN-20220609-56351
IN-20220609-56371
IN-20220609-56373
IN-20220609-56376
IN-20220609-57610
IN-20220609-57625
IN-20220609-57670
IN-20220609-57671
IN-20220609-56357
IN-20220609-56363
IN-20220609-57614
IN-20220609-57623
IN-20220609-56754
IN-20220609-56755
IN-20220609-56756
IN-20220609-57617
IN-20220609-57832
IN-20220609-57833
IN-20220609-15114
IN-20220609-15116
IN-20220609-45130
IN-20220609-45131
IN-20220609-56277
IN-20220609-56282
IN-20220609-56285
IN-20220609-56288
IN-20220609-56291
IN-20220609-56294
IN-20220609-57620
IN-20220609-57761
IN-20220609-57810
IN-20220609-57811
IN-20220609-57824
IN-20220609-57854
IN-20220609-13242
IN-20220609-45063
IN-20220609-56312
IN-20220609-57621
IN-20220609-56365
IN-20220609-56366
IN-20220609-56367
IN-20220609-56369
IN-20220609-56370
IN-20220609-57624
IN-20220609-57668
IN-20220609-57669
IN-20220609-57678
IN-20220609-57679
IN-20220609-56757
IN-20220609-56759
IN-20220609-56760
IN-20220609-56761
IN-20220609-56762
IN-20220609-56764
IN-20220609-56766
IN-20220609-56768
IN-20220609-56770
IN-20220609-57619
IN-20220609-57629
IN-20220609-57631
IN-20220609-57633
IN-20220609-57834
IN-20220609-57857
IN-20220609-57858
IN-20220609-57859
IN-20220609-57861
IN-20220609-56302
IN-20220609-56305
IN-20220609-56306
IN-20220609-56308
IN-20220609-56309
IN-20220609-57639
IN-20220609-57640
IN-20220609-57871
IN-20220609-57872
IN-20220609-57873
IN-20220609-56727
IN-20220609-56732
IN-20220609-56736
IN-20220609-56737
IN-20220609-56739
IN-20220609-56740
IN-20220609-57644
IN-20220609-57647
IN-20220609-57648
IN-20220609-57782
IN-20220609-57801
IN-20220609-57802
IN-20220609-28346
IN-20220609-28347
IN-20220609-28348
IN-20220609-28349
IN-20220609-28350
IN-20220609-28351
IN-20220609-28352
IN-20220609-28353
IN-20220609-28354
IN-20220609-28355
IN-20220609-28356
IN-20220609-28357
IN-20220609-28358
IN-20220609-56702
IN-20220609-56706
IN-20220609-56707
IN-20220609-56714
IN-20220609-56718
IN-20220609-56724
IN-20220609-56729
IN-20220609-56733
IN-20220609-56738
IN-20220609-57645
IN-20220609-57649
IN-20220609-57776
IN-20220609-57778
IN-20220609-57783
IN-20220609-57862
IN-20220609-57863
IN-20220609-57864
IN-20220609-57869
IN-20220609-60141
IN-20220609-56722
IN-20220609-56725
IN-20220609-56735
IN-20220609-57646
IN-20220609-57779
IN-20220609-57781
IN-20220609-56348
IN-20220609-57657
IN-20220609-56675
IN-20220609-56682
IN-20220609-56685
IN-20220609-56688
IN-20220609-56693
IN-20220609-57661
IN-20220609-57663
IN-20220609-57665
IN-20220609-57667
IN-20220609-57738
IN-20220609-56687
IN-20220609-56691
IN-20220609-57664
IN-20220609-57746
IN-20220609-11385
IN-20220609-14789
IN-20220609-56690
IN-20220609-57666
IN-20220609-56624
IN-20220609-56625
IN-20220609-56626
IN-20220609-57673
IN-20220609-57729
IN-20220609-57730
IN-20220609-56634
IN-20220609-56635
IN-20220609-56636
IN-20220609-56637
IN-20220609-56638
IN-20220609-57674
IN-20220609-57789
IN-20220609-57790
IN-20220609-57791
IN-20220609-57792
IN-20220609-56639
IN-20220609-56642
IN-20220609-56643
IN-20220609-56644
IN-20220609-56645
IN-20220609-57615
IN-20220609-57675
IN-20220609-57677
IN-20220609-57793
IN-20220609-57794
IN-20220609-56640
IN-20220609-57676
IN-20220609-13243
IN-20220609-45064
IN-20220609-50747
IN-20220609-52784
IN-20220609-56378
IN-20220609-56379
IN-20220609-56380
IN-20220609-56381
IN-20220609-57680
IN-20220609-57681
IN-20220609-57682
IN-20220609-57683
IN-20220609-13244
IN-20220609-45065
IN-20220609-56594
IN-20220609-56596
IN-20220609-56610
IN-20220609-56611
IN-20220609-56614
IN-20220609-57684
IN-20220609-57702
IN-20220609-57703
IN-20220609-57722
IN-20220609-57724
IN-20220609-56646
IN-20220609-56648
IN-20220609-56649
IN-20220609-56650
IN-20220609-57687
IN-20220609-57688
IN-20220609-57689
IN-20220609-57798
IN-20220609-56651
IN-20220609-56652
IN-20220609-56653
IN-20220609-57690
IN-20220609-57691
IN-20220609-57692
IN-20220609-56654
IN-20220609-56657
IN-20220609-56659
IN-20220609-57693
IN-20220609-57734
IN-20220609-57799
IN-20220609-12747
IN-20220609-12748
IN-20220609-56662
IN-20220609-57698
IN-20220609-57901
IN-20220609-57906
IN-20220609-57909
IN-20220609-13285
IN-20220609-13286
IN-20220609-13287
IN-20220609-15046
IN-20220609-45124
IN-20220609-57918
IN-20220609-57923
IN-20220609-57924
IN-20220609-57925
IN-20220609-57926
IN-20220609-57928
IN-20220609-57929
IN-20220609-57930
IN-20220609-57931
IN-20220609-57943
IN-20220609-57945
IN-20220609-57946
IN-20220609-57948
IN-20220609-57949
IN-20220609-57950
IN-20220609-56600
IN-20220609-56605
IN-20220609-56606
IN-20220609-56608
IN-20220609-57699
IN-20220609-57700
IN-20220609-57701
IN-20220609-57723
IN-20220609-3883
IN-20220609-4150
IN-20220609-15016
IN-20220609-45110
IN-20220609-57028
IN-20220609-57031
IN-20220609-57033
IN-20220609-57034
IN-20220609-57036
IN-20220609-57707
IN-20220609-57715
IN-20220609-57716
IN-20220609-57717
IN-20220609-57718
IN-20220609-56664
IN-20220609-56665
IN-20220609-56666
IN-20220609-56667
IN-20220609-57709
IN-20220609-57710
IN-20220609-57711
IN-20220609-57712
IN-20220609-57037
IN-20220609-57719
IN-20220609-57039
IN-20220609-57041
IN-20220609-57720
IN-20220609-57721
IN-20220609-56186
IN-20220609-56198
IN-20220609-57725
IN-20220609-58529
IN-20220609-58532
IN-20220609-58534
IN-20220609-58537
IN-20220609-56627
IN-20220609-57731
IN-20220609-56628
IN-20220609-57733
IN-20220609-56676
IN-20220609-57745
IN-20220609-56699
IN-20220609-57749
IN-20220609-3799
IN-20220609-4066
IN-20220609-15061
IN-20220609-45125
IN-20220609-57958
IN-20220609-57959
IN-20220609-57960
IN-20220609-57961
IN-20220609-57962
IN-20220609-57963
IN-20220609-57964
IN-20220609-57965
IN-20220609-56347
IN-20220609-57762
IN-20220609-13301
IN-20220609-56709
IN-20220609-56716
IN-20220609-57767
IN-20220609-57777
IN-20220609-56712
IN-20220609-57768
IN-20220609-14729
IN-20220609-56713
IN-20220609-57769
IN-20220609-15042
IN-20220609-45121
IN-20220609-56175
IN-20220609-56184
IN-20220609-57545
IN-20220609-57770
IN-20220609-56723
IN-20220609-57780
IN-20220609-58000
IN-20220609-58001
IN-20220609-58002
IN-20220609-58003
IN-20220609-58004
IN-20220609-58005
IN-20220609-58006
IN-20220609-58007
IN-20220609-58008
IN-20220609-58009
IN-20220609-58010
IN-20220609-58011
IN-20220609-58012
IN-20220609-58013
IN-20220609-12936
IN-20220609-45006
IN-20220609-58014
IN-20220609-58015
IN-20220609-58016
IN-20220609-58017
IN-20220609-58018
IN-20220609-58019
IN-20220609-56630
IN-20220609-56631
IN-20220609-56632
IN-20220609-56633
IN-20220609-57785
IN-20220609-57786
IN-20220609-57787
IN-20220609-57788
IN-20220609-58021
IN-20220609-58022
IN-20220609-58023
IN-20220609-58024
IN-20220609-56741
IN-20220609-57803
IN-20220609-56745
IN-20220609-57804
IN-20220609-13052
IN-20220609-28361
IN-20220609-28362
IN-20220609-28363
IN-20220609-28364
IN-20220609-28365
IN-20220609-45018
IN-20220609-52544
IN-20220609-52902
IN-20220609-56743
IN-20220609-56746
IN-20220609-56747
IN-20220609-56748
IN-20220609-56749
IN-20220609-56750
IN-20220609-56751
IN-20220609-56752
IN-20220609-56753
IN-20220609-57616
IN-20220609-57805
IN-20220609-57825
IN-20220609-57826
IN-20220609-57827
IN-20220609-57828
IN-20220609-57829
IN-20220609-57830
IN-20220609-57831
IN-20220609-13054
IN-20220609-56796
IN-20220609-56797
IN-20220609-56799
IN-20220609-56800
IN-20220609-56802
IN-20220609-57823
IN-20220609-57842
IN-20220609-57843
IN-20220609-57844
IN-20220609-57845
IN-20220609-56671
IN-20220609-57840
IN-20220609-12716
IN-20220609-56298
IN-20220609-57856
IN-20220609-56710
IN-20220609-57865
IN-20220609-56720
IN-20220609-57868
IN-20220609-12769
IN-20220609-58140
IN-20220609-15036
IN-20220609-45117
IN-20220609-58141
IN-20220609-13057
IN-20220609-45022
IN-20220609-52542
IN-20220609-52901
IN-20220609-58163
IN-20220609-58165
IN-20220609-58166
IN-20220609-58167
IN-20220609-58168
IN-20220609-58172
IN-20220609-58213
IN-20220609-58215
IN-20220609-58217
IN-20220609-58222
IN-20220609-58232
IN-20220609-58238
IN-20220609-58240
IN-20220609-58242
IN-20220609-58243
IN-20220609-58248
IN-20220609-58249
IN-20220609-11974
IN-20220609-11977
IN-20220609-11981
IN-20220609-11983
IN-20220609-11985
IN-20220609-11986
IN-20220609-11987
IN-20220609-44996
IN-20220609-58320
IN-20220609-58321
IN-20220609-58322
IN-20220609-58301
IN-20220609-58306
IN-20220609-58309
IN-20220609-58311
IN-20220609-58313
IN-20220609-58314
IN-20220609-58315
IN-20220609-13058
IN-20220609-45023
IN-20220609-58251
IN-20220609-58255
IN-20220609-58257
IN-20220609-58258
IN-20220609-58262
IN-20220609-58265
IN-20220609-58266
IN-20220609-58268
IN-20220609-58272
IN-20220609-58274
IN-20220609-58277
IN-20220609-58278
IN-20220609-58279
IN-20220609-58286
IN-20220609-58289
IN-20220609-58293
IN-20220609-58294
IN-20220609-58299
IN-20220609-58142
IN-20220609-15033
IN-20220609-45116
IN-20220609-50726
IN-20220609-52761
IN-20220609-58143
IN-20220609-58144
IN-20220609-12823
IN-20220609-58145
IN-20220609-58146
IN-20220609-58147
IN-20220609-58148
IN-20220609-13047
IN-20220609-45014
IN-20220609-57879
IN-20220609-57880
IN-20220609-57882
IN-20220609-57889
IN-20220609-57890
IN-20220609-57891
IN-20220609-57892
IN-20220609-57894
IN-20220609-57895
IN-20220609-57897
IN-20220609-57967
IN-20220609-12720
IN-20220609-57968
IN-20220609-57969
IN-20220609-57970
IN-20220609-57971
IN-20220609-57972
IN-20220609-57973
IN-20220609-57974
IN-20220609-57975
IN-20220609-57976
IN-20220609-57977
IN-20220609-58053
IN-20220609-58851
IN-20220609-13068
IN-20220609-13069
IN-20220609-45031
IN-20220609-45032
IN-20220609-58626
IN-20220609-58632
IN-20220609-58635
IN-20220609-58639
IN-20220609-58642
IN-20220609-58645
IN-20220609-58647
IN-20220609-58650
IN-20220609-13067
IN-20220609-45030
IN-20220609-58653
IN-20220609-58668
IN-20220609-58672
IN-20220609-58673
IN-20220609-15101
IN-20220609-45129
IN-20220609-58618
IN-20220609-58619
IN-20220609-58623
IN-20220609-13063
IN-20220609-45027
IN-20220609-58538
IN-20220609-58539
IN-20220609-58540
IN-20220609-58541
IN-20220609-58543
IN-20220609-58544
IN-20220609-58545
IN-20220609-58546
IN-20220609-13064
IN-20220609-45028
IN-20220609-58551
IN-20220609-58566
IN-20220609-58570
IN-20220609-58572
IN-20220609-58582
IN-20220609-58586
IN-20220609-58590
IN-20220609-58511
IN-20220609-58513
IN-20220609-58514
IN-20220609-58515
IN-20220609-58516
IN-20220609-58518
IN-20220609-58519
IN-20220609-58520
IN-20220609-58523
IN-20220609-58525
IN-20220609-58527
IN-20220609-58711
IN-20220609-58714
IN-20220609-58716
IN-20220609-58718
IN-20220609-58720
IN-20220609-58723
IN-20220609-13071
IN-20220609-45034
IN-20220609-52540
IN-20220609-52905
IN-20220609-58730
IN-20220609-58738
IN-20220609-58739
IN-20220609-58741
IN-20220609-58743
IN-20220609-58744
IN-20220609-58745
IN-20220609-58748
IN-20220609-58749
IN-20220609-58752
IN-20220609-58754
IN-20220609-58757
IN-20220609-3836
IN-20220609-4103
IN-20220609-13048
IN-20220609-15120
IN-20220609-57875
IN-20220609-57877
IN-20220609-58149
IN-20220609-58150
IN-20220609-58151
IN-20220609-58171
IN-20220609-58173
IN-20220609-58174
IN-20220609-13061
IN-20220609-45025
IN-20220609-52541
IN-20220609-52906
IN-20220609-58510
IN-20220609-58512
IN-20220609-58517
IN-20220609-58522
IN-20220609-58524
IN-20220609-58536
IN-20220609-58929
IN-20220609-58932
IN-20220609-58934
IN-20220609-58936
IN-20220609-58942
IN-20220609-58948
IN-20220609-58950
IN-20220609-12712
IN-20220609-12713
IN-20220609-12714
IN-20220609-58782
IN-20220609-58784
IN-20220609-58927
IN-20220609-58928
IN-20220609-58780
IN-20220609-14984
IN-20220609-45099
IN-20220609-58790
IN-20220609-58792
IN-20220609-58793
IN-20220609-58799
IN-20220609-58806
IN-20220609-58825
IN-20220609-58827
IN-20220609-58834
IN-20220609-13066
IN-20220609-58598
IN-20220609-13070
IN-20220609-45033
IN-20220609-58726
IN-20220609-58727
IN-20220609-58728
IN-20220609-58729
IN-20220609-58607
IN-20220609-58610
IN-20220609-58613
IN-20220609-58615
IN-20220609-3816
IN-20220609-4083
IN-20220609-58478
IN-20220609-58479
IN-20220609-58480
IN-20220609-58481
IN-20220609-58482
IN-20220609-58483
IN-20220609-58485
IN-20220609-58487
IN-20220609-58490
IN-20220609-12703
IN-20220609-58839
IN-20220609-58843
IN-20220609-58845
IN-20220609-58847
IN-20220609-58850
IN-20220609-58853
IN-20220609-58492
IN-20220609-13055
IN-20220609-45020
IN-20220609-52543
IN-20220609-52907
IN-20220609-58156
IN-20220609-58157
IN-20220609-13056
IN-20220609-45021
IN-20220609-58158
IN-20220609-58493
IN-20220609-3819
IN-20220609-4086
IN-20220609-58159
IN-20220609-58160
IN-20220609-58161
IN-20220609-58162
IN-20220609-57966
IN-20220609-59206
IN-20220609-59207
IN-20220609-59208
IN-20220609-59210
IN-20220609-59211
IN-20220609-59212
IN-20220609-59214
IN-20220609-59215
IN-20220609-59216
IN-20220609-59217
IN-20220609-59282
IN-20220609-59283
IN-20220609-59285
IN-20220609-58994
IN-20220609-58995
IN-20220609-58996
IN-20220609-58997
IN-20220609-58998
IN-20220609-14971
IN-20220609-14973
IN-20220609-14975
IN-20220609-45091
IN-20220609-45092
IN-20220609-45093
IN-20220609-58999
IN-20220609-59000
IN-20220609-59001
IN-20220609-59002
IN-20220609-59003
IN-20220609-59004
IN-20220609-59005
IN-20220609-14970
IN-20220609-59009
IN-20220609-59010
IN-20220609-59011
IN-20220609-59012
IN-20220609-59013
IN-20220609-59014
IN-20220609-59017
IN-20220609-13185
IN-20220609-13186
IN-20220609-13187
IN-20220609-59088
IN-20220609-59024
IN-20220609-59034
IN-20220609-59045
IN-20220609-58980
IN-20220609-58981
IN-20220609-58982
IN-20220609-58983
IN-20220609-58984
IN-20220609-58985
IN-20220609-58986
IN-20220609-58987
IN-20220609-58988
IN-20220609-58989
IN-20220609-58990
IN-20220609-58991
IN-20220609-58992
IN-20220609-58993
IN-20220609-59223
IN-20220609-59225
IN-20220609-59227
IN-20220609-59228
IN-20220609-59229
IN-20220609-59230
IN-20220609-59231
IN-20220609-59232
IN-20220609-3871
IN-20220609-4138
IN-20220609-4614
IN-20220609-4615
IN-20220609-4616
IN-20220609-4617
IN-20220609-4618
IN-20220609-4619
IN-20220609-4620
IN-20220609-4621
IN-20220609-5702
IN-20220609-5703
IN-20220609-15212
IN-20220609-29518
IN-20220609-29519
IN-20220609-31463
IN-20220609-31464
IN-20220609-33404
IN-20220609-33405
IN-20220609-44955
IN-20220609-44956
IN-20220609-44957
IN-20220609-44958
IN-20220609-44959
IN-20220609-44960
IN-20220609-44961
IN-20220609-44962
IN-20220609-45137
IN-20220609-58964
IN-20220609-58966
IN-20220609-58968
IN-20220609-58970
IN-20220609-58971
IN-20220609-58972
IN-20220609-58973
IN-20220609-58974
IN-20220609-58976
IN-20220609-59293
IN-20220609-59020
IN-20220609-59023
IN-20220609-59026
IN-20220609-59027
IN-20220609-59028
IN-20220609-59029
IN-20220609-59030
IN-20220609-59280
IN-20220609-59281
IN-20220609-59292
IN-20220609-59294
IN-20220609-59295
IN-20220609-59296
IN-20220609-59297
IN-20220609-59274
IN-20220609-59244
IN-20220609-59246
IN-20220609-59248
IN-20220609-59250
IN-20220609-59253
IN-20220609-59254
IN-20220609-59256
IN-20220609-59257
IN-20220609-59258
IN-20220609-59260
IN-20220609-59261
IN-20220609-59262
IN-20220609-59265
IN-20220609-59031
IN-20220609-59032
IN-20220609-59033
IN-20220609-59035
IN-20220609-59036
IN-20220609-59037
IN-20220609-12904
IN-20220609-59038
IN-20220609-59039
IN-20220609-59040
IN-20220609-59041
IN-20220609-59042
IN-20220609-59043
IN-20220609-59044
IN-20220609-4493
IN-20220609-4494
IN-20220609-4495
IN-20220609-13240
IN-20220609-28366
IN-20220609-28367
IN-20220609-28368
IN-20220609-28369
IN-20220609-28370
IN-20220609-44921
IN-20220609-44922
IN-20220609-45061
IN-20220609-58977
IN-20220609-59298
IN-20220609-59299
IN-20220609-59300
IN-20220609-59301
IN-20220609-59302
IN-20220609-14990
IN-20220609-45102
IN-20220609-50735
IN-20220609-52765
IN-20220609-58951
IN-20220609-58960
IN-20220609-58961
IN-20220609-58962
IN-20220609-59006
IN-20220609-59007
IN-20220609-59008
IN-20220609-4496
IN-20220609-4497
IN-20220609-4498
IN-20220609-4499
IN-20220609-13127
IN-20220609-13128
IN-20220609-13129
IN-20220609-13130
IN-20220609-44923
IN-20220609-44924
IN-20220609-44925
IN-20220609-50869
IN-20220609-52652
IN-20220609-52791
IN-20220609-52851
IN-20220609-53172
IN-20220609-53215
IN-20220609-59057
IN-20220609-59059
IN-20220609-59060
IN-20220609-59061
IN-20220609-59062
IN-20220609-59063
IN-20220609-59289
IN-20220609-59290
IN-20220609-59233
IN-20220609-59234
IN-20220609-59235
IN-20220609-59236
IN-20220609-59237
IN-20220609-59238
IN-20220609-59239
IN-20220609-59291
IN-20220609-59279
\ No newline at end of file
comm_crawler/src/main/resources/application.properties
浏览文件 @
af30a040
...
@@ -5,9 +5,9 @@ spring.profiles.active:=dev
...
@@ -5,9 +5,9 @@ spring.profiles.active:=dev
server.port
=
8081
server.port
=
8081
spring.http.encoding.force
=
true
#
spring.http.encoding.force=true
spring.http.encoding.charset
=
UTF-8
#
spring.http.encoding.charset=UTF-8
spring.http.encoding.enabled
=
true
#
spring.http.encoding.enabled=true
spring.thymeleaf.cache
=
false
spring.thymeleaf.cache
=
false
spring.thymeleaf.enabled
=
false
spring.thymeleaf.enabled
=
false
...
@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000
...
@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000
logging.level.root
=
info
logging.level.root
=
info
logging.level.org.springframework.web
=
info
logging.level.org.springframework.web
=
info
logging
levelorg
hibernate
=
info
logging
.level.org.
hibernate
=
info
logging.config
=
classpath:logback-spring.xml
logging.config
=
classpath:logback-spring.xml
kafka.consumer.task
=
0 0/2 * * * ?
kafka.consumer.task
=
0 0/2 * * * ?
...
...
comm_crawler/src/main/resources/constants.properties
浏览文件 @
af30a040
...
@@ -35,7 +35,7 @@ PROXYID=1
...
@@ -35,7 +35,7 @@ PROXYID=1
#线程池大小
#线程池大小
THREAD_SIZE
=
1
THREAD_SIZE
=
1
#
#
CHROMEDRIVE
=
D
:
\\
chrome
\\
chromedriver.exe
CHROMEDRIVE
=
E
:
\\
chrome
\\
chromedriver.exe
CHROMEBIN
=
C:
\\
Program Files
\\
Google
\\
Chrome
\\
Application
\\
chrome.exe
CHROMEBIN
=
C:
\\
Program Files
\\
Google
\\
Chrome
\\
Application
\\
chrome.exe
USER_DATA_DIR
=
C:
\\
Users
\\
WIN10
\\
AppData
\\
Local
\\
Google
\\
Chrome
\\
User Data
\\
Default
USER_DATA_DIR
=
C:
\\
Users
\\
WIN10
\\
AppData
\\
Local
\\
Google
\\
Chrome
\\
User Data
\\
Default
...
@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
...
@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
redis.host
=
114.116.26.150
redis.host
=
114.116.26.150
redis.port
=
6379
redis.port
=
6379
redis.pass
=
zzsn9988
redis.pass
=
zzsn9988
#redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
#redis.host=8.130.30.33
#redis.host=8.130.30.33
#redis.port=9010
#redis.port=9010
#redis.pass=wxadS&jklim
#redis.pass=wxadS&jklim
...
@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
...
@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
HUAWEICLOUD_AK
=
VEHN7D0TJ9316H8AHCAV
HUAWEICLOUD_AK
=
VEHN7D0TJ9316H8AHCAV
HUAWEICLOUD_SK
=
heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
HUAWEICLOUD_SK
=
heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
IMGPATH
=
E:
\\
chrome
\\
img
\\
shot.png
#
IMGPATH= E:\\chrome\\img\\shot.png
IMGPATH
=
E:
\\
ideaWorkerspace
\\
meta_crawler
\\
comm_crawler
\\
src
\\
main
\\
resources
\\
aa.txt
selenium.driver.cache
=
comm_selenium_driver_cache_1
...
...
comm_crawler/src/main/resources/redis.properties
浏览文件 @
af30a040
# Redis settings
# Redis settings
redis.host
=
1
27.0.0.1
redis.host
=
1
14.115.236.206
redis.port
=
6379
redis.port
=
6379
redis.pass
=
xxxxxx
redis.pass
=
clbzzsn
redis.timeout
=
10000
redis.timeout
=
10000
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
#redis.timeout=10000
redis.maxIdle
=
300
redis.maxIdle
=
300
redis.maxTotal
=
600
redis.maxTotal
=
600
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论