刘伟刚 / meta_crawler / Commits / a18c008d

Commit a18c008d
Authored Oct 11, 2022 by liuweigang
Commit message: 采集代码更新9 (crawler code update 9)
Parent: 83f00b0f
Showing 56 changed files with 1,055 additions and 1,734 deletions:
baidu_search/pom.xml  +4 -4
baidu_search/src/main/java/com/zzsn/CrawlerMateSearchApplication.java  +23 -27
baidu_search/src/main/java/com/zzsn/CrawlerMateSearchApplicationbak.java  +156 -0
baidu_search/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java  +52 -51
baidu_search/src/main/java/com/zzsn/entity/BadSiteMsg.java  +20 -0
baidu_search/src/main/java/com/zzsn/job/KafkaConsumerJob.java  +40 -31
baidu_search/src/main/java/com/zzsn/search/MetaBaiduSearchThread.java  +21 -0
baidu_search/src/main/java/com/zzsn/search/baiduThread/DetailBaiduSearchThread.java  +1 -1
baidu_search/src/main/java/com/zzsn/search/baiduThread/WebBaiduSearchThread.java  +1 -1
baidu_search/src/main/java/com/zzsn/utility/index/Constants.java  +9 -8
baidu_search/src/main/java/com/zzsn/utils/GrabUtil.java  +0 -205
baidu_search/src/main/java/com/zzsn/webMagic/BaiduContentThread.java  +0 -91
baidu_search/src/main/java/com/zzsn/webMagic/BaiduKeyWordProcessor.java  +0 -121
baidu_search/src/main/java/com/zzsn/webMagic/BaiduTask.java  +0 -136
baidu_search/src/main/java/com/zzsn/webMagic/BaiduTxtProcessor.java  +0 -145
baidu_search/src/main/java/com/zzsn/webMagic/Expaler.java  +0 -62
baidu_search/src/main/java/com/zzsn/webMagic/KafkaConsumers.java  +0 -110
...arch/src/main/java/com/zzsn/webMagic/LinksReadThread.java  +0 -104
...java/com/zzsn/webMagic/downloader/SeleniumDownloader.java  +0 -154
...main/java/com/zzsn/webMagic/downloader/WebDriverPool.java  +0 -253
baidu_search/src/main/resources/constants.properties  +10 -6
...wler/src/main/java/com/zzsn/CrawlerStaticApplication.java  +0 -0
...c/main/java/com/zzsn/api/CrawlerCommVerifyController.java  +1 -1
...ler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java  +32 -12
...ain/java/com/zzsn/crawler/paser/WebContentPaserByCss.java  +0 -0
...va/com/zzsn/crawler/paser/WebContentPaserByIntellige.java  +0 -0
...va/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java  +1 -1
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java  +0 -0
...n/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java  +0 -0
...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java  +34 -4
...r/src/main/java/com/zzsn/crawlerOther/ArticleCrawler.java  +73 -73
...ava/com/zzsn/crawlerOther/paser/WebContentPaserByCss.java  +0 -0
...com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java  +0 -0
...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java  +0 -0
...awler/src/main/java/com/zzsn/download/HtmlPageParser.java  +54 -0
...er/src/main/java/com/zzsn/download/PageBuilderParser.java  +10 -65
...ler/src/main/java/com/zzsn/download/PageConnectioner.java  +0 -0
...awler/src/main/java/com/zzsn/download/PageDownloader.java  +0 -0
comm_crawler/src/main/java/com/zzsn/download/PageGet.java  +7 -9
comm_crawler/src/main/java/com/zzsn/download/PagePost.java  +4 -9
..._crawler/src/main/java/com/zzsn/download/RequestUtil.java  +16 -7
comm_crawler/src/main/java/com/zzsn/download/StringUtil.java  +387 -0
comm_crawler/src/main/java/com/zzsn/download/Test.java  +13 -0
comm_crawler/src/main/java/com/zzsn/entity/BadSiteMsg.java  +7 -11
..._crawler/src/main/java/com/zzsn/entity/BadSiteMsgBak.java  +24 -0
..._crawler/src/main/java/com/zzsn/generation/Constants.java  +2 -1
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java  +7 -2
...crawler/src/main/java/com/zzsn/test/HttpClientTester.java  +1 -1
comm_crawler/src/main/java/com/zzsn/test/WebClientTest.java  +1 -1
comm_crawler/src/main/java/com/zzsn/test/test.java  +9 -1
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java  +15 -9
..._crawler/src/main/java/com/zzsn/util/PublishDateUtil.java  +3 -1
comm_crawler/src/main/resources/aa.txt  +0 -0
comm_crawler/src/main/resources/application.properties  +2 -2
comm_crawler/src/main/resources/constants.properties  +10 -9
comm_crawler/src/main/resources/redis.properties  +5 -5
baidu_search/pom.xml @ a18c008d

This change comments out the webmagic-core and phantomjsdriver dependencies:

@@ -253,7 +253,7 @@
         </dependency>
         <!--WebMagic crawler framework-->
-        <dependency>
+        <!--<dependency>
             <groupId>us.codecraft</groupId>
             <artifactId>webmagic-core</artifactId>
             <version>0.7.5</version>
@@ -274,7 +274,7 @@
                     <artifactId>slf4j-log4j12</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
+        -->
         <!-- The local package below is referenced after the webMagic source package was modified -->
         <!-- <dependency>-->
         <!--     <groupId>us.codecraft</groupId>-->
@@ -307,11 +307,11 @@
         <!-- </exclusions>-->
         <!-- </dependency>-->
         <!--<!– <!– The local package above is referenced after the webMagic source package was modified –>–>-->
-        <dependency>
+        <!--<dependency>
             <groupId>com.github.detro</groupId>
             <artifactId>phantomjsdriver</artifactId>
             <version>1.2.0</version>
         </dependency>
+        -->
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
...
baidu_search/src/main/java/com/zzsn/CrawlerMateSearchApplication.java @ a18c008d

@@ -8,7 +8,6 @@ import com.zzsn.search.MetaBaiduSearchThread;
 import com.zzsn.search.entity.KeywordMsg;
 import com.zzsn.search.util.SpringContextUtil;
 import com.zzsn.utility.index.Constants;
-import com.zzsn.webMagic.*;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.kafka.clients.CommonClientConfigs;
 import org.apache.kafka.clients.consumer.ConsumerConfig;
@@ -35,39 +34,36 @@ import java.util.Properties;
 @Slf4j
 @SpringBootApplication(scanBasePackages = "com.zzsn")
 public class CrawlerMateSearchApplication extends SpringBootServletInitializer {
-    @Override
-    protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
-        return builder.sources(CrawlerMateSearchApplication.class);
-    }
+//    @Override
+//    protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
+//        return builder.sources(CrawlerMateSearchApplication.class);
+//    }
     public static void main(String[] args) {
         SpringApplication.run(CrawlerMateSearchApplication.class, args);
     }
-    /**
-     * Crawl using the webMagic framework
-     */
-    public void webMagic(){
-        new LinksReadThread().start();
-        new BaiduContentThread().start();
-    }
+//    @Override
+//    public void run(String... args) throws Exception {
+////        try {
+////            consumerPartition();
+////        }catch (Exception e){
+////            consumerPartition();
+////        }
+////        try {
+////            loadSiteMsgLoc();
+////        }catch (Exception e){
+////            loadSiteMsgLoc();
+////        }
+////        try {
+////            consumerPartition();
+////        }catch (Exception e){
+////            consumerPartition();
+////        }
+//    }
+    /**
+     * Crawl with the old method
+     * @throws Exception
+     */
+    public void ordinary() {
+        System.out.println("——————++++++++++++——————===");
+        try {
+            consumerPartition();
+        } catch (Exception e) {
+            consumerPartition();
+        }
+        loadSiteMsgLoc();
+    }
     public void consumerPartition(){
         log.info("定时获取mq消息");
...
baidu_search/src/main/java/com/zzsn/CrawlerMateSearchApplicationbak.java (new file, 0 → 100644) @ a18c008d

package com.zzsn;

import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplicationbak extends SpringBootServletInitializer implements CommandLineRunner {
//    @Override
//    protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
//        return builder.sources(CrawlerMateSearchApplication.class);
//    }
    public static void main(String[] args) {
        SpringApplication.run(CrawlerMateSearchApplicationbak.class, args);
    }

    @Override
    public void run(String... args) throws Exception {
//        try {
//            consumerPartition();
//        }catch (Exception e){
//            consumerPartition();
//        }
//        try {
//            loadSiteMsgLoc();
//        }catch (Exception e){
//            loadSiteMsgLoc();
//        }
//        try {
//            consumerPartition();
//        }catch (Exception e){
//            consumerPartition();
//        }
    }

    public void consumerPartition(){
        log.info("定时获取mq消息");
        // 1. Create the consumer
        KafkaConsumer<String, String> consumer = createConsumer();
        ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
        String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
        String[] partitions = kafkaConsumerPartition.split(",");
        for (int i = 0; i < partitions.length; i++) {
            topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
        }
        consumer.assign(topicPartitions);
        while (true){
            // The consumer is a long-running program that keeps polling Kafka for data;
            // calling consumer.wakeup() from another thread exits the loop.
            // poll(0) waits 0 ms for the broker to return data: the timeout bounds how long
            // poll may block before returning, whether or not data is available.
            ConsumerRecords<String, String> records = consumer.poll(0);
            for (ConsumerRecord record : records){
                try {
                    KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
                    log.info("关键词解析keywordMsg正常");
                    consumer.commitSync();
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                    log.info("关键词请求结束++++");
                } catch (Exception e){
                    log.info("关键词解析异常: " + record.value().toString());
                }
            }
        }
    }

    public void loadSiteMsgLoc() {
        String filepath = Constants.META_SEARCH_KEYWORDPATH;
        System.out.println(filepath);
//        String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
        try {
            File f = new File(filepath);
            List<String> allLines = FileUtil.getFileLines(f, "utf-8");
            System.out.println(allLines.size());
            for (String keysite : allLines) {
                try {
                    String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::" + keysite);
                    System.out.println("——————++++++++++++——————===");
                    String subvalue = value.replace(value.substring(value.indexOf("startTime"), value.indexOf("searchEngines")), "");
                    KeywordMsg keywordMsg = new Gson().fromJson(subvalue, KeywordMsg.class);
                    log.info("关键词解析keywordMsg正常");
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                } catch (Exception e){
                    continue;
                }
            }
        } catch (Exception e){
            e.getMessage();
        }
    }

    public void loadloc(){
        String key = "{\n" +
                "    \"id\": \"2022090522\",\n" +
                "    \"wordsCode\": \"KW-20220602-0003\",\n" +
                "    \"wordsName\": \"2022世界机器人大会\",\n" +
                "    \"keyWord\": \"2022世界机器人大会\",\n" +
                "    \"exclusionWord\": null,\n" +
                "    \"status\": \"1\",\n" +
                "    \"subjectId\": null,\n" +
                "    \"subjectIds\": null,\n" +
                "    \"startTime\": null,\n" +
                "    \"endTime\": null \n" +
                "}";
        try {
            KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
            MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
            metaSearchThread.keywordMsg = keywordMsg;
            metaSearchThread.crawler();
        } catch (Exception e){
            e.printStackTrace();
        }
    }

    private static KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
        // How offsets are read when none are committed: latest / earliest
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // Max poll interval set to 1 hour
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }
}
baidu_search/src/main/java/com/zzsn/conf/ThreadExecutorConfig.java @ a18c008d

All three named executor beans are commented out line-for-line; the removed lines are the same code without the leading "//". New version of the hunk:

@@ -87,56 +87,56 @@ public class ThreadExecutorConfig {
    }

//    @Bean(value = "asyncexecutorService")
//    public Executor executorService() {
//        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
//        executor.setCorePoolSize(1); // minimum number of threads the pool keeps
//        executor.setMaxPoolSize(1); // maximum number of threads the pool keeps
//        executor.setQueueCapacity(100000); // task queue capacity
//        executor.setThreadNamePrefix("selenium-");
//        /**
//         * Rejection policy: what to do with new tasks once the pool reaches max size.
//         * CALLER_RUNS: run the task on the calling thread instead of a pool thread.
//         */
//        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
//        executor.setKeepAliveSeconds(60*60); // allowed idle time
//        executor.initialize();
//        return executor;
//    }

//    @Bean(value = "asyncexecutorServiceWebBaidu")
//    public Executor executorServiceWebBaidu() {
//        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
//        executor.setCorePoolSize(5); // minimum number of threads the pool keeps
//        executor.setMaxPoolSize(5); // maximum number of threads the pool keeps
//        executor.setQueueCapacity(100000); // task queue capacity
//        executor.setThreadNamePrefix("selenium-");
//        /**
//         * Rejection policy: what to do with new tasks once the pool reaches max size.
//         * CALLER_RUNS: run the task on the calling thread instead of a pool thread.
//         */
//        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
//        executor.setKeepAliveSeconds(60*60); // allowed idle time
//        executor.initialize();
//        return executor;
//    }

//    @Bean(value = "asyncexecutorServiceDetailUrl")
//    public Executor asyncexecutorServiceDetailUrl() {
//        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
//        executor.setCorePoolSize(5); // minimum number of threads the pool keeps
//        executor.setMaxPoolSize(5); // maximum number of threads the pool keeps
//        executor.setQueueCapacity(100000); // task queue capacity
//        executor.setThreadNamePrefix("selenium-");
//        /**
//         * Rejection policy: what to do with new tasks once the pool reaches max size.
//         * CALLER_RUNS: run the task on the calling thread instead of a pool thread.
//         */
//        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
//        executor.setKeepAliveSeconds(60*60); // allowed idle time
//        executor.initialize();
//        return executor;
//    }
}
\ No newline at end of file
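Before this commit, these named executors were consumed through method-level @Async annotations (the same annotations that DetailBaiduSearchThread and WebBaiduSearchThread comment out later in this diff). A minimal sketch of that wiring, where SearchDispatcher is a hypothetical class for illustration and @EnableAsync is assumed to be configured elsewhere in the project:

import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;

@Service
public class SearchDispatcher { // hypothetical class, illustration only
    // Spring submits this method to the ThreadPoolTaskExecutor bean registered
    // under the name "asyncexecutorServiceWebBaidu": the caller returns
    // immediately and the work runs on the pool's threads.
    @Async("asyncexecutorServiceWebBaidu")
    public void dispatch(String keyword) {
        // long-running Baidu search work would run here
    }
}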
baidu_search/src/main/java/com/zzsn/entity/BadSiteMsg.java (new file, 0 → 100644) @ a18c008d

package com.zzsn.entity;

import lombok.Data;

import java.util.Date;

@Data
public class BadSiteMsg {
    /** Primary key */
    private String id;
    /** Info source code */
    private String infoSourceCode;
    /** Crawler type (1: dynamic, 2: static, 3: Fortune 500, 4: think tank, 5: Baidu) */
    private String crawlerType;
    /** Partition id(s), comma-separated */
    private String partition;
    /** Consumption time */
    private Date consumerDate;
}
baidu_search/src/main/java/com/zzsn/job/KafkaConsumerJob.java @ a18c008d

@@ -52,8 +52,10 @@ public class KafkaConsumerJob {
     public static final ExecutorService poolExecuter = new BlockThreadPoolExecute(5
             , 10, 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
+    /** fixedDelay: runs again one minute after the previous execution finished */
     // @Scheduled(cron = "0 0/2 * * * ?")
+    @Async("asyncTaskExecutor")
     @Scheduled(fixedDelay = 60000)
-    // @Async("asyncTaskExecutor")
     public void consumer(){
         log.info("定时获取mq消息");
         // 1. Create the consumer

@@ -67,10 +69,14 @@ public class KafkaConsumerJob {
             ConsumerRecords<String, String> records = consumer.poll(0);
             consumer.commitSync();
             for (ConsumerRecord record : records){
-                KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
-                MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
-                metaSearchThread.keywordMsg = keywordMsg;
-                metaSearchThread.crawler();
+                try {
+                    KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
+                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
+                    metaSearchThread.keywordMsg = keywordMsg;
+                    metaSearchThread.crawler();
+                } catch (Exception e){
+                    continue;
+                }
             }
         }
     } catch (Exception e){

@@ -81,37 +87,40 @@ public class KafkaConsumerJob {
consumerPartition() is rewrapped: the whole body now sits in an outer try/catch that logs on failure, and the inner catch's consumer re-creation is commented out. New version of the hunk:

     // @Scheduled(cron = "0 0/2 * * * ?")
     @Async("asyncTaskExecutor")
     @Scheduled(fixedDelay = 300000)
     public void consumerPartition(){
         try {
             log.info("定时获取mq消息");
             // 1. Create the consumer
             KafkaConsumer<String, String> consumer = createConsumer();
             ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
             String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
             String[] partitions = kafkaConsumerPartition.split(",");
             for (int i = 0; i < partitions.length; i++) {
                 topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
             }
             consumer.assign(topicPartitions);
             try {
                 while (true) {
                     // The consumer is a long-running program that keeps polling Kafka for data;
                     // calling consumer.wakeup() from another thread exits the loop.
                     // poll(0) waits 0 ms for the broker: the timeout bounds how long poll may
                     // block before returning, whether or not data is available.
                     ConsumerRecords<String, String> records = consumer.poll(0);
                     for (ConsumerRecord record : records) {
                         KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
                         consumer.commitSync();
                         MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                         metaSearchThread.keywordMsg = keywordMsg;
                         metaSearchThread.crawler();
                     }
                 }
             } catch (Exception e) {
//                consumer = createConsumer();
//                consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
             }
         } catch (Exception e){
             log.info("kafka调用信息失败");
         }
     }

     // @Scheduled(initialDelay = 1000, fixedRate = Long.MAX_VALUE)
     @Async("asyncTaskExecutor")
...
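A side note on the polling calls: consumer.poll(0) returns immediately and has been deprecated since Kafka clients 2.0 in favor of the Duration overload. A minimal sketch of the replacement call, assuming a client version that provides poll(Duration):

import java.time.Duration;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class PollSketch { // illustration only
    // Waits up to 100 ms for records instead of returning immediately,
    // which avoids busy-spinning inside a while(true) polling loop.
    static void pollOnce(KafkaConsumer<String, String> consumer) {
        ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
        for (ConsumerRecord<String, String> record : records) {
            // handle record.value() as the job classes above do
        }
    }
}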
baidu_search/src/main/java/com/zzsn/search/MetaBaiduSearchThread.java @ a18c008d

@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.zzsn.cache.JedisUtil;
 import com.zzsn.cache.MemcachedUtils;
 import com.zzsn.docinfo.DocInfo;
+import com.zzsn.entity.BadSiteMsg;
 import com.zzsn.entity.Site;
 import com.zzsn.entity.SiteTemplate;
 import com.zzsn.paser.SourceTemplateByTag;

@@ -84,6 +85,8 @@ public class MetaBaiduSearchThread implements Runnable {
     // @Async("asyncexecutorServiceWebBaidu")
     public void crawler(){
+        // sentBadSiteMsg(keywordMsg,Constants.CRAWLER_SERVER,Constants.KAFKA_CONSUMER_PARTITION);
         // Combine the incoming keyword groups
         String keyWord = keywordMsg.getKeyWord();
         List<String> keyWords = SplitKeyword.transForm(keyWord);

@@ -153,6 +156,24 @@ public class MetaBaiduSearchThread implements Runnable {
         }
     }
+
+    public void sentBadSiteMsg(KeywordMsg keymsg, String crawlerType, String partition){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(keymsg.getId());
+            badSiteMsg.setInfoSourceCode(keymsg.getWordsCode());
+            badSiteMsg.setConsumerDate(new Date());
+            badSiteMsg.setCrawlerType(crawlerType);
+            badSiteMsg.setPartition(partition);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("crawler_consumer", docjson);
+        } catch (Exception e){
+        }
+    }
     private String locateCharSet(String url) {
         String encoding = "gbk";
         try {
...
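The new sentBadSiteMsg helper serializes a BadSiteMsg with Jackson and publishes it to the crawler_consumer topic. A minimal sketch of the payload it produces, using illustrative field values borrowed from the sample keyword JSON elsewhere in this commit:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.entity.BadSiteMsg;

import java.util.Date;

public class BadSiteMsgJsonSketch { // illustration only
    public static void main(String[] args) throws Exception {
        BadSiteMsg msg = new BadSiteMsg();
        msg.setId("2022090522");                   // keyword message id (illustrative)
        msg.setInfoSourceCode("KW-20220602-0003"); // words code (illustrative)
        msg.setCrawlerType("5");                   // 5 = Baidu, per the entity's javadoc
        msg.setPartition("0,1");                   // illustrative partition list
        msg.setConsumerDate(new Date());
        // Jackson renders the Date as epoch milliseconds by default.
        System.out.println(new ObjectMapper().writeValueAsString(msg));
    }
}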
baidu_search/src/main/java/com/zzsn/search/baiduThread/DetailBaiduSearchThread.java @ a18c008d

@@ -78,7 +78,7 @@ public class DetailBaiduSearchThread implements Runnable {
-    @Async("asyncexecutorServiceDetailUrl")
+//    @Async("asyncexecutorServiceDetailUrl")
     public void crawler(){
         try {
...
baidu_search/src/main/java/com/zzsn/search/baiduThread/WebBaiduSearchThread.java @ a18c008d

@@ -82,7 +82,7 @@ public class WebBaiduSearchThread implements Runnable {
     public KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);
-    @Async("asyncexecutorServiceWebBaidu")
+//    @Async("asyncexecutorServiceWebBaidu")
     public void crawler(){
         // Combine the incoming keyword groups
         String keyWord = keywordMsg.getKeyWord();
...
baidu_search/src/main/java/com/zzsn/utility/index/Constants.java @ a18c008d

@@ -113,12 +113,13 @@ public class Constants {
     public static final String KAFKA_CONSUMER_PARTITION = prop.getProperty("KAFKA_CONSUMER_PARTITION");
     public static final String KAFKA_PRODUCT_PARTITION = prop.getProperty("KAFKA_PRODUCT_PARTITION");
+    public static final String CRAWLER_SERVER = prop.getProperty("crawler_server");
-    public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
+//    public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
-    public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC");
+//    public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC");
-    public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize"));
+//    public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize"));
-    public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger"));
+//    public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger"));
 }
baidu_search/src/main/java/com/zzsn/utils/GrabUtil.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.utils;

import com.zzsn.webMagic.KafkaConsumers;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.proxy.Proxy;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class GrabUtil {
    private final static Logger logger = LoggerFactory.getLogger(GrabUtil.class);

    /**
     * Validity check for proxy IPs (batch)
     */
    public static boolean checkProxyIp(Proxy proxyx) {
        // Create an HttpGet instance
//        HttpGet httpGet = new HttpGet("http://www.baidu.com");
        HttpGet httpGet = new HttpGet("https://www.163.com/dy/article/GDT03TFD05158K7T.html");
        // Set the proxy IP, the connect timeout, the socket (read) timeout,
        // and the timeout for obtaining a connection from the connection manager
        HttpHost proxy = new HttpHost(proxyx.getHost(), proxyx.getPort());
        CredentialsProvider provider = new BasicCredentialsProvider();
        // Proxy with username/password credentials
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(proxyx.getUsername(), proxyx.getPassword()));
        // Create the httpClient instance
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .setConnectTimeout(2000)
                .setSocketTimeout(2000)
                .setConnectionRequestTimeout(2000)
                .build();
        httpGet.setConfig(requestConfig);
        // Set request headers
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();
            if (response != null){
                HttpEntity entity = response.getEntity(); // response entity
                if (entity != null){
                    System.out.println("网页内容为:" + EntityUtils.toString(entity, "utf-8"));
                }
            }
            if (statusCode == 200){
                return true;
            }
        } catch (Exception e){
            e.printStackTrace();
//            logger.error("校验代理ip是否可用出错,错误信息:",e);
        } finally {
            if (response != null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpClient != null){
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return false;
    }

    public static String httpGet(String url, Proxy myproxy){
        HttpGet httpGet = new HttpGet(url);
        // Set the proxy IP and the connect/read/connection-request timeouts
        HttpHost proxy = new HttpHost(myproxy.getHost(), myproxy.getPort());
        CredentialsProvider provider = new BasicCredentialsProvider();
        // Proxy with username/password credentials
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(myproxy.getUsername(), myproxy.getPassword()));
        // Create the httpClient instance
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .setConnectionRequestTimeout(5000)
                .build();
        httpGet.setConfig(requestConfig);
        // Set request headers
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200){
                logger.info(" page sucess by [" + proxy.getHostName() + "]," + url + "  ");
                HttpEntity entity = response.getEntity(); // response entity
                return EntityUtils.toString(entity, "utf-8");
            } else {
                logger.info("page error by [" + proxy.getHostName() + "]," + url + " code:" + statusCode);
            }
        } catch (Exception e){
            logger.info("page error by [" + proxy.getHostName() + "]," + url + " code:500 ", e);
        } finally {
            if (response != null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpClient != null){
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Split a list into packets
     * @param iPacket packet size
     * @param list
     */
    public static List<List<Request>> AverageList(List<Request> list, int iPacket){
        int iCount = list.size() / iPacket;
        int iRemit = list.size() % iPacket;
        List<List<Request>> Average = new ArrayList<>();
        for (int i = 0; i < iCount; i++){
            Average.add(list.subList(i * iPacket, (i + 1) * iPacket));
        }
        if (iRemit > 0){
            Average.add(list.subList(iCount * iPacket, (iCount * iPacket + iRemit)));
        }
        return Average;
    }

    /**
     * Fetch from the Shenlong proxy-IP endpoint:
     * one IP at a time, valid for 3 minutes
     */
//    public static MyProxy getShenlongIp(){
//        MyProxy proxy=null;
//        String url="http://api.shenlongip.com/ip?key=jy0tr2q0&pattern=json&count=1&need=1100&protocol=1&sign=d5654a620e9b424ca87fe1802f4c9e88";
//        String result=HttpUtil.get(url);
//        logger.info("调用代理IP接口返回结果:"+result);
//        if (result!=null && result.length()>0){
//            JSONObject re=JSONObject.parseObject(result);
//            if (200== re.getInteger("code")){
//                JSONArray datas= re.getJSONArray("data");
//                if (datas.size()>0){
//                    for (Object data:datas){
//                        try {
//                            JSONObject o=(JSONObject) data;
//                            long expire= DateUtil.convertDate(o.getString("expire"),"yyyy-MM-dd HH:mm:ss").getTime();
//                            proxy=new MyProxy(o.getString("ip"),o.getInteger("port"),"hys_81310170_41c8","12345678",expire);
//                        }catch (Exception e) {
//                            e.printStackTrace();
//                        }
//                    }
//                }
//            }else {
//                logger.info("获取代理IP出错,返回信息:"+result);
//            }
//        }
//        return proxy;
//    }

    public static void main(String[] args) {
        String url = "http://baijiahao.baidu.com/s?id=1662021416338923958&wfr=spider&for=pc;";
        Proxy proxy = KafkaConsumers.getProxy().get(0);
        System.out.println(httpGet(url, proxy));
    }
//    3 175.10.141.61-40013-hys_81310170_41c8-12345678
//    1 125.119.174.226-40032-hys_81310170_41c8-12345678
//    2 122.230.139.103-40004-hys_81310170_41c8-12345678
//    4 183.165.248.64-40007-hys_81310170_41c8-12345678
}
baidu_search/src/main/java/com/zzsn/webMagic/BaiduContentThread.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.webMagic;

import com.alibaba.fastjson.JSONObject;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.index.Constants;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;

import java.util.ArrayList;
import java.util.List;

/**
 * Fetches Baidu article content
 */
public class BaiduContentThread extends Thread {
    private final static Logger logger = LoggerFactory.getLogger(BaiduContentThread.class);

    @Override
    public void run() {
        // Pull links from the queue and extract their content
        List<Request> requestsList = new ArrayList<>();
        Request request = null;
        while (true){
            try {
                // queue
                ClbAnsProcessitem entity = (ClbAnsProcessitem) BaiduTask.baiduUrlQueue.take();
                if (entity != null){
                    logger.info("等到内容连接来了,开始处理列表连接。。。。。");
                    // If the url already exists in redis, drop it
                    boolean sismember = JedisUtil.sismember("baidu_web_test5::" + entity.getOrgId(), entity.getSourceAddress());
                    if (!sismember){
                        request = new Request();
                        request.setUrl(entity.getSourceAddress());
                        // Pass related info along to the next object
                        request.putExtra("model", JSONObject.toJSONString(entity));
                        requestsList.add(request);
                    } else {
                        BaiduTask.doublep++;
                        logger.info("连接:" + entity.getSourceAddress() + "已存在。");
                    }
                }
                if (requestsList.size() < 5){
                    continue;
                }
                // Clear the JVM DNS cache
                java.security.Security.setProperty("networkaddress.cache.ttl", "0");
                java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
                if ("1".equals(Constants.PROXY)){
                    Proxy p = new Proxy("", 3232); //ProxyMap.getProxy();
                    logger.info("获取资讯内容url,启动代理ip,进行下一步内容处理");
                    if (p != null){
                        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                        httpClientDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
                        Spider.create(new BaiduTxtProcessor())
                                .setDownloader(httpClientDownloader)
                                .startRequest(requestsList)
                                .thread(1)
                                .runAsync();
                    } else {
                        Thread.sleep(1000 * 30);
                        continue;
                    }
                } else {
                    logger.info("获取资讯内容url,组装好了http请求信息,提交到WebMgic..................");
                    Spider.create(new BaiduTxtProcessor())
                            .startRequest(requestsList)
                            .thread(1)
                            .runAsync();
                }
                requestsList = new ArrayList<>();
                try {
                    Thread.sleep(1000 * 5);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            } catch (Exception e){
                e.printStackTrace();
            }
        }
    }
}
baidu_search/src/main/java/com/zzsn/webMagic/BaiduKeyWordProcessor.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.webMagic;

import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Processes the result list returned by a Baidu keyword search
 */
public class BaiduKeyWordProcessor implements PageProcessor {
    private final static Logger log = LoggerFactory.getLogger(BaiduKeyWordProcessor.class);
    private ClbAnsProcessitem processitem = null;
    private List<ClbAnsProcessitem> processitemList = null;
    private Site site;

    @Override
    public void process(Page page) {
        log.info("监听到百度关键字搜索,返回结果");
        List<String> links = page.getHtml().xpath("//div[@id=content_left]/div[@tpl='news-normal']").all();
        processitem = new ClbAnsProcessitem();
        processitemList = new ArrayList<>();
        Request request = page.getRequest();
        log.info("获取到列表Url数:" + (links.size()));
        for (String s : links){
            processitem = new ClbAnsProcessitem();
            // Parse out the title and time
            String link = "关键字列表";
            try {
                Html html = Html.create(s);
                link = html.xpath("//div/div/h3").links().toString();
                if (link == null || link.contains(".pdf")
                        || link.trim().length() == 0
                        || link.contains(".PDF") || link.contains("download")) {
                    BaiduTask.invalidUrl++;
                    continue;
                }
                Elements title = html.getDocument().select("a[data-click]");
                Elements tiem = html.getDocument().select("span.c-color-gray2");
                Elements org = html.getDocument().select("span.c-color-gray");
                if (title != null && title.get(0) != null){
                    processitem.setTitle(title.get(0).text());
                }
                if (tiem != null && tiem.size() > 0){
                    processitem.setPublishDate(DateUtil.getPublishDate(tiem.get(0).text()));
                }
                if (org != null && org.size() > 0){
                    processitem.setSourceSite(org.get(0).text());
                }
                processitem.setSourceAddress(link);
                processitem.setSid(request.getExtra("sid").toString());
                processitem.setOrgId(request.getExtra("tid").toString());
                processitem.setTid(request.getExtra("tid").toString());
                processitem.setKeyWords(request.getExtra("words").toString());
                // List attributes assembled; put the item on the global queue
                // for the content step. put() blocks if the queue is full.
                BaiduTask.baiduUrlQueue.put(processitem);
                BaiduTask.linksUrl++;
            } catch (Exception e){
                log.error("解析列表Url[" + page.getRequest().getUrl() + "]下的" + link + "+出错,错误信息:", e);
            }
        }
        processitemList = null;
    }

    @Override
    public Site getSite() {
        if (site == null){
            site = Site.me().setSleepTime(3000); // pause 3 seconds
            site.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
            site.setCycleRetryTimes(3); // retry count
            site.setTimeOut(8000);
        }
        return site;
    }

//    public static void main(String[] args) {
//        List<Request> list=new ArrayList<>();
//        Request request=new Request();
//        Map<String,Object>map=new HashMap<>();
//        map.put("aa","这是测试传参");
//        request.setExtras(map);
//        String url="https://sichuan.scol.com.cn/ggxw/202209/58604583.html";
//        String url1="https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%E5%9F%83%E5%A1%9E%E4%BF%84%E6%AF%94%E4%BA%9A%2B%E5%AE%A2%E6%9C%BA%2B%E9%81%87%E9%9A%BE&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=0";
//        request.setUrl(url);
//        list.add(request);
////        Request r2=new Request();
////        r2.setExtras(map);
////        r2.setUrl(url1);
////        list.add(r2);
//        // Proxy proxy=new Proxy("113.243.178.169",40016,"hys_81310170_41c8","12345678");
//        // HttpClientDownloader httpClientDownloader=new HttpClientDownloader();
//        // httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(proxy));
//        Spider.create(new BaiduKeyWordProcessor()).thread(1)
//                .startRequest(list)
//        //        .setDownloader(new SeleniumDownloader("E:\\chromdriver\\chromedriver.exe"))
//        //        .setDownloader(httpClientDownloader)
//        //        .addUrl(url)
//                .runAsync();
//    }
}
baidu_search/src/main/java/com/zzsn/webMagic/BaiduTask.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.webMagic;

import com.alibaba.fastjson.JSONObject;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.job.KafkaConsumerTask;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SplitKeyword;
import com.zzsn.utility.index.Constants;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Baidu keyword-search crawler task
 */
@Component
@EnableScheduling
public class BaiduTask {
    private final static Logger logger = LoggerFactory.getLogger(BaiduTask.class);
    // FIFO queue caching url-list items
    public static LinkedBlockingQueue<ClbAnsProcessitem> baiduUrlQueue = new LinkedBlockingQueue();
    /** Keyword queue */
    public static LinkedBlockingQueue<KeywordMsg> keyWordsQueue = new LinkedBlockingQueue<>();
    private final String BAIDU_KEY_URL = "https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%s&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=%d";
    /** Number of keyword groups */
    static int keyWordsArrey = 0;
    /** Number of keywords */
    static int keyWordsSigin = 0;
    /** Valid Baidu links produced */
    static int linksUrl = 0;
    /** Invalid Baidu links */
    static int invalidUrl = 0;
    /** Duplicate links */
    static int doublep = 0;
    static int toKafka = 0;

    /**
     * Crawl the link list by searching each keyword group
     */
    @Scheduled(initialDelay = 40000, fixedRate = 1000 * 60 * 10)
    public void getFormKeyWordsArry(){
        // Create the consumer
        logger.info("10分钟跑一次。。。。。");
        KafkaConsumer<String, String> consumer = KafkaConsumers.createConsumer();
        ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
        String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
        String[] partitions = kafkaConsumerPartition.split(",");
        for (int i = 0; i < partitions.length; i++) {
            topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
        }
        consumer.assign(topicPartitions);
        KeywordMsg keywordMsg = null;   // keyword object
        KeywordMsg nextKeyWords = null; // keyword object
        List<String> keyword = null;    // keyword strings
        List<KeywordMsg> keywordMsgsList = new ArrayList<>(); // holds all keyword items
        for (int i = 0; i < Constants.KAFKA_COUNT; i++){
            ConsumerRecords<String, String> records = consumer.poll(100);
            consumer.commitSync();
            logger.info("百度关键字搜索消费消息开始。。。。。。。。。。。。");
            for (ConsumerRecord consumerRecord : records){
                // Build the keyword object
                try {
                    keywordMsg = new Gson().fromJson(consumerRecord.value().toString(), KeywordMsg.class);
                    /***
                     * Take the keyword group, recombine it into individual keywords,
                     * build keyword objects from them and put those on the queue.
                     * The next step crawls links in batches through proxy IPs.
                     */
                    if (keywordMsg != null){
                        keyWordsArrey++;
                        // Got a keyword group; reassemble keyword objects
                        keyword = SplitKeyword.transForm(keywordMsg.getKeyWord());
                        if (keyword != null && keyword.size() > 0){
                            for (String keys : keyword){
                                keyWordsSigin++;
                                nextKeyWords = new KeywordMsg();
                                nextKeyWords.setId(keywordMsg.getId());
                                nextKeyWords.setCrawlerType(keywordMsg.getCrawlerType());
                                nextKeyWords.setKeyWord(keys);
                                nextKeyWords.setStatus(keywordMsg.getStatus());
                                nextKeyWords.setExclusionWord(keywordMsg.getExclusionWord());
                                nextKeyWords.setSubjectId(keywordMsg.getSubjectId());
                                nextKeyWords.setWordsCode(keywordMsg.getWordsCode());
                                nextKeyWords.setWordsName(keywordMsg.getWordsName());
                                keyWordsQueue.put(nextKeyWords);
                            }
                        }
                    } else {
                        logger.error("百度搜索关键字,未解析到关键字");
                    }
                } catch (Exception e){
                    logger.error("从kafka获取关键字出错,错误信息:", e);
                }
            }
        }
    }
}
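BaiduTask and BaiduKeyWordProcessor fill the shared LinkedBlockingQueue fields with put(), while BaiduContentThread drains them with take(); put() blocks when the queue is full and take() blocks when it is empty. A self-contained sketch of that handoff pattern:

import java.util.concurrent.LinkedBlockingQueue;

public class QueueHandoffSketch { // illustration only
    static final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<>();

    public static void main(String[] args) throws InterruptedException {
        Thread consumer = new Thread(() -> {
            try {
                while (true) {
                    String url = queue.take(); // blocks until a producer offers an item
                    System.out.println("consumed: " + url);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        consumer.setDaemon(true);
        consumer.start();
        queue.put("http://example.com/a"); // producer side
        queue.put("http://example.com/b");
        Thread.sleep(200); // give the daemon consumer time to drain
    }
}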
baidu_search/src/main/java/com/zzsn/webMagic/BaiduTxtProcessor.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.webMagic;

import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.StandardWebExtractorHandler;
import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.util.RequestUtil;
import com.zzsn.utility.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.ibatis.annotations.Param;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * Article body extraction
 */
public class BaiduTxtProcessor implements PageProcessor {
    private final static Logger log = LoggerFactory.getLogger(BaiduTxtProcessor.class);
    private Site site;
    public KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);
    private ClbAnsProcessitem processitem = null;
    private List<ClbAnsProcessitem> processitemList = null;

    @Override
    public void process(Page page) {
        log.info("监听到进入内容解析中。。。。。。。。。。。。。");
        Request request = page.getRequest();
        processitem = new ClbAnsProcessitem();
        processitemList = new ArrayList<>();
        processitem = JSONObject.parseObject(request.getExtra("model").toString(), ClbAnsProcessitem.class);
        processitem.setSource("3");
        DocInfo docInfo = new DocInfo();
        try {
            String link = page.getUrl().toString();      // current page link
            String infodata = page.getHtml().toString(); // page content
            // Detect the page encoding
            String contentCharset = Utility.getWebEncodingByStr(infodata);
            if (link.contains("toutiao.com") && (null == infodata || infodata.length() < 50)){
                infodata = RequestUtil.getTaotiaoData(link);
            }
            // Check whether a template exists for this domain
            if (link.contains("qq.com") && !link.contains("://new.qq.com")){
                link = KafkaConsumers.transqqURl(link);
            }
            String domainurl = new URL(link).getHost();
            Object siteTempObj = MemcachedUtils.get("domainUri_" + domainurl);
            SiteTemplate siteTemplate = new SiteTemplate();
            if (siteTempObj != null && !"null".equals(siteTempObj)) {
                com.zzsn.entity.Site site = (com.zzsn.entity.Site) siteTempObj;
                siteTemplate.setMatchTitle(site.getMatchTitle());
                siteTemplate.setMatchAuthor(site.getMatchAuthor());
                siteTemplate.setMatchContent(site.getMatchContent());
                siteTemplate.setMatchOrigin(site.getMatchOrigin());
                siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
                siteTemplate.setMatchSummary(site.getMatchSummary());
                docInfo = SourceTemplateByTag.doPaserByTag(infodata, docInfo, siteTemplate);
            }
            if (null != docInfo.getContentWithTag()) {
                System.out.println("使用模板解析内容成功" + domainurl);
                log.info("使用模板解析内容成功" + domainurl);
            }
            // Fall back to rule-based content extraction
            if (null == docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
                new StandardWebExtractorHandler().doHandler(infodata, docInfo);
            }
            // Replace relative image paths with absolute ones
            if (docInfo.getTitle() != null && docInfo.getTitle().trim().length() > 0
                    && docInfo.getContentNoTag() != null && docInfo.getContentNoTag().trim().length() > 0){
                ContentFileResult contentFileResult = new ContentFileResult();
                contentFileResult = KafkaConsumers.getContentFile(docInfo.getContentWithTag(), docInfo.getSourceaddress());
                docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
            }
            if (StringUtils.isNotBlank(docInfo.getPublishDate())){
                processitem.setPublishDate(docInfo.getPublishDate());
            }
            // Only items with non-empty content move on to the next step
            if (StringUtils.isNotBlank(docInfo.getContentNoTag())){
                processitem.setContent(docInfo.getContentNoTag());
                processitem.setContentWithtag(docInfo.getContentWithTag());
                processitem.setSummary(docInfo.getSummary());
                processitem.setAuthor(docInfo.getAuthor());
                processitem.setCreateDate(DateUtil.now());
                String msg = new ObjectMapper().writeValueAsString(processitem);
                // output
                kafkaTemplate.send(Constants.testBaidu, msg);
                BaiduTask.toKafka++;
                log.info("加入到kfaka...........");
                // Add to the cache pool
//                JedisUtil.sadd("baidu::"+processitem.getOrgId(), processitem.getSourceAddress());
                // redis for testing
                JedisUtil.sadd("baidu_web_test5::" + processitem.getOrgId(), processitem.getSourceAddress());
            } else {
                log.info("没有获取资讯内容,连接:" + link);
            }
        } catch (Exception e){
            log.error("解析内容逻辑出错,错误信息:", e);
        }
        log.info("程序处理:关键词组总数:" + BaiduTask.keyWordsArrey + ";关键词总数:" + BaiduTask.keyWordsSigin
                + ";有效URL连接数:" + BaiduTask.linksUrl + ";无效URL连接数:" + BaiduTask.invalidUrl
                + ";重复连接个数:" + BaiduTask.doublep + ";入kafak数量:" + BaiduTask.toKafka);
    }

    @Override
    public Site getSite() {
        if (site == null){
            site = Site.me().setSleepTime(4000);
            site.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            site.addHeader(HttpHeaders.CONNECTION, "close");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
            site.setTimeOut(8000);
            site.setCycleRetryTimes(3);
        }
        return site;
    }
}
baidu_search/src/main/java/com/zzsn/webMagic/Expaler.java (deleted, 100644 → 0) @ 83f00b0f

package com.zzsn.webMagic;

import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import java.util.List;

public class Expaler implements PageProcessor {
    Site site;

    @Override
    public void process(Page page) {
//        List<String> links=page.getHtml().xpath("//div[@id=content_left]/div[@tpl='news-normal']").all();
//        links.forEach(t-> System.out.println(t));
        System.out.println(page.getHtml().toString());
    }

    @Override
    public Site getSite() {
        if (site == null){
            site = Site.me().setSleepTime(0);
            site.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
            site.addHeader(HttpHeaders.CONNECTION, "close");
        }
        return site;
    }

    public static void main(String[] args) {
        String url = "https://www.163.com/dy/article/GTOJV0AF0551M1ZM.html";
        Proxy proxy = new Proxy("114.102.52.200", 61798, "hys_81310170_41c8", "12345678");
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(KafkaConsumers.getProxy().get(0)));
        SeleniumDownloader seleniumDownloader = new SeleniumDownloader("E:\\chromdriver\\chromedriver.exe");
        seleniumDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
        Spider.create(new Expaler()).thread(1)
//                .startRequest(list)
                .setDownloader(seleniumDownloader)
//                .setDownloader(httpClientDownloader)
                .addUrl(url)
                .runAsync();
    }
}
baidu_search/src/main/java/com/zzsn/webMagic/KafkaConsumers.java  (deleted, 100644 → 0)

package com.zzsn.webMagic;

import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.model.FileTag;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import us.codecraft.webmagic.proxy.Proxy;

import java.sql.SQLException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Kafka consumer helpers.
 */
public class KafkaConsumers {
    public static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        // offset reset strategy: latest / earliest
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // max poll interval set to 1 hour
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }

    /**
     * Fetch proxy servers from Oracle.
     * @return list of proxies
     */
    public static List<Proxy> getProxy(){
        List<Proxy> proxyList = null;
        String searchSql = "select proxy from CIS_sys_Proxy where id=" + (RandomUtil.randomInt(4) + 1);
        // String searchSql = "select proxy from CIS_sys_Proxy ";
        String proxy = null;
        OracleDBManager dm = new OracleDBManager();
        try {
            OracleDataTable dt = dm.getResultData(null, null, searchSql);
            if (dt != null && dt.getRowCount() > 0){
                proxyList = new ArrayList<>();
                for (int i = 0; i < dt.getRowCount(); i++){
                    for (int j = 0; j < dt.getColCoun(); j++)
                        if (dt.getRow()[i][j].length() > 5){
                            proxy = dt.getRow()[i][j];
                            // stored as host-port-username-password
                            String[] ps = proxy.split("-");
                            proxyList.add(new Proxy(ps[0], Integer.valueOf(ps[1]), ps[2], ps[3]));
                        }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return proxyList;
    }

    // convert QQ news links to the canonical form
    public static String transqqURl(String oldurl){
        String patt = "https://new.qq.com/omn/[date]/[pamars].html";
        String b1 = oldurl.substring(oldurl.lastIndexOf("/") + 1);
        String b2 = getNumbers(b1);
        String curl = patt.replace("[date]", b2).replace("[pamars]", b1);
        return curl;
    }

    public static String getNumbers(String content) {
        Pattern pattern = Pattern.compile("\\d+");
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            return matcher.group(0);
        }
        return "";
    }

    public static ContentFileResult getContentFile(String contentWithTag, String sourceaddress) throws Exception {
        String contentImgCvtTag = contentWithTag;
        String formatImgContent = contentWithTag;
        Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(contentWithTag, sourceaddress);
        // key is the image crawl path, value is the image save path
        Map<String, FileTag> imgMap = new HashMap<String, FileTag>();
        for (String key : imgDataMap.keySet()) {
            FileTag fileTag = imgDataMap.get(key);
            while (contentImgCvtTag.contains(key)) {
                // paths prefixed with IMG_SERVER
                contentImgCvtTag = contentImgCvtTag.replace(key, fileTag.getSaveTag());
            }
            imgMap.put(fileTag.getAbsolutePath(), fileTag);
        }
        ContentFileResult cis = new ContentFileResult();
        cis.setContentAbsoulute(formatImgContent);
        cis.setContentImgCvtTag(contentImgCvtTag);
        cis.setFileMap(imgMap);
        return cis;
    }
}
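Since createConsumer() disables auto-commit and caps each poll at one record, the caller is expected to poll and acknowledge explicitly. A minimal consumption sketch (not part of this commit; it assumes Constants also exposes the topic name configured in constants.properties, plus the usual java.time.Duration, java.util.Collections, and org.apache.kafka.clients.consumer imports):

KafkaConsumer<String, String> consumer = KafkaConsumers.createConsumer();
consumer.subscribe(Collections.singletonList(Constants.KAFKA_CONSUMER_TOPIC)); // assumed constant
while (true) {
    // at most one record per poll (MAX_POLL_RECORDS_CONFIG = 1)
    ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(5));
    for (ConsumerRecord<String, String> record : records) {
        System.out.println(record.value());
    }
    consumer.commitSync(); // auto-commit is off, so acknowledge manually
}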
baidu_search/src/main/java/com/zzsn/webMagic/LinksReadThread.java  (deleted, 100644 → 0)

package com.zzsn.webMagic;

import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads keywords from the queue.
 */
public class LinksReadThread extends Thread {
    private final static Logger logger = LoggerFactory.getLogger(LinksReadThread.class);
    private final String BAIDU_KEY_URL = "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%s&pn=%d";

    @Override
    public void run() {
        List<Request> requestByMcList = new ArrayList<>(); // request list
        Request webReq = null;
        while (true){
            try {
                KeywordMsg keywordMsg = BaiduTask.keyWordsQueue.take();
                logger.info("A keyword arrived......");
                /****
                 * Reassemble the keyword group into the keywords to search for.
                 * One keyword group can produce N keywords; each group starts one webMagic crawl task,
                 * and the group's base attributes are passed along to the next step.
                 * Processing flow: 1. initialize webMagic
                 *                  2. obtain a proxy (with proxy validation)
                 *                  3. build the Baidu url from the keyword; each keyword handles one page (10 results) for now
                 */
                // assemble the urls for multi-threaded processing by webMagic
                try {
                    for (int i = 0; i < Constants.PAGESIZE; i++){
                        // pages to crawl come from the config file; only applies to Baidu meta search
                        webReq = new Request();
                        webReq.putExtra("sid", keywordMsg.getId());
                        webReq.putExtra("tid", keywordMsg.getWordsCode());
                        webReq.putExtra("words", keywordMsg.getKeyWord());
                        webReq.setUrl(String.format(BAIDU_KEY_URL,
                                URLEncoder.encode(keywordMsg.getKeyWord(), "UTF-8"), (i * 10)));
                        requestByMcList.add(webReq);
                    }
                } catch (Exception e){
                    logger.error("Failed to build requests from the keyword group, error: ", e);
                }
                // if (requestByMcList.size()<10){
                //     continue;
                // }
                // clear the JVM DNS cache
                java.security.Security.setProperty("networkaddress.cache.ttl", "0");
                java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
                if (Constants.PROXY.equals("1")){
                    Proxy p = new Proxy("", 32323); //ProxyMap.getProxy();
                    logger.info("Got a batch of urls; fetching a proxy IP before the next step");
                    if (p != null){
                        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                        httpClientDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
                        Spider.create(new BaiduKeyWordProcessor())
                                .setDownloader(httpClientDownloader)
                                .startRequest(requestByMcList)
                                .thread(1)
                                .runAsync();
                    } else {
                        Thread.sleep(1000 * 10);
                        continue;
                    }
                } else {
                    // without a proxy, start crawling directly
                    logger.info("Submitting urls to webMagic, count: " + requestByMcList.size());
                    Spider.create(new BaiduKeyWordProcessor())
                            .startRequest(requestByMcList)
                            .thread(1)
                            .runAsync();
                }
                requestByMcList = new ArrayList<>();
                try {
                    Thread.sleep(1000 * 8);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
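For illustration, the url template above expands per keyword and page as follows; a small hypothetical helper (the pn parameter advances ten results per page):

static String baiduNewsUrl(String keyword, int page) throws java.io.UnsupportedEncodingException {
    return String.format(
            "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%s&pn=%d",
            java.net.URLEncoder.encode(keyword, "UTF-8"), page * 10);
}
// baiduNewsUrl("新能源", 0) -> ...&word=%E6%96%B0%E8%83%BD%E6%BA%90&pn=0
// baiduNewsUrl("新能源", 1) -> ...&word=%E6%96%B0%E8%83%BD%E6%BA%90&pn=10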
baidu_search/src/main/java/com/zzsn/webMagic/downloader/SeleniumDownloader.java  (deleted, 100644 → 0)

package com.zzsn.webMagic.downloader;

import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Renders pages with a Selenium-driven browser. Currently only Chrome is supported.<br>
 * Requires the Selenium driver binary.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:37 PM <br>
 */
public class SeleniumDownloader implements Downloader, Closeable {
    private volatile WebDriverPool webDriverPool;
    private Logger logger = LoggerFactory.getLogger(getClass());
    private int sleepTime = 0;
    private int poolSize = 1;
    private volatile ChromeDriver webDriver;
    private static final String DRIVER_PHANTOMJS = "phantomjs";
    Dimension targetSize = new Dimension(600, 600); // window size

    private ProxyProvider proxyProvider;

    /**
     * Create a new downloader.
     *
     * @param chromeDriverPath chromeDriverPath
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
    }

    /**
     * Constructor without any filed. Construct PhantomJS browser
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
        // System.setProperty("phantomjs.binary.path",
        // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * set sleep time to wait until load success
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    @Override
    public Page download(Request request, Task task) {
        if (webDriver != null){
            webDriver.quit();
        }
        try {
            ChromeOptions options = new ChromeOptions();
            // run Chrome in headless mode
            options.addArguments("--headless");
            options.addArguments("--disable-gpu");
            options.addArguments("--no-sandbox");
            options.addArguments("--disable-dev-shm-usage");
            options.addArguments("--start-maximized");
            // the report page only renders fully when scrolled, so give it a very large height
            options.addArguments("--window-size=1280,4300");
            if (proxyProvider != null){
                us.codecraft.webmagic.proxy.Proxy p = proxyProvider.getProxy(task);
                String proxyServer = p.getHost() + ":" + p.getPort();
                org.openqa.selenium.Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
                proxy.setSocksUsername("hys_81310170_41c8");
                proxy.setSocksPassword("1234567");
                options.setProxy(proxy);
            }
            webDriver = new ChromeDriver(options);
        } catch (Exception e) {
            logger.warn("interrupted", e);
            return null;
        }
        webDriver.get(request.getUrl());
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        /*
         * TODO You can add mouse event or other processes
         *
         * @author: bob.li.0718@gmail.com
         */
        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setHtml(new Html(content, request.getUrl()));
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        webDriver.quit();
        return page;
    }

    // private void checkInit() {
    //     if (webDriver == null) {
    //         synchronized (this) {
    //             webDriverPool = new WebDriverPool(poolSize);
    //         }
    //     }
    // }

    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    @Override
    public void close() throws IOException {
        webDriver.quit();
    }
}
baidu_search/src/main/java/com/zzsn/webMagic/downloader/WebDriverPool.java  (deleted, 100644 → 0)

package com.zzsn.webMagic.downloader;

import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:41 PM <br>
 */
class WebDriverPool {
    private Logger logger = LoggerFactory.getLogger(getClass());

    private final static int DEFAULT_CAPACITY = 5;
    private final int capacity;

    private final static int STAT_RUNNING = 1;
    private final static int STAT_CLODED = 2;
    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);

    /*
     * new fields for configuring phantomJS
     */
    private WebDriver mDriver = null;
    private boolean mAutoQuitDriver = true;

    private static final String DEFAULT_CONFIG_FILE = "E:/chromdriver/config.ini";
    private static final String DRIVER_FIREFOX = "firefox";
    private static final String DRIVER_CHROME = "chrome";
    private static final String DRIVER_PHANTOMJS = "phantomjs";

    protected static Properties sConfig;
    protected static DesiredCapabilities sCaps;

    /**
     * Configure the GhostDriver, and initialize a WebDriver instance. This part
     * of code comes from GhostDriver.
     * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
     *
     * @author bob.li.0718@gmail.com
     * @throws IOException
     */
    public void configure() throws IOException {
        // Read config file
        sConfig = new Properties();
        String configFile = DEFAULT_CONFIG_FILE;
        if (System.getProperty("selenuim_config") != null){
            configFile = System.getProperty("selenuim_config");
        }
        sConfig.load(new FileReader(configFile));

        // Prepare capabilities
        sCaps = new DesiredCapabilities();
        sCaps.setJavascriptEnabled(true);
        sCaps.setCapability("takesScreenshot", false);

        String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);

        // Fetch PhantomJS-specific configuration parameters
        if (driver.equals(DRIVER_PHANTOMJS)) {
            // "phantomjs_exec_path"
            if (sConfig.getProperty("phantomjs_exec_path") != null) {
                sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_exec_path"));
            } else {
                throw new IOException(String.format("Property '%s' not set!",
                        PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
            }
            // "phantomjs_driver_path"
            if (sConfig.getProperty("phantomjs_driver_path") != null) {
                System.out.println("Test will use an external GhostDriver");
                sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_driver_path"));
            } else {
                System.out.println("Test will use PhantomJS internal GhostDriver");
            }
        }

        // Disable "web-security", enable all possible "ssl-protocols" and
        // "ignore-ssl-errors" for PhantomJSDriver
        // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new
        // String[] {
        // "--web-security=false",
        // "--ssl-protocol=any",
        // "--ignore-ssl-errors=true"
        // });
        ArrayList<String> cliArgsCap = new ArrayList<String>();
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap);

        // Control LogLevel for GhostDriver, via CLI arguments
        sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] { "--logLevel=" + (sConfig.getProperty("phantomjs_driver_loglevel") != null
                        ? sConfig.getProperty("phantomjs_driver_loglevel") : "INFO") });

        // String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);

        // Start appropriate Driver
        if (isUrl(driver)) {
            sCaps.setBrowserName("phantomjs");
            mDriver = new RemoteWebDriver(new URL(driver), sCaps);
        } else if (driver.equals(DRIVER_FIREFOX)) {
            mDriver = new FirefoxDriver(sCaps);
        } else if (driver.equals(DRIVER_CHROME)) {
            ChromeOptions options = new ChromeOptions();
            // run Chrome in headless mode
            options.addArguments("--headless");
            options.addArguments("--disable-gpu");
            options.addArguments("--no-sandbox");
            options.addArguments("--disable-dev-shm-usage");
            options.addArguments("--start-maximized");
            // the report page only renders fully when scrolled, so give it a very large height
            options.addArguments("--window-size=1280,4300");
            String proxyServer = "1.83.251.72:40013";
            Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
            options.setProxy(proxy);
            mDriver = new ChromeDriver(options);
        } else if (driver.equals(DRIVER_PHANTOMJS)) {
            mDriver = new PhantomJSDriver(sCaps);
        }
    }

    /**
     * check whether input is a valid URL
     *
     * @author bob.li.0718@gmail.com
     * @param urlString urlString
     * @return true means yes, otherwise no.
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }

    /**
     * store webDrivers created
     */
    private List<WebDriver> webDriverList = Collections.synchronizedList(new ArrayList<WebDriver>());

    /**
     * store webDrivers available
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();

    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }

    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     *
     * @return
     * @throws InterruptedException
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver poll = innerQueue.poll();
        if (poll != null) {
            return poll;
        }
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                if (webDriverList.size() < capacity) {
                    // add new WebDriver instance into pool
                    try {
                        configure();
                        innerQueue.add(mDriver);
                        webDriverList.add(mDriver);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    // ChromeDriver e = new ChromeDriver();
                    // WebDriver e = getWebDriver();
                    // innerQueue.add(e);
                    // webDriverList.add(e);
                }
            }
        }
        return innerQueue.take();
    }

    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }

    protected void checkRunning() {
        if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
            throw new IllegalStateException("Already closed!");
        }
    }

    public void closeAll() {
        boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED);
        if (!b) {
            throw new IllegalStateException("Already closed!");
        }
        for (WebDriver webDriver : webDriverList) {
            logger.info("Quit webDriver" + webDriver);
            webDriver.quit();
            webDriver = null;
        }
    }
}
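A usage sketch for the pool (hypothetical calling code; get() blocks, and throws InterruptedException, when all `capacity` drivers are checked out):

WebDriverPool pool = new WebDriverPool(2); // at most two concurrent browsers
WebDriver driver = pool.get();             // lazily configured via configure()
try {
    driver.get("https://www.baidu.com/");
    System.out.println(driver.getTitle());
} finally {
    pool.returnToPool(driver);             // hand it back for other threads
}
pool.closeAll();                           // quit every driver when the job is done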
baidu_search/src/main/resources/constants.properties

@@ -23,16 +23,16 @@ THREAD_SIZE=1
 # Kafka address
 KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
 # consumer topic
-#KAFKA_CONSUMER_TOPIC=keyWordsInfo
+KAFKA_CONSUMER_TOPIC=keyWordsInfo
 # test topic
-KAFKA_CONSUMER_TOPIC=baiduTest
+#KAFKA_CONSUMER_TOPIC=baiduTest
 # consumer group
-#KAFKA_CONSUMER_GROUP_ID=baidu-web-test
+KAFKA_CONSUMER_GROUP_ID=baidu2
 # test consumer group
-KAFKA_CONSUMER_GROUP_ID=baidu-wemagic
+#KAFKA_CONSUMER_GROUP_ID=baidu-wemagic
 # producer topic
-KAFKA_test_TOPIC=baidu-bind2-test
-#KAFKA_test_TOPIC=baidu-new-test
+#KAFKA_test_TOPIC=baidu-bind-test
+KAFKA_test_TOPIC=baidu-new-test
@@ -81,3 +81,6 @@ whiles=10
 pageSize=5
 # averaging factor
 averger=2000
+#
+crawler_server=baidu-1
\ No newline at end of file
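The new crawler_server key identifies which crawler node produced a record. A minimal sketch of reading it, assuming the Constants class loads this file through java.util.Properties (hypothetical loading code):

Properties p = new Properties();
try (InputStream in = Constants.class.getResourceAsStream("/constants.properties")) {
    p.load(in);
}
String crawlerServer = p.getProperty("crawler_server", "baidu-1"); // default mirrors the file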
comm_crawler/src/main/java/com/zzsn/CrawlerStaticApplication.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/api/CrawlerCommVerifyController.java

@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController {
     @ResponseBody
     public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
         SiteInfoVerify siteInfoVerify = new SiteInfoVerify();
-        siteMsgTemple.setVerifyType("1");
+        // siteMsgTemple.setVerifyType("1");
         VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple);
         return MsgUtil.outSiteJSON(verifyResult);
     }
comm_crawler/src/main/java/com/zzsn/crawler/DynaminSiteThread.java

 package com.zzsn.crawler;

 import cn.hutool.core.date.DateTime;
+import cn.hutool.core.date.DateUtil;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.zzsn.configuration.SpringContextUtil;
 import com.zzsn.crawler.paser.*;
 import com.zzsn.crawler.uriparser.HisURIConfig;
 import com.zzsn.crawler.uriparser.HisURIParser;
+import com.zzsn.download.PageDownloader;
 import com.zzsn.entity.*;
 import com.zzsn.generation.Constants;
 import lombok.extern.slf4j.Slf4j;
@@ -27,6 +29,7 @@ public class DynaminSiteThread implements Runnable{
     public PaserSiteDownload paserSiteDownload = new PaserSiteDownload();
     public SiteMsgTemple siteMsgTemple = new SiteMsgTemple();
     public KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);

     @Override
@@ -37,6 +40,7 @@ public class DynaminSiteThread implements Runnable{
     // @Async("asyncexecutorService")
     public void crawler(){
+        sentBadSiteMsg(siteMsgTemple, Constants.MODEL_SCORE_URL, Constants.KAFKA_CONSUMER_PARTITION);
         // collect the column links and the pagination links
         List<String> urlList = getPageListUrl(siteMsgTemple);
         log.info("Got urlList: " + urlList.size());
@@ -58,21 +62,19 @@ public class DynaminSiteThread implements Runnable{
         // String charset = paserSiteDownload.getCharSet(urlList.get(0));
         // String charset = paserSiteDownload.getCharSet(urlList.get(0));
         String charset = "";
-        try {
-            charset = paserSiteDownload.locateCharSet(urlList.get(0));
-        } catch (Exception e) {
-            try {
-                charset = paserSiteDownload.getCharSet(urlList.get(0));
-            } catch (IOException ex) {
-                //
-            }
-        }
+        // try {
+        //     charset = paserSiteDownload.locateCharSet(urlList.get(0));
+        // } catch (Exception e) {
+        //     try {
+        //         charset = paserSiteDownload.getCharSet(urlList.get(0));
+        //     } catch (IOException ex) {
+        //         //
+        //     }
+        // }
         // filter the list urls and related info by url matching
         List<CatchWebByMetaSearch> metaSearchList = new ArrayList<>();
         List<DocInfo> docInfoList = new ArrayList<>();
         log.info("Info source name: " + siteMsgTemple.getSiteName() + ", collection started at: " + DateTime.now());
         String infoSourceId = siteMsgTemple.getId(); // info source id
         // default expression type
         siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null ? "0" : siteMsgTemple.getListExpressionType());
@@ -104,7 +106,6 @@ public class DynaminSiteThread implements Runnable{
             WebContentPaserByCss webContentPaserByCss = new WebContentPaserByCss();
             // parse the article details by tag
             docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple);
         } else if (siteMsgTemple.getDetailExpressionType().equals("2")){ // xpath parsing
             WebContentPaserByXpath webContentPaserByXpath = new WebContentPaserByXpath();
             // parse the article details by tag
@@ -144,6 +145,25 @@ public class DynaminSiteThread implements Runnable{
         }
     }

+    public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String crawlerType, String partition){
+        try {
+            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            badSiteMsg.setId(siteMsgTemple.getId());
+            badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
+            badSiteMsg.setConsumerDate(new Date());
+            badSiteMsg.setCrawlerType(crawlerType);
+            badSiteMsg.setPartition(partition);
+            ObjectMapper mapper = new ObjectMapper();
+            String docjson = mapper.writeValueAsString(badSiteMsg);
+            kafkaTemplate.send("crawler_consumer", docjson);
+        } catch (Exception e){
+        }
+    }

     public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
         ClbAnsProcessitem clbAnsProcessitem = new ClbAnsProcessitem();
         clbAnsProcessitem.setSid(docInfo.getSid() + "");
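The new sentBadSiteMsg hook publishes a small JSON status message to the crawler_consumer topic before each crawl. A hedged sketch of the consuming side, assuming Jackson and the Lombok-generated getters on BadSiteMsg (exception handling omitted):

ObjectMapper mapper = new ObjectMapper();
// docjson: one message read from the "crawler_consumer" topic
BadSiteMsg msg = mapper.readValue(docjson, BadSiteMsg.class);
log.info("site {} ({}) checked in at {} via {}",
        msg.getId(), msg.getInfoSourceCode(), msg.getConsumerDate(), msg.getCrawlerType());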
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByCss.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByIntellige.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByJsonXpath.java

@@ -594,7 +594,7 @@ public class WebContentPaserByJsonXpath {
      */
     public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType){
         try {
-            BadSiteMsg badSiteMsg = new BadSiteMsg();
+            BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
             badSiteMsg.setId(siteMsgTemple.getId());
             badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
             badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByXpath.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java

@@ -6,10 +6,14 @@ import java.awt.event.KeyEvent;
 import java.io.*;
 import java.time.Duration;
 import java.time.temporal.ChronoUnit;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import com.alibaba.fastjson.JSON;
 import com.zzsn.crawler.ReuseWebDriver;
 import com.zzsn.generation.Constants;
+import com.zzsn.job.JedisUtil;
 import com.zzsn.util.DriverUtil;
 import lombok.extern.slf4j.Slf4j;
 import org.openqa.selenium.*;
@@ -72,25 +76,51 @@ public class SeleniumTime {
     // @Async("asyncTaskExecutorSelenium")
     public static String getScopehtml(String url) {
         String html = "";
+        ReuseWebDriver driver = null;
         try {
-            ReuseWebDriver driver = DriverUtil.getChromeDriver();
+            try {
+                driver = DriverUtil.getChromeDriver();
+            } catch (Exception e){
+                log.info("Exception getting the ReuseWebDriver browser: " + e.getMessage());
+                Map<String, String> map = new HashMap<>();
+                map.put("sessionId", "sessionId");
+                map.put("serverUrl", "https://www.baidu.com/");
+                // cache the browser driver info
+                JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
+            }
             try {
-                Duration duration = Duration.of(50, ChronoUnit.SECONDS);
+                Duration duration = Duration.of(60, ChronoUnit.SECONDS);
                 driver.manage().timeouts().pageLoadTimeout(duration);
                 driver.get(url);
-                // Thread.sleep(1000);
+                Thread.sleep(5000);
                 try {
                     WebElement webElement = driver.findElement(By.xpath("/html"));
                     html = webElement.getAttribute("outerHTML");
                 } catch (Exception e) {
+                    driver.quit();
                     log.info("Exception reading the page content: " + e.getMessage());
                 }
             } catch (Exception e) {
+                driver.quit();
                 // if the driver session is broken, quit the driver; it will be reopened on the next visit
                 log.info("Driver exception opening the URL: " + e.getMessage());
+                Map<String, String> map = new HashMap<>();
+                map.put("sessionId", "sessionId");
+                map.put("serverUrl", "https://www.baidu.com/");
+                // cache the browser driver info
+                JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
             }
         } catch (Exception e) {
+            driver.quit();
+            try {
+                Map<String, String> map = new HashMap<>();
+                map.put("sessionId", "sessionId");
+                map.put("serverUrl", "https://www.baidu.com/");
+                // cache the browser driver info
+                JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
+            } catch (Exception ex) {
+            }
            log.info("Driver exception while visiting the page: " + e.getMessage());
         }
         return html;
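The recovery pattern above resets the cached Selenium session by overwriting the Redis entry with placeholder values, so the next DriverUtil.getChromeDriver() call opens a fresh browser. Extracted as a helper for clarity (a sketch using only the calls that appear in the code above):

// invalidate the cached driver session; the next access recreates the driver
private static void resetDriverCache() {
    Map<String, String> map = new HashMap<>();
    map.put("sessionId", "sessionId");              // placeholder session id
    map.put("serverUrl", "https://www.baidu.com/"); // placeholder server url
    JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); // -1: no expiry
}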
comm_crawler/src/main/java/com/zzsn/crawlerOther/ArticleCrawler.java

@@ -13,80 +13,80 @@ public class ArticleCrawler {
         articleCrawler.consumer();
     }

     public void consumer(){
-        String record = "{\n" +
+        String value = "{\n" +
                 "  \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
-                "  \"id\": \"1560150270181019650\",\n" +
+                "  \"id\": \"1534713917403385858\",\n" +
-                "  \"infoSourceCode\": \"IN-20220818-0011\",\n" +
+                "  \"infoSourceCode\": \"IN-20220609-58696\",\n" +
-                "  \"webSiteName\": \"一带一路-项目周报\",\n" +
+                "  \"webSiteName\": \"中华人民共和国工业和信息化部\",\n" +
-                "  \"siteName\": \"一带一路-项目周报\",\n" +
+                "  \"siteName\": \"中华人民共和国工业和信息化部-领导活动\",\n" +
-                "  \"siteUri\": \"https://www.yidaiyilu.gov.cn/info/iList.jsp?cat_id=11432&cur_page=3\",\n" +
+                "  \"siteUri\": \"https://www.miit.gov.cn/xwdt/gxdt/ldhd/index.html\",\n" +
                 "  \"infoSourceTypeId\": \"1\",\n" +
-                "  \"siteLevel\": null,\n" +
+                "  \"siteLevel\": \"2\",\n" +
-                "  \"language\": null,\n" +
+                "  \"language\": \"zh\",\n" +
-                "  \"checkedList\": null,\n" +
+                "  \"checkedList\": \"1\",\n" +
                 "  \"hisUriExp\": null,\n" +
                 "  \"hisDateStartTime\": null,\n" +
                 "  \"hisDateEndTime\": null,\n" +
                 "  \"ynHisDataAll\": \"0\",\n" +
-                "  \"status\": null,\n" +
+                "  \"status\": \"1\",\n" +
                 "  \"listUrl\": null,\n" +
                 "  \"listExpressionType\": \"3\",\n" +
-                "  \"informationUrl\": null,\n" +
+                "  \"informationUrl\": \"\",\n" +
-                "  \"informationTitle\": \"a\",\n" +
+                "  \"informationTitle\": null,\n" +
-                "  \"informationPublishDate\": \"span\",\n" +
+                "  \"informationPublishDate\": null,\n" +
                 "  \"informationSource\": null,\n" +
-                "  \"infoBlockPosition\": \"ul[class=\\\"commonList_dot\\\"]>li\",\n" +
+                "  \"infoBlockPosition\": \"div[class=\\\"page-content\\\"]>ul>li\",\n" +
                 "  \"linkLocation\": \"a\",\n" +
-                "  \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
+                "  \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
-                "  \"crawlDepth\": null,\n" +
+                "  \"crawlDepth\": 2,\n" +
                 "  \"pageUrl\": null,\n" +
                 "  \"matchPage\": null,\n" +
                 "  \"pageStart\": 0,\n" +
                 "  \"pageEnd\": 0,\n" +
                 "  \"ynPageAll\": \"0\",\n" +
-                "  \"detailExpressionType\": \"3\",\n" +
+                "  \"detailExpressionType\": \"0\",\n" +
                 "  \"detailUrl\": null,\n" +
-                "  \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"main_content_title\\\"]</exp></title>\",\n" +
+                "  \"detailExpressionTitle\": \"<title><exp>*.h1[id=\\\"con_title\\\"]</exp></title>\",\n" +
-                "  \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"szty\\\"]>span:contains(时间)</exp></publish_date>\",\n" +
+                "  \"detailExpressionPublishDate\": \"<publish_date><exp>*.span[id=\\\"con_time\\\"]</exp></publish_date>\",\n" +
-                "  \"detailExpressionSource\": \"<origin><exp>div[class=\\\"szty\\\"]>span:contains(来源)</exp></origin>\",\n" +
+                "  \"detailExpressionSource\": null,\n" +
                 "  \"detailExpressionAuthor\": null,\n" +
                 "  \"detailExpressionSummary\": null,\n" +
-                "  \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" +
+                "  \"detailExpressionContent\": \"<content><exp>*.div[id=\\\"con_con\\\"]</exp></content>\",\n" +
-                "  \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
+                "  \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
                 "  \"ynDownload\": \"0\",\n" +
                 "  \"formUrl\": null,\n" +
                 "  \"formTitle\": null,\n" +
                 "  \"formType\": null,\n" +
                 "  \"dataFormExpression\": null,\n" +
-                "  \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" +
+                "  \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
                 "  \"dataPageUrl\": null,\n" +
                 "  \"dataPageRule\": null,\n" +
                 "  \"dataPageStart\": 0,\n" +
                 "  \"dataPageEnd\": 0,\n" +
                 "  \"ynDataPageAll\": \"0\",\n" +
                 "  \"dataType\": 0,\n" +
                 "  \"dataFormat\": 0,\n" +
                 "  \"dataStorageMode\": 0,\n" +
-                "  \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" +
+                "  \"dataStorageInfo\": \"{}\",\n" +
                 "  \"ynDynamicCrawl\": 1,\n" +
                 "  \"ynLogin\": 0,\n" +
                 "  \"domainName\": null,\n" +
                 "  \"link\": null,\n" +
                 "  \"account\": null,\n" +
                 "  \"password\": null,\n" +
                 "  \"userAgent\": null,\n" +
                 "  \"referer\": null,\n" +
                 "  \"cookies\": null,\n" +
                 "  \"headers\": null,\n" +
                 "  \"otherInfo\": null,\n" +
                 "  \"crawlType\": 1,\n" +
                 "  \"crawlName\": null,\n" +
                 "  \"crawlAddress\": null,\n" +
                 "  \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
-                "  \"cron\": \"05 23 14 1/7 * ?\",\n" +
+                "  \"cron\": \"53 40 0/4 * * ?\",\n" +
-                "  \"ynSnapshot\": \"0\"\n" +
+                "  \"ynSnapshot\": null\n" +
                 "}";
-        SiteMsgTemple siteMsgTemple = new Gson().fromJson(record, SiteMsgTemple.class);
+        SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
         ArticleCrawlerThread articleCrawlerThread = new ArticleCrawlerThread();
         articleCrawlerThread.siteMsgTemple = siteMsgTemple;
         articleCrawlerThread.crawler();
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByCss.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByRegular.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/download/HtmlPageParser.java  (new file, 0 → 100644)

package com.zzsn.download;

import org.htmlcleaner.*;
import org.lobobrowser.html.parser.DocumentBuilderImpl;
import org.lobobrowser.html.parser.InputSourceImpl;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;

public class HtmlPageParser {
    public static Document cobraParse(String url, String pageData) throws SAXException, IOException {
        StringReader sReader = new StringReader(pageData);
        return cobraParse(url, sReader);
    }

    public static Document cobraParse(String url, Reader pageData) throws SAXException, IOException {
        SimpleUserAgentContext ucontext = new SimpleUserAgentContext();
        ucontext.setScriptingEnabled(false);
        ucontext.setExternalCSSEnabled(false);
        DocumentBuilderImpl builder = new DocumentBuilderImpl(ucontext);
        Document document = builder.parse(new InputSourceImpl(pageData, url));
        return document;
    }

    public static Document xmlGetDocument(String pageBody) throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        InputStream is = new ByteArrayInputStream(pageBody.getBytes());
        Document document = builder.parse(is);
        return document;
    }

    public static Document htmlCleanerParser(String pageBody) throws XPatherException, ParserConfigurationException {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode tagNode = cleaner.clean(pageBody);
        Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        return document;
    }
}
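A usage sketch combining the cleaner-based parser with a standard javax.xml.xpath query (hypothetical snippet, not part of the commit):

Document doc = HtmlPageParser.htmlCleanerParser("<html><body><h1>标题</h1></body></html>");
XPath xpath = XPathFactory.newInstance().newXPath();
String title = (String) xpath.evaluate("//h1/text()", doc, XPathConstants.STRING);
System.out.println(title); // prints: 标题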
comm_crawler/src/main/java/com/zzsn/download/PageBuilderParser.java

 package com.zzsn.download;

-import com.zzsn.crawler.uriparser.HtmlPageParser;
-import com.zzsn.util.StringUtil;
+//import com.zzsn.crawler.uriparser.HtmlPageParser;
 import org.htmlcleaner.BaseTokenImpl;
 import org.htmlcleaner.ContentNode;
 import org.htmlcleaner.TagNode;
 import org.htmlcleaner.XPatherException;
-import org.jsoup.Jsoup;
 import org.w3c.dom.*;
 import javax.xml.xpath.*;
@@ -17,17 +13,14 @@ import java.util.List;
 public class PageBuilderParser {
-    public NodeList parserNodeList(Object doc, String path) {
-        NodeList nodeList = null;
-        try {
-            XPathFactory factory = XPathFactory.newInstance();
-            XPath xPath = factory.newXPath();
-            XPathExpression expression = xPath.compile(path);
-            nodeList = (NodeList) expression.evaluate(doc, XPathConstants.NODESET);
-        } catch (XPathExpressionException e){
-            e.printStackTrace();
-        }
+    public NodeList parserNodeList(Object doc, String path)
+            throws XPathExpressionException
+    {
+        XPathFactory factory = XPathFactory.newInstance();
+        XPath xPath = factory.newXPath();
+        XPathExpression expression = xPath.compile(path);
+        NodeList nodeList = (NodeList) expression.evaluate(doc, XPathConstants.NODESET);
         return nodeList;
     }
@@ -70,45 +63,7 @@ public class PageBuilderParser {
         }
         return object;
     }

-    public Object parserNode(Object doc, String path) throws XPathExpressionException {
-        if (path == null || path.trim().length() == 0) {
-            return null;
-        }
-        XPathFactory factory = XPathFactory.newInstance();
-        XPath xPath = factory.newXPath();
-        XPathExpression expression = xPath.compile(path);
-        Object object = null;
-        try {
-            object = expression.evaluate(doc, XPathConstants.NODE);
-            return (Node) object;
-        } catch (XPathExpressionException e) {
-            try {
-                object = expression.evaluate(doc, XPathConstants.NODESET);
-                return (NodeList) object;
-            } catch (XPathExpressionException e1) {
-                try {
-                    object = expression.evaluate(doc, XPathConstants.STRING);
-                    return (String) object;
-                } catch (XPathExpressionException e2) {
-                    try {
-                        object = expression.evaluate(doc, XPathConstants.NUMBER);
-                        return (Number) object;
-                    } catch (XPathExpressionException e3) {
-                        try {
-                            object = expression.evaluate(doc, XPathConstants.BOOLEAN);
-                            return (Boolean) object;
-                        } catch (XPathExpressionException e4) {
-                            // TODO Auto-generated catch block
-                            e4.printStackTrace();
-                        }
-                    }
-                }
-            }
-        }
-        return object;
-    }

     public String parserStrBr(Object doc, String path)
             throws XPathExpressionException
     {
@@ -406,15 +361,5 @@ public class PageBuilderParser {
     {
         return true;
     }

-    public static void main(String[] args) throws Exception {
-        PageBuilderParser pageBuilderParser = new PageBuilderParser();
-        String aa = "<content><exp>*.div[class=\"artText clearfix\"]</exp><subtraction>div[class=\"relateArt\"]</subtraction></content>";
-        Document document = HtmlPageParser.xmlGetDocument(aa);
-        String content = pageBuilderParser.parserStr(document, "//exp");
-        String subtraction = pageBuilderParser.parserStr(document, "//subtraction");
-        System.out.println(content);
-        System.out.println(subtraction);
-    }
 }
comm_crawler/src/main/java/com/zzsn/download/PageConnectioner.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/download/PageDownloader.java  (diff collapsed)
comm_crawler/src/main/java/com/zzsn/download/PageGet.java

@@ -73,6 +73,7 @@ public class PageGet {
             buffer.append("\r\n");
         }
     } catch (Exception e) {
         this.connection.disconnect();
     } finally {
         try {
@@ -133,6 +134,7 @@ public class PageGet {
             buffer.append("\r\n");
         }
     } catch (Exception e) {
     } finally {
         try {
             if (in != null) {
@@ -146,8 +148,8 @@ public class PageGet {
         this.pageStr = buffer.toString();
     }

-    private static final int MAX_SOKET_TIMEOUT = 3000;
+    private static final int MAX_SOKET_TIMEOUT = 30000;
-    private static final int MAX_CONNECTION_TIMEOUT = 3000;
+    private static final int MAX_CONNECTION_TIMEOUT = 30000;
     private static final RequestConfig REQCONFIG = RequestConfig.custom()
             .setSocketTimeout(MAX_SOKET_TIMEOUT).setConnectTimeout(MAX_CONNECTION_TIMEOUT)
             .build();

     public void httpClientGet() {
@@ -200,9 +202,9 @@ public class PageGet {
         webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
         // webClient.getOptions().setTimeout(500000);
         ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
-        if (false) {
-            webClient.getOptions().setProxyConfig(proxyConfig);
-        }
+        // if (pluginUtil.isNeedProxy()) {
+        //     webClient.getOptions().setProxyConfig(proxyConfig);
+        // }
         try {
             Page page = webClient.getPage(this.url);
             if (page instanceof HtmlPage) {
@@ -210,10 +212,6 @@ public class PageGet {
                 // webClient.waitForBackgroundJavaScript(600000);
                 this.setPageStr(htmlPage.asXml());
             }
-            // else if (page instanceof JavaScriptPage) {
-            //     JavaScriptPage scriptPage = (JavaScriptPage) page;
-            //     this.setPageStr(scriptPage.getContent());
-            // }
         } catch (Exception e) {
         }
comm_crawler/src/main/java/com/zzsn/download/PagePost.java

 package com.zzsn.download;

-import com.gargoylesoftware.htmlunit.*;
+import com.gargoylesoftware.htmlunit.BrowserVersion;
+import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
+import com.gargoylesoftware.htmlunit.Page;
+import com.gargoylesoftware.htmlunit.WebClient;
 import com.gargoylesoftware.htmlunit.html.HtmlPage;
 import org.apache.http.HttpEntity;
 import org.apache.http.NameValuePair;
@@ -179,10 +182,6 @@ public class PagePost {
         // webClient.Headers.Add("ContentLength", param.Length.ToString());
         // byte[] responseData = webClient.UploadData("https://api.weibo.com/oauth2/access_token", "POST", bytes);
-        ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
-        if (false) {
-            webClient.getOptions().setProxyConfig(proxyConfig);
-        }
         try {
             Page page = webClient.getPage(this.url);
             if (page instanceof HtmlPage) {
@@ -190,10 +189,6 @@ public class PagePost {
                 // webClient.waitForBackgroundJavaScript(600000);
                 this.setPageStr(htmlPage.asXml());
             }
-            // else if (page instanceof JavaScriptPage) {
-            //     JavaScriptPage scriptPage = (JavaScriptPage) page;
-            //     this.setPageStr(scriptPage.getContent());
-            // }
         } catch (Exception e) {
         }
comm_crawler/src/main/java/com/zzsn/download/RequestUtil.java

@@ -26,10 +26,18 @@ import org.slf4j.LoggerFactory;
 public class RequestUtil {
+    private static volatile RequestUtil instance = null; // keep instance visible to all threads
+
+    private RequestUtil(){} // private: the class must not be instantiated from outside
+
+    public static synchronized RequestUtil getInstance() { // synchronized lazy initialization
+        if (instance == null) {
+            instance = new RequestUtil();
+        }
+        return instance;
+    }
     private static Logger log = LoggerFactory.getLogger(RequestUtil.class);

     public static String getTaotiaoData(String url) throws Exception {
         HttpClientBuilder builder = HttpClients.custom();
         // see the standard UA string format for what each part means
@@ -215,12 +223,13 @@ public class RequestUtil {
         }
         return jsonObject;
     }

-    public static String httpGetRequest(String url) throws Exception {
-        String result = null;
+    public String httpGetRequest(String url) throws Exception {
+        String result = "";
         HttpClientBuilder builder = HttpClients.custom();
         // see the standard UA string format for what each part means
         builder.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)");
         CloseableHttpClient httpClient = builder.build();
         // CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = null;
         try {
@@ -232,7 +241,7 @@ public class RequestUtil {
             }
             EntityUtils.consume(resEntity);
         } catch (IOException e) {
-            // return get(url, tts++);
+            result = "";
         } finally {
             response.close();
             httpClient.close();
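With the singleton in place and httpGetRequest now an instance method, calling code changes from the old static call to something like (a sketch; httpGetRequest still declares throws Exception):

RequestUtil util = RequestUtil.getInstance();                 // lazily created, synchronized
String body = util.httpGetRequest("https://www.baidu.com/");  // returns "" on IO failure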
comm_crawler/src/main/java/com/zzsn/download/StringUtil.java
0 → 100644
浏览文件 @
a18c008d
package
com
.
zzsn
.
download
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
public
class
StringUtil
{
public
static
boolean
convertBoolean
(
String
s
,
boolean
b
)
{
if
(
s
==
null
)
{
return
b
;
}
if
(
s
.
equals
(
"0"
))
{
return
false
;
}
if
(
s
.
equals
(
"1"
))
{
return
true
;
}
return
b
;
}
public
static
String
convertBooleanToString
(
boolean
b
)
{
String
s
=
b
?
"1"
:
"0"
;
return
s
;
}
public
static
String
trimWhiteSpace
(
String
str
)
{
String
s
=
replaceBlank
(
str
);
String
ret
=
s
.
trim
();
return
ret
;
}
public
static
String
replaceBlank
(
String
str
)
{
/* String dest = "";
if (str != null) {
Pattern p = Pattern.compile("\\s*|\t|\r|\n");su
Matcher m = p.matcher(str);
dest = m.replaceAll("");
}*/
StringBuffer
buffer
=
new
StringBuffer
();
for
(
int
i
=
0
;
i
<
str
.
length
();
i
++)
{
char
c
=
str
.
charAt
(
i
);
boolean
bspace
=
Character
.
isWhitespace
(
c
);
if
(
bspace
)
{
c
=
' '
;
}
buffer
.
append
(
c
);
}
return
buffer
.
toString
();
}
//获取分隔符[和]之间的子串,如aa[abc]bbb->abc
public
static
List
<
String
>
getSubStrs
(
String
str
,
String
start
,
String
end
)
{
List
<
String
>
resultStrs
=
new
ArrayList
<
String
>();
if
(
str
==
null
||
str
.
trim
().
length
()
==
0
)
{
return
resultStrs
;
}
String
ptnstr
=
String
.
format
(
"%s([^%s%s]+)%s"
,
start
,
start
,
end
,
end
);
// String ptnstr1 = "\\[([^\\[\\]]+)\\]";
Pattern
pattern
=
Pattern
.
compile
(
ptnstr
);
Matcher
matcher
=
pattern
.
matcher
(
str
);
while
(
matcher
.
find
())
{
String
substr
=
matcher
.
group
(
1
);
resultStrs
.
add
(
substr
);
}
return
resultStrs
;
}
//fromStr:aaa123bb, origStr:aaa[xxx]bb, replaceStr:[xxx]. return:123
public
static
String
getHomologousWord
(
String
replaceStr
,
String
origStr
,
String
fromStr
)
{
String
retStr
=
null
;
int
pos
=
origStr
.
indexOf
(
replaceStr
);
if
(
pos
==
-
1
)
{
return
retStr
;
}
String
start
=
origStr
.
substring
(
0
,
pos
);
String
end
=
origStr
.
substring
(
pos
+
replaceStr
.
length
());
if
(
start
.
length
()
>
0
&&
!
fromStr
.
startsWith
(
start
))
{
return
retStr
;
}
if
(
end
.
length
()
>
0
&&
!
fromStr
.
endsWith
(
end
))
{
return
retStr
;
}
retStr
=
fromStr
.
substring
(
start
.
length
(),
fromStr
.
length
()-
end
.
length
());
return
retStr
;
}
public
static
String
trimBeginningBracket
(
String
s
)
{
String
ret
=
s
;
if
(
s
.
length
()
==
0
)
{
return
s
;
}
Map
<
Character
,
Character
>
braketPeers
=
new
HashMap
<
Character
,
Character
>();
braketPeers
.
put
(
'【'
,
'】'
);
braketPeers
.
put
(
'['
,
']'
);
braketPeers
.
put
(
'['
,
']'
);
braketPeers
.
put
(
'('
,
')'
);
braketPeers
.
put
(
'('
,
')'
);
braketPeers
.
put
(
'〔'
,
'〕'
);
String
searchStr
=
s
;
while
(
searchStr
.
length
()
>
0
)
{
char
beginc
=
searchStr
.
charAt
(
0
);
Character
value
=
braketPeers
.
get
(
beginc
);
if
(
value
==
null
)
{
break
;
}
int
endPos
=
-
1
;
for
(
int
i
=
1
;
i
<
searchStr
.
length
();
i
++)
{
if
(
searchStr
.
charAt
(
i
)
==
value
)
{
endPos
=
i
;
break
;
}
}
if
(
endPos
>=
0
)
{
ret
=
searchStr
.
substring
(
endPos
+
1
);
searchStr
=
ret
;
}
else
{
break
;
}
}
return
ret
;
}
public
static
String
trimMiddleBracket
(
String
s
)
{
String
ret
=
s
;
if
(
s
.
length
()
==
0
)
{
return
s
;
}
Map
<
Character
,
Character
>
braketPeers
=
new
HashMap
<
Character
,
Character
>();
String
[]
brakets
=
{
"】"
,
"]"
,
"]"
,
")"
,
")"
,
"〕"
};
braketPeers
.
put
(
'【'
,
'】'
);
braketPeers
.
put
(
'['
,
']'
);
braketPeers
.
put
(
'['
,
']'
);
braketPeers
.
put
(
'('
,
')'
);
braketPeers
.
put
(
'('
,
')'
);
braketPeers
.
put
(
'〔'
,
'〕'
);
String
searchStr
=
s
;
int
index
=
0
;
while
(
searchStr
.
length
()
>
0
)
{
int
startPos
=
-
1
;
Character
value
=
null
;
for
(
int
i
=
index
;
i
<
searchStr
.
length
();
i
++)
{
boolean
findLeftBraket
=
false
;
value
=
searchStr
.
charAt
(
i
);
for
(
Character
key
:
braketPeers
.
keySet
())
{
if
(
value
.
equals
(
key
))
{
startPos
=
i
;
findLeftBraket
=
true
;
break
;
}
}
if
(
findLeftBraket
)
{
break
;
}
}
int
endPos
=
-
1
;
for
(
int
i
=
startPos
+
1
;
i
<
searchStr
.
length
();
i
++)
{
if
(
null
!=
braketPeers
.
get
(
value
)
&&
searchStr
.
charAt
(
i
)
==
braketPeers
.
get
(
value
))
{
endPos
=
i
;
break
;
}
}
if
(
endPos
>=
startPos
)
{
if
(
startPos
>=
0
)
{
searchStr
=
searchStr
.
substring
(
0
,
startPos
)
+
searchStr
.
substring
(
endPos
+
1
,
searchStr
.
length
());
}
}
else
{
searchStr
=
searchStr
.
replace
(
value
.
toString
(),
""
);
index
=
startPos
;
}
if
(
startPos
<
0
)
{
ret
=
searchStr
;
break
;
}
}
for
(
String
bs
:
brakets
)
{
ret
=
ret
.
replace
(
bs
.
toString
(),
""
);
}
return
ret
;
}
    public static String trimEnddingBracket(String s) {
        String ret = s;
        if (s.length() == 0) {
            return s;
        }
        Map<Character, Character> braketPeers = new HashMap<Character, Character>();
        braketPeers.put('】', '【');
        braketPeers.put(']', '[');
        braketPeers.put(')', '(');
        braketPeers.put(')', '(');   // duplicate in the source; likely a full-width variant flattened by encoding
        braketPeers.put('〕', '〔');
        int endPos = s.length() - 1;
        String searchStr = s;
        while (endPos >= 0) {
            char endc = searchStr.charAt(endPos);
            Character value = braketPeers.get(endc);
            if (value == null) {
                break;
            }
            int startPos = -1;
            for (int i = searchStr.length() - 2; i >= 0; i--) {
                if (searchStr.charAt(i) == value) {
                    startPos = i;
                    break;
                }
            }
            if (startPos >= 0) {
                ret = searchStr.substring(0, startPos);
                searchStr = ret;
            }
            endPos = startPos - 1;
        }
        return ret;
    }
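For comparison, hypothetical inputs for the other two trims:

    // removes a matched pair anywhere in the string
    System.out.println(trimMiddleBracket("通知(2022)第1号"));  // -> 通知第1号
    // walks backwards from the tail, dropping trailing bracketed suffixes
    System.out.println(trimEnddingBracket("今日要闻(组图)")); // -> 今日要闻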
// public static String delSymbolAndPunc(String s)
// {
// StringBuffer buffer = new StringBuffer();
// s = replaceBlank(s);
// for (int i = 0; i < s.length(); i ++)
// {
// char c = s.charAt(i);
// if (c == ' ' || CharUtil.isSymbol(c) || CharUtil.isPunctuation(c) || Integer.toHexString(c).equals("a0"))
// {
// continue;
// }
// buffer.append(c);
// }
// return buffer.toString();
// }
    public static String delCharNotChinese(String s) {
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (isChinese(c)) {
                buffer.append(c);
            }
        }
        return buffer.toString();
    }
    public static boolean isChinese(char c) {
        // CJK Unified Ideographs range
        return c >= 0x4e00 && c <= 0x9fa5;
    }
    public static String toBanjiao(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c >= 65281 && c <= 65374) {
                // full-width form -> half-width form
                c = (char) (c - 65248);
            } else if (c == 12288) {
                // ideographic (full-width) space
                c = (char) 32;
            }
            buffer.append(c);
        }
        return buffer.toString();
    }
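A small sketch (hypothetical input) of the full-width-to-half-width conversion next to the Chinese-only filter above:

    String mixed = "ABC 123,测试";            // full-width letters, digits, comma, ideographic space
    System.out.println(toBanjiao(mixed));         // -> "ABC 123,测试"
    System.out.println(delCharNotChinese(mixed)); // -> "测试"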
    public static String listToString(List<String> arr) {
        StringBuffer buffer = new StringBuffer();
        if (arr == null) {
            return buffer.toString();
        }
        for (int i = 0; i < arr.size(); i++) {
            buffer.append(arr.get(i));
            if (i != arr.size() - 1) {
                buffer.append(";");
            }
        }
        return buffer.toString();
    }
    public static List<String> stringToList(String str) {
        List<String> strs = new ArrayList<String>();
        if (str == null) {
            return strs;
        }
        String[] ss = str.split(";");
        for (String s : ss) {
            if (s.trim().length() == 0) {
                continue;
            }
            strs.add(s);
        }
        return strs;
    }
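listToString and stringToList are built to round-trip through the same ";" delimiter; a sketch with hypothetical values:

    List<String> parts = java.util.Arrays.asList("a", "b", "c");
    String joined = listToString(parts);      // -> "a;b;c"
    List<String> back = stringToList(joined); // -> [a, b, c] (blank segments are dropped)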
    public static String normalizeHtmlTransf(String s) {
        // normalize HTML entities and typographic characters to plain text
        // (entity spellings restored here; the diff view had rendered them as literal glyphs)
        String ret = s.replaceAll("•", "·");
        ret = ret.replaceAll("&middot;", "·");
        ret = ret.replaceAll("&nbsp;", " ");
        ret = ret.replaceAll("&quot;", "\"");
        ret = ret.replaceAll("&amp;", "&");
        ret = ret.replace('・', '·');
        ret = ret.replace("“", "\"");
        ret = ret.replace("”", "\"");
        ret = ret.replace("…", "...");
        ret = ret.replace("&lt;", "<");
        ret = ret.replace("&gt;", ">");
        ret = ret.replace("&mdash;", "—");
        ret = ret.replace("&ndash;", "–");
        ret = ret.replace("˜", "~");
        ret = ret.replace("‘", "'");
        ret = ret.replace("’", "'");
        ret = ret.replace("‚", ",");
        ret = ret.replace("&lsaquo;", "‹");
        ret = ret.replace("&rsaquo;", "›");
        ret = ret.replace("&hellip;", "…");
        ret = ret.replace("|", " ");
        return ret;
    }
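A sketch of the normalization on a hypothetical raw snippet (assuming the entity spellings restored above):

    String raw = "Tom&nbsp;&amp;&nbsp;Jerry&nbsp;&mdash;&nbsp;&quot;best&quot;";
    System.out.println(normalizeHtmlTransf(raw)); // -> Tom & Jerry — "best"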
    public static String normalizeSegTransf(String s) {
        String ret = s.replaceAll("\r\n;", " ");
        ret = ret.replace("\n", "");
        ret = ret.replace("|", " ");
        return ret;
    }
}
comm_crawler/src/main/java/com/zzsn/download/Test.java (new file)

package com.zzsn.download;

public class Test {
    public static void main(String[] args) {
        PageDownloader downloader = new PageDownloader();
        String rankUrl = "https://www.baidu.com/";
        String encoding = "utf-8";
        String pageBody = downloader.downloadWithStr(rankUrl, encoding, false, false);
        System.out.println(pageBody);
    }
}
comm_crawler/src/main/java/com/zzsn/entity/BadSiteMsg.java

@@ -2,6 +2,8 @@ package com.zzsn.entity;
 import lombok.Data;
+import java.util.Date;
 @Data
 public class BadSiteMsg {
@@ -9,16 +11,10 @@ public class BadSiteMsg {
     private String id;
     /** information-source code */
     private String infoSourceCode;
-    /** information-source name */
-    private String webSiteName;
-    /** column name */
-    private String siteName;
-    /** column URL */
-    private String siteUri;
-    /** error type */
-    private String errorType;
-    /** problem type (1: source error, 2: crawl-category misconfiguration) */
-    private String problemType;
-    /** crawler type (0: static, 1: dynamic) */
+    /** crawler category (1: dynamic, 2: static, 3: Fortune 500, 4: think tank, 5: Baidu) */
     private String crawlerType;
+    /** partition id(s), comma-separated */
+    private String partition;
+    /** consume time */
+    private Date consumerDate;
 }
comm_crawler/src/main/java/com/zzsn/entity/BadSiteMsgBak.java (new file)

package com.zzsn.entity;

import lombok.Data;

@Data
public class BadSiteMsgBak {
    /** primary key */
    private String id;
    /** information-source code */
    private String infoSourceCode;
    /** information-source name */
    private String webSiteName;
    /** column name */
    private String siteName;
    /** column URL */
    private String siteUri;
    /** error type */
    private String errorType;
    /** problem type (1: source error, 2: crawl-category misconfiguration) */
    private String problemType;
    /** crawler type (0: static, 1: dynamic) */
    private String crawlerType;
}
comm_crawler/src/main/java/com/zzsn/generation/Constants.java

@@ -165,7 +165,8 @@ public class Constants {
     // rate threshold for duplicate detection
     public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE"));
+    public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL");
     public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE"));
     // relevance-filter URL (XGBoost) for the state-asset supervision evaluation center
     public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL");
     // public static final boolean CRAL_TYPE = Integer.valueOf(prop.getProperty("CRAW_TYPE")).equals(1);
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java

@@ -18,7 +18,6 @@ public class JedisUtil {
     private static final String PREFIX = "comm_";
     private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
     private static JedisPool jedisPool = null;
     private JedisUtil() {
     }
@@ -127,7 +126,13 @@ public class JedisUtil {
         }
         getDefaultJedis().del(PREFIX + key);
     }
+    public static void delString(String key) throws Exception {
+        if (StringUtils.isEmpty(key)) {
+            logger.error("key is null");
+            throw new Exception("key is null");
+        }
+        getDefaultJedis().del(key);
+    }
     public static void setString(String key, String value, int expireTime) throws Exception {
         Jedis jedis = null;
         try {
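The new delString deletes exactly the key it is given, unlike the pre-existing delete path, which prepends the class-level PREFIX ("comm_"). A hedged usage sketch for a key written without the prefix:

    // removes the raw key; del(...) above would have looked up "comm_" + key instead
    JedisUtil.delString(Constants.SELENIUM_DRIVER_CACHE);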
comm_crawler/src/main/java/com/zzsn/test/HttpClientTester.java

@@ -43,7 +43,7 @@ public class HttpClientTester {
     private static PageBuilderParser builderParser = null;
     public static void main(String[] args) {
         // get("https://edition.cnn.com/world");
-        String html = HttpgetUtil.getHtml("https://edition.cnn.com/world");
+        String html = HttpgetUtil.getHtml("http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1");
         System.out.println(html);
         // post();
     }
comm_crawler/src/main/java/com/zzsn/test/WebClientTest.java

@@ -8,7 +8,7 @@ import java.net.MalformedURLException;
 public class WebClientTest {
     public static void main(String[] args) throws Exception {
-        String url = "http://www.sgcc.com.cn/html/sgcc_main/col2017021879/column_2017021879_1.shtml";
+        String url = "http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1";
         String charset = "utf-8";
         String s = downloadByWebClient(url, charset);
         System.out.println(s.length());
comm_crawler/src/main/java/com/zzsn/test/test.java

-package com.test;
+package com.zzsn.test;
+import com.zzsn.download.RequestUtil;
 import java.io.*;
 import java.net.HttpURLConnection;
@@ -9,6 +11,12 @@ public class test {
         String urlStr = "http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
         String fileName = "testImg.png";
         downLoadByUrl(urlStr, fileName);
+        try {
+            RequestUtil requestUtil = RequestUtil.getInstance();
+            String body = requestUtil.httpGetRequest(urlStr);
+        } catch (Exception e) {
+            // e.printStackTrace();
+        }
     }
     /**
      * Download a file from a network URL
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java

@@ -11,6 +11,7 @@ import org.openqa.selenium.chrome.ChromeOptions;
 import org.openqa.selenium.remote.HttpCommandExecutor;
 import java.net.URL;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
@@ -33,16 +34,23 @@ public class DriverUtil {
         System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
         ChromeDriverService service = ChromeDriverService.createDefaultService();
         ChromeOptions options = new ChromeOptions();
+        // browser binary location
+        // options.setBinary(Constants.CHROMEBIN);
         // incognito mode
         options.addArguments("--incognito");
         // disable the sandbox
-        options.addArguments("no-sandbox");
+        options.addArguments("--no-sandbox");
         // disable the GPU
         options.addArguments("--disable-gpu");
         // headless mode (launching headless breaks driver communication here)
         // options.addArguments("--headless");
         // disable extensions
         options.addArguments("disable-extensions");
+        // hide the "Chrome is being controlled by automated test software" banner
+        // options.addArguments("--disable-infobars", "--disable-blink-features=AutomationControlled");
+        // options.addArguments("--remote-debugging-port=9222");
+        // options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
         // initialize a fresh Chrome browser instance
         return new ChromeDriver(service, options);
     }
@@ -65,8 +73,8 @@ public class DriverUtil {
         // }
         ReuseWebDriver driver = null;
         try {
             driver = new ReuseWebDriver(serverUrl, sessionId);
-            System.out.println(driver.connectTestFail());
+            // log.info("驱动连接失败:" + driver.connectTestFail());
             if (driver.connectTestFail()) {
                 // if the driver returns an error code, recreate the driver service and cache it
                 ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
@@ -81,11 +89,11 @@ public class DriverUtil {
                 map.put("serverUrl", serverUrl);
                 // cache the browser-driver info
                 JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
-                System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
+                log.info("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
                 driver = new ReuseWebDriver(serverUrl, sessionId);
             }
         } catch (Exception e) {
-            System.out.println("出现异常");
+            log.info("获取驱动driver出现异常");
             // if the driver returns an error code, recreate the driver service and cache it
             ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
             serverUrl = DriverUtil.getServerUrl(chromeDriver);
@@ -99,7 +107,6 @@ public class DriverUtil {
             map.put("serverUrl", serverUrl);
             // cache the browser-driver info
             JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
-            System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
             driver = new ReuseWebDriver(serverUrl, sessionId);
         }
         return driver;
@@ -114,11 +121,11 @@ public class DriverUtil {
         Map<String, String> map = getSessionInfo();
         String sessionId = map.get("sessionId");
         String serverUrl = map.get("serverUrl");
+        log.info("从redis中获取保存的sessionId:" + sessionId);
         return connectChrome(sessionId, serverUrl);
     }
     public static Map<String, String> getSessionInfo() throws Exception {
         String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
-        System.out.println("获取驱动session:" + Constants.SELENIUM_DRIVER_CACHE + "::" + cacheInfo);
         Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
         if (map == null || map.size() < 1) {
             map = new HashMap<>(2);
@@ -126,7 +133,6 @@ public class DriverUtil {
             map.put("serverUrl", "https://www.baidu.com/");
             // cache the browser-driver info
             JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
-            System.out.println("获取驱动session失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
         }
         return map;
     }
comm_crawler/src/main/java/com/zzsn/util/PublishDateUtil.java

@@ -160,7 +160,9 @@ public class PublishDateUtil {
         {
             // return formatUSDate(raw);
-            return raw;
+            // return raw;
+            // return an empty string when the date cannot be parsed
+            return "";
         }
     }
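Since the fallback now returns an empty string instead of echoing the raw value, call sites presumably need a guard; a minimal sketch (formatDate is a hypothetical entry-point name, not taken from the commit):

    String publishDate = PublishDateUtil.formatDate(raw); // hypothetical entry point
    if (publishDate == null || publishDate.isEmpty()) {
        // unparseable date: skip the record, or fall back to the crawl time
        publishDate = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new java.util.Date());
    }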
comm_crawler/src/main/resources/aa.txt (large diff, collapsed in the review view)
comm_crawler/src/main/resources/application.properties

@@ -58,9 +58,9 @@ kafka.consumer.task=0 0/2 * * * ?
 kafka.producer.servers=114.115.159.144:9092
 #kafka.producer.servers=39.101.72.117:19092
 kafka.producer.retries=0
-kafka.producer.batch.size=4096
+kafka.producer.batch.size=409600
 kafka.producer.linger=1
-kafka.producer.buffer.memory=40960
+kafka.producer.buffer.memory=409600
 spring.activemq.broker-url=tcp://127.0.0.1:61616
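The commit raises kafka.producer.batch.size and kafka.producer.buffer.memory to 409600 bytes each; buffer.memory caps the producer's total buffering, so it should stay at least as large as batch.size. A minimal sketch (standard kafka-clients API, String serializers assumed) of wiring these values into a producer:

    // requires java.util.Properties and org.apache.kafka.clients.producer.KafkaProducer
    Properties props = new Properties();
    props.put("bootstrap.servers", "114.115.159.144:9092");
    props.put("retries", 0);
    props.put("batch.size", 409600);    // per-partition batch buffer, in bytes
    props.put("linger.ms", 1);
    props.put("buffer.memory", 409600); // total producer memory, in bytes
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    KafkaProducer<String, String> producer = new KafkaProducer<>(props);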
comm_crawler/src/main/resources/constants.properties

@@ -25,7 +25,6 @@ SUBJECT_MEMCACHED_DAYS=0
 JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
 JWYQJC_MEMCACHED_DAYS=10
 TITLE_SIMILARITY_RATE=0.8
-MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
 CACHE_UPDATE=1
@@ -36,7 +35,7 @@ PROXYID=1
 THREAD_SIZE=1
 #
 CHROMEDRIVE=E:\\chrome\\chromedriver.exe
-CHROMEBIN=C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe
+CHROMEBIN=C:\Users\WIN10\AppData\Local\Google\Chrome\Application\chrome.exe
 USER_DATA_DIR=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
 #mysql connection
@@ -52,7 +51,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
 #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
 KAFKA_CONSUMER_TOPIC=clb-infosource-handler-dynamin
 #
-KAFKA_CONSUMER_GROUP_ID=test-zs1
+KAFKA_CONSUMER_GROUP_ID=test1
 #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
 KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
 KAFKA_PRODUCT_TOPIC=crawlerInfo
@@ -72,15 +71,15 @@ KAFKA_PRODUCT_PARTITION=0
 #redis.host=114.116.26.150
 #redis.port=6379
 #redis.pass=zzsn9988
-#redis.host=114.115.236.206
-#redis.port=6379
-#redis.pass=clbzzsn
+redis.host=114.115.236.206
+redis.port=6379
+redis.pass=clbzzsn
 #redis.host=8.130.30.33
 #redis.port=9010
 #redis.pass=wxadS&jklim
-redis.host=127.0.0.1
-redis.port=6379
-redis.pass=xxxxxx
+#redis.host=127.0.0.1
+#redis.port=6379
+#redis.pass=xxxxxx
 redis.timeout=10000
 redis.maxIdle=300
 redis.maxTotal=600
@@ -100,6 +99,8 @@ IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\
 selenium.driver.cache=selenium_driver_cache_loc112
+# redis key for the crawl cache
+MODEL_SCORE_URL=dy-1
comm_crawler/src/main/resources/redis.properties

 # Redis settings
-#redis.host=114.115.236.206
-#redis.port=6379
-#redis.pass=clbzzsn
+redis.host=114.115.236.206
+redis.port=6379
+redis.pass=clbzzsn
-redis.host=127.0.0.1
-redis.port=6379
-redis.pass=xxxxxx
+#redis.host=127.0.0.1
+#redis.port=6379
+#redis.pass=xxxxxx
 redis.timeout=10000
 redis.maxIdle=300