提交 a18c008d 作者: liuweigang

采集代码更新9

上级 83f00b0f
...@@ -253,7 +253,7 @@ ...@@ -253,7 +253,7 @@
</dependency> </dependency>
<!--WebMagic 爬虫框架--> <!--WebMagic 爬虫框架-->
<dependency> <!--<dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.7.5</version> <version>0.7.5</version>
...@@ -274,7 +274,7 @@ ...@@ -274,7 +274,7 @@
<artifactId>slf4j-log4j12</artifactId> <artifactId>slf4j-log4j12</artifactId>
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>-->
<!-- 下面引用本地包,webMgic源码包修过后--> <!-- 下面引用本地包,webMgic源码包修过后-->
<!-- <dependency>--> <!-- <dependency>-->
<!-- <groupId>us.codecraft</groupId>--> <!-- <groupId>us.codecraft</groupId>-->
...@@ -307,11 +307,11 @@ ...@@ -307,11 +307,11 @@
<!-- </exclusions>--> <!-- </exclusions>-->
<!-- </dependency>--> <!-- </dependency>-->
<!--&lt;!&ndash; &lt;!&ndash; 上面引用本地包,webMgic源码包修过后&ndash;&gt;&ndash;&gt;--> <!--&lt;!&ndash; &lt;!&ndash; 上面引用本地包,webMgic源码包修过后&ndash;&gt;&ndash;&gt;-->
<dependency> <!--<dependency>
<groupId>com.github.detro</groupId> <groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId> <artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version> <version>1.2.0</version>
</dependency> </dependency>-->
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
......
...@@ -8,7 +8,6 @@ import com.zzsn.search.MetaBaiduSearchThread; ...@@ -8,7 +8,6 @@ import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg; import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SpringContextUtil; import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants; import com.zzsn.utility.index.Constants;
import com.zzsn.webMagic.*;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs; import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerConfig;
...@@ -35,39 +34,36 @@ import java.util.Properties; ...@@ -35,39 +34,36 @@ import java.util.Properties;
@Slf4j @Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplication extends SpringBootServletInitializer { public class CrawlerMateSearchApplication extends SpringBootServletInitializer{
@Override // @Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) { // protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerMateSearchApplication.class); // return builder.sources(CrawlerMateSearchApplication.class);
} // }
public static void main(String[] args) { public static void main(String[] args) {
SpringApplication.run(CrawlerMateSearchApplication.class,args); SpringApplication.run(CrawlerMateSearchApplication.class,args);
} }
/** // @Override
* 采用webMagic框架爬取 // public void run(String... args) throws Exception {
*/ //// try {
public void webMagic(){ //// consumerPartition();
new LinksReadThread().start(); //// }catch (Exception e){
new BaiduContentThread().start(); //// consumerPartition();
//// }
} //// try {
//// loadSiteMsgLoc();
//// }catch (Exception e){
//// loadSiteMsgLoc();
//// }
//// try {
//// consumerPartition();
//// }catch (Exception e){
//// consumerPartition();
//// }
// }
/**
* 老方法抓取
* @throws Exception
*/
public void ordinary() {
System.out.println("——————++++++++++++——————===");
try {
consumerPartition();
} catch (Exception e) {
consumerPartition();
}
loadSiteMsgLoc();
}
public void consumerPartition (){ public void consumerPartition (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
......
package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Backup bootstrap class for the Baidu meta-search crawler.
 * <p>
 * Exposes several manually invokable entry points: Kafka-driven keyword
 * consumption ({@link #consumerPartition()}), file-driven keyword loading
 * ({@link #loadSiteMsgLoc()}), and a hard-coded single-keyword smoke test
 * ({@link #loadloc()}). The {@code CommandLineRunner} hook is currently a
 * no-op — none of the jobs start automatically.
 */
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplicationbak extends SpringBootServletInitializer implements CommandLineRunner {

    public static void main(String[] args) {
        SpringApplication.run(CrawlerMateSearchApplicationbak.class, args);
    }

    /**
     * Startup hook. Intentionally empty: the previous calls to
     * consumerPartition()/loadSiteMsgLoc() were disabled; invoke them
     * manually when this backup class is used.
     */
    @Override
    public void run(String... args) throws Exception {
    }

    /**
     * Consumes keyword messages from the explicitly assigned Kafka partitions
     * (configured in {@code Constants.KAFKA_CONSUMER_PARTITION}) and crawls
     * each keyword synchronously. Runs forever; exit via consumer.wakeup()
     * from another thread.
     */
    public void consumerPartition() {
        log.info("定时获取mq消息");
        // 1. Create the consumer and assign it the configured partitions
        //    (assign, not subscribe: partition list is fixed by config).
        KafkaConsumer<String, String> consumer = createConsumer();
        ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
        String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
        String[] partitions = kafkaConsumerPartition.split(",");
        for (int i = 0; i < partitions.length; i++) {
            topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
        }
        consumer.assign(topicPartitions);
        while (true) {
            // The consumer is a long-running poll loop; consumer.wakeup() from
            // another thread breaks out of it.
            // NOTE(review): poll(0) returns immediately and busy-spins the CPU
            // when the topic is idle — consider a positive poll timeout.
            ConsumerRecords<String, String> records = consumer.poll(0);
            for (ConsumerRecord record : records) {
                try {
                    KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
                    log.info("关键词解析keywordMsg正常");
                    consumer.commitSync();
                    // Crawl synchronously so only one keyword is in flight per poll
                    // (MAX_POLL_RECORDS_CONFIG is 1 in createConsumer()).
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                    log.info("关键词请求结束++++");
                } catch (Exception e) {
                    // BUG FIX: the original dropped the exception entirely; include
                    // the throwable so parse/crawl failures are diagnosable.
                    log.info("关键词解析异常: " + record.value().toString(), e);
                }
            }
        }
    }

    /**
     * Loads keyword codes from the local file configured in
     * {@code Constants.META_SEARCH_KEYWORDPATH}, resolves each one to a
     * KeywordMsg JSON payload in Redis, and crawls it. Bad entries are
     * skipped so one malformed keyword cannot stop the batch.
     */
    public void loadSiteMsgLoc() {
        String filepath = Constants.META_SEARCH_KEYWORDPATH;
        System.out.println(filepath);
        try {
            File f = new File(filepath);
            List<String> allLines = FileUtil.getFileLines(f, "utf-8");
            System.out.println(allLines.size());
            for (String keysite : allLines) {
                try {
                    String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::" + keysite);
                    System.out.println("——————++++++++++++——————===");
                    // Strip the startTime..searchEngines fragment from the cached JSON
                    // before deserializing — assumes both markers are always present;
                    // if either is missing, indexOf(-1) throws and the entry is skipped.
                    String subvalue = value.replace(value.substring(value.indexOf("startTime"), value.indexOf("searchEngines")), "");
                    KeywordMsg keywordMsg = new Gson().fromJson(subvalue, KeywordMsg.class);
                    log.info("关键词解析keywordMsg正常");
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                } catch (Exception e) {
                    // Best-effort: a bad entry must not abort the remaining keywords.
                    continue;
                }
            }
        } catch (Exception e) {
            // BUG FIX: the original called e.getMessage() and discarded the result,
            // silently swallowing file-level failures. Log them instead.
            log.error("loadSiteMsgLoc failed for file " + filepath, e);
        }
    }

    /**
     * Crawls a single hard-coded keyword message; local smoke test only.
     */
    public void loadloc() {
        String key = "{\n" +
                " \"id\": \"2022090522\",\n" +
                " \"wordsCode\": \"KW-20220602-0003\",\n" +
                " \"wordsName\": \"2022世界机器人大会\",\n" +
                " \"keyWord\": \"2022世界机器人大会\",\n" +
                " \"exclusionWord\": null,\n" +
                " \"status\": \"1\",\n" +
                " \"subjectId\": null,\n" +
                " \"subjectIds\": null,\n" +
                " \"startTime\": null,\n" +
                " \"endTime\": null \n" +
                "}";
        try {
            KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
            MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
            metaSearchThread.keywordMsg = keywordMsg;
            metaSearchThread.crawler();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a KafkaConsumer configured from {@code Constants}: string
     * key/value deserialization, auto-commit enabled, one record per poll.
     */
    private static KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
        // Offset reset policy when no committed offset exists: latest / earliest.
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // Allow up to 1h between polls before the broker evicts the consumer —
        // crawling a single record can take a long time.
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }
}
...@@ -87,56 +87,56 @@ public class ThreadExecutorConfig { ...@@ -87,56 +87,56 @@ public class ThreadExecutorConfig {
} }
@Bean(value = "asyncexecutorService") // @Bean(value = "asyncexecutorService")
public Executor executorService() { // public Executor executorService() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量 // executor.setCorePoolSize(1);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量 // executor.setMaxPoolSize(1);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
@Bean(value = "asyncexecutorServiceWebBaidu") // @Bean(value = "asyncexecutorServiceWebBaidu")
public Executor executorServiceWebBaidu() { // public Executor executorServiceWebBaidu() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//线程池维护线程的最少数量 // executor.setCorePoolSize(5);//线程池维护线程的最少数量
executor.setMaxPoolSize(5);//线程池维护线程的最大数量 // executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
@Bean(value = "asyncexecutorServiceDetailUrl") // @Bean(value = "asyncexecutorServiceDetailUrl")
public Executor asyncexecutorServiceDetailUrl() { // public Executor asyncexecutorServiceDetailUrl() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//线程池维护线程的最少数量 // executor.setCorePoolSize(5);//线程池维护线程的最少数量
executor.setMaxPoolSize(5);//线程池维护线程的最大数量 // executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
} }
\ No newline at end of file
package com.zzsn.entity;
import lombok.Data;
import java.util.Date;
/**
 * Message describing a failing ("bad") crawl source; serialized to JSON and
 * published to Kafka so failures can be monitored downstream.
 */
@Data
public class BadSiteMsg {
/** Primary key. */
private String id;
/** Information-source code. */
private String infoSourceCode;
/** Crawler type (1: dynamic, 2: static, 3: Fortune-500, 4: think tank, 5: Baidu). */
private String crawlerType;
/** Partition id(s); multiple values separated by commas. */
private String partition;
/** Time the message was consumed. */
private Date consumerDate;
}
...@@ -52,8 +52,10 @@ public class KafkaConsumerJob { ...@@ -52,8 +52,10 @@ public class KafkaConsumerJob {
public static final ExecutorService poolExecuter = new BlockThreadPoolExecute(5 public static final ExecutorService poolExecuter = new BlockThreadPoolExecute(5
, 10 , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); , 10 , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
/**fixedDelay:上一次执行完毕时间点之后一分钟再执行*/
// @Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") @Scheduled(fixedDelay=60000)
// @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
...@@ -67,10 +69,14 @@ public class KafkaConsumerJob { ...@@ -67,10 +69,14 @@ public class KafkaConsumerJob {
ConsumerRecords<String, String> records = consumer.poll(0); ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync(); consumer.commitSync();
for(ConsumerRecord record : records){ for(ConsumerRecord record : records){
try {
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class); KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
MetaBaiduSearchThread metaSearchThread=new MetaBaiduSearchThread(); MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg=keywordMsg; metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler(); metaSearchThread.crawler();
}catch (Exception e){
continue;
}
} }
} }
}catch (Exception e){ }catch (Exception e){
...@@ -81,8 +87,9 @@ public class KafkaConsumerJob { ...@@ -81,8 +87,9 @@ public class KafkaConsumerJob {
} }
// @Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") @Scheduled(fixedDelay=300000)
public void consumerPartition (){ public void consumerPartition (){
try {
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
...@@ -94,24 +101,26 @@ public class KafkaConsumerJob { ...@@ -94,24 +101,26 @@ public class KafkaConsumerJob {
} }
consumer.assign(topicPartitions); consumer.assign(topicPartitions);
try{ try {
while(true){ while (true) {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环 //消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0); ConsumerRecords<String, String> records = consumer.poll(0);
for(ConsumerRecord record : records){ for (ConsumerRecord record : records) {
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class); KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
consumer.commitSync(); consumer.commitSync();
MetaBaiduSearchThread metaSearchThread=new MetaBaiduSearchThread(); MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg=keywordMsg; metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler(); metaSearchThread.crawler();
} }
} }
} catch (Exception e) {
// consumer = createConsumer();
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}catch (Exception e){ }catch (Exception e){
consumer = createConsumer(); log.info("kafka调用信息失败");
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
} }
} }
// @Scheduled(initialDelay = 1000, fixedRate = Long.MAX_VALUE) // @Scheduled(initialDelay = 1000, fixedRate = Long.MAX_VALUE)
@Async("asyncTaskExecutor") @Async("asyncTaskExecutor")
......
...@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; ...@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil; import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils; import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo; import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.BadSiteMsg;
import com.zzsn.entity.Site; import com.zzsn.entity.Site;
import com.zzsn.entity.SiteTemplate; import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag; import com.zzsn.paser.SourceTemplateByTag;
...@@ -84,6 +85,8 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -84,6 +85,8 @@ public class MetaBaiduSearchThread implements Runnable {
// @Async("asyncexecutorServiceWebBaidu") // @Async("asyncexecutorServiceWebBaidu")
public void crawler(){ public void crawler(){
// sentBadSiteMsg(keywordMsg,Constants.CRAWLER_SERVER,Constants.KAFKA_CONSUMER_PARTITION);
//对传进来的关键词组进行组合 //对传进来的关键词组进行组合
String keyWord = keywordMsg.getKeyWord(); String keyWord = keywordMsg.getKeyWord();
List<String> keyWords = SplitKeyword.transForm(keyWord); List<String> keyWords = SplitKeyword.transForm(keyWord);
...@@ -153,6 +156,24 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -153,6 +156,24 @@ public class MetaBaiduSearchThread implements Runnable {
} }
} }
public void sentBadSiteMsg(KeywordMsg keymsg,String crawlerType,String partition){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(keymsg.getId());
badSiteMsg.setInfoSourceCode(keymsg.getWordsCode());
badSiteMsg.setConsumerDate(new Date());
badSiteMsg.setCrawlerType(crawlerType);
badSiteMsg.setPartition(partition);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("crawler_consumer", docjson);
}catch (Exception e){
}
}
private String locateCharSet(String url) { private String locateCharSet(String url) {
String encoding = "gbk"; String encoding = "gbk";
try { try {
......
...@@ -78,7 +78,7 @@ public class DetailBaiduSearchThread implements Runnable { ...@@ -78,7 +78,7 @@ public class DetailBaiduSearchThread implements Runnable {
@Async("asyncexecutorServiceDetailUrl") // @Async("asyncexecutorServiceDetailUrl")
public void crawler(){ public void crawler(){
try { try {
......
...@@ -82,7 +82,7 @@ public class WebBaiduSearchThread implements Runnable { ...@@ -82,7 +82,7 @@ public class WebBaiduSearchThread implements Runnable {
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Async("asyncexecutorServiceWebBaidu") // @Async("asyncexecutorServiceWebBaidu")
public void crawler(){ public void crawler(){
//对传进来的关键词组进行组合 //对传进来的关键词组进行组合
String keyWord = keywordMsg.getKeyWord(); String keyWord = keywordMsg.getKeyWord();
......
...@@ -113,12 +113,13 @@ public class Constants { ...@@ -113,12 +113,13 @@ public class Constants {
public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION"); public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION");
public static final String KAFKA_PRODUCT_PARTITION= prop.getProperty("KAFKA_PRODUCT_PARTITION"); public static final String KAFKA_PRODUCT_PARTITION= prop.getProperty("KAFKA_PRODUCT_PARTITION");
public static final String CRAWLER_SERVER= prop.getProperty("crawler_server");
public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
// public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC"); //
// public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC");
public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize")); //
// public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize"));
public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger")); //
// public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger"));
} }
package com.zzsn.utils;
import com.zzsn.webMagic.KafkaConsumers;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.proxy.Proxy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * HTTP helpers for proxy-based crawling: proxy liveness checks, proxied GET
 * requests, and request-list partitioning.
 */
public class GrabUtil {
    private final static Logger logger = LoggerFactory.getLogger(GrabUtil.class);

    /**
     * Checks whether a proxy is usable by fetching a fixed test page through it.
     *
     * @param proxyx proxy host/port/credentials to validate
     * @return true when the request through the proxy returns HTTP 200
     */
    public static boolean checkProxyIp(Proxy proxyx) {
        // HttpGet httpGet = new HttpGet("http://www.baidu.com");
        HttpGet httpGet = new HttpGet("https://www.163.com/dy/article/GDT03TFD05158K7T.html");
        // Proxy endpoint plus connect / read / pool-acquire timeouts (2s each).
        HttpHost proxy = new HttpHost(proxyx.getHost(), proxyx.getPort());
        CredentialsProvider provider = new BasicCredentialsProvider();
        // Username/password-authenticated proxy.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(proxyx.getUsername(), proxyx.getPassword()));
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .setConnectTimeout(2000)
                .setSocketTimeout(2000)
                .setConnectionRequestTimeout(2000)
                .build();
        httpGet.setConfig(requestConfig);
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            // BUG FIX: the original dereferenced response.getStatusLine() BEFORE its
            // "response != null" check, making that check dead code. execute() either
            // returns a response or throws, so no null check is needed here.
            int statusCode = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity(); // returned body entity
            if (entity != null) {
                System.out.println("网页内容为:" + EntityUtils.toString(entity, "utf-8"));
            }
            if (statusCode == 200) {
                return true;
            }
        } catch (Exception e) {
            e.printStackTrace();
            // logger.error("校验代理ip是否可用出错,错误信息:",e);
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpClient != null) {
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return false;
    }

    /**
     * Performs an HTTP GET through the given proxy with browser-like headers.
     *
     * @param url     target URL
     * @param myproxy proxy to route through (with credentials)
     * @return the response body decoded as UTF-8 on HTTP 200, otherwise null
     */
    public static String httpGet(String url, Proxy myproxy) {
        HttpGet httpGet = new HttpGet(url);
        // Proxy endpoint plus connect / read / pool-acquire timeouts (5s each).
        HttpHost proxy = new HttpHost(myproxy.getHost(), myproxy.getPort());
        CredentialsProvider provider = new BasicCredentialsProvider();
        // Username/password-authenticated proxy.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(myproxy.getUsername(), myproxy.getPassword()));
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .setConnectionRequestTimeout(5000)
                .build();
        httpGet.setConfig(requestConfig);
        // Browser-like request headers to avoid trivial bot detection.
        httpGet.setHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                logger.info(" page sucess by [" + proxy.getHostName() + "]," + url + " ");
                HttpEntity entity = response.getEntity(); // returned body entity
                return EntityUtils.toString(entity, "utf-8");
            } else {
                logger.info("page error by [" + proxy.getHostName() + "]," + url + " code:" + statusCode);
            }
        } catch (Exception e) {
            logger.info("page error by [" + proxy.getHostName() + "]," + url + " code:500 ", e);
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpClient != null) {
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Splits a list into consecutive packets of at most iPacket elements.
     * <p>
     * NOTE(review): the returned packets are subList VIEWS backed by the input
     * list — mutating the input after splitting affects (or invalidates) them.
     *
     * @param list    source list
     * @param iPacket packet size
     * @return packets covering the whole input in order
     */
    public static List<List<Request>> AverageList(List<Request> list, int iPacket) {
        int iCount = list.size() / iPacket;
        int iRemit = list.size() % iPacket;
        List<List<Request>> Average = new ArrayList<>();
        for (int i = 0; i < iCount; i++) {
            Average.add(list.subList(i * iPacket, (i + 1) * iPacket));
        }
        if (iRemit > 0) {
            // Trailing packet holding the remainder (< iPacket elements).
            Average.add(list.subList(iCount * iPacket, (iCount * iPacket + iRemit)));
        }
        return Average;
    }

    /**
     * Fetches a proxy IP from the "shenlong" provider (one IP, valid ~3 minutes).
     * Kept disabled; NOTE(review): the commented URL embeds an API key/sign —
     * move such credentials to configuration before re-enabling.
     */
    // public static MyProxy getShenlongIp(){
    //     MyProxy proxy=null;
    //     String url="http://api.shenlongip.com/ip?key=jy0tr2q0&pattern=json&count=1&need=1100&protocol=1&sign=d5654a620e9b424ca87fe1802f4c9e88";
    //     String result=HttpUtil.get(url);
    //     logger.info("调用代理IP接口返回结果:"+result);
    //     if (result!=null && result.length()>0){
    //         JSONObject re=JSONObject.parseObject(result);
    //         if (200== re.getInteger("code")){
    //             JSONArray datas= re.getJSONArray("data");
    //             if (datas.size()>0){
    //                 for (Object data:datas){
    //                     try {
    //                         JSONObject o=(JSONObject) data;
    //                         long expire= DateUtil.convertDate(o.getString("expire"),"yyyy-MM-dd HH:mm:ss").getTime();
    //                         proxy=new MyProxy(o.getString("ip"),o.getInteger("port"),"hys_81310170_41c8","12345678",expire);
    //                     }catch (Exception e) {
    //                         e.printStackTrace();
    //                     }
    //                 }
    //             }
    //         }else {
    //             logger.info("获取代理IP出错,返回信息:"+result);
    //         }
    //     }
    //     return proxy;
    // }

    /** Ad-hoc manual test: fetches one article URL through the first known proxy. */
    public static void main(String[] args) {
        String url = "http://baijiahao.baidu.com/s?id=1662021416338923958&wfr=spider&for=pc;";
        Proxy proxy = KafkaConsumers.getProxy().get(0);
        System.out.println(httpGet(url, proxy));
    }

    // 3 175.10.141.61-40013-hys_81310170_41c8-12345678
    // 1 125.119.174.226-40032-hys_81310170_41c8-12345678
    // 2 122.230.139.103-40004-hys_81310170_41c8-12345678
    // 4 183.165.248.64-40007-hys_81310170_41c8-12345678
}
package com.zzsn.webMagic;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.index.Constants;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import java.util.ArrayList;
import java.util.List;
/**
 * Worker thread that drains Baidu article links from the shared
 * {@code BaiduTask.baiduUrlQueue}, de-duplicates them against Redis, and
 * submits them to WebMagic for content extraction in batches of 5.
 */
public class BaiduContentThread extends Thread {
    private final static Logger logger = LoggerFactory.getLogger(BaiduContentThread.class);

    @Override
    public void run() {
        List<Request> requestsList = new ArrayList<>();
        Request request = null;
        while (true) {
            try {
                // take() blocks until an item is available and never returns null,
                // so no null check is needed (the original's was dead code).
                ClbAnsProcessitem entity = (ClbAnsProcessitem) BaiduTask.baiduUrlQueue.take();
                logger.info("等到内容连接来了,开始处理列表连接。。。。。");
                // Skip URLs already recorded for this org in the Redis dedup set.
                boolean sismember = JedisUtil.sismember("baidu_web_test5::" + entity.getOrgId(), entity.getSourceAddress());
                if (!sismember) {
                    request = new Request();
                    request.setUrl(entity.getSourceAddress());
                    // Carry the list-item metadata to the content processor.
                    request.putExtra("model", JSONObject.toJSONString(entity));
                    requestsList.add(request);
                } else {
                    BaiduTask.doublep++;
                    logger.info("连接:" + entity.getSourceAddress() + "已存在。");
                }
                // Submit only full batches of 5.
                // NOTE(review): a trailing batch of fewer than 5 links is never
                // flushed — it waits until more links arrive.
                if (requestsList.size() < 5) {
                    continue;
                }
                // Clear the JVM DNS cache so each batch re-resolves hostnames.
                java.security.Security.setProperty("networkaddress.cache.ttl", "0");
                java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
                if ("1".equals(Constants.PROXY)) {
                    // BUG FIX: the original built `Proxy p = new Proxy("",3232)` and
                    // guarded `p != null`, which can never be false; the dead
                    // sleep-and-retry branch was removed.
                    logger.info("获取资讯内容url,启动代理ip,进行下一步内容处理");
                    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                    httpClientDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
                    Spider.create(new BaiduTxtProcessor())
                            .setDownloader(httpClientDownloader)
                            .startRequest(requestsList)
                            .thread(1)
                            .runAsync();
                } else {
                    logger.info("获取资讯内容url,组装好了http请求信息,提交到WebMgic..................");
                    Spider.create(new BaiduTxtProcessor())
                            .startRequest(requestsList)
                            .thread(1)
                            .runAsync();
                }
                // Start a fresh batch; the submitted list is still owned by the spider.
                requestsList = new ArrayList<>();
                try {
                    Thread.sleep(1000 * 5); // throttle: 5s pause between batches
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // Restore the interrupt flag so the interruption is observable.
                    Thread.currentThread().interrupt();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 根据百度关键字搜索返回列表结果处理
*/
public class BaiduKeyWordProcessor implements PageProcessor {
private final static Logger log = LoggerFactory.getLogger(BaiduKeyWordProcessor.class);
private ClbAnsProcessitem processitem=null;
private List<ClbAnsProcessitem> processitemList=null;
private Site site;
@Override
public void process(Page page) {
    log.info("监听到百度关键字搜索,返回结果");
    // Each organic news hit on a Baidu result page is a div with tpl='news-normal'.
    // NOTE(review): the id predicate is unquoted ([@id=content_left]); standard
    // XPath requires [@id='content_left'] — confirm WebMagic/Xsoup tolerates this.
    List<String> links = page.getHtml().xpath("//div[@id=content_left]/div[@tpl='news-normal']").all();
    processitem = new ClbAnsProcessitem();
    processitemList = new ArrayList<>(); // NOTE(review): field is never read; reset to null below
    Request request = page.getRequest();
    log.info("获取到列表Url数:" + (links.size()));
    for (String s : links) {
        processitem = new ClbAnsProcessitem();
        // Parse out title, publish time and source from each hit fragment.
        String link = "关键字列表";
        try {
            Html html = Html.create(s);
            link = html.xpath("//div/div/h3").links().toString();
            // Skip links unusable for text extraction (PDFs, downloads, blanks).
            if (link == null || link.contains(".pdf") || link.trim().length() == 0 || link.contains(".PDF") || link.contains("download")) {
                BaiduTask.invalidUrl++;
                continue;
            }
            Elements title = html.getDocument().select("a[data-click]");
            Elements tiem = html.getDocument().select("span.c-color-gray2");
            Elements org = html.getDocument().select("span.c-color-gray");
            // BUG FIX: the original called title.get(0) before any size check, so an
            // empty selection threw IndexOutOfBoundsException and dropped the whole
            // item. Check the size first, consistent with the time/source fields.
            if (title != null && title.size() > 0) {
                processitem.setTitle(title.get(0).text());
            }
            if (tiem != null && tiem.size() > 0) {
                processitem.setPublishDate(DateUtil.getPublishDate(tiem.get(0).text()));
            }
            if (org != null && org.size() > 0) {
                processitem.setSourceSite(org.get(0).text());
            }
            processitem.setSourceAddress(link);
            processitem.setSid(request.getExtra("sid").toString());
            processitem.setOrgId(request.getExtra("tid").toString());
            processitem.setTid(request.getExtra("tid").toString());
            processitem.setKeyWords(request.getExtra("words").toString());
            // Hand the assembled item to the global queue for content extraction;
            // put() blocks when the queue is full.
            BaiduTask.baiduUrlQueue.put(processitem);
            BaiduTask.linksUrl++;
        } catch (Exception e) {
            log.error("解析列表Url[" + page.getRequest().getUrl() + "]下的" + link + "+出错,错误信息:", e);
        }
    }
    processitemList = null;
}
@Override
public Site getSite() {
if (site==null){
site=Site.me().setSleepTime(3000); //停3秒
site.addHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
site.setCycleRetryTimes(3);//设置循环次数
site.setTimeOut(8000);
}
return site;
}
// public static void main(String[] args) {
// List<Request> list=new ArrayList<>();
// Request request=new Request();
// Map<String,Object>map=new HashMap<>();
// map.put("aa","这是测试传参");
// request.setExtras(map);
// String url="https://sichuan.scol.com.cn/ggxw/202209/58604583.html";
// String url1="https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%E5%9F%83%E5%A1%9E%E4%BF%84%E6%AF%94%E4%BA%9A%2B%E5%AE%A2%E6%9C%BA%2B%E9%81%87%E9%9A%BE&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=0";
// request.setUrl(url);
// list.add(request);
//// Request r2=new Request();
//// r2.setExtras(map);
//// r2.setUrl(url1);
//// list.add(r2);
// // Proxy proxy=new Proxy("113.243.178.169",40016,"hys_81310170_41c8","12345678");
// // HttpClientDownloader httpClientDownloader=new HttpClientDownloader();
// // httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(proxy));
// Spider.create(new BaiduKeyWordProcessor()).thread(1)
// .startRequest(list)
// // .setDownloader(new SeleniumDownloader("E:\\chromdriver\\chromedriver.exe"))
// // .setDownloader(httpClientDownloader)
// // .addUrl(url)
// .runAsync();
//
// }
}
package com.zzsn.webMagic;
import com.alibaba.fastjson.JSONObject;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.job.KafkaConsumerTask;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SplitKeyword;
import com.zzsn.utility.index.Constants;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
/**
 * Baidu keyword-search crawl task: on a fixed schedule it pulls keyword
 * groups from Kafka, expands each group into individual search keywords
 * and pushes them onto {@link #keyWordsQueue} for {@code LinksReadThread}.
 */
@Component
@EnableScheduling
public class BaiduTask {
    private final static Logger logger = LoggerFactory.getLogger(BaiduTask.class);
    // FIFO queue of parsed list items waiting for the content-fetch stage.
    public static LinkedBlockingQueue<ClbAnsProcessitem> baiduUrlQueue = new LinkedBlockingQueue();
    /**
     * Queue of expanded keywords waiting to be turned into search requests.
     */
    public static LinkedBlockingQueue<KeywordMsg> keyWordsQueue = new LinkedBlockingQueue<>();
    // NOTE(review): unused here — LinksReadThread builds its own URL; confirm before removing.
    private final String BAIDU_KEY_URL = "https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%s&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=%d";
    /** Number of keyword groups consumed from Kafka. */
    static int keyWordsArrey = 0;
    /** Number of individual keywords produced. */
    static int keyWordsSigin = 0;
    /** Valid Baidu result links produced. */
    static int linksUrl = 0;
    /** Invalid (skipped) Baidu links. */
    static int invalidUrl = 0;
    /** Duplicate links. */
    static int doublep = 0;
    /** Items published to Kafka. */
    static int toKafka = 0;

    /**
     * Scheduled entry point (first run after 40s, then every 10 minutes).
     * Consumes up to {@code Constants.KAFKA_COUNT} polls of keyword-group
     * messages, splits each group via {@code SplitKeyword.transForm} and
     * enqueues one {@link KeywordMsg} per resulting keyword.
     */
    @Scheduled(initialDelay = 40000, fixedRate = 1000 * 60 * 10)
    public void getFormKeyWordsArry() {
        logger.info("10分钟跑一次。。。。。");
        KafkaConsumer<String, String> consumer = KafkaConsumers.createConsumer();
        try {
            // Pin the consumer to the configured partitions of the keyword topic.
            ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
            String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
            String[] partitions = kafkaConsumerPartition.split(",");
            for (String partition : partitions) {
                topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partition)));
            }
            consumer.assign(topicPartitions);
            for (int i = 0; i < Constants.KAFKA_COUNT; i++) {
                ConsumerRecords<String, String> records = consumer.poll(100);
                consumer.commitSync();
                logger.info("百度关键字搜索消费消息开始。。。。。。。。。。。。");
                for (ConsumerRecord<String, String> consumerRecord : records) {
                    try {
                        KeywordMsg keywordMsg = new Gson().fromJson(consumerRecord.value(), KeywordMsg.class);
                        // Expand the keyword group into individual keywords; the next
                        // stage fetches result pages for each one (through proxies).
                        if (keywordMsg != null) {
                            keyWordsArrey++;
                            List<String> keyword = SplitKeyword.transForm(keywordMsg.getKeyWord());
                            if (keyword != null && keyword.size() > 0) {
                                for (String keys : keyword) {
                                    keyWordsSigin++;
                                    KeywordMsg nextKeyWords = new KeywordMsg();
                                    nextKeyWords.setId(keywordMsg.getId());
                                    nextKeyWords.setCrawlerType(keywordMsg.getCrawlerType());
                                    nextKeyWords.setKeyWord(keys);
                                    nextKeyWords.setStatus(keywordMsg.getStatus());
                                    nextKeyWords.setExclusionWord(keywordMsg.getExclusionWord());
                                    nextKeyWords.setSubjectId(keywordMsg.getSubjectId());
                                    nextKeyWords.setWordsCode(keywordMsg.getWordsCode());
                                    nextKeyWords.setWordsName(keywordMsg.getWordsName());
                                    keyWordsQueue.put(nextKeyWords);
                                }
                            }
                        } else {
                            logger.error("百度搜索关键字,未解析到关键字");
                        }
                    } catch (Exception e) {
                        logger.error("从kafka获取关键字出错,错误信息:", e);
                    }
                }
            }
        } finally {
            // Fix: a consumer was created on every scheduled run but never closed,
            // leaking sockets and broker group members.
            consumer.close();
        }
    }
}
package com.zzsn.webMagic;
import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.StandardWebExtractorHandler;
import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.util.RequestUtil;
import com.zzsn.utility.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.ibatis.annotations.Param;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Article-content processor: downloads each queued result URL, extracts the
 * article body (domain template first, generic extraction rules as fallback),
 * rewrites image paths, and publishes finished items to Kafka plus a Redis
 * dedup set.
 */
public class BaiduTxtProcessor implements PageProcessor {
    private final static Logger log = LoggerFactory.getLogger(BaiduTxtProcessor.class);
    private Site site;
    // Kafka producer resolved from the Spring context (this class is
    // instantiated by WebMagic, not by Spring, so no @Autowired).
    public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
    private ClbAnsProcessitem processitem=null;
    // NOTE(review): assigned but never read in this class — candidate for removal.
    private List<ClbAnsProcessitem> processitemList=null;

    /**
     * Parses one downloaded article page. The list stage serialized the
     * partially-filled item into the request's "model" extra; this method
     * fills in the body, summary, author and publish date, then sends the
     * item to Kafka when a non-empty body was extracted.
     */
    @Override
    public void process(Page page) {
        log.info("监听到进入内容解析中。。。。。。。。。。。。。");
        Request request=page.getRequest();
        processitem=new ClbAnsProcessitem();
        processitemList=new ArrayList<>();
        // Rehydrate the item assembled by the list stage.
        processitem=JSONObject.parseObject(request.getExtra("model").toString(),ClbAnsProcessitem.class);
        processitem.setSource("3");
        DocInfo docInfo = new DocInfo();
        try {
            String link=page.getUrl().toString(); // URL of the current page
            String infodata=page.getHtml().toString(); // raw page HTML
            // Detect the page's character encoding.
            // NOTE(review): contentCharset is computed but never used below — confirm removal.
            String contentCharset = Utility.getWebEncodingByStr(infodata);
            // toutiao.com pages often come back (near-)empty; refetch via a dedicated helper.
            if(link.contains("toutiao.com") &&(null == infodata || infodata.length() < 50)){
                infodata = RequestUtil.getTaotiaoData(link );
            }
            // Legacy qq.com URLs are rewritten to the new.qq.com form before template lookup.
            if(link.contains("qq.com") && !link.contains("://new.qq.com")){
                link= KafkaConsumers.transqqURl(link);
            }
            // Check whether a parsing template exists for this domain (cached in Memcached).
            String domainurl = new URL(link).getHost();
            Object siteTempObj = MemcachedUtils.get("domainUri_"+domainurl);
            SiteTemplate siteTemplate=new SiteTemplate();
            if (siteTempObj != null && !"null".equals(siteTempObj)) {
                com.zzsn.entity.Site site=(com.zzsn.entity.Site)siteTempObj;
                siteTemplate.setMatchTitle(site.getMatchTitle());
                siteTemplate.setMatchAuthor(site.getMatchAuthor());
                siteTemplate.setMatchContent(site.getMatchContent());
                siteTemplate.setMatchOrigin(site.getMatchOrigin());
                siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
                siteTemplate.setMatchSummary(site.getMatchSummary());
                docInfo= SourceTemplateByTag.doPaserByTag(infodata, docInfo, siteTemplate);
            }
            if(null!=docInfo.getContentWithTag()) {
                System.out.println("使用模板解析内容成功"+domainurl);
                log.info("使用模板解析内容成功"+domainurl);
            }
            // Fall back to the generic content-extraction rules when the template produced nothing.
            if(null==docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
                new StandardWebExtractorHandler().doHandler(infodata, docInfo);
            }
            // Replace relative image paths with absolute (saved) ones.
            if (docInfo.getTitle() != null
                    && docInfo.getTitle().trim().length() > 0
                    && docInfo.getContentNoTag() != null
                    && docInfo.getContentNoTag().trim().length() > 0){
                ContentFileResult contentFileResult = new ContentFileResult();
                contentFileResult =KafkaConsumers.getContentFile(docInfo.getContentWithTag(),docInfo.getSourceaddress());
                docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
            }
            // Prefer the publish date extracted from the page over the list-stage one.
            if (StringUtils.isNotBlank(docInfo.getPublishDate())){
                processitem.setPublishDate(docInfo.getPublishDate());
            }
            // Only items with a non-empty extracted body proceed to Kafka.
            if (StringUtils.isNotBlank(docInfo.getContentNoTag())){
                processitem.setContent(docInfo.getContentNoTag());
                processitem.setContentWithtag(docInfo.getContentWithTag());
                processitem.setSummary(docInfo.getSummary());
                processitem.setAuthor(docInfo.getAuthor());
                processitem.setCreateDate(DateUtil.now());
                String msg=new ObjectMapper().writeValueAsString(processitem);
                // Publish the finished item.
                kafkaTemplate.send(Constants.testBaidu, msg);
                BaiduTask.toKafka++;
                log.info("加入到kfaka...........");
                // Record the URL in the cache pool (dedup set).
                // JedisUtil.sadd("baidu::"+processitem.getOrgId(), processitem.getSourceAddress());
                // NOTE(review): key name suggests a temporary test key — confirm before release.
                JedisUtil.sadd("baidu_web_test5::"+processitem.getOrgId(), processitem.getSourceAddress());
            }else {
                log.info("没有获取资讯内容,连接:"+link);
            }
        }catch (Exception e){
            log.error("解析内容逻辑出错,错误信息:",e);
        }
        log.info("程序处理:关键词组总数:"+BaiduTask.keyWordsArrey+";关键词总数:"+BaiduTask.keyWordsSigin+";有效URL连接数:"+BaiduTask.linksUrl+";无效URL连接数:"+BaiduTask.invalidUrl+";重复连接个数:"+BaiduTask.doublep+";入kafak数量:"+BaiduTask.toKafka);
    }

    /**
     * Lazily builds the shared crawl configuration: 4s delay, browser-like
     * headers with Connection: close, 8s timeout, 3 retries. Spiders using
     * this processor run single-threaded, so lazy init needs no locking.
     */
    @Override
    public Site getSite() {
        if (site==null){
            site=Site.me().setSleepTime(4000);
            site.addHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            site.addHeader(HttpHeaders.CONNECTION, "close");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
            site.setTimeOut(8000);
            site.setCycleRetryTimes(3);
        }
        return site;
    }
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import java.util.List;
/**
 * Ad-hoc demo processor: downloads a single page through Selenium (using a
 * proxy fetched from the database) and dumps its HTML to stdout. Intended
 * for manual experimentation only.
 */
public class Expaler implements PageProcessor {
    Site site;

    /** Prints the rendered page HTML for manual inspection. */
    @Override
    public void process(Page page) {
        System.out.println(page.getHtml().toString());
    }

    /** Lazily builds a no-delay crawl configuration with browser-like headers. */
    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me().setSleepTime(0);
            site.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
            site.addHeader(HttpHeaders.CONNECTION, "close");
        }
        return site;
    }

    public static void main(String[] args) {
        // Removed the unused hard-coded Proxy (with embedded credentials) and the
        // unused HttpClientDownloader whose setup performed a needless DB lookup.
        String url = "https://www.163.com/dy/article/GTOJV0AF0551M1ZM.html";
        // NOTE(review): machine-specific chromedriver path — adjust before running.
        SeleniumDownloader seleniumDownloader = new SeleniumDownloader("E:\\chromdriver\\chromedriver.exe");
        seleniumDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
        Spider.create(new Expaler()).thread(1)
                .setDownloader(seleniumDownloader)
                .addUrl(url)
                .runAsync();
    }
}
package com.zzsn.webMagic;
import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.model.FileTag;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import us.codecraft.webmagic.proxy.Proxy;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Kafka consumer factory plus assorted crawl helpers: proxy lookup from
 * Oracle, legacy qq.com URL normalization, and content-image rewriting.
 */
public class KafkaConsumers {

    // Compiled once: avoids re-compiling the regex on every getNumbers call.
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Builds a manually-committed String/String Kafka consumer from the
     * settings in {@link Constants}. Callers are responsible for closing it.
     */
    public static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        // Offsets are committed explicitly by the caller (commitSync).
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        // Where to start reading when no committed offset exists (latest / earliest).
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // Allow up to 1h between polls before the broker evicts this consumer.
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }

    /**
     * Loads proxy definitions from Oracle: one random row (id 1..4) of
     * CIS_sys_Proxy, where each cell is stored as "host-port-user-password".
     *
     * @return the parsed proxies, or null when the lookup returned no rows
     *         or failed (callers must handle null)
     */
    public static List<Proxy> getProxy() {
        List<Proxy> proxyList = null;
        String searchSql = "select proxy from CIS_sys_Proxy where id=" + (RandomUtil.randomInt(4) + 1);
        OracleDBManager dm = new OracleDBManager();
        try {
            OracleDataTable dt = dm.getResultData(null, null, searchSql);
            if (dt != null && dt.getRowCount() > 0) {
                proxyList = new ArrayList<>();
                for (int i = 0; i < dt.getRowCount(); i++) {
                    for (int j = 0; j < dt.getColCoun(); j++) {
                        String cell = dt.getRow()[i][j];
                        // Anything shorter than 6 chars cannot be a host-port-user-pass tuple.
                        if (cell.length() > 5) {
                            String[] ps = cell.split("-");
                            proxyList.add(new Proxy(ps[0], Integer.parseInt(ps[1]), ps[2], ps[3]));
                        }
                    }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return proxyList;
    }

    /**
     * Rewrites a legacy qq.com article URL into the new.qq.com/omn form.
     * The date segment is taken from the leading digits of the article id
     * (the last path element of the original URL).
     */
    public static String transqqURl(String oldurl) {
        String patt = "https://new.qq.com/omn/[date]/[pamars].html";
        String b1 = oldurl.substring(oldurl.lastIndexOf("/") + 1);
        String b2 = getNumbers(b1);
        return patt.replace("[date]", b2).replace("[pamars]", b1);
    }

    /**
     * Returns the first run of digits in {@code content}, or "" when the
     * string contains none.
     */
    public static String getNumbers(String content) {
        Matcher matcher = DIGITS.matcher(content);
        // A single find() suffices; the old while-loop returned on its first iteration anyway.
        if (matcher.find()) {
            return matcher.group(0);
        }
        return "";
    }

    /**
     * Replaces every crawled image URL inside {@code contentWithTag} with its
     * saved location and collects a save-path -> tag mapping.
     *
     * @param contentWithTag article HTML containing original image URLs
     * @param sourceaddress  page URL, used to resolve relative image paths
     * @return result holding the original HTML, the rewritten HTML and the file map
     * @throws Exception propagated from the image-tag scan
     */
    public static ContentFileResult getContentFile(String contentWithTag, String sourceaddress) throws Exception {
        String contentImgCvtTag = contentWithTag;
        String formatImgContent = contentWithTag;
        Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(contentWithTag, sourceaddress);
        // key: crawled image URL, value: descriptor holding the saved path
        Map<String, FileTag> imgMap = new HashMap<String, FileTag>();
        for (Map.Entry<String, FileTag> entry : imgDataMap.entrySet()) {
            String key = entry.getKey();
            FileTag fileTag = entry.getValue();
            while (contentImgCvtTag.contains(key)) {
                // Saved tags are prefixed with the IMG_SERVER path.
                contentImgCvtTag = contentImgCvtTag.replace(key, fileTag.getSaveTag());
            }
            imgMap.put(fileTag.getAbsolutePath(), fileTag);
        }
        ContentFileResult cis = new ContentFileResult();
        cis.setContentAbsoulute(formatImgContent);
        cis.setContentImgCvtTag(contentImgCvtTag);
        cis.setFileMap(imgMap);
        return cis;
    }
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
/**
 * Background thread that consumes keywords from {@link BaiduTask#keyWordsQueue},
 * builds Baidu news-search requests ({@code Constants.PAGESIZE} pages per
 * keyword, 10 results per page) and submits them to a WebMagic spider —
 * through a proxy when {@code Constants.PROXY} is "1".
 */
public class LinksReadThread extends Thread {
    private final static Logger logger = LoggerFactory.getLogger(LinksReadThread.class);
    private final String BAIDU_KEY_URL = "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%s&pn=%d";

    @Override
    public void run() {
        List<Request> requestByMcList = new ArrayList<>();
        while (true) {
            try {
                // Blocks until a keyword arrives.
                KeywordMsg keywordMsg = BaiduTask.keyWordsQueue.take();
                logger.info("等到关键字来了。。。。。。");
                // Build one request per result page, carrying the keyword's
                // identity in request extras for the list-parsing stage.
                try {
                    for (int i = 0; i < Constants.PAGESIZE; i++) {
                        Request webReq = new Request();
                        webReq.putExtra("sid", keywordMsg.getId());
                        webReq.putExtra("tid", keywordMsg.getWordsCode());
                        webReq.putExtra("words", keywordMsg.getKeyWord());
                        webReq.setUrl(String.format(BAIDU_KEY_URL, URLEncoder.encode(keywordMsg.getKeyWord(), "UTF-8"), (i * 10)));
                        requestByMcList.add(webReq);
                    }
                } catch (Exception e) {
                    logger.error("根据关键词组生成request对象出错,错误信息:", e);
                }
                // Disable JVM DNS caching so freshly rotated proxy IPs resolve correctly.
                java.security.Security.setProperty("networkaddress.cache.ttl", "0");
                java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
                if (Constants.PROXY.equals("1")) {
                    // NOTE(review): placeholder proxy; real proxies come from KafkaConsumers.getProxy() below.
                    Proxy p = new Proxy("", 32323); //ProxyMap.getProxy();
                    logger.info("获取待处理一批url,获取代理ip请等待进行下步处理");
                    if (p != null) {
                        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                        httpClientDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
                        Spider.create(new BaiduKeyWordProcessor())
                                .setDownloader(httpClientDownloader)
                                .startRequest(requestByMcList)
                                .thread(1)
                                .runAsync();
                    } else {
                        Thread.sleep(1000 * 10);
                        continue;
                    }
                } else {
                    // Without a proxy, submit the batch directly.
                    logger.info("获取待处理url提交到webMgic,提交数:" + requestByMcList.size());
                    Spider.create(new BaiduKeyWordProcessor())
                            .startRequest(requestByMcList)
                            .thread(1)
                            .runAsync();
                }
                requestByMcList = new ArrayList<>();
                Thread.sleep(1000 * 8);
            } catch (InterruptedException e) {
                // Fix: the old code printed the stack trace and kept looping,
                // swallowing the interrupt. Restore the flag and stop the thread.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                logger.error("关键字处理循环出错,错误信息:", e);
            }
        }
    }
}
package com.zzsn.webMagic.downloader;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * Downloader that renders pages through a headless Chrome via Selenium.
 * Requires a chromedriver binary; a fresh browser is launched per request
 * and released afterwards. Chrome only.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:37 PM <br>
 */
public class SeleniumDownloader implements Downloader, Closeable {
    // NOTE(review): pool fields are currently unused (pooling is commented out).
    private volatile WebDriverPool webDriverPool;
    private Logger logger = LoggerFactory.getLogger(getClass());
    private int sleepTime = 0;
    private int poolSize = 1;
    private volatile ChromeDriver webDriver;
    private static final String DRIVER_PHANTOMJS = "phantomjs";
    Dimension targetSize = new Dimension(600, 600); // browser window size
    private ProxyProvider proxyProvider;

    /**
     * Creates a downloader backed by the given chromedriver binary.
     *
     * @param chromeDriverPath path to the chromedriver executable
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver",
                chromeDriverPath);
    }

    /**
     * Constructor without any field. Construct PhantomJS browser
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
    }

    /** Optional proxy source; when set, each download routes through a fetched proxy. */
    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Sets how long to wait after navigation so JS can finish rendering.
     *
     * @param sleepTime milliseconds to sleep after page load
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Launches a headless Chrome, navigates to the request URL, waits
     * {@code sleepTime} ms, and captures the rendered outer HTML.
     * The browser is always quit afterwards, even on failure.
     *
     * @return the rendered page, or null when the browser could not start
     */
    @Override
    public Page download(Request request, Task task) {
        // Quit any driver left over from a previous (possibly failed) download.
        if (webDriver != null) {
            webDriver.quit();
            webDriver = null;
        }
        try {
            ChromeOptions options = new ChromeOptions();
            // Run Chrome headless.
            options.addArguments("--headless");
            options.addArguments("--disable-gpu");
            options.addArguments("--no-sandbox");
            options.addArguments("--disable-dev-shm-usage");
            options.addArguments("--start-maximized");
            // Pages need scrolling to render fully, so use a very tall window.
            options.addArguments("--window-size=1280,4300");
            if (proxyProvider != null) {
                us.codecraft.webmagic.proxy.Proxy p = proxyProvider.getProxy(task);
                String proxyServer = p.getHost() + ":" + p.getPort();
                org.openqa.selenium.Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
                // NOTE(review): hard-coded credentials; password differs from the
                // "12345678" used elsewhere in this module — confirm which is correct.
                proxy.setSocksUsername("hys_81310170_41c8");
                proxy.setSocksPassword("1234567");
                options.setProxy(proxy);
            }
            webDriver = new ChromeDriver(options);
        } catch (Exception e) {
            logger.warn("interrupted", e);
            return null;
        }
        try {
            webDriver.get(request.getUrl());
            try {
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                // Restore the interrupt flag rather than swallowing it.
                Thread.currentThread().interrupt();
            }
            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } finally {
            // Fix: the old code quit only on the success path, leaking the
            // browser process whenever navigation or parsing threw.
            webDriver.quit();
            webDriver = null;
        }
    }

    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    @Override
    public void close() throws IOException {
        // Fix: guard against NPE when close() is called before any download ran.
        if (webDriver != null) {
            webDriver.quit();
            webDriver = null;
        }
    }
}
package com.zzsn.webMagic.downloader;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Pool of Selenium WebDriver instances. Drivers are created lazily (up to
 * {@code capacity}) via {@link #configure()} and recycled through a blocking
 * deque; the driver kind (remote / firefox / chrome / phantomjs) comes from
 * a config file.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:41 PM <br>
 */
class WebDriverPool {
    private Logger logger = LoggerFactory.getLogger(getClass());
    private final static int DEFAULT_CAPACITY = 5;
    // Maximum number of drivers this pool may create.
    private final int capacity;
    private final static int STAT_RUNNING = 1;
    // NOTE(review): typo — "CLODED" should read "CLOSED"; safe to rename (private constant).
    private final static int STAT_CLODED = 2;
    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);
    /*
     * new fields for configuring phantomJS
     */
    // Most recently configured driver; published to the pool in get().
    private WebDriver mDriver = null;
    private boolean mAutoQuitDriver = true;
    // NOTE(review): machine-specific default — confirm -Dselenuim_config overrides it in deployment.
    private static final String DEFAULT_CONFIG_FILE = "E:/chromdriver/config.ini";
    private static final String DRIVER_FIREFOX = "firefox";
    private static final String DRIVER_CHROME = "chrome";
    private static final String DRIVER_PHANTOMJS = "phantomjs";
    protected static Properties sConfig;
    protected static DesiredCapabilities sCaps;
    /**
     * Configure the GhostDriver, and initialize a WebDriver instance. This part
     * of code comes from GhostDriver.
     * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
     *
     * @author bob.li.0718@gmail.com
     * @throws IOException when the config file cannot be read or a required
     *         phantomjs property is missing
     */
    public void configure() throws IOException {
        // Read config file
        sConfig = new Properties();
        String configFile = DEFAULT_CONFIG_FILE;
        if (System.getProperty("selenuim_config")!=null){
            configFile = System.getProperty("selenuim_config");
        }
        sConfig.load(new FileReader(configFile));
        // Prepare capabilities
        sCaps = new DesiredCapabilities();
        sCaps.setJavascriptEnabled(true);
        sCaps.setCapability("takesScreenshot", false);
        String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
        // Fetch PhantomJS-specific configuration parameters
        if (driver.equals(DRIVER_PHANTOMJS)) {
            // "phantomjs_exec_path"
            if (sConfig.getProperty("phantomjs_exec_path") != null) {
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_exec_path"));
            } else {
                throw new IOException(
                        String.format(
                                "Property '%s' not set!",
                                PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
            }
            // "phantomjs_driver_path"
            if (sConfig.getProperty("phantomjs_driver_path") != null) {
                System.out.println("Test will use an external GhostDriver");
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_driver_path"));
            } else {
                System.out
                        .println("Test will use PhantomJS internal GhostDriver");
            }
        }
        // Disable "web-security", enable all possible "ssl-protocols" and
        // "ignore-ssl-errors" for PhantomJSDriver
        // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new
        // String[] {
        // "--web-security=false",
        // "--ssl-protocol=any",
        // "--ignore-ssl-errors=true"
        // });
        ArrayList<String> cliArgsCap = new ArrayList<String>();
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
                cliArgsCap);
        // Control LogLevel for GhostDriver, via CLI arguments
        sCaps.setCapability(
                PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] { "--logLevel="
                        + (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig
                        .getProperty("phantomjs_driver_loglevel")
                        : "INFO") });
        // String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
        // Start appropriate Driver
        if (isUrl(driver)) {
            sCaps.setBrowserName("phantomjs");
            mDriver = new RemoteWebDriver(new URL(driver), sCaps);
        } else if (driver.equals(DRIVER_FIREFOX)) {
            mDriver = new FirefoxDriver(sCaps);
        } else if (driver.equals(DRIVER_CHROME)) {
            ChromeOptions options=new ChromeOptions();
            // Run Chrome headless.
            options.addArguments("--headless");
            options.addArguments("--disable-gpu");
            options.addArguments("--no-sandbox");
            options.addArguments("--disable-dev-shm-usage");
            options.addArguments("--start-maximized");
            // Report pages need scrolling to render fully, so use a very tall window.
            options.addArguments("--window-size=1280,4300");
            // NOTE(review): hard-coded proxy address — confirm it is still valid or move to config.
            String proxyServer = "1.83.251.72:40013";
            Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
            options.setProxy(proxy);
            mDriver = new ChromeDriver(options);
        } else if (driver.equals(DRIVER_PHANTOMJS)) {
            mDriver = new PhantomJSDriver(sCaps);
        }
    }
    /**
     * check whether input is a valid URL
     *
     * @author bob.li.0718@gmail.com
     * @param urlString urlString
     * @return true means yes, otherwise no.
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }
    /**
     * store webDrivers created
     */
    private List<WebDriver> webDriverList = Collections
            .synchronizedList(new ArrayList<WebDriver>());
    /**
     * store webDrivers available
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();
    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }
    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }
    /**
     * Borrows a driver: returns an idle one when available, otherwise creates
     * a new one (up to capacity) via configure(), otherwise blocks until a
     * driver is returned.
     *
     * @return a WebDriver owned by the caller until returnToPool is called
     * @throws InterruptedException if interrupted while waiting for a driver
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver poll = innerQueue.poll();
        if (poll != null) {
            return poll;
        }
        // Double-checked growth: only one thread may add a driver at a time.
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                if (webDriverList.size() < capacity) {
                    // add new WebDriver instance into pool
                    try {
                        configure();
                        innerQueue.add(mDriver);
                        webDriverList.add(mDriver);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    // ChromeDriver e = new ChromeDriver();
                    // WebDriver e = getWebDriver();
                    // innerQueue.add(e);
                    // webDriverList.add(e);
                }
            }
        }
        return innerQueue.take();
    }
    /** Returns a borrowed driver so other callers can reuse it. */
    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }
    /** Throws when the pool has been closed. (CAS(RUNNING, RUNNING) is a pure state check.) */
    protected void checkRunning() {
        if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
            throw new IllegalStateException("Already closed!");
        }
    }
    /**
     * Transitions the pool to closed (exactly once) and quits every driver
     * ever created by this pool.
     */
    public void closeAll() {
        boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED);
        if (!b) {
            throw new IllegalStateException("Already closed!");
        }
        for (WebDriver webDriver : webDriverList) {
            logger.info("Quit webDriver" + webDriver);
            webDriver.quit();
            webDriver = null;
        }
    }
}
...@@ -23,16 +23,16 @@ THREAD_SIZE=1 ...@@ -23,16 +23,16 @@ THREAD_SIZE=1
#地址 #地址
KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#消费主题 #消费主题
#KAFKA_CONSUMER_TOPIC=keyWordsInfo KAFKA_CONSUMER_TOPIC=keyWordsInfo
#测试主题 #测试主题
KAFKA_CONSUMER_TOPIC=baiduTest #KAFKA_CONSUMER_TOPIC=baiduTest
#消费者 #消费者
#KAFKA_CONSUMER_GROUP_ID=baidu-web-test KAFKA_CONSUMER_GROUP_ID=baidu2
#测试消费者 #测试消费者
KAFKA_CONSUMER_GROUP_ID=baidu-wemagic #KAFKA_CONSUMER_GROUP_ID=baidu-wemagic
#发送消息 #发送消息
KAFKA_test_TOPIC=baidu-bind2-test #KAFKA_test_TOPIC=baidu-bind-test
#KAFKA_test_TOPIC=baidu-new-test KAFKA_test_TOPIC=baidu-new-test
...@@ -81,3 +81,6 @@ whiles=10 ...@@ -81,3 +81,6 @@ whiles=10
pageSize=5 pageSize=5
#平分因子 #平分因子
averger=2000 averger=2000
#
crawler_server=baidu-1
\ No newline at end of file
...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
siteMsgTemple.setVerifyType("1"); // siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
......
package com.zzsn.crawler; package com.zzsn.crawler;
import cn.hutool.core.date.DateTime; import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.paser.*; import com.zzsn.crawler.paser.*;
import com.zzsn.crawler.uriparser.HisURIConfig; import com.zzsn.crawler.uriparser.HisURIConfig;
import com.zzsn.crawler.uriparser.HisURIParser; import com.zzsn.crawler.uriparser.HisURIParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -27,6 +29,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -27,6 +29,7 @@ public class DynaminSiteThread implements Runnable{
public PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple(); public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override @Override
...@@ -37,6 +40,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -37,6 +40,7 @@ public class DynaminSiteThread implements Runnable{
// @Async("asyncexecutorService") // @Async("asyncexecutorService")
public void crawler(){ public void crawler(){
sentBadSiteMsg(siteMsgTemple,Constants.MODEL_SCORE_URL,Constants.KAFKA_CONSUMER_PARTITION);
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
log.info("获取urlList: "+urlList.size()); log.info("获取urlList: "+urlList.size());
...@@ -58,21 +62,19 @@ public class DynaminSiteThread implements Runnable{ ...@@ -58,21 +62,19 @@ public class DynaminSiteThread implements Runnable{
// String charset = paserSiteDownload.getCharSet(urlList.get(0)); // String charset = paserSiteDownload.getCharSet(urlList.get(0));
// String charset = paserSiteDownload.getCharSet(urlList.get(0)); // String charset = paserSiteDownload.getCharSet(urlList.get(0));
String charset = ""; String charset = "";
try { // try {
charset = paserSiteDownload.locateCharSet(urlList.get(0)); // charset = paserSiteDownload.locateCharSet(urlList.get(0));
} catch (Exception e) { // } catch (Exception e) {
try { // try {
charset = paserSiteDownload.getCharSet(urlList.get(0)); // charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) { // } catch (IOException ex) {
// //
} // }
} // }
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList=new ArrayList<>();
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now()); log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
String infoSourceId=siteMsgTemple.getId();//获取信息源id String infoSourceId=siteMsgTemple.getId();//获取信息源id
//默认表达式类型 //默认表达式类型
siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType()); siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
...@@ -104,7 +106,6 @@ public class DynaminSiteThread implements Runnable{ ...@@ -104,7 +106,6 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple);
}else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
...@@ -144,6 +145,25 @@ public class DynaminSiteThread implements Runnable{ ...@@ -144,6 +145,25 @@ public class DynaminSiteThread implements Runnable{
} }
} }
/**
 * Publishes a consumption notice for this info source to the
 * "crawler_consumer" Kafka topic so the scheduler can track which crawler
 * instance picked the source up and when.
 *
 * Best-effort telemetry: failures are logged and never propagated, because
 * monitoring must not break the crawl itself.
 *
 * @param siteMsgTemple source whose id / info-source code is reported
 * @param crawlerType   crawler model identifier (e.g. Constants.MODEL_SCORE_URL)
 * @param partition     Kafka partition tag reported with the notice
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String crawlerType,String partition){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setConsumerDate(new Date());
badSiteMsg.setCrawlerType(crawlerType);
badSiteMsg.setPartition(partition);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("crawler_consumer", docjson);
}catch (Exception e){
// Log instead of silently swallowing, so delivery problems are visible.
log.info("sentBadSiteMsg failed: " + e.getMessage());
}
}
public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){ public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem(); ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem();
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
......
...@@ -594,7 +594,7 @@ public class WebContentPaserByJsonXpath { ...@@ -594,7 +594,7 @@ public class WebContentPaserByJsonXpath {
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
......
...@@ -6,10 +6,14 @@ import java.awt.event.KeyEvent; ...@@ -6,10 +6,14 @@ import java.awt.event.KeyEvent;
import java.io.*; import java.io.*;
import java.time.Duration; import java.time.Duration;
import java.time.temporal.ChronoUnit; import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.alibaba.fastjson.JSON;
import com.zzsn.crawler.ReuseWebDriver; import com.zzsn.crawler.ReuseWebDriver;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.DriverUtil; import com.zzsn.util.DriverUtil;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*; import org.openqa.selenium.*;
...@@ -72,25 +76,51 @@ public class SeleniumTime { ...@@ -72,25 +76,51 @@ public class SeleniumTime {
// @Async("asyncTaskExecutorSelenium") // @Async("asyncTaskExecutorSelenium")
public static String getScopehtml(String url) { public static String getScopehtml(String url) {
String html = ""; String html = "";
ReuseWebDriver driver=null;
try { try {
ReuseWebDriver driver = DriverUtil.getChromeDriver();
try { try {
Duration duration=Duration.of(50, ChronoUnit.SECONDS); driver= DriverUtil.getChromeDriver();
}catch (Exception e){
log.info("获取浏览器ReuseWebDriver异常:" + e.getMessage());
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
}
try {
Duration duration=Duration.of(60, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
// Thread.sleep(1000); Thread.sleep(5000);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
} catch (Exception e) { } catch (Exception e) {
driver.quit();
log.info("获取页面内容异常:" + e.getMessage()); log.info("获取页面内容异常:" + e.getMessage());
} }
} catch (Exception e) { } catch (Exception e) {
driver.quit();
// 若驱动Session连接异常,则直接退出驱动并在下次访问得的时候重新打开驱动 // 若驱动Session连接异常,则直接退出驱动并在下次访问得的时候重新打开驱动
log.info("驱动打开URL异常:" + e.getMessage()); log.info("驱动打开URL异常:" + e.getMessage());
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
} }
} catch (Exception e) { } catch (Exception e) {
driver.quit();
try {
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
} catch (Exception ex) {
}
log.info("驱动访问页面出现出现异常:" + e.getMessage()); log.info("驱动访问页面出现出现异常:" + e.getMessage());
} }
return html; return html;
......
...@@ -13,52 +13,52 @@ public class ArticleCrawler { ...@@ -13,52 +13,52 @@ public class ArticleCrawler {
articleCrawler.consumer(); articleCrawler.consumer();
} }
public void consumer(){ public void consumer(){
String record="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1560150270181019650\",\n" + " \"id\": \"1534713917403385858\",\n" +
" \"infoSourceCode\": \"IN-20220818-0011\",\n" + " \"infoSourceCode\": \"IN-20220609-58696\",\n" +
" \"webSiteName\": \"一带一路-项目周报\",\n" + " \"webSiteName\": \"中华人民共和国工业和信息化部\",\n" +
" \"siteName\": \"一带一路-项目周报\",\n" + " \"siteName\": \"中华人民共和国工业和信息化部-领导活动\",\n" +
" \"siteUri\": \"https://www.yidaiyilu.gov.cn/info/iList.jsp?cat_id=11432&cur_page=3\",\n" + " \"siteUri\": \"https://www.miit.gov.cn/xwdt/gxdt/ldhd/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": \"2\",\n" +
" \"language\": null,\n" + " \"language\": \"zh\",\n" +
" \"checkedList\": null,\n" + " \"checkedList\": \"1\",\n" +
" \"hisUriExp\": null,\n" + " \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": \"1\",\n" +
" \"listUrl\": null,\n" + " \"listUrl\": null,\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": \"\",\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": null,\n" +
" \"informationPublishDate\": \"span\",\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[class=\\\"commonList_dot\\\"]>li\",\n" + " \"infoBlockPosition\": \"div[class=\\\"page-content\\\"]>ul>li\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": 2,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"0\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"main_content_title\\\"]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>*.h1[id=\\\"con_title\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"szty\\\"]>span:contains(时间)</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>*.span[id=\\\"con_time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>div[class=\\\"szty\\\"]>span:contains(来源)</exp></origin>\",\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>*.div[id=\\\"con_con\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -67,7 +67,7 @@ public class ArticleCrawler { ...@@ -67,7 +67,7 @@ public class ArticleCrawler {
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" + " \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 1,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
...@@ -83,10 +83,10 @@ public class ArticleCrawler { ...@@ -83,10 +83,10 @@ public class ArticleCrawler {
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"05 23 14 1/7 * ?\",\n" + " \"cron\": \"53 40 0/4 * * ?\",\n" +
" \"ynSnapshot\": \"0\"\n" + " \"ynSnapshot\": null\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread(); ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread();
articleCrawlerThread.siteMsgTemple=siteMsgTemple; articleCrawlerThread.siteMsgTemple=siteMsgTemple;
articleCrawlerThread.crawler(); articleCrawlerThread.crawler();
......
package com.zzsn.download;
import org.htmlcleaner.*;
import org.lobobrowser.html.parser.DocumentBuilderImpl;
import org.lobobrowser.html.parser.InputSourceImpl;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
/**
 * Static helpers that turn raw page text into W3C DOM documents using three
 * different engines: Cobra (browser-like HTML), a strict JAXP XML parser,
 * and HtmlCleaner (lenient HTML repair).
 */
public class HtmlPageParser {

    /**
     * Parses an HTML page with the Cobra engine and returns it as a DOM
     * document. Scripting and external CSS are disabled: crawling only needs
     * the static DOM.
     *
     * @param url      base URL of the page (used to resolve relative links)
     * @param pageData raw HTML text
     */
    public static Document cobraParse(String url, String pageData)
            throws SAXException, IOException {
        StringReader sReader = new StringReader(pageData);
        return cobraParse(url, sReader);
    }

    /** Same as {@link #cobraParse(String, String)} but reads from a Reader. */
    public static Document cobraParse(String url, Reader pageData)
            throws SAXException, IOException {
        SimpleUserAgentContext ucontext = new SimpleUserAgentContext();
        ucontext.setScriptingEnabled(false);
        ucontext.setExternalCSSEnabled(false);
        DocumentBuilderImpl builder = new DocumentBuilderImpl(ucontext);
        return builder.parse(new InputSourceImpl(pageData, url));
    }

    /**
     * Parses a well-formed XML string (e.g. extraction-expression templates
     * like {@code <content><exp>...</exp></content>}) into a DOM document.
     *
     * Encodes the string as UTF-8 explicitly instead of relying on the
     * platform default charset.
     *
     * NOTE(review): if this is ever fed untrusted XML, disable DTDs /
     * external entities on the factory to prevent XXE.
     */
    public static Document xmlGetDocument(String pageBody)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        InputStream is = new ByteArrayInputStream(pageBody.getBytes("UTF-8"));
        return builder.parse(is);
    }

    /**
     * Parses possibly malformed HTML leniently with HtmlCleaner and converts
     * the repaired tree into a DOM document.
     */
    public static Document htmlCleanerParser(String pageBody)
            throws XPatherException, ParserConfigurationException {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode tagNode = cleaner.clean(pageBody);
        return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    }
}
package com.zzsn.download; package com.zzsn.download;
//import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.util.StringUtil;
import org.htmlcleaner.BaseTokenImpl; import org.htmlcleaner.BaseTokenImpl;
import org.htmlcleaner.ContentNode; import org.htmlcleaner.ContentNode;
import org.htmlcleaner.TagNode; import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException; import org.htmlcleaner.XPatherException;
import org.jsoup.Jsoup;
import org.w3c.dom.*; import org.w3c.dom.*;
import javax.xml.xpath.*; import javax.xml.xpath.*;
...@@ -17,17 +13,14 @@ import java.util.List; ...@@ -17,17 +13,14 @@ import java.util.List;
public class PageBuilderParser { public class PageBuilderParser {
public NodeList parserNodeList(Object doc, String path) { public NodeList parserNodeList(Object doc, String path)
NodeList nodeList=null; throws XPathExpressionException
try { {
XPathFactory factory = XPathFactory.newInstance(); XPathFactory factory = XPathFactory.newInstance();
XPath xPath = factory.newXPath(); XPath xPath = factory.newXPath();
XPathExpression expression = xPath.compile(path); XPathExpression expression = xPath.compile(path);
nodeList = (NodeList) expression.evaluate( NodeList nodeList = (NodeList)expression.evaluate(
doc, XPathConstants.NODESET); doc, XPathConstants.NODESET);
}catch (XPathExpressionException e){
e.printStackTrace();
}
return nodeList; return nodeList;
} }
...@@ -70,45 +63,7 @@ public class PageBuilderParser { ...@@ -70,45 +63,7 @@ public class PageBuilderParser {
} }
return object; return object;
} }
/**
 * Evaluates an XPath expression against {@code doc}, trying progressively
 * weaker result types: NODE first, then NODESET, STRING, NUMBER and finally
 * BOOLEAN. The first evaluation that succeeds determines the runtime type
 * of the returned object.
 *
 * @param doc  DOM node/document to evaluate against
 * @param path XPath expression; null/blank returns null
 * @return the result as Node, NodeList, String, Number or Boolean; null when
 *         path is blank or every typed evaluation failed
 * @throws XPathExpressionException if the expression itself does not compile
 */
public Object parserNode(Object doc, String path) throws XPathExpressionException
{
if (path == null || path.trim().length() == 0)
{
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath xPath = factory.newXPath();
XPathExpression expression = xPath.compile(path);
Object object = null;
try {
object = expression.evaluate(doc, XPathConstants.NODE);
return (Node)object;
} catch (XPathExpressionException e) {
// NODE failed: fall back to the next result type in the chain.
try {
object = expression.evaluate(doc, XPathConstants.NODESET);
return (NodeList)object;
} catch (XPathExpressionException e1) {
try {
object = expression.evaluate(doc, XPathConstants.STRING);
return (String)object;
} catch (XPathExpressionException e2) {
try {
object = expression.evaluate(doc, XPathConstants.NUMBER);
return (Number)object;
} catch (XPathExpressionException e3) {
try {
object = expression.evaluate(doc, XPathConstants.BOOLEAN);
return (Boolean)object;
} catch (XPathExpressionException e4) {
// Last fallback failed as well; fall through and return null below.
e4.printStackTrace();
}
}
}
}
}
return object;
}
public String parserStrBr(Object doc, String path) public String parserStrBr(Object doc, String path)
throws XPathExpressionException throws XPathExpressionException
{ {
...@@ -407,14 +362,4 @@ public class PageBuilderParser { ...@@ -407,14 +362,4 @@ public class PageBuilderParser {
return true; return true;
} }
/**
 * Ad-hoc manual smoke test: parses an expression template and prints the
 * text of its {@code <exp>} and {@code <subtraction>} elements.
 */
public static void main(String[] args)throws Exception {
PageBuilderParser pageBuilderParser=new PageBuilderParser();
String aa="<content><exp>*.div[class=\"artText clearfix\"]</exp><subtraction>div[class=\"relateArt\"]</subtraction></content>";
Document document = HtmlPageParser.xmlGetDocument(aa);
String content = pageBuilderParser.parserStr(document, "//exp");
String subtraction = pageBuilderParser.parserStr(document, "//subtraction");
System.out.println(content);
System.out.println(subtraction);
}
} }
...@@ -73,6 +73,7 @@ public class PageGet { ...@@ -73,6 +73,7 @@ public class PageGet {
buffer.append("\r\n"); buffer.append("\r\n");
} }
} catch (Exception e) { } catch (Exception e) {
this.connection.disconnect(); this.connection.disconnect();
} finally { } finally {
try { try {
...@@ -133,6 +134,7 @@ public class PageGet { ...@@ -133,6 +134,7 @@ public class PageGet {
buffer.append("\r\n"); buffer.append("\r\n");
} }
} catch (Exception e) { } catch (Exception e) {
} finally { } finally {
try { try {
if (in != null) { if (in != null) {
...@@ -146,8 +148,8 @@ public class PageGet { ...@@ -146,8 +148,8 @@ public class PageGet {
this.pageStr = buffer.toString(); this.pageStr = buffer.toString();
} }
private static final int MAX_SOKET_TIMEOUT =3000; private static final int MAX_SOKET_TIMEOUT =30000;
private static final int MAX_CONNECTION_TIMEOUT = 3000; private static final int MAX_CONNECTION_TIMEOUT = 30000;
private static final RequestConfig REQCONFIG = RequestConfig.custom() private static final RequestConfig REQCONFIG = RequestConfig.custom()
.setSocketTimeout(MAX_SOKET_TIMEOUT).setConnectTimeout(MAX_CONNECTION_TIMEOUT) .build(); .setSocketTimeout(MAX_SOKET_TIMEOUT).setConnectTimeout(MAX_CONNECTION_TIMEOUT) .build();
public void httpClientGet() { public void httpClientGet() {
...@@ -200,9 +202,9 @@ public class PageGet { ...@@ -200,9 +202,9 @@ public class PageGet {
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
// webClient.getOptions().setTimeout(500000); // webClient.getOptions().setTimeout(500000);
ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port); ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
if (false) { // if (pluginUtil.isNeedProxy()) {
webClient.getOptions().setProxyConfig(proxyConfig); // webClient.getOptions().setProxyConfig(proxyConfig);
} // }
try { try {
Page page = webClient.getPage(this.url); Page page = webClient.getPage(this.url);
if (page instanceof HtmlPage) { if (page instanceof HtmlPage) {
...@@ -210,10 +212,6 @@ public class PageGet { ...@@ -210,10 +212,6 @@ public class PageGet {
// webClient.waitForBackgroundJavaScript(600000); // webClient.waitForBackgroundJavaScript(600000);
this.setPageStr(htmlPage.asXml()); this.setPageStr(htmlPage.asXml());
} }
// else if (page instanceof JavaScriptPage) {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// this.setPageStr(scriptPage.getContent());
// }
} catch (Exception e) { } catch (Exception e) {
} }
......
package com.zzsn.download; package com.zzsn.download;
import com.gargoylesoftware.htmlunit.*; import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
...@@ -179,10 +182,6 @@ public class PagePost { ...@@ -179,10 +182,6 @@ public class PagePost {
// webClient.Headers.Add("ContentLength", param.Length.ToString()); // webClient.Headers.Add("ContentLength", param.Length.ToString());
// byte[] responseData = webClient.UploadData("https://api.weibo.com/oauth2/access_token", "POST", bytes); // byte[] responseData = webClient.UploadData("https://api.weibo.com/oauth2/access_token", "POST", bytes);
ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
if (false) {
webClient.getOptions().setProxyConfig(proxyConfig);
}
try { try {
Page page = webClient.getPage(this.url); Page page = webClient.getPage(this.url);
if (page instanceof HtmlPage) { if (page instanceof HtmlPage) {
...@@ -190,10 +189,6 @@ public class PagePost { ...@@ -190,10 +189,6 @@ public class PagePost {
// webClient.waitForBackgroundJavaScript(600000); // webClient.waitForBackgroundJavaScript(600000);
this.setPageStr(htmlPage.asXml()); this.setPageStr(htmlPage.asXml());
} }
// else if (page instanceof JavaScriptPage) {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// this.setPageStr(scriptPage.getContent());
// }
} catch (Exception e) { } catch (Exception e) {
} }
......
...@@ -26,10 +26,18 @@ import org.slf4j.LoggerFactory; ...@@ -26,10 +26,18 @@ import org.slf4j.LoggerFactory;
public class RequestUtil { public class RequestUtil {
private static volatile RequestUtil instance=null; // volatile: safe publication of the singleton across threads
private RequestUtil(){} // private: prevents instantiation from outside the class
/**
 * Lazily creates and returns the singleton instance. The method-level
 * synchronization guarantees only one instance is ever constructed even
 * under concurrent first calls.
 */
public static synchronized RequestUtil getInstance()
{
// Lazy init guarded by the method's monitor.
if(instance==null)
{
instance=new RequestUtil();
}
return instance;
}
private static Logger log = LoggerFactory.getLogger(RequestUtil.class); private static Logger log = LoggerFactory.getLogger(RequestUtil.class);
public static String getTaotiaoData(String url) throws Exception { public static String getTaotiaoData(String url) throws Exception {
HttpClientBuilder builder = HttpClients.custom(); HttpClientBuilder builder = HttpClients.custom();
//对照UA字串的标准格式理解一下每部分的意思 //对照UA字串的标准格式理解一下每部分的意思
...@@ -215,8 +223,9 @@ public class RequestUtil { ...@@ -215,8 +223,9 @@ public class RequestUtil {
} }
return jsonObject; return jsonObject;
} }
public static String httpGetRequest(String url) throws Exception {
String result = null; public String httpGetRequest(String url) throws Exception {
String result = "";
HttpClientBuilder builder = HttpClients.custom(); HttpClientBuilder builder = HttpClients.custom();
//对照UA字串的标准格式理解一下每部分的意思 //对照UA字串的标准格式理解一下每部分的意思
builder.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)"); builder.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)");
...@@ -232,7 +241,7 @@ public class RequestUtil { ...@@ -232,7 +241,7 @@ public class RequestUtil {
} }
EntityUtils.consume(resEntity); EntityUtils.consume(resEntity);
} catch (IOException e) { } catch (IOException e) {
// return get(url, tts++); result="";
} finally { } finally {
response.close(); response.close();
httpClient.close(); httpClient.close();
......
package com.zzsn.download;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class StringUtil {
/**
 * Maps the flag strings "0"/"1" to false/true; anything else — including
 * null — falls back to the supplied default.
 *
 * @param s flag text, may be null
 * @param b default returned when s is neither "0" nor "1"
 */
public static boolean convertBoolean(String s, boolean b)
{
    // "0".equals(null) is false, so a null input falls through to the default.
    if ("0".equals(s))
    {
        return false;
    }
    if ("1".equals(s))
    {
        return true;
    }
    return b;
}
/** Encodes a boolean as the flag string "1" (true) or "0" (false). */
public static String convertBooleanToString(boolean b)
{
    return b ? "1" : "0";
}
/**
 * Normalizes every whitespace character to a plain space via
 * {@link #replaceBlank(String)}, then trims leading/trailing spaces.
 *
 * @param str input text; must not be null
 */
public static String trimWhiteSpace(String str)
{
    return replaceBlank(str).trim();
}
/**
 * Replaces every whitespace character (per {@link Character#isWhitespace},
 * i.e. tabs, CR, LF, etc.) with a single ASCII space. Length is preserved;
 * no characters are removed.
 *
 * @param str input text; must not be null
 * @return the text with all whitespace normalized to ' '
 */
public static String replaceBlank(String str) {
    // StringBuilder (unsynchronized) presized to the final length.
    StringBuilder buffer = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        buffer.append(Character.isWhitespace(c) ? ' ' : c);
    }
    return buffer.toString();
}
/**
 * Extracts every substring enclosed between {@code start} and {@code end},
 * e.g. "aa[abc]bbb" with delimiters "\\[" / "\\]" yields ["abc"].
 *
 * The delimiters are spliced directly into a regular expression, so callers
 * must pass them already regex-escaped.
 *
 * @return list of captured substrings; empty when str is null/blank or
 *         nothing matches
 */
public static List<String> getSubStrs(String str, String start, String end)
{
    List<String> results = new ArrayList<String>();
    if (str == null || str.trim().length() == 0)
    {
        return results;
    }
    // Pattern: start ( one-or-more chars containing neither delimiter ) end
    Matcher matcher = Pattern
            .compile(String.format("%s([^%s%s]+)%s", start, start, end, end))
            .matcher(str);
    while (matcher.find())
    {
        results.add(matcher.group(1));
    }
    return results;
}
/**
 * Given a template ({@code origStr}) containing the placeholder
 * {@code replaceStr}, recovers the text occupying the placeholder's position
 * in {@code fromStr}.
 *
 * Example: origStr "aaa[xxx]bb", replaceStr "[xxx]", fromStr "aaa123bb"
 * returns "123".
 *
 * @return the extracted text, or null when the placeholder is absent from
 *         the template or the literal prefix/suffix do not line up
 */
public static String getHomologousWord(String replaceStr,
        String origStr, String fromStr)
{
    int pos = origStr.indexOf(replaceStr);
    if (pos == -1)
    {
        return null;
    }
    String prefix = origStr.substring(0, pos);
    String suffix = origStr.substring(pos + replaceStr.length());
    // The literal text surrounding the placeholder must match at both ends.
    if (prefix.length() > 0 && !fromStr.startsWith(prefix))
    {
        return null;
    }
    if (suffix.length() > 0 && !fromStr.endsWith(suffix))
    {
        return null;
    }
    return fromStr.substring(prefix.length(),
            fromStr.length() - suffix.length());
}
/**
 * Repeatedly strips a matched bracket group from the start of the string,
 * e.g. "【tag】title" -> "title". Stops at the first character that is not a
 * recognized opening bracket, or when an opener has no matching closer.
 *
 * @param s input text; must not be null
 * @return the text with leading bracket groups removed
 */
public static String trimBeginningBracket(String s)
{
String ret = s;
if (s.length() == 0)
{
return s;
}
// Opening bracket -> matching closing bracket.
// NOTE(review): some pairs look duplicated; presumably they are fullwidth
// vs. halfwidth variants — confirm in the original encoding.
Map<Character, Character> braketPeers
= new HashMap<Character, Character>();
braketPeers.put('【', '】');
braketPeers.put('[', ']');
braketPeers.put('[', ']');
braketPeers.put('(', ')');
braketPeers.put('(', ')');
braketPeers.put('〔', '〕');
String searchStr = s;
while (searchStr.length() > 0)
{
// If the first char opens a bracket group, find its closer and cut it off.
char beginc = searchStr.charAt(0);
Character value = braketPeers.get(beginc);
if (value == null)
{
break;
}
int endPos = -1;
for (int i = 1; i < searchStr.length(); i ++)
{
if (searchStr.charAt(i) == value)
{
endPos = i;
break;
}
}
if (endPos >= 0)
{
// Drop everything through the closing bracket and keep scanning.
ret = searchStr.substring(endPos+1);
searchStr = ret;
}
else {
// Unmatched opener: leave the rest untouched.
break;
}
}
return ret;
}
/**
 * Removes bracketed groups occurring anywhere inside the string and, as a
 * final pass, strips any leftover closing brackets listed in {@code brakets}.
 *
 * NOTE(review): the scan mutates {@code searchStr} while iterating and, when
 * a closer is missing, deletes the lone opener and resumes at its position —
 * confirm termination and intent for inputs with unmatched brackets.
 *
 * @param s input text; must not be null
 * @return the text with inner bracket groups removed
 */
public static String trimMiddleBracket(String s) {
String ret = s;
if (s.length() == 0) {
return s;
}
// Opening bracket -> matching closing bracket (fullwidth and ASCII forms).
Map<Character, Character> braketPeers = new HashMap<Character, Character>();
String[] brakets = { "】", "]", "]", ")", ")", "〕" };
braketPeers.put('【', '】');
braketPeers.put('[', ']');
braketPeers.put('[', ']');
braketPeers.put('(', ')');
braketPeers.put('(', ')');
braketPeers.put('〔', '〕');
String searchStr = s;
int index = 0;
while (searchStr.length() > 0) {
// Locate the next opening bracket at or after `index`.
int startPos = -1;
Character value = null;
for (int i = index; i < searchStr.length(); i++) {
boolean findLeftBraket = false;
value = searchStr.charAt(i);
for (Character key : braketPeers.keySet()) {
if (value.equals(key)) {
startPos = i;
findLeftBraket = true;
break;
}
}
if (findLeftBraket) {
break;
}
}
// Locate the matching closing bracket after the opener.
int endPos = -1;
for (int i = startPos + 1; i < searchStr.length(); i++) {
if (null != braketPeers.get(value) && searchStr.charAt(i) == braketPeers.get(value)) {
endPos = i;
break;
}
}
if (endPos >= startPos) {
if (startPos >= 0) {
// Cut the whole group [startPos..endPos] out of the working string.
searchStr = searchStr.substring(0, startPos) + searchStr.substring(endPos + 1, searchStr.length());
}
} else {
// No closer found: delete the stray opener and resume at its slot.
searchStr = searchStr.replace(value.toString(), "");
index = startPos;
}
if (startPos < 0) {
// No more openers: keep what is left.
ret = searchStr;
break;
}
}
// Final sweep: drop any remaining closing brackets.
for (String bs : brakets) {
ret = ret.replace(bs.toString(), "");
}
return ret;
}
/**
 * Repeatedly strips a trailing bracketed segment — e.g. "【…】", "[…]",
 * "(…)", "〔…〕" — from the end of the string, working backwards until the
 * last character is no longer a known closing bracket.
 *
 * @param s input string; returned unchanged when empty or not ending with a bracket
 * @return the string with trailing bracketed segments removed
 */
public static String trimEnddingBracket(String s)
{
    String ret = s;
    if (s.length() == 0)
    {
        return s;
    }
    // Closing bracket -> its matching opening bracket (reverse of the
    // mapping used by trimBeginningBracket).
    Map<Character, Character> braketPeers
            = new HashMap<Character, Character>();
    braketPeers.put('】', '【');
    braketPeers.put(']', '[');
    braketPeers.put(')', '(');
    braketPeers.put(')', '(');
    braketPeers.put('〔','〕');
    int endPos = s.length() - 1;
    String searchStr = s;
    while (endPos >= 0)
    {
        // Stop when the string no longer ends with a closing bracket.
        char endc = searchStr.charAt(endPos);
        Character value = braketPeers.get(endc);
        if (value == null)
        {
            break;
        }
        // Scan backwards for the matching opening bracket.
        // NOTE(review): this finds the nearest-from-the-end occurrence, not a
        // nesting-aware match — confirm that nested brackets are not expected.
        int startPos = -1;
        for (int i = searchStr.length() - 2; i >= 0; i --)
        {
            if (searchStr.charAt(i) == value)
            {
                startPos = i;
                break;
            }
        }
        if (startPos >= 0)
        {
            // Drop the trailing "(...)" and continue from just before it.
            ret = searchStr.substring(0, startPos);
            searchStr = ret;
        }
        // When no opener was found, startPos is -1 and the loop terminates.
        endPos = startPos - 1;
    }
    return ret;
}
// public static String delSymbolAndPunc(String s)
// {
// StringBuffer buffer = new StringBuffer();
// s = replaceBlank(s);
// for (int i = 0; i < s.length(); i ++)
// {
// char c = s.charAt(i);
// if (c == ' ' || CharUtil.isSymbol(c) || CharUtil.isPunctuation(c) || Integer.toHexString(c).equals("a0"))
// {
// continue;
// }
// buffer.append(c);
// }
// return buffer.toString();
// }
/**
 * Keeps only the Chinese (CJK ideograph) characters of the input, deleting
 * everything else.
 *
 * @param s input string (must not be {@code null})
 * @return a string containing only the characters for which
 *         {@link #isChinese(char)} is true
 */
public static String delCharNotChinese(String s)
{
    StringBuilder kept = new StringBuilder(s.length());
    for (char ch : s.toCharArray())
    {
        if (isChinese(ch))
        {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Reports whether the character is a CJK unified ideograph in the basic
 * range U+4E00..U+9FA5 (the range covering common Chinese characters).
 *
 * @param c character to test
 * @return {@code true} when {@code c} falls inside U+4E00..U+9FA5
 */
public static boolean isChinese(char c)
{
    // Idiom fix: return the boolean expression directly instead of
    // if (...) return true; return false;
    return c >= 0x4e00 && c <= 0x9fa5;
}
/**
 * Converts fullwidth (zenkaku) characters to their halfwidth (hankaku)
 * equivalents: fullwidth ASCII variants U+FF01..U+FF5E are shifted down by
 * 65248 and the ideographic space U+3000 becomes an ASCII space.
 *
 * @param s input string; {@code null} or empty input is returned as-is
 * @return the halfwidth-normalized string
 */
public static String toBanjiao(String s)
{
    if (s == null || s.isEmpty())
    {
        return s;
    }
    StringBuilder out = new StringBuilder(s.length());
    for (int idx = 0; idx < s.length(); idx++)
    {
        char ch = s.charAt(idx);
        if (ch == 12288)
        {
            // Ideographic space (U+3000) -> ASCII space.
            ch = (char) 32;
        }
        else if (ch >= 65281 && ch <= 65374)
        {
            // Fullwidth ASCII variants are offset from ASCII by a fixed 65248.
            ch = (char) (ch - 65248);
        }
        out.append(ch);
    }
    return out.toString();
}
/**
 * Joins the list elements with ';' as separator (no trailing separator).
 * A {@code null} list yields the empty string; {@code null} elements are
 * rendered as the text "null", matching the previous hand-rolled loop.
 *
 * @param arr list of strings, may be {@code null}
 * @return the ';'-joined string, never {@code null}
 */
public static String listToString(List<String> arr)
{
    if (arr == null)
    {
        return "";
    }
    // Idiom fix: String.join replaces the manual StringBuffer loop with
    // identical output (separator only between elements).
    return String.join(";", arr);
}
/**
 * Splits a ';'-separated string into a list, skipping segments that are
 * blank after trimming. Kept segments are added verbatim (untrimmed).
 *
 * @param str the ';'-separated string, may be {@code null}
 * @return the list of non-blank segments; empty list for {@code null} input
 */
public static List<String> stringToList(String str)
{
    List<String> result = new ArrayList<String>();
    if (str != null)
    {
        for (String part : str.split(";"))
        {
            // Blank pieces (empty or whitespace-only) are dropped.
            if (!part.trim().isEmpty())
            {
                result.add(part);
            }
        }
    }
    return result;
}
/**
 * Decodes a fixed set of HTML entities into plain characters and replaces
 * '|' with a space (the pipe is used as a separator downstream).
 * <p>
 * NOTE(review): "&amp;amp;" is decoded before the entities below it, so a
 * doubly-escaped entity such as "&amp;amp;lt;" ends up fully decoded to
 * "&lt;". This preserves the original ordering — confirm it is intended.
 *
 * @param s input string (must not be {@code null})
 * @return the decoded string
 */
public static String normalizeHtmlTransf(String s)
{
    // Fix: these are literal substitutions, so use replace() instead of
    // replaceAll() (which compiles each pattern as a regex needlessly).
    String ret = s.replace("&bull;", "·");
    ret = ret.replace("&middot;", "·");
    ret = ret.replace("&nbsp;", " ");
    ret = ret.replace("&quot;", "\"");
    ret = ret.replace("&amp;", "&");
    ret = ret.replace('・', '·');
    ret = ret.replace("&ldquo;", "\"");
    ret = ret.replace("&rdquo;", "\"");
    ret = ret.replace("&hellip;", "...");
    ret = ret.replace("&lt;", "<");
    ret = ret.replace("&gt;", ">");
    ret = ret.replace("&mdash;", "—");
    ret = ret.replace("&ndash;", "–");
    ret = ret.replace("&tilde;", "~");
    ret = ret.replace("&lsquo;", "'");
    ret = ret.replace("&rsquo;", "'");
    ret = ret.replace("&sbquo;", ",");
    ret = ret.replace("&lsaquo;", "‹");
    ret = ret.replace("&rsaquo;", "›");
    // Dead code removed: the original replaced "&hellip;" a second time with
    // "…", but every occurrence was already consumed by the "..." rule above.
    ret = ret.replace("|", " ");
    return ret;
}
/**
 * Normalizes segment text: collapses the sequence CRLF+';' to a space,
 * strips bare newlines, and replaces '|' (used as a field separator
 * downstream) with a space.
 *
 * @param s input string (must not be {@code null})
 * @return the normalized string
 */
public static String normalizeSegTransf(String s)
{
    // NOTE(review): the regex "\r\n;" matches a literal CRLF followed by a
    // semicolon — possibly intended as the character class "[\r\n;]".
    // Behavior kept as-is; confirm the intent before changing.
    String ret = s.replaceAll("\r\n;", " ");
    ret = ret.replace("\n", "");
    ret = ret.replace("|", " ");
    return ret;
}
}
package com.zzsn.download;
/**
 * Manual smoke test: downloads the Baidu home page with
 * {@link PageDownloader} and prints the raw HTML to stdout.
 */
public class Test {
    public static void main(String[] args) {
        final String targetUrl = "https://www.baidu.com/";
        final String charset = "utf-8";
        PageDownloader loader = new PageDownloader();
        // downloadWithStr(url, encoding, flag1, flag2) — both flags off,
        // matching the original invocation.
        String html = loader.downloadWithStr(targetUrl, charset, false, false);
        System.out.println(html);
    }
}
...@@ -2,6 +2,8 @@ package com.zzsn.entity; ...@@ -2,6 +2,8 @@ package com.zzsn.entity;
import lombok.Data; import lombok.Data;
import java.util.Date;
@Data @Data
public class BadSiteMsg { public class BadSiteMsg {
...@@ -9,16 +11,10 @@ public class BadSiteMsg { ...@@ -9,16 +11,10 @@ public class BadSiteMsg {
private String id; private String id;
/**信息源编码*/ /**信息源编码*/
private String infoSourceCode; private String infoSourceCode;
/**信息源名称*/ /**爬虫类别(1:动态 2:静态 3:500强 4:智库 5:百度)**/
private String webSiteName;
/**栏目名称*/
private String siteName;
/**栏目地址*/
private String siteUri;
/**有问题类型*/
private String errorType;
/**问题类型(1:信息源异常 2:爬取类别设置异常)*/
private String problemType;
/**爬虫类型(0:静态爬取 1:动态爬取)*/
private String crawlerType; private String crawlerType;
/**分区id (多个用英文逗号隔开)*/
private String partition;
/**消费时间*/
private Date consumerDate;
} }
package com.zzsn.entity;
import lombok.Data;
@Data
public class BadSiteMsgBak {
/** Primary key. */
private String id;
/** Information-source code. */
private String infoSourceCode;
/** Website name of the information source. */
private String webSiteName;
/** Column (channel) name. */
private String siteName;
/** Column (channel) URL. */
private String siteUri;
/** Description of the error encountered. */
private String errorType;
/** Problem category (1: source abnormal, 2: crawl-type misconfigured). */
private String problemType;
/** Crawler type (0: static crawl, 1: dynamic crawl). */
private String crawlerType;
}
...@@ -165,6 +165,7 @@ public class Constants { ...@@ -165,6 +165,7 @@ public class Constants {
//判断重复的rate //判断重复的rate
public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE")); public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE"));
public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL"); public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL");
public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE")); public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE"));
//国资监管评价中心相关性过滤算法URl(XGBOOST) //国资监管评价中心相关性过滤算法URl(XGBOOST)
public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL"); public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL");
......
...@@ -18,7 +18,6 @@ public class JedisUtil { ...@@ -18,7 +18,6 @@ public class JedisUtil {
private static final String PREFIX = "comm_"; private static final String PREFIX = "comm_";
private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class); private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
private static JedisPool jedisPool = null; private static JedisPool jedisPool = null;
private JedisUtil() { private JedisUtil() {
} }
...@@ -127,7 +126,13 @@ public class JedisUtil { ...@@ -127,7 +126,13 @@ public class JedisUtil {
} }
getDefaultJedis().del(PREFIX + key); getDefaultJedis().del(PREFIX + key);
} }
public static void delString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
getDefaultJedis().del(key);
}
public static void setString(String key, String value, int expireTime) throws Exception { public static void setString(String key, String value, int expireTime) throws Exception {
Jedis jedis=null; Jedis jedis=null;
try { try {
......
...@@ -43,7 +43,7 @@ public class HttpClientTester { ...@@ -43,7 +43,7 @@ public class HttpClientTester {
private static PageBuilderParser builderParser = null; private static PageBuilderParser builderParser = null;
public static void main(String[] args) { public static void main(String[] args) {
// get("https://edition.cnn.com/world"); // get("https://edition.cnn.com/world");
String html = HttpgetUtil.getHtml("https://edition.cnn.com/world"); String html = HttpgetUtil.getHtml("http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1");
System.out.println(html); System.out.println(html);
// post(); // post();
} }
......
...@@ -8,7 +8,7 @@ import java.net.MalformedURLException; ...@@ -8,7 +8,7 @@ import java.net.MalformedURLException;
public class WebClientTest { public class WebClientTest {
public static void main(String[] args) throws Exception{ public static void main(String[] args) throws Exception{
String url="http://www.sgcc.com.cn/html/sgcc_main/col2017021879/column_2017021879_1.shtml"; String url="http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1";
String charset="utf-8"; String charset="utf-8";
String s = downloadByWebClient(url, charset); String s = downloadByWebClient(url, charset);
System.out.println(s.length()); System.out.println(s.length());
......
package com.test; package com.zzsn.test;
import com.zzsn.download.RequestUtil;
import java.io.*; import java.io.*;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
...@@ -9,6 +11,12 @@ public class test { ...@@ -9,6 +11,12 @@ public class test {
String urlStr="http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg"; String urlStr="http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
String fileName="testImg.png"; String fileName="testImg.png";
downLoadByUrl(urlStr,fileName); downLoadByUrl(urlStr,fileName);
try {
RequestUtil requestUtil =RequestUtil.getInstance();
String body = requestUtil.httpGetRequest(urlStr);
} catch (Exception e) {
// e.printStackTrace();
}
} }
/** /**
* 从网络Url中下载文件 * 从网络Url中下载文件
......
...@@ -11,6 +11,7 @@ import org.openqa.selenium.chrome.ChromeOptions; ...@@ -11,6 +11,7 @@ import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.HttpCommandExecutor; import org.openqa.selenium.remote.HttpCommandExecutor;
import java.net.URL; import java.net.URL;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -33,16 +34,23 @@ public class DriverUtil { ...@@ -33,16 +34,23 @@ public class DriverUtil {
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE); System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
ChromeDriverService service = ChromeDriverService.createDefaultService(); ChromeDriverService service = ChromeDriverService.createDefaultService();
ChromeOptions options = new ChromeOptions(); ChromeOptions options = new ChromeOptions();
//浏览器启动的位置
// options.setBinary(Constants.CHROMEBIN);
// 无痕模式 // 无痕模式
options.addArguments("--incognito"); options.addArguments("--incognito");
// 禁用沙箱 // 禁用沙箱
options.addArguments("no-sandbox"); options.addArguments("--no-sandbox");
// 禁用GPU // 禁用GPU
options.addArguments("--disable-gpu"); options.addArguments("--disable-gpu");
// 禁用图形界面(此模式启动会导致驱动通信异常) // 禁用图形界面(此模式启动会导致驱动通信异常)
// options.addArguments("--headless"); // options.addArguments("--headless");
// 禁用插件 // 禁用插件
options.addArguments("disable-extensions"); options.addArguments("disable-extensions");
// 屏蔽“chrome正受到自动测试软件的控制”的提示
// options.addArguments("--disable-infobars", "--disable-blink-features=AutomationControlled");
// options.addArguments("--remote-debugging-port=9222");
// options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
// 重新初始化一个chrome浏览器实例 // 重新初始化一个chrome浏览器实例
return new ChromeDriver(service, options); return new ChromeDriver(service, options);
} }
...@@ -66,7 +74,7 @@ public class DriverUtil { ...@@ -66,7 +74,7 @@ public class DriverUtil {
ReuseWebDriver driver=null; ReuseWebDriver driver=null;
try { try {
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
System.out.println(driver.connectTestFail()); // log.info("驱动连接失败:"+driver.connectTestFail());
if (driver.connectTestFail()) { if (driver.connectTestFail()) {
// 若驱动返回错误码,重新创建驱动服务并缓存 // 若驱动返回错误码,重新创建驱动服务并缓存
ChromeDriver chromeDriver = DriverUtil.reconnectDriver(); ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
...@@ -81,11 +89,11 @@ public class DriverUtil { ...@@ -81,11 +89,11 @@ public class DriverUtil {
map.put("serverUrl", serverUrl); map.put("serverUrl", serverUrl);
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map)); log.info("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
} }
}catch (Exception e){ }catch (Exception e){
System.out.println("出现异常"); log.info("获取驱动driver出现异常");
// 若驱动返回错误码,重新创建驱动服务并缓存 // 若驱动返回错误码,重新创建驱动服务并缓存
ChromeDriver chromeDriver = DriverUtil.reconnectDriver(); ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
serverUrl = DriverUtil.getServerUrl(chromeDriver); serverUrl = DriverUtil.getServerUrl(chromeDriver);
...@@ -99,7 +107,6 @@ public class DriverUtil { ...@@ -99,7 +107,6 @@ public class DriverUtil {
map.put("serverUrl", serverUrl); map.put("serverUrl", serverUrl);
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
} }
return driver; return driver;
...@@ -114,11 +121,11 @@ public class DriverUtil { ...@@ -114,11 +121,11 @@ public class DriverUtil {
Map<String, String> map =getSessionInfo(); Map<String, String> map =getSessionInfo();
String sessionId = map.get("sessionId"); String sessionId = map.get("sessionId");
String serverUrl = map.get("serverUrl"); String serverUrl = map.get("serverUrl");
log.info("从redis中获取保存的sessionId:"+sessionId);
return connectChrome(sessionId, serverUrl); return connectChrome(sessionId, serverUrl);
} }
public static Map<String, String> getSessionInfo() throws Exception{ public static Map<String, String> getSessionInfo() throws Exception{
String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE); String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
System.out.println("获取驱动session:"+Constants.SELENIUM_DRIVER_CACHE+"::"+cacheInfo);
Map<String, String> map = JSON.parseObject(cacheInfo, Map.class); Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
if(map==null || map.size()<1) { if(map==null || map.size()<1) {
map = new HashMap<>(2); map = new HashMap<>(2);
...@@ -126,7 +133,6 @@ public class DriverUtil { ...@@ -126,7 +133,6 @@ public class DriverUtil {
map.put("serverUrl", "https://www.baidu.com/"); map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动session失败重新设置:"+Constants.SELENIUM_DRIVER_CACHE+"::"+JSON.toJSONString(map));
} }
return map; return map;
} }
......
...@@ -160,7 +160,9 @@ public class PublishDateUtil { ...@@ -160,7 +160,9 @@ public class PublishDateUtil {
{ {
// return formatUSDate(raw); // return formatUSDate(raw);
return raw; // return raw;
//当时间解析不了时返回空
return "";
} }
} }
......
...@@ -58,9 +58,9 @@ kafka.consumer.task=0 0/2 * * * ? ...@@ -58,9 +58,9 @@ kafka.consumer.task=0 0/2 * * * ?
kafka.producer.servers=114.115.159.144:9092 kafka.producer.servers=114.115.159.144:9092
#kafka.producer.servers=39.101.72.117:19092 #kafka.producer.servers=39.101.72.117:19092
kafka.producer.retries=0 kafka.producer.retries=0
kafka.producer.batch.size=4096 kafka.producer.batch.size=409600
kafka.producer.linger=1 kafka.producer.linger=1
kafka.producer.buffer.memory=40960 kafka.producer.buffer.memory=409600
spring.activemq.broker-url= tcp://127.0.0.1:61616 spring.activemq.broker-url= tcp://127.0.0.1:61616
......
...@@ -25,7 +25,6 @@ SUBJECT_MEMCACHED_DAYS=0 ...@@ -25,7 +25,6 @@ SUBJECT_MEMCACHED_DAYS=0
JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
JWYQJC_MEMCACHED_DAYS=10 JWYQJC_MEMCACHED_DAYS=10
TITLE_SIMILARITY_RATE=0.8 TITLE_SIMILARITY_RATE=0.8
MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
CACHE_UPDATE=1 CACHE_UPDATE=1
...@@ -36,7 +35,7 @@ PROXYID=1 ...@@ -36,7 +35,7 @@ PROXYID=1
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= E:\\chrome\\chromedriver.exe CHROMEDRIVE= E:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\Users\WIN10\AppData\Local\Google\Chrome\Application\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
#mysql connection #mysql connection
...@@ -52,7 +51,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -52,7 +51,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#KAFKA_CONSUMER_TOPIC = staticCrawlTopic #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin
# #
KAFKA_CONSUMER_GROUP_ID=test-zs1 KAFKA_CONSUMER_GROUP_ID=test1
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
KAFKA_PRODUCT_TOPIC=crawlerInfo KAFKA_PRODUCT_TOPIC=crawlerInfo
...@@ -72,15 +71,15 @@ KAFKA_PRODUCT_PARTITION=0 ...@@ -72,15 +71,15 @@ KAFKA_PRODUCT_PARTITION=0
#redis.host=114.116.26.150 #redis.host=114.116.26.150
#redis.port=6379 #redis.port=6379
#redis.pass=zzsn9988 #redis.pass=zzsn9988
#redis.host=114.115.236.206 redis.host=114.115.236.206
#redis.port=6379 redis.port=6379
#redis.pass=clbzzsn redis.pass=clbzzsn
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
redis.host=127.0.0.1 #redis.host=127.0.0.1
redis.port=6379 #redis.port=6379
redis.pass=xxxxxx #redis.pass=xxxxxx
redis.timeout=10000 redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
...@@ -100,6 +99,8 @@ IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\ ...@@ -100,6 +99,8 @@ IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\
selenium.driver.cache=selenium_driver_cache_loc112 selenium.driver.cache=selenium_driver_cache_loc112
#采集缓存的rediskey
MODEL_SCORE_URL=dy-1
......
# Redis settings # Redis settings
#redis.host=114.115.236.206 redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
redis.host=127.0.0.1
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=clbzzsn
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
redis.timeout=10000 redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论