提交 a18c008d 作者: liuweigang

采集代码更新9

上级 83f00b0f
...@@ -253,7 +253,7 @@ ...@@ -253,7 +253,7 @@
</dependency> </dependency>
<!--WebMagic 爬虫框架--> <!--WebMagic 爬虫框架-->
<dependency> <!--<dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.7.5</version> <version>0.7.5</version>
...@@ -274,7 +274,7 @@ ...@@ -274,7 +274,7 @@
<artifactId>slf4j-log4j12</artifactId> <artifactId>slf4j-log4j12</artifactId>
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>-->
<!-- 下面引用本地包,webMgic源码包修过后--> <!-- 下面引用本地包,webMgic源码包修过后-->
<!-- <dependency>--> <!-- <dependency>-->
<!-- <groupId>us.codecraft</groupId>--> <!-- <groupId>us.codecraft</groupId>-->
...@@ -307,11 +307,11 @@ ...@@ -307,11 +307,11 @@
<!-- </exclusions>--> <!-- </exclusions>-->
<!-- </dependency>--> <!-- </dependency>-->
<!--&lt;!&ndash; &lt;!&ndash; 上面引用本地包,webMgic源码包修过后&ndash;&gt;&ndash;&gt;--> <!--&lt;!&ndash; &lt;!&ndash; 上面引用本地包,webMgic源码包修过后&ndash;&gt;&ndash;&gt;-->
<dependency> <!--<dependency>
<groupId>com.github.detro</groupId> <groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId> <artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version> <version>1.2.0</version>
</dependency> </dependency>-->
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
......
...@@ -8,7 +8,6 @@ import com.zzsn.search.MetaBaiduSearchThread; ...@@ -8,7 +8,6 @@ import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg; import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SpringContextUtil; import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants; import com.zzsn.utility.index.Constants;
import com.zzsn.webMagic.*;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs; import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerConfig;
...@@ -35,39 +34,36 @@ import java.util.Properties; ...@@ -35,39 +34,36 @@ import java.util.Properties;
@Slf4j @Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn") @SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplication extends SpringBootServletInitializer { public class CrawlerMateSearchApplication extends SpringBootServletInitializer{
@Override // @Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) { // protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerMateSearchApplication.class); // return builder.sources(CrawlerMateSearchApplication.class);
} // }
public static void main(String[] args) { public static void main(String[] args) {
SpringApplication.run(CrawlerMateSearchApplication.class,args); SpringApplication.run(CrawlerMateSearchApplication.class,args);
} }
/** // @Override
* 采用webMagic框架爬取 // public void run(String... args) throws Exception {
*/ //// try {
public void webMagic(){ //// consumerPartition();
new LinksReadThread().start(); //// }catch (Exception e){
new BaiduContentThread().start(); //// consumerPartition();
//// }
} //// try {
//// loadSiteMsgLoc();
//// }catch (Exception e){
//// loadSiteMsgLoc();
//// }
//// try {
//// consumerPartition();
//// }catch (Exception e){
//// consumerPartition();
//// }
// }
/**
* 老方法抓取
* @throws Exception
*/
public void ordinary() {
System.out.println("——————++++++++++++——————===");
try {
consumerPartition();
} catch (Exception e) {
consumerPartition();
}
loadSiteMsgLoc();
}
public void consumerPartition (){ public void consumerPartition (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
......
package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerMateSearchApplicationbak extends SpringBootServletInitializer implements CommandLineRunner {
// @Override
// protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
// return builder.sources(CrawlerMateSearchApplication.class);
// }
/**
 * Program entry point: passes this class and the command-line arguments to
 * {@link SpringApplication#run(Class, String...)}.
 *
 * @param args command-line arguments, forwarded unchanged
 */
public static void main(String[] args) {
SpringApplication.run(CrawlerMateSearchApplicationbak.class,args);
}
/**
 * {@link CommandLineRunner} callback. The body currently contains only commented-out
 * calls to {@code consumerPartition()} and {@code loadSiteMsgLoc()}, so invoking it does nothing.
 *
 * @param args application arguments; not read by the present body
 * @throws Exception declared by the interface; no statement in the present body throws
 */
@Override
public void run(String... args) throws Exception {
// try {
// consumerPartition();
// }catch (Exception e){
// consumerPartition();
// }
// try {
// loadSiteMsgLoc();
// }catch (Exception e){
// loadSiteMsgLoc();
// }
// try {
// consumerPartition();
// }catch (Exception e){
// consumerPartition();
// }
}
/**
 * Consumes keyword messages from the configured Kafka partitions and crawls each one.
 *
 * <p>The consumer is manually assigned to the partitions listed (comma separated) in
 * {@code Constants.KAFKA_CONSUMER_PARTITION} of topic {@code Constants.KAFKA_CONSUMER_TOPIC}
 * and then polled in an endless loop. The method therefore only ends by throwing, for
 * example when the partition list is malformed or the consumer itself fails; the consumer
 * is closed on the way out.
 *
 * <p>A record that cannot be parsed or crawled is logged and skipped so that a single bad
 * message does not terminate the loop.
 */
public void consumerPartition (){
    log.info("定时获取mq消息");
    // Parse the partition list before opening the consumer so a malformed list cannot leak it.
    List<TopicPartition> topicPartitions = new ArrayList<>();
    for (String partition : Constants.KAFKA_CONSUMER_PARTITION.split(",")) {
        // trim() tolerates lists written as "0, 1"
        topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partition.trim())));
    }
    try (KafkaConsumer<String, String> consumer = createConsumer()) {
        consumer.assign(topicPartitions);
        while (true) {
            // Wait up to one second for data; a zero timeout would turn this loop into a busy spin.
            ConsumerRecords<String, String> records = consumer.poll(1000);
            for (ConsumerRecord<String, String> record : records) {
                try {
                    KeywordMsg keywordMsg = new Gson().fromJson(record.value(), KeywordMsg.class);
                    if (keywordMsg == null) {
                        // Gson returns null for a null or empty payload; there is nothing to crawl.
                        log.warn("skipping empty keyword message at offset " + record.offset());
                        continue;
                    }
                    log.info("关键词解析keywordMsg正常");
                    consumer.commitSync();
                    MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
                    metaSearchThread.keywordMsg = keywordMsg;
                    metaSearchThread.crawler();
                    log.info("关键词请求结束++++");
                } catch (Exception e) {
                    // String concatenation is null-safe, so a null payload cannot make the handler itself throw.
                    log.error("关键词解析异常: " + record.value(), e);
                }
            }
        }
    }
}
/**
 * Crawls every keyword listed in the local keyword file.
 *
 * <p>Each line of the file at {@code Constants.META_SEARCH_KEYWORDPATH} is used to look up
 * the Redis entry {@code KEY_WORDS_TO_REDIS::<line>}. The text between the first
 * {@code startTime} and the first {@code searchEngines} marker is removed from that entry,
 * the remainder is parsed as a {@link KeywordMsg}, and the message is crawled.
 *
 * <p>An unreadable file ends the method; an entry that is missing, lacks the markers, or
 * fails to parse or crawl is logged and skipped.
 */
public void loadSiteMsgLoc() {
    String filepath= Constants.META_SEARCH_KEYWORDPATH;
    System.out.println(filepath);
    List<String> allLines;
    try {
        allLines = FileUtil.getFileLines(new File(filepath), "utf-8");
    } catch (Exception e) {
        log.error("failed to read keyword file " + filepath, e);
        return;
    }
    if (allLines == null) {
        log.error("keyword file " + filepath + " produced no line list");
        return;
    }
    System.out.println(allLines.size());
    for (String keysite : allLines) {
        try {
            String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::"+keysite);
            System.out.println("——————++++++++++++——————===");
            if (value == null) {
                log.warn("no redis entry for keyword key " + keysite);
                continue;
            }
            int from = value.indexOf("startTime");
            int to = value.indexOf("searchEngines");
            if (from < 0 || to < from) {
                // Without both markers in order there is no span to cut out; skip the entry as before.
                log.warn("redis entry for keyword key " + keysite + " lacks the startTime/searchEngines markers");
                continue;
            }
            String subvalue = value.replace(value.substring(from, to), "");
            KeywordMsg keywordMsg = new Gson().fromJson(subvalue, KeywordMsg.class);
            log.info("关键词解析keywordMsg正常");
            MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
            metaSearchThread.keywordMsg = keywordMsg;
            metaSearchThread.crawler();
        } catch (Exception e) {
            // Best effort: one failing keyword must not stop the remaining ones.
            log.warn("crawl failed for keyword key " + keysite, e);
        }
    }
}
/**
 * Runs a single crawl for one hard-coded keyword message.
 *
 * <p>The JSON literal below is parsed into a {@link KeywordMsg}, attached to a new
 * {@link MetaBaiduSearchThread}, and crawled. Any exception is printed to standard error.
 */
public void loadloc(){
    final String sampleMessage="{\n" +
            " \"id\": \"2022090522\",\n" +
            " \"wordsCode\": \"KW-20220602-0003\",\n" +
            " \"wordsName\": \"2022世界机器人大会\",\n" +
            " \"keyWord\": \"2022世界机器人大会\",\n" +
            " \"exclusionWord\": null,\n" +
            " \"status\": \"1\",\n" +
            " \"subjectId\": null,\n" +
            " \"subjectIds\": null,\n" +
            " \"startTime\": null,\n" +
            " \"endTime\": null \n" +
            "}";
    try {
        // Parse first, then build the searcher, exactly in that order.
        final KeywordMsg parsed = new Gson().fromJson(sampleMessage, KeywordMsg.class);
        final MetaBaiduSearchThread searcher = new MetaBaiduSearchThread();
        searcher.keywordMsg = parsed;
        searcher.crawler();
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
/**
 * Builds a String/String Kafka consumer from the settings held in {@code Constants}.
 *
 * <p>Prints the bootstrap server list to standard output, enables auto commit, limits each
 * poll to one record and sets {@code max.poll.interval.ms} to one hour.
 *
 * @return a new, unassigned consumer
 */
private static KafkaConsumer<String, String> createConsumer() {
    System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
    final int oneHourMillis = 60*60*1000;
    final Properties config = new Properties();

    // Where to connect and which group to join.
    config.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
    config.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);

    // Keys and values are plain strings.
    config.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
    config.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);

    // Offset handling; the reset policy (latest / earliest) comes from configuration.
    config.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
    config.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);

    // Allow up to one hour between polls and fetch one record per poll.
    config.put("max.poll.interval.ms", oneHourMillis);
    config.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);

    return new KafkaConsumer<>(config);
}
}
...@@ -87,56 +87,56 @@ public class ThreadExecutorConfig { ...@@ -87,56 +87,56 @@ public class ThreadExecutorConfig {
} }
@Bean(value = "asyncexecutorService") // @Bean(value = "asyncexecutorService")
public Executor executorService() { // public Executor executorService() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//线程池维护线程的最少数量 // executor.setCorePoolSize(1);//线程池维护线程的最少数量
executor.setMaxPoolSize(1);//线程池维护线程的最大数量 // executor.setMaxPoolSize(1);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
@Bean(value = "asyncexecutorServiceWebBaidu") // @Bean(value = "asyncexecutorServiceWebBaidu")
public Executor executorServiceWebBaidu() { // public Executor executorServiceWebBaidu() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//线程池维护线程的最少数量 // executor.setCorePoolSize(5);//线程池维护线程的最少数量
executor.setMaxPoolSize(5);//线程池维护线程的最大数量 // executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
@Bean(value = "asyncexecutorServiceDetailUrl") // @Bean(value = "asyncexecutorServiceDetailUrl")
public Executor asyncexecutorServiceDetailUrl() { // public Executor asyncexecutorServiceDetailUrl() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); // ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//线程池维护线程的最少数量 // executor.setCorePoolSize(5);//线程池维护线程的最少数量
executor.setMaxPoolSize(5);//线程池维护线程的最大数量 // executor.setMaxPoolSize(5);//线程池维护线程的最大数量
executor.setQueueCapacity(100000);//缓存队列 // executor.setQueueCapacity(100000);//缓存队列
executor.setThreadNamePrefix("selenium-"); // executor.setThreadNamePrefix("selenium-");
/** // /**
* 对拒绝task的处理策略 // * 对拒绝task的处理策略
rejection-policy:当pool已经达到max size的时候,如何处理新任务 // rejection-policy:当pool已经达到max size的时候,如何处理新任务
CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行 // CALLER_RUNS:不在新线程中执行任务,而是由调用者所在的线程来执行
*/ // */
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); // executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//允许的空闲时间 // executor.setKeepAliveSeconds(60*60);//允许的空闲时间
executor.initialize(); // executor.initialize();
return executor; // return executor;
} // }
} }
\ No newline at end of file
package com.zzsn.entity;
import lombok.Data;
import java.util.Date;
/**
 * Data holder with an id, an information-source code, a crawler type, a partition list
 * and a consumption time. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class BadSiteMsg {
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Crawler category (1: dynamic, 2: static, 3: top-500 companies, 4: think tank, 5: Baidu). */
private String crawlerType;
/** Partition ids (multiple ids are separated by ASCII commas). */
private String partition;
/** Consumption time. */
private Date consumerDate;
}
...@@ -52,8 +52,10 @@ public class KafkaConsumerJob { ...@@ -52,8 +52,10 @@ public class KafkaConsumerJob {
public static final ExecutorService poolExecuter = new BlockThreadPoolExecute(5 public static final ExecutorService poolExecuter = new BlockThreadPoolExecute(5
, 10 , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1)); , 10 , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
/**fixedDelay:上一次执行完毕时间点之后一分钟再执行*/
// @Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") @Scheduled(fixedDelay=60000)
// @Async("asyncTaskExecutor")
public void consumer (){ public void consumer (){
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
...@@ -67,10 +69,14 @@ public class KafkaConsumerJob { ...@@ -67,10 +69,14 @@ public class KafkaConsumerJob {
ConsumerRecords<String, String> records = consumer.poll(0); ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync(); consumer.commitSync();
for(ConsumerRecord record : records){ for(ConsumerRecord record : records){
try {
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class); KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
MetaBaiduSearchThread metaSearchThread=new MetaBaiduSearchThread(); MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg=keywordMsg; metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler(); metaSearchThread.crawler();
}catch (Exception e){
continue;
}
} }
} }
}catch (Exception e){ }catch (Exception e){
...@@ -81,8 +87,9 @@ public class KafkaConsumerJob { ...@@ -81,8 +87,9 @@ public class KafkaConsumerJob {
} }
// @Scheduled(cron = "0 0/2 * * * ?") // @Scheduled(cron = "0 0/2 * * * ?")
@Async("asyncTaskExecutor") @Scheduled(fixedDelay=300000)
public void consumerPartition (){ public void consumerPartition (){
try {
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
...@@ -94,24 +101,26 @@ public class KafkaConsumerJob { ...@@ -94,24 +101,26 @@ public class KafkaConsumerJob {
} }
consumer.assign(topicPartitions); consumer.assign(topicPartitions);
try{ try {
while(true){ while (true) {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环 //消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0); ConsumerRecords<String, String> records = consumer.poll(0);
for(ConsumerRecord record : records){ for (ConsumerRecord record : records) {
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class); KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
consumer.commitSync(); consumer.commitSync();
MetaBaiduSearchThread metaSearchThread=new MetaBaiduSearchThread(); MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg=keywordMsg; metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler(); metaSearchThread.crawler();
} }
} }
} catch (Exception e) {
// consumer = createConsumer();
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}catch (Exception e){ }catch (Exception e){
consumer = createConsumer(); log.info("kafka调用信息失败");
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
} }
} }
// @Scheduled(initialDelay = 1000, fixedRate = Long.MAX_VALUE) // @Scheduled(initialDelay = 1000, fixedRate = Long.MAX_VALUE)
@Async("asyncTaskExecutor") @Async("asyncTaskExecutor")
......
...@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; ...@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil; import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils; import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo; import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.BadSiteMsg;
import com.zzsn.entity.Site; import com.zzsn.entity.Site;
import com.zzsn.entity.SiteTemplate; import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag; import com.zzsn.paser.SourceTemplateByTag;
...@@ -84,6 +85,8 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -84,6 +85,8 @@ public class MetaBaiduSearchThread implements Runnable {
// @Async("asyncexecutorServiceWebBaidu") // @Async("asyncexecutorServiceWebBaidu")
public void crawler(){ public void crawler(){
// sentBadSiteMsg(keywordMsg,Constants.CRAWLER_SERVER,Constants.KAFKA_CONSUMER_PARTITION);
//对传进来的关键词组进行组合 //对传进来的关键词组进行组合
String keyWord = keywordMsg.getKeyWord(); String keyWord = keywordMsg.getKeyWord();
List<String> keyWords = SplitKeyword.transForm(keyWord); List<String> keyWords = SplitKeyword.transForm(keyWord);
...@@ -153,6 +156,24 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -153,6 +156,24 @@ public class MetaBaiduSearchThread implements Runnable {
} }
} }
public void sentBadSiteMsg(KeywordMsg keymsg,String crawlerType,String partition){
try {
BadSiteMsg badSiteMsg = new BadSiteMsg();
badSiteMsg.setId(keymsg.getId());
badSiteMsg.setInfoSourceCode(keymsg.getWordsCode());
badSiteMsg.setConsumerDate(new Date());
badSiteMsg.setCrawlerType(crawlerType);
badSiteMsg.setPartition(partition);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(badSiteMsg);
kafkaTemplate.send("crawler_consumer", docjson);
}catch (Exception e){
}
}
private String locateCharSet(String url) { private String locateCharSet(String url) {
String encoding = "gbk"; String encoding = "gbk";
try { try {
......
...@@ -78,7 +78,7 @@ public class DetailBaiduSearchThread implements Runnable { ...@@ -78,7 +78,7 @@ public class DetailBaiduSearchThread implements Runnable {
@Async("asyncexecutorServiceDetailUrl") // @Async("asyncexecutorServiceDetailUrl")
public void crawler(){ public void crawler(){
try { try {
......
...@@ -82,7 +82,7 @@ public class WebBaiduSearchThread implements Runnable { ...@@ -82,7 +82,7 @@ public class WebBaiduSearchThread implements Runnable {
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Async("asyncexecutorServiceWebBaidu") // @Async("asyncexecutorServiceWebBaidu")
public void crawler(){ public void crawler(){
//对传进来的关键词组进行组合 //对传进来的关键词组进行组合
String keyWord = keywordMsg.getKeyWord(); String keyWord = keywordMsg.getKeyWord();
......
...@@ -113,12 +113,13 @@ public class Constants { ...@@ -113,12 +113,13 @@ public class Constants {
public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION"); public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION");
public static final String KAFKA_PRODUCT_PARTITION= prop.getProperty("KAFKA_PRODUCT_PARTITION"); public static final String KAFKA_PRODUCT_PARTITION= prop.getProperty("KAFKA_PRODUCT_PARTITION");
public static final String CRAWLER_SERVER= prop.getProperty("crawler_server");
public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
// public static final int KAFKA_COUNT=Integer.valueOf(prop.getProperty("whiles"));
public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC"); //
// public static final String testBaidu=prop.getProperty("KAFKA_test_TOPIC");
public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize")); //
// public static final Integer PAGESIZE=Integer.valueOf(prop.getProperty("pageSize"));
public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger")); //
// public static final Integer AVERGER=Integer.valueOf(prop.getProperty("averger"));
} }
package com.zzsn.utils;
import com.zzsn.webMagic.KafkaConsumers;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.proxy.Proxy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class GrabUtil {
private final static Logger logger = LoggerFactory.getLogger(GrabUtil.class);
/**
 * Checks whether a single proxy can fetch a fixed probe page.
 *
 * <p>One GET request is sent through the proxy with 2-second connect, socket and
 * connection-request timeouts. Proxy credentials are registered only when the proxy
 * carries a username, so unauthenticated proxies can be checked as well.
 *
 * @param proxyx proxy to test
 * @return {@code true} if the probe answered HTTP 200 and its body could be read;
 *         {@code false} for any other status or any failure
 */
public static boolean checkProxyIp(Proxy proxyx) {
    HttpGet httpGet = new HttpGet("https://www.163.com/dy/article/GDT03TFD05158K7T.html");
    HttpHost proxy = new HttpHost(proxyx.getHost(),proxyx.getPort());
    CredentialsProvider provider = new BasicCredentialsProvider();
    if (proxyx.getUsername() != null) {
        // UsernamePasswordCredentials rejects a null user name, so only register real credentials.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(proxyx.getUsername(), proxyx.getPassword()));
    }
    RequestConfig requestConfig = RequestConfig.custom()
            .setProxy(proxy)
            .setConnectTimeout(2000)
            .setSocketTimeout(2000)
            .setConnectionRequestTimeout(2000)
            .build();
    httpGet.setConfig(requestConfig);
    httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    boolean usable = false;
    // try-with-resources closes the response first and the client second, even on failure.
    try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        int statusCode = response.getStatusLine().getStatusCode();
        // Read the body to the end so a proxy that stalls mid-transfer still counts as unusable.
        EntityUtils.consume(response.getEntity());
        usable = statusCode == 200;
    } catch (Exception e) {
        logger.warn("校验代理ip是否可用出错,错误信息:", e);
    }
    return usable;
}
/**
 * Downloads a page through the given proxy and returns its body decoded as UTF-8.
 *
 * <p>The request uses 5-second connect, socket and connection-request timeouts. Proxy
 * credentials are registered only when the proxy carries a username.
 *
 * @param url     address to fetch
 * @param myproxy proxy to route the request through
 * @return the response body when the server answers HTTP 200; {@code null} for any other
 *         status or when the request fails
 */
public static String httpGet(String url,Proxy myproxy){
    HttpGet httpGet = new HttpGet(url);
    HttpHost proxy = new HttpHost(myproxy.getHost(),myproxy.getPort());
    CredentialsProvider provider = new BasicCredentialsProvider();
    if (myproxy.getUsername() != null) {
        // UsernamePasswordCredentials rejects a null user name, so only register real credentials.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(myproxy.getUsername(), myproxy.getPassword()));
    }
    RequestConfig requestConfig = RequestConfig.custom()
            .setProxy(proxy)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setConnectionRequestTimeout(5000)
            .build();
    httpGet.setConfig(requestConfig);
    httpGet.setHeader("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    httpGet.setHeader("Accept-Encoding", "gzip, deflate");
    httpGet.setHeader("Cache-Control", "max-age=0");
    httpGet.setHeader("Connection", "keep-alive");
    httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");

    // Kept outside the try so a body that was read completely survives a failure while closing.
    String body = null;
    try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode ==200){
            logger.info(" page sucess by ["+proxy.getHostName()+"],"+url+" ");
            HttpEntity entity = response.getEntity();
            body = EntityUtils.toString(entity,"utf-8");
        }else {
            logger.info("page error by ["+proxy.getHostName()+"],"+url+" code:"+statusCode);
        }
    }catch ( Exception e){
        logger.info("page error by ["+proxy.getHostName()+"],"+url+" code:500 ",e);
    }
    return body;
}
/**
 * Splits a list into consecutive chunks of at most {@code iPacket} elements.
 *
 * <p>The chunks are {@link List#subList(int, int)} views of {@code list}, returned in the
 * original order; only the last chunk may be shorter. An empty list yields an empty result.
 *
 * @param list    requests to split; must not be {@code null}
 * @param iPacket chunk size; must be positive
 * @return the chunks, in order
 * @throws IllegalArgumentException if {@code iPacket} is zero or negative
 */
public static List<List<Request>> AverageList(List<Request>list, int iPacket){
    if (iPacket <= 0) {
        // Zero would divide by zero and a negative size would produce a meaningless partition.
        throw new IllegalArgumentException("iPacket must be positive: " + iPacket);
    }
    int total = list.size();
    List<List<Request>> chunks = new ArrayList<>();
    int from = 0;
    while (from < total) {
        // Clamp against the remaining length instead of adding first, which cannot overflow.
        int to = from + Math.min(iPacket, total - from);
        chunks.add(list.subList(from, to));
        from = to;
    }
    return chunks;
}
/**
* 获取神龙ip接口
* 一次获取一个,有效3分钟
*/
// public static MyProxy getShenlongIp(){
// MyProxy proxy=null;
// String url="http://api.shenlongip.com/ip?key=jy0tr2q0&pattern=json&count=1&need=1100&protocol=1&sign=d5654a620e9b424ca87fe1802f4c9e88";
// String result=HttpUtil.get(url);
// logger.info("调用代理IP接口返回结果:"+result);
// if (result!=null && result.length()>0){
// JSONObject re=JSONObject.parseObject(result);
// if (200== re.getInteger("code")){
// JSONArray datas= re.getJSONArray("data");
// if (datas.size()>0){
// for (Object data:datas){
// try {
// JSONObject o=(JSONObject) data;
// long expire= DateUtil.convertDate(o.getString("expire"),"yyyy-MM-dd HH:mm:ss").getTime();
// proxy=new MyProxy(o.getString("ip"),o.getInteger("port"),"hys_81310170_41c8","12345678",expire);
// }catch (Exception e) {
// e.printStackTrace();
// }
// }
// }
// }else {
// logger.info("获取代理IP出错,返回信息:"+result);
// }
// }
// return proxy;
// }
/**
 * Manual smoke test: fetches one fixed Baidu article through the first proxy returned by
 * {@code KafkaConsumers.getProxy()} and prints the result of {@code httpGet}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String target="http://baijiahao.baidu.com/s?id=1662021416338923958&wfr=spider&for=pc;";
    final Proxy firstProxy = KafkaConsumers.getProxy().get(0);
    final String page = httpGet(target, firstProxy);
    System.out.println(page);
}
// 3 175.10.141.61-40013-hys_81310170_41c8-12345678
// 1 125.119.174.226-40032-hys_81310170_41c8-12345678
// 2 122.230.139.103-40004-hys_81310170_41c8-12345678
// 4 183.165.248.64-40007-hys_81310170_41c8-12345678
}
package com.zzsn.webMagic;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.cache.JedisUtil;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.index.Constants;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import java.util.ArrayList;
import java.util.List;
/**
* 抓取 百度内容
*/
public class BaiduContentThread extends Thread {
private final static Logger logger = LoggerFactory.getLogger(BaiduContentThread.class);
/**
 * Drains {@code BaiduTask.baiduUrlQueue} and hands unseen article URLs to WebMagic in batches.
 *
 * <p>Each dequeued item whose source address is not yet a member of the Redis key
 * {@code baidu_web_test5::<orgId>} becomes a {@link Request} that carries the item as JSON
 * under the extra key {@code "model"}. Once five requests have been collected, a
 * {@link Spider} with a {@link BaiduTxtProcessor} is started asynchronously, routed through
 * the proxies from {@code KafkaConsumers.getProxy()} when {@code Constants.PROXY} is
 * {@code "1"}. The thread then pauses five seconds before collecting the next batch.
 *
 * <p>Interrupting the thread ends the loop. Any other failure is logged and the loop continues.
 */
@Override
public void run() {
    List<Request> requestsList=new ArrayList<>();
    while (!Thread.currentThread().isInterrupted()){
        try {
            // NOTE(review): take() presumably blocks until an item arrives; the queue type is declared elsewhere.
            ClbAnsProcessitem entity=(ClbAnsProcessitem) BaiduTask.baiduUrlQueue.take();
            if (entity !=null){
                logger.info("等到内容连接来了,开始处理列表连接。。。。。");
                // Drop the URL if Redis already knows it for this organisation.
                boolean sismember = JedisUtil.sismember("baidu_web_test5::"+entity.getOrgId(), entity.getSourceAddress());
                if (!sismember){
                    Request request=new Request();
                    request.setUrl(entity.getSourceAddress());
                    // Pass the list-page data on to the content processor.
                    request.putExtra("model", JSONObject.toJSONString(entity));
                    requestsList.add(request);
                }else {
                    BaiduTask.doublep++;
                    logger.info("连接:"+entity.getSourceAddress()+"已存在。");
                }
            }
            if (requestsList.size()<5){
                continue;
            }
            // Disable JVM DNS caching so each batch resolves host names afresh.
            java.security.Security.setProperty("networkaddress.cache.ttl" , "0");
            java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
            if ("1".equals(Constants.PROXY)){
                logger.info("获取资讯内容url,启动代理ip,进行下一步内容处理");
                HttpClientDownloader httpClientDownloader=new HttpClientDownloader();
                httpClientDownloader.setProxyProvider(new SimpleProxyProvider(KafkaConsumers.getProxy()));
                Spider.create(new BaiduTxtProcessor())
                        .setDownloader(httpClientDownloader)
                        .startRequest(requestsList)
                        .thread(1)
                        .runAsync();
            }else {
                logger.info("获取资讯内容url,组装好了http请求信息,提交到WebMgic..................");
                Spider.create(new BaiduTxtProcessor())
                        .startRequest(requestsList)
                        .thread(1)
                        .runAsync();
            }
            // The spider keeps the list it was given, so start a fresh one for the next batch.
            requestsList=new ArrayList<>();
            Thread.sleep(1000*5);
        }catch (InterruptedException e){
            // Restore the flag and stop; swallowing the interrupt would make the thread unstoppable.
            Thread.currentThread().interrupt();
            logger.info("content thread interrupted, stopping");
            return;
        }catch (Exception e){
            logger.error("failed to process a content url batch", e);
        }
    }
}
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 根据百度关键字搜索返回列表结果处理
*/
public class BaiduKeyWordProcessor implements PageProcessor {
private final static Logger log = LoggerFactory.getLogger(BaiduKeyWordProcessor.class);
private ClbAnsProcessitem processitem=null;
private List<ClbAnsProcessitem> processitemList=null;
private Site site;
/**
 * Parses one Baidu keyword-search result page and queues every usable result link.
 *
 * <p>Each {@code news-normal} result under {@code content_left} is turned into a
 * {@link ClbAnsProcessitem} holding the link plus, when present, the title, publish date
 * and source site, together with the {@code sid}, {@code tid} and {@code words} extras of
 * the originating request. The item is then put on {@code BaiduTask.baiduUrlQueue}.
 * Links that are empty or contain {@code .pdf}, {@code .PDF} or {@code download} are counted
 * in {@code BaiduTask.invalidUrl} and skipped. A failure on one result is logged and does
 * not stop the remaining results.
 *
 * @param page downloaded search result page
 */
@Override
public void process(Page page) {
    log.info("监听到百度关键字搜索,返回结果");
    List<String> links=page.getHtml().xpath("//div[@id=content_left]/div[@tpl='news-normal']").all();
    processitem=new ClbAnsProcessitem();
    processitemList=new ArrayList<>();
    Request request=page.getRequest();
    log.info("获取到列表Url数:"+(links.size()));
    for (String s:links){
        processitem=new ClbAnsProcessitem();
        // Placeholder shown in the error log when the link itself could not be extracted.
        String link="关键字列表";
        try {
            Html html=Html.create(s);
            link=html.xpath("//div/div/h3").links().toString();
            if (link == null || link.contains(".pdf") || link.trim().length()==0|| link.contains(".PDF")||link.contains("download")) {
                BaiduTask.invalidUrl++;
                continue;
            }
            Elements title=html.getDocument().select("a[data-click]");
            Elements tiem=html.getDocument().select("span.c-color-gray2");
            Elements org=html.getDocument().select("span.c-color-gray");
            // Check for emptiness first: get(0) on an empty selection throws and would drop the result.
            if (title !=null && !title.isEmpty()){
                processitem.setTitle(title.get(0).text());
            }
            if (tiem!=null && tiem.size()>0){
                processitem.setPublishDate(DateUtil.getPublishDate(tiem.get(0).text()));
            }
            if (org !=null && org.size()>0){
                processitem.setSourceSite(org.get(0).text());
            }
            processitem.setSourceAddress(link);
            processitem.setSid(request.getExtra("sid").toString());
            processitem.setOrgId(request.getExtra("tid").toString());
            processitem.setTid(request.getExtra("tid").toString());
            processitem.setKeyWords(request.getExtra("words").toString());
            // Item assembled; queue it for content extraction.
            // NOTE(review): put() presumably blocks while the queue is full; the queue type is declared elsewhere.
            BaiduTask.baiduUrlQueue.put(processitem);
            BaiduTask.linksUrl++;
        }catch (Exception e){
            log.error("解析列表Url["+page.getRequest().getUrl()+"]下的"+link+"+出错,错误信息:",e);
        }
    }
    processitemList=null;
}
@Override
public Site getSite() {
if (site==null){
site=Site.me().setSleepTime(3000); //停3秒
site.addHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
site.setCycleRetryTimes(3);//设置循环次数
site.setTimeOut(8000);
}
return site;
}
// public static void main(String[] args) {
// List<Request> list=new ArrayList<>();
// Request request=new Request();
// Map<String,Object>map=new HashMap<>();
// map.put("aa","这是测试传参");
// request.setExtras(map);
// String url="https://sichuan.scol.com.cn/ggxw/202209/58604583.html";
// String url1="https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%E5%9F%83%E5%A1%9E%E4%BF%84%E6%AF%94%E4%BA%9A%2B%E5%AE%A2%E6%9C%BA%2B%E9%81%87%E9%9A%BE&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=0";
// request.setUrl(url);
// list.add(request);
//// Request r2=new Request();
//// r2.setExtras(map);
//// r2.setUrl(url1);
//// list.add(r2);
// // Proxy proxy=new Proxy("113.243.178.169",40016,"hys_81310170_41c8","12345678");
// // HttpClientDownloader httpClientDownloader=new HttpClientDownloader();
// // httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(proxy));
// Spider.create(new BaiduKeyWordProcessor()).thread(1)
// .startRequest(list)
// // .setDownloader(new SeleniumDownloader("E:\\chromdriver\\chromedriver.exe"))
// // .setDownloader(httpClientDownloader)
// // .addUrl(url)
// .runAsync();
//
// }
}
package com.zzsn.webMagic;
import com.alibaba.fastjson.JSONObject;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.job.KafkaConsumerTask;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SplitKeyword;
import com.zzsn.utility.index.Constants;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
/**
* 百度关键字搜索爬虫任务
*/
@Component
@EnableScheduling
public class BaiduTask {
private final static Logger logger = LoggerFactory.getLogger(BaiduTask.class);
//缓存url列表对象对列,先进先出原则
public static LinkedBlockingQueue<ClbAnsProcessitem> baiduUrlQueue=new LinkedBlockingQueue();
/**
* 关键字队列
*/
public static LinkedBlockingQueue<KeywordMsg> keyWordsQueue=new LinkedBlockingQueue<>();
private final String BAIDU_KEY_URL="https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=%s&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&pn=%d";
/**
* 关键词组个数
*/
static int keyWordsArrey=0;
/**
* 关键词个数
*/
static int keyWordsSigin=0;
/**
* 生成有效百度连接
*/
static int linksUrl=0;
/**
* 无效百度连接
*/
static int invalidUrl=0;
/**
* 重复连接
*/
static int doublep=0;
static int toKafka=0;
/**
* 根据关键词组搜索抓取连接列表
*/
@Scheduled(initialDelay = 40000,fixedRate = 1000*60*10)
public void getFormKeyWordsArry(){
//创建消费者
logger.info("10分钟跑一次。。。。。");
KafkaConsumer<String, String> consumer = KafkaConsumers.createConsumer();
ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
}
consumer.assign(topicPartitions);
KeywordMsg keywordMsg=null; //关键词对象
KeywordMsg nextKeyWords=null; //关键词对象
List<String> keyword=null; //关键词字符串集合
List<KeywordMsg> keywordMsgsList=new ArrayList<>(); //保存所有关键字列表
for (int i=0;i<Constants.KAFKA_COUNT;i++){
ConsumerRecords<String, String> records = consumer.poll(100);
consumer.commitSync();
logger.info("百度关键字搜索消费消息开始。。。。。。。。。。。。");
for (ConsumerRecord consumerRecord:records){
//封装关键字对象
try {
keywordMsg = new Gson().fromJson(consumerRecord.value().toString(), KeywordMsg.class);
/***
* 拿到关键词组,由关键词组再次拼装成关键词生成关键词对象放入队列
* 下一步需要用代理ip来分包爬取连接
*/
if (keywordMsg !=null){
keyWordsArrey++;
//拿到关键词组,再次组装关键词对象
keyword= SplitKeyword.transForm(keywordMsg.getKeyWord());
if (keyword !=null && keyword.size()>0){
for (String keys:keyword){
keyWordsSigin++;
nextKeyWords=new KeywordMsg();
nextKeyWords.setId(keywordMsg.getId());
nextKeyWords.setCrawlerType(keywordMsg.getCrawlerType());
nextKeyWords.setKeyWord(keys);
nextKeyWords.setStatus(keywordMsg.getStatus());
nextKeyWords.setExclusionWord(keywordMsg.getExclusionWord());
nextKeyWords.setSubjectId(keywordMsg.getSubjectId());
nextKeyWords.setWordsCode(keywordMsg.getWordsCode());
nextKeyWords.setWordsName(keywordMsg.getWordsName());
keyWordsQueue.put(nextKeyWords);
}
}
}else {
logger.error("百度搜索关键字,未解析到关键字");
}
}catch (Exception e){
logger.error("从kafka获取关键字出错,错误信息:",e);
}
}
}
}
}
package com.zzsn.webMagic;
import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.StandardWebExtractorHandler;
import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.util.RequestUtil;
import com.zzsn.utility.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.ibatis.annotations.Param;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
* 资讯正文抓取
*/
public class BaiduTxtProcessor implements PageProcessor {
private final static Logger log = LoggerFactory.getLogger(BaiduTxtProcessor.class);
private Site site;
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
private ClbAnsProcessitem processitem=null;
private List<ClbAnsProcessitem> processitemList=null;
@Override
public void process(Page page) {
log.info("监听到进入内容解析中。。。。。。。。。。。。。");
Request request=page.getRequest();
processitem=new ClbAnsProcessitem();
processitemList=new ArrayList<>();
processitem=JSONObject.parseObject(request.getExtra("model").toString(),ClbAnsProcessitem.class);
processitem.setSource("3");
DocInfo docInfo = new DocInfo();
try {
String link=page.getUrl().toString(); //当前页面连接
String infodata=page.getHtml().toString(); //页面内容
//判断网页编码
String contentCharset = Utility.getWebEncodingByStr(infodata);
if(link.contains("toutiao.com") &&(null == infodata || infodata.length() < 50)){
infodata = RequestUtil.getTaotiaoData(link );
}
// 判断是否存在对应域名的模板
if(link.contains("qq.com") && !link.contains("://new.qq.com")){
link= KafkaConsumers.transqqURl(link);
}
String domainurl = new URL(link).getHost();
Object siteTempObj = MemcachedUtils.get("domainUri_"+domainurl);
SiteTemplate siteTemplate=new SiteTemplate();
if (siteTempObj != null && !"null".equals(siteTempObj)) {
com.zzsn.entity.Site site=(com.zzsn.entity.Site)siteTempObj;
siteTemplate.setMatchTitle(site.getMatchTitle());
siteTemplate.setMatchAuthor(site.getMatchAuthor());
siteTemplate.setMatchContent(site.getMatchContent());
siteTemplate.setMatchOrigin(site.getMatchOrigin());
siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
siteTemplate.setMatchSummary(site.getMatchSummary());
docInfo= SourceTemplateByTag.doPaserByTag(infodata, docInfo, siteTemplate);
}
if(null!=docInfo.getContentWithTag()) {
System.out.println("使用模板解析内容成功"+domainurl);
log.info("使用模板解析内容成功"+domainurl);
}
//使用内容规则解析
if(null==docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
new StandardWebExtractorHandler().doHandler(infodata, docInfo);
}
//替换图片相对路径为绝对路径
if (docInfo.getTitle() != null
&& docInfo.getTitle().trim().length() > 0
&& docInfo.getContentNoTag() != null
&& docInfo.getContentNoTag().trim().length() > 0){
ContentFileResult contentFileResult = new ContentFileResult();
contentFileResult =KafkaConsumers.getContentFile(docInfo.getContentWithTag(),docInfo.getSourceaddress());
docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
}
if (StringUtils.isNotBlank(docInfo.getPublishDate())){
processitem.setPublishDate(docInfo.getPublishDate());
}
//只有内容不为空的才进下一步处理
if (StringUtils.isNotBlank(docInfo.getContentNoTag())){
processitem.setContent(docInfo.getContentNoTag());
processitem.setContentWithtag(docInfo.getContentWithTag());
processitem.setSummary(docInfo.getSummary());
processitem.setAuthor(docInfo.getAuthor());
processitem.setCreateDate(DateUtil.now());
String msg=new ObjectMapper().writeValueAsString(processitem);
//输出
kafkaTemplate.send(Constants.testBaidu, msg);
BaiduTask.toKafka++;
log.info("加入到kfaka...........");
// 加入缓存池中
// JedisUtil.sadd("baidu::"+processitem.getOrgId(), processitem.getSourceAddress());
//测试redis
JedisUtil.sadd("baidu_web_test5::"+processitem.getOrgId(), processitem.getSourceAddress());
}else {
log.info("没有获取资讯内容,连接:"+link);
}
}catch (Exception e){
log.error("解析内容逻辑出错,错误信息:",e);
}
log.info("程序处理:关键词组总数:"+BaiduTask.keyWordsArrey+";关键词总数:"+BaiduTask.keyWordsSigin+";有效URL连接数:"+BaiduTask.linksUrl+";无效URL连接数:"+BaiduTask.invalidUrl+";重复连接个数:"+BaiduTask.doublep+";入kafak数量:"+BaiduTask.toKafka);
}
@Override
public Site getSite() {
if (site==null){
site=Site.me().setSleepTime(4000);
site.addHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
site.addHeader(HttpHeaders.CONNECTION, "close");
site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
site.setTimeOut(8000);
site.setCycleRetryTimes(3);
}
return site;
}
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utils.GrabUtil;
import com.zzsn.webMagic.downloader.SeleniumDownloader;
import org.apache.http.HttpHeaders;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.selector.Html;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import java.util.List;
/**
 * Manual test harness: downloads a single page through the Selenium-based
 * downloader and prints the rendered HTML to standard output.
 */
public class Expaler implements PageProcessor {
    Site site;

    /** Dumps the rendered page HTML to standard output. */
    @Override
    public void process(Page page) {
        System.out.println(page.getHtml().toString());
    }

    /**
     * Lazily builds the crawl settings: no sleep between requests, browser-like
     * headers and non-persistent connections.
     */
    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me().setSleepTime(0);
            site.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            site.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
            site.addHeader(HttpHeaders.CONNECTION, "close");
        }
        return site;
    }

    /**
     * Starts a one-URL spider using the Selenium downloader.
     *
     * @param args optional: {@code args[0]} overrides the URL to fetch and
     *             {@code args[1]} overrides the chromedriver path
     */
    public static void main(String[] args) {
        String url = args.length > 0 ? args[0] : "https://www.163.com/dy/article/GTOJV0AF0551M1ZM.html";
        String driverPath = args.length > 1 ? args[1] : "E:\\chromdriver\\chromedriver.exe";
        SeleniumDownloader seleniumDownloader = new SeleniumDownloader(driverPath);
        // getProxy() yields null when the database has no matching row; run unproxied then.
        List<Proxy> proxies = KafkaConsumers.getProxy();
        if (proxies != null && !proxies.isEmpty()) {
            seleniumDownloader.setProxyProvider(new SimpleProxyProvider(proxies));
        }
        Spider.create(new Expaler()).thread(1)
                .setDownloader(seleniumDownloader)
                .addUrl(url)
                .runAsync();
    }
}
package com.zzsn.webMagic;
import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.model.FileTag;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import us.codecraft.webmagic.proxy.Proxy;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers shared by the Baidu crawler: Kafka consumer creation, proxy
 * lookup, QQ news URL rewriting and image-reference rewriting.
 */
public class KafkaConsumers {
    /** First-run-of-digits matcher, compiled once instead of on every call. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Creates a Kafka consumer configured from {@code Constants}: string
     * key/value deserializers, auto-commit disabled, at most one record per
     * poll and a one-hour maximum poll interval.
     *
     * @return a new consumer; the caller is responsible for closing it
     */
    public static org.apache.kafka.clients.consumer.KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        // Where to start reading when no committed offset exists (latest / earliest).
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // Allow up to one hour between polls.
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }

    /**
     * Loads the proxies of one randomly chosen row (id 1-4) of
     * {@code CIS_sys_Proxy}. Each cell is expected to look like
     * {@code host-port-username-password}; malformed cells are skipped.
     *
     * @return the parsed proxies, or {@code null} when the query returned no
     *         rows or failed with an {@link SQLException}
     */
    public static List<Proxy> getProxy() {
        String searchSql = "select proxy from CIS_sys_Proxy where id=" + (RandomUtil.randomInt(4) + 1);
        OracleDBManager dm = new OracleDBManager();
        List<Proxy> proxyList = null;
        try {
            OracleDataTable dt = dm.getResultData(null, null, searchSql);
            if (dt != null && dt.getRowCount() > 0) {
                proxyList = new ArrayList<>();
                for (int i = 0; i < dt.getRowCount(); i++) {
                    for (int j = 0; j < dt.getColCoun(); j++) {
                        Proxy parsed = parseProxy(dt.getRow()[i][j]);
                        if (parsed != null) {
                            proxyList.add(parsed);
                        }
                    }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return proxyList;
    }

    /**
     * Parses one {@code host-port-username-password} cell.
     *
     * @return the proxy, or {@code null} when the cell is missing, too short
     *         (5 characters or fewer) or malformed
     */
    private static Proxy parseProxy(String cell) {
        if (cell == null || cell.length() <= 5) {
            return null;
        }
        String[] ps = cell.split("-");
        if (ps.length < 4) {
            // Do not print the cell itself: it contains credentials.
            System.err.println("Skipping malformed proxy entry (expected host-port-username-password)");
            return null;
        }
        try {
            return new Proxy(ps[0], Integer.valueOf(ps[1].trim()), ps[2], ps[3]);
        } catch (NumberFormatException e) {
            System.err.println("Skipping proxy entry with a non-numeric port");
            return null;
        }
    }

    /**
     * Rewrites a QQ news link into the
     * {@code https://new.qq.com/omn/[date]/[id].html} form, where {@code id} is
     * the last path segment and {@code date} is the first run of digits in it.
     * A query string, fragment or existing {@code .html}/{@code .htm} suffix on
     * the last segment is dropped so it is not duplicated in the result.
     *
     * @param oldurl the original article URL
     * @return the rewritten URL
     */
    public static String transqqURl(String oldurl) {
        String patt = "https://new.qq.com/omn/[date]/[pamars].html";
        String articleId = oldurl.substring(oldurl.lastIndexOf("/") + 1);
        int cut = firstIndexOf(articleId, '?', '#');
        if (cut >= 0) {
            articleId = articleId.substring(0, cut);
        }
        if (articleId.endsWith(".html")) {
            articleId = articleId.substring(0, articleId.length() - ".html".length());
        } else if (articleId.endsWith(".htm")) {
            articleId = articleId.substring(0, articleId.length() - ".htm".length());
        }
        return patt.replace("[date]", getNumbers(articleId)).replace("[pamars]", articleId);
    }

    /** Returns the smallest non-negative index of either character, or -1 when neither occurs. */
    private static int firstIndexOf(String text, char a, char b) {
        int ia = text.indexOf(a);
        int ib = text.indexOf(b);
        if (ia < 0) {
            return ib;
        }
        if (ib < 0) {
            return ia;
        }
        return Math.min(ia, ib);
    }

    /**
     * Returns the first run of decimal digits in the text.
     *
     * @param content the text to scan; may be {@code null}
     * @return the first digit run, or an empty string when there is none
     */
    public static String getNumbers(String content) {
        if (content == null) {
            return "";
        }
        Matcher matcher = DIGITS.matcher(content);
        return matcher.find() ? matcher.group(0) : "";
    }

    /**
     * Replaces every file/image reference found in the HTML with its save tag.
     *
     * @param contentWithTag the article HTML
     * @param sourceaddress  the address passed to
     *                       {@code ContentFileFinder.getContentFileTag} together with the HTML
     * @return a result holding the untouched HTML, the rewritten HTML and a map
     *         from each file's absolute path to its {@link FileTag}
     * @throws Exception if reference discovery fails
     */
    public static ContentFileResult getContentFile(String contentWithTag, String sourceaddress) throws Exception {
        String contentImgCvtTag = contentWithTag;
        Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(contentWithTag, sourceaddress);
        // Key: absolute path of the crawled file; value: its tag.
        Map<String, FileTag> imgMap = new HashMap<String, FileTag>();
        if (imgDataMap != null) {
            for (Map.Entry<String, FileTag> entry : imgDataMap.entrySet()) {
                String key = entry.getKey();
                FileTag fileTag = entry.getValue();
                // String.replace already substitutes every occurrence. Looping until the key
                // disappears never terminates when the save tag contains the key or the key
                // is empty, so replace exactly once.
                if (key != null && !key.isEmpty()) {
                    contentImgCvtTag = contentImgCvtTag.replace(key, fileTag.getSaveTag());
                }
                imgMap.put(fileTag.getAbsolutePath(), fileTag);
            }
        }
        ContentFileResult cis = new ContentFileResult();
        cis.setContentAbsoulute(contentWithTag);
        cis.setContentImgCvtTag(contentImgCvtTag);
        cis.setFileMap(imgMap);
        return cis;
    }
}
package com.zzsn.webMagic;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
/**
 * Worker thread that takes keywords from {@code BaiduTask.keyWordsQueue},
 * builds the Baidu news-search requests for each one (one per result page, up
 * to {@code Constants.PAGESIZE} pages) and hands them to a WebMagic spider
 * running {@link BaiduKeyWordProcessor}.
 *
 * <p>The thread runs until it is interrupted.
 */
public class LinksReadThread extends Thread {
    private final static Logger logger = LoggerFactory.getLogger(LinksReadThread.class);
    private final String BAIDU_KEY_URL="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%s&pn=%d";

    @Override
    public void run() {
        // Ask the JVM not to cache DNS lookups; set once rather than on every iteration.
        java.security.Security.setProperty("networkaddress.cache.ttl" , "0");
        java.security.Security.setProperty("networkaddress.cache.negative.ttl", "0");
        List<Request> requestByMcList = new ArrayList<>();
        while (!Thread.currentThread().isInterrupted()) {
            try {
                KeywordMsg keywordMsg = BaiduTask.keyWordsQueue.take();
                logger.info("等到关键字来了。。。。。。");
                appendSearchRequests(keywordMsg, requestByMcList);
                if (requestByMcList.isEmpty()) {
                    continue;
                }
                if ("1".equals(Constants.PROXY)) {
                    logger.info("获取待处理一批url,获取代理ip请等待进行下步处理");
                    List<Proxy> proxies = KafkaConsumers.getProxy();
                    if (proxies == null || proxies.isEmpty()) {
                        // No proxy available: keep the pending requests, wait, and retry
                        // them together with the next keyword.
                        Thread.sleep(1000 * 10);
                        continue;
                    }
                    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                    httpClientDownloader.setProxyProvider(new SimpleProxyProvider(proxies));
                    Spider.create(new BaiduKeyWordProcessor())
                            .setDownloader(httpClientDownloader)
                            .startRequest(requestByMcList)
                            .thread(1)
                            .runAsync();
                } else {
                    // Proxy disabled: start crawling directly.
                    logger.info("获取待处理url提交到webMgic,提交数:" + requestByMcList.size());
                    Spider.create(new BaiduKeyWordProcessor())
                            .startRequest(requestByMcList)
                            .thread(1)
                            .runAsync();
                }
                // Allocate a new list instead of clearing: the asynchronously running
                // spider was handed the previous list instance.
                requestByMcList = new ArrayList<>();
                Thread.sleep(1000 * 8);
            } catch (InterruptedException e) {
                // Restore the flag and leave the loop so the thread can be stopped.
                Thread.currentThread().interrupt();
                logger.info("关键字读取线程被中断,退出");
                return;
            } catch (Exception e) {
                logger.error("处理关键字出错,错误信息:", e);
            }
        }
    }

    /**
     * Appends one search request per result page for the keyword, carrying the
     * keyword's id, code and text as the {@code sid}, {@code tid} and
     * {@code words} extras. A failure is logged; requests built before it stay
     * in the list.
     */
    private void appendSearchRequests(KeywordMsg keywordMsg, List<Request> target) {
        try {
            for (int i = 0; i < Constants.PAGESIZE; i++) {
                Request webReq = new Request();
                webReq.putExtra("sid", keywordMsg.getId());
                webReq.putExtra("tid", keywordMsg.getWordsCode());
                webReq.putExtra("words", keywordMsg.getKeyWord());
                // Baidu pages hold 10 results each, so page i starts at offset i * 10.
                webReq.setUrl(String.format(BAIDU_KEY_URL, URLEncoder.encode(keywordMsg.getKeyWord(), "UTF-8"), (i * 10)));
                target.add(webReq);
            }
        } catch (Exception e) {
            logger.error("根据关键词组生成request对象出错,错误信息:", e);
        }
    }
}
package com.zzsn.webMagic.downloader;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * WebMagic {@link Downloader} that renders pages with headless Chrome through
 * Selenium. A chromedriver binary is required.
 *
 * <p>A new Chrome instance is started for every download and is always shut
 * down afterwards, whether or not the download succeeded.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:37 PM <br>
 */
public class SeleniumDownloader implements Downloader, Closeable {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private int sleepTime = 0;
    private int poolSize = 1;
    /** Driver of the download currently in flight, if any; used by {@link #close()}. */
    private volatile ChromeDriver webDriver;
    Dimension targetSize = new Dimension(600, 600); // window size
    private ProxyProvider proxyProvider;

    /**
     * Creates a downloader that uses the given chromedriver binary.
     *
     * @param chromeDriverPath path stored in the {@code webdriver.chrome.driver} system property
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver",
                chromeDriverPath);
    }

    /**
     * Creates a downloader without setting {@code webdriver.chrome.driver};
     * the property must then be provided by the environment.
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
    }

    /**
     * Sets the provider that supplies the proxy for each download.
     *
     * @param proxyProvider the provider, or {@code null} to browse without a proxy
     */
    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Sets how long to wait after navigation so the page can finish loading.
     *
     * @param sleepTime wait time in milliseconds
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Loads the request URL in a fresh headless Chrome and returns the rendered
     * {@code <html>} element as the page content.
     *
     * @param request the request to download
     * @param task    the owning task, passed to the proxy provider
     * @return the rendered page, or a page flagged as a failed download when
     *         the browser could not be started or the page could not be loaded
     */
    @Override
    public Page download(Request request, Task task) {
        // Defensive: shut down a driver left over from an earlier call.
        quitQuietly(webDriver);
        ChromeDriver driver;
        try {
            driver = new ChromeDriver(buildOptions(task));
        } catch (Exception e) {
            logger.warn("failed to start ChromeDriver", e);
            return failedPage(request);
        }
        webDriver = driver;
        try {
            driver.get(request.getUrl());
            try {
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            WebElement webElement = driver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } catch (RuntimeException e) {
            logger.warn("failed to render " + request.getUrl(), e);
            return failedPage(request);
        } finally {
            // Always release the browser process, including on failure.
            quitQuietly(driver);
            webDriver = null;
        }
    }

    /** Builds the headless Chrome options, including the proxy when a provider is set. */
    private ChromeOptions buildOptions(Task task) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless");
        options.addArguments("--disable-gpu");
        options.addArguments("--no-sandbox");
        options.addArguments("--disable-dev-shm-usage");
        options.addArguments("--start-maximized");
        // Very tall window so that pages which need scrolling are rendered in full.
        options.addArguments("--window-size=1280,4300");
        if (proxyProvider != null) {
            us.codecraft.webmagic.proxy.Proxy p = proxyProvider.getProxy(task);
            if (p == null) {
                throw new IllegalStateException("ProxyProvider returned no proxy");
            }
            String proxyServer = p.getHost() + ":" + p.getPort();
            org.openqa.selenium.Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
            // Use the credentials that belong to this proxy instead of hard-coded ones.
            // NOTE(review): these are SOCKS credentials on an HTTP/SSL proxy - confirm
            // that Chrome actually uses them.
            if (p.getUsername() != null) {
                proxy.setSocksUsername(p.getUsername());
            }
            if (p.getPassword() != null) {
                proxy.setSocksPassword(p.getPassword());
            }
            options.setProxy(proxy);
        }
        return options;
    }

    /** Creates a page marked as a failed download for the given request. */
    private static Page failedPage(Request request) {
        Page page = new Page();
        page.setDownloadSuccess(false);
        page.setRequest(request);
        return page;
    }

    /** Quits the driver if present, logging instead of propagating any failure. */
    private void quitQuietly(WebDriver driver) {
        if (driver == null) {
            return;
        }
        try {
            driver.quit();
        } catch (RuntimeException e) {
            logger.warn("failed to quit ChromeDriver", e);
        }
    }

    /**
     * Records the requested thread count. Each download starts its own browser,
     * so the value is stored but not otherwise used by this class.
     */
    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    /** Shuts down the browser of a download still in flight, if there is one. */
    @Override
    public void close() throws IOException {
        quitQuietly(webDriver);
        webDriver = null;
    }
}
package com.zzsn.webMagic.downloader;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Bounded pool of Selenium {@link WebDriver} instances. Drivers are created on
 * demand, up to the pool capacity, according to a properties file.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:41 PM <br>
 */
class WebDriverPool {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final static int DEFAULT_CAPACITY = 5;
    private final int capacity;
    private final static int STAT_RUNNING = 1;
    private final static int STAT_CLODED = 2;
    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);
    /*
     * fields for configuring the driver
     */
    private WebDriver mDriver = null;
    private boolean mAutoQuitDriver = true;
    private static final String DEFAULT_CONFIG_FILE = "E:/chromdriver/config.ini";
    private static final String DRIVER_FIREFOX = "firefox";
    private static final String DRIVER_CHROME = "chrome";
    private static final String DRIVER_PHANTOMJS = "phantomjs";
    /** Proxy used by the Chrome driver. */
    private static final String CHROME_PROXY_SERVER = "1.83.251.72:40013";
    protected static Properties sConfig;
    protected static DesiredCapabilities sCaps;
    /**
     * Every driver created by this pool.
     */
    private List<WebDriver> webDriverList = Collections
            .synchronizedList(new ArrayList<WebDriver>());
    /**
     * Drivers currently available for use.
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();

    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }

    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Reads the configuration file and creates a new driver, stored in
     * {@code mDriver}. The file is taken from the {@code selenuim_config}
     * system property, falling back to the built-in default path. Its
     * {@code driver} property selects a remote driver (when it is a URL),
     * {@code firefox}, {@code chrome} or {@code phantomjs} (the default).
     * Part of this code comes from GhostDriver:
     * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
     *
     * @author bob.li.0718@gmail.com
     * @throws IOException if the file cannot be read, a required PhantomJS
     *                     property is missing, or the driver name is unknown
     */
    public void configure() throws IOException {
        sConfig = new Properties();
        String configFile = System.getProperty("selenuim_config", DEFAULT_CONFIG_FILE);
        // Close the reader once loaded; it was previously left open.
        try (FileReader reader = new FileReader(configFile)) {
            sConfig.load(reader);
        }
        // Prepare capabilities
        sCaps = new DesiredCapabilities();
        sCaps.setJavascriptEnabled(true);
        sCaps.setCapability("takesScreenshot", false);
        String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
        // Fetch PhantomJS-specific configuration parameters
        if (driver.equals(DRIVER_PHANTOMJS)) {
            // "phantomjs_exec_path"
            if (sConfig.getProperty("phantomjs_exec_path") != null) {
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_exec_path"));
            } else {
                throw new IOException(
                        String.format(
                                "Property '%s' not set!",
                                PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY));
            }
            // "phantomjs_driver_path"
            if (sConfig.getProperty("phantomjs_driver_path") != null) {
                logger.info("Test will use an external GhostDriver");
                sCaps.setCapability(
                        PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY,
                        sConfig.getProperty("phantomjs_driver_path"));
            } else {
                logger.info("Test will use PhantomJS internal GhostDriver");
            }
        }
        // Disable "web-security", enable all possible "ssl-protocols" and
        // "ignore-ssl-errors" for PhantomJSDriver
        ArrayList<String> cliArgsCap = new ArrayList<String>();
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
                cliArgsCap);
        // Control LogLevel for GhostDriver, via CLI arguments
        sCaps.setCapability(
                PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] { "--logLevel="
                        + sConfig.getProperty("phantomjs_driver_loglevel", "INFO") });
        // Start the appropriate driver. Build it in a local first so that a failure
        // never leaves a stale or null instance to be pooled.
        WebDriver created;
        if (isUrl(driver)) {
            sCaps.setBrowserName("phantomjs");
            created = new RemoteWebDriver(new URL(driver), sCaps);
        } else if (driver.equals(DRIVER_FIREFOX)) {
            created = new FirefoxDriver(sCaps);
        } else if (driver.equals(DRIVER_CHROME)) {
            created = newChromeDriver();
        } else if (driver.equals(DRIVER_PHANTOMJS)) {
            created = new PhantomJSDriver(sCaps);
        } else {
            throw new IOException("Unsupported driver '" + driver + "' configured in " + configFile);
        }
        mDriver = created;
    }

    /** Creates a headless Chrome driver routed through the fixed proxy. */
    private static WebDriver newChromeDriver() {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless");
        options.addArguments("--disable-gpu");
        options.addArguments("--no-sandbox");
        options.addArguments("--disable-dev-shm-usage");
        options.addArguments("--start-maximized");
        // Very tall window so that pages which need scrolling are rendered in full.
        options.addArguments("--window-size=1280,4300");
        Proxy proxy = new Proxy().setHttpProxy(CHROME_PROXY_SERVER).setSslProxy(CHROME_PROXY_SERVER);
        options.setProxy(proxy);
        return new ChromeDriver(options);
    }

    /**
     * Checks whether the input is a valid URL.
     *
     * @author bob.li.0718@gmail.com
     * @param urlString urlString
     * @return true means yes, otherwise no.
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }

    /**
     * Borrows a driver: an idle one if available, otherwise a newly created one
     * while the pool is below capacity, otherwise blocks until one is returned.
     *
     * @return a driver that must be handed back with {@link #returnToPool(WebDriver)}
     * @throws InterruptedException  if interrupted while waiting for a driver
     * @throws IllegalStateException if the pool is closed, or no driver exists
     *                               and one could not be created
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver poll = innerQueue.poll();
        if (poll != null) {
            return poll;
        }
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                if (webDriverList.size() < capacity) {
                    // add a new WebDriver instance to the pool
                    try {
                        configure();
                        innerQueue.add(mDriver);
                        webDriverList.add(mDriver);
                    } catch (IOException e) {
                        logger.error("Failed to create a new WebDriver", e);
                        // With no driver in existence, waiting below would block forever.
                        if (webDriverList.isEmpty()) {
                            throw new IllegalStateException("No WebDriver available and none could be created", e);
                        }
                    }
                }
            }
        }
        return innerQueue.take();
    }

    /**
     * Hands a borrowed driver back to the pool.
     *
     * @throws IllegalStateException if the pool is closed
     */
    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }

    /**
     * @throws IllegalStateException if the pool has been closed
     */
    protected void checkRunning() {
        if (stat.get() != STAT_RUNNING) {
            throw new IllegalStateException("Already closed!");
        }
    }

    /**
     * Closes the pool and quits every driver it created. A driver that fails to
     * quit is logged and does not prevent the remaining ones from being shut down.
     *
     * @throws IllegalStateException if the pool is already closed
     */
    public void closeAll() {
        boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED);
        if (!b) {
            throw new IllegalStateException("Already closed!");
        }
        // Iterating a synchronizedList requires holding its lock.
        synchronized (webDriverList) {
            for (WebDriver webDriver : webDriverList) {
                logger.info("Quit webDriver" + webDriver);
                try {
                    webDriver.quit();
                } catch (RuntimeException e) {
                    logger.warn("Failed to quit webDriver" + webDriver, e);
                }
            }
            webDriverList.clear();
        }
        innerQueue.clear();
    }
}
...@@ -23,16 +23,16 @@ THREAD_SIZE=1 ...@@ -23,16 +23,16 @@ THREAD_SIZE=1
#地址 #地址
KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#消费主题 #消费主题
#KAFKA_CONSUMER_TOPIC=keyWordsInfo KAFKA_CONSUMER_TOPIC=keyWordsInfo
#测试主题 #测试主题
KAFKA_CONSUMER_TOPIC=baiduTest #KAFKA_CONSUMER_TOPIC=baiduTest
#消费者 #消费者
#KAFKA_CONSUMER_GROUP_ID=baidu-web-test KAFKA_CONSUMER_GROUP_ID=baidu2
#测试消费者 #测试消费者
KAFKA_CONSUMER_GROUP_ID=baidu-wemagic #KAFKA_CONSUMER_GROUP_ID=baidu-wemagic
#发送消息 #发送消息
KAFKA_test_TOPIC=baidu-bind2-test #KAFKA_test_TOPIC=baidu-bind-test
#KAFKA_test_TOPIC=baidu-new-test KAFKA_test_TOPIC=baidu-new-test
...@@ -81,3 +81,6 @@ whiles=10 ...@@ -81,3 +81,6 @@ whiles=10
pageSize=5 pageSize=5
#平分因子 #平分因子
averger=2000 averger=2000
#
crawler_server=baidu-1
\ No newline at end of file
package com.zzsn; package com.zzsn;
import cn.hutool.core.date.DateUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.DynaminSiteThread; import com.zzsn.crawler.DynaminSiteThread;
import com.zzsn.crawler.SiteThread; import com.zzsn.crawler.SiteThread;
import com.zzsn.crawlerOther.ArticleCrawler; import com.zzsn.crawlerOther.ArticleCrawler;
import com.zzsn.entity.BadSiteMsg;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.generation.FileUtil; import com.zzsn.generation.FileUtil;
...@@ -24,6 +27,7 @@ import org.springframework.boot.builder.SpringApplicationBuilder; ...@@ -24,6 +27,7 @@ import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.ServletComponentScan; import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import org.springframework.context.ConfigurableApplicationContext; import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.kafka.core.KafkaTemplate;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -60,13 +64,27 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -60,13 +64,27 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// } catch (Exception e) { // } catch (Exception e) {
// loadSiteFitler(); // loadSiteFitler();
// } // }
// try {
// loadSiteByGruop();
// } catch (Exception e) {
// loadSiteByGruop();
// }
// try {
// loadSiteMsgLoc();
// } catch (Exception e) {
// loadSiteMsgLoc();
// }
// loadSiteMsgLoc2(); // loadSiteMsgLoc2();
// loadSiteMsgLoc3(); // loadSiteMsgLoc3();
// loadSiteMsgLoc3();
} }
//分区获取kafka数据方法
public void loadSiteMsg(){ public void loadSiteMsg(){
try{ try{
KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class); KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
System.out.println("进入定时获取mq消息"); log.info("进入定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer(); KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer();
// 消费某个主题的某个分区数据 // 消费某个主题的某个分区数据
...@@ -84,10 +102,12 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -84,10 +102,12 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
ConsumerRecords<String, String> records = consumer.poll(300); ConsumerRecords<String, String> records = consumer.poll(300);
if (records != null && records.count() > 0) { if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) { for (ConsumerRecord record : records) {
System.out.println("kafka消息:" + record.value().toString());
// System.out.println("kafka消息:" + record.value().toString());
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple; siteThread.siteMsgTemple = siteMsgTemple;
log.info("信息源栏目:"+siteMsgTemple.getSiteName());
siteThread.crawler(); siteThread.crawler();
} }
} }
...@@ -99,14 +119,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -99,14 +119,19 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println(e.getMessage()); System.out.println(e.getMessage());
System.out.println("程序异常+++++"); System.out.println("程序异常+++++");
try { try {
Thread.sleep(30000); Thread.sleep(3000);
} catch (InterruptedException ex) { } catch (InterruptedException ex) {
ex.printStackTrace(); ex.printStackTrace();
} }
// loadSiteMsg(); // loadSiteMsg();
} }
} }
/**
*
* @param (0:静态爬取 1:动态爬取)
*/
//一个组获取Kafka消息通过过滤获取信息
public void loadSiteFitler(){ public void loadSiteFitler(){
try{ try{
String filepath= Constants.IMGPATH; String filepath= Constants.IMGPATH;
...@@ -114,7 +139,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -114,7 +139,7 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
File f = new File(filepath); File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8"); List<String> allLines = FileUtil.getFileLines(f, "utf-8");
KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class); KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
System.out.println("进入定时获取mq消息"); log.info("进入定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer(); KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC)); consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
...@@ -122,16 +147,17 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -122,16 +147,17 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
try { try {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环 //消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(300); ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync(); consumer.commitSync();
if (records != null && records.count() > 0) { if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) { for (ConsumerRecord record : records) {
System.out.println("kafka消息:" + record.value().toString());
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class);
String infoSourceCode = siteMsgTemple.getInfoSourceCode(); String infoSourceCode = siteMsgTemple.getInfoSourceCode();
log.info("获取数据正常信息源栏目:"+siteMsgTemple.getSiteName());
if(StringUtils.isNotEmpty(infoSourceCode) && allLines.contains(infoSourceCode)){ if(StringUtils.isNotEmpty(infoSourceCode) && allLines.contains(infoSourceCode)){
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple; siteThread.siteMsgTemple = siteMsgTemple;
log.info("500强信息源栏目:"+siteMsgTemple.getSiteName());
siteThread.crawler(); siteThread.crawler();
} }
...@@ -145,6 +171,46 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -145,6 +171,46 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
System.out.println(e.getMessage()); System.out.println(e.getMessage());
System.out.println("程序异常+++++"); System.out.println("程序异常+++++");
try { try {
Thread.sleep(3000);
} catch (InterruptedException ex) {
ex.printStackTrace();
}
// loadSiteMsg();
}
}
//单独组获取所有信息
public void loadSiteByGruop(){
try{
KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
log.info("进入定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
while(true){
try {
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(300);
consumer.commitSync();
if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) {
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread();
siteThread.siteMsgTemple = siteMsgTemple;
log.info("信息源栏目:" + siteMsgTemple.getSiteName());
siteThread.crawler();
}
}
}catch (Exception e){
continue;
}
}
}catch (Exception e){
System.out.println(e.getMessage());
System.out.println("程序异常+++++");
try {
Thread.sleep(30000); Thread.sleep(30000);
} catch (InterruptedException ex) { } catch (InterruptedException ex) {
ex.printStackTrace(); ex.printStackTrace();
...@@ -272,7 +338,79 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple ...@@ -272,7 +338,79 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class); // ArticleCrawler articleCrawler=SpringContextUtil.getBean(ArticleCrawler.class);
// articleCrawler.consumer(); // articleCrawler.consumer();
System.out.println("——————++++++++++++——————==="); System.out.println("——————++++++++++++——————===");
String value="{\"crawlType\":1,\"cron\":\"59 24 1/1 * * ?\",\"dataFormInfo\":\"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\"dataFormat\":0,\"dataPageEnd\":0,\"dataPageStart\":0,\"dataStorageInfo\":\"{\\\"accessMode\\\":\\\"FTP\\\"}\",\"dataStorageMode\":0,\"dataType\":0,\"detailExpressionAuthor\":\"<author><exp>div[class=\\\"c-infos\\\"]>span:nth-child(3)</exp><subtraction>div[class=\\\"ctx-content\\\"]</subtraction></author>\",\"detailExpressionPublishDate\":\"<publish_date><exp>span[class=\\\"publish-date\\\"]</exp></publish_date>\",\"detailExpressionSource\":\"<origin><exp>div[class=\\\"c-infos\\\"]>span:nth-child(2)</exp></origin>\",\"detailExpressionTitle\":\"<title><exp>div[class=\\\"c-title\\\"]>h1</exp></title>\",\"detailExpressionType\":\"3\",\"detailInfo\":\"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\"extractInfo\":\"[{\\\"id\\\":3,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\"hisDateEndTime\":1661875200000,\"hisDateStartTime\":1659283200000,\"id\":\"1560165847270588417\",\"infoBlockPosition\":\"table[class=\\\"table-model\\\"]\",\"infoSourceCode\":\"IN-20220818-0001\",\"infoSourceTypeId\":\"1\",\"informationPublishDate\":\"table[class=\\\"table-model\\\"]>tbody>tr>td:nth-child(9)\",\"informationSource\":\"table[class=\\\"table-model\\\"]>tbody>tr>td:nth-child(7)>a\",\"informationTitle\":\"table[class=\\\"table-model\\\"]>tbody>tr>td:nth-child(4)>a\",\"linkLocation\":\"table[class=\\\"table-model\\\"]>tbody>tr>td:nth-child(4)>a\",\"listExpressionType\":\"3\",\"listUrl\":\"https://data.eastmoney.com/report/industry.jshtml?hyid=1032\",\"pageEnd\":0,\"pageStart\":0,\"parameter\":\"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\"siteName\":\"行业研报\",\"siteUri\":\"https://data.eastmoney.com/report/industry.
jshtml?hyid=1032\",\"status\":\"1\",\"webSiteName\":\"东方财富网\",\"ynDataPageAll\":\"0\",\"ynDownload\":\"0\",\"ynDynamicCrawl\":1,\"ynHisDataAll\":\"1\",\"ynLogin\":0,\"ynPageAll\":\"0\"}"; String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1579006103849472002\",\n" +
" \"infoSourceCode\": \"IN-20221009-0091\",\n" +
" \"webSiteName\": \"中国废钢铁应用协会\",\n" +
" \"siteName\": \"中国废钢铁应用协会-新闻中心-热点新闻\",\n" +
" \"siteUri\": \"http://www.camu.org.cn/feigangxiehui/list.html?channelName=%E7%83%AD%E7%82%B9%E6%96%B0%E9%97%BB&code=101001\",\n" +
" \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": \"2\",\n" +
" \"language\": \"zh-cn\",\n" +
" \"checkedList\": \"1\",\n" +
" \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" +
" \"status\": \"1\",\n" +
" \"listUrl\": \"http://www.camu.org.cn/feigangxiehui/list.html?channelName=%E7%83%AD%E7%82%B9%E6%96%B0%E9%97%BB&code=101001\",\n" +
" \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": \"http://www\\\\.camu\\\\.org\\\\.cn/feigangxiehui/article_detail\\\\.html\\\\?id=[\\\\d]{1,}&code=101001&type=0\",\n" +
" \"informationTitle\": \"\",\n" +
" \"informationPublishDate\": \"\",\n" +
" \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[id=\\\"listTable\\\"]>li\",\n" +
" \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":270,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": 3,\n" +
" \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"0\",\n" +
" \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>*.h1[class=\\\"article-tit\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>*.div[class=\\\"article-info\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>*.article[class=\\\"content\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":270,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" +
" \"formTitle\": null,\n" +
" \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":270,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" +
" \"dataPageEnd\": 0,\n" +
" \"ynDataPageAll\": \"0\",\n" +
" \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 0,\n" +
" \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" +
" \"link\": null,\n" +
" \"account\": null,\n" +
" \"password\": null,\n" +
" \"userAgent\": null,\n" +
" \"referer\": null,\n" +
" \"cookies\": null,\n" +
" \"headers\": null,\n" +
" \"otherInfo\": null,\n" +
" \"crawlType\": 1,\n" +
" \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"11 22 0/7 * * ?\",\n" +
" \"ynSnapshot\": null\n" +
"}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
DynaminSiteThread siteThread = new DynaminSiteThread(); DynaminSiteThread siteThread = new DynaminSiteThread();
// siteMsgTemple.setYnDynamicCrawl(0); // siteMsgTemple.setYnDynamicCrawl(0);
......
...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController { ...@@ -35,7 +35,7 @@ public class CrawlerCommVerifyController extends BaseController {
@ResponseBody @ResponseBody
public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){ public String VerifyDetailMsg(@RequestBody SiteMsgTemple siteMsgTemple, HttpServletResponse response){
SiteInfoVerify siteInfoVerify=new SiteInfoVerify(); SiteInfoVerify siteInfoVerify=new SiteInfoVerify();
siteMsgTemple.setVerifyType("1"); // siteMsgTemple.setVerifyType("1");
VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple); VerifyResult verifyResult = siteInfoVerify.crawlerDetialMsg(siteMsgTemple);
return MsgUtil.outSiteJSON(verifyResult); return MsgUtil.outSiteJSON(verifyResult);
} }
......
package com.zzsn.crawler; package com.zzsn.crawler;
import cn.hutool.core.date.DateTime; import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.paser.*; import com.zzsn.crawler.paser.*;
import com.zzsn.crawler.uriparser.HisURIConfig; import com.zzsn.crawler.uriparser.HisURIConfig;
import com.zzsn.crawler.uriparser.HisURIParser; import com.zzsn.crawler.uriparser.HisURIParser;
import com.zzsn.download.PageDownloader;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -27,6 +29,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -27,6 +29,7 @@ public class DynaminSiteThread implements Runnable{
public PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple(); public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override @Override
...@@ -37,6 +40,7 @@ public class DynaminSiteThread implements Runnable{ ...@@ -37,6 +40,7 @@ public class DynaminSiteThread implements Runnable{
// @Async("asyncexecutorService") // @Async("asyncexecutorService")
public void crawler(){ public void crawler(){
sentBadSiteMsg(siteMsgTemple,Constants.MODEL_SCORE_URL,Constants.KAFKA_CONSUMER_PARTITION);
//获取栏目链接以及翻页的链接 //获取栏目链接以及翻页的链接
List<String> urlList=getPageListUrl(siteMsgTemple); List<String> urlList=getPageListUrl(siteMsgTemple);
log.info("获取urlList: "+urlList.size()); log.info("获取urlList: "+urlList.size());
...@@ -58,21 +62,19 @@ public class DynaminSiteThread implements Runnable{ ...@@ -58,21 +62,19 @@ public class DynaminSiteThread implements Runnable{
// String charset = paserSiteDownload.getCharSet(urlList.get(0)); // String charset = paserSiteDownload.getCharSet(urlList.get(0));
// String charset = paserSiteDownload.getCharSet(urlList.get(0)); // String charset = paserSiteDownload.getCharSet(urlList.get(0));
String charset = ""; String charset = "";
try { // try {
charset = paserSiteDownload.locateCharSet(urlList.get(0)); // charset = paserSiteDownload.locateCharSet(urlList.get(0));
} catch (Exception e) { // } catch (Exception e) {
try { // try {
charset = paserSiteDownload.getCharSet(urlList.get(0)); // charset = paserSiteDownload.getCharSet(urlList.get(0));
} catch (IOException ex) { // } catch (IOException ex) {
// //
} // }
} // }
//获取列表url等信息通过匹配url过滤 //获取列表url等信息通过匹配url过滤
List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>(); List<CatchWebByMetaSearch> metaSearchList=new ArrayList<>();
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList=new ArrayList<>();
log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now()); log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());
String infoSourceId=siteMsgTemple.getId();//获取信息源id String infoSourceId=siteMsgTemple.getId();//获取信息源id
//默认表达式类型 //默认表达式类型
siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType()); siteMsgTemple.setListExpressionType(siteMsgTemple.getListExpressionType()==null?"0":siteMsgTemple.getListExpressionType());
...@@ -104,7 +106,6 @@ public class DynaminSiteThread implements Runnable{ ...@@ -104,7 +106,6 @@ public class DynaminSiteThread implements Runnable{
WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss(); WebContentPaserByCss webContentPaserByCss=new WebContentPaserByCss();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple); docInfoList = webContentPaserByCss.catchWebNewsByCSS(metaSearchList, siteMsgTemple);
}else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析 }else if(siteMsgTemple.getDetailExpressionType().equals("2")){//xpath解析
WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath(); WebContentPaserByXpath webContentPaserByXpath=new WebContentPaserByXpath();
//获取资讯详情信息 根据标签解析 //获取资讯详情信息 根据标签解析
...@@ -144,6 +145,25 @@ public class DynaminSiteThread implements Runnable{ ...@@ -144,6 +145,25 @@ public class DynaminSiteThread implements Runnable{
} }
} }
/**
 * Publishes a {@code BadSiteMsg} record for the given site to the
 * {@code "crawler_consumer"} Kafka topic.
 * <p>
 * The record carries the site's id and info-source code, the current time as the
 * consumer date, and the supplied crawler type and partition. It is serialized to JSON
 * with a fresh {@code ObjectMapper} before sending.
 * <p>
 * Best effort: any exception raised while building, serializing or sending the record
 * is swallowed without being reported.
 *
 * @param siteMsgTemple site whose id and info-source code are copied into the record
 * @param crawlerType   value stored as the record's crawler type
 * @param partition     value stored as the record's partition
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String crawlerType, String partition) {
    try {
        BadSiteMsg notice = new BadSiteMsg();
        notice.setId(siteMsgTemple.getId());
        notice.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        notice.setConsumerDate(new Date());
        notice.setCrawlerType(crawlerType);
        notice.setPartition(partition);

        String payload = new ObjectMapper().writeValueAsString(notice);
        kafkaTemplate.send("crawler_consumer", payload);
    } catch (Exception ignored) {
        // Intentionally ignored: a failure here does not propagate to the caller.
    }
}
public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){ public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem(); ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem();
clbAnsProcessitem.setSid(docInfo.getSid()+""); clbAnsProcessitem.setSid(docInfo.getSid()+"");
......
...@@ -45,7 +45,7 @@ public class WebContentPaserByCss { ...@@ -45,7 +45,7 @@ public class WebContentPaserByCss {
KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
public static PageBuilderParser pageBuilderParser = new PageBuilderParser(); public static PageBuilderParser pageBuilderParser = new PageBuilderParser();
RequestUtil requestUtil=RequestUtil.getInstance();
// 验证站点新闻列表URL // 验证站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByCSSVerify( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByCSSVerify(
...@@ -53,82 +53,78 @@ public class WebContentPaserByCss { ...@@ -53,82 +53,78 @@ public class WebContentPaserByCss {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
URL url = new URL(urlList.get(i)); // URL url = new URL(urlList.get(i));
URI uri = null; // URI uri = null;
String uri_code = ""; String uri_code = urlList.get(i);
try { try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null); // uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString()) // uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") // .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); // .replaceAll("%20", "+");
Thread.sleep(2000L); Thread.sleep(2000L);
String body = ""; String body = "";
if(siteMsgTemple.getHeaders()!=null){//添加header if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,null,true,false, siteMsgTemple.getHeaders());
}else { }else {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求 try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = requestUtil.httpGetRequest(uri_code);
} catch (Exception e) { } catch (Exception e) {
log.info(e.getMessage()); try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
}
}
if (StringUtils.isEmpty(body)) {//为空时调用
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
} }
} }
if (StringUtils.isEmpty(body)) {//为空时调用 if (StringUtils.isEmpty(body)) {//为空时调用
try { try {
if (StringUtils.isEmpty(body)){ if (StringUtils.isEmpty(body)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); body = SeleniumTime.getScopehtml(uri_code);
body = seleniumVerify.getScopehtml(uri_code); }
// PageDownload pageDownload=new PageDownload(); if (StringUtils.isEmpty(body)){
// body = pageDownload.downloadByWebClient(uri_code, charset); body = SeleniumTime.getScopehtml(uri_code);
} }
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("静态请求失败:"+uri_code);
} }
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// body = SeleniumTime.getVerifyScopehtml(uri_code);
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
SeleniumVerify seleniumVerify=new SeleniumVerify();
body = seleniumVerify.getScopehtml(uri_code);
} }
}
// TimeUnit.SECONDS.sleep(2);
}
// if (StringUtils.isEmpty(body)) {
// sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
// }
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
//抽取资讯url
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code); log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code);
SeleniumVerify seleniumVerify=new SeleniumVerify(); body = SeleniumTime.getScopehtml(uri_code);
body = seleniumVerify.getScopehtml(uri_code);
doc = Jsoup.parse(body); doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 }else{
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1"); log.info("请求内容为空:"+uri_code);
body = SeleniumTime.getScopehtml(uri_code);
if(StringUtils.isNotEmpty(body)) {
Document doc2 = Jsoup.parse(body);
List<CatchWebByMetaSearch> catchWebByMetaSearches2 = parserCrawlerSiteListByCss(siteMsgTemple, doc2);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches2);
} }
} }
// //
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
// return catchWebByMetaSearchList;
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
log.info("对应请求不是url"); log.info("对应请求不正确:"+urlList.get(i));
} }
} }
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
...@@ -139,68 +135,68 @@ public class WebContentPaserByCss { ...@@ -139,68 +135,68 @@ public class WebContentPaserByCss {
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByCSS( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByCSS(
List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) { List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
URL url = new URL(urlList.get(i)); URL url = new URL(urlList.get(i));
URI uri = null; URI uri = null;
String uri_code = ""; String uri_code = urlList.get(i);
try { try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null); // uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString()) // uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") // .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); // .replaceAll("%20", "+");
Thread.sleep(2000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(siteMsgTemple.getHeaders()!=null){//添加header if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
//链接进来先执行静态请求,内容为空使用模拟浏览器
try {//正常请求 try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){ body = requestUtil.httpGetRequest(uri_code);
log.info(e.getMessage());
}
if (StringUtils.isEmpty(body)) {//为空时调用
try {
if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code);
// PageDownload pageDownload=new PageDownload();
// body = pageDownload.downloadByWebClient(uri_code, charset);
}
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("静态请求失败:"+uri_code);
} }
if (StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, true);
} catch (Exception e3) {
log.info(e3.getMessage());
body="";
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用 }
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){ if (StringUtils.isEmpty(body)) {//当body为空和动态时调用
// body = SeleniumTime.getVerifyScopehtml(uri_code); try {
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} catch (Exception e) {
log.info("模拟浏览器请求异常:"+uri_code);
} }
} }
// TimeUnit.SECONDS.sleep(2);
} }
// if (StringUtils.isEmpty(body)) { // Document document=null;
// sentBadSiteMsg(siteMsgTemple, "请求异常", "1"); // try {
// document = Jsoup.parse(new URL(uri_code), 300000);
// }catch (Exception e){
// log.info("请求失败");
// } // }
log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 if (catchWebByMetaSearches.size() < 1) {//提取不到信息时再次调用
// if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code); log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code);
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
if(StringUtils.isNotEmpty(body)) {
doc = Jsoup.parse(body); doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
} }
} }
// //
} catch (Exception e) { } catch (Exception e) {
...@@ -216,6 +212,16 @@ public class WebContentPaserByCss { ...@@ -216,6 +212,16 @@ public class WebContentPaserByCss {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
/**
 * Fetches the page at the given URL and parses it into a jsoup Document.
 *
 * The request uses a 300000 ms (5 minute) timeout. Any failure (malformed
 * URL, network error, timeout) is logged and reported to the caller as a
 * null return value rather than an exception.
 *
 * @param url absolute URL of the page to download
 * @return the parsed document, or null when the page could not be fetched or parsed
 */
public Document getHtmlContent(String url){
    Document doc=null;
    try {
        // Assign to the variable that is actually returned; previously the
        // result went into an unused local, so this method always returned null.
        doc = Jsoup.parse(new URL(url), 300000);
    }catch (Exception e){
        log.info("请求失败:"+e.getMessage());
    }
    return doc;
}
/** /**
* *
* @param siteMsgTemple * @param siteMsgTemple
...@@ -225,7 +231,7 @@ public class WebContentPaserByCss { ...@@ -225,7 +231,7 @@ public class WebContentPaserByCss {
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
...@@ -354,8 +360,8 @@ public class WebContentPaserByCss { ...@@ -354,8 +360,8 @@ public class WebContentPaserByCss {
try { try {
int count = 0; int count = 0;
int k = 0; int k = 0;
int size=catchWebList.size()>50?50:catchWebList.size(); // int size=catchWebList.size()>60?60:catchWebList.size();
for (int i = 0; i < size; i++) { for (int i = 0; i < catchWebList.size(); i++) {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) { if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) {
...@@ -375,31 +381,42 @@ public class WebContentPaserByCss { ...@@ -375,31 +381,42 @@ public class WebContentPaserByCss {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// PageDownload pageDownload=new PageDownload();
// content = pageDownload.downloadByWebClient(cwbm.getSourceaddress(), "utf-8");
if (StringUtils.isEmpty(content)){ if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
// if (StringUtils.isEmpty(content)){
// JedisUtil.delString(Constants.SELENIUM_DRIVER_CACHE);
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// }
}else{ }else{
// try {
// content = paserSiteDownload.getContent(cwbm);
// }catch (Exception e){
// log.info(e.getMessage());
// }
if (StringUtils.isEmpty(content)) {
try { try {
content = paserSiteDownload.getContent(cwbm); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
} }
if (StringUtils.isEmpty(content)) { }
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, true);
} catch (Exception e) { }catch (Exception e){
log.info("静态请求失败:"+cwbm.getSourceaddress()); log.info(e.getMessage());
} }
} }
if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
} }
}catch (Exception e) { }catch (Exception e) {
sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0"); log.info("网站请求异常:"+cwbm.getSourceaddress());
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
} }
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
...@@ -423,14 +440,13 @@ public class WebContentPaserByCss { ...@@ -423,14 +440,13 @@ public class WebContentPaserByCss {
if(StringUtils.isNotEmpty(content)) { if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { }else {
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
continue;
} }
}catch (Exception e){ }catch (Exception e){
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
continue;
} }
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try { try {
count++; count++;
...@@ -479,7 +495,7 @@ public class WebContentPaserByCss { ...@@ -479,7 +495,7 @@ public class WebContentPaserByCss {
// }catch (Exception e){ // }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION); // log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// } // }
log.info("采集到的信息"+docjson); log.info("采集到信息的标题:"+processitem.getTitle());
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo);//用于统计新采集信息的数量 docInfoList.add(docInfo);//用于统计新采集信息的数量
JedisUtil.sadd(rediskey, cwbm.getSourceaddress()); JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
...@@ -514,7 +530,11 @@ public class WebContentPaserByCss { ...@@ -514,7 +530,11 @@ public class WebContentPaserByCss {
String content=""; String content="";
try { try {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问 //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress()); try {//正常请求
content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) {
log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
try {//正常请求 try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
...@@ -523,15 +543,13 @@ public class WebContentPaserByCss { ...@@ -523,15 +543,13 @@ public class WebContentPaserByCss {
} }
} }
if(StringUtils.isEmpty(content) ) { if(StringUtils.isEmpty(content) ) {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress()); // content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{
content = paserSiteDownload.getContent(cwbm);
} }
}catch (Exception e) { }catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
...@@ -671,7 +689,6 @@ public class WebContentPaserByCss { ...@@ -671,7 +689,6 @@ public class WebContentPaserByCss {
//资讯时间 //资讯时间
if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) { if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) {
publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate()); publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate());
if(StringUtils.isNotEmpty(publishDate)) { if(StringUtils.isNotEmpty(publishDate)) {
docInfo.setOlPpublishDate(publishDate); docInfo.setOlPpublishDate(publishDate);
docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate)); docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate));
......
...@@ -43,7 +43,7 @@ public class WebContentPaserByIntellige { ...@@ -43,7 +43,7 @@ public class WebContentPaserByIntellige {
KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
public static PageBuilderParser pageBuilderParser = new PageBuilderParser(); public static PageBuilderParser pageBuilderParser = new PageBuilderParser();
RequestUtil requestUtil=RequestUtil.getInstance();
// 验证站点新闻列表URL // 验证站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgVerify( public List<CatchWebByMetaSearch> catchWebOfStaticmsgVerify(
...@@ -137,50 +137,44 @@ public class WebContentPaserByIntellige { ...@@ -137,50 +137,44 @@ public class WebContentPaserByIntellige {
try { try {
URL url = new URL(urlList.get(i)); URL url = new URL(urlList.get(i));
URI uri = null; URI uri = null;
String uri_code = ""; String uri_code = urlList.get(i);
try { try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null); // uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString()) // uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") // .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); // .replaceAll("%20", "+");
Thread.sleep(2000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(siteMsgTemple.getHeaders()!=null){//添加header if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
body = RequestUtil.httpGetRequest(uri_code); //链接进来先执行静态请求,内容为空使用模拟浏览器
if(StringUtils.isEmpty(body)) { // try {//正常请求
// body = RequestUtil.httpGetRequest(uri_code);
// } catch (Exception e) {
// log.info("静态请求失败:"+uri_code);
// }
if (StringUtils.isEmpty(body)) {
try {//正常请求 try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = pageDownload.downloadWithStr(uri_code, charset, false, true);
} catch (Exception e) { } catch (Exception e3) {
log.info(e.getMessage()); log.info(e3.getMessage());
body="";
} }
} }
if (StringUtils.isEmpty(body)) {//为空时调用 // if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
if (StringUtils.isEmpty(body)) {//当body为空和动态时调用
try { try {
if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
// PageDownload pageDownload=new PageDownload();
// body = pageDownload.downloadByWebClient(uri_code, charset);
}
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("模拟浏览器请求异常:"+uri_code);
}
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用 if (StringUtils.isEmpty(body)){
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// body = SeleniumTime.getVerifyScopehtml(uri_code);
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} }
} }
// TimeUnit.SECONDS.sleep(2);
} }
// if (StringUtils.isEmpty(body)) { log.info("body的长度:"+body.length());
// sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
// }
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body); Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
...@@ -189,12 +183,11 @@ public class WebContentPaserByIntellige { ...@@ -189,12 +183,11 @@ public class WebContentPaserByIntellige {
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用 if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code); log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code);
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
if(StringUtils.isNotEmpty(body)) {
doc = Jsoup.parse(body); doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} }
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
} }
} }
// //
...@@ -221,7 +214,7 @@ public class WebContentPaserByIntellige { ...@@ -221,7 +214,7 @@ public class WebContentPaserByIntellige {
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
...@@ -368,27 +361,33 @@ public class WebContentPaserByIntellige { ...@@ -368,27 +361,33 @@ public class WebContentPaserByIntellige {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
PageDownload pageDownload=new PageDownload(); if (StringUtils.isEmpty(content)){
content = pageDownload.downloadByWebClient(cwbm.getSourceaddress(), "utf-8"); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
if (StringUtils.isEmpty(content)){ if (StringUtils.isEmpty(content)){
JedisUtil.delString(Constants.SELENIUM_DRIVER_CACHE);
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
}else{ }else{
if (StringUtils.isEmpty(content)) {
try { try {
content = paserSiteDownload.getContent(cwbm); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
} }
if (StringUtils.isEmpty(content)) { }
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, true);
} catch (Exception e) { }catch (Exception e){
log.info("静态请求失败:"+cwbm.getSourceaddress()); log.info(e.getMessage());
} }
} }
if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
} }
}catch (Exception e) { }catch (Exception e) {
...@@ -413,8 +412,9 @@ public class WebContentPaserByIntellige { ...@@ -413,8 +412,9 @@ public class WebContentPaserByIntellige {
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
// 封装解析的docinfo对象 // 封装解析的docinfo对象
try { try {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
if(StringUtils.isNotEmpty(content)) { //调用解析接口 if(StringUtils.isNotEmpty(content)) { //调用解析接口
String url="http://47.118.79.75:5000/extract-article"; String url=":http://114.116.112.175:5000/extract-article";
Map<String, Object> params=new HashMap<>(); Map<String, Object> params=new HashMap<>();
params.put("lang_code",siteMsgTemple.getLanguage()==null?"":siteMsgTemple.getLanguage()); params.put("lang_code",siteMsgTemple.getLanguage()==null?"":siteMsgTemple.getLanguage());
params.put("article_html",content==null?"":content); params.put("article_html",content==null?"":content);
...@@ -427,15 +427,18 @@ public class WebContentPaserByIntellige { ...@@ -427,15 +427,18 @@ public class WebContentPaserByIntellige {
String title = JsonPath.read(docBody, "$.title"); String title = JsonPath.read(docBody, "$.title");
String source = JsonPath.read(docBody, "$.source"); String source = JsonPath.read(docBody, "$.source");
String meta_description = JsonPath.read(docBody, "$.meta_description"); String meta_description = JsonPath.read(docBody, "$.meta_description");
// String url = JsonPath.read(docBody, "$.url"); if (StringUtils.isEmpty(docInfo.getTitle())) {
// docInfo.setSummary(meta_description); docInfo.setTitle(title);
// docInfo.setOrigin(source); }
if (StringUtils.isEmpty(docInfo.getContentNoTag())) {
docInfo.setContentNoTag(cleaned_text); docInfo.setContentNoTag(cleaned_text);
docInfo.setPublishDate(publish_date);
docInfo.setContentWithTag(text); docInfo.setContentWithTag(text);
docInfo.setTitle(title);
} }
// docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); if (StringUtils.isEmpty(docInfo.getPublishDate())) {
docInfo.setPublishDate(publish_date);
}
}
}else { }else {
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1"); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
...@@ -493,7 +496,7 @@ public class WebContentPaserByIntellige { ...@@ -493,7 +496,7 @@ public class WebContentPaserByIntellige {
// }catch (Exception e){ // }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION); // log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// } // }
log.info("采集到的信息"+docjson); log.info("采集到信息的标题:"+processitem.getTitle());
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo);//用于统计新采集信息的数量 docInfoList.add(docInfo);//用于统计新采集信息的数量
JedisUtil.sadd(rediskey, cwbm.getSourceaddress()); JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
...@@ -528,7 +531,11 @@ public class WebContentPaserByIntellige { ...@@ -528,7 +531,11 @@ public class WebContentPaserByIntellige {
String content=""; String content="";
try { try {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问 //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress()); try {//正常请求
content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) {
log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
try {//正常请求 try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
...@@ -538,14 +545,9 @@ public class WebContentPaserByIntellige { ...@@ -538,14 +545,9 @@ public class WebContentPaserByIntellige {
} }
if(StringUtils.isEmpty(content) ) { if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{
content = paserSiteDownload.getContent(cwbm);
} }
}catch (Exception e) { }catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
...@@ -572,6 +574,8 @@ public class WebContentPaserByIntellige { ...@@ -572,6 +574,8 @@ public class WebContentPaserByIntellige {
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
// 封装解析的docinfo对象 // 封装解析的docinfo对象
try { try {
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
String url="http://47.118.79.75:5000/extract-article"; String url="http://47.118.79.75:5000/extract-article";
Map<String, Object> params=new HashMap<>(); Map<String, Object> params=new HashMap<>();
params.put("lang_code",siteMsgTemple.getLanguage()==null?"":siteMsgTemple.getLanguage()); params.put("lang_code",siteMsgTemple.getLanguage()==null?"":siteMsgTemple.getLanguage());
...@@ -588,12 +592,18 @@ public class WebContentPaserByIntellige { ...@@ -588,12 +592,18 @@ public class WebContentPaserByIntellige {
// String url = JsonPath.read(docBody, "$.url"); // String url = JsonPath.read(docBody, "$.url");
// docInfo.setSummary(meta_description); // docInfo.setSummary(meta_description);
// docInfo.setOrigin(source); // docInfo.setOrigin(source);
if (StringUtils.isEmpty(docInfo.getTitle())) {
docInfo.setTitle(title);
}
if (StringUtils.isEmpty(docInfo.getContentNoTag())) {
docInfo.setContentNoTag(cleaned_text); docInfo.setContentNoTag(cleaned_text);
docInfo.setPublishDate(publish_date);
docInfo.setContentWithTag(text); docInfo.setContentWithTag(text);
docInfo.setTitle(title); }
if (StringUtils.isEmpty(docInfo.getPublishDate())) {
docInfo.setPublishDate(publish_date);
}
// docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
}catch (Exception e){ }catch (Exception e){
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
// continue; // continue;
......
...@@ -594,7 +594,7 @@ public class WebContentPaserByJsonXpath { ...@@ -594,7 +594,7 @@ public class WebContentPaserByJsonXpath {
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
......
...@@ -40,6 +40,7 @@ public class WebContentPaserByRegular { ...@@ -40,6 +40,7 @@ public class WebContentPaserByRegular {
public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
RequestUtil requestUtil=RequestUtil.getInstance();
//通过注解引入配置 //通过注解引入配置
@Resource(name = "defaultThreadPool") @Resource(name = "defaultThreadPool")
private ThreadPoolTaskExecutor executor; private ThreadPoolTaskExecutor executor;
...@@ -63,18 +64,25 @@ public class WebContentPaserByRegular { ...@@ -63,18 +64,25 @@ public class WebContentPaserByRegular {
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) { if(StringUtils.isEmpty(body)) {
try {//正常请求 try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = requestUtil.httpGetRequest(uri_code);
} catch (Exception e) { } catch (Exception e) {
log.info(e.getMessage()); // log.info(e.getMessage());
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
}
} }
} }
if (StringUtils.isEmpty(body)){ if (StringUtils.isEmpty(body)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); body = SeleniumTime.getScopehtml(uri_code);
body = seleniumVerify.getScopehtml(uri_code); if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code);
}
} }
} }
...@@ -88,8 +96,7 @@ public class WebContentPaserByRegular { ...@@ -88,8 +96,7 @@ public class WebContentPaserByRegular {
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) { if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
SeleniumVerify seleniumVerify=new SeleniumVerify(); body = SeleniumTime.getScopehtml(uri_code);
body = seleniumVerify.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){ if(catchWebByMetaSearches.size()<1){
...@@ -119,47 +126,53 @@ public class WebContentPaserByRegular { ...@@ -119,47 +126,53 @@ public class WebContentPaserByRegular {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
URL url = new URL(urlList.get(i)); // URL url = new URL(urlList.get(i));
URI uri = null; // URI uri = null;
String uri_code = urlList.get(i); String uri_code = urlList.get(i);
try { try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null); // uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString()) // uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") // .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); // .replaceAll("%20", "+");
// Thread.sleep(1000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
try {//先使用静态网络请求获取列表内容 //链接进来先执行静态请求,内容为空使用模拟浏览器
body = pageDownload.downloadWithStr(uri_code, charset, false, false); try {//正常请求
}catch (Exception e){ body = requestUtil.httpGetRequest(uri_code);
log.info(e.getMessage()); } catch (Exception e) {
body = pageDownload.downloadWithStr(uri_code, charset, false, false); log.info("静态请求失败:"+uri_code);
} }
if (StringUtils.isEmpty(body)){ if (StringUtils.isEmpty(body)) {
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, true);
} catch (Exception e3) {
log.info(e3.getMessage());
body="";
}
}
// if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
if (StringUtils.isEmpty(body)) {//当body为空和动态时调用
try {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} catch (Exception e) {
log.info("模拟浏览器请求异常:"+uri_code);
}
} }
} }
// if(StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")){
// String imagUrl="";
// WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
// webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
// }
//抽取资讯url //抽取资讯url
log.info("body的长度:"+body.length()); log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) { if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
if(StringUtils.isNotEmpty(body)) {
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
} }
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个"); log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个");
} }
...@@ -185,7 +198,7 @@ public class WebContentPaserByRegular { ...@@ -185,7 +198,7 @@ public class WebContentPaserByRegular {
*/ */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
...@@ -317,8 +330,8 @@ public class WebContentPaserByRegular { ...@@ -317,8 +330,8 @@ public class WebContentPaserByRegular {
List<DocInfo> docInfoList = new ArrayList<>(); List<DocInfo> docInfoList = new ArrayList<>();
int count = 0; int count = 0;
int k = 0; int k = 0;
int size=catchWebList.size()>50?50:catchWebList.size(); // int size=catchWebList.size()>60?60:catchWebList.size();
for (int i = 0; i < size; i++) { for (int i = 0; i < catchWebList.size(); i++) {
long starttime = System.currentTimeMillis(); long starttime = System.currentTimeMillis();
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
...@@ -339,26 +352,41 @@ public class WebContentPaserByRegular { ...@@ -339,26 +352,41 @@ public class WebContentPaserByRegular {
// 请求下载内容 // 请求下载内容
String content = ""; String content = "";
try { try {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// if (StringUtils.isEmpty(content)){ if (StringUtils.isEmpty(content)){
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); JedisUtil.delString(Constants.SELENIUM_DRIVER_CACHE);
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
}else{
// try {
// content = paserSiteDownload.getContent(cwbm);
// }catch (Exception e){
// log.info(e.getMessage());
// } // }
} else { if (StringUtils.isEmpty(content)) {
try { try {
content =HttpgetUtil.getHtml(cwbm.getSourceaddress()); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false); }catch (Exception e){
} catch (Exception e) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
log.info(e.getMessage()); log.info(e.getMessage());
// content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null); }
}
if (StringUtils.isEmpty(content)) {
try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, true);
}catch (Exception e){
log.info(e.getMessage());
}
}
if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类 //超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
} catch (Exception e) { } catch (Exception e) {
continue; continue;
} }
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
continue; continue;
} }
...@@ -444,6 +472,7 @@ public class WebContentPaserByRegular { ...@@ -444,6 +472,7 @@ public class WebContentPaserByRegular {
// processitem.setScreenShotImg(imagUrl); // processitem.setScreenShotImg(imagUrl);
// } // }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
log.info("采集到信息的标题:"+processitem.getTitle());
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress()); JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
count++; count++;
...@@ -477,7 +506,11 @@ public class WebContentPaserByRegular { ...@@ -477,7 +506,11 @@ public class WebContentPaserByRegular {
String content=""; String content="";
try { try {
//首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问 //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress()); try {//正常请求
content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) {
log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
try {//正常请求 try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
...@@ -487,15 +520,11 @@ public class WebContentPaserByRegular { ...@@ -487,15 +520,11 @@ public class WebContentPaserByRegular {
} }
if(StringUtils.isEmpty(content) ) { if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
SeleniumVerify seleniumVerify=new SeleniumVerify(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{
content = paserSiteDownload.getContent(cwbm);
} }
}catch (Exception e) { }catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
......
...@@ -68,7 +68,7 @@ public class WebContentPaserByXpath { ...@@ -68,7 +68,7 @@ public class WebContentPaserByXpath {
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
public static PageBuilderParser builderParser=new PageBuilderParser(); public static PageBuilderParser builderParser=new PageBuilderParser();
public SeleniumTime seleniumTime; public SeleniumTime seleniumTime;
RequestUtil requestUtil=RequestUtil.getInstance();
// 验证站点新闻列表URL // 验证站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByXapthVerify( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByXapthVerify(
...@@ -90,41 +90,41 @@ public class WebContentPaserByXpath { ...@@ -90,41 +90,41 @@ public class WebContentPaserByXpath {
if(siteMsgTemple.getHeaders()!=null){ if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
body = RequestUtil.httpGetRequest(uri_code);
if(StringUtils.isEmpty(body)) {
try {//正常请求 try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = requestUtil.httpGetRequest(uri_code);
} catch (Exception e) { } catch (Exception e) {
log.info(e.getMessage()); try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
} }
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body)) {//为空时调用
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, false, false); body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) { } catch (Exception e2) {
try { log.info(e2.getMessage());
body = paserSiteDownload.getHtml(uri_code, charset); body="";
} catch (Exception e) {
log.info("静态请求失败:"+uri_code);
} }
} }
if (StringUtils.isEmpty(body)) {//为空时调用
try {
if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code);
} }
if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) { if (StringUtils.isEmpty(body)){
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){ body = SeleniumTime.getScopehtml(uri_code);
body = SeleniumTime.getVerifyScopehtml(uri_code); }
}else { } catch (Exception e) {
SeleniumVerify seleniumVerify=new SeleniumVerify(); log.info("静态请求失败:"+uri_code);
body = seleniumVerify.getScopehtml(uri_code);
} }
} }
} }
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) { if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
SeleniumVerify seleniumVerify=new SeleniumVerify(); body = SeleniumTime.getScopehtml(uri_code);
body = seleniumVerify.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
} }
if(catchWebByMetaSearches.size()<1){ if(catchWebByMetaSearches.size()<1){
...@@ -158,69 +158,60 @@ public class WebContentPaserByXpath { ...@@ -158,69 +158,60 @@ public class WebContentPaserByXpath {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
URL url = new URL(urlList.get(i)); // URL url = new URL(urlList.get(i));
URI uri = null; // URI uri = null;
String uri_code = ""; String uri_code = urlList.get(i);
try { try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null); // uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString()) // uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") // .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); // .replaceAll("%20", "+");
Thread.sleep(2000L);
String body = ""; String body = "";
if(siteMsgTemple.getHeaders()!=null){ if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
try { //链接进来先执行静态请求,内容为空使用模拟浏览器
body = pageDownload.downloadWithStr(uri_code, charset, true, false); // try {//正常请求
}catch (Exception e){ // body = RequestUtil.httpGetRequest(uri_code);
log.info(e.getMessage()); // } catch (Exception e) {
} // log.info("静态请求失败:"+uri_code);
if (StringUtils.isEmpty(body)) { // }
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body)) {
try { try {//正常请求
body = paserSiteDownload.getHtml(uri_code, charset); body = pageDownload.downloadWithStr(uri_code, charset, false, true);
} catch (Exception e) { } catch (Exception e3) {
log.info("静态请求失败:"+uri_code); log.info(e3.getMessage());
} body="";
} }
} }
if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) { // if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){ if (StringUtils.isEmpty(body)) {//当body为空和动态时调用
body = SeleniumTime.getVerifyScopehtml(uri_code); try {
}else {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
} catch (Exception e) {
log.info("模拟浏览器请求异常:"+uri_code);
} }
} }
} }
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) { if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
if(StringUtils.isNotEmpty(body)) {
catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
} }
}
if(catchWebByMetaSearches.size()<1){ if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue; continue;
} }
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
if(seleniumTime!=null) {
seleniumTime.close();
}
// return catchWebByMetaSearchList;
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
log.info("对应请求不是url"+urlList.get(i)); log.info("对应请求不正确:"+urlList.get(i));
if(seleniumTime!=null) {
seleniumTime.close();
}
} }
} }
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
...@@ -228,7 +219,7 @@ public class WebContentPaserByXpath { ...@@ -228,7 +219,7 @@ public class WebContentPaserByXpath {
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){ public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
try { try {
BadSiteMsg badSiteMsg = new BadSiteMsg(); BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
badSiteMsg.setId(siteMsgTemple.getId()); badSiteMsg.setId(siteMsgTemple.getId());
badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode()); badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName()); badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
...@@ -383,8 +374,8 @@ public class WebContentPaserByXpath { ...@@ -383,8 +374,8 @@ public class WebContentPaserByXpath {
try { try {
int count = 0; int count = 0;
int k = 0; int k = 0;
int size=catchWebList.size()>50?50:catchWebList.size(); // int size=catchWebList.size()>50?50:catchWebList.size();
for (int i = 0; i < size; i++) { for (int i = 0; i < catchWebList.size(); i++) {
count++; count++;
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
...@@ -406,21 +397,28 @@ public class WebContentPaserByXpath { ...@@ -406,21 +397,28 @@ public class WebContentPaserByXpath {
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
if (StringUtils.isEmpty(content)){
JedisUtil.delString(Constants.SELENIUM_DRIVER_CACHE);
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
}else{ }else{
if (StringUtils.isEmpty(content)) {
try { try {
content = paserSiteDownload.getContent(cwbm); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
} }
if (StringUtils.isEmpty(content)) { }
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
try { try {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, true);
} catch (Exception e) { }catch (Exception e){
log.info("静态请求失败:"+cwbm.getSourceaddress()); log.info(e.getMessage());
} }
} }
if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
} }
}catch (Exception e) { }catch (Exception e) {
...@@ -538,21 +536,23 @@ public class WebContentPaserByXpath { ...@@ -538,21 +536,23 @@ public class WebContentPaserByXpath {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求 try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) { } catch (Exception e) {
log.info(e.getMessage()); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false);
} // log.info(e.getMessage());
} }
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
// SeleniumVerify seleniumVerify=new SeleniumVerify(); // SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress()); // content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
} }
}else{ }else{
content = RequestUtil.httpGetRequest(cwbm.getSourceaddress()); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
try {//正常请求 try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
...@@ -560,7 +560,6 @@ public class WebContentPaserByXpath { ...@@ -560,7 +560,6 @@ public class WebContentPaserByXpath {
log.info(e.getMessage()); log.info(e.getMessage());
} }
} }
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
......
...@@ -6,10 +6,14 @@ import java.awt.event.KeyEvent; ...@@ -6,10 +6,14 @@ import java.awt.event.KeyEvent;
import java.io.*; import java.io.*;
import java.time.Duration; import java.time.Duration;
import java.time.temporal.ChronoUnit; import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.alibaba.fastjson.JSON;
import com.zzsn.crawler.ReuseWebDriver; import com.zzsn.crawler.ReuseWebDriver;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.DriverUtil; import com.zzsn.util.DriverUtil;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*; import org.openqa.selenium.*;
...@@ -72,25 +76,51 @@ public class SeleniumTime { ...@@ -72,25 +76,51 @@ public class SeleniumTime {
// @Async("asyncTaskExecutorSelenium") // @Async("asyncTaskExecutorSelenium")
public static String getScopehtml(String url) { public static String getScopehtml(String url) {
String html = ""; String html = "";
ReuseWebDriver driver=null;
try { try {
ReuseWebDriver driver = DriverUtil.getChromeDriver();
try { try {
Duration duration=Duration.of(50, ChronoUnit.SECONDS); driver= DriverUtil.getChromeDriver();
}catch (Exception e){
log.info("获取浏览器ReuseWebDriver异常:" + e.getMessage());
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
}
try {
Duration duration=Duration.of(60, ChronoUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(duration); driver.manage().timeouts().pageLoadTimeout(duration);
driver.get(url); driver.get(url);
// Thread.sleep(1000); Thread.sleep(5000);
try { try {
WebElement webElement = driver.findElement(By.xpath("/html")); WebElement webElement = driver.findElement(By.xpath("/html"));
html = webElement.getAttribute("outerHTML"); html = webElement.getAttribute("outerHTML");
} catch (Exception e) { } catch (Exception e) {
driver.quit();
log.info("获取页面内容异常:" + e.getMessage()); log.info("获取页面内容异常:" + e.getMessage());
} }
} catch (Exception e) { } catch (Exception e) {
driver.quit();
// 若驱动Session连接异常,则直接退出驱动并在下次访问得的时候重新打开驱动 // 若驱动Session连接异常,则直接退出驱动并在下次访问得的时候重新打开驱动
log.info("驱动打开URL异常:" + e.getMessage()); log.info("驱动打开URL异常:" + e.getMessage());
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
} }
} catch (Exception e) { } catch (Exception e) {
driver.quit();
try {
Map<String, String> map = new HashMap<>();
map.put("sessionId", "sessionId");
map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
} catch (Exception ex) {
}
log.info("驱动访问页面出现出现异常:" + e.getMessage()); log.info("驱动访问页面出现出现异常:" + e.getMessage());
} }
return html; return html;
......
...@@ -13,52 +13,52 @@ public class ArticleCrawler { ...@@ -13,52 +13,52 @@ public class ArticleCrawler {
articleCrawler.consumer(); articleCrawler.consumer();
} }
public void consumer(){ public void consumer(){
String record="{\n" + String value="{\n" +
" \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" + " \"@class\": \"com.zzsn.clb.common.model.task.dto.titr.InfoSourceDTO\",\n" +
" \"id\": \"1560150270181019650\",\n" + " \"id\": \"1534713917403385858\",\n" +
" \"infoSourceCode\": \"IN-20220818-0011\",\n" + " \"infoSourceCode\": \"IN-20220609-58696\",\n" +
" \"webSiteName\": \"一带一路-项目周报\",\n" + " \"webSiteName\": \"中华人民共和国工业和信息化部\",\n" +
" \"siteName\": \"一带一路-项目周报\",\n" + " \"siteName\": \"中华人民共和国工业和信息化部-领导活动\",\n" +
" \"siteUri\": \"https://www.yidaiyilu.gov.cn/info/iList.jsp?cat_id=11432&cur_page=3\",\n" + " \"siteUri\": \"https://www.miit.gov.cn/xwdt/gxdt/ldhd/index.html\",\n" +
" \"infoSourceTypeId\": \"1\",\n" + " \"infoSourceTypeId\": \"1\",\n" +
" \"siteLevel\": null,\n" + " \"siteLevel\": \"2\",\n" +
" \"language\": null,\n" + " \"language\": \"zh\",\n" +
" \"checkedList\": null,\n" + " \"checkedList\": \"1\",\n" +
" \"hisUriExp\": null,\n" + " \"hisUriExp\": null,\n" +
" \"hisDateStartTime\": null,\n" + " \"hisDateStartTime\": null,\n" +
" \"hisDateEndTime\": null,\n" + " \"hisDateEndTime\": null,\n" +
" \"ynHisDataAll\": \"0\",\n" + " \"ynHisDataAll\": \"0\",\n" +
" \"status\": null,\n" + " \"status\": \"1\",\n" +
" \"listUrl\": null,\n" + " \"listUrl\": null,\n" +
" \"listExpressionType\": \"3\",\n" + " \"listExpressionType\": \"3\",\n" +
" \"informationUrl\": null,\n" + " \"informationUrl\": \"\",\n" +
" \"informationTitle\": \"a\",\n" + " \"informationTitle\": null,\n" +
" \"informationPublishDate\": \"span\",\n" + " \"informationPublishDate\": null,\n" +
" \"informationSource\": null,\n" + " \"informationSource\": null,\n" +
" \"infoBlockPosition\": \"ul[class=\\\"commonList_dot\\\"]>li\",\n" + " \"infoBlockPosition\": \"div[class=\\\"page-content\\\"]>ul>li\",\n" +
" \"linkLocation\": \"a\",\n" + " \"linkLocation\": \"a\",\n" +
" \"extractInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"extractInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"crawlDepth\": null,\n" + " \"crawlDepth\": 2,\n" +
" \"pageUrl\": null,\n" + " \"pageUrl\": null,\n" +
" \"matchPage\": null,\n" + " \"matchPage\": null,\n" +
" \"pageStart\": 0,\n" + " \"pageStart\": 0,\n" +
" \"pageEnd\": 0,\n" + " \"pageEnd\": 0,\n" +
" \"ynPageAll\": \"0\",\n" + " \"ynPageAll\": \"0\",\n" +
" \"detailExpressionType\": \"3\",\n" + " \"detailExpressionType\": \"0\",\n" +
" \"detailUrl\": null,\n" + " \"detailUrl\": null,\n" +
" \"detailExpressionTitle\": \"<title><exp>h1[class=\\\"main_content_title\\\"]</exp></title>\",\n" + " \"detailExpressionTitle\": \"<title><exp>*.h1[id=\\\"con_title\\\"]</exp></title>\",\n" +
" \"detailExpressionPublishDate\": \"<publish_date><exp>div[class=\\\"szty\\\"]>span:contains(时间)</exp></publish_date>\",\n" + " \"detailExpressionPublishDate\": \"<publish_date><exp>*.span[id=\\\"con_time\\\"]</exp></publish_date>\",\n" +
" \"detailExpressionSource\": \"<origin><exp>div[class=\\\"szty\\\"]>span:contains(来源)</exp></origin>\",\n" + " \"detailExpressionSource\": null,\n" +
" \"detailExpressionAuthor\": null,\n" + " \"detailExpressionAuthor\": null,\n" +
" \"detailExpressionSummary\": null,\n" + " \"detailExpressionSummary\": null,\n" +
" \"detailExpressionContent\": \"<content><exp>div[class=\\\"content\\\"]</exp></content>\",\n" + " \"detailExpressionContent\": \"<content><exp>*.div[id=\\\"con_con\\\"]</exp></content>\",\n" +
" \"detailInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"detailInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"expression\\\":\\\"\\\"}]\",\n" +
" \"ynDownload\": \"0\",\n" + " \"ynDownload\": \"0\",\n" +
" \"formUrl\": null,\n" + " \"formUrl\": null,\n" +
" \"formTitle\": null,\n" + " \"formTitle\": null,\n" +
" \"formType\": null,\n" + " \"formType\": null,\n" +
" \"dataFormExpression\": null,\n" + " \"dataFormExpression\": null,\n" +
" \"dataFormInfo\": \"[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\",\\\"explain\\\":\\\"\\\"}]\",\n" + " \"dataFormInfo\": \"[{\\\"id\\\":2,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"mapping\\\":\\\"\\\",\\\"expression\\\":\\\"\\\",\\\"primaryKey\\\":\\\"\\\"}]\",\n" +
" \"dataPageUrl\": null,\n" + " \"dataPageUrl\": null,\n" +
" \"dataPageRule\": null,\n" + " \"dataPageRule\": null,\n" +
" \"dataPageStart\": 0,\n" + " \"dataPageStart\": 0,\n" +
...@@ -67,7 +67,7 @@ public class ArticleCrawler { ...@@ -67,7 +67,7 @@ public class ArticleCrawler {
" \"dataType\": 0,\n" + " \"dataType\": 0,\n" +
" \"dataFormat\": 0,\n" + " \"dataFormat\": 0,\n" +
" \"dataStorageMode\": 0,\n" + " \"dataStorageMode\": 0,\n" +
" \"dataStorageInfo\": \"{\\\"accessMode\\\":\\\"FTP\\\"}\",\n" + " \"dataStorageInfo\": \"{}\",\n" +
" \"ynDynamicCrawl\": 1,\n" + " \"ynDynamicCrawl\": 1,\n" +
" \"ynLogin\": 0,\n" + " \"ynLogin\": 0,\n" +
" \"domainName\": null,\n" + " \"domainName\": null,\n" +
...@@ -83,10 +83,10 @@ public class ArticleCrawler { ...@@ -83,10 +83,10 @@ public class ArticleCrawler {
" \"crawlName\": null,\n" + " \"crawlName\": null,\n" +
" \"crawlAddress\": null,\n" + " \"crawlAddress\": null,\n" +
" \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" + " \"parameter\": \"{\\\"crawlingParam\\\":[{\\\"id\\\":0,\\\"name\\\":\\\"\\\",\\\"explain\\\":\\\"\\\",\\\"content\\\":\\\"\\\"}]}\",\n" +
" \"cron\": \"05 23 14 1/7 * ?\",\n" + " \"cron\": \"53 40 0/4 * * ?\",\n" +
" \"ynSnapshot\": \"0\"\n" + " \"ynSnapshot\": null\n" +
"}"; "}";
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record, SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(value, SiteMsgTemple.class);
ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread(); ArticleCrawlerThread articleCrawlerThread=new ArticleCrawlerThread();
articleCrawlerThread.siteMsgTemple=siteMsgTemple; articleCrawlerThread.siteMsgTemple=siteMsgTemple;
articleCrawlerThread.crawler(); articleCrawlerThread.crawler();
......
package com.zzsn.crawlerOther.paser; package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.db.DBManager;
import com.zzsn.crawler.db.SnowIdUtils;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.SeleniumVerify;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.crawlerOther.StandardWebExtractorHandler;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.download.RequestUtil;
import com.zzsn.entity.DocInfo; import com.zzsn.entity.*;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.util.ContentFileFinder; import com.zzsn.test.JSUtil;
import com.zzsn.util.ContentUtility; import com.zzsn.util.*;
import com.zzsn.util.PublishDateUtil;
import com.zzsn.util.Utility;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.RequestBody;
import java.io.InputStream; import java.io.InputStream;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.sql.SQLException;
import java.sql.Types;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -44,8 +42,98 @@ public class WebContentPaserByCss { ...@@ -44,8 +42,98 @@ public class WebContentPaserByCss {
public static PageDownloader pageDownload=new PageDownloader(); public static PageDownloader pageDownload=new PageDownloader();
public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
public static PageBuilderParser pageBuilderParser = new PageBuilderParser(); public static PageBuilderParser pageBuilderParser = new PageBuilderParser();
RequestUtil requestUtil=RequestUtil.getInstance();
/**
 * Verification variant of the list crawl: downloads every list page in {@code urlList}
 * and collects the entries that {@code parserCrawlerSiteListByCss} extracts from it.
 *
 * <p>For each URL the address is rebuilt and re-encoded, the thread sleeps two seconds,
 * and the page body is fetched as follows:
 * <ol>
 *   <li>if the site configuration carries headers, {@code pageDownload.downloadWithStrAddHeader};</li>
 *   <li>otherwise {@code requestUtil.httpGetRequest}, falling back to
 *       {@code pageDownload.downloadWithStr} only when that call throws;</li>
 *   <li>if the body is still empty, {@code SeleniumTime.getScopehtml};</li>
 *   <li>if it is still empty and {@code ynDynamicCrawl == 1}, one more
 *       {@code SeleniumTime.getScopehtml} attempt.</li>
 * </ol>
 * When a non-empty body yields no entries and {@code ynDynamicCrawl == 1}, the page is
 * fetched once more through {@code SeleniumTime.getScopehtml} and parsed again.
 *
 * <p>Failures on a single URL are logged and that URL is skipped. If the thread is
 * interrupted while sleeping, the interrupt flag is restored and the entries collected so
 * far are returned.
 *
 * @param urlList       list-page addresses to fetch
 * @param charset       charset handed to the {@code pageDownload} calls
 * @param siteMsgTemple site configuration; headers and the dynamic-crawl flag are read here,
 *                      and the object is passed on to {@code parserCrawlerSiteListByCss}
 * @return the entries extracted from all pages that could be fetched; never {@code null}
 */
@SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByCSSVerify(
        List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
    List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
    for (int i = 0; i < urlList.size(); i++) {
        try {
            URL url = new URL(urlList.get(i));
            String uri_code = "";
            try {
                URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
                uri_code = Utility.encodURI(uri.toString())
                        .replaceAll("%2520", "+").replaceAll("%25", "%")
                        .replaceAll("%20", "+");
                Thread.sleep(2000L);

                String body = "";
                if (siteMsgTemple.getHeaders() != null) {
                    // Site-specific headers configured: use the header-aware downloader only.
                    body = pageDownload.downloadWithStrAddHeader(uri_code, charset, true, false, siteMsgTemple.getHeaders());
                } else {
                    try {
                        // Plain static request first.
                        body = requestUtil.httpGetRequest(uri_code);
                    } catch (Exception e) {
                        // The alternative downloader is tried only when the first request throws.
                        try {
                            body = pageDownload.downloadWithStr(uri_code, charset, true, false);
                        } catch (Exception e2) {
                            log.info(e2.getMessage());
                            body = "";
                        }
                    }
                    if (StringUtils.isEmpty(body)) {
                        // Static requests produced nothing: fall back to the browser driver.
                        try {
                            body = SeleniumTime.getScopehtml(uri_code);
                        } catch (Exception e) {
                            log.info("模拟浏览器请求异常:" + uri_code);
                        }
                    }
                    if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
                        // Dynamic sites get one more browser attempt.
                        body = SeleniumTime.getScopehtml(uri_code);
                    }
                }

                if (StringUtils.isNotEmpty(body)) {
                    Document doc = Jsoup.parse(body);
                    // Extract the article URLs from the list page.
                    List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
                    catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
                    if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
                        // Nothing extracted from a dynamic site: re-fetch through the browser and parse again.
                        log.info("第一次请求未解析到列表内容进行第二次请求:" + uri_code);
                        body = SeleniumTime.getScopehtml(uri_code);
                        // Only parse when the retry actually returned markup.
                        if (StringUtils.isNotEmpty(body)) {
                            doc = Jsoup.parse(body);
                            catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
                            catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
                        }
                    }
                }
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop crawling further list pages.
                Thread.currentThread().interrupt();
                log.info("列表下载异常 对应的链接:" + uri_code);
                break;
            } catch (Exception e) {
                // Any other failure affects this list page only; move on to the next one.
                log.info("列表下载异常 对应的链接:" + uri_code);
            }
        } catch (Exception e) {
            // Typically a malformed address; include it so the bad entry can be identified.
            log.info("对应请求不是url:" + urlList.get(i));
        }
    }
    return catchWebByMetaSearchList;
}
// 提取站点新闻列表URL // 提取站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
...@@ -65,36 +153,56 @@ public class WebContentPaserByCss { ...@@ -65,36 +153,56 @@ public class WebContentPaserByCss {
.replaceAll("%20", "+"); .replaceAll("%20", "+");
Thread.sleep(2000L); Thread.sleep(2000L);
String body = ""; String body = "";
if(siteMsgTemple.getHeaders()!=null){ if(siteMsgTemple.getHeaders()!=null){//添加header
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,true,false, siteMsgTemple.getHeaders());
}else { }else {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { try {//正常请求
body = SeleniumTime.getScopehtml(uri_code);
TimeUnit.SECONDS.sleep(5);
} else {
try {
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){ }catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
} }
if (StringUtils.isEmpty(body)) { if (StringUtils.isEmpty(body)) {//为空时调用
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) {
try { try {
body = paserSiteDownload.getHtml(uri_code, charset); if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code);
// PageDownload pageDownload=new PageDownload();
// body = pageDownload.downloadByWebClient(uri_code, charset);
}
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("静态请求失败:"+uri_code);
} }
} }
if (StringUtils.isEmpty(body)&&siteMsgTemple.getYnDynamicCrawl() == 1) {//当body为空和动态时调用
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// body = SeleniumTime.getVerifyScopehtml(uri_code);
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code);
} }
} }
TimeUnit.SECONDS.sleep(2);
// TimeUnit.SECONDS.sleep(2);
} }
Document doc= Jsoup.parse(body); // if (StringUtils.isEmpty(body)) {
// sentBadSiteMsg(siteMsgTemple, "请求异常", "1");
// }
if(StringUtils.isNotEmpty(body)) {
Document doc = Jsoup.parse(body);
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
log.info("第一次请求未解析到列表内容进行第二次请求:"+uri_code);
body = SeleniumTime.getScopehtml(uri_code);
doc = Jsoup.parse(body);
catchWebByMetaSearches = parserCrawlerSiteListByCss(siteMsgTemple, doc);
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
}
if (catchWebByMetaSearches.size() < 1 && siteMsgTemple.getYnDynamicCrawl() == 1) {//提取不到信息时再次调用
// sentBadSiteMsg(siteMsgTemple, "列表解析配置异常", "1");
}
}
//
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
// return catchWebByMetaSearchList; // return catchWebByMetaSearchList;
...@@ -108,7 +216,33 @@ public class WebContentPaserByCss { ...@@ -108,7 +216,33 @@ public class WebContentPaserByCss {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
/**
 * Reports a problematic information source by serializing a
 * {@code BadSiteMsgBak} to JSON and sending it to the {@code badSiteTopic}
 * Kafka topic.
 *
 * <p>This is a best-effort notification: any exception raised while building,
 * serializing or sending the report is logged and not propagated, so a
 * reporting failure never interrupts the crawl that triggered it.
 *
 * <p>The crawler type written into the report is {@code "1"} (dynamic crawl)
 * when {@code siteMsgTemple.getYnDynamicCrawl()} equals 1 and {@code "0"}
 * (static crawl) otherwise.
 *
 * @param siteMsgTemple site template whose id, info-source code, web-site name,
 *                      site name and site URI are copied into the report
 * @param msg           error description, stored as the report's error type
 * @param problemType   problem category (1: information source error,
 *                      2: crawl category misconfigured)
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType) {
    try {
        BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Anything other than 1 is reported as a static crawl ("0").
        String crawlerType = siteMsgTemple.getYnDynamicCrawl() == 1 ? "1" : "0";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:" + msg);
    } catch (Exception e) {
        // Previously swallowed silently; keep the best-effort contract but
        // leave a trace (with the cause) so lost reports can be diagnosed.
        log.warn("Failed to report bad site message: " + msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByCss(SiteMsgTemple siteMsgTemple,Document doc)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByCss(SiteMsgTemple siteMsgTemple,Document doc)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -141,10 +275,12 @@ public class WebContentPaserByCss { ...@@ -141,10 +275,12 @@ public class WebContentPaserByCss {
} }
//抽取时间 //抽取时间
if (StringUtils.isNotEmpty(informationPublishDate)) { if (StringUtils.isNotEmpty(informationPublishDate)) {
String publishDate = itemElement.select(informationPublishDate).text(); String publishDate ="";
if (informationPublishDate.contains("@")) { if (informationPublishDate.contains("@")) {
String[] informationPublishDates = informationPublishDate.split("@"); String[] informationPublishDates = informationPublishDate.split("@");
publishDate = itemElement.select(informationPublishDates[0]).attr(informationPublishDates[1]); publishDate = itemElement.select(informationPublishDates[0]).attr(informationPublishDates[1]);
}else{
publishDate = itemElement.select(informationPublishDate).text();
} }
catchWebByMetaSearch.setPublishDate(publishDate); catchWebByMetaSearch.setPublishDate(publishDate);
} }
...@@ -203,7 +339,8 @@ public class WebContentPaserByCss { ...@@ -203,7 +339,8 @@ public class WebContentPaserByCss {
} }
metaSearchList.add(catchWebByMetaSearch); metaSearchList.add(catchWebByMetaSearch);
}catch (Exception e){ }catch (Exception e){
log.info("列表字段信息解析异常"); log.info("列表字段解析异常");
continue;
} }
} }
return metaSearchList; return metaSearchList;
...@@ -216,27 +353,35 @@ public class WebContentPaserByCss { ...@@ -216,27 +353,35 @@ public class WebContentPaserByCss {
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList=new ArrayList<>();
try { try {
int count = 0; int count = 0;
for (int i = 0; i < catchWebList.size(); i++) { int k = 0;
int size=catchWebList.size()>50?50:catchWebList.size();
for (int i = 0; i < size; i++) {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
boolean sismember = JedisUtil.sismember(cwbm.getSourceaddress(), ""); if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) {
if(sismember){
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress());
continue; continue;
} }
if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) { String rediskey = siteMsgTemple.getInfoSourceCode();
try {
boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
if (sismember) {
log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue; continue;
} }
} catch (Exception e) {
log.info("缓存出问题");
}
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// PageDownload pageDownload=new PageDownload();
// content = pageDownload.downloadByWebClient(cwbm.getSourceaddress(), "utf-8");
if (StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
}else{ }else{
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
// if(StringUtils.isEmpty(content)){
// content = paserSiteDownload.getContent(cwbm);
// }
try { try {
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
}catch (Exception e){ }catch (Exception e){
...@@ -254,19 +399,8 @@ public class WebContentPaserByCss { ...@@ -254,19 +399,8 @@ public class WebContentPaserByCss {
} }
} }
}catch (Exception e) { }catch (Exception e) {
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","0");
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if (StringUtils.isEmpty(content)) {
content = paserSiteDownload.getContent(cwbm);
}
}
} }
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -278,32 +412,84 @@ public class WebContentPaserByCss { ...@@ -278,32 +412,84 @@ public class WebContentPaserByCss {
docInfo.setTitle(cwbm.getTitle()==null?"":cwbm.getTitle().replace("...", "")); docInfo.setTitle(cwbm.getTitle()==null?"":cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor()); docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate()); docInfo.setPublishDate(cwbm.getPublishDate());
if(cwbm.getSourceaddress()!=null) { if(StringUtils.isNotEmpty(cwbm.getSourcesite())) {
docInfo.setOrigin(cwbm.getSourcesite()); docInfo.setOrigin(cwbm.getSourcesite());
}else{ }else{
docInfo.setOrigin(siteMsgTemple.getSiteName()); docInfo.setOrigin(siteMsgTemple.getSiteName());
} }
docInfo.setOrigin("agco");
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
// 封装解析的docinfo对象 // 封装解析的docinfo对象
try { try {
if(StringUtils.isNotEmpty(content)) { if(StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { }else {
StandardWebExtractorHandler swe = new StandardWebExtractorHandler(); // sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
swe.doHandler(content, docInfo);
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content);
} }
}catch (Exception e){ }catch (Exception e){
// sentBadSiteMsg(siteMsgTemple,"解析配置异常","1");
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
}
ObjectMapper mapper = new ObjectMapper();
try {
count++;
docInfo.setId(count+"");
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())||
StringUtils.isEmpty(processitem.getPublishDate())) {
k++;
if(k>3){
break;
}
if (StringUtils.isEmpty(processitem.getTitle())) {
log.info("资讯的信息不全缺少标题" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getContent())) {
log.info("资讯的信息不全缺少内容!:" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少时间!:" + cwbm.getSourceaddress());
continue;
}
}
if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("2");
}else{
processitem.setSource("1");
}
//使用浏览器截取图片
// if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
// String imagUrl = "";
//// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
//// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString();
// processitem.setScreenShotImg(imagUrl);
// }
String docjson = mapper.writeValueAsString(processitem);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
// int partition=0;
// try {
// partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
// }catch (Exception e){
// log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
// }
log.info("采集到的信息"+docjson);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo);//用于统计新采集信息的数量
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
log.info("发送到kafka成功。");
} catch (JsonProcessingException e) {
log.info("发送到kafka失败。");
continue; continue;
} }
//插入数据库
intsertData(docInfo);
JedisUtil.setString(cwbm.getSourceaddress(),"1",-1);
docInfoList.add(docInfo);
} catch (Exception e){
} catch (Exception e){
continue; continue;
} }
...@@ -315,7 +501,7 @@ public class WebContentPaserByCss { ...@@ -315,7 +501,7 @@ public class WebContentPaserByCss {
return docInfoList; return docInfoList;
} }
// 抓取新闻内容 // 抓取新闻内容验证
public List<DocInfo> catchVerifyWebNewsByCSS(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) { public List<DocInfo> catchVerifyWebNewsByCSS(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) {
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList=new ArrayList<>();
...@@ -327,28 +513,36 @@ public class WebContentPaserByCss { ...@@ -327,28 +513,36 @@ public class WebContentPaserByCss {
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); try {//正常请求
}else{ content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) {
log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
if(StringUtils.isEmpty(content)){ log.info(e.getMessage());
content = paserSiteDownload.getContent(cwbm);
} }
} }
}catch (Exception e) { if(StringUtils.isEmpty(content) ) {
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
if (StringUtils.isEmpty(content)) { content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
if (siteMsgTemple.getHeaders() != null) { // content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders()); // if(StringUtils.isEmpty(content)){
} else { // SeleniumVerify seleniumVerify=new SeleniumVerify();
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); // content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
if (StringUtils.isEmpty(content)) { //
// }
}else{
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
} }
}catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
} }
} if (StringUtils.isEmpty(content)) {
continue;
} }
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
...@@ -371,10 +565,15 @@ public class WebContentPaserByCss { ...@@ -371,10 +565,15 @@ public class WebContentPaserByCss {
// 封装解析的docinfo对象 // 封装解析的docinfo对象
try { try {
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple ); docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
if(StringUtils.isEmpty(docInfo.getContentNoTag())){
// content = JSUtil.getParseredHtml(cwbm.getSourceaddress());
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
}
}catch (Exception e){ }catch (Exception e){
log.info("详情内容解析出现异常:"+cwbm.getSourceaddress()); log.info("详情内容解析出现异常:"+cwbm.getSourceaddress());
continue;
} }
intsertData(docInfo);
docInfoList.add(docInfo); docInfoList.add(docInfo);
} catch (Exception e){ } catch (Exception e){
...@@ -382,6 +581,7 @@ public class WebContentPaserByCss { ...@@ -382,6 +581,7 @@ public class WebContentPaserByCss {
} }
} }
count=count+1;
log.info("本次成功件数:" + count); log.info("本次成功件数:" + count);
} catch (Exception e) { } catch (Exception e) {
log.info("详情内容解析出错!"); log.info("详情内容解析出错!");
...@@ -389,41 +589,6 @@ public class WebContentPaserByCss { ...@@ -389,41 +589,6 @@ public class WebContentPaserByCss {
return docInfoList; return docInfoList;
} }
// Parameterized INSERT into cis_ans_processitem; the eleven placeholders are
// bound in the column order listed in the statement.
static String insertSql = "insert into cis_ans_processitem " +
        " (id,sid, title,summary,publish_date,origin,author, content,words,keywords,sourceaddress) " +
        " values(?,?,?,?,?,?,?,?,?,?,?)";

/**
 * Inserts one row built from the given document through
 * {@code DBManager.updateOrAdd}, using {@link #insertSql}.
 *
 * <p>The row id is a freshly generated value from
 * {@code SnowIdUtils.uniqueLong()}. A summary longer than 5000 characters is
 * cut to its first 4900 characters before insertion. All eleven columns are
 * bound as {@code Types.CHAR}. An {@code SQLException} is not propagated; its
 * stack trace is printed instead.
 *
 * @param docInfo document whose fields populate the row
 */
private static void intsertData(DocInfo docInfo)
{
    String rowId = SnowIdUtils.uniqueLong() + "";
    String sidText = docInfo.getSid() + "";
    String titleText = docInfo.getTitle();

    // Over-long summaries are shortened to 4900 characters (threshold is 5000).
    String summaryText = docInfo.getSummary();
    if (summaryText != null && summaryText.length() > 5000) {
        summaryText = summaryText.substring(0, 4900);
    }

    // Value order must match the column list of insertSql.
    String[] values = new String[]{
            rowId,
            sidText,
            titleText,
            summaryText,
            docInfo.getPublishDate(),
            docInfo.getOrigin(),
            docInfo.getAuthor(),
            docInfo.getContentNoTag(),   // content column
            docInfo.getContentWithTag(), // words column
            docInfo.getKeywords(),
            docInfo.getSourceaddress()
    };

    DBManager manager = new DBManager();
    int[] sqlTypes = new int[]{
            Types.CHAR, Types.CHAR, Types.CHAR, Types.CHAR, Types.CHAR, Types.CHAR,
            Types.CHAR, Types.CHAR, Types.CHAR, Types.CHAR, Types.CHAR
    };
    try {
        if (manager.updateOrAdd(values, sqlTypes, insertSql)) {
            System.out.println("插入成功");
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
}
public DocInfo doPaserByCssTag(String htmlContent, DocInfo docInfo, SiteMsgTemple siteTemplate){ public DocInfo doPaserByCssTag(String htmlContent, DocInfo docInfo, SiteMsgTemple siteTemplate){
Document doc = Jsoup.parse(htmlContent); Document doc = Jsoup.parse(htmlContent);
...@@ -489,15 +654,8 @@ public class WebContentPaserByCss { ...@@ -489,15 +654,8 @@ public class WebContentPaserByCss {
String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml); String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml);
try { try {
Map<String, String> imgDataMap = ContentFileFinder.getContentImgTag(contentWithTag, docInfo.getSourceaddress()); contentWithTag =ContentFileFinder.getContentAndImg(contentWithTag, docInfo.getSourceaddress());
for (String key : imgDataMap.keySet()) {
while (contentWithTag.contains(key)) {
//转换为绝对路径
String absoluteTag = imgDataMap.get(key);
contentWithTag = contentWithTag.replace(key, absoluteTag);
break;
}
}
}catch (Exception e){ }catch (Exception e){
log.info("图片转换出错"); log.info("图片转换出错");
}finally { }finally {
...@@ -518,7 +676,9 @@ public class WebContentPaserByCss { ...@@ -518,7 +676,9 @@ public class WebContentPaserByCss {
//资讯时间 //资讯时间
if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) { if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) {
publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate()); publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate());
if(StringUtils.isNotEmpty(publishDate)) { if(StringUtils.isNotEmpty(publishDate)) {
docInfo.setOlPpublishDate(publishDate);
docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate)); docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate));
} }
} }
......
...@@ -2,20 +2,19 @@ package com.zzsn.crawlerOther.paser; ...@@ -2,20 +2,19 @@ package com.zzsn.crawlerOther.paser;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder; import com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.*;
import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownload;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.download.RequestUtil;
import com.zzsn.entity.ClbAnsProcessitem; import com.zzsn.entity.*;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.test.JSUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
...@@ -30,31 +29,24 @@ import javax.annotation.Resource; ...@@ -30,31 +29,24 @@ import javax.annotation.Resource;
import java.io.InputStream; import java.io.InputStream;
import java.net.URI; import java.net.URI;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.zzsn.crawler.paser.StandardWebExtractorHandler;
@Slf4j @Slf4j
public class WebContentPaserByRegular { public class WebContentPaserByRegular {
public static PageDownloader pageDownload=new PageDownloader(); public static PageDownloader pageDownload=new PageDownloader();
public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
public KafkaTemplate kafkaTemplate= null; public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
// public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static SubtractionTag subtractionTag=new SubtractionTag();
//通过注解引入配置 //通过注解引入配置
@Resource(name = "defaultThreadPool") @Resource(name = "defaultThreadPool")
private ThreadPoolTaskExecutor executor; private ThreadPoolTaskExecutor executor;
RequestUtil requestUtil=RequestUtil.getInstance();
// 提取站点新闻列表URL // 提取站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegularVerify(List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
try { try {
...@@ -66,46 +58,117 @@ public class WebContentPaserByRegular { ...@@ -66,46 +58,117 @@ public class WebContentPaserByRegular {
uri_code = Utility.encodURI(uri.toString()) uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%") .replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+"); .replaceAll("%20", "+");
Thread.sleep(1000L); // Thread.sleep(1000L);
String body = ""; String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){ if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if(StringUtils.isEmpty(body)) {
try {//正常请求
body = requestUtil.httpGetRequest(uri_code);
} catch (Exception e) {
// log.info(e.getMessage());
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
}
}
}
if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
TimeUnit.SECONDS.sleep(5);
} else {
try {
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){
log.info(e.getMessage());
} }
if (StringUtils.isEmpty(body)) {
body = pageDownload.downloadWithStr(uri_code, charset, true, false); }
if (StringUtils.isEmpty(body)) { // if(StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")){
try { // String imagUrl="";
body = paserSiteDownload.getHtml(uri_code, charset); // WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
// webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
// }
//抽取资讯url
log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) {
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
}
if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
}
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个");
}
} catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code);
log.info("异常信息"+e.getMessage());
continue;
}
} catch (Exception e) { } catch (Exception e) {
log.info("静态请求失败:"+uri_code); log.info("对应请求不是url"+urlList.get(i));
} }
} }
return catchWebByMetaSearchList;
} }
TimeUnit.SECONDS.sleep(5);
// 提取站点新闻列表URL
@SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByRegular(List<String> urlList, String charset, SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
try {
URL url = new URL(urlList.get(i));
URI uri = null;
String uri_code = urlList.get(i);
try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+");
// Thread.sleep(1000L);
String body = "";
if(StringUtils.isNotEmpty(siteMsgTemple.getHeaders())){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else {
try {//先使用静态网络请求获取列表内容
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}catch (Exception e){
log.info(e.getMessage());
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
} }
if (StringUtils.isEmpty(body)){
body = SeleniumTime.getScopehtml(uri_code);
} }
}
// if(StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")){
// String imagUrl="";
// WebPageScreenShot webPageScreenShot=new WebPageScreenShot();
// webPageScreenShot.loadPage(uri_code,Constants.IMGPATH);
// }
//抽取资讯url //抽取资讯url
log.info("body的长度:"+body.length()); log.info("body的长度:"+body.length());
if(StringUtils.isNotEmpty(body)) { if(StringUtils.isNotEmpty(body)) {
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByRegular(siteMsgTemple, body);
}
if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
}
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个"); log.info("本次获取列表url:"+catchWebByMetaSearchList.size()+"个");
} }
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
log.info("异常信息"+e.getMessage()); log.info("异常信息"+e.getMessage());
// return catchWebByMetaSearchList;
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -117,6 +180,32 @@ public class WebContentPaserByRegular { ...@@ -117,6 +180,32 @@ public class WebContentPaserByRegular {
} }
/**
 * Publishes a "bad site" report for the given information source: a
 * {@code BadSiteMsgBak} is filled from the template, serialized to JSON and
 * sent to the {@code badSiteTopic} Kafka topic.
 *
 * <p>Reporting is best-effort. Any exception thrown while building,
 * serializing or sending the report is logged and then dropped, so the caller
 * is never interrupted by a reporting failure.
 *
 * <p>The report's crawler type is {@code "1"} when
 * {@code siteMsgTemple.getYnDynamicCrawl()} equals 1, otherwise {@code "0"}.
 *
 * @param siteMsgTemple site template supplying the id, info-source code,
 *                      web-site name, site name and site URI of the report
 * @param msg           error description, stored as the report's error type
 * @param problemType   problem category (1: information source error,
 *                      2: crawl category misconfigured)
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple, String msg, String problemType) {
    try {
        BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Only the value 1 maps to "1"; every other value is reported as "0".
        boolean dynamicCrawl = siteMsgTemple.getYnDynamicCrawl() == 1;
        badSiteMsg.setCrawlerType(dynamicCrawl ? "1" : "0");
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:" + msg);
    } catch (Exception e) {
        // The original catch block was empty; log the cause so that failed
        // reports are visible while still not propagating the exception.
        log.warn("Failed to report bad site message: " + msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByRegular(SiteMsgTemple siteMsgTemple,String doc)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByRegular(SiteMsgTemple siteMsgTemple,String doc)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -225,58 +314,58 @@ public class WebContentPaserByRegular { ...@@ -225,58 +314,58 @@ public class WebContentPaserByRegular {
} }
return eleText; return eleText;
} }
// 抓取新闻内容 // 抓取新闻内容
public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) { public List<DocInfo> catchWebNewsByRegular(List<CatchWebByMetaSearch> catchWebList, SiteMsgTemple siteMsgTemple) {
List<DocInfo> docInfoList=new ArrayList<>(); List<DocInfo> docInfoList = new ArrayList<>();
try {
int count = 0; int count = 0;
for (int i = 0; i < catchWebList.size(); i++) { int k = 0;
count++; int size=catchWebList.size()>50?50:catchWebList.size();
for (int i = 0; i < size; i++) {
long starttime = System.currentTimeMillis();
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
log.info("解析内容的URL:"+cwbm.getSourceaddress()); if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length() == 0 || cwbm.getSourceaddress().contains(".PDF") || cwbm.getSourceaddress().contains("download")) {
String value = JedisUtil.getString(cwbm.getSourceaddress());
if(StringUtils.isNotEmpty(value)){
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress());
continue; continue;
} }
log.info("解析内容的URL:" + cwbm.getSourceaddress());
String rediskey = siteMsgTemple.getInfoSourceCode();
try {
boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
if (sismember) {
log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue;
}
} catch (Exception e) {
log.info("缓存出问题");
}
// 请求下载内容 // 请求下载内容
String content=""; String content = "";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}else{ // if (StringUtils.isEmpty(content)){
// content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// }
} else {
try { try {
content =HttpgetUtil.getHtml(cwbm.getSourceaddress());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
} catch (Exception e) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}catch (Exception e){
log.info(e.getMessage()); log.info(e.getMessage());
} // content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
if (StringUtils.isEmpty(content)) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false);
if (StringUtils.isEmpty(content)) {
// try {
// content = paserSiteDownload.getContent(cwbm);
// } catch (Exception e) {
// log.info("静态请求失败:"+cwbm.getSourceaddress());
// }
} }
} }
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
} catch (Exception e) {
continue;
} }
}catch (Exception e) {
if (StringUtils.isEmpty(content)) { if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) { continue;
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), null, true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false);
if (content == null||content =="") {
content = paserSiteDownload.getContent(cwbm);
}
}
}
} }
//使用浏览器截取图片 log.info("详情内容的长度:" + content.length());
log.info("详情内容的长度:"+content.length());
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -285,68 +374,96 @@ public class WebContentPaserByRegular { ...@@ -285,68 +374,96 @@ public class WebContentPaserByRegular {
docInfo.setLastModified(cwbm.getLastModify()); docInfo.setLastModified(cwbm.getLastModify());
docInfo.setCharset("utf-8"); docInfo.setCharset("utf-8");
docInfo.setSourceaddress(cwbm.getSourceaddress()); docInfo.setSourceaddress(cwbm.getSourceaddress());
docInfo.setTitle(cwbm.getTitle()==null?"":cwbm.getTitle().replace("...", "")); docInfo.setTitle(cwbm.getTitle() == null ? "" : cwbm.getTitle().replace("...", ""));
docInfo.setAuthor(cwbm.getAuthor()); docInfo.setAuthor(cwbm.getAuthor());
docInfo.setPublishDate(cwbm.getPublishDate()); docInfo.setPublishDate(cwbm.getPublishDate());
if(cwbm.getSourceaddress()!=null) { // if (cwbm.getSourceaddress() != null) {
docInfo.setOrigin(cwbm.getSourcesite()); // docInfo.setOrigin(cwbm.getSourcesite());
}else{ // } else {
// docInfo.setOrigin(siteMsgTemple.getSiteName());
// }
docInfo.setOrigin(siteMsgTemple.getSiteName()); docInfo.setOrigin(siteMsgTemple.getSiteName());
}
docInfo.setSummary(cwbm.getSummary()); docInfo.setSummary(cwbm.getSummary());
// 封装解析的docinfo对象 //封装解析的docinfo对象
try { try {
if(StringUtils.isNotEmpty(content)) { if (StringUtils.isNotEmpty(content)) {
docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple); docInfo = doPaserByCssTag(content, docInfo, siteMsgTemple);
}else { } else {
log.info("栏目名称:"+siteMsgTemple.getSiteName()+" 链接请求:"+cwbm.getSourceaddress()+" 内容为空:"+content); log.info("栏目名称:" + siteMsgTemple.getSiteName() + " 链接请求:" + cwbm.getSourceaddress() + " 内容为空:" + content);
} }
}catch (Exception e){ } catch (Exception e) {
log.info("文本内容解析不正确!"); log.info("文本内容解析不正确!");
continue; continue;
} }
docInfo.setId(count+"");
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try { try {
ClbAnsProcessitem processitem =paserSiteDownload.docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = paserSiteDownload.docInfoTrans2Processitem(docInfo);
if(siteMsgTemple.getYnDynamicCrawl()==1) { if (siteMsgTemple.getYnDynamicCrawl() == 1) {
processitem.setSource("2"); processitem.setSource("2");
}else{ } else {
processitem.setSource("1"); processitem.setSource("1");
} }
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent()) //默认解析
||StringUtils.isEmpty(processitem.getPublishDate())){ StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
log.info("资讯的信息不全缺少标题、时间或内容!:"+cwbm.getSourceaddress()); DocInfo docInfoItem=swe.doHandler(content,docInfo);
if(StringUtils.isEmpty(docInfo.getTitle())&&StringUtils.isNotEmpty(docInfoItem.getTitle())){
processitem.setTitle(docInfoItem.getTitle());
}
if(StringUtils.isEmpty(docInfo.getContentNoTag())&&StringUtils.isNotEmpty(docInfoItem.getContentNoTag())){
processitem.setContent(docInfoItem.getContentNoTag());
processitem.setContentWithTag(docInfoItem.getContentWithTag());
}
if(StringUtils.isEmpty(docInfo.getPublishDate())&&StringUtils.isNotEmpty(docInfoItem.getPublishDate())){
processitem.setPublishDate(docInfoItem.getPublishDate());
}
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())||
StringUtils.isEmpty(processitem.getPublishDate())) {
k++;
if(k>3){
break;
}
if (StringUtils.isEmpty(processitem.getTitle())) {
log.info("资讯的信息不全缺少标题" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getContent())) {
log.info("资讯的信息不全缺少内容!:" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少时间!:" + cwbm.getSourceaddress());
continue; continue;
} }
if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
String imagUrl = "";
// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
imagUrl=map.get("objectUrl").toString();
processitem.setScreenShotImg(imagUrl);
} }
//使用浏览器截取图片
// if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
// String imagUrl = "";
//// WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
//// webPageScreenShot.loadPage(cwbm.getSourceaddress(), Constants.IMGPATH);
// InputStream inputStream =SeleniumTime.getScreenshot(cwbm.getSourceaddress());
// HashMap map = ObsUpload.uploadInputStream(inputStream, "png");
// imagUrl=map.get("objectUrl").toString();
// processitem.setScreenShotImg(imagUrl);
// }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
count++;
docInfo.setId(count + "");
docInfoList.add(docInfo);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
long endtime = System.currentTimeMillis();
long dis=endtime-starttime;
log.info("详情页请求解析时间(毫秒):"+dis);
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace();
log.info("发送到kafka失败。"); log.info("发送到kafka失败。");
} }
JedisUtil.setString(cwbm.getSourceaddress(),"1",-1);
docInfoList.add(docInfo);
} catch (Exception e){
continue;
}
}
log.info("本次成功件数:" + count);
} catch (Exception e) { } catch (Exception e) {
log.info("内容解析部分出现异常!"); log.info("内容解析部分出现异常!");
} }
}
log.info("本次成功件数:" + count);
return docInfoList; return docInfoList;
} }
...@@ -362,28 +479,35 @@ public class WebContentPaserByRegular { ...@@ -362,28 +479,35 @@ public class WebContentPaserByRegular {
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { //首先使用静态访问,当访问异常使用动态访问,当解析内容为空再次调用动态访问
try {//正常请求
content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
} catch (Exception e) {
log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content) ) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress()); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// if(StringUtils.isEmpty(content)){
// SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
//
// }
}else{ }else{
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(),null,true,false);
if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
} }
}
}catch (Exception e) { }catch (Exception e) {
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), null, true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, true, false);
if (content == null||content =="") {
content = paserSiteDownload.getContent(cwbm);
}
}
} }
if (StringUtils.isEmpty(content)) {
continue;
} }
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -404,6 +528,10 @@ public class WebContentPaserByRegular { ...@@ -404,6 +528,10 @@ public class WebContentPaserByRegular {
// 封装解析的docinfo对象 // 封装解析的docinfo对象
try { try {
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple ); docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
if(StringUtils.isEmpty(docInfo.getContentNoTag())){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
docInfo = doPaserByCssTag(content, docInfo,siteMsgTemple );
}
}catch (Exception e){ }catch (Exception e){
log.info("文本内容解析不正确!"); log.info("文本内容解析不正确!");
continue; continue;
...@@ -515,22 +643,14 @@ public class WebContentPaserByRegular { ...@@ -515,22 +643,14 @@ public class WebContentPaserByRegular {
} }
String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml); String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml);
try { try {
Map<String, String> imgDataMap = ContentFileFinder.getContentImgTag(contentWithTag, docInfo.getSourceaddress()); contentWithTag =ContentFileFinder.getContentAndImg(contentWithTag, docInfo.getSourceaddress());
for (String key : imgDataMap.keySet()) {
while (contentWithTag.contains(key)) {
//转换为绝对路径
String absoluteTag = imgDataMap.get(key);
contentWithTag = contentWithTag.replace(key, absoluteTag);
break;
}
}
}catch (Exception e){ }catch (Exception e){
log.info("图片转换出错"); log.info("图片转换出错");
}finally { }finally {
} }
docInfo.setContentWithTag(contentWithTag); docInfo.setContentWithTag(contentWithTag);
docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n","")); docInfo.setContentNoTag(ContentUtility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
} }
//作者 //作者
...@@ -544,6 +664,7 @@ public class WebContentPaserByRegular { ...@@ -544,6 +664,7 @@ public class WebContentPaserByRegular {
if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) { if(null!=siteTemplate.getDetailExpressionPublishDate()&&siteTemplate.getDetailExpressionPublishDate().length()>0) {
publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate()); publishDate=paseElementByCSS(doc,siteTemplate.getDetailExpressionPublishDate());
if(StringUtils.isNotEmpty(publishDate)) { if(StringUtils.isNotEmpty(publishDate)) {
docInfo.setOlPpublishDate(publishDate);
docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate)); docInfo.setPublishDate(PublishDateUtil.getPublishDate(publishDate));
}else{ }else{
// return docInfo; // return docInfo;
...@@ -564,8 +685,13 @@ public class WebContentPaserByRegular { ...@@ -564,8 +685,13 @@ public class WebContentPaserByRegular {
origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource()); origin=paseElementByCSS(doc,siteTemplate.getDetailExpressionSource());
if(StringUtils.isNotEmpty(origin)) { if(StringUtils.isNotEmpty(origin)) {
docInfo.setOrigin(origin); docInfo.setOrigin(origin);
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
}else{
docInfo.setOrigin(siteTemplate.getSiteName());
} }
return docInfo; return docInfo;
} }
......
...@@ -4,18 +4,19 @@ import com.fasterxml.jackson.core.JsonProcessingException; ...@@ -4,18 +4,19 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.PaserSiteDownload; import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.paser.SubtractionTag;
import com.zzsn.crawler.uriparser.HtmlPageParser; import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.SeleniumTime; import com.zzsn.crawler.uriparser.SeleniumTime;
import com.zzsn.crawler.uriparser.SeleniumVerify;
import com.zzsn.crawler.uriparser.WebPageScreenShot; import com.zzsn.crawler.uriparser.WebPageScreenShot;
import com.zzsn.crawler.uriparser.obs.ObsUpload; import com.zzsn.crawler.uriparser.obs.ObsUpload;
import com.zzsn.download.PageBuilderParser; import com.zzsn.download.PageBuilderParser;
import com.zzsn.download.PageDownloader; import com.zzsn.download.PageDownloader;
import com.zzsn.entity.CatchWebByMetaSearch; import com.zzsn.download.RequestUtil;
import com.zzsn.entity.ClbAnsProcessitem; import com.zzsn.entity.*;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.generation.Constants; import com.zzsn.generation.Constants;
import com.zzsn.job.JedisUtil; import com.zzsn.job.JedisUtil;
import com.zzsn.test.JSUtil;
import com.zzsn.util.*; import com.zzsn.util.*;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
...@@ -52,8 +53,10 @@ import java.net.URL; ...@@ -52,8 +53,10 @@ import java.net.URL;
import java.security.KeyManagementException; import java.security.KeyManagementException;
import java.security.KeyStoreException; import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.util.*; import java.util.ArrayList;
import java.util.concurrent.TimeUnit; import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -63,12 +66,13 @@ public class WebContentPaserByXpath { ...@@ -63,12 +66,13 @@ public class WebContentPaserByXpath {
public static PageDownloader pageDownload=new PageDownloader(); public static PageDownloader pageDownload=new PageDownloader();
public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); public static PaserSiteDownload paserSiteDownload=new PaserSiteDownload();
KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public static SubtractionTag subtractionTag=new SubtractionTag(); public static com.zzsn.crawler.paser.SubtractionTag subtractionTag=new SubtractionTag();
public static PageBuilderParser builderParser=new PageBuilderParser(); public static PageBuilderParser builderParser=new PageBuilderParser();
public SeleniumTime seleniumTime; public SeleniumTime seleniumTime;
// 提取站点新闻列表URL RequestUtil requestUtil=RequestUtil.getInstance();
// 验证站点新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByXapth( public List<CatchWebByMetaSearch> catchWebOfStaticmsgByXapthVerify(
List<String> urlList, String charset,SiteMsgTemple siteMsgTemple) { List<String> urlList, String charset,SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -87,12 +91,91 @@ public class WebContentPaserByXpath { ...@@ -87,12 +91,91 @@ public class WebContentPaserByXpath {
if(siteMsgTemple.getHeaders()!=null){ if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders()); body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else { }else {
if (siteMsgTemple.getYnDynamicCrawl() == 1) { if(StringUtils.isEmpty(body)) {
// seleniumTime=new SeleniumTime(); try {//正常请求
body = requestUtil.httpGetRequest(uri_code);
} catch (Exception e) {
// log.info(e.getMessage());
try {//正常请求
body = pageDownload.downloadWithStr(uri_code, charset, true, false);
} catch (Exception e2) {
log.info(e2.getMessage());
body="";
}
}
}
if (StringUtils.isEmpty(body)) {
body = pageDownload.downloadWithStr(uri_code, charset, false, false);
if (StringUtils.isEmpty(body)) {
try {
body = paserSiteDownload.getHtml(uri_code, charset);
} catch (Exception e) {
log.info("静态请求失败:"+uri_code);
}
}
}
if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
// body = SeleniumTime.getVerifyScopehtml(uri_code);
body = SeleniumTime.getScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code); body = SeleniumTime.getScopehtml(uri_code);
TimeUnit.SECONDS.sleep(5); }
}
}
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
}
if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
}
catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code);
if(seleniumTime!=null) {
seleniumTime.close(); seleniumTime.close();
} else { }
// return catchWebByMetaSearchList;
continue;
}
} catch (Exception e) {
log.info("对应请求不是url"+urlList.get(i));
if(seleniumTime!=null) {
seleniumTime.close();
}
}
}
return catchWebByMetaSearchList;
}
// 提取站点新闻列表URL
@SuppressWarnings("deprecation")
public List<CatchWebByMetaSearch> catchWebOfStaticmsgByXapth(
List<String> urlList, String charset,SiteMsgTemple siteMsgTemple) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
try {
URL url = new URL(urlList.get(i));
URI uri = null;
String uri_code = "";
try {
uri = new URI(url.getProtocol(), url.getHost(),url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+");
Thread.sleep(2000L);
String body = "";
if(siteMsgTemple.getHeaders()!=null){
body = pageDownload.downloadWithStrAddHeader(uri_code,charset,false,false, siteMsgTemple.getHeaders());
}else {
try { try {
body = pageDownload.downloadWithStr(uri_code, charset, true, false); body = pageDownload.downloadWithStr(uri_code, charset, true, false);
}catch (Exception e){ }catch (Exception e){
...@@ -108,11 +191,27 @@ public class WebContentPaserByXpath { ...@@ -108,11 +191,27 @@ public class WebContentPaserByXpath {
} }
} }
} }
TimeUnit.SECONDS.sleep(5); if ( StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
if(siteMsgTemple.getVerifyType()!=null&&siteMsgTemple.getVerifyType().contains("1")){
body = SeleniumTime.getVerifyScopehtml(uri_code);
}else {
body = SeleniumTime.getScopehtml(uri_code);
} }
} }
}
// if(StringUtils.isEmpty(body)){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
// }
//抽取资讯url //抽取资讯url
List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body); List<CatchWebByMetaSearch> catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
if (catchWebByMetaSearches.size()<1 && siteMsgTemple.getYnDynamicCrawl() == 1) {
body = SeleniumTime.getScopehtml(uri_code);
catchWebByMetaSearches = parserCrawlerSiteListByXpath(siteMsgTemple, body);
}
if(catchWebByMetaSearches.size()<1){
// sentBadSiteMsg(siteMsgTemple,"网络访问请求异常","1");
continue;
}
catchWebByMetaSearchList.addAll(catchWebByMetaSearches); catchWebByMetaSearchList.addAll(catchWebByMetaSearches);
} catch (Exception e) { } catch (Exception e) {
log.info("列表下载异常 对应的链接:"+uri_code); log.info("列表下载异常 对应的链接:"+uri_code);
...@@ -132,6 +231,27 @@ public class WebContentPaserByXpath { ...@@ -132,6 +231,27 @@ public class WebContentPaserByXpath {
return catchWebByMetaSearchList; return catchWebByMetaSearchList;
} }
/**
 * Publishes a "bad information source" report to the {@code badSiteTopic} Kafka topic.
 *
 * <p>The report copies the id, info-source code, web-site name, site name and site URI
 * from {@code siteMsgTemple}, attaches the supplied error text and problem type, and is
 * serialized to JSON before being sent.
 *
 * <p>This method is best-effort: it never throws. Any failure (for example a
 * {@code null} template, a serialization error or a Kafka send error) is logged
 * together with its cause instead of being silently discarded.
 *
 * @param siteMsgTemple template describing the information source being reported
 * @param msg           error description, stored as the report's error type
 * @param problemType   problem category code, stored as-is on the report
 */
public void sentBadSiteMsg(SiteMsgTemple siteMsgTemple,String msg,String problemType){
    try {
        BadSiteMsgBak badSiteMsg = new BadSiteMsgBak();
        badSiteMsg.setId(siteMsgTemple.getId());
        badSiteMsg.setInfoSourceCode(siteMsgTemple.getInfoSourceCode());
        badSiteMsg.setWebSiteName(siteMsgTemple.getWebSiteName());
        badSiteMsg.setSiteName(siteMsgTemple.getSiteName());
        badSiteMsg.setSiteUri(siteMsgTemple.getSiteUri());
        badSiteMsg.setErrorType(msg);
        badSiteMsg.setProblemType(problemType);
        // Crawl mode 1 is reported as "1"; every other value is reported as "0".
        String crawlerType=siteMsgTemple.getYnDynamicCrawl()!=1?"0":siteMsgTemple.getYnDynamicCrawl()+"";
        badSiteMsg.setCrawlerType(crawlerType);
        ObjectMapper mapper = new ObjectMapper();
        String docjson = mapper.writeValueAsString(badSiteMsg);
        kafkaTemplate.send("badSiteTopic", docjson);
        log.info("信息源问题:"+msg);
    }catch (Exception e){
        // Reporting must not break the crawl, so the failure is not rethrown,
        // but it is logged with its cause so it is no longer lost.
        log.warn("信息源问题上报失败:"+msg, e);
    }
}
//提取列表信息 //提取列表信息
public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception { public List<CatchWebByMetaSearch> parserCrawlerSiteListByXpath(SiteMsgTemple siteMsgTemple,String body)throws Exception {
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
...@@ -267,28 +387,31 @@ public class WebContentPaserByXpath { ...@@ -267,28 +387,31 @@ public class WebContentPaserByXpath {
try { try {
int count = 0; int count = 0;
for (int i = 0; i < catchWebList.size(); i++) { int k = 0;
int size=catchWebList.size()>50?50:catchWebList.size();
for (int i = 0; i < size; i++) {
count++; count++;
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
boolean sismember = JedisUtil.sismember(cwbm.getSourceaddress(), ""); if (cwbm.getSourceaddress() == null || cwbm.getSourceaddress().contains(".pdf") || cwbm.getSourceaddress().trim().length()==0|| cwbm.getSourceaddress().contains(".PDF")||cwbm.getSourceaddress().contains("download")) {
if(sismember){ continue;
log.info("栏目信息重复:"+siteMsgTemple.getSiteName()+" :" +cwbm.getSourceaddress()); }
String rediskey = siteMsgTemple.getInfoSourceCode();
try {
boolean sismember = JedisUtil.sismember(rediskey, cwbm.getSourceaddress());
if (sismember) {
log.info("栏目信息重复:" + siteMsgTemple.getSiteName() + " :" + cwbm.getSourceaddress());
continue; continue;
} }
} catch (Exception e) {
log.info("缓存出问题");
}
// 请求下载内容 // 请求下载内容
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
seleniumTime=new SeleniumTime(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
seleniumTime.close();
}else{ }else{
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
// if(StringUtils.isEmpty(content)){
// content = paserSiteDownload.getContent(cwbm);
// }
try { try {
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
}catch (Exception e){ }catch (Exception e){
...@@ -306,19 +429,8 @@ public class WebContentPaserByXpath { ...@@ -306,19 +429,8 @@ public class WebContentPaserByXpath {
} }
} }
}catch (Exception e) { }catch (Exception e) {
if(seleniumTime!=null) { content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
seleniumTime.close();
} }
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
}
}
}
DocInfo docInfo = new DocInfo(); DocInfo docInfo = new DocInfo();
docInfo.setContentType("HTML"); docInfo.setContentType("HTML");
docInfo.setOrgId(cwbm.getOrgId()); docInfo.setOrgId(cwbm.getOrgId());
...@@ -345,19 +457,37 @@ public class WebContentPaserByXpath { ...@@ -345,19 +457,37 @@ public class WebContentPaserByXpath {
} }
}catch (Exception e){ }catch (Exception e){
log.info("本次解析内容异常"+e.getMessage()); log.info("本次解析内容异常"+e.getMessage());
if(seleniumTime!=null) {
seleniumTime.close();
}
} }
docInfo.setId(count+""); docInfo.setId(count+"");
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
try { try {
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
if(StringUtils.isEmpty(processitem.getTitle())||StringUtils.isEmpty(processitem.getContent())||
StringUtils.isEmpty(processitem.getPublishDate())) {
k++;
if(k>3){
break;
}
if (StringUtils.isEmpty(processitem.getTitle())) {
log.info("资讯的信息不全缺少标题" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getContent())) {
log.info("资讯的信息不全缺少内容!:" + cwbm.getSourceaddress());
continue;
}
if (StringUtils.isEmpty(processitem.getPublishDate())) {
log.info("资讯的信息不全缺少时间!:" + cwbm.getSourceaddress());
continue;
}
}
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
processitem.setSource("2"); processitem.setSource("2");
}else{ }else{
processitem.setSource("1"); processitem.setSource("1");
} }
//使用浏览器截取图片
if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) { if (StringUtils.isNotEmpty(siteMsgTemple.getYnSnapshot()) && siteMsgTemple.getYnSnapshot().contains("1")) {
String imagUrl = ""; String imagUrl = "";
// WebPageScreenShot webPageScreenShot = new WebPageScreenShot(); // WebPageScreenShot webPageScreenShot = new WebPageScreenShot();
...@@ -367,19 +497,27 @@ public class WebContentPaserByXpath { ...@@ -367,19 +497,27 @@ public class WebContentPaserByXpath {
imagUrl=map.get("objectUrl").toString(); imagUrl=map.get("objectUrl").toString();
processitem.setScreenShotImg(imagUrl); processitem.setScreenShotImg(imagUrl);
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); // kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
int partition=0;
try {
partition = Integer.parseInt(Constants.KAFKA_PRODUCT_PARTITION);
}catch (Exception e){
log.info("分区配置异常:"+Constants.KAFKA_PRODUCT_PARTITION);
}
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
docInfoList.add(docInfo); //用于采集到的信息统计
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// e.printStackTrace(); // e.printStackTrace();
log.info("发送到kafka失败。"); log.info("发送到kafka失败。");
continue;
} }
JedisUtil.setString(cwbm.getSourceaddress(),"1",-1); // JedisUtil.setString(cwbm.getSourceaddress(),"1",-1);
docInfoList.add(docInfo); JedisUtil.sadd(rediskey, cwbm.getSourceaddress());
} catch (Exception e){ } catch (Exception e){
if(seleniumTime!=null) {
seleniumTime.close();
}
continue; continue;
} }
...@@ -405,35 +543,40 @@ public class WebContentPaserByXpath { ...@@ -405,35 +543,40 @@ public class WebContentPaserByXpath {
String content=""; String content="";
try { try {
if(siteMsgTemple.getYnDynamicCrawl()==1) { if(siteMsgTemple.getYnDynamicCrawl()==1) {
seleniumTime=new SeleniumTime(); try {//正常请求
content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
seleniumTime.close();
} catch (Exception e) {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), "", true, false);
// log.info(e.getMessage());
}
if(StringUtils.isEmpty(content)){
content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
// SeleniumVerify seleniumVerify=new SeleniumVerify();
// content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
}
}else{ }else{
content = requestUtil.httpGetRequest(cwbm.getSourceaddress());
if(StringUtils.isEmpty(content)) {
try {//正常请求
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false); content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
} catch (Exception e) {
log.info(e.getMessage());
}
}
if(StringUtils.isEmpty(content)){ if(StringUtils.isEmpty(content)){
content = paserSiteDownload.getContent(cwbm); content = paserSiteDownload.getContent(cwbm);
if(StringUtils.isEmpty(content)) { if(StringUtils.isEmpty(content)) {
seleniumTime = new SeleniumTime(); SeleniumVerify seleniumVerify=new SeleniumVerify();
content = seleniumTime.getScopehtml(cwbm.getSourceaddress()); content = seleniumVerify.getScopehtml(cwbm.getSourceaddress());
seleniumTime.close();
} }
} }
} }
}catch (Exception e) { }catch (Exception e) {
if(seleniumTime!=null) { if(StringUtils.isEmpty(content)){
seleniumTime.close(); content = SeleniumTime.getScopehtml(cwbm.getSourceaddress());
}
if (StringUtils.isEmpty(content)) {
if (siteMsgTemple.getHeaders() != null) {
content = pageDownload.downloadWithStrAddHeader(cwbm.getSourceaddress(), cwbm.getCharset(), true, false, siteMsgTemple.getHeaders());
} else {
content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), cwbm.getCharset(), true, false);
if (StringUtils.isEmpty(content)) {
seleniumTime=new SeleniumTime();
content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
seleniumTime.close();
}
}
} }
} }
...@@ -545,15 +688,7 @@ public class WebContentPaserByXpath { ...@@ -545,15 +688,7 @@ public class WebContentPaserByXpath {
String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml); String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(elementHtml);
try { try {
Map<String, String> imgDataMap = ContentFileFinder.getContentImgTag(contentWithTag, docInfo.getSourceaddress()); contentWithTag =ContentFileFinder.getContentAndImg(contentWithTag, docInfo.getSourceaddress());
for (String key : imgDataMap.keySet()) {
while (contentWithTag.contains(key)) {
//转换为绝对路径
String absoluteTag = imgDataMap.get(key);
contentWithTag = contentWithTag.replace(key, absoluteTag);
break;
}
}
}catch (Exception e){ }catch (Exception e){
log.info("图片转换出错"); log.info("图片转换出错");
}finally { }finally {
...@@ -574,8 +709,10 @@ public class WebContentPaserByXpath { ...@@ -574,8 +709,10 @@ public class WebContentPaserByXpath {
String publishDatePath=pageBuilderParser.parserStr(HtmlPageParser.xmlGetDocument(siteTemplate.getDetailExpressionPublishDate()),"//exp"); String publishDatePath=pageBuilderParser.parserStr(HtmlPageParser.xmlGetDocument(siteTemplate.getDetailExpressionPublishDate()),"//exp");
String publishDate= pageBuilderParser.parserStr(doc,publishDatePath); String publishDate= pageBuilderParser.parserStr(doc,publishDatePath);
if(publishDate.length()>0) { if(publishDate.length()>0) {
docInfo.setOlPpublishDate(publishDate);
docInfo.setPublishDate(DateUtil.getPublishDate(publishDate)); docInfo.setPublishDate(DateUtil.getPublishDate(publishDate));
}else if(StringUtils.isNotEmpty(docInfo.getPublishDate())){ }else if(StringUtils.isNotEmpty(docInfo.getPublishDate())){
docInfo.setOlPpublishDate(publishDate);
docInfo.setPublishDate(DateUtil.getPublishDate(docInfo.getPublishDate())); docInfo.setPublishDate(DateUtil.getPublishDate(docInfo.getPublishDate()));
} }
} }
...@@ -769,3 +906,4 @@ public class WebContentPaserByXpath { ...@@ -769,3 +906,4 @@ public class WebContentPaserByXpath {
} }
} }
package com.zzsn.download;
import org.htmlcleaner.*;
import org.lobobrowser.html.parser.DocumentBuilderImpl;
import org.lobobrowser.html.parser.InputSourceImpl;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
/**
 * Static helpers that turn raw page text into a W3C DOM {@link Document}.
 *
 * <p>Three parsers are offered: the Cobra HTML parser ({@link #cobraParse}),
 * the JAXP XML parser ({@link #xmlGetDocument}) and HtmlCleaner
 * ({@link #htmlCleanerParser}).
 */
public class HtmlPageParser {

    /**
     * Parses HTML held in a string with the Cobra parser.
     *
     * @param url      URI passed to the parser together with the markup
     * @param pageData HTML markup to parse
     * @return the parsed document
     * @throws SAXException if the parser reports a parse error
     * @throws IOException  if reading the markup fails
     */
    public static Document cobraParse(String url, String pageData)
            throws SAXException, IOException {
        StringReader sReader = new StringReader(pageData);
        return cobraParse(url, sReader);
    }

    /**
     * Parses HTML from a character stream with the Cobra parser.
     * Scripting and external CSS loading are switched off on the user-agent
     * context before parsing.
     *
     * @param url      URI passed to the parser together with the stream
     * @param pageData character stream supplying the HTML markup
     * @return the parsed document
     * @throws SAXException if the parser reports a parse error
     * @throws IOException  if reading the stream fails
     */
    public static Document cobraParse(String url, Reader pageData)
            throws SAXException, IOException {
        SimpleUserAgentContext ucontext = new SimpleUserAgentContext();
        ucontext.setScriptingEnabled(false);
        ucontext.setExternalCSSEnabled(false);
        DocumentBuilderImpl builder = new DocumentBuilderImpl(ucontext);
        return builder.parse(new InputSourceImpl(pageData, url));
    }

    /**
     * Parses a well-formed XML string with the default JAXP DOM parser.
     *
     * <p>The text is handed to the parser as a character stream, so the result
     * does not depend on the JVM's default charset or on any encoding named in
     * the XML declaration (the string is already decoded).
     *
     * <p>NOTE(review): the factory is used with its default settings, so DTDs
     * and external entities are not explicitly disabled here; consider
     * hardening it if {@code pageBody} can come from an untrusted source.
     *
     * @param pageBody XML text to parse; must not be {@code null}
     * @return the parsed document
     * @throws ParserConfigurationException if no DOM builder can be created
     * @throws SAXException                 if the text is not well-formed XML
     * @throws IOException                  if reading the text fails
     */
    public static Document xmlGetDocument(String pageBody)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        // A byte stream would let the parser strip a byte-order mark itself;
        // on a character stream the BOM character has to be dropped by hand.
        String xml = pageBody;
        if (!xml.isEmpty() && xml.charAt(0) == '\uFEFF') {
            xml = xml.substring(1);
        }
        // Avoid String.getBytes(): it encodes with the platform charset, which
        // corrupts non-ASCII content whenever it differs from what the parser
        // assumes for the byte stream.
        try (StringReader reader = new StringReader(xml)) {
            return builder.parse(new org.xml.sax.InputSource(reader));
        }
    }

    /**
     * Cleans (possibly malformed) HTML with HtmlCleaner and serializes the
     * resulting tag tree into a DOM document.
     *
     * @param pageBody HTML markup to clean and convert
     * @return the DOM built from the cleaned tag tree
     * @throws XPatherException             kept in the signature for caller compatibility
     * @throws ParserConfigurationException if the DOM cannot be created
     */
    public static Document htmlCleanerParser(String pageBody)
            throws XPatherException, ParserConfigurationException {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode tagNode = cleaner.clean(pageBody);
        return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    }
}
package com.zzsn.download; package com.zzsn.download;
//import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import com.zzsn.util.StringUtil;
import org.htmlcleaner.BaseTokenImpl; import org.htmlcleaner.BaseTokenImpl;
import org.htmlcleaner.ContentNode; import org.htmlcleaner.ContentNode;
import org.htmlcleaner.TagNode; import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException; import org.htmlcleaner.XPatherException;
import org.jsoup.Jsoup;
import org.w3c.dom.*; import org.w3c.dom.*;
import javax.xml.xpath.*; import javax.xml.xpath.*;
...@@ -17,17 +13,14 @@ import java.util.List; ...@@ -17,17 +13,14 @@ import java.util.List;
public class PageBuilderParser { public class PageBuilderParser {
public NodeList parserNodeList(Object doc, String path) { public NodeList parserNodeList(Object doc, String path)
NodeList nodeList=null; throws XPathExpressionException
try { {
XPathFactory factory = XPathFactory.newInstance(); XPathFactory factory = XPathFactory.newInstance();
XPath xPath = factory.newXPath(); XPath xPath = factory.newXPath();
XPathExpression expression = xPath.compile(path); XPathExpression expression = xPath.compile(path);
nodeList = (NodeList) expression.evaluate( NodeList nodeList = (NodeList)expression.evaluate(
doc, XPathConstants.NODESET); doc, XPathConstants.NODESET);
}catch (XPathExpressionException e){
e.printStackTrace();
}
return nodeList; return nodeList;
} }
...@@ -70,45 +63,7 @@ public class PageBuilderParser { ...@@ -70,45 +63,7 @@ public class PageBuilderParser {
} }
return object; return object;
} }
/**
 * Evaluates an XPath expression against {@code doc}, trying the XPath result
 * types one after another until an evaluation succeeds.
 *
 * <p>The order is NODE, NODESET, STRING, NUMBER, BOOLEAN. A failed attempt
 * moves on to the next type; if the last one fails too, the stack trace is
 * printed and {@code null} is returned.
 *
 * @param doc  context object the expression is evaluated against
 * @param path XPath expression; {@code null} or blank yields {@code null}
 * @return the first successful evaluation result, or {@code null}
 * @throws XPathExpressionException if {@code path} cannot be compiled
 */
public Object parserNode(Object doc, String path) throws XPathExpressionException {
    if (path == null || path.trim().length() == 0) {
        return null;
    }
    XPathExpression compiled = XPathFactory.newInstance().newXPath().compile(path);

    try {
        return compiled.evaluate(doc, XPathConstants.NODE);
    } catch (XPathExpressionException nodeFailure) {
        // not evaluable as a single node; try a node set next
    }
    try {
        return compiled.evaluate(doc, XPathConstants.NODESET);
    } catch (XPathExpressionException nodeSetFailure) {
        // not evaluable as a node set; try a string next
    }
    try {
        return compiled.evaluate(doc, XPathConstants.STRING);
    } catch (XPathExpressionException stringFailure) {
        // not evaluable as a string; try a number next
    }
    try {
        return compiled.evaluate(doc, XPathConstants.NUMBER);
    } catch (XPathExpressionException numberFailure) {
        // not evaluable as a number; try a boolean last
    }
    try {
        return compiled.evaluate(doc, XPathConstants.BOOLEAN);
    } catch (XPathExpressionException lastFailure) {
        // every result type failed: report it and give up
        lastFailure.printStackTrace();
        return null;
    }
}
public String parserStrBr(Object doc, String path) public String parserStrBr(Object doc, String path)
throws XPathExpressionException throws XPathExpressionException
{ {
...@@ -407,14 +362,4 @@ public class PageBuilderParser { ...@@ -407,14 +362,4 @@ public class PageBuilderParser {
return true; return true;
} }
/**
 * Manual smoke check: parses a sample template snippet and prints the text
 * of its {@code exp} and {@code subtraction} elements.
 *
 * @param args unused
 * @throws Exception if the sample cannot be parsed or queried
 */
public static void main(String[] args) throws Exception {
    final PageBuilderParser parser = new PageBuilderParser();
    final String templateXml = "<content><exp>*.div[class=\"artText clearfix\"]</exp><subtraction>div[class=\"relateArt\"]</subtraction></content>";
    final Document template = HtmlPageParser.xmlGetDocument(templateXml);

    // Resolve both values before printing anything.
    final String expText = parser.parserStr(template, "//exp");
    final String subtractionText = parser.parserStr(template, "//subtraction");

    System.out.println(expText);
    System.out.println(subtractionText);
}
} }
package com.zzsn.download; package com.zzsn.download;
import com.gargoylesoftware.htmlunit.*; import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.zzsn.crawler.oracledb.OracleDBManager;
import com.zzsn.crawler.oracledb.OracleDataTable;
import com.zzsn.generation.Constants;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession; import javax.net.ssl.SSLSession;
import java.io.IOException; import java.io.IOException;
import java.net.*; import java.net.*;
import java.sql.SQLException;
public class PageConnectioner { public class PageConnectioner {
/**默认代理地址*/ /**默认代理地址*/
// public static String PROXY_ADDR = "proxy.zj.chinamobile.com"; public static String PROXY_ADDR = "proxy.zj.chinamobile.com";
private static final String PROXY_ADDR = "114.249.113.226"; // private static final String PROXY_ADDR = "114.249.113.226";
/**默认代理接口*/ /**默认代理接口*/
// public static int PROXY_PORT = 8080; public static int PROXY_PORT = 8080;
private static final int PROXY_PORT = 9000; // private static final int PROXY_PORT = 9000;
/**下载失败后的暂停时间*/ /**下载失败后的暂停时间*/
private static final long SLEEP_TIME = 5000; private static final long SLEEP_TIME = 5000;
/**构造下载使用的{@link HttpURLConnection} /**构造下载使用的{@link HttpURLConnection}
* @param urlstr 下载url * @param urlstr 下载url
* */ * */
protected HttpURLConnection connection(String urlstr) throws Exception { protected HttpURLConnection connection(String urlstr) throws Exception {
URL url = null; URL url = null;
// Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (Constants.PROXYID==5) { // if (pluginUtil.isNeedProxy()) {
String proxyIP = getProxyIP(); // ProxyIPGet count = ProxyIPGet.getInstance();
System.out.println("代理IP :"+proxyIP); // PROXY_ADDR=count.getIpaddre();
String[] proxys=proxyIP.split("-"); // PROXY_PORT=Integer.parseInt(count.getPort());
String proxyHost = proxys[0]; // Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
int proxyPort = Integer.parseInt(proxys[1]); // connection = (HttpURLConnection) url.openConnection(proxy);
String userName = proxys[2]; // } else {
String password = proxys[3]; // connection = (HttpURLConnection) url.openConnection();
//创建代理服务器 // }
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));
//设置代理的用户名密码
Authenticator.setDefault(new MyAuth(userName, password));
// 设定连接的相关参数
connection = (HttpURLConnection) url.openConnection(proxy);
}else {
connection = (HttpURLConnection) url.openConnection(); connection = (HttpURLConnection) url.openConnection();
}
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("accept", "*/*");
...@@ -68,82 +51,15 @@ public class PageConnectioner { ...@@ -68,82 +51,15 @@ public class PageConnectioner {
connection.setRequestProperty("referer", urlstr); connection.setRequestProperty("referer", urlstr);
} catch (Exception e) { } catch (Exception e) {
}
return connection;
}
static class MyAuth extends Authenticator
{
private String user;
private String pass;
public MyAuth(String user, String pass)
{
this.user = user;
this.pass = pass;
} }
@Override return connection;
protected PasswordAuthentication getPasswordAuthentication() {
return new PasswordAuthentication(user, pass.toCharArray());
}
}
/**
 * Reads the proxy entry from table {@code CIS_sys_Proxy} (row with ID 4).
 *
 * <p>Every cell of the result is scanned; the last cell longer than five
 * characters wins. When the query yields no rows a message is printed, and a
 * {@link SQLException} is reported via its stack trace.
 *
 * @return the proxy entry, or an empty string when none was found
 */
public static String getProxyIP() {
    final String query = "select proxy from CIS_sys_Proxy where ID = 4";
    String found = "";
    OracleDBManager manager = new OracleDBManager();
    String[] columns = null;
    int[] types = null;
    try {
        OracleDataTable table = manager.getResultData(columns, types, query);
        if (table == null || table.getRowCount() <= 0) {
            System.out.println("查询失败");
            return found;
        }
        for (int r = 0; r < table.getRowCount(); r++) {
            for (int c = 0; c < table.getColCoun(); c++) {
                String cell = table.getRow()[r][c];
                if (cell.length() > 5) {
                    found = cell;
                }
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
    return found;
}
/**
 * Builds an {@link HttpClient} that routes through the proxy returned by
 * {@code getProxyIP()} and authenticates against it.
 *
 * <p>The proxy entry is split on {@code '-'} and read as
 * host, port, user name, password (in that order).
 *
 * @return a client configured with the proxy route and its credentials
 */
public static HttpClient getHttpClient() {
    String[] parts = getProxyIP().split("-");
    String host = parts[0];
    int port = Integer.parseInt(parts[1]);
    String user = parts[2];
    String secret = parts[3];

    DefaultHttpClient client = new DefaultHttpClient();
    // Register the proxy credentials for exactly this host/port pair.
    AuthScope scope = new AuthScope(host, port);
    client.getCredentialsProvider().setCredentials(scope, new UsernamePasswordCredentials(user, secret));
    // Send every request through the proxy by default.
    client.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost(host, port));
    return client;
}
public static HttpClient getNoProxyHttpClient() {
String[] proxys=getProxyIP().split("-");
DefaultHttpClient httpClient = new DefaultHttpClient();
String proxyHost = proxys[0];
int proxyPort = Integer.parseInt(proxys[1]);
String userName = proxys[2];
String password = proxys[3];
httpClient.getCredentialsProvider().setCredentials(
new AuthScope(proxyHost, proxyPort),
new UsernamePasswordCredentials(userName, password));
HttpHost proxy = new HttpHost(proxyHost,proxyPort);
httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
return httpClient;
} }
/**构造下载使用的{@link HttpURLConnection} /**构造下载使用的{@link HttpURLConnection}
* @param urlstr 下载url (当参数类型是json字符串时调用) * @param urlstr 下载url (当参数类型是json字符串时调用)
* */ * */
public HttpURLConnection connection(String urlstr,String params) throws Exception { protected HttpURLConnection connection(String urlstr,String params) throws Exception {
URL url = null; URL url = null;
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT)); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpURLConnection connection = null; HttpURLConnection connection = null;
...@@ -151,11 +67,11 @@ public class PageConnectioner { ...@@ -151,11 +67,11 @@ public class PageConnectioner {
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) { // if (pluginUtil.isNeedProxy()) {
connection = (HttpURLConnection) url.openConnection(proxy); // connection = (HttpURLConnection) url.openConnection(proxy);
} else { // } else {
connection = (HttpURLConnection) url.openConnection(); // connection = (HttpURLConnection) url.openConnection();
} // }
connection = (HttpURLConnection) url.openConnection(); connection = (HttpURLConnection) url.openConnection();
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
...@@ -175,13 +91,13 @@ public class PageConnectioner { ...@@ -175,13 +91,13 @@ public class PageConnectioner {
//参数类型是json字符串用到 //参数类型是json字符串用到
connection.setRequestProperty("Content-Type","application/json"); connection.setRequestProperty("Content-Type","application/json");
} catch (Exception e) { } catch (Exception e) {
//
} }
return connection; return connection;
} }
/** /**
* 该方法为代理IP * 该方法为代理IP,目前用于豆瓣
*/ */
protected HttpURLConnection connection(String urlstr, String ipadd, String prot) throws Exception { protected HttpURLConnection connection(String urlstr, String ipadd, String prot) throws Exception {
URL url = null; URL url = null;
...@@ -191,11 +107,12 @@ public class PageConnectioner { ...@@ -191,11 +107,12 @@ public class PageConnectioner {
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) { // if (pluginUtil.isNeedProxy()) {
connection = (HttpURLConnection) url.openConnection(proxy); // connection = (HttpURLConnection) url.openConnection(proxy);
} else { // } else {
// connection = (HttpURLConnection) url.openConnection();
// }
connection = (HttpURLConnection) url.openConnection(); connection = (HttpURLConnection) url.openConnection();
}
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("accept", "*/*");
...@@ -206,7 +123,6 @@ public class PageConnectioner { ...@@ -206,7 +123,6 @@ public class PageConnectioner {
connection.setRequestProperty("X-Terminal-Type", "pc"); connection.setRequestProperty("X-Terminal-Type", "pc");
} catch (Exception e) { } catch (Exception e) {
} }
return connection; return connection;
} }
...@@ -217,11 +133,13 @@ public class PageConnectioner { ...@@ -217,11 +133,13 @@ public class PageConnectioner {
try { try {
url = new URL(urlstr); url = new URL(urlstr);
if (false) {
connection = (HttpsURLConnection) url.openConnection(proxy); // if (pluginUtil.isNeedProxy()) {
} else { // connection = (HttpsURLConnection) url.openConnection(proxy);
// } else {
// connection = (HttpsURLConnection) url.openConnection();
// }
connection = (HttpsURLConnection) url.openConnection(); connection = (HttpsURLConnection) url.openConnection();
}
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("accept", "*/*");
...@@ -232,7 +150,6 @@ public class PageConnectioner { ...@@ -232,7 +150,6 @@ public class PageConnectioner {
connection.setRequestProperty("X-Terminal-Type", "pc"); connection.setRequestProperty("X-Terminal-Type", "pc");
} catch (Exception e) { } catch (Exception e) {
} }
return connection; return connection;
} }
...@@ -243,9 +160,8 @@ public class PageConnectioner { ...@@ -243,9 +160,8 @@ public class PageConnectioner {
*/ */
protected HttpsURLConnection httpsconnection(String urlstr) throws Exception { protected HttpsURLConnection httpsconnection(String urlstr) throws Exception {
URL url = null; URL url = null;
// Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT)); Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_ADDR, PROXY_PORT));
HttpsURLConnection connection = null; HttpsURLConnection connection = null;
try {
trustAllHttpsCertificates(); trustAllHttpsCertificates();
HostnameVerifier hv = new HostnameVerifier() { HostnameVerifier hv = new HostnameVerifier() {
@Override @Override
...@@ -255,41 +171,28 @@ public class PageConnectioner { ...@@ -255,41 +171,28 @@ public class PageConnectioner {
}; };
HttpsURLConnection.setDefaultHostnameVerifier(hv); HttpsURLConnection.setDefaultHostnameVerifier(hv);
try{
url = new URL(urlstr); url = new URL(urlstr);
if (Constants.PROXYID==1) { // if (pluginUtil.isNeedProxy()) {
String proxyIP = getProxyIP();
System.out.println("代理IP :"+proxyIP);
String[] proxys=proxyIP.split("-");
String proxyHost = proxys[0];
int proxyPort = Integer.parseInt(proxys[1]);
String userName = proxys[2];
String password = proxys[3];
//创建代理服务器
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));
//设置代理的用户名密码
Authenticator.setDefault(new MyAuth(userName, password));
// 设定连接的相关参数
connection = (HttpsURLConnection) url.openConnection(proxy);
}else {
connection = (HttpsURLConnection) url.openConnection();
}
// if (false) {
// connection = (HttpsURLConnection) url.openConnection(proxy); // connection = (HttpsURLConnection) url.openConnection(proxy);
// } else { // } else {
// connection = (HttpsURLConnection) url.openConnection(); // connection = (HttpsURLConnection) url.openConnection();
// } // }
connection = (HttpsURLConnection) url.openConnection();
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive"); connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"); connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
} catch (Exception e) {
//
} }
return connection; catch(Exception e){
} }
return connection;
}
/**构造下载使用的{@link HttpsURLConnection} /**构造下载使用的{@link HttpsURLConnection}
* @param urlstr 下载url * @param urlstr 下载url
* @return * @return
...@@ -311,11 +214,7 @@ public class PageConnectioner { ...@@ -311,11 +214,7 @@ public class PageConnectioner {
try{ try{
String[] headerParam=params.split("\\|"); String[] headerParam=params.split("\\|");
url = new URL(urlstr); url = new URL(urlstr);
if (false) {
connection = (HttpsURLConnection) url.openConnection(proxy);
} else {
connection = (HttpsURLConnection) url.openConnection(); connection = (HttpsURLConnection) url.openConnection();
}
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
connection.setReadTimeout(5000); connection.setReadTimeout(5000);
connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("accept", "*/*");
...@@ -332,7 +231,6 @@ public class PageConnectioner { ...@@ -332,7 +231,6 @@ public class PageConnectioner {
} }
} }
catch(Exception e){ catch(Exception e){
} }
return connection; return connection;
...@@ -354,9 +252,8 @@ public class PageConnectioner { ...@@ -354,9 +252,8 @@ public class PageConnectioner {
break; break;
} catch (Exception e1) { } catch (Exception e1) {
try { try {
Thread.sleep(2000); Thread.sleep(10000);
} catch (InterruptedException e2) { } catch (InterruptedException e2) {
//
} }
} }
} }
...@@ -415,18 +312,10 @@ public class PageConnectioner { ...@@ -415,18 +312,10 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
connection = this.connection(url,headerParams); pg = new PageGet(url, encoding, this.connection(url,headerParams));
pg = new PageGet(url, encoding, connection); } catch (Exception e3) {
} catch (Exception e1) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -461,23 +350,17 @@ public class PageConnectioner { ...@@ -461,23 +350,17 @@ public class PageConnectioner {
*/ */
protected String staticConnectByGet(String url, String encoding) { protected String staticConnectByGet(String url, String encoding) {
//循环访问的超时时间 //循环访问的超时时间
long exitTimeDis = 3000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
connection = this.connection(url); pg = new PageGet(url, encoding, this.connection(url));
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
pg.urlConnectionGet(); pg.urlConnectionGet();
docBody = pg.getPageStr(); docBody = pg.getPageStr();
...@@ -509,18 +392,13 @@ public class PageConnectioner { ...@@ -509,18 +392,13 @@ public class PageConnectioner {
* @return * @return
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame) {
long exitTimeDis = 10000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
HttpsURLConnection connection = null;
try { try {
connection = this.httpsconnection(url); pg = new PageGet(url, encoding, this.httpsconnection(url));
pg = new PageGet(url, encoding, connection);
} catch (Exception e3) { } catch (Exception e3) {
//
} finally {
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
...@@ -565,7 +443,7 @@ public class PageConnectioner { ...@@ -565,7 +443,7 @@ public class PageConnectioner {
* @return * @return
*/ */
protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame,String params) { protected String staticHttpsConnectByGet(String url, String encoding, boolean bFrame,String params) {
long exitTimeDis = 3000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PageGet pg = null; PageGet pg = null;
...@@ -598,57 +476,6 @@ public class PageConnectioner { ...@@ -598,57 +476,6 @@ public class PageConnectioner {
} }
return docBody; return docBody;
} }
/**
* http get方法下载 static链接网页
* @param url 下载链接
* @param encoding 页面编码
* @return 下载的内容
* 目前用于豆瓣
*/
// protected String staticConnectPoxyByGet(String url, String encoding, ProxyIPGet count) {
//
// // 循环访问的超时时间
// long exitTimeDis = 30000;
//
// long startDownTime = System.currentTimeMillis();
// PageGet pg = null;
// String docBody = null;
// try {
// pg = new PageGet(url, encoding, this.httpsconnection(url, count.getIpaddre(), count.getPort()));
// } catch (Exception e3) {
// logUtil.getLogger().error(String.format("ORMSG: The site server access denied , EXCEPTION: %s ",
// ExceptionUtil.getExceptionStr(e3)));
// return docBody;
// }
//
// try {
// pg.urlHttpsConnectionGet();
// docBody = pg.getPageStr();
// } catch (Exception e) {
// count.getNewProxyInfo(1, false);
//
// // 访问失败后,尝试3次重连
// for (int i = 0; i < 3; i++) {
// long currTime = System.currentTimeMillis();
// if (currTime - startDownTime >= exitTimeDis) {
// break;
// }
// try {
// pg.urlConnectionGet();
// docBody = pg.getPageStr();
// break;
// } catch (Exception e1) {
// try {
// Thread.sleep(SLEEP_TIME);
// } catch (InterruptedException e2) {
// }
// }
// }
// }
// return docBody;
// }
/** /**
...@@ -658,29 +485,21 @@ public class PageConnectioner { ...@@ -658,29 +485,21 @@ public class PageConnectioner {
* @param postParam post参数,格式为raw(A=a&B=b) * @param postParam post参数,格式为raw(A=a&B=b)
* @return 下载的内容 * @return 下载的内容
*/ */
public String staticConnectByPost(String url, String encoding, String postParam) { protected String staticConnectByPost(String url, String encoding, String postParam) {
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂 if (postParam != null && postParam.contains("[Content-type]")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
connection = this.connection(url,param); pp = new PagePost(url, encoding, this.connection(url,param),param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
connection = this.connection(url); pp = new PagePost(url, encoding, this.connection(url), postParam);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -710,6 +529,7 @@ public class PageConnectioner { ...@@ -710,6 +529,7 @@ public class PageConnectioner {
/** /**
* http post方法下载 static链接网页 * http post方法下载 static链接网页
* @param url 下载链接 * @param url 下载链接
* @param encoding 页面编码
* @param postParam post参数,格式为raw(A=a&B=b) * @param postParam post参数,格式为raw(A=a&B=b)
* @return 下载的内容 * @return 下载的内容
*/ */
...@@ -719,23 +539,15 @@ public class PageConnectioner { ...@@ -719,23 +539,15 @@ public class PageConnectioner {
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
PagePost pp = null; PagePost pp = null;
String docBody = null; String docBody = null;
HttpURLConnection connection = null;
try { try {
if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂 if (postParam!= null && postParam.contains("{")&& postParam.contains(":")) { // 仅用于 鹏云课堂
String param = postParam.replace("[Content-type]", ""); String param = postParam.replace("[Content-type]", "");
connection = this.connection(url,param); pp = new PagePost(url, encoding, this.connection(url,param),param);
pp = new PagePost(url, encoding, connection,param);
}else{ }else{
connection = this.connection(url); pp = new PagePost(url, encoding, this.connection(url), postParam);
pp = new PagePost(url, encoding, connection, postParam);
} }
} catch (Exception e3) { } catch (Exception e3) {
assert connection != null;
connection.disconnect();
return docBody; return docBody;
}finally {
assert connection != null;
connection.disconnect();
} }
try { try {
...@@ -772,18 +584,13 @@ public class PageConnectioner { ...@@ -772,18 +584,13 @@ public class PageConnectioner {
long exitTimeDis = 30000; long exitTimeDis = 30000;
long startDownTime = System.currentTimeMillis(); long startDownTime = System.currentTimeMillis();
HttpsURLConnection connection = null;
PagePost pp = null; PagePost pp = null;
try { try {
connection = this.httpsconnection(url); pp = new PagePost(url, encoding, this.httpsconnection(url),param);
pp = new PagePost(url, encoding, connection, param);
} catch (Exception e3) { } catch (Exception e3) {
// // TODO Auto-generated catch block
} finally { e3.printStackTrace();
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
try { try {
pp.urlHttpsConnectionPost(); pp.urlHttpsConnectionPost();
...@@ -829,19 +636,13 @@ public class PageConnectioner { ...@@ -829,19 +636,13 @@ public class PageConnectioner {
webClient.setAjaxController(new NicelyResynchronizingAjaxController()); webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
if (false) {
ProxyConfig proxyConfig = new ProxyConfig(PROXY_ADDR, PROXY_PORT);
webClient.getOptions().setProxyConfig(proxyConfig);
}
String pageStr="";
try {
HtmlPage htmlPage = webClient.getPage(urlstr);
webClient.waitForBackgroundJavaScript(300000);
pageStr = htmlPage.asXml();
}catch (Exception e){
}finally { HtmlPage htmlPage = webClient.getPage(urlstr);
webClient.waitForBackgroundJavaScript(600000);
String pageStr = htmlPage.asXml();
webClient.close(); webClient.close();
if (bSave) {
} }
return pageStr; return pageStr;
} }
...@@ -867,10 +668,6 @@ public class PageConnectioner { ...@@ -867,10 +668,6 @@ public class PageConnectioner {
webClient.getOptions().setTimeout(20000);//设置“浏览器”的请求超时时间 webClient.getOptions().setTimeout(20000);//设置“浏览器”的请求超时时间
webClient.setJavaScriptTimeout(30000);//设置JS执行的超时时间 webClient.setJavaScriptTimeout(30000);//设置JS执行的超时时间
// ProxyConfig proxyConfig = new ProxyConfig(PROXY_ADDR, PROXY_PORT); // ProxyConfig proxyConfig = new ProxyConfig(PROXY_ADDR, PROXY_PORT);
if (false) {
ProxyConfig proxyConfig = new ProxyConfig(PROXY_ADDR, PROXY_PORT);
webClient.getOptions().setProxyConfig(proxyConfig);
}
String pageStr = null; String pageStr = null;
try { try {
Page page = webClient.getPage(urlstr); Page page = webClient.getPage(urlstr);
...@@ -879,14 +676,10 @@ public class PageConnectioner { ...@@ -879,14 +676,10 @@ public class PageConnectioner {
webClient.waitForBackgroundJavaScript(5000); //阻塞线程 webClient.waitForBackgroundJavaScript(5000); //阻塞线程
pageStr = htmlPage.asXml(); pageStr = htmlPage.asXml();
} }
// else if (page instanceof JavaScriptPage) {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// pageStr = scriptPage.getContent();
// }
} catch (Exception e) { } catch (Exception e) {
}finally {
webClient.close();
} }
webClient.close();
return pageStr; return pageStr;
} }
......
...@@ -3,12 +3,8 @@ package com.zzsn.download; ...@@ -3,12 +3,8 @@ package com.zzsn.download;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import com.zzsn.crawler.PaserSiteDownload;
import com.zzsn.crawler.uriparser.HtmlPageParser;
import lombok.extern.slf4j.Slf4j;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import javax.net.ssl.HttpsURLConnection;
import java.io.*; import java.io.*;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
import java.net.MalformedURLException; import java.net.MalformedURLException;
...@@ -19,7 +15,6 @@ import java.util.Map; ...@@ -19,7 +15,6 @@ import java.util.Map;
import java.util.Timer; import java.util.Timer;
import java.util.TimerTask; import java.util.TimerTask;
@Slf4j
public class PageDownloader { public class PageDownloader {
private int interval = 5000; private int interval = 5000;
private long lastDownloadTime = -1; private long lastDownloadTime = -1;
...@@ -48,9 +43,8 @@ public class PageDownloader { ...@@ -48,9 +43,8 @@ public class PageDownloader {
} }
// 如果页面编码格式未知,则从页面中获取该页面编码格式 // 如果页面编码格式未知,则从页面中获取该页面编码格式
public String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException { private String getEncodingFromHtmlFile(String urlstr, HttpURLConnection connection) throws IOException {
String encoding = null;
try {
connection.setRequestMethod("GET"); connection.setRequestMethod("GET");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) " connection.setRequestProperty("User-Agent", "Mozilla/5.0 " + "(Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) "
+ "Gecko/20080404 Firefox/2.0.0.14"); + "Gecko/20080404 Firefox/2.0.0.14");
...@@ -58,6 +52,7 @@ public class PageDownloader { ...@@ -58,6 +52,7 @@ public class PageDownloader {
connection.setRequestProperty("Cookie", "auth=token"); connection.setRequestProperty("Cookie", "auth=token");
String contentType = connection.getHeaderField("Content-Type"); String contentType = connection.getHeaderField("Content-Type");
String encoding = null;
if (contentType != null) { if (contentType != null) {
String temp = "charset="; String temp = "charset=";
int m = contentType.indexOf(temp); int m = contentType.indexOf(temp);
...@@ -66,23 +61,16 @@ public class PageDownloader { ...@@ -66,23 +61,16 @@ public class PageDownloader {
} }
} }
if (encoding == null) { if (encoding == null) {
InputStream is = null;
try { try {
InputStream is = null;
is = connection.getInputStream(); is = connection.getInputStream();
BufferedInputStream bufferedInputStream = new BufferedInputStream(is); BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
encoding = EncodeDetector.getEncoding(bufferedInputStream); encoding = EncodeDetector.getEncoding(bufferedInputStream);
} catch (Exception e) {
//
}finally {
assert is != null;
is.close(); is.close();
} catch (Exception e) {
} }
} }
} catch (Exception e) {
//
} finally {
connection.disconnect(); connection.disconnect();
}
return encoding; return encoding;
} }
...@@ -91,12 +79,18 @@ public class PageDownloader { ...@@ -91,12 +79,18 @@ public class PageDownloader {
Document doc = null; Document doc = null;
String docBody=""; String docBody="";
if (false) {
docBody = downloadPoxyWithStr(url,encoding,bDynamic,bFrame);
}else{
docBody = downloadWithStr(url,encoding,bDynamic,bFrame); docBody = downloadWithStr(url,encoding,bDynamic,bFrame);
}
if (docBody != null) { if (docBody != null) {
// 测试导出文件
// byte[] buff=new byte[]{};
// buff=docBody.getBytes();
// FileOutputStream out=null;
// try {
// out = new FileOutputStream("D://out.txt");
// out.write(buff,0,buff.length);
// } catch (IOException e1) {
// e1.printStackTrace();
// }
try { try {
doc = HtmlPageParser.htmlCleanerParser(docBody); doc = HtmlPageParser.htmlCleanerParser(docBody);
} catch (Exception e) { } catch (Exception e) {
...@@ -160,32 +154,42 @@ public class PageDownloader { ...@@ -160,32 +154,42 @@ public class PageDownloader {
return doc; return doc;
} }
/** String接口,主要针对html网页,通过get方式获取,动态或者静态链接,bFrame为false时一般是解析json格式书籍*/ /** String接口,主要针对html网页,通过get方式获取,动态或者静态链接,bFrame为false时一般是解析json格式书籍*/
public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) { public String downloadWithStrBak(String url, String encoding, boolean bDynamic,boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval){ if (interval > 0 && lastDownloadTime > 0 && dis < interval)
{
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
// long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner(); PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null;
try { try {
if (encoding == null || encoding.isEmpty()) {//获取网站编码 connection = pConn.connection(url);
PaserSiteDownload paserSiteDownload=new PaserSiteDownload(); if (encoding == null || encoding.isEmpty()) {
encoding = paserSiteDownload.locateCharSet(url); encoding = getEncodingFromHtmlFile(url, connection);
} }
} catch (Exception e1) { } catch (Exception e1) {
log.info("获取编码失败"); // e1.printStackTrace();
encoding="utf-8";
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding); docBody = pConn.dynamicConnectByGet(url, encoding);
} else { } else {
// this.bDownloadUseFrame=true;
if (bFrame && this.bDownloadUseFrame) { if (bFrame && this.bDownloadUseFrame) {
String body = null; String body = null;
try { try {
body = pConn.downloadByWebClient(url); body = pConn.downloadByWebClient(url);
this.lastDownloadTime = System.currentTimeMillis(); this.lastDownloadTime = System.currentTimeMillis();
} catch (Exception e) { } catch (FailingHttpStatusCodeException e) {
return body; e.printStackTrace();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} }
// 不是connection reset等错误抛出导致的body=""的情况下的badpage才算真正的badpage // 不是connection reset等错误抛出导致的body=""的情况下的badpage才算真正的badpage
if (isBadDownloadPage(body) && this.badPage) { if (isBadDownloadPage(body) && this.badPage) {
...@@ -197,11 +201,12 @@ public class PageDownloader { ...@@ -197,11 +201,12 @@ public class PageDownloader {
} }
if(url.contains("https:")){ if(url.contains("https:")){
try { try {
connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8"; encoding = "utf-8";
} }
} catch (Exception e1) { } catch (Exception e1) {
// // e1.printStackTrace();
} }
docBody = pConn.staticHttpsConnectByGet(url, encoding,false); docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
}else{ }else{
...@@ -212,92 +217,52 @@ public class PageDownloader { ...@@ -212,92 +217,52 @@ public class PageDownloader {
return docBody; return docBody;
} }
public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic,boolean bFrame,String headerParams) { /** String接口,主要针对html网页,通过get方式获取,动态或者静态链接,bFrame为false时一般是解析json格式书籍*/
public String downloadWithStr(String url, String encoding, boolean bDynamic,boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval) if (interval > 0 && lastDownloadTime > 0 && dis < interval)
{ {
/*try {
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
logUtil.getLogger().info(info);
Thread.sleep(dis+2000);
} catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e )));
}*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
long startDtime = System.currentTimeMillis(); // long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner(); PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
connection = pConn.connection(url,headerParams); connection = pConn.connection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = getEncodingFromHtmlFile(url, connection); encoding = getEncodingFromHtmlFile(url, connection);
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally { encoding="utf-8";
assert connection != null;
connection.disconnect();
} }
String docBody = null; String docBody = null;
if (bDynamic) { if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding); docBody = pConn.dynamicConnectByGet(url, encoding);
} else { } else {
if (bFrame && this.bDownloadUseFrame) {
String body = null;
try {
body = pConn.downloadByWebClient(url);
this.lastDownloadTime = System.currentTimeMillis();
} catch (FailingHttpStatusCodeException e) {
// e.printStackTrace();
} catch (MalformedURLException e) {
// e.printStackTrace();
} catch (IOException e) {
// e.printStackTrace();
}
// 不是connection reset等错误抛出导致的body=""的情况下的badpage才算真正的badpage
if (isBadDownloadPage(body) && this.badPage) {
return body;
}
if (body != null) {
return body;
}
}
if(url.contains("https:")){ if(url.contains("https:")){
try {
// connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8";
}
} catch (Exception e1) {
// e1.printStackTrace();
}
if(headerParams!=null ||headerParams.length()>0){
docBody = pConn.staticHttpsConnectByGet(url, encoding,false,headerParams);
}else{
docBody = pConn.staticHttpsConnectByGet(url, encoding,false); docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
}
// if(url.contains("https://www.miit.gov.cn/search-front-server")||url.contains("https://read.douban.com/api")||url.contains("https://wxa.jd.com")||url.contains("https://codemart.com/")||url.contains("https://frodo.douban.com")||url.contains("reader.qq.com")||url.contains("https://www.gongzicp.com")||url.contains("https://appapi.hongxiu.com")){
// docBody = pConn.staticHttpsConnectByGet(url, encoding,false,headerParams);
// }else{
// docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
// }
}else{ }else{
docBody = pConn.staticConnectByGet(url, encoding,headerParams); docBody = pConn.staticConnectByGet(url, encoding);
} }
} }
this.lastDownloadTime = System.currentTimeMillis(); this.lastDownloadTime = System.currentTimeMillis();
return docBody; return docBody;
} }
public static String os = System.getProperty("os.name");
/** String接口,目前用于豆瓣图书的爬取 */
public String downloadPoxyWithStr(String url, String encoding, boolean bDynamic, boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime; public String downloadWithStrAddHeader(String url, String encoding, boolean bDynamic,boolean bFrame,String headerParams) {
if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval)
{
/*try {
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
logUtil.getLogger().info(info);
Thread.sleep(dis+2000);
} catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s",ExceptionUtil.getExceptionStr(e )));
}*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000); String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
new PageDownloader(dis+2000); new PageDownloader(dis+2000);
} }
...@@ -305,7 +270,7 @@ public class PageDownloader { ...@@ -305,7 +270,7 @@ public class PageDownloader {
PageConnectioner pConn = new PageConnectioner(); PageConnectioner pConn = new PageConnectioner();
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
connection = pConn.connection(url); connection = pConn.connection(url,headerParams);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = getEncodingFromHtmlFile(url, connection); encoding = getEncodingFromHtmlFile(url, connection);
} }
...@@ -336,75 +301,34 @@ public class PageDownloader { ...@@ -336,75 +301,34 @@ public class PageDownloader {
return body; return body;
} }
} }
docBody = pConn.staticConnectByGet(url, encoding); if(url.contains("https:")){
if (isBadDownloadPage(docBody) && this.badPage) {
return docBody;
}
}
this.lastDownloadTime = System.currentTimeMillis();
return docBody;
}
/** String接口,目前用于豆瓣API图书的爬取 */
public String downloadPoxyWithStrAPI(String url, String encoding, boolean bDynamic, boolean bFrame) {
long dis = System.currentTimeMillis() - lastDownloadTime;
if (interval > 0 && lastDownloadTime > 0 && dis < interval) {
/*try {
String info = String.format("ORMSG: Wait for next dl %dms", dis + 2000);
logUtil.getLogger().info(info);
Thread.sleep(dis + 2000);
} catch (InterruptedException e) {
logUtil.getLogger().error(String.format("PAMSG: The site server access denied, EXCEPTION: %s",
ExceptionUtil.getExceptionStr(e)));
}*/
String info = String.format("ORMSG: Wait for next dl %dms", dis+2000);
new PageDownloader(dis+2000);
}
long startDtime = System.currentTimeMillis();
PageConnectioner pConn = new PageConnectioner();
HttpsURLConnection connection = null;
try { try {
connection = pConn.httpsconnection(url); connection = pConn.httpsconnection(url);
if (encoding == null || encoding.isEmpty()) { if (encoding == null || encoding.isEmpty()) {
encoding = getEncodingFromHtmlFile(url, connection); encoding = "utf-8";
} }
} catch (Exception e1) { } catch (Exception e1) {
// e1.printStackTrace(); // e1.printStackTrace();
}finally {
assert connection != null;
connection.disconnect();
}
String docBody = null;
if (bDynamic) {
docBody = pConn.dynamicConnectByGet(url, encoding);
} else {
if (bFrame && this.bDownloadUseFrame) {
String body = null;
try {
body = pConn.downloadByWebClient(url);
this.lastDownloadTime = System.currentTimeMillis();
} catch (FailingHttpStatusCodeException e) {
e.printStackTrace();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
// 不是connection reset等错误抛出导致的body=""的情况下的badpage才算真正的badpage
if (isBadDownloadPage(body) && this.badPage) {
// logUtil.getLogger().info("ORMSG: This page is bad downloadPage");
return body;
} }
if (body != null) { if(headerParams!=null ||headerParams.length()>0){
return body; docBody = pConn.staticHttpsConnectByGet(url, encoding,false,headerParams);
}else{
docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
} }
// if(url.contains("https://www.miit.gov.cn/search-front-server")||url.contains("https://read.douban.com/api")||url.contains("https://wxa.jd.com")||url.contains("https://codemart.com/")||url.contains("https://frodo.douban.com")||url.contains("reader.qq.com")||url.contains("https://www.gongzicp.com")||url.contains("https://appapi.hongxiu.com")){
// docBody = pConn.staticHttpsConnectByGet(url, encoding,false,headerParams);
// }else{
// docBody = pConn.staticHttpsConnectByGet(url, encoding,false);
// }
}else{
docBody = pConn.staticConnectByGet(url, encoding,headerParams);
} }
docBody = pConn.staticConnectByGet(url, encoding);
} }
this.lastDownloadTime = System.currentTimeMillis(); this.lastDownloadTime = System.currentTimeMillis();
return docBody; return docBody;
} }
public static String os = System.getProperty("os.name");
// String接口,主要针对html网页或者json网页,通过post方式获取,默认静态链接 // String接口,主要针对html网页或者json网页,通过post方式获取,默认静态链接
public String downloadWithStr(String url, String encoding, String param) { public String downloadWithStr(String url, String encoding, String param) {
...@@ -478,7 +402,7 @@ public class PageDownloader { ...@@ -478,7 +402,7 @@ public class PageDownloader {
} }
// 判断网页是否是badpage // 判断网页是否是badpage
public boolean isBadDownloadPage(String body) { private boolean isBadDownloadPage(String body) {
try { try {
if (body.length() < 100) { if (body.length() < 100) {
return true; return true;
...@@ -486,19 +410,17 @@ public class PageDownloader { ...@@ -486,19 +410,17 @@ public class PageDownloader {
Document doc = HtmlPageParser.htmlCleanerParser(body); Document doc = HtmlPageParser.htmlCleanerParser(body);
PageBuilderParser parser = new PageBuilderParser(); PageBuilderParser parser = new PageBuilderParser();
String title = parser.parserStr(doc, "//title"); String title = parser.parserStr(doc, "//title");
if (title.equals("错误")) { //访问页面不存在 错误消息 if (title.equals("错误消息")) { //访问页面不存在 错误消息
return true; return true;
} }
if (title.contains("访问页面不存在")) { //访问页面不存在 错误消息 if (title.contains("访问页面不存在")) { //访问页面不存在 错误消息
return true; return true;
} }
if (title.contains("404")) { //访问页面不存在 错误消息 if (title.contains("纵横中文网")) { //访问页面不存在 错误消息
return true;
}
if (title.contains("Not Found")) { //访问页面不存在 错误消息
return true; return true;
} }
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block
return true; return true;
} }
return false; return false;
......
...@@ -73,6 +73,7 @@ public class PageGet { ...@@ -73,6 +73,7 @@ public class PageGet {
buffer.append("\r\n"); buffer.append("\r\n");
} }
} catch (Exception e) { } catch (Exception e) {
this.connection.disconnect(); this.connection.disconnect();
} finally { } finally {
try { try {
...@@ -133,6 +134,7 @@ public class PageGet { ...@@ -133,6 +134,7 @@ public class PageGet {
buffer.append("\r\n"); buffer.append("\r\n");
} }
} catch (Exception e) { } catch (Exception e) {
} finally { } finally {
try { try {
if (in != null) { if (in != null) {
...@@ -146,8 +148,8 @@ public class PageGet { ...@@ -146,8 +148,8 @@ public class PageGet {
this.pageStr = buffer.toString(); this.pageStr = buffer.toString();
} }
private static final int MAX_SOKET_TIMEOUT =3000; private static final int MAX_SOKET_TIMEOUT =30000;
private static final int MAX_CONNECTION_TIMEOUT = 3000; private static final int MAX_CONNECTION_TIMEOUT = 30000;
private static final RequestConfig REQCONFIG = RequestConfig.custom() private static final RequestConfig REQCONFIG = RequestConfig.custom()
.setSocketTimeout(MAX_SOKET_TIMEOUT).setConnectTimeout(MAX_CONNECTION_TIMEOUT) .build(); .setSocketTimeout(MAX_SOKET_TIMEOUT).setConnectTimeout(MAX_CONNECTION_TIMEOUT) .build();
public void httpClientGet() { public void httpClientGet() {
...@@ -200,9 +202,9 @@ public class PageGet { ...@@ -200,9 +202,9 @@ public class PageGet {
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
// webClient.getOptions().setTimeout(500000); // webClient.getOptions().setTimeout(500000);
ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port); ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
if (false) { // if (pluginUtil.isNeedProxy()) {
webClient.getOptions().setProxyConfig(proxyConfig); // webClient.getOptions().setProxyConfig(proxyConfig);
} // }
try { try {
Page page = webClient.getPage(this.url); Page page = webClient.getPage(this.url);
if (page instanceof HtmlPage) { if (page instanceof HtmlPage) {
...@@ -210,10 +212,6 @@ public class PageGet { ...@@ -210,10 +212,6 @@ public class PageGet {
// webClient.waitForBackgroundJavaScript(600000); // webClient.waitForBackgroundJavaScript(600000);
this.setPageStr(htmlPage.asXml()); this.setPageStr(htmlPage.asXml());
} }
// else if (page instanceof JavaScriptPage) {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// this.setPageStr(scriptPage.getContent());
// }
} catch (Exception e) { } catch (Exception e) {
} }
......
package com.zzsn.download; package com.zzsn.download;
import com.gargoylesoftware.htmlunit.*; import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
...@@ -179,10 +182,6 @@ public class PagePost { ...@@ -179,10 +182,6 @@ public class PagePost {
// webClient.Headers.Add("ContentLength", param.Length.ToString()); // webClient.Headers.Add("ContentLength", param.Length.ToString());
// byte[] responseData = webClient.UploadData("https://api.weibo.com/oauth2/access_token", "POST", bytes); // byte[] responseData = webClient.UploadData("https://api.weibo.com/oauth2/access_token", "POST", bytes);
ProxyConfig proxyConfig = new ProxyConfig(Proxy_Addr, Proxy_Port);
if (false) {
webClient.getOptions().setProxyConfig(proxyConfig);
}
try { try {
Page page = webClient.getPage(this.url); Page page = webClient.getPage(this.url);
if (page instanceof HtmlPage) { if (page instanceof HtmlPage) {
...@@ -190,10 +189,6 @@ public class PagePost { ...@@ -190,10 +189,6 @@ public class PagePost {
// webClient.waitForBackgroundJavaScript(600000); // webClient.waitForBackgroundJavaScript(600000);
this.setPageStr(htmlPage.asXml()); this.setPageStr(htmlPage.asXml());
} }
// else if (page instanceof JavaScriptPage) {
// JavaScriptPage scriptPage = (JavaScriptPage) page;
// this.setPageStr(scriptPage.getContent());
// }
} catch (Exception e) { } catch (Exception e) {
} }
......
...@@ -26,10 +26,18 @@ import org.slf4j.LoggerFactory; ...@@ -26,10 +26,18 @@ import org.slf4j.LoggerFactory;
public class RequestUtil { public class RequestUtil {
private static volatile RequestUtil instance=null; //保证 instance 在所有线程中同步
private RequestUtil(){} //private 避免类在外部被实例化
public static synchronized RequestUtil getInstance()
{
//getInstance 方法前加同步
if(instance==null)
{
instance=new RequestUtil();
}
return instance;
}
private static Logger log = LoggerFactory.getLogger(RequestUtil.class); private static Logger log = LoggerFactory.getLogger(RequestUtil.class);
public static String getTaotiaoData(String url) throws Exception { public static String getTaotiaoData(String url) throws Exception {
HttpClientBuilder builder = HttpClients.custom(); HttpClientBuilder builder = HttpClients.custom();
//对照UA字串的标准格式理解一下每部分的意思 //对照UA字串的标准格式理解一下每部分的意思
...@@ -215,8 +223,9 @@ public class RequestUtil { ...@@ -215,8 +223,9 @@ public class RequestUtil {
} }
return jsonObject; return jsonObject;
} }
public static String httpGetRequest(String url) throws Exception {
String result = null; public String httpGetRequest(String url) throws Exception {
String result = "";
HttpClientBuilder builder = HttpClients.custom(); HttpClientBuilder builder = HttpClients.custom();
//对照UA字串的标准格式理解一下每部分的意思 //对照UA字串的标准格式理解一下每部分的意思
builder.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)"); builder.setUserAgent("Mozilla/5.0(Windows;U;Windows NT 5.1;en-US;rv:0.9.4)");
...@@ -232,7 +241,7 @@ public class RequestUtil { ...@@ -232,7 +241,7 @@ public class RequestUtil {
} }
EntityUtils.consume(resEntity); EntityUtils.consume(resEntity);
} catch (IOException e) { } catch (IOException e) {
// return get(url, tts++); result="";
} finally { } finally {
response.close(); response.close();
httpClient.close(); httpClient.close();
......
package com.zzsn.download;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static string helpers: flag conversion, whitespace normalisation, bracket
 * stripping, full-width to half-width conversion, list (de)serialisation and
 * a small set of HTML entity replacements.
 *
 * <p>All methods are stateless. Unless a method says otherwise, passing
 * {@code null} results in a {@link NullPointerException}.
 */
public class StringUtil {

    /**
     * Opening bracket mapped to its closing counterpart.
     * NOTE(review): the original code registered '[' twice; the second entry
     * may have been meant for the full-width square bracket - confirm.
     */
    private static final Map<Character, Character> OPEN_TO_CLOSE = new HashMap<Character, Character>();

    /** Closing bracket mapped to its opening counterpart (inverse of {@link #OPEN_TO_CLOSE}). */
    private static final Map<Character, Character> CLOSE_TO_OPEN = new HashMap<Character, Character>();

    /** Closing brackets that {@link #trimMiddleBracket(String)} strips once all pairs are removed. */
    private static final String[] CLOSING_BRACKETS = { "】", "]", "）", ")", "〕" };

    /**
     * Literal replacements applied, in this order, by {@link #normalizeHtmlTransf(String)}.
     * NOTE(review): "&amp;amp;" is decoded before the later entities, so doubly
     * escaped input such as "&amp;amp;lt;" ends up as "&lt;". Kept as-is because
     * callers may rely on it - confirm whether that is intended.
     */
    private static final String[][] HTML_REPLACEMENTS = {
        { "&bull;", "·" },
        { "&middot;", "·" },
        { "&nbsp;", " " },
        { "&quot;", "\"" },
        { "&amp;", "&" },
        { "・", "·" },
        { "&ldquo;", "\"" },
        { "&rdquo;", "\"" },
        { "&hellip;", "..." },
        { "&lt;", "<" },
        { "&gt;", ">" },
        { "&mdash;", "—" },
        { "&ndash;", "–" },
        { "&tilde;", "~" },
        { "&lsquo;", "'" },
        { "&rsquo;", "'" },
        { "&sbquo;", "," },
        { "&lsaquo;", "‹" },
        { "&rsaquo;", "›" },
        { "|", " " },
    };

    static {
        OPEN_TO_CLOSE.put('【', '】');
        OPEN_TO_CLOSE.put('[', ']');
        OPEN_TO_CLOSE.put('（', '）');
        OPEN_TO_CLOSE.put('(', ')');
        OPEN_TO_CLOSE.put('〔', '〕');
        for (Map.Entry<Character, Character> pair : OPEN_TO_CLOSE.entrySet()) {
            CLOSE_TO_OPEN.put(pair.getValue(), pair.getKey());
        }
    }

    /**
     * Parses a "0"/"1" flag.
     *
     * @param s the text to parse; may be {@code null}
     * @param b the value returned when {@code s} is {@code null} or neither "0" nor "1"
     * @return {@code false} for "0", {@code true} for "1", otherwise {@code b}
     */
    public static boolean convertBoolean(String s, boolean b) {
        if ("0".equals(s)) {
            return false;
        }
        if ("1".equals(s)) {
            return true;
        }
        return b;
    }

    /**
     * Formats a boolean as a "0"/"1" flag.
     *
     * @param b the value to format
     * @return "1" for {@code true}, "0" for {@code false}
     */
    public static String convertBooleanToString(boolean b) {
        return b ? "1" : "0";
    }

    /**
     * Turns every whitespace character into a plain space and then trims both ends.
     *
     * @param str the text to normalise; must not be {@code null}
     * @return the normalised, trimmed text
     */
    public static String trimWhiteSpace(String str) {
        return replaceBlank(str).trim();
    }

    /**
     * Replaces every character for which {@link Character#isWhitespace(char)}
     * is true with a single space. The length of the text is unchanged.
     *
     * @param str the text to normalise; must not be {@code null}
     * @return the text with tabs, line breaks, etc. replaced by spaces
     */
    public static String replaceBlank(String str) {
        StringBuilder out = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            out.append(Character.isWhitespace(c) ? ' ' : c);
        }
        return out.toString();
    }

    /**
     * Collects the substrings enclosed between two delimiters, e.g.
     * {@code aa[abc]bbb -> abc}.
     *
     * <p>{@code start} and {@code end} are spliced verbatim into a regular
     * expression of the form {@code start([^startend]+)end}, so callers must
     * pass them already regex-escaped (for square brackets: {@code "\\["} and
     * {@code "\\]"}).
     *
     * @param str   the text to search; {@code null} or blank yields an empty list
     * @param start regex-escaped opening delimiter
     * @param end   regex-escaped closing delimiter
     * @return the enclosed substrings in order of appearance, never {@code null}
     */
    public static List<String> getSubStrs(String str, String start, String end) {
        List<String> found = new ArrayList<String>();
        if (str == null || str.trim().length() == 0) {
            return found;
        }
        // The pattern depends on the arguments, so it cannot be precompiled.
        Pattern pattern = Pattern.compile(String.format("%s([^%s%s]+)%s", start, start, end, end));
        Matcher matcher = pattern.matcher(str);
        while (matcher.find()) {
            found.add(matcher.group(1));
        }
        return found;
    }

    /**
     * Extracts from {@code fromStr} the part that corresponds to the
     * placeholder {@code replaceStr} inside the template {@code origStr}.
     * Example: fromStr {@code aaa123bb}, origStr {@code aaa[xxx]bb},
     * replaceStr {@code [xxx]} gives {@code 123}.
     *
     * @param replaceStr the placeholder to locate in {@code origStr}
     * @param origStr    the template containing the placeholder
     * @param fromStr    the concrete text to extract from
     * @return the extracted part, or {@code null} when the placeholder is not
     *         in the template or {@code fromStr} does not fit the template
     */
    public static String getHomologousWord(String replaceStr,
            String origStr, String fromStr) {
        int pos = origStr.indexOf(replaceStr);
        if (pos == -1) {
            return null;
        }
        String prefix = origStr.substring(0, pos);
        String suffix = origStr.substring(pos + replaceStr.length());
        // Prefix and suffix must not overlap inside fromStr; without this check
        // the substring call below would throw StringIndexOutOfBoundsException.
        if (fromStr.length() < prefix.length() + suffix.length()) {
            return null;
        }
        if (!fromStr.startsWith(prefix) || !fromStr.endsWith(suffix)) {
            return null;
        }
        return fromStr.substring(prefix.length(), fromStr.length() - suffix.length());
    }

    /**
     * Removes leading bracketed groups, e.g. {@code [a](b)cd -> cd}. Stops at
     * the first character that is not an opening bracket, or at an opening
     * bracket that has no matching closing bracket.
     *
     * @param s the text to trim; {@code null} or empty is returned unchanged
     * @return the text without its leading bracketed groups
     */
    public static String trimBeginningBracket(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        String rest = s;
        while (rest.length() > 0) {
            Character closer = OPEN_TO_CLOSE.get(rest.charAt(0));
            if (closer == null) {
                break;
            }
            int closeAt = rest.indexOf(closer.charValue(), 1);
            if (closeAt < 0) {
                break;
            }
            rest = rest.substring(closeAt + 1);
        }
        return rest;
    }

    /**
     * Removes every bracketed group anywhere in the text, then drops any
     * stray bracket characters that remain, e.g. {@code a[b]c -> ac},
     * {@code a(bc -> abc}, {@code ab]c -> abc}.
     *
     * @param s the text to clean; {@code null} or empty is returned unchanged
     * @return the text without bracketed groups or stray brackets
     */
    public static String trimMiddleBracket(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        String remaining = s;
        int scanFrom = 0;
        while (remaining.length() > 0) {
            int openAt = indexOfOpeningBracket(remaining, scanFrom);
            if (openAt < 0) {
                break;
            }
            char opener = remaining.charAt(openAt);
            char closer = OPEN_TO_CLOSE.get(opener);
            int closeAt = remaining.indexOf(closer, openAt + 1);
            if (closeAt >= 0) {
                // Matched pair: cut the brackets together with their content.
                remaining = remaining.substring(0, openAt) + remaining.substring(closeAt + 1);
            } else {
                // Unmatched opener: drop every occurrence of it and resume
                // scanning from where it was found.
                remaining = remaining.replace(String.valueOf(opener), "");
                scanFrom = openAt;
            }
        }
        // Start from what is left after pair removal. The previous code fell
        // back to the untouched input when the text became empty, so "[abc]"
        // came back as "[abc" instead of "".
        String result = remaining;
        for (String closing : CLOSING_BRACKETS) {
            result = result.replace(closing, "");
        }
        return result;
    }

    /** Returns the index of the first opening bracket at or after {@code from}, or -1. */
    private static int indexOfOpeningBracket(String text, int from) {
        for (int i = from; i < text.length(); i++) {
            if (OPEN_TO_CLOSE.containsKey(text.charAt(i))) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Removes trailing bracketed groups, e.g. {@code a(b)(c) -> a}. Stops at
     * the first trailing character that is not a closing bracket, or at a
     * closing bracket that has no matching opening bracket before it.
     *
     * @param s the text to trim; {@code null} or empty is returned unchanged
     * @return the text without its trailing bracketed groups
     */
    public static String trimEnddingBracket(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        String rest = s;
        while (rest.length() > 0) {
            Character opener = CLOSE_TO_OPEN.get(rest.charAt(rest.length() - 1));
            if (opener == null) {
                break;
            }
            // Search backwards, starting just before the closing bracket.
            int openAt = rest.lastIndexOf(opener.charValue(), rest.length() - 2);
            if (openAt < 0) {
                break;
            }
            rest = rest.substring(0, openAt);
        }
        return rest;
    }

    // Disabled helper kept for reference; it refers to a CharUtil class that
    // is not visible from this file.
    //	public static String delSymbolAndPunc(String s)
    //	{
    //		StringBuffer buffer = new StringBuffer();
    //		s = replaceBlank(s);
    //		for (int i = 0; i < s.length(); i ++)
    //		{
    //			char c = s.charAt(i);
    //			if (c == ' ' || CharUtil.isSymbol(c) || CharUtil.isPunctuation(c) || Integer.toHexString(c).equals("a0"))
    //			{
    //				continue;
    //			}
    //			buffer.append(c);
    //		}
    //		return buffer.toString();
    //	}

    /**
     * Keeps only the characters accepted by {@link #isChinese(char)}.
     *
     * @param s the text to filter; must not be {@code null}
     * @return the text with every other character removed
     */
    public static String delCharNotChinese(String s) {
        StringBuilder kept = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (isChinese(c)) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Tests whether a character lies in the range U+4E00..U+9FA5.
     *
     * @param c the character to test
     * @return {@code true} when {@code c} is inside that range
     */
    public static boolean isChinese(char c) {
        return c >= 0x4e00 && c <= 0x9fa5;
    }

    /**
     * Converts full-width characters to their half-width form: code points
     * 65281..65374 are shifted down by 65248 and the ideographic space
     * (12288) becomes an ASCII space. Other characters are left untouched.
     *
     * @param s the text to convert; {@code null} or empty is returned unchanged
     * @return the converted text
     */
    public static String toBanjiao(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c >= 65281 && c <= 65374) {
                c = (char) (c - 65248);
            } else if (c == 12288) { // ideographic (full-width) space
                c = (char) 32;
            }
            out.append(c);
        }
        return out.toString();
    }

    /**
     * Joins the items with ';'.
     *
     * @param arr the items to join; {@code null} yields an empty string
     * @return the joined text, without a trailing separator
     */
    public static String listToString(List<String> arr) {
        if (arr == null) {
            return "";
        }
        return String.join(";", arr);
    }

    /**
     * Splits text on ';', skipping items that are empty or whitespace-only.
     * Retained items are not trimmed.
     *
     * @param str the text to split; {@code null} yields an empty list
     * @return the non-blank items in order, never {@code null}
     */
    public static List<String> stringToList(String str) {
        List<String> items = new ArrayList<String>();
        if (str == null) {
            return items;
        }
        for (String item : str.split(";")) {
            if (item.trim().length() > 0) {
                items.add(item);
            }
        }
        return items;
    }

    /**
     * Replaces a fixed set of HTML entities with plain characters, and turns
     * '|' into a space. Replacements are applied one after another in the
     * order listed in {@link #HTML_REPLACEMENTS}.
     *
     * @param s the text to normalise; must not be {@code null}
     * @return the normalised text
     */
    public static String normalizeHtmlTransf(String s) {
        String ret = s;
        for (String[] rule : HTML_REPLACEMENTS) {
            ret = ret.replace(rule[0], rule[1]);
        }
        return ret;
    }

    /**
     * Flattens text onto one line: the sequence CR LF ';' becomes a space,
     * remaining line feeds are removed and '|' becomes a space.
     *
     * @param s the text to normalise; must not be {@code null}
     * @return the normalised text
     */
    public static String normalizeSegTransf(String s) {
        // NOTE(review): the first pattern includes a trailing ';', so a plain
        // "\r\n" only loses its "\n" below and the "\r" survives - confirm
        // whether "\r\n" was intended.
        String ret = s.replace("\r\n;", " ");
        ret = ret.replace("\n", "");
        ret = ret.replace("|", " ");
        return ret;
    }
}
package com.zzsn.download;
public class Test {
public static void main(String[] args) {
PageDownloader downloader=new PageDownloader();
String rankUrl="https://www.baidu.com/";
String encoding="utf-8";
String pageBody = downloader.downloadWithStr(rankUrl, encoding, false,false);
System.out.println(pageBody);
}
}
...@@ -2,6 +2,8 @@ package com.zzsn.entity; ...@@ -2,6 +2,8 @@ package com.zzsn.entity;
import lombok.Data; import lombok.Data;
import java.util.Date;
@Data @Data
public class BadSiteMsg { public class BadSiteMsg {
...@@ -9,16 +11,10 @@ public class BadSiteMsg { ...@@ -9,16 +11,10 @@ public class BadSiteMsg {
private String id; private String id;
/**信息源编码*/ /**信息源编码*/
private String infoSourceCode; private String infoSourceCode;
/**信息源名称*/ /**爬虫类别(1:动态 2:静态 3:500强 4:智库 5:百度)**/
private String webSiteName;
/**栏目名称*/
private String siteName;
/**栏目地址*/
private String siteUri;
/**有问题类型*/
private String errorType;
/**问题类型(1:信息源异常 2:爬取类别设置异常)*/
private String problemType;
/**爬虫类型(0:静态爬取 1:动态爬取)*/
private String crawlerType; private String crawlerType;
/**分区id (多个用英文逗号隔开)*/
private String partition;
/**消费时间*/
private Date consumerDate;
} }
package com.zzsn.entity;
import lombok.Data;
/**
 * Plain data holder describing a problematic crawl source; accessors are
 * generated by Lombok's {@code @Data}.
 * NOTE(review): the "Bak" suffix and the field list suggest this preserves an
 * earlier field layout of BadSiteMsg - confirm.
 */
@Data
public class BadSiteMsgBak {
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Information source name. */
private String webSiteName;
/** Column (site section) name. */
private String siteName;
/** Column (site section) address. */
private String siteUri;
/** Type of error encountered. */
private String errorType;
/** Problem type (1: information source abnormal, 2: crawl category misconfigured). */
private String problemType;
/** Crawler type (0: static crawl, 1: dynamic crawl). */
private String crawlerType;
}
...@@ -165,6 +165,7 @@ public class Constants { ...@@ -165,6 +165,7 @@ public class Constants {
//判断重复的rate //判断重复的rate
public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE")); public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE"));
public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL"); public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL");
public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE")); public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE"));
//国资监管评价中心相关性过滤算法URl(XGBOOST) //国资监管评价中心相关性过滤算法URl(XGBOOST)
public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL"); public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL");
......
...@@ -18,7 +18,6 @@ public class JedisUtil { ...@@ -18,7 +18,6 @@ public class JedisUtil {
private static final String PREFIX = "comm_"; private static final String PREFIX = "comm_";
private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class); private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
private static JedisPool jedisPool = null; private static JedisPool jedisPool = null;
private JedisUtil() { private JedisUtil() {
} }
...@@ -127,7 +126,13 @@ public class JedisUtil { ...@@ -127,7 +126,13 @@ public class JedisUtil {
} }
getDefaultJedis().del(PREFIX + key); getDefaultJedis().del(PREFIX + key);
} }
/**
 * Deletes the Redis entry stored under the given key.
 *
 * <p>The key is passed to {@code del} exactly as supplied; {@code PREFIX} is
 * not prepended by this method.
 *
 * @param key key to delete; rejected when {@code StringUtils.isEmpty(key)} is true
 * @throws Exception if the key is empty; exceptions raised by the Redis call
 *                   are not caught here and propagate to the caller
 */
public static void delString(String key) throws Exception {
    // Happy path first: a usable key is deleted and we are done.
    if (!StringUtils.isEmpty(key)) {
        getDefaultJedis().del(key);
        return;
    }
    // Empty key: log and fail, matching the message callers already see.
    logger.error("key is null");
    throw new Exception("key is null");
}
public static void setString(String key, String value, int expireTime) throws Exception { public static void setString(String key, String value, int expireTime) throws Exception {
Jedis jedis=null; Jedis jedis=null;
try { try {
......
...@@ -43,7 +43,7 @@ public class HttpClientTester { ...@@ -43,7 +43,7 @@ public class HttpClientTester {
private static PageBuilderParser builderParser = null; private static PageBuilderParser builderParser = null;
public static void main(String[] args) { public static void main(String[] args) {
// get("https://edition.cnn.com/world"); // get("https://edition.cnn.com/world");
String html = HttpgetUtil.getHtml("https://edition.cnn.com/world"); String html = HttpgetUtil.getHtml("http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1");
System.out.println(html); System.out.println(html);
// post(); // post();
} }
......
...@@ -8,7 +8,7 @@ import java.net.MalformedURLException; ...@@ -8,7 +8,7 @@ import java.net.MalformedURLException;
public class WebClientTest { public class WebClientTest {
public static void main(String[] args) throws Exception{ public static void main(String[] args) throws Exception{
String url="http://www.sgcc.com.cn/html/sgcc_main/col2017021879/column_2017021879_1.shtml"; String url="http://www.ahhfly.gov.cn/content/column/11488310?pageIndex=1";
String charset="utf-8"; String charset="utf-8";
String s = downloadByWebClient(url, charset); String s = downloadByWebClient(url, charset);
System.out.println(s.length()); System.out.println(s.length());
......
package com.test; package com.zzsn.test;
import com.zzsn.download.RequestUtil;
import java.io.*; import java.io.*;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
...@@ -9,6 +11,12 @@ public class test { ...@@ -9,6 +11,12 @@ public class test {
String urlStr="http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg"; String urlStr="http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
String fileName="testImg.png"; String fileName="testImg.png";
downLoadByUrl(urlStr,fileName); downLoadByUrl(urlStr,fileName);
try {
RequestUtil requestUtil =RequestUtil.getInstance();
String body = requestUtil.httpGetRequest(urlStr);
} catch (Exception e) {
// e.printStackTrace();
}
} }
/** /**
* 从网络Url中下载文件 * 从网络Url中下载文件
......
...@@ -11,6 +11,7 @@ import org.openqa.selenium.chrome.ChromeOptions; ...@@ -11,6 +11,7 @@ import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.HttpCommandExecutor; import org.openqa.selenium.remote.HttpCommandExecutor;
import java.net.URL; import java.net.URL;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -33,16 +34,23 @@ public class DriverUtil { ...@@ -33,16 +34,23 @@ public class DriverUtil {
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE); System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
ChromeDriverService service = ChromeDriverService.createDefaultService(); ChromeDriverService service = ChromeDriverService.createDefaultService();
ChromeOptions options = new ChromeOptions(); ChromeOptions options = new ChromeOptions();
//浏览器启动的位置
// options.setBinary(Constants.CHROMEBIN);
// 无痕模式 // 无痕模式
options.addArguments("--incognito"); options.addArguments("--incognito");
// 禁用沙箱 // 禁用沙箱
options.addArguments("no-sandbox"); options.addArguments("--no-sandbox");
// 禁用GPU // 禁用GPU
options.addArguments("--disable-gpu"); options.addArguments("--disable-gpu");
// 禁用图形界面(此模式启动会导致驱动通信异常) // 禁用图形界面(此模式启动会导致驱动通信异常)
// options.addArguments("--headless"); // options.addArguments("--headless");
// 禁用插件 // 禁用插件
options.addArguments("disable-extensions"); options.addArguments("disable-extensions");
// 屏蔽“chrome正受到自动测试软件的控制”的提示
// options.addArguments("--disable-infobars", "--disable-blink-features=AutomationControlled");
// options.addArguments("--remote-debugging-port=9222");
// options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
// 重新初始化一个chrome浏览器实例 // 重新初始化一个chrome浏览器实例
return new ChromeDriver(service, options); return new ChromeDriver(service, options);
} }
...@@ -66,7 +74,7 @@ public class DriverUtil { ...@@ -66,7 +74,7 @@ public class DriverUtil {
ReuseWebDriver driver=null; ReuseWebDriver driver=null;
try { try {
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
System.out.println(driver.connectTestFail()); // log.info("驱动连接失败:"+driver.connectTestFail());
if (driver.connectTestFail()) { if (driver.connectTestFail()) {
// 若驱动返回错误码,重新创建驱动服务并缓存 // 若驱动返回错误码,重新创建驱动服务并缓存
ChromeDriver chromeDriver = DriverUtil.reconnectDriver(); ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
...@@ -81,11 +89,11 @@ public class DriverUtil { ...@@ -81,11 +89,11 @@ public class DriverUtil {
map.put("serverUrl", serverUrl); map.put("serverUrl", serverUrl);
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map)); log.info("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
} }
}catch (Exception e){ }catch (Exception e){
System.out.println("出现异常"); log.info("获取驱动driver出现异常");
// 若驱动返回错误码,重新创建驱动服务并缓存 // 若驱动返回错误码,重新创建驱动服务并缓存
ChromeDriver chromeDriver = DriverUtil.reconnectDriver(); ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
serverUrl = DriverUtil.getServerUrl(chromeDriver); serverUrl = DriverUtil.getServerUrl(chromeDriver);
...@@ -99,7 +107,6 @@ public class DriverUtil { ...@@ -99,7 +107,6 @@ public class DriverUtil {
map.put("serverUrl", serverUrl); map.put("serverUrl", serverUrl);
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId); driver = new ReuseWebDriver(serverUrl, sessionId);
} }
return driver; return driver;
...@@ -114,11 +121,11 @@ public class DriverUtil { ...@@ -114,11 +121,11 @@ public class DriverUtil {
Map<String, String> map =getSessionInfo(); Map<String, String> map =getSessionInfo();
String sessionId = map.get("sessionId"); String sessionId = map.get("sessionId");
String serverUrl = map.get("serverUrl"); String serverUrl = map.get("serverUrl");
log.info("从redis中获取保存的sessionId:"+sessionId);
return connectChrome(sessionId, serverUrl); return connectChrome(sessionId, serverUrl);
} }
public static Map<String, String> getSessionInfo() throws Exception{ public static Map<String, String> getSessionInfo() throws Exception{
String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE); String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
System.out.println("获取驱动session:"+Constants.SELENIUM_DRIVER_CACHE+"::"+cacheInfo);
Map<String, String> map = JSON.parseObject(cacheInfo, Map.class); Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
if(map==null || map.size()<1) { if(map==null || map.size()<1) {
map = new HashMap<>(2); map = new HashMap<>(2);
...@@ -126,7 +133,6 @@ public class DriverUtil { ...@@ -126,7 +133,6 @@ public class DriverUtil {
map.put("serverUrl", "https://www.baidu.com/"); map.put("serverUrl", "https://www.baidu.com/");
// 缓存浏览器驱动信息 // 缓存浏览器驱动信息
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1); JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动session失败重新设置:"+Constants.SELENIUM_DRIVER_CACHE+"::"+JSON.toJSONString(map));
} }
return map; return map;
} }
......
...@@ -160,7 +160,9 @@ public class PublishDateUtil { ...@@ -160,7 +160,9 @@ public class PublishDateUtil {
{ {
// return formatUSDate(raw); // return formatUSDate(raw);
return raw; // return raw;
//当时间解析不了时返回空
return "";
} }
} }
......
IN-20220609-15205 IN-20220609-47235
IN-20220609-45135 IN-20220609-47237
IN-20220609-50717 IN-20220609-47216
IN-20220609-52785 IN-20220609-47437
IN-20220609-3804 IN-20220609-47218
IN-20220609-4071 IN-20220609-47219
IN-20220609-15069 IN-20220609-47300
IN-20220609-45126 IN-20220609-47476
IN-20220609-50722 IN-20220609-47478
IN-20220609-52787 IN-20220609-47480
IN-20220609-3817 IN-20220609-47845
IN-20220609-4084 IN-20220609-47902
IN-20220609-15090 IN-20220609-47903
IN-20220609-45128 IN-20220609-47904
IN-20220609-50721 IN-20220609-47421
IN-20220609-52786 IN-20220609-47182
IN-20220609-58494 IN-20220609-47311
IN-20220609-58502 IN-20220609-47391
IN-20220609-58503 IN-20220609-47458
IN-20220609-58504 IN-20220609-47336
IN-20220609-58505 IN-20220609-59371
IN-20220609-58506 IN-20220609-59453
IN-20220609-58508 IN-20220609-49847
IN-20220609-3840 IN-20220609-59673
IN-20220609-4107 IN-20220609-59674
IN-20220609-12827 IN-20220609-59675
IN-20220609-15126 IN-20220609-59676
IN-20220609-44997 IN-20220609-60448
IN-20220609-45132 IN-20220609-60449
IN-20220609-50720 IN-20220609-47459
IN-20220609-52779 IN-20220609-47460
IN-20220609-3874 IN-20220609-47461
IN-20220609-4141 IN-20220824-0021
IN-20220609-15147 IN-20220824-0027
IN-20220609-45133 IN-20220824-0028
IN-20220609-50719 IN-20220609-47521
IN-20220609-52778 IN-20220609-47532
IN-20220609-56218 IN-20220609-47439
IN-20220609-56227 IN-20220609-28544
IN-20220609-56229 IN-20220609-28668
IN-20220609-57566 IN-20220609-28669
IN-20220609-15041 IN-20220609-28670
IN-20220609-45120 IN-20220609-28671
IN-20220609-50724 IN-20220609-28672
IN-20220609-52789 IN-20220609-47427
IN-20220609-56185 IN-20220609-59376
IN-20220609-56187 IN-20220609-59458
IN-20220609-56188 IN-20220609-47332
IN-20220609-56189 IN-20220609-47333
IN-20220609-56190 IN-20220609-47371
IN-20220609-56191 IN-20220609-47552
IN-20220609-56192 IN-20220609-47230
IN-20220609-57726 IN-20220609-47231
IN-20220609-57739 IN-20220609-47847
IN-20220609-57740 IN-20220609-47523
IN-20220609-57741 IN-20220609-47477
IN-20220609-57742 IN-20220609-47410
IN-20220609-57771 IN-20220609-47417
IN-20220609-57772 IN-20220609-47419
IN-20220609-12932 IN-20220608-40137
IN-20220609-15043 IN-20220608-43048
IN-20220609-45005 IN-20220609-32903
IN-20220609-45122 IN-20220609-33008
IN-20220609-50723 IN-20220609-55178
IN-20220609-52788 IN-20220609-55179
IN-20220609-57997 IN-20220609-55180
IN-20220609-57998 IN-20220609-47464
IN-20220609-57999 IN-20220609-47913
IN-20220609-13241 IN-20220609-47469
IN-20220609-45062 IN-20220609-47255
IN-20220609-54133 IN-20220609-45951
IN-20220609-57070 IN-20220609-45954
IN-20220609-14979 IN-20220609-34374
IN-20220609-14981 IN-20220609-60221
IN-20220609-45097 IN-20220609-60222
IN-20220609-45098 IN-20220609-60223
IN-20220609-50737 IN-20220609-60224
IN-20220609-50738 IN-20220609-47302
IN-20220609-52766 IN-20220609-47343
IN-20220609-52767 IN-20220609-34422
IN-20220609-14977 IN-20220609-60044
IN-20220609-14978 IN-20220609-60653
IN-20220609-45095 IN-20220609-47484
IN-20220609-45096 IN-20220609-32928
IN-20220609-50739 IN-20220609-33017
IN-20220609-50744 IN-20220609-47370
IN-20220609-52768 IN-20220609-47426
IN-20220609-52769 IN-20220609-47328
IN-20220609-50867 IN-20220609-47918
IN-20220609-52521 IN-20220609-47305
IN-20220609-52773 IN-20220609-59375
IN-20220609-52840 IN-20220609-59457
IN-20220609-54294 IN-20220609-47366
IN-20220609-3947 IN-20220609-47416
IN-20220609-3970 IN-20220609-47441
IN-20220609-3971 IN-20220609-47442
IN-20220609-3972 IN-20220609-47394
IN-20220609-4214 IN-20220609-47397
IN-20220609-4237 IN-20220609-47938
IN-20220609-4238 IN-20220609-47941
IN-20220609-4239 IN-20220609-47942
IN-20220609-11340 IN-20220609-47353
IN-20220609-13247 IN-20220609-47357
IN-20220609-13248 IN-20220609-47360
IN-20220609-45068 IN-20220609-47849
IN-20220609-45069 IN-20220824-0060
IN-20220609-50746 IN-20220824-0061
IN-20220609-52783 IN-20220609-47265
IN-20220609-55032 IN-20220609-47530
IN-20220609-57345 IN-20220609-47899
IN-20220609-13060 IN-20220609-47465
IN-20220609-45024 IN-20220609-47466
IN-20220609-58325 IN-20220609-47344
IN-20220609-58469 IN-20220609-47362
IN-20220609-58471 IN-20220609-47428
IN-20220609-58473 IN-20220824-0059
IN-20220609-58477 IN-20220609-47212
IN-20220609-52580 IN-20220609-47213
IN-20220609-52796 IN-20220609-47214
IN-20220609-52582 IN-20220609-47436
IN-20220609-52798 IN-20220609-60198
IN-20220609-52583 IN-20220609-60199
IN-20220609-52799 IN-20220608-42141
IN-20220609-52632 IN-20220609-47337
IN-20220609-52800 IN-20220609-47340
IN-20220609-53097 IN-20220609-47570
IN-20220609-53161 IN-20220609-47317
IN-20220609-13092 IN-20220609-47310
IN-20220609-13162 IN-20220609-47312
IN-20220609-22258 IN-20220609-47313
IN-20220609-45054 IN-20220609-47316
IN-20220609-53041 IN-20220609-47908
IN-20220609-53044 IN-20220609-47258
IN-20220609-53051 IN-20220609-47253
IN-20220609-53052 IN-20220609-36443
IN-20220609-53055 IN-20220609-36444
IN-20220609-53056 IN-20220609-36445
IN-20220609-53099 IN-20220609-36447
IN-20220609-53101 IN-20220609-47267
IN-20220609-53111 IN-20220609-47270
IN-20220609-53112 IN-20220609-47271
IN-20220609-53139 IN-20220824-0057
IN-20220609-53140 IN-20220609-47380
IN-20220609-57072 IN-20220609-47369
IN-20220609-57074 IN-20220608-43063
IN-20220609-57076 IN-20220608-43064
IN-20220609-57077 IN-20220608-43065
IN-20220609-57084 IN-20220609-32958
IN-20220609-57085 IN-20220609-47488
IN-20220609-52723 IN-20220609-36392
IN-20220609-52728 IN-20220609-36393
IN-20220609-52803 IN-20220609-36394
IN-20220609-52814 IN-20220609-47848
IN-20220609-53102 IN-20220609-47347
IN-20220609-53103 IN-20220609-60200
IN-20220609-53162 IN-20220609-47286
IN-20220609-53180 IN-20220609-47462
IN-20220609-52611 IN-20220609-34852
IN-20220609-52806 IN-20220609-34853
IN-20220609-50475 IN-20220609-34854
IN-20220609-50476 IN-20220609-34855
IN-20220609-50477 IN-20220609-34856
IN-20220609-52568 IN-20220609-34857
IN-20220609-52569 IN-20220609-34858
IN-20220609-52570 IN-20220609-34859
IN-20220609-52810 IN-20220609-47262
IN-20220609-52811 IN-20220609-47269
IN-20220609-52812 IN-20220609-47272
IN-20220609-53602 IN-20220609-47274
IN-20220609-53603 IN-20220609-47277
IN-20220609-53604 IN-20220609-47280
IN-20220609-39173 IN-20220609-47283
IN-20220609-52571 IN-20220609-47453
IN-20220609-52574 IN-20220609-47909
IN-20220609-52813 IN-20220609-47915
IN-20220609-52838 IN-20220609-47916
IN-20220609-6007 IN-20220609-47917
IN-20220609-13080 IN-20220609-47966
IN-20220609-13081 IN-20220609-47446
IN-20220609-13082 IN-20220609-47238
IN-20220609-13083 IN-20220609-35856
IN-20220609-13084 IN-20220609-47378
IN-20220609-13085 IN-20220824-0055
IN-20220609-13086 IN-20220824-0056
IN-20220609-13219 IN-20220609-47561
IN-20220609-13220 IN-20220609-28518
IN-20220609-13221 IN-20220609-45310
IN-20220609-13222 IN-20220609-45311
IN-20220609-13223 IN-20220609-45314
IN-20220609-13224 IN-20220609-45315
IN-20220609-45043 IN-20220609-45316
IN-20220609-45044 IN-20220609-45317
IN-20220609-45045 IN-20220609-47557
IN-20220609-45046 IN-20220609-47910
IN-20220609-45047 IN-20220609-47911
IN-20220609-45048 IN-20220609-47912
IN-20220609-45049 IN-20220609-47965
IN-20220609-52436 IN-20220609-28665
IN-20220609-52524 IN-20220609-46200
IN-20220609-52525 IN-20220609-46201
IN-20220609-52526 IN-20220609-46202
IN-20220609-52533 IN-20220609-46203
IN-20220609-52536 IN-20220609-46204
IN-20220609-52815 IN-20220824-0029
IN-20220609-52830 IN-20220824-0033
IN-20220609-52857 IN-20220609-47472
IN-20220609-52858 IN-20220608-39053
IN-20220609-52866 IN-20220609-34299
IN-20220609-52867 IN-20220609-56125
IN-20220609-54292 IN-20220609-56126
IN-20220608-36685 IN-20220609-60230
IN-20220608-36686 IN-20220609-60231
IN-20220608-36687 IN-20220609-60363
IN-20220608-36688 IN-20220609-60364
IN-20220608-36689 IN-20220609-60365
IN-20220608-36690 IN-20220609-47288
IN-20220608-36691 IN-20220609-47238
IN-20220608-36692 IN-20220609-47446
IN-20220608-36693 IN-20220609-47915
IN-20220608-37141 IN-20220609-47916
IN-20220608-37142 IN-20220609-47917
IN-20220608-37143 IN-20220609-47966
IN-20220608-37144 IN-20220905-0001
IN-20220608-37463 IN-20220905-0005
IN-20220608-37464 IN-20220609-47567
IN-20220608-37465 IN-20220906-0015
IN-20220608-37466 IN-20220905-0003
IN-20220608-37467 IN-20220905-0013
IN-20220608-57178 IN-20220829-0038
IN-20220609-3773 IN-20220829-0039
IN-20220609-3774 IN-20220830-0137
IN-20220609-4040 IN-20220831-0011
IN-20220609-4041 IN-20220829-0042
IN-20220609-4353 IN-20220901-0025
IN-20220609-4354 IN-20220829-0043
IN-20220609-4363 IN-20220829-0049
IN-20220609-4372 IN-20220901-0026
IN-20220609-15012 IN-20220901-0027
IN-20220609-15013 IN-20220901-0034
IN-20220609-45203 IN-20220901-0045
IN-20220609-49938 IN-20220902-0021
IN-20220609-49939 IN-20220902-0023
IN-20220609-52553 IN-20220902-0022
IN-20220609-52554 IN-20220902-0025
IN-20220609-52556 IN-20220902-0024
IN-20220609-52557 IN-20220902-0026
IN-20220609-52558 IN-20220901-0033
IN-20220609-52559 IN-20220901-0046
IN-20220609-52560 IN-20220902-0042
IN-20220609-52816 IN-20220902-0036
IN-20220609-52817 IN-20220903-0057
IN-20220609-52841 IN-20220903-0025
IN-20220609-52842 IN-20220903-0023
IN-20220609-52843 IN-20220903-0024
IN-20220609-52923 IN-20220906-0007
IN-20220609-52939 IN-20220906-0008
IN-20220609-52974 IN-20220905-0004
IN-20220609-52975 IN-20220905-0009
IN-20220609-53107 IN-20220905-0010
IN-20220609-53108 IN-20220905-0011
IN-20220609-53049 IN-20220905-0014
IN-20220609-53054 IN-20220905-0019
IN-20220609-53057 IN-20220906-0006
IN-20220609-53059 IN-20220905-0020
IN-20220609-53060 IN-20220908-0066
IN-20220609-53062 IN-20220908-0059
IN-20220609-53110 IN-20220908-0060
IN-20220609-53113 IN-20220908-0061
IN-20220609-53141 IN-20220908-0062
IN-20220609-53158 IN-20220908-0064
IN-20220609-53159 IN-20220908-0034
IN-20220609-53177 IN-20220905-0006
IN-20220609-57075 IN-20220906-0009
IN-20220609-57078 IN-20220906-0010
IN-20220609-57086 IN-20220908-0036
IN-20220609-57094 IN-20220908-0037
IN-20220609-57095 IN-20220908-0041
IN-20220609-57102 IN-20220908-0038
IN-20220609-52417 IN-20220908-0042
IN-20220609-52657 IN-20220908-0043
IN-20220609-52660 IN-20220908-0046
IN-20220609-52661 IN-20220908-0044
IN-20220609-52665 IN-20220908-0047
IN-20220609-52666 IN-20220908-0049
IN-20220609-52826 IN-20220908-0051
IN-20220609-52827 IN-20220908-0050
IN-20220609-52828 IN-20220908-0052
IN-20220609-52829 IN-20220908-0055
IN-20220609-52832 IN-20220908-0056
IN-20220609-53114 IN-20220906-0014
IN-20220609-53115 IN-20220906-0016
IN-20220609-53116 IN-20220906-0012
IN-20220609-53117 IN-20220906-0022
IN-20220609-53130 IN-20220906-0021
IN-20220609-53190 IN-20220906-0013
IN-20220609-53191 IN-20220906-0029
IN-20220609-53192 IN-20220906-0025
IN-20220609-53193 IN-20220906-0030
IN-20220609-53194 IN-20220906-0038
IN-20220609-56428 IN-20220906-0036
IN-20220609-60657 IN-20220906-0034
IN-20220609-52942 IN-20220907-0002
IN-20220609-52943 IN-20220907-0003
IN-20220609-52944 IN-20220907-0004
IN-20220609-52946 IN-20220907-0006
IN-20220609-52947 IN-20220907-0013
IN-20220609-52948 IN-20220907-0011
IN-20220609-52950 IN-20220907-0014
IN-20220609-52952 IN-20220907-0017
IN-20220609-52953 IN-20220907-0018
IN-20220609-52954 IN-20220907-0027
IN-20220609-52955 IN-20220907-0021
IN-20220609-52957 IN-20220907-0026
IN-20220609-52958 IN-20220907-0031
IN-20220609-52960 IN-20220907-0030
IN-20220609-52961 IN-20220907-0029
IN-20220609-52977 IN-20220907-0028
IN-20220609-52978 IN-20220908-0003
IN-20220609-52979 IN-20220907-0036
IN-20220609-52980 IN-20220908-0002
IN-20220609-52981 IN-20220908-0001
IN-20220609-52982 IN-20220907-0037
IN-20220609-52983 IN-20220907-0035
IN-20220609-52984 IN-20220908-0004
IN-20220609-52985 IN-20220908-0006
IN-20220609-52986 IN-20220908-0005
IN-20220609-52987 IN-20220908-0007
IN-20220609-52988 IN-20220908-0008
IN-20220609-52990 IN-20220906-0035
IN-20220609-52991 IN-20220908-0029
IN-20220609-52998 IN-20220908-0031
IN-20220609-53118 IN-20220908-0026
IN-20220609-53119 IN-20220908-0024
IN-20220609-53120 IN-20220908-0012
IN-20220609-53121 IN-20220908-0022
IN-20220609-53122 IN-20220907-0038
IN-20220609-53123 IN-20220907-0032
IN-20220609-53124 IN-20220907-0033
IN-20220609-53133 IN-20220907-0034
IN-20220609-53134 IN-20220907-0024
IN-20220609-53135 IN-20220907-0010
IN-20220609-53136 IN-20220907-0012
IN-20220609-53137 IN-20220907-0008
IN-20220609-53147 IN-20220905-0007
IN-20220609-53148 IN-20220907-0007
IN-20220609-53155 IN-20220905-0002
IN-20220609-53073 IN-20220903-0001
IN-20220609-53126 IN-20220903-0002
IN-20220609-57080 IN-20220903-0005
IN-20220609-53089 IN-20220903-0006
IN-20220609-53092 IN-20220903-0008
IN-20220609-53128 IN-20220903-0055
IN-20220609-53129 IN-20220903-0056
IN-20220609-57082 IN-20220906-0018
IN-20220609-57083 IN-20220906-0019
IN-20220609-52669 IN-20220906-0023
IN-20220609-52833 IN-20220908-0009
IN-20220609-53131 IN-20220908-0017
IN-20220609-53195 IN-20220908-0020
IN-20220609-11536 IN-20220908-0023
IN-20220609-14969 IN-20220905-0012
IN-20220609-52572 IN-20220905-0008
IN-20220609-52835 IN-20220905-0018
IN-20220609-52585 IN-20220906-0001
IN-20220609-52836 IN-20220906-0002
IN-20220609-52550 IN-20220906-0003
IN-20220609-52573 IN-20220906-0004
IN-20220609-52837 IN-20220906-0005
IN-20220609-4526 IN-20220906-0017
IN-20220609-4527 IN-20220906-0024
IN-20220609-4528 IN-20220906-0027
IN-20220609-4529 IN-20220907-0009
IN-20220609-44926 IN-20220907-0016
IN-20220609-44927 IN-20220907-0019
IN-20220609-44928 IN-20220907-0020
IN-20220609-44929 IN-20220907-0015
IN-20220609-53000 IN-20220907-0023
IN-20220609-53001 IN-20220907-0025
IN-20220609-53006 IN-20220907-0039
IN-20220609-53008 IN-20220908-0013
IN-20220609-53010 IN-20220908-0014
IN-20220609-53143 IN-20220908-0016
IN-20220609-53163 IN-20220908-0018
IN-20220609-53166 IN-20220908-0021
IN-20220609-53167 IN-20220908-0032
IN-20220609-53168 IN-20220908-0039
IN-20220609-57088 IN-20220908-0040
IN-20220609-57096 IN-20220908-0048
IN-20220609-57099 IN-20220908-0054
IN-20220609-57100 IN-20220908-0057
IN-20220609-57101 IN-20220908-0058
IN-20220609-4530 IN-20220908-0077
IN-20220609-4531 IN-20220908-0078
IN-20220609-4532 IN-20220909-0006
IN-20220609-4533 IN-20220909-0007
IN-20220609-4534 IN-20220609-3709
IN-20220609-13094 IN-20220909-0011
IN-20220609-44930 IN-20220909-0012
IN-20220609-44931 IN-20220909-0013
IN-20220609-44932 IN-20220909-0016
IN-20220609-45056 IN-20220909-0014
IN-20220609-52522 IN-20220909-0017
IN-20220609-52860 IN-20220608-42659
IN-20220609-53079 IN-20220609-60109
IN-20220609-53144 IN-20220609-49943
IN-20220609-53782 IN-20220909-0019
IN-20220609-53785 IN-20220909-0020
IN-20220609-57089 IN-20220609-60260
IN-20220609-57114 IN-20220609-47855
IN-20220609-52962 IN-20220609-47958
IN-20220609-52992 IN-20220909-0021
IN-20220609-53149 IN-20220913-0023
IN-20220609-53094 IN-20220909-0022
IN-20220609-53157 IN-20220609-60006
IN-20220609-57093 IN-20220609-32926
IN-20220609-52970 IN-20220609-33014
IN-20220609-53002 IN-20220609-4991
IN-20220609-53005 IN-20220609-60461
IN-20220609-53164 IN-20220609-59251
IN-20220609-53165 IN-20220609-59252
IN-20220609-53243 IN-20220609-60460
IN-20220609-57097 IN-20220911-0002
IN-20220609-57098 IN-20220911-0001
IN-20220609-57112 IN-20220608-34875
IN-20220609-52653 IN-20220608-34876
IN-20220609-52852 IN-20220608-34874
IN-20220609-53173 IN-20220609-3402
IN-20220609-53216 IN-20220609-3401
IN-20220609-52654 IN-20220609-45810
IN-20220609-52853 IN-20220609-45812
IN-20220609-53174 IN-20220609-45809
IN-20220609-53217 IN-20220608-40026
IN-20220609-52655 IN-20220609-32888
IN-20220609-52854 IN-20220608-40027
IN-20220609-53175 IN-20220608-40029
IN-20220609-53218 IN-20220609-60435
IN-20220609-52656 IN-20220609-60434
IN-20220609-52855 IN-20220908-0015
IN-20220609-53176 IN-20220908-0045
IN-20220609-53219 IN-20220908-0053
IN-20220609-52523 IN-20220909-0001
IN-20220609-52856 IN-20220913-0007
IN-20220609-59064 IN-20220913-0006
IN-20220609-52680 IN-20220913-0005
IN-20220609-52682 IN-20220915-0060
IN-20220609-52684 IN-20220915-0062
IN-20220609-52862 IN-20220915-0064
IN-20220609-52863 IN-20220915-0063
IN-20220609-52864 IN-20220608-35629
IN-20220609-53186 IN-20220609-59385
IN-20220609-53187 IN-20220609-32960
IN-20220609-53188 IN-20220608-43346
IN-20220609-53231 IN-20220608-35769
IN-20220609-53232 IN-20220608-43345
IN-20220609-53233 IN-20220609-33029
IN-20220609-52686 IN-20220609-47284
IN-20220609-52687 IN-20220915-0066
IN-20220609-52690 IN-20220915-0071
IN-20220609-52865 IN-20220915-0073
IN-20220609-52895 IN-20220915-0077
IN-20220609-52896 IN-20220915-0076
IN-20220609-53189 IN-20220915-0075
IN-20220609-53229 IN-20220915-0074
IN-20220609-53230 IN-20220915-0081
IN-20220609-53234 IN-20220609-50163
IN-20220609-53261 IN-20220609-50166
IN-20220609-53262 IN-20220609-50165
IN-20220609-52636 IN-20220608-41537
IN-20220609-52869 IN-20220609-32957
IN-20220609-53197 IN-20220609-3382
IN-20220609-53235 IN-20220609-35147
IN-20220609-52694 IN-20220609-35144
IN-20220609-52870 IN-20220609-35149
IN-20220609-53198 IN-20220609-35148
IN-20220609-53236 IN-20220609-35146
IN-20220609-52633 IN-20220609-35145
IN-20220609-52873 IN-20220609-35143
IN-20220609-53201 IN-20220608-42363
IN-20220609-53237 IN-20220608-42367
IN-20220609-52634 IN-20220608-42333
IN-20220609-52874 IN-20220608-42358
IN-20220609-53202 IN-20220609-33022
IN-20220609-53246 IN-20220609-16717
IN-20220609-52640 IN-20220609-32951
IN-20220609-52878 IN-20220609-47241
IN-20220609-53205 IN-20220609-47890
IN-20220609-53249 IN-20220609-48114
IN-20220609-52642 IN-20220609-48112
IN-20220609-52879 IN-20220609-48111
IN-20220609-53206 IN-20220609-48110
IN-20220609-53250 IN-20220609-48113
IN-20220609-52646 IN-20220609-48109
IN-20220609-52881 IN-20220609-48115
IN-20220609-53208 IN-20220609-48116
IN-20220609-53252 IN-20220609-54594
IN-20220609-52670 IN-20220905-0016
IN-20220609-52671 IN-20220905-0015
IN-20220609-52672 IN-20220608-35048
IN-20220609-52890 IN-20220608-34970
IN-20220609-52891 IN-20220513-0001
IN-20220609-52892 IN-20220913-0010
IN-20220609-53224 IN-20220608-42911
IN-20220609-53225 IN-20220608-40293
IN-20220609-53226 IN-20220608-40715
IN-20220609-53256 IN-20220608-40714
IN-20220609-53257 IN-20220608-42918
IN-20220609-53258 IN-20220608-42919
IN-20220609-52673 IN-20220608-42914
IN-20220609-52893 IN-20220609-32876
IN-20220609-53227 IN-20220609-33056
IN-20220609-53259 IN-20220609-44935
IN-20220609-52677 IN-20220609-47468
IN-20220609-52894 IN-20220609-47467
IN-20220609-53228 IN-20220908-0027
IN-20220609-53260 IN-20220609-55065
IN-20220609-53031 IN-20220908-0073
IN-20220609-53238 IN-20220824-0058
IN-20220609-57107 IN-20220608-35627
IN-20220609-52700 IN-20220909-0005
IN-20220609-52897 IN-20220609-46239
IN-20220609-53244 IN-20220609-28794
IN-20220609-53263 IN-20220609-28793
IN-20220609-52705 IN-20220609-46240
IN-20220609-52898 IN-20220609-47953
IN-20220609-53095 IN-20220609-47955
IN-20220609-53245 IN-20220609-47951
IN-20220609-52578 IN-20220609-47956
IN-20220609-52899 IN-20220609-47974
IN-20220609-52586 IN-20220909-0008
IN-20220609-52908 IN-20220908-0019
IN-20220609-53775 IN-20220609-34612
IN-20220609-53778 IN-20220909-0009
IN-20220609-53781 IN-20220909-0010
IN-20220609-53783 IN-20220609-48727
IN-20220609-53786 IN-20220609-48726
IN-20220609-57113 IN-20220608-38981
IN-20220609-57115 IN-20220907-0005
IN-20220609-57143 IN-20220913-0003
IN-20220609-57144 IN-20220906-0020
IN-20220609-57146 IN-20220906-0026
IN-20220609-53789 IN-20220907-0001
IN-20220609-53792 IN-20220907-0022
IN-20220609-53793 IN-20220913-0001
IN-20220609-53799 IN-20220908-0025
IN-20220609-53806 IN-20220908-0033
IN-20220609-57116 IN-20220908-0035
IN-20220609-57117 IN-20220609-59383
IN-20220609-57130 IN-20220908-0063
IN-20220609-57131 IN-20220908-0065
IN-20220609-57132 IN-20220609-10337
IN-20220609-52464 IN-20220916-0064
IN-20220609-52465 IN-20220917-0002
IN-20220609-52466 IN-20220917-0001
IN-20220609-52467 IN-20220917-0005
IN-20220609-52468 IN-20220917-0016
IN-20220609-52469 IN-20220917-0014
IN-20220609-52470 IN-20220917-0017
IN-20220609-53373 IN-20220917-0009
IN-20220609-53374 IN-20220917-0018
IN-20220609-53375 IN-20220917-0020
IN-20220609-53376 IN-20220917-0019
IN-20220609-53384 IN-20220917-0023
IN-20220609-53385 IN-20220917-0025
IN-20220609-53386 IN-20220917-0029
IN-20220609-53810 IN-20220917-0036
IN-20220609-57120 IN-20220917-0033
IN-20220609-52476 IN-20220917-0059
IN-20220609-52477 IN-20220917-0063
IN-20220609-52478 IN-20220917-0060
IN-20220609-52479 IN-20220917-0065
IN-20220609-52480 IN-20220917-0070
IN-20220609-52481 IN-20220917-0045
IN-20220609-52482 IN-20220917-0047
IN-20220609-53298 IN-20220917-0041
IN-20220609-53299 IN-20220917-0098
IN-20220609-53300 IN-20220917-0111
IN-20220609-53301 IN-20220917-0108
IN-20220609-53365 IN-20220917-0121
IN-20220609-53366 IN-20220917-0115
IN-20220609-53392 IN-20220917-0075
IN-20220609-53811 IN-20220917-0132
IN-20220609-53813 IN-20220917-0134
IN-20220609-53815 IN-20220917-0131
IN-20220609-53816 IN-20220917-0135
IN-20220609-53817 IN-20220917-0133
IN-20220609-53819 IN-20220917-0136
IN-20220609-53821 IN-20220917-0137
IN-20220609-57121 IN-20220916-0073
IN-20220609-57122 IN-20220916-0074
IN-20220609-57123 IN-20220916-0072
IN-20220609-57320 IN-20220916-0075
IN-20220609-57321 IN-20220916-0077
IN-20220609-57322 IN-20220916-0080
IN-20220609-57323 IN-20220916-0079
IN-20220609-53832 IN-20220916-0076
IN-20220609-53838 IN-20220916-0078
IN-20220609-53843 IN-20220916-0082
IN-20220609-53845 IN-20220916-0083
IN-20220609-57124 IN-20220916-0084
IN-20220609-57125 IN-20220916-0085
IN-20220609-57126 IN-20220609-59483
IN-20220609-57133 IN-20220609-59678
IN-20220609-53851 IN-20220609-59679
IN-20220609-53859 IN-20220609-14391
IN-20220609-53861 IN-20220609-14390
IN-20220609-57127 IN-20220609-60259
IN-20220609-57140 IN-20220609-47854
IN-20220609-57141 IN-20220608-43350
IN-20220609-53873 IN-20220608-43347
IN-20220609-57128 IN-20220608-43344
IN-20220609-53876 IN-20220608-43342
IN-20220609-57129 IN-20220608-43352
IN-20220609-53864 IN-20220609-32893
IN-20220609-53867 IN-20220609-20296
IN-20220609-53871 IN-20220608-55468
IN-20220609-57134 IN-20220608-41542
IN-20220609-57139 IN-20220608-41539
IN-20220609-57142 IN-20220609-22024
IN-20220609-53755 IN-20220609-59249
IN-20220609-53758 IN-20220609-60459
IN-20220609-53760 IN-20220608-54302
IN-20220609-57135 IN-20220609-3356
IN-20220609-57227 IN-20220609-3457
IN-20220609-57228 IN-20220609-3472
IN-20220609-52471 IN-20220609-3473
IN-20220609-52472 IN-20220609-3496
IN-20220609-52473 IN-20220915-0050
IN-20220609-52474 IN-20220609-37611
IN-20220609-52475 IN-20220609-37606
IN-20220609-53387 IN-20220609-37605
IN-20220609-53388 IN-20220609-37604
IN-20220609-53389 IN-20220609-37603
IN-20220609-53390 IN-20220909-0018
IN-20220609-53391 IN-20220909-0015
IN-20220609-53764 IN-20220608-42332
IN-20220609-53767 IN-20220609-48020
IN-20220609-53768 IN-20220908-0030
IN-20220609-53772 IN-20220908-0028
IN-20220609-57136 IN-20220609-59597
IN-20220609-57137 IN-20220609-59485
IN-20220609-57231 IN-20220609-59484
IN-20220609-57254 IN-20220609-18229
IN-20220609-54215 IN-20220608-54592
IN-20220609-57147 IN-20220608-54562
IN-20220609-54225 IN-20220609-47975
IN-20220609-54226 IN-20220609-34400
IN-20220609-54242 IN-20220609-33028
IN-20220609-54244 IN-20220609-32959
IN-20220609-54246 IN-20220609-18224
IN-20220609-57148 IN-20220609-12201
IN-20220609-57149 IN-20220609-18079
IN-20220609-57263 IN-20220609-12200
IN-20220609-57264 IN-20220608-55382
IN-20220609-57265 IN-20220608-55379
IN-20220609-54158 IN-20220608-55378
IN-20220609-54159 IN-20220608-55377
IN-20220609-57150 IN-20220608-55376
IN-20220609-57339 IN-20220608-55375
IN-20220609-54160 IN-20220608-55374
IN-20220609-54161 IN-20220608-55373
IN-20220609-54162 IN-20220608-55372
IN-20220609-54163 IN-20220608-55371
IN-20220609-54164 IN-20220608-55370
IN-20220609-57151 IN-20220608-55368
IN-20220609-57152 IN-20220608-55367
IN-20220609-57153 IN-20220608-55366
IN-20220609-57291 IN-20220608-55364
IN-20220609-57292 IN-20220608-55362
IN-20220609-54165 IN-20220608-55358
IN-20220609-54167 IN-20220608-55243
IN-20220609-54168 IN-20220608-55242
IN-20220609-57154 IN-20220608-55241
IN-20220609-57247 IN-20220608-55240
IN-20220609-57248 IN-20220608-55239
IN-20220609-54175 IN-20220608-55237
IN-20220609-54178 IN-20220608-55236
IN-20220609-57155 IN-20220608-55235
IN-20220609-57177 IN-20220608-55234
IN-20220609-11997 IN-20220608-55233
IN-20220609-13154 IN-20220608-55232
IN-20220609-54222 IN-20220608-55231
IN-20220609-54227 IN-20220608-55230
IN-20220609-54228 IN-20220608-55229
IN-20220609-54229 IN-20220608-55228
IN-20220609-57156 IN-20220608-55227
IN-20220609-57157 IN-20220608-55226
IN-20220609-57158 IN-20220608-55225
IN-20220609-57324 IN-20220608-55224
IN-20220609-54253 IN-20220608-55223
IN-20220609-54257 IN-20220608-55222
IN-20220609-54262 IN-20220608-55219
IN-20220609-54267 IN-20220608-55218
IN-20220609-54269 IN-20220608-55217
IN-20220609-54270 IN-20220608-55216
IN-20220609-54277 IN-20220608-55215
IN-20220609-54278 IN-20220608-55214
IN-20220609-57159 IN-20220608-55213
IN-20220609-57276 IN-20220608-55212
IN-20220609-57277 IN-20220608-55211
IN-20220609-57279 IN-20220608-55210
IN-20220609-57280 IN-20220608-55209
IN-20220609-57281 IN-20220608-55208
IN-20220609-57295 IN-20220608-55207
IN-20220609-57328 IN-20220608-55206
IN-20220609-54280 IN-20220608-55205
IN-20220609-54290 IN-20220608-55204
IN-20220609-54295 IN-20220608-55203
IN-20220609-54297 IN-20220608-55202
IN-20220609-54299 IN-20220608-55201
IN-20220609-54302 IN-20220608-55200
IN-20220609-54303 IN-20220608-55199
IN-20220609-57160 IN-20220608-54616
IN-20220609-57167 IN-20220608-54598
IN-20220609-57169 IN-20220608-43197
IN-20220609-57297 IN-20220608-43191
IN-20220609-57304 IN-20220608-43188
IN-20220609-57331 IN-20220608-39499
IN-20220609-13051 IN-20220608-39494
IN-20220609-45017 IN-20220608-39484
IN-20220609-54281 IN-20220609-47975
IN-20220609-54282 IN-20220609-34400
IN-20220609-54283 IN-20220609-33028
IN-20220609-54284 IN-20220609-32959
IN-20220609-57161 IN-20220609-18224
IN-20220609-57162 IN-20220609-12201
IN-20220609-57163 IN-20220609-18079
IN-20220609-57164 IN-20220609-12200
IN-20220609-12030 IN-20220608-55382
IN-20220609-12033 IN-20220608-55379
IN-20220609-12037 IN-20220608-55378
IN-20220609-28374 IN-20220608-55377
IN-20220609-28375 IN-20220608-55376
IN-20220609-54285 IN-20220608-55375
IN-20220609-54289 IN-20220608-55374
IN-20220609-54291 IN-20220608-55373
IN-20220609-54296 IN-20220608-55372
IN-20220609-54298 IN-20220608-55371
IN-20220609-54300 IN-20220608-55370
IN-20220609-54301 IN-20220608-55368
IN-20220609-56421 IN-20220608-55367
IN-20220609-57165 IN-20220608-55366
IN-20220609-57166 IN-20220608-55364
IN-20220609-57168 IN-20220608-55362
IN-20220609-57296 IN-20220608-55358
IN-20220609-57302 IN-20220608-55243
IN-20220609-57303 IN-20220608-55242
IN-20220609-57332 IN-20220608-55241
IN-20220609-3938 IN-20220608-55240
IN-20220609-4205 IN-20220608-55239
IN-20220609-13074 IN-20220608-55237
IN-20220609-45037 IN-20220608-55236
IN-20220609-54305 IN-20220608-55235
IN-20220609-57170 IN-20220608-55234
IN-20220609-3790 IN-20220608-55233
IN-20220609-4057 IN-20220608-55232
IN-20220609-4717 IN-20220608-55231
IN-20220609-4718 IN-20220608-55230
IN-20220609-44969 IN-20220608-55229
IN-20220609-44970 IN-20220608-55228
IN-20220609-54304 IN-20220608-55227
IN-20220609-54306 IN-20220608-55226
IN-20220609-54307 IN-20220608-55225
IN-20220609-54308 IN-20220608-55224
IN-20220609-54309 IN-20220608-55223
IN-20220609-54310 IN-20220608-55222
IN-20220609-57171 IN-20220608-55219
IN-20220609-57172 IN-20220608-55218
IN-20220609-57173 IN-20220608-55217
IN-20220609-57174 IN-20220608-55216
IN-20220609-57175 IN-20220608-55215
IN-20220609-57305 IN-20220608-55214
IN-20220609-13075 IN-20220608-55213
IN-20220609-45038 IN-20220608-55212
IN-20220609-54311 IN-20220608-55211
IN-20220609-54312 IN-20220608-55210
IN-20220609-57176 IN-20220608-55209
IN-20220609-57187 IN-20220608-55208
IN-20220609-54179 IN-20220608-55207
IN-20220609-54180 IN-20220608-55206
IN-20220609-57178 IN-20220608-55205
IN-20220609-57179 IN-20220608-55204
IN-20220609-54181 IN-20220608-55203
IN-20220609-54182 IN-20220608-55202
IN-20220609-54183 IN-20220608-55201
IN-20220609-57180 IN-20220608-55200
IN-20220609-57181 IN-20220608-55199
IN-20220609-57182 IN-20220608-54616
IN-20220609-54216 IN-20220608-54598
IN-20220609-54218 IN-20220608-43197
IN-20220609-54219 IN-20220608-43191
IN-20220609-57183 IN-20220608-43188
IN-20220609-57184 IN-20220608-39499
IN-20220609-57185 IN-20220608-39494
IN-20220609-54220 IN-20220608-39484
IN-20220609-54221 IN-20220609-47982
IN-20220609-54223 IN-20220609-47981
IN-20220609-54224 IN-20220609-47980
IN-20220609-57186 IN-20220609-47979
IN-20220609-57325 IN-20220609-47976
IN-20220609-57329 IN-20220609-35028
IN-20220609-57330 IN-20220609-35027
IN-20220609-13076 IN-20220609-35026
IN-20220609-45039 IN-20220609-12204
IN-20220609-52539 IN-20220609-12191
IN-20220609-52804 IN-20220608-42576
IN-20220609-54313 IN-20220608-42443
IN-20220609-57188 IN-20220608-42438
IN-20220609-54314 IN-20220608-42437
IN-20220609-56620 IN-20220608-42435
IN-20220609-56622 IN-20220608-42433
IN-20220609-56623 IN-20220608-42432
IN-20220609-57189 IN-20220608-42430
IN-20220609-57705 IN-20220608-42427
IN-20220609-57727 IN-20220608-42425
IN-20220609-57728 IN-20220608-42423
IN-20220609-11542 IN-20220608-39978
IN-20220609-14999 IN-20220608-39966
IN-20220609-44990 IN-20220609-52555
IN-20220609-45106 IN-20220609-52552
IN-20220609-50733 IN-20220609-52551
IN-20220609-52549 IN-20220609-50769
IN-20220609-52764 IN-20220609-50764
IN-20220609-52847 IN-20220609-50758
IN-20220609-54315 IN-20220609-50729
IN-20220609-57190 IN-20220609-50718
IN-20220609-54184 IN-20220609-34443
IN-20220609-54185 IN-20220609-34442
IN-20220609-54186 IN-20220609-34441
IN-20220609-57191 IN-20220609-34440
IN-20220609-57192 IN-20220608-55304
IN-20220609-57193 IN-20220608-55303
IN-20220609-54197 IN-20220608-55302
IN-20220609-54199 IN-20220608-55279
IN-20220609-54200 IN-20220608-55248
IN-20220609-54202 IN-20220608-55247
IN-20220609-54203 IN-20220608-55246
IN-20220609-54204 IN-20220608-55245
IN-20220609-54206 IN-20220608-55174
IN-20220609-54207 IN-20220608-55173
IN-20220609-54208 IN-20220608-38262
IN-20220609-54209 IN-20220608-38259
IN-20220609-54210 IN-20220608-37976
IN-20220609-54211 IN-20220608-37950
IN-20220609-54212 IN-20220608-37911
IN-20220609-54213 IN-20220608-39356
IN-20220609-57194 IN-20220608-35628
IN-20220609-57195 IN-20220609-15019
IN-20220609-57196 IN-20220609-33035
IN-20220609-57197 IN-20220609-32966
IN-20220609-57198 IN-20220608-44974
IN-20220609-57298 IN-20220608-44971
IN-20220609-57299 IN-20220608-44970
IN-20220609-57300 IN-20220608-44967
IN-20220609-57301 IN-20220608-35743
IN-20220609-57306 IN-20220608-34966
IN-20220609-57318 IN-20220608-42418
IN-20220609-57319 IN-20220609-33009
IN-20220609-57326 IN-20220609-32905
IN-20220609-57327 IN-20220929-0113
IN-20220609-54061 IN-20220609-59381
IN-20220609-54062 IN-20220609-3520
IN-20220609-54063 IN-20220609-59994
IN-20220609-57199 IN-20220609-59995
IN-20220609-57200 IN-20220609-60094
IN-20220609-57315 IN-20220609-60095
IN-20220609-54064 IN-20220609-60096
IN-20220609-54065 IN-20220609-60097
IN-20220609-57201 IN-20220609-60098
IN-20220609-57202 IN-20220609-60099
IN-20220609-54066 IN-20220609-60100
IN-20220609-54067 IN-20220609-60101
IN-20220609-54068 IN-20220609-60102
IN-20220609-54069 IN-20220609-60103
IN-20220609-54070 IN-20220609-60104
IN-20220609-57203 IN-20220609-60105
IN-20220609-57204 IN-20220609-60106
IN-20220609-57205 IN-20220609-60107
IN-20220609-57206 IN-20220609-12058
IN-20220609-57207 IN-20220609-60009
IN-20220609-54021 IN-20220608-39481
IN-20220609-54022 IN-20220609-60137
IN-20220609-54023 IN-20220609-60138
IN-20220609-54024 IN-20220609-59668
IN-20220609-57208 IN-20220609-60377
IN-20220609-57209 IN-20220609-60378
IN-20220609-57310 IN-20220609-60379
IN-20220609-57311 IN-20220609-47139
IN-20220609-49671 IN-20220609-59368
IN-20220609-54025 IN-20220609-59416
IN-20220609-57210 IN-20220609-47142
IN-20220609-54026 IN-20220609-3196
IN-20220609-54032 IN-20220609-32982
IN-20220609-54040 IN-20220609-47159
IN-20220609-57211 IN-20220609-47161
IN-20220609-57212 IN-20220609-47155
IN-20220609-57213 IN-20220609-47157
IN-20220609-54052 IN-20220609-47149
IN-20220609-57216 IN-20220609-47150
IN-20220609-54071 IN-20220609-47152
IN-20220609-54072 IN-20220609-47154
IN-20220609-54073 IN-20220609-47301
IN-20220609-54074 IN-20220609-59367
IN-20220609-57217 IN-20220609-60233
IN-20220609-57218 IN-20220609-47137
IN-20220609-57219 IN-20220609-47138
IN-20220609-57220 IN-20220609-47163
IN-20220609-54075 IN-20220608-35724
IN-20220609-54076 IN-20220609-47176
IN-20220609-54077 IN-20220609-47177
IN-20220609-57221 IN-20220609-47178
IN-20220609-57222 IN-20220609-47179
IN-20220609-57223 IN-20220609-47180
IN-20220609-54078 IN-20220609-47852
IN-20220609-54090 IN-20220609-47166
IN-20220609-57224 IN-20220609-47168
IN-20220609-57225 IN-20220609-47957
IN-20220609-53753 IN-20220609-47851
IN-20220609-57226 IN-20220609-47173
IN-20220609-53762 IN-20220609-47175
IN-20220609-53763 IN-20220609-47853
IN-20220609-57229 IN-20220824-0054
IN-20220609-57230 IN-20220609-47183
IN-20220609-53888 IN-20220608-37881
IN-20220609-53890 IN-20220608-40098
IN-20220609-57232 IN-20220608-40099
IN-20220609-57317 IN-20220608-41543
IN-20220609-53891 IN-20220608-41544
IN-20220609-53892 IN-20220609-32897
IN-20220609-57233 IN-20220609-3478
IN-20220609-57234 IN-20220609-60383
IN-20220609-53893 IN-20220609-60384
IN-20220609-53894 IN-20220609-60385
IN-20220609-53895 IN-20220609-60386
IN-20220609-57235 IN-20220609-47144
IN-20220609-57236 IN-20220609-47146
IN-20220609-57237 IN-20220609-47148
IN-20220609-53896 IN-20220609-47195
IN-20220609-53897 IN-20220609-47191
IN-20220609-57238 IN-20220609-47193
IN-20220609-57239 IN-20220609-47169
IN-20220609-54091 IN-20220609-47170
IN-20220609-54094 IN-20220609-47172
IN-20220609-54097 IN-20220609-47306
IN-20220609-57240 IN-20220609-49926
IN-20220609-57268 IN-20220609-60413
IN-20220609-57269 IN-20220609-60414
IN-20220609-54110 IN-20220609-60415
IN-20220609-54111 IN-20220609-47856
IN-20220609-57241 IN-20220609-47857
IN-20220609-57275 IN-20220609-47858
IN-20220609-54112 IN-20220609-60022
IN-20220609-54113 IN-20220609-60588
IN-20220609-54114 IN-20220609-47204
IN-20220609-54115 IN-20220609-47206
IN-20220609-57242 IN-20220609-47209
IN-20220609-57243 IN-20220609-47210
IN-20220609-57244 IN-20220609-47220
IN-20220609-57245 IN-20220609-47221
IN-20220609-54116 IN-20220609-47222
IN-20220609-57246 IN-20220609-59372
IN-20220609-52483 IN-20220609-59454
IN-20220609-53367 IN-20220609-47185
IN-20220609-54169 IN-20220609-47187
IN-20220609-54170 IN-20220609-47188
IN-20220609-54171 IN-20220609-47224
IN-20220609-57249 IN-20220609-47225
IN-20220609-57258 IN-20220609-47141
IN-20220609-57259 IN-20220609-47226
IN-20220609-53880 IN-20220609-47227
IN-20220609-53884 IN-20220609-47228
IN-20220609-57250 IN-20220608-35066
IN-20220609-57316 IN-20220609-3491
IN-20220609-54120 IN-20220609-47871
IN-20220609-54124 IN-20220609-20327
IN-20220609-54125 IN-20220608-42491
IN-20220609-54126 IN-20220608-42490
IN-20220609-57251 IN-20220608-42489
IN-20220609-57333 IN-20220608-38735
IN-20220609-57334 IN-20220608-40064
IN-20220609-57335 IN-20220608-54884
IN-20220609-54130 IN-20220608-54885
IN-20220609-54131 IN-20220608-54886
IN-20220609-57252 IN-20220608-54887
IN-20220609-57253 IN-20220608-54888
IN-20220609-54172 IN-20220609-32892
IN-20220609-57260 IN-20220609-47232
IN-20220609-54173 IN-20220609-60477
IN-20220609-54174 IN-20220609-60478
IN-20220609-57261 IN-20220609-60479
IN-20220609-57262 IN-20220608-40165
IN-20220609-54146 IN-20220608-43024
IN-20220609-54148 IN-20220608-43029
IN-20220609-54149 IN-20220608-43030
IN-20220609-54150 IN-20220608-43032
IN-20220609-57266 IN-20220608-43033
IN-20220609-57267 IN-20220608-43035
IN-20220609-57285 IN-20220609-33036
IN-20220609-57336 IN-20220609-60005
IN-20220609-54098 IN-20220608-34957
IN-20220609-54102 IN-20220609-47201
IN-20220609-57270 IN-20220609-3400
IN-20220609-57271 IN-20220609-47273
IN-20220609-54106 IN-20220609-47198
IN-20220609-54108 IN-20220609-47199
IN-20220609-54109 IN-20220609-47200
IN-20220609-57272 IN-20220609-47203
IN-20220609-57273 IN-20220609-47318
IN-20220609-57274 IN-20220609-34403
IN-20220609-13050 IN-20220609-60039
IN-20220609-15122 IN-20220609-60637
IN-20220609-45016 IN-20220609-47482
IN-20220609-54261 IN-20220609-47487
IN-20220609-54271 IN-20220609-34086
IN-20220609-54272 IN-20220609-34087
IN-20220609-54274 IN-20220609-34804
IN-20220609-54275 IN-20220609-28519
IN-20220609-54276 IN-20220609-28609
IN-20220609-57278 IN-20220609-28610
IN-20220609-57282 IN-20220609-28611
IN-20220609-57283 IN-20220609-28612
IN-20220609-57284 IN-20220609-28613
IN-20220609-57293 IN-20220608-39833
IN-20220609-57294 IN-20220608-39837
IN-20220609-54151 IN-20220608-41483
IN-20220609-54152 IN-20220608-56165
IN-20220609-54153 IN-20220609-17172
IN-20220609-54154 IN-20220609-32884
IN-20220609-57286 IN-20220609-47246
IN-20220609-57287 IN-20220609-47247
IN-20220609-57288 IN-20220609-47239
IN-20220609-57337 IN-20220609-47240
IN-20220609-54155 IN-20220609-47863
IN-20220609-57289 IN-20220609-47864
IN-20220609-54156 IN-20220609-47865
IN-20220609-54157 IN-20220609-47866
IN-20220609-57290 IN-20220609-47867
IN-20220609-57338 IN-20220609-47960
IN-20220609-54019 IN-20220609-47861
IN-20220609-54020 IN-20220609-47862
IN-20220609-57308 IN-20220609-36889
IN-20220609-57309 IN-20220609-47886
IN-20220609-54055 IN-20220609-47153
IN-20220609-54059 IN-20220609-47156
IN-20220609-54060 IN-20220609-47158
IN-20220609-57312 IN-20220609-34425
IN-20220609-57313 IN-20220609-60359
IN-20220609-57314 IN-20220609-60361
IN-20220609-11551 IN-20220609-60362
IN-20220609-13079 IN-20220609-4901
IN-20220609-44995 IN-20220609-49866
IN-20220609-45042 IN-20220609-49867
IN-20220609-52538 IN-20220609-47242
IN-20220609-52900 IN-20220609-34417
IN-20220609-55170 IN-20220609-47192
IN-20220609-55171 IN-20220609-47194
IN-20220609-55172 IN-20220609-47377
IN-20220609-55174 IN-20220609-47386
IN-20220609-55177 IN-20220609-47390
IN-20220609-57354 IN-20220608-38707
IN-20220609-57355 IN-20220608-39952
IN-20220609-57356 IN-20220608-39953
IN-20220609-57357 IN-20220608-39954
IN-20220609-57358 IN-20220608-39955
IN-20220609-12820 IN-20220608-39956
IN-20220609-55233 IN-20220608-39957
IN-20220609-57366 IN-20220608-44964
IN-20220609-55234 IN-20220609-47279
IN-20220609-55235 IN-20220609-47285
IN-20220609-55236 IN-20220609-33037
IN-20220609-55238 IN-20220609-60481
IN-20220609-55244 IN-20220609-60482
IN-20220609-55245 IN-20220609-60483
IN-20220609-56094 IN-20220608-37560
IN-20220609-56116 IN-20220608-37561
IN-20220609-56117 IN-20220609-59377
IN-20220609-57367 IN-20220609-59465
IN-20220609-57368 IN-20220609-47233
IN-20220609-57369 IN-20220609-47234
IN-20220609-57370 IN-20220609-47236
IN-20220609-57371 IN-20220609-47859
IN-20220609-57372 IN-20220609-47181
IN-20220609-57812 IN-20220609-47243
IN-20220609-57813 IN-20220609-60045
IN-20220609-57835 IN-20220609-60046
IN-20220609-54686 IN-20220609-60047
IN-20220609-57374 IN-20220609-60048
IN-20220609-13077 IN-20220609-60049
IN-20220609-45040 IN-20220609-60658
IN-20220609-54687 IN-20220609-60659
IN-20220609-55165 IN-20220609-60660
IN-20220609-55166 IN-20220609-60661
IN-20220609-55167 IN-20220609-60662
IN-20220609-55168 IN-20220609-60663
IN-20220609-57375 IN-20220609-60664
IN-20220609-57380 IN-20220609-47544
IN-20220609-57381 IN-20220609-47546
IN-20220609-57382 IN-20220609-47868
IN-20220609-57383 IN-20220609-47869
IN-20220609-13078 IN-20220609-47870
IN-20220609-45041 IN-20220609-47196
IN-20220609-55169 IN-20220609-47319
IN-20220609-56679 IN-20220609-36895
IN-20220609-57384 IN-20220609-36896
IN-20220609-57658 IN-20220609-47374
IN-20220609-12767 IN-20220609-47315
IN-20220609-12768 IN-20220609-47248
IN-20220609-49929 IN-20220609-47382
IN-20220609-55055 IN-20220609-47385
IN-20220609-56423 IN-20220609-47387
IN-20220609-57396 IN-20220609-47287
IN-20220609-56886 IN-20220609-47293
IN-20220609-56887 IN-20220609-47846
IN-20220609-56888 IN-20220608-42611
IN-20220609-57413 IN-20220609-47379
IN-20220609-57650 IN-20220609-47354
IN-20220609-57651 IN-20220609-47496
IN-20220609-56895 IN-20220609-47422
IN-20220609-56899 IN-20220609-47184
IN-20220609-56900 IN-20220609-47186
IN-20220609-57415 IN-20220609-47189
IN-20220609-57416 IN-20220609-47335
IN-20220609-57417 IN-20220609-28535
IN-20220609-56902 IN-20220609-28649
IN-20220609-56904 IN-20220609-47878
IN-20220609-57418 IN-20220609-47879
IN-20220609-57419 IN-20220609-54450
IN-20220609-56775 IN-20220609-54465
IN-20220609-56776 IN-20220609-54478
IN-20220609-56779 IN-20220609-54481
IN-20220609-56782 IN-20220609-54485
IN-20220609-56784 IN-20220609-47338
IN-20220609-56786 IN-20220609-47350
IN-20220609-56788 IN-20220609-46355
IN-20220609-57455 IN-20220609-47358
IN-20220609-57457 IN-20220609-47361
IN-20220609-57458 IN-20220609-47363
IN-20220609-57460 IN-20220608-37890
IN-20220609-57806 IN-20220609-60416
IN-20220609-57807 IN-20220609-47415
IN-20220609-57817 IN-20220609-47323
IN-20220609-13053 IN-20220609-60388
IN-20220609-45019 IN-20220609-60389
IN-20220609-56758 IN-20220609-60390
IN-20220609-56763 IN-20220609-47531
IN-20220609-56765 IN-20220609-47541
IN-20220609-56767 IN-20220609-56130
IN-20220609-56769 IN-20220609-56223
IN-20220609-56771 IN-20220609-56224
IN-20220609-56772 IN-20220609-56225
IN-20220609-56773 IN-20220609-56226
IN-20220609-56777 IN-20220609-56228
IN-20220609-56778 IN-20220609-56230
IN-20220609-56780 IN-20220609-47565
IN-20220609-56781 IN-20220609-47872
IN-20220609-56783 IN-20220609-47873
IN-20220609-56785 IN-20220608-42988
IN-20220609-56787 IN-20220609-60132
IN-20220609-56789 IN-20220609-47143
IN-20220609-56792 IN-20220609-47145
IN-20220609-56794 IN-20220609-47147
IN-20220609-56795 IN-20220609-47331
IN-20220609-57456 IN-20220609-56118
IN-20220609-57459 IN-20220609-56120
IN-20220609-57461 IN-20220609-56121
IN-20220609-57464 IN-20220609-60419
IN-20220609-57618 IN-20220609-47551
IN-20220609-57628 IN-20220609-47554
IN-20220609-57630 IN-20220609-47298
IN-20220609-57632 IN-20220609-47389
IN-20220609-57634 IN-20220609-47160
IN-20220609-57635 IN-20220609-47860
IN-20220609-57636 IN-20220608-43021
IN-20220609-57814 IN-20220608-43022
IN-20220609-57815 IN-20220608-43025
IN-20220609-57816 IN-20220608-43027
IN-20220609-57818 IN-20220609-47249
IN-20220609-57819 IN-20220609-47256
IN-20220609-57821 IN-20220609-47877
IN-20220609-57822 IN-20220609-47392
IN-20220609-57860 IN-20220609-47165
IN-20220609-56790 IN-20220609-47167
IN-20220609-56791 IN-20220609-59387
IN-20220609-57462 IN-20220609-59388
IN-20220609-57463 IN-20220609-59489
IN-20220609-56798 IN-20220609-47197
IN-20220609-56801 IN-20220609-47202
IN-20220609-56803 IN-20220609-47205
IN-20220609-56804 IN-20220609-47207
IN-20220609-56805 IN-20220609-47211
IN-20220609-57478 IN-20220609-34964
IN-20220609-57479 IN-20220609-34965
IN-20220609-57846 IN-20220609-45365
IN-20220609-57847 IN-20220609-45367
IN-20220609-57848 IN-20220609-59379
IN-20220609-56807 IN-20220609-59481
IN-20220609-56810 IN-20220609-47520
IN-20220609-57480 IN-20220609-47522
IN-20220609-57483 IN-20220609-47844
IN-20220609-56806 IN-20220609-47408
IN-20220609-56808 IN-20220609-47413
IN-20220609-56809 IN-20220609-47368
IN-20220609-57481 IN-20220609-47885
IN-20220609-57482 IN-20220609-47403
IN-20220609-57849 IN-20220609-47264
IN-20220609-56811 IN-20220609-48066
IN-20220609-56819 IN-20220609-47341
IN-20220609-56841 IN-20220609-35274
IN-20220609-57484 IN-20220609-35275
IN-20220609-57502 IN-20220609-35276
IN-20220609-57714 IN-20220609-47882
IN-20220609-58152 IN-20220609-47883
IN-20220609-58153 IN-20220609-47151
IN-20220609-58154 IN-20220609-47558
IN-20220609-58155 IN-20220609-47559
IN-20220609-15032 IN-20220609-60192
IN-20220609-45115 IN-20220609-60705
IN-20220609-56812 IN-20220609-47268
IN-20220609-56813 IN-20220609-47275
IN-20220609-56814 IN-20220609-47432
IN-20220609-56815 IN-20220609-47440
IN-20220609-56816 IN-20220609-47880
IN-20220609-56817 IN-20220609-47881
IN-20220609-56818 IN-20220609-27291
IN-20220609-57485 IN-20220609-27292
IN-20220609-57486 IN-20220609-27295
IN-20220609-57487 IN-20220609-47445
IN-20220609-57488 IN-20220609-47451
IN-20220609-57850 IN-20220609-47454
IN-20220609-57851 IN-20220609-47407
IN-20220609-57852 IN-20220609-47574
IN-20220608-36694 IN-20220609-47538
IN-20220608-37468 IN-20220609-59480
IN-20220608-42005 IN-20220609-59690
IN-20220608-42009 IN-20220609-5064
IN-20220608-42011 IN-20220609-32927
IN-20220608-42013 IN-20220609-33046
IN-20220608-42016 IN-20220609-60122
IN-20220608-47267 IN-20220609-47414
IN-20220608-47268 IN-20220609-47289
IN-20220608-47269 IN-20220609-59374
IN-20220608-47270 IN-20220609-59456
IN-20220608-47271 IN-20220609-47897
IN-20220608-47272 IN-20220609-47898
IN-20220608-47273 IN-20220609-47961
IN-20220608-47274 IN-20220609-47430
IN-20220608-47275 IN-20220609-47438
IN-20220608-47276 IN-20220609-47381
IN-20220608-47277 IN-20220609-47384
IN-20220608-47279 IN-20220609-47578
IN-20220608-47280 IN-20220609-47447
IN-20220608-47281 IN-20220609-47260
IN-20220608-47282 IN-20220608-35153
IN-20220608-47283 IN-20220608-35154
IN-20220608-47284 IN-20220608-35216
IN-20220608-47286 IN-20220608-35218
IN-20220608-47287 IN-20220608-35219
IN-20220608-47288 IN-20220608-35221
IN-20220608-47289 IN-20220608-35222
IN-20220608-47290 IN-20220608-35225
IN-20220608-47291 IN-20220608-35226
IN-20220608-47292 IN-20220608-35227
IN-20220608-47293 IN-20220608-35228
IN-20220608-47294 IN-20220608-35232
IN-20220608-47295 IN-20220608-35234
IN-20220608-47296 IN-20220608-35236
IN-20220608-47297 IN-20220608-35239
IN-20220608-47299 IN-20220608-35241
IN-20220608-47300 IN-20220608-35242
IN-20220608-47301 IN-20220608-35243
IN-20220608-47302 IN-20220608-35245
IN-20220608-47304 IN-20220608-35247
IN-20220608-47351 IN-20220608-35250
IN-20220608-47352 IN-20220608-35252
IN-20220608-47353 IN-20220608-35254
IN-20220608-47354 IN-20220608-35960
IN-20220608-47355 IN-20220608-35961
IN-20220608-47356 IN-20220608-35963
IN-20220608-47357 IN-20220608-35964
IN-20220608-47358 IN-20220608-35965
IN-20220608-47359 IN-20220608-35966
IN-20220608-47360 IN-20220608-35967
IN-20220608-47361 IN-20220608-35969
IN-20220608-47362 IN-20220608-35970
IN-20220608-47363 IN-20220608-35974
IN-20220608-47364 IN-20220608-35975
IN-20220608-47365 IN-20220608-35976
IN-20220608-47366 IN-20220608-35977
IN-20220608-47367 IN-20220608-35978
IN-20220608-47368 IN-20220608-35979
IN-20220608-47369 IN-20220608-35980
IN-20220608-47370 IN-20220608-35981
IN-20220608-47371 IN-20220608-35982
IN-20220608-47372 IN-20220608-35983
IN-20220608-47373 IN-20220608-35984
IN-20220608-47374 IN-20220608-35985
IN-20220608-47375 IN-20220608-35986
IN-20220608-47376 IN-20220608-35988
IN-20220608-47377 IN-20220608-35991
IN-20220608-47378 IN-20220608-36062
IN-20220608-47379 IN-20220608-45591
IN-20220608-47380 IN-20220609-47418
IN-20220608-47381 IN-20220609-47471
IN-20220608-47382 IN-20220609-47474
IN-20220608-47383 IN-20220609-47470
IN-20220609-4339 IN-20220609-47396
IN-20220609-4385 IN-20220609-47399
IN-20220609-5165 IN-20220609-47402
IN-20220609-13416 IN-20220609-47485
IN-20220609-14589 IN-20220609-47493
IN-20220609-28984 IN-20220609-47500
IN-20220609-30929 IN-20220609-35181
IN-20220609-32870 IN-20220609-35182
IN-20220609-53909 IN-20220609-35183
IN-20220609-56488 IN-20220609-35184
IN-20220609-57500 IN-20220609-35185
IN-20220609-56152 IN-20220609-35186
IN-20220609-56153 IN-20220609-35187
IN-20220609-56154 IN-20220609-47891
IN-20220609-56155 IN-20220609-47892
IN-20220609-56158 IN-20220609-47893
IN-20220609-56159 IN-20220609-47542
IN-20220609-56162 IN-20220609-47321
IN-20220609-57505 IN-20220609-47327
IN-20220609-57506 IN-20220609-60033
IN-20220609-57507 IN-20220609-60123
IN-20220609-57524 IN-20220609-60633
IN-20220609-57525 IN-20220608-37904
IN-20220609-57526 IN-20220608-42998
IN-20220609-57527 IN-20220608-43000
IN-20220609-56264 IN-20220608-43003
IN-20220609-56267 IN-20220609-33026
IN-20220609-56270 IN-20220609-47504
IN-20220609-56274 IN-20220609-47449
IN-20220609-57522 IN-20220609-47452
IN-20220609-57523 IN-20220609-47359
IN-20220609-57600 IN-20220608-40095
IN-20220609-57601 IN-20220608-40097
IN-20220609-56166 IN-20220608-43415
IN-20220609-56172 IN-20220608-43417
IN-20220609-56173 IN-20220609-32962
IN-20220609-56174 IN-20220609-60056
IN-20220609-57528 IN-20220609-60714
IN-20220609-57542 IN-20220609-60715
IN-20220609-57543 IN-20220609-60716
IN-20220609-57544 IN-20220609-60718
IN-20220609-56193 IN-20220609-60719
IN-20220609-56197 IN-20220609-60720
IN-20220609-56201 IN-20220609-47906
IN-20220609-56203 IN-20220609-47907
IN-20220609-56204 IN-20220609-34796
IN-20220609-56206 IN-20220609-34797
IN-20220609-56207 IN-20220609-46259
IN-20220609-57560 IN-20220609-46261
IN-20220609-57743 IN-20220609-46263
IN-20220609-57744 IN-20220609-46264
IN-20220609-57757 IN-20220609-46265
IN-20220609-57758 IN-20220609-46266
IN-20220609-57773 IN-20220609-46267
IN-20220609-57774 IN-20220609-47543
IN-20220609-56209 IN-20220609-47889
IN-20220609-56210 IN-20220609-47501
IN-20220609-56211 IN-20220609-47259
IN-20220609-56212 IN-20220609-47261
IN-20220609-57562 IN-20220609-47292
IN-20220609-57563 IN-20220609-34395
IN-20220609-57564 IN-20220609-55181
IN-20220609-57759 IN-20220609-55182
IN-20220609-56231 IN-20220609-55183
IN-20220609-56238 IN-20220609-55184
IN-20220609-56241 IN-20220609-55185
IN-20220609-56249 IN-20220609-55186
IN-20220609-57580 IN-20220609-55187
IN-20220609-57795 IN-20220609-55188
IN-20220609-57797 IN-20220609-55189
IN-20220609-57836 IN-20220609-47303
IN-20220609-3878 IN-20220609-47307
IN-20220609-4145 IN-20220609-47309
IN-20220609-12832 IN-20220609-47843
IN-20220609-13155 IN-20220609-47405
IN-20220609-15037 IN-20220609-47395
IN-20220609-44998 IN-20220609-47401
IN-20220609-45118 IN-20220609-47900
IN-20220609-50725 IN-20220609-47901
IN-20220609-52790 IN-20220609-47931
IN-20220609-56250 IN-20220609-47934
IN-20220609-56255 IN-20220609-47935
IN-20220609-56258 IN-20220609-47973
IN-20220609-56259 IN-20220609-59369
IN-20220609-56260 IN-20220609-59417
IN-20220609-57581 IN-20220609-46524
IN-20220609-57582 IN-20220609-46525
IN-20220609-57583 IN-20220609-46526
IN-20220609-57598 IN-20220609-46527
IN-20220609-57599 IN-20220609-60116
IN-20220609-56629 IN-20220609-60117
IN-20220609-57597 IN-20220609-60118
IN-20220609-56320 IN-20220609-60119
IN-20220609-56325 IN-20220609-60120
IN-20220609-56334 IN-20220609-60121
IN-20220609-56342 IN-20220609-60178
IN-20220609-56343 IN-20220609-60179
IN-20220609-56345 IN-20220609-60180
IN-20220609-57607 IN-20220609-60181
IN-20220609-57608 IN-20220609-60182
IN-20220609-57654 IN-20220609-60183
IN-20220609-57656 IN-20220609-60184
IN-20220609-57755 IN-20220609-60185
IN-20220609-57756 IN-20220609-60186
IN-20220609-56349 IN-20220609-60187
IN-20220609-56352 IN-20220609-60188
IN-20220609-56353 IN-20220609-60189
IN-20220609-56356 IN-20220609-60190
IN-20220609-56359 IN-20220609-60191
IN-20220609-57609 IN-20220609-47364
IN-20220609-57611 IN-20220824-0072
IN-20220609-57612 IN-20220609-47296
IN-20220609-57613 IN-20220824-0071
IN-20220609-57764 IN-20220824-0070
IN-20220609-56351 IN-20220609-47324
IN-20220609-56371 IN-20220609-47326
IN-20220609-56373 IN-20220609-47519
IN-20220609-56376 IN-20220609-47564
IN-20220609-57610 IN-20220609-47568
IN-20220609-57625 IN-20220609-47919
IN-20220609-57670 IN-20220824-0069
IN-20220609-57671 IN-20220824-0068
IN-20220609-56357 IN-20220609-47431
IN-20220609-56363 IN-20220608-35730
IN-20220609-57614 IN-20220608-35738
IN-20220609-57623 IN-20220608-35739
IN-20220609-56754 IN-20220608-35740
IN-20220609-56755 IN-20220609-47502
IN-20220609-56756 IN-20220609-47245
IN-20220609-57617 IN-20220609-47250
IN-20220609-57832 IN-20220609-47251
IN-20220609-57833 IN-20220609-47252
IN-20220609-15114 IN-20220609-47254
IN-20220609-15116 IN-20220609-47560
IN-20220609-45130 IN-20220609-47494
IN-20220609-45131 IN-20220609-47914
IN-20220609-56277 IN-20220609-47575
IN-20220609-56282 IN-20220609-47524
IN-20220609-56285 IN-20220609-47526
IN-20220609-56288 IN-20220824-0067
IN-20220609-56291 IN-20220609-47499
IN-20220609-56294 IN-20220824-0048
IN-20220609-57620 IN-20220824-0052
IN-20220609-57761 IN-20220824-0053
IN-20220609-57810 IN-20220609-34254
IN-20220609-57811 IN-20220609-36382
IN-20220609-57824 IN-20220609-36383
IN-20220609-57854 IN-20220609-36384
IN-20220609-13242 IN-20220609-36385
IN-20220609-45063 IN-20220609-36168
IN-20220609-56312 IN-20220609-36169
IN-20220609-57621 IN-20220609-36170
IN-20220609-56365 IN-20220609-36171
IN-20220609-56366 IN-20220609-47489
IN-20220609-56367 IN-20220609-47491
IN-20220609-56369 IN-20220824-0066
IN-20220609-56370 IN-20220609-47944
IN-20220609-57624 IN-20220609-47947
IN-20220609-57668 IN-20220609-28522
IN-20220609-57669 IN-20220609-28620
IN-20220609-57678 IN-20220609-28621
IN-20220609-57679 IN-20220609-47573
IN-20220609-56757 IN-20220609-47352
IN-20220609-56759 IN-20220609-47920
IN-20220609-56760 IN-20220609-47921
IN-20220609-56761 IN-20220609-47922
IN-20220609-56762 IN-20220609-60207
IN-20220609-56764 IN-20220609-60208
IN-20220609-56766 IN-20220609-47448
IN-20220609-56768 IN-20220609-35583
IN-20220609-56770 IN-20220609-35584
IN-20220609-57619 IN-20220609-35585
IN-20220609-57629 IN-20220609-35586
IN-20220609-57631 IN-20220609-35587
IN-20220609-57633 IN-20220609-46377
IN-20220609-57834 IN-20220609-46378
IN-20220609-57857 IN-20220609-46379
IN-20220609-57858 IN-20220609-47320
IN-20220609-57859 IN-20220609-47322
IN-20220609-57861 IN-20220609-47355
IN-20220609-56302 IN-20220609-45909
IN-20220609-56305 IN-20220609-47420
IN-20220609-56306 IN-20220609-47376
IN-20220609-56308 IN-20220609-47450
IN-20220609-56309 IN-20220824-0046
IN-20220609-57639 IN-20220824-0065
IN-20220609-57640 IN-20220609-47527
IN-20220609-57871 IN-20220608-37598
IN-20220609-57872 IN-20220608-37599
IN-20220609-57873 IN-20220824-0064
IN-20220609-56727 IN-20220609-45239
IN-20220609-56732 IN-20220609-45241
IN-20220609-56736 IN-20220609-34153
IN-20220609-56737 IN-20220609-59384
IN-20220609-56739 IN-20220609-59724
IN-20220609-56740 IN-20220824-0062
IN-20220609-57644 IN-20220824-0063
IN-20220609-57647 IN-20220609-47533
IN-20220609-57648 IN-20220609-47534
IN-20220609-57782 IN-20220609-47282
IN-20220609-57801 IN-20220609-47479
IN-20220609-57802 IN-20220824-0042
IN-20220609-28346 IN-20220824-0043
IN-20220609-28347 IN-20220824-0044
IN-20220609-28348 IN-20220824-0045
IN-20220609-28349 IN-20220824-0041
IN-20220609-28350 IN-20220609-59380
IN-20220609-28351 IN-20220609-59482
IN-20220609-28352 IN-20220609-47276
IN-20220609-28353 IN-20220609-47294
IN-20220609-28354 IN-20220609-47299
IN-20220609-28355 IN-20220609-60201
IN-20220609-28356 IN-20220609-60202
IN-20220609-28357 IN-20220824-0040
IN-20220609-28358 IN-20220824-0035
IN-20220609-56702 IN-20220824-0037
IN-20220609-56706 IN-20220824-0038
IN-20220609-56707 IN-20220609-34409
IN-20220609-56714 IN-20220609-60014
IN-20220609-56718 IN-20220609-47334
IN-20220609-56724 IN-20220609-47339
IN-20220609-56729 IN-20220609-47345
IN-20220609-56733 IN-20220609-47529
IN-20220609-56738 IN-20220609-47536
IN-20220609-57645 IN-20220609-47550
IN-20220609-57649 IN-20220609-59373
IN-20220609-57776 IN-20220609-59455
IN-20220609-57778 IN-20220609-47486
IN-20220609-57783 IN-20220609-47492
IN-20220609-57862 IN-20220609-47229
IN-20220609-57863 IN-20220609-47223
IN-20220609-57864 IN-20220609-47329
IN-20220609-57869 IN-20220609-47330
IN-20220609-60141 IN-20220609-47569
IN-20220609-56722 IN-20220609-47577
IN-20220609-56725 IN-20220609-47518
IN-20220609-56735 IN-20220609-47463
IN-20220609-57646 IN-20220608-37867
IN-20220609-57779 IN-20220609-60368
IN-20220609-57781 IN-20220609-60369
IN-20220609-56348 IN-20220609-60370
IN-20220609-57657 IN-20220609-47409
IN-20220609-56675 IN-20220609-47297
IN-20220609-56682 IN-20220609-47348
IN-20220609-56685 IN-20220609-47356
IN-20220609-56688 IN-20220609-60193
IN-20220609-56693 IN-20220609-60194
IN-20220609-57661 IN-20220608-42612
IN-20220609-57663 IN-20220609-47547
IN-20220609-57665 IN-20220609-47553
IN-20220609-57667 IN-20220609-48021
IN-20220609-57738 IN-20220609-48022
IN-20220609-56687 IN-20220609-48024
IN-20220609-56691 IN-20220609-48025
IN-20220609-57664 IN-20220609-48026
IN-20220609-57746 IN-20220609-48028
IN-20220609-11385 IN-20220609-47375
IN-20220609-14789 IN-20220609-60673
IN-20220609-56690 IN-20220608-40096
IN-20220609-57666 IN-20220609-32896
IN-20220609-56624 IN-20220609-33045
IN-20220609-56625 IN-20220609-60112
IN-20220609-56626 IN-20220609-60113
IN-20220609-57673 IN-20220609-60114
IN-20220609-57729 IN-20220609-60115
IN-20220609-57730 IN-20220609-47217
IN-20220609-56634 IN-20220609-47314
IN-20220609-56635 IN-20220609-47887
IN-20220609-56636 IN-20220609-47888
IN-20220609-56637 IN-20220609-47566
IN-20220609-56638 IN-20220609-47365
IN-20220609-57674 IN-20220609-47367
IN-20220609-57789 IN-20220609-47424
IN-20220609-57790 \ No newline at end of file
IN-20220609-57791
IN-20220609-57792
IN-20220609-56639
IN-20220609-56642
IN-20220609-56643
IN-20220609-56644
IN-20220609-56645
IN-20220609-57615
IN-20220609-57675
IN-20220609-57677
IN-20220609-57793
IN-20220609-57794
IN-20220609-56640
IN-20220609-57676
IN-20220609-13243
IN-20220609-45064
IN-20220609-50747
IN-20220609-52784
IN-20220609-56378
IN-20220609-56379
IN-20220609-56380
IN-20220609-56381
IN-20220609-57680
IN-20220609-57681
IN-20220609-57682
IN-20220609-57683
IN-20220609-13244
IN-20220609-45065
IN-20220609-56594
IN-20220609-56596
IN-20220609-56610
IN-20220609-56611
IN-20220609-56614
IN-20220609-57684
IN-20220609-57702
IN-20220609-57703
IN-20220609-57722
IN-20220609-57724
IN-20220609-56646
IN-20220609-56648
IN-20220609-56649
IN-20220609-56650
IN-20220609-57687
IN-20220609-57688
IN-20220609-57689
IN-20220609-57798
IN-20220609-56651
IN-20220609-56652
IN-20220609-56653
IN-20220609-57690
IN-20220609-57691
IN-20220609-57692
IN-20220609-56654
IN-20220609-56657
IN-20220609-56659
IN-20220609-57693
IN-20220609-57734
IN-20220609-57799
IN-20220609-12747
IN-20220609-12748
IN-20220609-56662
IN-20220609-57698
IN-20220609-57901
IN-20220609-57906
IN-20220609-57909
IN-20220609-13285
IN-20220609-13286
IN-20220609-13287
IN-20220609-15046
IN-20220609-45124
IN-20220609-57918
IN-20220609-57923
IN-20220609-57924
IN-20220609-57925
IN-20220609-57926
IN-20220609-57928
IN-20220609-57929
IN-20220609-57930
IN-20220609-57931
IN-20220609-57943
IN-20220609-57945
IN-20220609-57946
IN-20220609-57948
IN-20220609-57949
IN-20220609-57950
IN-20220609-56600
IN-20220609-56605
IN-20220609-56606
IN-20220609-56608
IN-20220609-57699
IN-20220609-57700
IN-20220609-57701
IN-20220609-57723
IN-20220609-3883
IN-20220609-4150
IN-20220609-15016
IN-20220609-45110
IN-20220609-57028
IN-20220609-57031
IN-20220609-57033
IN-20220609-57034
IN-20220609-57036
IN-20220609-57707
IN-20220609-57715
IN-20220609-57716
IN-20220609-57717
IN-20220609-57718
IN-20220609-56664
IN-20220609-56665
IN-20220609-56666
IN-20220609-56667
IN-20220609-57709
IN-20220609-57710
IN-20220609-57711
IN-20220609-57712
IN-20220609-57037
IN-20220609-57719
IN-20220609-57039
IN-20220609-57041
IN-20220609-57720
IN-20220609-57721
IN-20220609-56186
IN-20220609-56198
IN-20220609-57725
IN-20220609-58529
IN-20220609-58532
IN-20220609-58534
IN-20220609-58537
IN-20220609-56627
IN-20220609-57731
IN-20220609-56628
IN-20220609-57733
IN-20220609-56676
IN-20220609-57745
IN-20220609-56699
IN-20220609-57749
IN-20220609-3799
IN-20220609-4066
IN-20220609-15061
IN-20220609-45125
IN-20220609-57958
IN-20220609-57959
IN-20220609-57960
IN-20220609-57961
IN-20220609-57962
IN-20220609-57963
IN-20220609-57964
IN-20220609-57965
IN-20220609-56347
IN-20220609-57762
IN-20220609-13301
IN-20220609-56709
IN-20220609-56716
IN-20220609-57767
IN-20220609-57777
IN-20220609-56712
IN-20220609-57768
IN-20220609-14729
IN-20220609-56713
IN-20220609-57769
IN-20220609-15042
IN-20220609-45121
IN-20220609-56175
IN-20220609-56184
IN-20220609-57545
IN-20220609-57770
IN-20220609-56723
IN-20220609-57780
IN-20220609-58000
IN-20220609-58001
IN-20220609-58002
IN-20220609-58003
IN-20220609-58004
IN-20220609-58005
IN-20220609-58006
IN-20220609-58007
IN-20220609-58008
IN-20220609-58009
IN-20220609-58010
IN-20220609-58011
IN-20220609-58012
IN-20220609-58013
IN-20220609-12936
IN-20220609-45006
IN-20220609-58014
IN-20220609-58015
IN-20220609-58016
IN-20220609-58017
IN-20220609-58018
IN-20220609-58019
IN-20220609-56630
IN-20220609-56631
IN-20220609-56632
IN-20220609-56633
IN-20220609-57785
IN-20220609-57786
IN-20220609-57787
IN-20220609-57788
IN-20220609-58021
IN-20220609-58022
IN-20220609-58023
IN-20220609-58024
IN-20220609-56741
IN-20220609-57803
IN-20220609-56745
IN-20220609-57804
IN-20220609-13052
IN-20220609-28361
IN-20220609-28362
IN-20220609-28363
IN-20220609-28364
IN-20220609-28365
IN-20220609-45018
IN-20220609-52544
IN-20220609-52902
IN-20220609-56743
IN-20220609-56746
IN-20220609-56747
IN-20220609-56748
IN-20220609-56749
IN-20220609-56750
IN-20220609-56751
IN-20220609-56752
IN-20220609-56753
IN-20220609-57616
IN-20220609-57805
IN-20220609-57825
IN-20220609-57826
IN-20220609-57827
IN-20220609-57828
IN-20220609-57829
IN-20220609-57830
IN-20220609-57831
IN-20220609-13054
IN-20220609-56796
IN-20220609-56797
IN-20220609-56799
IN-20220609-56800
IN-20220609-56802
IN-20220609-57823
IN-20220609-57842
IN-20220609-57843
IN-20220609-57844
IN-20220609-57845
IN-20220609-56671
IN-20220609-57840
IN-20220609-12716
IN-20220609-56298
IN-20220609-57856
IN-20220609-56710
IN-20220609-57865
IN-20220609-56720
IN-20220609-57868
IN-20220609-12769
IN-20220609-58140
IN-20220609-15036
IN-20220609-45117
IN-20220609-58141
IN-20220609-13057
IN-20220609-45022
IN-20220609-52542
IN-20220609-52901
IN-20220609-58163
IN-20220609-58165
IN-20220609-58166
IN-20220609-58167
IN-20220609-58168
IN-20220609-58172
IN-20220609-58213
IN-20220609-58215
IN-20220609-58217
IN-20220609-58222
IN-20220609-58232
IN-20220609-58238
IN-20220609-58240
IN-20220609-58242
IN-20220609-58243
IN-20220609-58248
IN-20220609-58249
IN-20220609-11974
IN-20220609-11977
IN-20220609-11981
IN-20220609-11983
IN-20220609-11985
IN-20220609-11986
IN-20220609-11987
IN-20220609-44996
IN-20220609-58320
IN-20220609-58321
IN-20220609-58322
IN-20220609-58301
IN-20220609-58306
IN-20220609-58309
IN-20220609-58311
IN-20220609-58313
IN-20220609-58314
IN-20220609-58315
IN-20220609-13058
IN-20220609-45023
IN-20220609-58251
IN-20220609-58255
IN-20220609-58257
IN-20220609-58258
IN-20220609-58262
IN-20220609-58265
IN-20220609-58266
IN-20220609-58268
IN-20220609-58272
IN-20220609-58274
IN-20220609-58277
IN-20220609-58278
IN-20220609-58279
IN-20220609-58286
IN-20220609-58289
IN-20220609-58293
IN-20220609-58294
IN-20220609-58299
IN-20220609-58142
IN-20220609-15033
IN-20220609-45116
IN-20220609-50726
IN-20220609-52761
IN-20220609-58143
IN-20220609-58144
IN-20220609-12823
IN-20220609-58145
IN-20220609-58146
IN-20220609-58147
IN-20220609-58148
IN-20220609-13047
IN-20220609-45014
IN-20220609-57879
IN-20220609-57880
IN-20220609-57882
IN-20220609-57889
IN-20220609-57890
IN-20220609-57891
IN-20220609-57892
IN-20220609-57894
IN-20220609-57895
IN-20220609-57897
IN-20220609-57967
IN-20220609-12720
IN-20220609-57968
IN-20220609-57969
IN-20220609-57970
IN-20220609-57971
IN-20220609-57972
IN-20220609-57973
IN-20220609-57974
IN-20220609-57975
IN-20220609-57976
IN-20220609-57977
IN-20220609-58053
IN-20220609-58851
IN-20220609-13068
IN-20220609-13069
IN-20220609-45031
IN-20220609-45032
IN-20220609-58626
IN-20220609-58632
IN-20220609-58635
IN-20220609-58639
IN-20220609-58642
IN-20220609-58645
IN-20220609-58647
IN-20220609-58650
IN-20220609-13067
IN-20220609-45030
IN-20220609-58653
IN-20220609-58668
IN-20220609-58672
IN-20220609-58673
IN-20220609-15101
IN-20220609-45129
IN-20220609-58618
IN-20220609-58619
IN-20220609-58623
IN-20220609-13063
IN-20220609-45027
IN-20220609-58538
IN-20220609-58539
IN-20220609-58540
IN-20220609-58541
IN-20220609-58543
IN-20220609-58544
IN-20220609-58545
IN-20220609-58546
IN-20220609-13064
IN-20220609-45028
IN-20220609-58551
IN-20220609-58566
IN-20220609-58570
IN-20220609-58572
IN-20220609-58582
IN-20220609-58586
IN-20220609-58590
IN-20220609-58511
IN-20220609-58513
IN-20220609-58514
IN-20220609-58515
IN-20220609-58516
IN-20220609-58518
IN-20220609-58519
IN-20220609-58520
IN-20220609-58523
IN-20220609-58525
IN-20220609-58527
IN-20220609-58711
IN-20220609-58714
IN-20220609-58716
IN-20220609-58718
IN-20220609-58720
IN-20220609-58723
IN-20220609-13071
IN-20220609-45034
IN-20220609-52540
IN-20220609-52905
IN-20220609-58730
IN-20220609-58738
IN-20220609-58739
IN-20220609-58741
IN-20220609-58743
IN-20220609-58744
IN-20220609-58745
IN-20220609-58748
IN-20220609-58749
IN-20220609-58752
IN-20220609-58754
IN-20220609-58757
IN-20220609-3836
IN-20220609-4103
IN-20220609-13048
IN-20220609-15120
IN-20220609-57875
IN-20220609-57877
IN-20220609-58149
IN-20220609-58150
IN-20220609-58151
IN-20220609-58171
IN-20220609-58173
IN-20220609-58174
IN-20220609-13061
IN-20220609-45025
IN-20220609-52541
IN-20220609-52906
IN-20220609-58510
IN-20220609-58512
IN-20220609-58517
IN-20220609-58522
IN-20220609-58524
IN-20220609-58536
IN-20220609-58929
IN-20220609-58932
IN-20220609-58934
IN-20220609-58936
IN-20220609-58942
IN-20220609-58948
IN-20220609-58950
IN-20220609-12712
IN-20220609-12713
IN-20220609-12714
IN-20220609-58782
IN-20220609-58784
IN-20220609-58927
IN-20220609-58928
IN-20220609-58780
IN-20220609-14984
IN-20220609-45099
IN-20220609-58790
IN-20220609-58792
IN-20220609-58793
IN-20220609-58799
IN-20220609-58806
IN-20220609-58825
IN-20220609-58827
IN-20220609-58834
IN-20220609-13066
IN-20220609-58598
IN-20220609-13070
IN-20220609-45033
IN-20220609-58726
IN-20220609-58727
IN-20220609-58728
IN-20220609-58729
IN-20220609-58607
IN-20220609-58610
IN-20220609-58613
IN-20220609-58615
IN-20220609-3816
IN-20220609-4083
IN-20220609-58478
IN-20220609-58479
IN-20220609-58480
IN-20220609-58481
IN-20220609-58482
IN-20220609-58483
IN-20220609-58485
IN-20220609-58487
IN-20220609-58490
IN-20220609-12703
IN-20220609-58839
IN-20220609-58843
IN-20220609-58845
IN-20220609-58847
IN-20220609-58850
IN-20220609-58853
IN-20220609-58492
IN-20220609-13055
IN-20220609-45020
IN-20220609-52543
IN-20220609-52907
IN-20220609-58156
IN-20220609-58157
IN-20220609-13056
IN-20220609-45021
IN-20220609-58158
IN-20220609-58493
IN-20220609-3819
IN-20220609-4086
IN-20220609-58159
IN-20220609-58160
IN-20220609-58161
IN-20220609-58162
IN-20220609-57966
IN-20220609-59206
IN-20220609-59207
IN-20220609-59208
IN-20220609-59210
IN-20220609-59211
IN-20220609-59212
IN-20220609-59214
IN-20220609-59215
IN-20220609-59216
IN-20220609-59217
IN-20220609-59282
IN-20220609-59283
IN-20220609-59285
IN-20220609-58994
IN-20220609-58995
IN-20220609-58996
IN-20220609-58997
IN-20220609-58998
IN-20220609-14971
IN-20220609-14973
IN-20220609-14975
IN-20220609-45091
IN-20220609-45092
IN-20220609-45093
IN-20220609-58999
IN-20220609-59000
IN-20220609-59001
IN-20220609-59002
IN-20220609-59003
IN-20220609-59004
IN-20220609-59005
IN-20220609-14970
IN-20220609-59009
IN-20220609-59010
IN-20220609-59011
IN-20220609-59012
IN-20220609-59013
IN-20220609-59014
IN-20220609-59017
IN-20220609-13185
IN-20220609-13186
IN-20220609-13187
IN-20220609-59088
IN-20220609-59024
IN-20220609-59034
IN-20220609-59045
IN-20220609-58980
IN-20220609-58981
IN-20220609-58982
IN-20220609-58983
IN-20220609-58984
IN-20220609-58985
IN-20220609-58986
IN-20220609-58987
IN-20220609-58988
IN-20220609-58989
IN-20220609-58990
IN-20220609-58991
IN-20220609-58992
IN-20220609-58993
IN-20220609-59223
IN-20220609-59225
IN-20220609-59227
IN-20220609-59228
IN-20220609-59229
IN-20220609-59230
IN-20220609-59231
IN-20220609-59232
IN-20220609-3871
IN-20220609-4138
IN-20220609-4614
IN-20220609-4615
IN-20220609-4616
IN-20220609-4617
IN-20220609-4618
IN-20220609-4619
IN-20220609-4620
IN-20220609-4621
IN-20220609-5702
IN-20220609-5703
IN-20220609-15212
IN-20220609-29518
IN-20220609-29519
IN-20220609-31463
IN-20220609-31464
IN-20220609-33404
IN-20220609-33405
IN-20220609-44955
IN-20220609-44956
IN-20220609-44957
IN-20220609-44958
IN-20220609-44959
IN-20220609-44960
IN-20220609-44961
IN-20220609-44962
IN-20220609-45137
IN-20220609-58964
IN-20220609-58966
IN-20220609-58968
IN-20220609-58970
IN-20220609-58971
IN-20220609-58972
IN-20220609-58973
IN-20220609-58974
IN-20220609-58976
IN-20220609-59293
IN-20220609-59020
IN-20220609-59023
IN-20220609-59026
IN-20220609-59027
IN-20220609-59028
IN-20220609-59029
IN-20220609-59030
IN-20220609-59280
IN-20220609-59281
IN-20220609-59292
IN-20220609-59294
IN-20220609-59295
IN-20220609-59296
IN-20220609-59297
IN-20220609-59274
IN-20220609-59244
IN-20220609-59246
IN-20220609-59248
IN-20220609-59250
IN-20220609-59253
IN-20220609-59254
IN-20220609-59256
IN-20220609-59257
IN-20220609-59258
IN-20220609-59260
IN-20220609-59261
IN-20220609-59262
IN-20220609-59265
IN-20220609-59031
IN-20220609-59032
IN-20220609-59033
IN-20220609-59035
IN-20220609-59036
IN-20220609-59037
IN-20220609-12904
IN-20220609-59038
IN-20220609-59039
IN-20220609-59040
IN-20220609-59041
IN-20220609-59042
IN-20220609-59043
IN-20220609-59044
IN-20220609-4493
IN-20220609-4494
IN-20220609-4495
IN-20220609-13240
IN-20220609-28366
IN-20220609-28367
IN-20220609-28368
IN-20220609-28369
IN-20220609-28370
IN-20220609-44921
IN-20220609-44922
IN-20220609-45061
IN-20220609-58977
IN-20220609-59298
IN-20220609-59299
IN-20220609-59300
IN-20220609-59301
IN-20220609-59302
IN-20220609-14990
IN-20220609-45102
IN-20220609-50735
IN-20220609-52765
IN-20220609-58951
IN-20220609-58960
IN-20220609-58961
IN-20220609-58962
IN-20220609-59006
IN-20220609-59007
IN-20220609-59008
IN-20220609-4496
IN-20220609-4497
IN-20220609-4498
IN-20220609-4499
IN-20220609-13127
IN-20220609-13128
IN-20220609-13129
IN-20220609-13130
IN-20220609-44923
IN-20220609-44924
IN-20220609-44925
IN-20220609-50869
IN-20220609-52652
IN-20220609-52791
IN-20220609-52851
IN-20220609-53172
IN-20220609-53215
IN-20220609-59057
IN-20220609-59059
IN-20220609-59060
IN-20220609-59061
IN-20220609-59062
IN-20220609-59063
IN-20220609-59289
IN-20220609-59290
IN-20220609-59233
IN-20220609-59234
IN-20220609-59235
IN-20220609-59236
IN-20220609-59237
IN-20220609-59238
IN-20220609-59239
IN-20220609-59291
IN-20220609-59279
\ No newline at end of file
...@@ -58,9 +58,9 @@ kafka.consumer.task=0 0/2 * * * ? ...@@ -58,9 +58,9 @@ kafka.consumer.task=0 0/2 * * * ?
kafka.producer.servers=114.115.159.144:9092 kafka.producer.servers=114.115.159.144:9092
#kafka.producer.servers=39.101.72.117:19092 #kafka.producer.servers=39.101.72.117:19092
kafka.producer.retries=0 kafka.producer.retries=0
kafka.producer.batch.size=4096 kafka.producer.batch.size=409600
kafka.producer.linger=1 kafka.producer.linger=1
kafka.producer.buffer.memory=40960 kafka.producer.buffer.memory=409600
spring.activemq.broker-url= tcp://127.0.0.1:61616 spring.activemq.broker-url= tcp://127.0.0.1:61616
......
...@@ -25,7 +25,6 @@ SUBJECT_MEMCACHED_DAYS=0 ...@@ -25,7 +25,6 @@ SUBJECT_MEMCACHED_DAYS=0
JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
JWYQJC_MEMCACHED_DAYS=10 JWYQJC_MEMCACHED_DAYS=10
TITLE_SIMILARITY_RATE=0.8 TITLE_SIMILARITY_RATE=0.8
MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
CACHE_UPDATE=1 CACHE_UPDATE=1
...@@ -36,7 +35,7 @@ PROXYID=1 ...@@ -36,7 +35,7 @@ PROXYID=1
THREAD_SIZE=1 THREAD_SIZE=1
# #
CHROMEDRIVE= E:\\chrome\\chromedriver.exe CHROMEDRIVE= E:\\chrome\\chromedriver.exe
CHROMEBIN= C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe CHROMEBIN= C:\Users\WIN10\AppData\Local\Google\Chrome\Application\chrome.exe
USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default USER_DATA_DIR= C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default
#mysql connection #mysql connection
...@@ -52,7 +51,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -52,7 +51,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#KAFKA_CONSUMER_TOPIC = staticCrawlTopic #KAFKA_CONSUMER_TOPIC = staticCrawlTopic
KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin KAFKA_CONSUMER_TOPIC =clb-infosource-handler-dynamin
# #
KAFKA_CONSUMER_GROUP_ID=test-zs1 KAFKA_CONSUMER_GROUP_ID=test1
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest #KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
KAFKA_PRODUCT_TOPIC=crawlerInfo KAFKA_PRODUCT_TOPIC=crawlerInfo
...@@ -72,15 +71,15 @@ KAFKA_PRODUCT_PARTITION=0 ...@@ -72,15 +71,15 @@ KAFKA_PRODUCT_PARTITION=0
#redis.host=114.116.26.150 #redis.host=114.116.26.150
#redis.port=6379 #redis.port=6379
#redis.pass=zzsn9988 #redis.pass=zzsn9988
#redis.host=114.115.236.206 redis.host=114.115.236.206
#redis.port=6379 redis.port=6379
#redis.pass=clbzzsn redis.pass=clbzzsn
#redis.host=8.130.30.33 #redis.host=8.130.30.33
#redis.port=9010 #redis.port=9010
#redis.pass=wxadS&jklim #redis.pass=wxadS&jklim
redis.host=127.0.0.1 #redis.host=127.0.0.1
redis.port=6379 #redis.port=6379
redis.pass=xxxxxx #redis.pass=xxxxxx
redis.timeout=10000 redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
redis.maxTotal=600 redis.maxTotal=600
...@@ -100,6 +99,8 @@ IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\ ...@@ -100,6 +99,8 @@ IMGPATH= E:\\ideaWorkerspace\\meta_crawler\\comm_crawler\\src\\main\\resources\\
selenium.driver.cache=selenium_driver_cache_loc112 selenium.driver.cache=selenium_driver_cache_loc112
#采集缓存的rediskey
MODEL_SCORE_URL=dy-1
......
# Redis settings # Redis settings
#redis.host=114.115.236.206 redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
redis.host=127.0.0.1
redis.port=6379 redis.port=6379
redis.pass=xxxxxx redis.pass=clbzzsn
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
redis.timeout=10000 redis.timeout=10000
redis.maxIdle=300 redis.maxIdle=300
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论