提交 14bada64 作者: liuweigang

爬虫修改提交2

上级 94d7a333
...@@ -4,12 +4,14 @@ ...@@ -4,12 +4,14 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.com.com.zzsn</groupId> <groupId>com.zzsn</groupId>
<artifactId>crawler_2022</artifactId> <artifactId>crawler_2022</artifactId>
<version>1.0-SNAPSHOT</version> <version>1.0-SNAPSHOT</version>
<modules> <modules>
<module>weixinCrawler</module> <module>weixinCrawler</module>
<module>weixinSearch</module> <module>weixinSearch</module>
<module>googleSearch</module>
<module>yahooSearch</module>
</modules> </modules>
<packaging>pom</packaging> <packaging>pom</packaging>
......
...@@ -228,7 +228,7 @@ ...@@ -228,7 +228,7 @@
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId> <artifactId>spring-boot-maven-plugin</artifactId>
<configuration> <configuration>
<mainClass>com.com.zzsn.WeixinCrawlerApplication</mainClass> <mainClass>com.zzsn.WeixinCrawlerApplication</mainClass>
<includeSystemScope>true</includeSystemScope><!--外部进行打包--> <includeSystemScope>true</includeSystemScope><!--外部进行打包-->
<excludes> <excludes>
<exclude> <exclude>
......
...@@ -18,3 +18,4 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer { ...@@ -18,3 +18,4 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer {
} }
} }
package com.zzsn.job; package com.zzsn.job;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.crawler.WeixinDetailThread; import com.zzsn.crawler.WeixinDetailThread;
import com.zzsn.crawler.WeixinSiteThread; import com.zzsn.crawler.WeixinSiteThread;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
...@@ -12,6 +14,7 @@ import org.apache.kafka.clients.consumer.ConsumerConfig; ...@@ -12,6 +14,7 @@ import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka; import org.springframework.kafka.annotation.EnableKafka;
...@@ -20,8 +23,8 @@ import org.springframework.scheduling.annotation.EnableScheduling; ...@@ -20,8 +23,8 @@ import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled; import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.util.Arrays; import java.text.SimpleDateFormat;
import java.util.Properties; import java.util.*;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
...@@ -61,17 +64,44 @@ public class KafkaConsumerJob { ...@@ -61,17 +64,44 @@ public class KafkaConsumerJob {
log.info("定时获取mq消息"); log.info("定时获取mq消息");
//1.创建消费者 //1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer(); KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC)); // consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
// // 消费某个主题的某个分区数据
ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
}
consumer.assign(topicPartitions);
try{ try{
while(true){ while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环 //消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0); ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync(); // consumer.commitSync();
for(ConsumerRecord record : records){ for(ConsumerRecord record : records){
SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class);
String siteName = siteMsgTemple.getSiteName();
String url="http://114.115.130.239:8005/sougou_wechart/?query="+siteName;
//这个属于发起请求并获取json数据(得引入hutool,即上面引入的第二个依赖)
String json= HttpUtil.createGet(url).execute().body();
//转化请求的 json 数据
JSONObject jsonObject = JSONObject.parseObject(json);
//获取 error 返回状态码
String latestNewsTime = jsonObject.getString("latestNewsTime");
if(!latestNewsTime.contains("None")){
SimpleDateFormat simpleFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
//将时间调整为yyyy-MM-dd HH:mm:ss时间样式
String oldTime =latestNewsTime;
String nowTime = simpleFormat.format(new Date());
int hours = SouGouSearch.getHours(oldTime, nowTime);
if(hours>2){
continue;
}
}
WeixinSiteThread siteThread=new WeixinSiteThread(); WeixinSiteThread siteThread=new WeixinSiteThread();
siteThread.siteMsgTemple=siteMsgTemple; siteThread.siteMsgTemple=siteMsgTemple;
//创建使用固定线程数的线程池 //创建使用固定线程数的线程池
......
...@@ -51,7 +51,7 @@ public class Constants { ...@@ -51,7 +51,7 @@ public class Constants {
public static final String KAFKA_WXDETAILURL_TOPIC =prop.getProperty("KAFKA_WXDETAILURL_TOPIC"); public static final String KAFKA_WXDETAILURL_TOPIC =prop.getProperty("KAFKA_WXDETAILURL_TOPIC");
public static String path=prop.getProperty("path"); public static String path=prop.getProperty("path");
public static final String KAFKA_CONSUMER_PARTITION= prop.getProperty("KAFKA_CONSUMER_PARTITION");
//输出目录,仅此处需要修改 //输出目录,仅此处需要修改
//public static final String DEST_DIR = prop.getProperty("DEST_DIR"); //public static final String DEST_DIR = prop.getProperty("DEST_DIR");
//正文目录 //正文目录
......
...@@ -55,6 +55,10 @@ KAFKA_PRODUCT_TOPIC=wxcrawlerInfo1 ...@@ -55,6 +55,10 @@ KAFKA_PRODUCT_TOPIC=wxcrawlerInfo1
#抓取资讯统计 #抓取资讯统计
KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
#指定分区使用逗号分割
#KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7
#微信账号名称 #微信账号名称
#WXSENDNAME= LiuWeiGang #WXSENDNAME= LiuWeiGang
WXSENDNAME= lwg WXSENDNAME= lwg
......
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<template><content><exp>*.div[id="js_content"]</exp></content><title><exp>*.h1[class="rich_media_title"]|h2[class="rich_media_title"]</exp></title><author><exp>*.a[id="js_name"]</exp></author><publish_date><exp>*.em[id="post-date"]</exp></publish_date></template> <template>
<content>
<exp>*.div[id="js_content"]</exp>
</content>
<title>
<exp>*.h1[class="rich_media_title"]|h2[class="rich_media_title"]</exp>
</title>
<author>
<exp>*.a[id="js_name"]</exp>
</author>
<publish_date>
<exp>*.em[id="post-date"]</exp>
</publish_date>
</template>
英特尔+重大战略
英特尔+科技资源
英特尔+科技攻关
英特尔+科技创新 英特尔+科技创新
英特尔+技术创新 英特尔+技术创新
英特尔+国家战略 英特尔+国家战略
......
...@@ -97,12 +97,28 @@ ...@@ -97,12 +97,28 @@
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-support</artifactId> <artifactId>selenium-support</artifactId>
<version>3.141.59</version> <version>4.0.0-rc-2</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-api</artifactId>
<version>4.0.0-rc-2</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>4.0.0-rc-2</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-remote-driver</artifactId>
<version>4.0.0-rc-2</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>htmlunit-driver</artifactId> <artifactId>htmlunit-driver</artifactId>
<!--<version>3.60.0</version>-->
<version>2.33.3</version> <version>2.33.3</version>
</dependency> </dependency>
...@@ -110,7 +126,7 @@ ...@@ -110,7 +126,7 @@
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>4.1.2</version> <version>4.0.0-rc-2</version>
</dependency> </dependency>
<dependency> <dependency>
...@@ -141,7 +157,6 @@ ...@@ -141,7 +157,6 @@
<version>1.15</version> <version>1.15</version>
</dependency> </dependency>
<!--redis依赖--> <!--redis依赖-->
<dependency> <dependency>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
...@@ -235,11 +250,11 @@ ...@@ -235,11 +250,11 @@
<artifactId>lombok</artifactId> <artifactId>lombok</artifactId>
<optional>true</optional> <optional>true</optional>
</dependency> </dependency>
<dependency> <!--<dependency>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId> <artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>-->
<dependency> <dependency>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId> <artifactId>spring-boot-starter-web</artifactId>
......
...@@ -33,11 +33,11 @@ public class WeixinSearch { ...@@ -33,11 +33,11 @@ public class WeixinSearch {
int index = 0; int index = 0;
try { try {
for (List<String> keywordList : splitList) { for (List<String> keywordList : splitList) {
WeixinSearchThread baiduSearchThread = new WeixinSearchThread(); WeixinSearchThread souGouSearchThread = new WeixinSearchThread();
baiduSearchThread.setThreadId(index++); souGouSearchThread.setThreadId(index++);
baiduSearchThread.setKeywords(keywordList); souGouSearchThread.setKeywords(keywordList);
workers.add(baiduSearchThread); workers.add(souGouSearchThread);
threadPool.execute(baiduSearchThread); threadPool.execute(souGouSearchThread);
Thread.sleep(1000); Thread.sleep(1000);
} }
}catch (Exception e){ }catch (Exception e){
......
...@@ -6,6 +6,7 @@ import com.zzsn.crawler.db.SnowIdUtils; ...@@ -6,6 +6,7 @@ import com.zzsn.crawler.db.SnowIdUtils;
import com.zzsn.entity.*; import com.zzsn.entity.*;
import com.zzsn.search.ChromeUtil; import com.zzsn.search.ChromeUtil;
import com.zzsn.util.Constants; import com.zzsn.util.Constants;
import com.zzsn.util.SplitKeyword;
import com.zzsn.util.Utility; import com.zzsn.util.Utility;
import lombok.Data; import lombok.Data;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -34,6 +35,7 @@ import java.security.KeyStoreException; ...@@ -34,6 +35,7 @@ import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Types; import java.sql.Types;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -56,8 +58,14 @@ public class WeixinSearchThread implements Runnable { ...@@ -56,8 +58,14 @@ public class WeixinSearchThread implements Runnable {
// public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); // public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
public void crawler(){ public void crawler(){
// String keyWord = keywordMsg.getKeyWord(); String keyWord = keywordMsg.getKeyWord();
List<String> keyWords = keywords; List<String> keyWords = SplitKeyword.transForm(keyWord);
// List<String> keyWords = keywords;
// List<String> keyWords = new ArrayList<>();
// keyWords.add("超微半导体+重大战略");
// keyWords.add("高通公司+科技人才");
// keyWords.add("日本电信电话株式会社+科技资源");
// keyWords.add("爱立信+创新能力");
log.info("keyWords:" + keyWords); log.info("keyWords:" + keyWords);
for (String kWord :keyWords) { for (String kWord :keyWords) {
kWord=kWord.replaceAll("\\+", " ").replaceAll(",", " "); kWord=kWord.replaceAll("\\+", " ").replaceAll(",", " ");
......
...@@ -155,6 +155,7 @@ public class WebExtractorImplforweixin implements Extractor { ...@@ -155,6 +155,7 @@ public class WebExtractorImplforweixin implements Extractor {
for (Iterator<Processor> iterator = processors.iterator(); iterator.hasNext();) { for (Iterator<Processor> iterator = processors.iterator(); iterator.hasNext();) {
processor = iterator.next(); processor = iterator.next();
ename = processor.getExtType().getEname().toUpperCase(); ename = processor.getExtType().getEname().toUpperCase();
//标题 //标题
if (ename.equals(EXT_TYPE.TITLE.toString())&&entity.getTitle()==null) { if (ename.equals(EXT_TYPE.TITLE.toString())&&entity.getTitle()==null) {
result = processor.extract(html); result = processor.extract(html);
......
...@@ -50,7 +50,7 @@ public class KafkaConsumerJob { ...@@ -50,7 +50,7 @@ public class KafkaConsumerJob {
/** /**
* 从kafka中获取公众号信息进行发送获取列表内容提取链接 * 从kafka中获取公众号信息进行发送获取列表内容提取链接
*/ */
@Scheduled(cron = "0 0/1 * * * ?") @Scheduled(cron = "0 0/5 * * * ?")
@Async("asyncTaskExecutor") @Async("asyncTaskExecutor")
public void wxOfficialConsumer (){ public void wxOfficialConsumer (){
log.info(Constants.KAFKA_CONSUMER_TOPIC); log.info(Constants.KAFKA_CONSUMER_TOPIC);
...@@ -76,6 +76,8 @@ public class KafkaConsumerJob { ...@@ -76,6 +76,8 @@ public class KafkaConsumerJob {
TimeUnit.SECONDS.sleep(20); TimeUnit.SECONDS.sleep(20);
} }
} }
// WeixinSearchThread siteThread=new WeixinSearchThread();
// threadPool.execute(siteThread);
}catch (Exception e){ }catch (Exception e){
consumer = createConsumer(); consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC)); consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
......
...@@ -50,11 +50,11 @@ public class ChromeUtil { ...@@ -50,11 +50,11 @@ public class ChromeUtil {
public static String redis_name="weixin4"; public static String redis_name="weixin4";
static ChromeOptions options1=new ChromeOptions() ; static ChromeOptions options1=new ChromeOptions() ;
static WebDriver driver; static WebDriver driver;
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); public static KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
static{ static{
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);//chromedriver服务地址 System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);//chromedriver服务地址
System.setProperty("webdriver.chrome.bin", Constants.CHROMEBIN); System.setProperty("webdriver.chrome.bin", Constants.CHROMEBIN);
options1.setBinary(Constants.CHROMEBIN); // options1.setBinary(Constants.CHROMEBIN);
//options1.addArguments("--headless"); //options1.addArguments("--headless");
/* Map<String, Object> contentSettings = new HashMap<String, Object>(); /* Map<String, Object> contentSettings = new HashMap<String, Object>();
contentSettings.put("images", 2); contentSettings.put("images", 2);
...@@ -304,7 +304,7 @@ public class ChromeUtil { ...@@ -304,7 +304,7 @@ public class ChromeUtil {
//爬完一页就去发送给消息队列 //爬完一页就去发送给消息队列
entitys=new ArrayList<ExtEntity>(); entitys=new ArrayList<ExtEntity>();
driver.switchTo().window(curenthandle);//转到主页,否则在副页面执行点击会报错 driver.switchTo().window(curenthandle);//转到主页,否则在副页面执行点击会报错
Thread.sleep(5000); Thread.sleep(3000);
if(i==1) { if(i==1) {
// crawler(key,orgid+"",entitys); // crawler(key,orgid+"",entitys);
}else { }else {
...@@ -324,8 +324,8 @@ public class ChromeUtil { ...@@ -324,8 +324,8 @@ public class ChromeUtil {
//发送消息队列 //发送消息队列
for (int k = 0; k < entitys.size(); k++) { for (int k = 0; k < entitys.size(); k++) {
ChromeUtil chromeUtil=new ChromeUtil(); // ChromeUtil chromeUtil=new ChromeUtil();
chromeUtil.sendExtentity(entitys.get(k),key,orgid,tid); sendExtentity(entitys.get(k),key,orgid,tid);
} }
} }
...@@ -393,7 +393,7 @@ public class ChromeUtil { ...@@ -393,7 +393,7 @@ public class ChromeUtil {
// } // }
searchBoxs.get(i).findElement(By.tagName("a")).click();//点击标题,打开微信信息链接 searchBoxs.get(i).findElement(By.tagName("a")).click();//点击标题,打开微信信息链接
Thread.sleep(8000); Thread.sleep(4000);
Set<String> ss=driver.getWindowHandles();//获取六浏览器句柄的记录 Set<String> ss=driver.getWindowHandles();//获取六浏览器句柄的记录
System.out.println(ss.toString()); System.out.println(ss.toString());
...@@ -422,7 +422,8 @@ public class ChromeUtil { ...@@ -422,7 +422,8 @@ public class ChromeUtil {
System.out.println(entity.getTitle()); System.out.println(entity.getTitle());
entity.setUri(dirverother.getCurrentUrl()); entity.setUri(dirverother.getCurrentUrl());
entity.setTilteLike(titlelike); entity.setTilteLike(titlelike);
String formatImgContent=""; String formatImgContent=entity.getContentWithTag();
// String formatImgContent="";
//存储图片 //存储图片
// formatImgContent= ContentFileFinder.getContentImgTag(entity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw"); // formatImgContent= ContentFileFinder.getContentImgTag(entity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
//去掉图片 //去掉图片
...@@ -435,7 +436,7 @@ public class ChromeUtil { ...@@ -435,7 +436,7 @@ public class ChromeUtil {
// formatImgContent = formatImgContent.replace(key, ""); // formatImgContent = formatImgContent.replace(key, "");
// } // }
// } // }
entity.setContentWithTag(formatImgContent); entity.setContentWithTag(formatImgContent);
entitys.add(entity); entitys.add(entity);
dirverother.close(); dirverother.close();
} }
...@@ -577,7 +578,7 @@ public class ChromeUtil { ...@@ -577,7 +578,7 @@ public class ChromeUtil {
} }
public void sendExtentity(ExtEntity extEntity,String key,Long orgid,Long tid) { public static void sendExtentity(ExtEntity extEntity,String key,Long orgid,Long tid) {
try { try {
String rediskey = redis_name + orgid + key; String rediskey = redis_name + orgid + key;
String weixinurl = extEntity.getUri(); String weixinurl = extEntity.getUri();
...@@ -598,7 +599,7 @@ public class ChromeUtil { ...@@ -598,7 +599,7 @@ public class ChromeUtil {
docInfo.setTitle(extEntity.getTitle()); docInfo.setTitle(extEntity.getTitle());
docInfo.setAuthor(extEntity.getAuthor()); docInfo.setAuthor(extEntity.getAuthor());
docInfo.setPublishDate(extEntity.getPublishDate()); docInfo.setPublishDate(extEntity.getPublishDate());
docInfo.setOrigin("微信公众号-" + extEntity.getAuthor()); docInfo.setOrigin("微信公众号-" +extEntity.getOrigin()==null ? extEntity.getAuthor():extEntity.getOrigin());
// docInfo.setKeywords(extEntity.getKeywords()); // docInfo.setKeywords(extEntity.getKeywords());
//docInfo.setSummary(extEntity.getSummary()); //docInfo.setSummary(extEntity.getSummary());
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
...@@ -630,6 +631,7 @@ public class ChromeUtil { ...@@ -630,6 +631,7 @@ public class ChromeUtil {
System.out.println("资讯的信息不全没有发送"); System.out.println("资讯的信息不全没有发送");
} }
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
// KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
log.info("发送到kafka成功。"); log.info("发送到kafka成功。");
......
...@@ -177,6 +177,7 @@ public class JsoupTagProcessor implements Processor, Serializable { ...@@ -177,6 +177,7 @@ public class JsoupTagProcessor implements Processor, Serializable {
public String extract(String html) { public String extract(String html) {
Document doc = Jsoup.parse(html); Document doc = Jsoup.parse(html);
doc.removeAttr("div[class=\"weui-mask js_mask\"]");
//如果是exp表达式 //如果是exp表达式
if(isReg){ if(isReg){
if(regPattern!=null){ if(regPattern!=null){
......
...@@ -48,7 +48,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -48,7 +48,7 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
KAFKA_CONSUMER_TOPIC = keyWordsInfo KAFKA_CONSUMER_TOPIC = keyWordsInfo
#消费信息组 #消费信息组
KAFKA_CONSUMER_GROUP_ID=wx-es-sync KAFKA_CONSUMER_GROUP_ID=sgwx-es-sync
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息保存的topic #信息保存的topic
KAFKA_PRODUCT_TOPIC=sgwxcrawlerInfo1 KAFKA_PRODUCT_TOPIC=sgwxcrawlerInfo1
......
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<template><content><exp>*.div[id="js_content"]</exp></content><title><exp>*.h2[class="rich_media_title"]</exp></title><author><exp>*.a[id="post-user"]</exp></author><publish_date><exp>*.em[id="post-date"]</exp></publish_date></template> <template>
<content>
<exp>*.div[id="js_content"]</exp>
</content>
<title>
<exp>*.h2[class="rich_media_title"]|h1[class="rich_media_title"]</exp>
</title>
<author>
<exp>*.span[class="rich_media_meta rich_media_meta_text"]|a[id="post-user"]</exp>
</author>
<publish_date>
<exp>*.em[id="post-date"]</exp>
</publish_date>
<origin>
<exp>*.a[id="js_name"]</exp>
</origin>
</template>
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论