提交 80a9df55 作者: liuweigang

采集代码更新4

上级 9f957d5a
...@@ -132,7 +132,7 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -132,7 +132,7 @@ public class MetaBaiduSearchThread implements Runnable {
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
log.info("url:" + url); log.info("url:" + url);
String charset = "utf-8"; String charset = "utf-8";
Long orgId = Long.parseLong(keywordMsg.getWordsCode());//关键词组编码 Long orgId = Long.parseLong(keywordMsg.getId());//关键词组编码
Long tid = Long.parseLong(keywordMsg.getId());//关键词组id Long tid = Long.parseLong(keywordMsg.getId());//关键词组id
for (int i = 0; i < 6; i++) { for (int i = 0; i < 6; i++) {
String urla = url1.replace("[keyword]",kWord); String urla = url1.replace("[keyword]",kWord);
...@@ -141,7 +141,7 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -141,7 +141,7 @@ public class MetaBaiduSearchThread implements Runnable {
urla=urla.replace("[pn]",i*10+""); urla=urla.replace("[pn]",i*10+"");
urlList.add(urla); urlList.add(urla);
} }
List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid, keyWord); List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid, keyWord,keywordMsg);
try { try {
//对关键词进行缓存判断 开始时间和结束时间 //对关键词进行缓存判断 开始时间和结束时间
JedisUtil.sadd(keyid, kWord); JedisUtil.sadd(keyid, kWord);
...@@ -197,7 +197,7 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -197,7 +197,7 @@ public class MetaBaiduSearchThread implements Runnable {
} }
// 抓取新闻内容 // 抓取新闻内容
public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) { public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword,KeywordMsg keywordMsg) {
int repeat=0; int repeat=0;
try { try {
int count = 0; int count = 0;
...@@ -205,7 +205,7 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -205,7 +205,7 @@ public class MetaBaiduSearchThread implements Runnable {
try { try {
CatchWebByMetaSearch cwbm = catchWebList.get(i); CatchWebByMetaSearch cwbm = catchWebList.get(i);
// 判断该网址是否存在于缓存池中 // 判断该网址是否存在于缓存池中
String orgId = String.valueOf(cwbm.getOrgId()); String orgId = String.valueOf(keywordMsg.getWordsCode());
try { try {
boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress()); boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress());
if (sismember) { if (sismember) {
...@@ -366,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable { ...@@ -366,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+ log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+"")); "|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
intsertData(docInfo); // intsertData(docInfo);
//信息转换 //信息转换
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo); ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(processitem); String docjson = mapper.writeValueAsString(processitem);
System.out.println(docjson); System.out.println(docjson);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson); kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
log.info("发送成功到kafka"); log.info("发送成功到kafka");
}else { }else {
log.info("资讯发布时间:"+docInfo.getPublishDate()); log.info("资讯发布时间:"+docInfo.getPublishDate());
......
...@@ -3,6 +3,7 @@ package com.zzsn.search.util; ...@@ -3,6 +3,7 @@ package com.zzsn.search.util;
import cn.hutool.core.util.RandomUtil; import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.BaiduSearchThread; import com.zzsn.search.BaiduSearchThread;
import com.zzsn.search.MetaBaiduSearchThread; import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.oracledb.OracleDBManager; import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable; import com.zzsn.search.oracledb.OracleDataTable;
import com.zzsn.utility.index.Constants; import com.zzsn.utility.index.Constants;
...@@ -303,7 +304,7 @@ public class RecorderUtil { ...@@ -303,7 +304,7 @@ public class RecorderUtil {
// 提取百度新闻列表URL // 提取百度新闻列表URL
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfBaiduList( public static List<CatchWebByMetaSearch> catchWebOfBaiduList(
List<String> urlList, String charset, Long orgId, Long tid,String keywords) { List<String> urlList, String charset, Long orgId, Long tid, String keywords, KeywordMsg keywordMsg) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>(); List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
try { try {
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
...@@ -398,7 +399,7 @@ public class RecorderUtil { ...@@ -398,7 +399,7 @@ public class RecorderUtil {
} }
//对采集一个列表解析一个列表的详情 //对采集一个列表解析一个列表的详情
MetaBaiduSearchThread baiduSearchThread=new MetaBaiduSearchThread(); MetaBaiduSearchThread baiduSearchThread=new MetaBaiduSearchThread();
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords); int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords,keywordMsg);
if(repeat/metaSearchList.size()>0.6){ if(repeat/metaSearchList.size()>0.6){
break; break;
} }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论