提交 80a9df55 作者: liuweigang

采集代码更新4

上级 9f957d5a
......@@ -132,7 +132,7 @@ public class MetaBaiduSearchThread implements Runnable {
List<String> urlList = new ArrayList<String>();
log.info("url:" + url);
String charset = "utf-8";
Long orgId = Long.parseLong(keywordMsg.getWordsCode());//关键词组编码
Long orgId = Long.parseLong(keywordMsg.getId());//关键词组编码
Long tid = Long.parseLong(keywordMsg.getId());//关键词组id
for (int i = 0; i < 6; i++) {
String urla = url1.replace("[keyword]",kWord);
......@@ -141,7 +141,7 @@ public class MetaBaiduSearchThread implements Runnable {
urla=urla.replace("[pn]",i*10+"");
urlList.add(urla);
}
List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid, keyWord);
List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid, keyWord,keywordMsg);
try {
//对关键词进行缓存判断 开始时间和结束时间
JedisUtil.sadd(keyid, kWord);
......@@ -197,7 +197,7 @@ public class MetaBaiduSearchThread implements Runnable {
}
// 抓取新闻内容
public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) {
public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword,KeywordMsg keywordMsg) {
int repeat=0;
try {
int count = 0;
......@@ -205,7 +205,7 @@ public class MetaBaiduSearchThread implements Runnable {
try {
CatchWebByMetaSearch cwbm = catchWebList.get(i);
// 判断该网址是否存在于缓存池中
String orgId = String.valueOf(cwbm.getOrgId());
String orgId = String.valueOf(keywordMsg.getWordsCode());
try {
boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress());
if (sismember) {
......@@ -366,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
intsertData(docInfo);
// intsertData(docInfo);
//信息转换
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(processitem);
System.out.println(docjson);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
log.info("发送成功到kafka");
}else {
log.info("资讯发布时间:"+docInfo.getPublishDate());
......
......@@ -3,6 +3,7 @@ package com.zzsn.search.util;
import cn.hutool.core.util.RandomUtil;
import com.zzsn.search.BaiduSearchThread;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.oracledb.OracleDBManager;
import com.zzsn.search.oracledb.OracleDataTable;
import com.zzsn.utility.index.Constants;
......@@ -303,7 +304,7 @@ public class RecorderUtil {
// 提取百度新闻列表URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfBaiduList(
List<String> urlList, String charset, Long orgId, Long tid,String keywords) {
List<String> urlList, String charset, Long orgId, Long tid, String keywords, KeywordMsg keywordMsg) {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
try {
for (int i = 0; i < urlList.size(); i++) {
......@@ -398,7 +399,7 @@ public class RecorderUtil {
}
//对采集一个列表解析一个列表的详情
MetaBaiduSearchThread baiduSearchThread=new MetaBaiduSearchThread();
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords);
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords,keywordMsg);
if(repeat/metaSearchList.size()>0.6){
break;
}
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论