Commit 9f957d5a  Author: liuweigang

Crawler code update 2

Parent 8f456e71
This source diff could not be displayed because it is too large. You can view the blob instead.
KW-20220809-0002
KW-20220602-0003
KW-20220826-0001
KW-20220108-0004
KW-20220602-0002
KW-20220809-0003
KW-20220113-0007
KW-20220602-0006
KW-20220809-0005
KW-20220824-0001
KW-20220809-0004
KW-20220524-0004
KW-20220506-0001
\ No newline at end of file
package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.job.KafkaConsumerJob;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaBaiduSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.util.SpringContextUtil;
......@@ -24,8 +26,10 @@ import org.springframework.boot.web.servlet.support.SpringBootServletInitializer
import org.springframework.context.annotation.Bean;
import javax.servlet.MultipartConfigElement;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Slf4j
......@@ -41,29 +45,14 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i
@Override
public void run(String... args) throws Exception {
consumerPartition();
// System.out.println("——————++++++++++++——————===");
// String key="{\n" +
// " \"id\": \"1532331241232039937\",\n" +
// " \"wordsCode\": \"KW-20220602-0003\",\n" +
// " \"wordsName\": \"人工智能应用\",\n" +
// " \"keyWord\": \"(人工智能|人工智能应用|应用|人工智能技术|人工智能领域|人工智能系统|人工智能产品|智能汽车|无人驾驶|人脸识别|人像识别|面部识别|机器翻译|自然语言处理|声纹识别|智能客服|智能音箱|语音识别|语音合成|个性化推荐|图像识别|图像搜索|人工智能应用|大数据分析|大数据|人工智能设计|人机交互|人工智能方案|人工智能解决方案|人工智能实验室|人工智能模型|人工智能问题|人工智能流程|人工智能设备|生成式对抗网络|计算智能|感知智能|认知智能|机器学习|增强学习|结构化数据|非结构化数据|传感器|理解能力|归纳能力|推理能力|特征提取|模式分析|预测|智能农业|智能工业|智能工厂|工业机器人|智能手机|无人驾驶汽车|无人机|智能机器人|环境感知|路径规划|行为决策|算法|智能分拣|设备健康管理|表面缺陷检测|智能决策|数字孪生|创成式设计|需求预测|供应链优化|深度学习|Applications of artificial intelligence|artificial intelligence|Applications|AI|Driverless Car|Automatic Speech Recognition|ASR|Natural Language Processing|NLP|Text To Speech|TTS|GAN| generative adversarial network|SLAM|simultaneous localization and mapping|Generative Design|AI Application in E-Commerce|Personalized Shopping|AI-powered Assistants|Fraud Prevention| Applications Of Artificial Intelligence in Education|Administrative Tasks Automated to Aid Educators|Administrative Tasks Automated to Aid Educators|Creating Smart Content|Voice Assistants|Personalized Learning|Applications of Artificial Intelligence in Lifestyle|Autonomous Vehicles|Spam Filters|Facial Recognition|Recommendation System|Applications of Artificial intelligence in Navigation|Applications of Artificial Intelligence in Robotics|Applications of Artificial Intelligence in Human Resource|Applications of Artificial Intelligence in Healthcare|Applications of Artificial Intelligence in Agriculture|Applications of Artificial Intelligence in Gaming|Applications of Artificial Intelligence in Automobiles|Applications of Artificial Intelligence in Social Media|Applications of Artificial Intelligence in Marketing| Applications of Artificial Intelligence in Chatbots|Applications of of Artificial Intelligence in Finance)+(人工智能|artificial intelligence|AI)\\n\",\n" +
// " \"exclusionWord\": null,\n" +
// " \"status\": \"1\",\n" +
// " \"subjectId\": null,\n" +
// " \"subjectIds\": null,\n" +
// " \"startTime\": null,\n" +
// " \"endTime\": null \n" +
// "}";
// try {
// KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
// MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
// metaSearchThread.keywordMsg = keywordMsg;
// metaSearchThread.crawler();
// }catch (Exception e){
// e.printStackTrace();
// }
// System.out.println("——————++++++++++++——————===");
try {
consumerPartition();
} catch (Exception e) {
consumerPartition();
}
// loadSiteMsgLoc();
}
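A note on the calls above: consumerPartition() is invoked once, then once more inside a try/catch as a single retry. A minimal bounded-retry sketch of the same idea that could replace that pair (the attempt count and pause are illustrative assumptions, not part of this commit; it relies on run() declaring throws Exception):

for (int attempt = 1; attempt <= 3; attempt++) {
    try {
        consumerPartition();   // the consume loop normally blocks forever; leave the loop if it returns cleanly
        break;
    } catch (Exception e) {
        log.info("consumerPartition failed on attempt " + attempt, e);
        Thread.sleep(5_000L);  // short pause before the next attempt
    }
}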
public void consumerPartition (){
log.info("定时获取mq消息");
......@@ -96,7 +85,55 @@ public class CrawlerMateSearchApplication extends SpringBootServletInitializer i
}
}
public void loadSiteMsgLoc() {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
System.out.println(filepath);
// String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
try {
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
System.out.println(allLines.size());
for (String keysite:allLines) {
try {
String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::"+keysite);
System.out.println("——————++++++++++++——————===");
String subvalue=value.replace(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")),"");
KeywordMsg keywordMsg = new Gson().fromJson(subvalue, KeywordMsg.class);
log.info("关键词解析keywordMsg正常");
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
}catch (Exception e){
continue;
}
}
}catch (Exception e){
log.info(e.getMessage());
}
}
public void loadloc(){
String key="{\n" +
" \"id\": \"2022090522\",\n" +
" \"wordsCode\": \"KW-20220602-0003\",\n" +
" \"wordsName\": \"2022世界机器人大会\",\n" +
" \"keyWord\": \"2022世界机器人大会\",\n" +
" \"exclusionWord\": null,\n" +
" \"status\": \"1\",\n" +
" \"subjectId\": null,\n" +
" \"subjectIds\": null,\n" +
" \"startTime\": null,\n" +
" \"endTime\": null \n" +
"}";
try {
KeywordMsg keywordMsg = new Gson().fromJson(key, KeywordMsg.class);
MetaBaiduSearchThread metaSearchThread = new MetaBaiduSearchThread();
metaSearchThread.keywordMsg = keywordMsg;
metaSearchThread.crawler();
}catch (Exception e){
e.printStackTrace();
}
}
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
......
......@@ -144,7 +144,13 @@ public class JedisUtil {
}
return getDefaultJedis().get(PREFIX + key);
}
public static String getNoPrefixString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().get(key);
}
public static long setnx(String key, String value) throws Exception {
if (StringUtils.isEmpty(key)) {
......
package com.zzsn.search;
import cn.hutool.Hutool;
import cn.hutool.core.date.DateUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
......@@ -123,32 +125,34 @@ public class BaiduSearchThread implements Runnable {
}catch (Exception e){
log.info("缓存出问题");
}
String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]";
// String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]";
// String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]";
String url1= "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]";
// String url1= "https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1";
String url="";
List<String> urlList = new ArrayList<String>();
log.info("url:" + url);
String charset = "utf-8";
// Long orgId = Long.parseLong("2022082801");
Long orgId = Long.parseLong("202208290111");
Long tid = Long.parseLong("202208290111");
// Long orgId = Long.parseLong("2022090301"); //enterprise
// Long orgId = Long.parseLong("20220903022");//machine
Long orgId = Long.parseLong("20220905022");//intelligence
Long tid = Long.parseLong("20220905022");
String proxyid=Constants.PROXY;
if(proxyid.equals("1")) {
CatchWebNews(RecorderUtil.CatchWebOfBaiduByProxy(urlList, charset, orgId, tid),kWord);
}else {
// for (int i = 0; i < 2; i++) {
// String urla = url1.replace("[keyword]",kWord);
//// urla = urla.replace("[startTime]",startTime);
//// urla = urla.replace("[endTime]",endTime);
// urla=urla.replace("[pn]",i*10+"");
// urlList.add(urla);
// }
for (int i = 0; i <30; i++) {
String urla = url1.replace("[keyword]",kWord);
// urla = urla.replace("[startTime]",startTime);
// urla = urla.replace("[endTime]",endTime);
urla=urla.replace("[pn]",i*10+"");
urlList.add(urla);
}
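For reference, the loop above substitutes [keyword] into the request URL verbatim. A small sketch of the same substitution with the keyword URL-encoded first (the encoding step is my assumption, the commit inserts it raw; URLEncoder.encode(String, Charset) needs Java 10+ plus imports java.net.URLEncoder and java.nio.charset.StandardCharsets):

for (int i = 0; i < 30; i++) {
    String urla = url1
            .replace("[keyword]", URLEncoder.encode(kWord, StandardCharsets.UTF_8)) // encode CJK keywords for the query string
            .replace("[pn]", String.valueOf(i * 10));                               // Baidu news pages by result offset, 10 per page
    urlList.add(urla);
}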
// List<CatchWebByMetaSearch> catchWebByMetaSearches = RecorderUtil.catchWebOfBaiduList(urlList, charset, orgId, tid);
try {
String urla = url1.replace("[keyword]",kWord);
urlList.add(urla);
// String urla = url1.replace("[keyword]",kWord);
// urlList.add(urla);
RecorderUtil.CatchWebDetailOfBaidu(urlList, charset, orgId, tid, kWord);
}catch (Exception e){
e.printStackTrace();
......@@ -501,8 +505,10 @@ public static String dateToStamp(String s) {
String content=docInfo.getContentNoTag();
String words=docInfo.getContentWithTag();
String keywords=docInfo.getKeywords();
System.out.println("保存关键词keywords:"+keywords);
String sourceaddress=docInfo.getSourceaddress();
String fromwhere="百度搜索";
String fromwhere="百度搜索"+DateUtil.now();
DBManager dm = new DBManager();
String[] coulmn = new String[]{id, sid, title,summary,publish_date,origin,author,content,words,keywords,sourceaddress,fromwhere};
......
......@@ -81,7 +81,7 @@ public class MetaBaiduSearchThread implements Runnable {
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Async("asyncexecutorServiceWebBaidu")
// @Async("asyncexecutorServiceWebBaidu")
public void crawler(){
//combine the incoming keyword group
String keyWord = keywordMsg.getKeyWord();
......@@ -114,27 +114,27 @@ public class MetaBaiduSearchThread implements Runnable {
}
String keyid=cache_key+"_"+keywordMsg.getId();
for (String kWord :keyWords) {
try {
boolean sismember = JedisUtil.sismember(keyid, kWord);
if (sismember) {
// try {
// boolean sismember = JedisUtil.sismember(keyid, kWord);
// if (sismember) {
// if(StringUtils.isEmpty(keywordMsg.getEndTime().toString())||Long.parseLong(nowTime)*1000>Long.parseLong(endTime)){
// startTime=nowTime;
// endTime=nowTime;
// }
// }
// }catch (Exception e){
// log.info("缓存出问题");
// continue;
if(StringUtils.isEmpty(keywordMsg.getEndTime().toString())||Long.parseLong(nowTime)*1000>Long.parseLong(endTime)){
startTime=nowTime;
endTime=nowTime;
}
}
}catch (Exception e){
log.info("缓存出问题");
}
// }
// String url1= "https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1";
String url1= Constants.META_SEARCH_URL;
String url="";
List<String> urlList = new ArrayList<String>();
log.info("url:" + url);
String charset = "utf-8";
Long orgId = Long.parseLong(keywordMsg.getId());
Long tid = Long.parseLong(keywordMsg.getId());
for (int i = 0; i < 76; i++) {
Long orgId = Long.parseLong(keywordMsg.getWordsCode());//keyword group code
Long tid = Long.parseLong(keywordMsg.getId());//keyword group id
for (int i = 0; i < 6; i++) {
String urla = url1.replace("[keyword]",kWord);
urla = urla.replace("[startTime]",startTime);
urla = urla.replace("[endTime]",endTime);
......@@ -147,6 +147,7 @@ public class MetaBaiduSearchThread implements Runnable {
JedisUtil.sadd(keyid, kWord);
}catch (Exception e){
log.info("缓存保存数据失败!");
continue;
}
}
}
......@@ -206,16 +207,27 @@ public class MetaBaiduSearchThread implements Runnable {
// check whether this URL already exists in the cache pool
String orgId = String.valueOf(cwbm.getOrgId());
try {
String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress());
if (!org.apache.commons.lang3.StringUtils.isEmpty(urlflag)) {
log.info(cwbm.getSourceaddress()+" 数据重复");
boolean sismember = JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress());
if (sismember) {
log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress());
repeat++;
continue;
}
}catch (Exception e){
} catch (Exception e) {
log.info("redis获取信息失败");
}
// try {
// String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress());
// if (!org.apache.commons.lang3.StringUtils.isEmpty(urlflag)) {
// log.info(cwbm.getSourceaddress()+" 数据重复");
// repeat++;
// continue;
// }
// }catch (Exception e){
// log.info("redis获取信息失败");
// }
String infourl = cwbm.getSourceaddress();
String infodata = "";
String charset = "";
......@@ -354,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable {
log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
"|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
// intsertData(docInfo);
intsertData(docInfo);
//convert the document info
ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
ObjectMapper mapper = new ObjectMapper();
String docjson = mapper.writeValueAsString(processitem);
System.out.println(docjson);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
// kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
log.info("发送成功到kafka");
}else {
log.info("资讯发布时间:"+docInfo.getPublishDate());
......
......@@ -28,10 +28,11 @@ public class KeywordMsg {
private List<String> subjectIds;
private Long startTime;
// private List<String> startTime;
private Long endTime;
//search engines (collectors) to launch
private List<String> searchEngines;
// private List<String> searchEngines;
//crawl scope (1: title  2: body  3: full text)
private String crawlerType;
......
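The crawlerType field above carries a bare string code. A hypothetical enum for the mapping spelled out in its comment (1: title, 2: body, 3: full text); this helper is not part of the commit:

public enum CrawlerType {
    TITLE("1"), BODY("2"), FULL_TEXT("3");

    private final String code;

    CrawlerType(String code) { this.code = code; }

    // resolve the code carried by KeywordMsg.crawlerType; unknown values fall back to FULL_TEXT
    public static CrawlerType fromCode(String code) {
        for (CrawlerType t : values()) {
            if (t.code.equals(code)) {
                return t;
            }
        }
        return FULL_TEXT;
    }
}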
......@@ -324,14 +324,6 @@ public class RecorderUtil {
log.info("url处理异常!");
}
docstr = proxyRequest(uri_code);
// String proxyIP = getProxyIP();
// log.info("使用的代理IP:"+proxyIP );
// String[] proxys=proxyIP.split("-");
// String proxyHost = proxys[0];
// int proxyPort = Integer.parseInt(proxys[1]);
// String userName = proxys[2];
// String password = proxys[3];
// docstr = HttpClientProxy.build(proxyHost, proxyPort, userName, password).requestUrl(uri_code); // 代理认证
log.info("请求内容:"+docstr);
} catch (Exception e) {
log.info("使用代理请求异常");
......@@ -361,6 +353,7 @@ public class RecorderUtil {
Thread.sleep(1000);
SeleniumTime seleniumTime2=new SeleniumTime();
docstr=seleniumTime2.getScopehtml(urlList.get(i));
doc = Jsoup.parse(docstr);
firstElementsLink = doc.select("div[id=\"content_left\"]>div.new-pmd");
}catch (Exception e){
e.printStackTrace();
......@@ -406,7 +399,7 @@ public class RecorderUtil {
//parse the details of each result list as it is collected
MetaBaiduSearchThread baiduSearchThread=new MetaBaiduSearchThread();
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords);
if(repeat/metaSearchList.size()>0.7){
if(repeat/metaSearchList.size()>0.6){
break;
}
catchWebByMetaSearchList.addAll(metaSearchList);
......@@ -487,9 +480,9 @@ public class RecorderUtil {
}
BaiduSearchThread baiduSearchThread=new BaiduSearchThread();
int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keyword);
// if(repeat/metaSearchList.size()>0.7){
// break;
// }
if(repeat/metaSearchList.size()>0.7){
break;
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
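One caveat on the two ratio checks above: repeat and metaSearchList.size() are both int, so repeat/metaSearchList.size() is integer division and only ever yields 0 or 1, meaning the 0.6/0.7 thresholds fire only when every result repeats. A minimal sketch of the comparison in floating point, reusing the names from the hunks (the cast is my addition, not in the commit):

int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keyword);
double repeatRatio = (double) repeat / metaSearchList.size(); // fraction of the page already seen
if (repeatRatio > 0.6) {
    break; // stop paging once most of the current page is duplicates
}
catchWebByMetaSearchList.addAll(metaSearchList);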
......
......@@ -72,7 +72,7 @@ public class SeleniumTime {
chromeOptions.addArguments("headless");//headless mode
chromeOptions.addArguments("no-sandbox");//disable the sandbox; this single flag cost a full day of debugging
}
chromeOptions.addArguments("headless");//headless mode
// chromeOptions.addArguments("headless");//headless mode
driver = new ChromeDriver(chromeOptions);//create the driver instance
String html="";
try{
......
......@@ -44,13 +44,17 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=google_crawler_urlList
#search URL
#META_SEARCH_URL=https://www.google.com.hk/search?q=[keyword]&newwindow=1&tbs=cdr:1,cd_min:[startTime],cd_max:[endTime]&tbm=nws&ei=fYBfYp-CHffo2roPhoOPsA4&start=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]
META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]
META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=[keyword]
#META_SEARCH_URL=https://www.baidu.com/s?wd=[keyword]&pn=[pn]&oq=[keyword]&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&gpc=stf=[startTime],[endTime]|stftype=2&tfflag=1
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\baidu_search\\data\\project.txt
#META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\baidu_search\\data\\projectkw.txt
# Redis settings
#redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
redis.host=127.0.0.1
redis.port=6379
redis.pass=xxxxxx
......
......@@ -46,12 +46,12 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
@Override
public void run(String... args) throws Exception {
// try {
// loadSiteMsg();
// } catch (Exception e) {
// loadSiteMsg();
// }
loadSiteMsgLoc();
try {
loadSiteMsg();
} catch (Exception e) {
loadSiteMsg();
}
// loadSiteMsgLoc();
// loadSiteMsgLoc2();
// loadSiteMsgLoc3();
}
......
package com.zzsn.test;
import java.net.InetAddress;
import java.net.UnknownHostException;
/**
* Resolve IP addresses from a domain name
*
* @author java小强
*/
public class TestInetAddress {
public static void main(String args[]) {
TestInetAddress address = new TestInetAddress();
String domain = "www.163.com";
System.out.println("要获取域名的地址为: " + domain);
InetAddress[] array = address.getServerIP(domain);
int count = 0;
for (int i = 0; i < array.length; i++) {
System.out.println("ip:" + i + "--》" + array[i].getHostAddress());
count++;
}
System.out.println("该域名解析到IP: " + count);
}
/**
* Get the IP addresses of a domain
*/
public InetAddress[] getServerIP(String domain) {
InetAddress[] myServer = null;
try {
myServer = InetAddress.getAllByName(domain);
} catch (UnknownHostException e) {
e.printStackTrace();
}
return myServer;
}
}
......@@ -63,14 +63,34 @@ public class DriverUtil {
// log.error("未获取到驱动服务地址、sessionId");
// return null;
// }
ReuseWebDriver driver = new ReuseWebDriver(serverUrl, sessionId);
ReuseWebDriver driver=null;
try {
driver = new ReuseWebDriver(serverUrl, sessionId);
System.out.println(driver.connectTestFail());
if (driver.connectTestFail()) {
// if the driver returns an error code, recreate the driver service and cache it
ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
serverUrl = DriverUtil.getServerUrl(chromeDriver);
if(chromeDriver.getSessionId() != null) {
if (chromeDriver.getSessionId() != null) {
sessionId = chromeDriver.getSessionId().toString();
}
Map<String, String> map = new HashMap<>(2);
map.put("sessionId", sessionId);
map.put("serverUrl", serverUrl);
// cache the browser driver info
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId);
}
}catch (Exception e){
System.out.println("出现异常");
// if the driver returns an error code, recreate the driver service and cache it
ChromeDriver chromeDriver = DriverUtil.reconnectDriver();
serverUrl = DriverUtil.getServerUrl(chromeDriver);
if (chromeDriver.getSessionId() != null) {
sessionId = chromeDriver.getSessionId().toString();
}
......@@ -79,6 +99,7 @@ public class DriverUtil {
map.put("serverUrl", serverUrl);
// cache the browser driver info
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动driver失败重新设置:" + Constants.SELENIUM_DRIVER_CACHE + "::" + JSON.toJSONString(map));
driver = new ReuseWebDriver(serverUrl, sessionId);
}
return driver;
......@@ -97,6 +118,7 @@ public class DriverUtil {
}
public static Map<String, String> getSessionInfo() throws Exception{
String cacheInfo = JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE);
System.out.println("获取驱动session:"+Constants.SELENIUM_DRIVER_CACHE+"::"+cacheInfo);
Map<String, String> map = JSON.parseObject(cacheInfo, Map.class);
if(map==null || map.size()<1) {
map = new HashMap<>(2);
......@@ -104,6 +126,7 @@ public class DriverUtil {
map.put("serverUrl", "https://www.baidu.com/");
// cache the browser driver info
JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, JSON.toJSONString(map), -1);
System.out.println("获取驱动session失败重新设置:"+Constants.SELENIUM_DRIVER_CACHE+"::"+JSON.toJSONString(map));
}
return map;
}
......
......@@ -48,6 +48,7 @@ boiler.timeout.readTimeout=6000
logging.level.root=info
logging.level.org.springframework.web=info
logging.level.org.hibernate=info
logging.level.com.gargoylesoftware=off
logging.config=classpath:logback-spring.xml
kafka.consumer.task=0 0/2 * * * ?
......
......@@ -3,6 +3,15 @@ package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.search.MetaSoSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
......@@ -10,6 +19,11 @@ import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Properties;
@Slf4j
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerStaticApplication extends SpringBootServletInitializer implements CommandLineRunner {
@Override
......@@ -41,6 +55,57 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// MetaSoSearchThread metaSearchThread=new MetaSoSearchThread();
// metaSearchThread.keywordMsg=keywordMsg;
// metaSearchThread.crawler();
soKeyword ();
}
public void soKeyword (){
log.info("定时获取mq消息");
//1. create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
// consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
String kafkaConsumerPartition = Constants.KAFKA_CONSUMER_PARTITION;
String[] partitions = kafkaConsumerPartition.split(",");
for (int i = 0; i < partitions.length; i++) {
topicPartitions.add(new TopicPartition(Constants.KAFKA_CONSUMER_TOPIC, Integer.parseInt(partitions[i])));
}
consumer.assign(topicPartitions);
try{
while(true){
//the consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread exits the loop
//wait 0 ms for the broker to return data; the timeout tells poll how long it may block before returning, whether or not records are available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
MetaSoSearchThread metaSearchThread=new MetaSoSearchThread();
metaSearchThread.keywordMsg=keywordMsg;
metaSearchThread.crawler();
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
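A side note on the loop above: offsets are committed right after poll() and before the records are handled, so a crash mid-batch drops those keywords (at-most-once delivery). A minimal sketch of the at-least-once ordering using the same calls (only the position of commitSync changes):

while (true) {
    ConsumerRecords<String, String> records = consumer.poll(0);
    for (ConsumerRecord<String, String> record : records) {
        KeywordMsg keywordMsg = new Gson().fromJson(record.value(), KeywordMsg.class);
        MetaSoSearchThread metaSearchThread = new MetaSoSearchThread();
        metaSearchThread.keywordMsg = keywordMsg;
        metaSearchThread.crawler();
    }
    consumer.commitSync(); // commit only after the whole batch has been processed
}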
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//where to start reading when no committed offset exists
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//max poll interval set to 1 hour
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
}
\ No newline at end of file
......@@ -48,7 +48,7 @@ public class KafkaConsumerSoTask {
}
//when packaging, choose which scheduled task to enable: uncomment the @Scheduled above consumerKeyword for keyword search, or the one above consumerDetailUrl for content parsing
@Scheduled(cron = "0 0/3 * * * ?")
// @Scheduled(cron = "0 0/3 * * * ?")
@Async("webExecutor")
public void consumerKeyword (){
log.info("定时获取mq消息");
......
......@@ -58,9 +58,11 @@ public class MetaSoRecorderUtil {
doc = conn.timeout(10000).get();
} catch (Exception ex) {
// ex.printStackTrace();
System.out.println("360搜索中该关键词搜索没有相关新闻!");
log.info("360搜索关键词异常");
// continue;
}
// test: fetch the page content through a simulated browser instead
String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
if(doc==null){
String proxyIP = getProxyIP();
log.info("使用的代理IP:"+proxyIP );
......@@ -69,10 +71,10 @@ public class MetaSoRecorderUtil {
int proxyPort = Integer.parseInt(proxys[1]);
String userName = proxys[2];
String password = proxys[3];
String docstr = HttpClientProxy.build(proxyHost, proxyPort, userName, password).requestUrl(uri_code); // 代理认证
docstr = HttpClientProxy.build(proxyHost, proxyPort, userName, password).requestUrl(uri_code); // authenticated proxy request
log.info("请求内容:"+docstr);
doc=Jsoup.parse(docstr);
}
doc=Jsoup.parse(docstr);
System.out.println("----360搜索----" + uri);
Elements firstElementsLink = doc.select("li[data-from=\"news\"]");
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
......
......@@ -110,7 +110,7 @@ public class MetaSoSearchThread implements Runnable {
if(proxyid.equals("1")) {
CatchWebNews(SoRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid),kWord);
}else {
for (int i = 1; i < 100; i++) {
for (int i = 1; i < 50; i++) {
String urla = url1.replace("[keyword]",kWord);
urla=urla.replace("[page_num]",i+"");
urlList.add(urla);
......
package com.zzsn;
import com.google.gson.Gson;
import com.zzsn.cache.JedisUtil;
import com.zzsn.job.KafkaConsumerSougouTask;
import com.zzsn.search.FileUtil;
import com.zzsn.search.MetaSouGouSearchThread;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
......@@ -20,8 +22,10 @@ import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
@Slf4j
......@@ -56,8 +60,41 @@ public class CrawlerStaticApplication extends SpringBootServletInitializer imple
// MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
// metaSearchThread.keywordMsg=keywordMsg;
// metaSearchThread.crawler();
try {
consumerKeyword ();
} catch (Exception e) {
consumerKeyword ();
}
// loadSiteMsgLoc();
}
public void loadSiteMsgLoc() {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
System.out.println(filepath);
// String filepath="E:\\baidu\\gaojibaidu\\baidu1\\data\\project.txt";
try {
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
System.out.println(allLines.size());
for (String keysite:allLines) {
try {
String value = JedisUtil.getNoPrefixString("KEY_WORDS_TO_REDIS::"+keysite);
System.out.println("——————++++++++++++——————===");
String subvalue=value.replace(value.substring(value.indexOf("startTime"),value.indexOf("searchEngines")),"");
KeywordMsg keywordMsg = new Gson().fromJson(subvalue, KeywordMsg.class);
log.info("关键词解析keywordMsg正常");
MetaSouGouSearchThread metaSearchThread=new MetaSouGouSearchThread();
metaSearchThread.keywordMsg=keywordMsg;
metaSearchThread.crawler();
}catch (Exception e){
continue;
}
}
}catch (Exception e){
log.info(e.getMessage());
}
}
public void consumerKeyword (){
log.info("定时获取mq消息");
......
......@@ -52,7 +52,11 @@ public class JedisUtil {
config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
//whether to validate a jedis instance before it is borrowed; if true, every instance handed out is usable;
config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
if(pass.equals("xxxxxx")){
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
}else{
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout),pass);
}
}
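The branch above treats the literal placeholder "xxxxxx" as "no password". A compact sketch of the same decision that also covers a missing value (the null/empty check is an assumption, not in this hunk):

boolean noAuth = pass == null || pass.isEmpty() || "xxxxxx".equals(pass); // placeholder means an unauthenticated Redis
jedisPool = noAuth
        ? new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout))
        : new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout), pass);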
private static Jedis getJedis() {
init();
......@@ -136,6 +140,13 @@ public class JedisUtil {
return getDefaultJedis().get(PREFIX + key);
}
public static String getNoPrefixString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().get(key);
}
public static long setnx(String key, String value) throws Exception {
if (StringUtils.isEmpty(key)) {
......
......@@ -130,7 +130,7 @@ public class SouGouRecorderUtil {
catchWebByMetaSearches.add(catchMetaSearch);
DetailSouGouSearchThread detailSouGouSearchThread=new DetailSouGouSearchThread();
detailSouGouSearchThread.CatchWebNews(catchWebByMetaSearches,"");
detailSouGouSearchThread.CatchWebNews(catchWebByMetaSearches,"2022世界机器人大会");
// ObjectMapper mapper = new ObjectMapper();
// try {
// String docjson = mapper.writeValueAsString(catchMetaSearch);
......
......@@ -28,9 +28,42 @@ public class WebSouGouSearch {
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
paser(allLines);
paser2();
}
public static void paser2(){
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<WebSouGouSearchThread> workers = new Vector<WebSouGouSearchThread>();
int index = 0;
try {
List<String> keywordList=new ArrayList<>();
keywordList.add("2022世界机器人大会");
WebSouGouSearchThread webSouGouSearchThread = new WebSouGouSearchThread();
webSouGouSearchThread.setThreadId(index++);
webSouGouSearchThread.setKeywords(keywordList);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2022090333");
keywordMsg.setKeyWord("2022世界机器人大会");
webSouGouSearchThread.setKeywordMsg(keywordMsg);
workers.add(webSouGouSearchThread);
threadPool.execute(webSouGouSearchThread);
}catch (Exception e){
e.printStackTrace();
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
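paser2() above polls isTerminated() every second after shutdown(). An equivalent wait using awaitTermination, which blocks without the sleep loop (the one-hour cap is an illustrative assumption; needs import java.util.concurrent.TimeUnit):

threadPool.shutdown();
try {
    if (!threadPool.awaitTermination(1, TimeUnit.HOURS)) {
        threadPool.shutdownNow(); // interrupt whatever is still running once the cap is hit
    }
} catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    threadPool.shutdownNow();
}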
public static void paser(List<String> keywords){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
......@@ -42,8 +75,8 @@ public class WebSouGouSearch {
webSouGouSearchThread.setThreadId(index++);
webSouGouSearchThread.setKeywords(keywordList);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2022052504");
keywordMsg.setKeyWord("衣学东");
keywordMsg.setId("2022090333");
keywordMsg.setKeyWord("2022世界机器人大会");
webSouGouSearchThread.setKeywordMsg(keywordMsg);
workers.add(webSouGouSearchThread);
threadPool.execute(webSouGouSearchThread);
......
......@@ -108,7 +108,7 @@ public class WebSouGouSearchThread implements Runnable {
if(proxyid.equals("1")) {
// CatchWebNews(SouGouRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid),kWord);
}else {
for (int i = 1; i < 100; i++) {
for (int i = 1; i < 20; i++) {
String urla = url1.replace("[keyword]",kWord);
urla=urla.replace("[page_num]",i+"");
urlList.add(urla);
......
......@@ -138,6 +138,7 @@ public class MetaSouGouRecorderUtil {
}
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSid(tid);
metaSearchList.add(catchWebByMetaSearch);
}
MetaSouGouSearchThread metaSouGouSearchThread=new MetaSouGouSearchThread();
......
......@@ -78,7 +78,7 @@ public class MetaSouGouSearchThread implements Runnable {
}
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Async("asyncexecutorServiceWebBaidu")
// @Async("asyncexecutorServiceWebBaidu")
public void crawler(){
//combine the incoming keyword group
String keyWord = keywordMsg.getKeyWord();
......@@ -107,7 +107,7 @@ public class MetaSouGouSearchThread implements Runnable {
Long orgId = Long.parseLong(keywordMsg.getId());
Long tid = Long.parseLong(keywordMsg.getId());
String proxyid=Constants.PROXY;
for (int i = 1; i < 500; i++) {
for (int i = 1; i < 10; i++) {
String urla = url1.replace("[keyword]",kWord);
urla=urla.replace("[page_num]",i+"");
urlList.add(urla);
......
......@@ -30,7 +30,7 @@ public class KeywordMsg {
private Long endTime;
//search engines (collectors) to launch
private List<String> searchEngines;
// private List<String> searchEngines;
//crawl scope (1: title  2: body  3: full text)
private String crawlerType;
......
......@@ -45,12 +45,15 @@ META_SEARCH_URL=https://www.sogou.com/sogou?interation=1728053249&query=intitle:
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\souGouSearch\\data\\projectbak.txt
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\meta_crawler\\sougou_crawler\\data\\project.txt
# Redis settings
redis.host=127.0.0.1
redis.host=114.115.236.206
redis.port=6379
redis.pass=xxxxxx
redis.pass=clbzzsn
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
redis.timeout=10000
redis.maxIdle=300
......