Commit e43e2f42 Author: liuweigang

Crawler changes, commit 3

Parent 23352971
Japan Science and Technology Agency
Research Institute of Economy
International Institute for Strategic Studies
\ No newline at end of file
Japan Science and Technology Agency
Research Institute of Economy
International Institute for Strategic Studies
\ No newline at end of file
......@@ -139,7 +139,7 @@ public class DetailGoogleSearchThread implements Runnable {
String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+cwbm.getSourceaddress());
if (!StringUtils.isEmpty(urlflag)) {
log.info(cwbm.getSourceaddress()+" 数据重复");
break;
continue;
}
}catch (Exception e){
log.info("redis获取信息失败");
......@@ -151,12 +151,17 @@ public class DetailGoogleSearchThread implements Runnable {
if (infourl == null || infourl.contains(".pdf") || infourl.trim().length()==0|| infourl.contains(".PDF")||infourl.contains("download")) {
continue;
}
// infodata=getContentByUrl(infourl);
// if(StringUtils.isEmpty(infodata)) {
//// 测试获取内容通过模拟浏览器获取
// infodata = ChromeUtil.getChromeDoc(infourl);
// }
infodata = ChromeUtil.getChromeDoc(infourl);
infodata=getContentByUrl(infourl);
if(StringUtils.isEmpty(infodata)) {
// Fallback: fetch the content through a simulated (headless) browser
infodata = ChromeUtil.getDoc(infourl);
try {
Thread.sleep(1000 * 2);
}catch (Exception e){
}
}
// infodata = ChromeUtil.getChromeDoc(infourl);
// String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
if(StringUtils.isEmpty(infodata)){
try {
......@@ -202,37 +207,10 @@ public class DetailGoogleSearchThread implements Runnable {
docInfo.setSummary(cwbm.getSummary());
StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
try {
// Check whether a template exists for the corresponding domain
// if(infourl.contains("qq.com") && !infourl.contains("://new.qq.com")){
// infourl= transqqURl(infourl);
// }
// String domainurl = new URL(infourl).getHost();
// Object siteTempObj = MemcachedUtils.get("domainUri_"+domainurl);
// SiteTemplate siteTemplate=new SiteTemplate();
// if (siteTempObj != null && !"null".equals(siteTempObj)) {
// Site site=(Site)siteTempObj;
// siteTemplate.setMatchTitle(site.getMatchTitle());
// siteTemplate.setMatchAuthor(site.getMatchAuthor());
// siteTemplate.setMatchContent(site.getMatchContent());
// siteTemplate.setMatchOrigin(site.getMatchOrigin());
// siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
// siteTemplate.setMatchSummary(site.getMatchSummary());
// System.out.println("1++++++++doPaserByTag");
// docInfo= SourceTemplateByTag.doPaserByTag(content, docInfo, siteTemplate);
// }
// if(null!=docInfo.getContentWithTag()) {
// System.out.println("使用模板解析内容成功"+domainurl);
// log.info("使用模板解析内容成功"+domainurl);
// }
if(null==docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
// SourceTemplateByTag.saveNoTempSite(cwbm);
swe.doHandler(content, docInfo);
}
swe.doHandler(content, docInfo);
} catch (Exception e1) {
log.info("模板解析异常"+e1.getMessage());
// SourceTemplateByTag.saveNoTempSite(cwbm);
swe.doHandler(content, docInfo);
// swe.doHandler(content, docInfo);
}
System.out.println(docInfo.getTitle()+"---"+docInfo.getSourceaddress());
docInfo.setFileDownLoadPath(null);
......
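The main change in this hunk is the fetch order: try a plain HTTP request first and fall back to the headless browser only when it returns nothing. A minimal sketch of that pattern, using Jsoup directly and a hypothetical fetchWithBrowser helper standing in for ChromeUtil.getDoc:

import org.jsoup.Jsoup;

public class FallbackFetchSketch {
    // Plain HTTP fetch first; headless-browser fallback only when nothing comes back.
    static String fetch(String url) {
        String html = null;
        try {
            html = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(10_000).get().outerHtml();
        } catch (Exception e) {
            // Network or HTTP failure: leave html null so the browser fallback runs.
        }
        if (html == null || html.trim().isEmpty()) {
            html = fetchWithBrowser(url);                                           // hypothetical ChromeDriver-based helper
            try { Thread.sleep(2_000); } catch (InterruptedException ignored) { }   // throttle, as the patch does
        }
        return html;
    }

    // Stand-in for a Selenium fetch such as ChromeUtil.getDoc(url).
    static String fetchWithBrowser(String url) {
        return null;
    }
}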
......@@ -84,12 +84,73 @@ public class GoogleRecorderUtil {
}
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfGoogleListByKey(
List<String> urlList, String charset, Long orgId, Long tid,String keyword,KafkaTemplate kafkaTemplate) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i <100; i++) {
// Thread.sleep(2000);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
System.out.println(urlList.get(i));
// String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20)+"");
String docstr=ChromeUtil.getChromeDocKey(keyword,i*20+"");
System.out.println("列表页内容"+docstr.length());
System.out.println("关键词请求:"+keyword+"第"+i+"页");
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//Stop looping if there are no results
if(firstElementsLink.size()==0){
break;
}
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
//Title
// Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//Link
Elements a=firstElementsLink.get(j).select("a");
//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
//catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//Source
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
// for (CatchWebByMetaSearch cWebByMetaSearch:metaSearchList){
// List<CatchWebByMetaSearch> catchWebByMetaSearches=new ArrayList<>();
// catchWebByMetaSearches.add(cWebByMetaSearch);
// }
DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread();
detailGoogleSearchThread.CatchWebNews(metaSearchList,keyword);
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
public static List<CatchWebByMetaSearch> catchWebOfGoogleList(
List<String> urlList, String charset, Long orgId, Long tid,String keyword,KafkaTemplate kafkaTemplate) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(2000);
// Thread.sleep(2000);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
......@@ -97,8 +158,11 @@ public class GoogleRecorderUtil {
// String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20)+"");
String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
if(docstr==null){
Thread.sleep(20000);
continue;
}
System.out.println("列表页内容"+docstr.length());
System.out.println("关键词请求:"+keyword+"第"+i+"页");
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//Stop looping if there are no results
......@@ -126,12 +190,13 @@ public class GoogleRecorderUtil {
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
for (CatchWebByMetaSearch cWebByMetaSearch:metaSearchList){
List<CatchWebByMetaSearch> catchWebByMetaSearches=new ArrayList<>();
catchWebByMetaSearches.add(cWebByMetaSearch);
DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread();
detailGoogleSearchThread.CatchWebNews(catchWebByMetaSearches,keyword);
}
// for (CatchWebByMetaSearch cWebByMetaSearch:metaSearchList){
// List<CatchWebByMetaSearch> catchWebByMetaSearches=new ArrayList<>();
// catchWebByMetaSearches.add(cWebByMetaSearch);
// }
DetailGoogleSearchThread detailGoogleSearchThread=new DetailGoogleSearchThread();
detailGoogleSearchThread.CatchWebNews(metaSearchList,keyword);
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
......
......@@ -26,7 +26,7 @@ public class WebGoogleSearch {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
String startTime="2021-01-01";
String endTime="2022-05-18";
String endTime="2022-05-23";
startTime=dateToStamp(startTime);
endTime=dateToStamp(endTime);
File f = new File(filepath);
......@@ -60,7 +60,7 @@ public class WebGoogleSearch {
webGoogleSearchThread.setStartTime(startTime);
webGoogleSearchThread.setEndTime(endTime);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2020051901");
keywordMsg.setId("2020052301");
keywordMsg.setStartTime(Long.parseLong(startTime));
keywordMsg.setEndTime(Long.parseLong(endTime));
......
......@@ -117,7 +117,7 @@ public class WebGoogleSearchThread implements Runnable {
if(proxyid.equals("1")) {
CatchWebNews(GoogleRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid),kWord);
}else {
for (int i = 0; i < 100; i++) {
for (int i = 0; i < 30; i++) {
String urla = url1.replace("[keyword]",kWord.replace(" ","+"));
urla = urla.replace("[startTime]",startTime);
urla = urla.replace("[endTime]",endTime);
......@@ -125,6 +125,7 @@ public class WebGoogleSearchThread implements Runnable {
urlList.add(urla);
}
List<CatchWebByMetaSearch> catchWebByMetaSearches = GoogleRecorderUtil.catchWebOfGoogleList(urlList, charset, orgId, tid,kWord,kafkaTemplate);
// List<CatchWebByMetaSearch> catchWebByMetaSearches = GoogleRecorderUtil.catchWebOfGoogleListByKey(urlList, charset, orgId, tid,kWord,kafkaTemplate);
log.info("关键词搜索到信息数:"+catchWebByMetaSearches.size());
// List<String> keyWordList=new ArrayList<>();
// for (int i = 0; i < 100; i++) {
......
......@@ -108,7 +108,7 @@ public class SplitKeyword {
}
public static void main(String[] args) {
String kwords="Center for a New America Security|WestExec Advisor|Carnegie Endowment for International Peace|Brookings Institution|National Endowment for Democracy |Council on Foreign Relations |Foreign Policy for American |Center for American Progress |Meridian International|Asia Society|The Asia Group LLG|Penn Biden Center for Diplomacy and Global Engagement|RAND Corporation|Foreign Affairs|Center for Strategic and International Studies|Hoover Institution|American Enterprise Institute for Public Policy Research|Hudson Institute|Urban Institute|International Institute for Applied Systems Analysis|Heritage Foundation|CATO Institute|Carter Center|The Boston Consulting Group|Aspen Human Institute|East-West Center|Center for Defense Information|Battelle|The Population Council|Institute for the Future|U.S. Asian Cultural Academy|Atlantic Council|Centre for European Policy Studies|Royal Institute of Intemational Affairs|The Fabian Society|The Foreign Policy Centre|Institut français des relations internationales|democracynow|project-syndicate|Bruegel|Fundação Getulio Vargas|International Institute for Strategic Studies|Peterson Institute for International Economics|Japan Institute of International Affairs|The Frazer Institute|Konrad Adenauer| Friedrich Ebert Foundation|Korea Development Institute|Carnegie Endowment for International Peace|the Netherlands Institute of International Relations|Carnegie Moscow Center|Stiftung Wissenschaft und Politik|Centre for International Governance Innovation|Stockholm International Peace Research Institute|Korea Institute for International Economic Policy|The Institute of World Economy and International Relations |Danish Institute for International Studies|The African Centre for the Constructive Resolution of Disputes|Barcelona Centre for International Affairs|Institute for International Political Studies|Institute for Defence Studies and Analyses|The German Development Institute|Consejoargentino para las Relaciones Internacionales |Razumkov Centre|Centre for Economic Policy Research|\"Elcano Royal Institute|\"|The Royal Institute of International Affairs Chatham House|German Council on Foreign Relations|Transparency International|International Crisis Group|Royal United Services Institute|fedesarrollo|Center for Social and Economic Research|Singapore Institute of International Affairs|Kiel Institute for the World Economy|\"Lowy Institute|\"|Norwegian Institute of International Affairs|Institute of Economic Affairs|NUS East Asian Institute|Bonn International Centre for Conflict Studies|The African Economic Research Consortium|Australian Institute of International Affairs|The Food Agriculture and Natural Resources Policy Analysis Network|Overseas Development Institute|The Belfer Center for Science and International Affairs|Centre for European Reform|Institute of Development Studies|Türkiye Ekonomik ve Sosyal Etüdler Vakfı|Wilfried Martens Centre for European Studies|The World Economic Forum|European Union Institute for Security Studies|The South African Institute of International Affairs|Polski Instytut Spraw Międzynarodowych|Heinrich Böll Stiftung|The European Centre for International Political Economy |Center for Strategic and International Studies|Centro de Estudios Públicos|Yusof Ishak Institute|Institute for the Study of Civil Society|El Centro de Divulgación del Conocimiento Económico para la Libertad|Institute for Defence and Strategic Studies|Russian Institute for Strategic Studies|French Institute of International Relations |Japan Institute of 
International Affairs|Chatham House|Institut de Relations Internationales et Stratégiques|Centre for European Reform|World Resources Institute|Center for Strategic and International Studies|Korea Development Institute|Council on Foreign Relations|Barcelona Institute of International Studies|Konrad-Adenauer-Stiftung|the German Institute for International and Security Affairs|The Third World Academy of Sciences|European Research Council|European Commission|council on Competitiveness|American Association for the Advancement of Science|National Institute of Science and Technology Policy|Science Policy Research Unit|Fraunhofer Institute for Systems and Innovation Research ISI|Max Planck Institute for the Advancement of Science|Brookings Institution|United States National Academy of Sciences|General Commission for Strategy and Foresight|Japan Science and Technology Agency|Research Institute of Economy|International Institute for Strategic Studies";
String kwords="(缅甸军方|缅方军方)+(太阳能|光伏)";
List<String> strings = transForm(kwords);
for (String key :strings) {
System.out.println(key);
......
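The keyword pattern above, "(缅甸军方|缅方军方)+(太阳能|光伏)", looks like groups joined with + and alternatives separated by |. The transForm implementation is not shown in this diff, so the following is only a guess at the expansion it performs (a cross product of the groups):

import java.util.ArrayList;
import java.util.List;

public class KeywordExpandSketch {
    // Expand "(A|B)+(C|D)" into A+C, A+D, B+C, B+D.
    static List<String> expand(String pattern) {
        List<String> result = new ArrayList<>();
        result.add("");
        for (String group : pattern.split("\\+")) {
            String[] options = group.replace("(", "").replace(")", "").split("\\|");
            List<String> next = new ArrayList<>();
            for (String prefix : result) {
                for (String option : options) {
                    next.add(prefix.isEmpty() ? option : prefix + "+" + option);
                }
            }
            result = next;
        }
        return result;
    }

    public static void main(String[] args) {
        expand("(缅甸军方|缅方军方)+(太阳能|光伏)").forEach(System.out::println);
    }
}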
......@@ -28,7 +28,7 @@ public class ChromeUtil {
preferences.put("profile.default_content_settings", contentSettings);*/
// options1.addArguments("blink-settings=imagesEnabled=false");
driver = new ChromeDriver(options1);
// driver = new ChromeDriver(options1);
}
public static String getChromeDoc(String url,String start){
try {
......@@ -117,7 +117,7 @@ public class ChromeUtil {
public static String getChromeDoc(String url){
WebDriver driver= new ChromeDriver(options1);
//Create a new WebDriver object (a ChromeDriver here)
// try {
// driver.get("https://www.google.com");
......@@ -127,18 +127,17 @@ public class ChromeUtil {
// driver.get("https://www.google.com");
// }
try {
driver.get(url);
Thread.sleep(1000);
WebElement towe=driver.findElement(By.linkText("2"));
if(towe!=null){
towe.click();
Thread.sleep(2000);
}else{
return null;
}
Thread.sleep(2000);
// WebElement towe=driver.findElement(By.linkText("2"));
// if(towe!=null){
// towe.click();
// Thread.sleep(2000);
// }else{
// return null;
// }
String doc=driver.getPageSource();
Thread.sleep(5000);
Thread.sleep(3000);
// driver.close();
driver.quit();
return doc;
......@@ -155,51 +154,86 @@ public class ChromeUtil {
*/
return null;
}
public static List<String> getChromeDocMap(String url){
List<String> list=new ArrayList<String>();
//Create a new WebDriver object (a ChromeDriver here)
try {
driver.get("https://www.baidu.com");
//Open the specified URL
} catch (Exception e1) {
// TODO Auto-generated catch block
//e1.printStackTrace();
if(driver!=null) {
public static String getDoc(String url){
driver = new ChromeDriver(options1);
//Create a new WebDriver object (a ChromeDriver here)
try {
driver.get(url);
//Open the specified URL
} catch (Exception e1) {
// TODO Auto-generated catch block
//e1.printStackTrace();
driver =new ChromeDriver(options1);
driver.get(url);
}
try {
Thread.sleep(4000);
String doc=driver.getPageSource();
driver.quit();
return doc;
} catch (Exception e) {
driver.quit();
// TODO Auto-generated catch block
e.printStackTrace();
}
driver =new ChromeDriver(options1);
driver.get("https://www.baidu.com");
/**
* Both dr.quit() and dr.close() can exit the browser. The difference, in short: close()
* only closes the current page and leaves the others open when multiple pages are open,
* while quit() shuts down every window the WebDriver owns and exits cleanly, so quit() is the recommended way to end a case.
*/
return null;
}
try {
driver.get(url);
Thread.sleep(2000);
String doc=driver.getPageSource();
String url1=driver.getCurrentUrl();
// driver.close();
list.add(doc);
list.add(url1);
return list;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
if(driver!=null) {
driver.quit();
public static String getChromeDocKey(String url,String start){
//Create a new WebDriver object (a ChromeDriver here)
try {
driver.get("https://www.google.com");
//Open the specified URL
} catch (Exception e1) {
// TODO Auto-generated catch block
//e1.printStackTrace();
driver =new ChromeDriver(options1);
driver.get("https://www.google.com");
}
try {
WebElement searchBox2 = driver.findElement(By.name("q"));
searchBox2.sendKeys(url);
Thread.sleep(5000);
searchBox2.sendKeys(Keys.ENTER);
Thread.sleep(2000);
String local=driver.getCurrentUrl();
String realUrl=local+"&tbs=lr:lang_1en,sbd:1&hl=en&lr=lang_en&tbm=nws&num=20&start="+start;
driver.get(realUrl);
Thread.sleep(2000);
/*WebElement towe=driver.findElement(By.linkText("2"));
if(towe!=null){
towe.click();
Thread.sleep(2000);
}else{
return null;
}*/
String doc=driver.getPageSource();
// driver.close();
return doc;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
/**
* Both dr.quit() and dr.close() can exit the browser. The difference, in short: close()
* only closes the current page and leaves the others open when multiple pages are open,
* while quit() shuts down every window the WebDriver owns and exits cleanly, so quit() is the recommended way to end a case.
*/
return null;
}
/**
* Both dr.quit() and dr.close() can exit the browser. The difference, in short: close()
* only closes the current page and leaves the others open when multiple pages are open,
* while quit() shuts down every window the WebDriver owns and exits cleanly, so quit() is the recommended way to end a case.
*/
return null;
}
public static void main(String[] args) {
ChromeUtil.getChromeDoc("https://www.baidu.com/s?rtt=4&bsst=1&cl=2&tn=news&rsv_dl=ns_pc&lqst=1&x_bd_lqst=1&word=%E5%A4%A9%E6%B4%A5+%E6%B7%B7%E6%94%B9&pn=60");
......
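The restored comments above recommend driver.quit() over driver.close(). A short, hedged sketch of the fetch-and-quit lifecycle these helpers follow; the headless flag is an assumption, since the real options1 setup is not part of this hunk:

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class DriverLifecycleSketch {
    // Fetch a page source and always release the browser with quit(), which closes
    // every window the driver owns; close() only closes the current window.
    static String fetchPageSource(String url) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless");            // assumption: the project's options1 is configured along these lines
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get(url);
            Thread.sleep(2_000);                       // give dynamic content a moment to render
            return driver.getPageSource();
        } catch (Exception e) {
            return null;
        } finally {
            driver.quit();                             // preferred over close(), per the comment above
        }
    }
}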
......@@ -38,11 +38,11 @@ KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC=google_crawler_urlList
#Search URL
META_SEARCH_URL=https://www.google.com.hk/search?q=[keyword]&newwindow=1&tbs=cdr:1,cd_min:[startTime],cd_max:[endTime]&tbm=nws&ei=fYBfYp-CHffo2roPhoOPsA4&start=[pn]
META_SEARCH_URL=https://www.google.com/search?q=[keyword]&newwindow=1&tbs=cdr:1,cd_min:[startTime],cd_max:[endTime]&tbm=nws&ei=fYBfYp-CHffo2roPhoOPsA4&start=[pn]
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
#META_SEARCH_URL=https://www.baidu.com/s?q1=[kd1]&q2=&q3=[kd2]&q4=&rn=50&lm=0&ct=0&ft=&q5=1&q6=&tn=baiduadv&pn=50
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak.txt
META_SEARCH_KEYWORDPATH=E:\\ideaWorkerspace\\crawler_2022\\googleSearch\\data\\projectbak2.txt
# Redis settings
redis.host=127.0.0.1
......
AMD
Intel
Qualcomm
NTT
Ericsson
Toshiba
LG Electronics
Samsung Electronics
TE Connectivity
Apple
Fujitsu
NEC
Panasonic
Sony
GE
Hitachi
Emerson
Honeywell
Boeing
Dow
Saint-Gobain
LS Electric
Honda
Toyota
Microsoft
NortonLifeLock
Roche
\ No newline at end of file
国资委+衣学东
\ No newline at end of file
package com.zzsn.local;
import com.google.gson.Gson;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
/**
* Baidu search
* 1. Request the result page by keyword,
* 2. Save the crawled content to the local database table
*/
public class DetailSoSearch {
public static void main(String[] args) throws IOException {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
//1. Create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
try{
while(true){
//The consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread exits the loop
//Wait 0 ms for the broker to return data; the timeout argument tells poll() how long it may block before returning, whether or not data is available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
CatchWebByMetaSearch catchWebByMetaSearch = new Gson().fromJson(record.value().toString(), CatchWebByMetaSearch.class);
DetailSoSearchThread detailSouGouSearchThread=new DetailSoSearchThread();
detailSouGouSearchThread.catchWebByMetaSearch=catchWebByMetaSearch;
detailSouGouSearchThread.crawler();
//Create a fixed-size thread pool
// TimeUnit.SECONDS.sleep(120);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
// Split the list into chunks of the given length
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//Offset reset policy for reading Kafka data
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//Set the max poll interval to 1 hour
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
}
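The comment inside the loop notes that consumer.wakeup(), called from another thread, is the way out of the endless poll. A hedged sketch of that shutdown pattern (not the project's code); it also commits after processing rather than before, which avoids dropping records if the handler crashes:

import java.util.Collections;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.errors.WakeupException;

public class ConsumerShutdownSketch {
    // Poll in a loop; a shutdown hook on another thread calls wakeup(), which makes
    // poll() throw WakeupException so the loop exits and the consumer closes cleanly.
    static void run(KafkaConsumer<String, String> consumer, String topic) {
        Thread mainThread = Thread.currentThread();
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            consumer.wakeup();
            try { mainThread.join(); } catch (InterruptedException ignored) { }
        }));
        try {
            consumer.subscribe(Collections.singletonList(topic));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.value());   // hand the record off to the crawler thread here
                }
                consumer.commitSync();                    // commit only after the batch is handled
            }
        } catch (WakeupException e) {
            // expected on shutdown
        } finally {
            consumer.close();
        }
    }
}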
package com.zzsn.local;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.util.ChromeUtil;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utility.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@Slf4j
public class SoRecorderUtil {
// Extract the Sogou news list URLs
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfSougouList(
List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
URL url = new URL(urlList.get(i));
URI uri = null;
String uri_code = "";
try {
uri = new URI(url.getProtocol(), url.getHost(),
url.getPath(), url.getQuery(), null);
uri_code = Utility.encodURI(uri.toString())
.replaceAll("%2520", "+").replaceAll("%25", "%")
.replaceAll("%20", "+");
} catch (URISyntaxException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Connection conn = Jsoup.connect(uri_code);
conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50");
Document doc = null;
try {
doc = conn.timeout(10000).get();
} catch (Exception ex) {
// ex.printStackTrace();
System.out.println("360搜索中该关键词搜索没有相关新闻!");
continue;
}
System.out.println("----360搜索----" + uri);
Elements firstElementsLink = doc.select("li[data-from=\"news\"]");
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
for (int m=0;m<firstElementsLink.size();m++) {
catchWebByMetaSearch = new CatchWebByMetaSearch();
Elements orainAndDate = firstElementsLink.get(m).select("div[class=\"info b-info\"]");
if (orainAndDate.size()>0) {
String orainAndDatestr = orainAndDate.select("span:eq(1)").text();
//Publish date
String publishDate = DateUtil.getPublishDate(orainAndDatestr);
catchWebByMetaSearch.setPublishDate(publishDate);
//Source
// String orin = orainAndDatestr.split(" ")[0].trim();
String orin = orainAndDate.select("span:eq(0)").text();
catchWebByMetaSearch.setSourcesite(orin);
}
Elements titleAndUrl = firstElementsLink.get(m).select("a");
if (titleAndUrl.size()>0) {
//Title
String title = titleAndUrl.get(0).attr("title");
catchWebByMetaSearch.setTitle(title);
//Source URL
String addressurl = titleAndUrl.get(0).absUrl("href");
catchWebByMetaSearch.setSourceaddress(addressurl);
}
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSid(tid);
metaSearchList.add(catchWebByMetaSearch);
}
for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
List<CatchWebByMetaSearch> catchWebByMetaSearches=new ArrayList<>();
catchWebByMetaSearches.add(catchMetaSearch);
DetailSoSearchThread detailSouGouSearchThread=new DetailSoSearchThread();
detailSouGouSearchThread.CatchWebNews(catchWebByMetaSearches,"");
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
public static String sendGet(String url) {
String result = "";
String urlName = url;
try {
URL realURL = new URL(urlName);
URLConnection conn = realURL.openConnection();
conn.setRequestProperty("accept", "*/*");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36");
conn.connect();
Map<String, List<String>> map = conn.getHeaderFields();
for (String s : map.keySet()) {
System.out.println(s + "-->" + map.get(s));
}
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"));
String line;
while ((line = in.readLine()) != null) {
result += "\n" + line;
}
} catch (IOException e) {
e.printStackTrace();
}
return result;
}
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfGoogleList(
List<String> urlList, String charset, Long orgId, Long tid) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(1000*2);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
System.out.println(urlList.get(i));
// String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20)+"");
String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//Stop looping if there are no results
if(firstElementsLink.size()==0){
break;
}
String info = doc.toString();
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
// System.out.println(firstElementsLink.get(j).toString());
//Title
// Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//Link
Elements a=firstElementsLink.get(j).select("a");
//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
//catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//Source
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> CatchWebOfGoogle1(
List<String> urlList, String charset, Long orgId, Long tid) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(1000*5);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
System.out.println(urlList.get(i));
String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20+0)+"");
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//Stop looping if there are no results
if(firstElementsLink.size()==0){
break;
}
String info = doc.toString();
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
// System.out.println(firstElementsLink.get(j).toString());
//Title
// Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//Link
Elements a=firstElementsLink.get(j).select("a");
//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
//catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//Source
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
}
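All of the list parsers in this class follow the same Jsoup pattern: select the result cards, then pull title, link and source out of each. A tiny self-contained illustration on made-up markup (the class names below are placeholders, not the real Google or 360 ones):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ListParseSketch {
    public static void main(String[] args) {
        String html = "<div class='card'><a href='https://example.com/a'>Example title</a>"
                + "<span class='origin'>Example Source</span></div>";
        Document doc = Jsoup.parse(html);
        for (Element card : doc.select("div.card")) {            // stand-in for g-card[class=ftSUBd]
            String title = card.select("a").first().text();
            String link = card.select("a").first().attr("href");
            String origin = card.select("span.origin").text();
            System.out.println(title + " | " + link + " | " + origin);
        }
    }
}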
package com.zzsn.local;
import com.zzsn.search.FileUtil;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Baidu search
* 1. Request the result page by keyword,
* 2. Save the crawled content to the local database table
*/
public class WebSoSearch {
public static void main(String[] args) throws IOException {
// String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH;
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
paser(allLines);
}
public static String dateToStamp(String s) {
String res="";
//Set the date format template
try {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
Date date = simpleDateFormat.parse(s);
long ts = date.getTime()/1000;
res = String.valueOf(ts);
}catch (Exception e){
}
return res;
}
public static void paser(List<String> keywords){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<WebSoSearchThread> workers = new Vector<WebSoSearchThread>();
int index = 0;
try {
for (List<String> keywordList : splitList) {
WebSoSearchThread webSouGouSearchThread = new WebSoSearchThread();
webSouGouSearchThread.setThreadId(index++);
webSouGouSearchThread.setKeywords(keywordList);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2022052505");
keywordMsg.setKeyWord("国资委+衣学东");
webSouGouSearchThread.setKeywordMsg(keywordMsg);
workers.add(webSouGouSearchThread);
threadPool.execute(webSouGouSearchThread);
Thread.sleep(1000);
}
}catch (Exception e){
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
// Split the list into chunks of the given length
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
}
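splitList shows up in several of these classes; a quick usage sketch with an arbitrary chunk size:

import com.zzsn.local.WebSoSearch;
import java.util.Arrays;
import java.util.List;

public class SplitListUsage {
    public static void main(String[] args) {
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7);
        // With len = 3 this prints [[1, 2, 3], [4, 5, 6], [7]]: the last chunk keeps the remainder.
        List<List<Integer>> chunks = WebSoSearch.splitList(data, 3);
        System.out.println(chunks);
    }
}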
AMD
Intel
Qualcomm
NTT
Ericsson
Toshiba
LG Electronics
Samsung Electronics
TE Connectivity
Apple
Fujitsu
NEC
Panasonic
Sony
GE
Hitachi
Emerson
Honeywell
Boeing
Dow
Saint-Gobain
LS Electric
Honda
Toyota
Microsoft
NortonLifeLock
Roche
\ No newline at end of file
衣学东
\ No newline at end of file
package com.zzsn.localsearch;
import com.google.gson.Gson;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
/**
* Baidu search
* 1. Request the result page by keyword,
* 2. Save the crawled content to the local database table
*/
public class DetailSouGouSearch {
public static void main(String[] args) throws IOException {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
//1. Create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
try{
while(true){
//The consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread exits the loop
//Wait 0 ms for the broker to return data; the timeout argument tells poll() how long it may block before returning, whether or not data is available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
CatchWebByMetaSearch catchWebByMetaSearch = new Gson().fromJson(record.value().toString(), CatchWebByMetaSearch.class);
DetailSouGouSearchThread detailSouGouSearchThread=new DetailSouGouSearchThread();
detailSouGouSearchThread.catchWebByMetaSearch=catchWebByMetaSearch;
detailSouGouSearchThread.crawler();
//Create a fixed-size thread pool
// TimeUnit.SECONDS.sleep(120);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
// Split the list into chunks of the given length
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//Offset reset policy for reading Kafka data
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//Set the max poll interval to 1 hour
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
}
package com.zzsn.localsearch;
import com.zzsn.search.FileUtil;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.utility.index.Constants;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Baidu search
* 1. Request the result page by keyword,
* 2. Save the crawled content to the local database table
*/
public class WebSouGouSearch {
public static void main(String[] args) throws IOException {
// String filepath=args[0];
String filepath= Constants.META_SEARCH_KEYWORDPATH;
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
paser(allLines);
}
public static void paser(List<String> keywords){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<WebSouGouSearchThread> workers = new Vector<WebSouGouSearchThread>();
int index = 0;
try {
for (List<String> keywordList : splitList) {
WebSouGouSearchThread webSouGouSearchThread = new WebSouGouSearchThread();
webSouGouSearchThread.setThreadId(index++);
webSouGouSearchThread.setKeywords(keywordList);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("2022052504");
keywordMsg.setKeyWord("衣学东");
webSouGouSearchThread.setKeywordMsg(keywordMsg);
workers.add(webSouGouSearchThread);
threadPool.execute(webSouGouSearchThread);
Thread.sleep(1000);
}
}catch (Exception e){
e.printStackTrace();
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
// Split the list into chunks of the given length
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
}
......@@ -54,8 +54,8 @@ public class WebSouGouSearch {
webSouGouSearchThread.setThreadId(index++);
webSouGouSearchThread.setKeywords(keywordList);
KeywordMsg keywordMsg=new KeywordMsg();
keywordMsg.setId("123456");
keywordMsg.setKeyWord("AMD");
keywordMsg.setId("2022052504");
keywordMsg.setKeyWord("国资委+衣学东");
webSouGouSearchThread.setKeywordMsg(keywordMsg);
workers.add(webSouGouSearchThread);
threadPool.execute(webSouGouSearchThread);
......
......@@ -77,6 +77,7 @@ public class WebSouGouSearchThread implements Runnable {
crawler();
}
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
// public KafkaTemplate kafkaTemplate= null;
@Async("asyncexecutorServiceWebBaidu")
public void crawler(){
......
package com.zzsn;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class CrawlerStaticApplication extends SpringBootServletInitializer {
@Override
protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
return builder.sources(CrawlerStaticApplication.class);
}
public static void main(String[] args) {
SpringApplication.run(CrawlerStaticApplication.class, args);
}
}
\ No newline at end of file
package com.zzsn.cache;
import com.alibaba.druid.util.StringUtils;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Set;
public class JedisFactory {
private static final Logger Log = LoggerFactory.getLogger(JedisFactory.class);
private static final Logger logger = LoggerFactory.getLogger(JedisFactory.class);
private static JedisPool jedisPool = null;
public static void init(){
String host = Constants.REDIS_LOCALHOST;
String port = Constants.REDIS_PORT;
String pass = Constants.REDIS_PASS;
String timeout = Constants.REDIS_TIMEOUT;
String maxIdle = Constants.REDIS_MAXIDLE;
String maxTotal = Constants.REDIS_MAXTOTAL;
String maxWaitMillis = Constants.REDIS_MAXWAITMILLIS;
String testOnBorrow = Constants.REDIS_TESTONBORROW;
JedisPoolConfig config = new JedisPoolConfig();
//Controls how many Jedis instances the pool may allocate (obtained via pool.getResource());
//-1 means unlimited; once the pool has handed out maxActive instances its state becomes exhausted.
config.setMaxTotal(Integer.parseInt(maxTotal));
//Maximum number of idle Jedis instances kept in the pool.
config.setMaxIdle(Integer.parseInt(maxIdle));
//Maximum wait time when borrowing a Jedis instance; exceeding it throws JedisConnectionException;
config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
//Whether to validate a Jedis instance before borrowing; if true, every instance handed out is usable;
config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
}
public static String getKeyStr(String key) {
Jedis jedis=getJedis();
try {
if (StringUtils.isEmpty(key)) {
return null;
}
return jedis.get( key);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
}
return null;
}
private static Jedis getJedis() {
return jedisPool.getResource();
}
/**
* Set a cache entry that never expires (it is automatically evicted after about a month)
* @param key
* @return
*/
public static boolean setKeyStr(String key, String value) {
Jedis jedis=getJedis();
boolean result = false;
try {
if (StringUtils.isEmpty(key)) {
return false;
}
jedis.set(key, value);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
public static boolean setKeyStrtime(String key, String value,Integer time) {
Jedis jedis=getJedis();
boolean result = false;
try {
if (StringUtils.isEmpty(key)) {
return false;
}
jedis.set(key, value);
if (time > 0) {
/**
* If expireTime is set, the key expires and is deleted automatically after expireTime seconds.
* Combined with its performance, this lets Redis serve as a cache and a strong competitor to Memcached.
*/
jedis.expire(key, time);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
}
return result;
}
public static Set<String> getKeySet(String key) {
Jedis jedis=getJedis();
try {
Set<String> obj = jedis.smembers(key);
if("null".equals(obj)){
return null;
}
return obj;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
}
return null;
}
/**
* Set a cache entry that never expires (it is automatically evicted after about a month)
* @param key
* @return
*/
public static boolean setKeySet(String key, Set<String> value) {
Jedis jedis=getJedis();
boolean result = false;
try {
if(value==null){
return false;
}
jedis.sadd(key, value.toArray(new String[value.size()]));
} catch (Exception e){
e.printStackTrace();
} finally {
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
}
return result;
}
public static void del(String key) {
Jedis jedis=getJedis();
try {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
return;
}
jedis.del(key);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
if (jedis != null) {
// With JedisPool, close() does not close the connection; it returns it to the pool
jedis.close();
}
}
}
}
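A brief usage sketch for JedisFactory: init() has to run once before any accessor, and setKeyStrtime attaches a TTL in seconds. The key name and the one-hour expiry below are illustrative only:

import com.zzsn.cache.JedisFactory;

public class JedisFactoryUsage {
    public static void main(String[] args) {
        JedisFactory.init();                                         // build the pool from Constants.REDIS_*
        JedisFactory.setKeyStrtime("crawler_demo_key", "1", 3600);   // hypothetical key, expires after one hour
        System.out.println(JedisFactory.getKeyStr("crawler_demo_key"));
    }
}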
package com.zzsn.cache;
import com.alibaba.druid.util.StringUtils;
import com.zzsn.utility.index.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.List;
import java.util.Set;
public class JedisUtil {
private static final String PREFIX = "yahoo_";
private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
private static JedisPool jedisPool = null;
private JedisUtil() {
}
//Written as a static block so it loads only once and saves resources
static {
}
/**
* Get a Jedis object from the Jedis connection pool
*
* @return
*/
private static void init(){
// Properties properties = PropertyUtil.loadProperties("redis.properties");
String host = Constants.REDIS_LOCALHOST;
String port = Constants.REDIS_PORT;
String pass = Constants.REDIS_PASS;
String timeout = Constants.REDIS_TIMEOUT;
String maxIdle = Constants.REDIS_MAXIDLE;
String maxTotal = Constants.REDIS_MAXTOTAL;
String maxWaitMillis = Constants.REDIS_MAXWAITMILLIS;
String testOnBorrow = Constants.REDIS_TESTONBORROW;
JedisPoolConfig config = new JedisPoolConfig();
//Controls how many Jedis instances the pool may allocate (obtained via pool.getResource());
//-1 means unlimited; once the pool has handed out maxActive instances its state becomes exhausted.
config.setMaxTotal(Integer.parseInt(maxTotal));
//Maximum number of idle Jedis instances kept in the pool.
config.setMaxIdle(Integer.parseInt(maxIdle));
//Maximum wait time when borrowing a Jedis instance; exceeding it throws JedisConnectionException;
config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
//Whether to validate a Jedis instance before borrowing; if true, every instance handed out is usable;
config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
}
private static Jedis getJedis() {
init();
return jedisPool.getResource();
}
private static final JedisUtil jedisUtil = new JedisUtil();
/**
* Get the JedisUtil instance
*
* @return
*/
public static JedisUtil getInstance() {
return jedisUtil;
}
public static void returnResource(final Jedis jedis) {
if (jedis != null && jedisPool != null) {
jedis.close();
}
}
public static Jedis getDefaultJedis() {
// return getJedis(HOST_IP, HOST_PORT);//简装版
return getJedis();
}
/**
* Get the keys in Redis that match the given pattern
*/
public static Set<String> getKeysByPattern(String pattern) {
return getDefaultJedis().keys(pattern);
}
public static boolean exists(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
throw new Exception("key is null");
}
return getDefaultJedis().exists(PREFIX + key);
}
public static void del(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
getDefaultJedis().del(PREFIX + key);
}
public static void setString(String key, String value, int expireTime) throws Exception {
Jedis jedis=null;
try {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
String finalKey = PREFIX + key;
jedis = getDefaultJedis();
jedis.set(finalKey, value);
if (expireTime > 0) {
/**
* If expireTime is set, the finalKey expires and is deleted automatically after expireTime seconds.
* Combined with its performance, this lets Redis serve as a cache and a strong competitor to Memcached.
*/
jedis.expire(finalKey, expireTime);
}
}catch (Exception e){
}finally {
returnResource(jedis);
}
}
public static String getString(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().get(PREFIX + key);
}
public static long setnx(String key, String value) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().setnx(PREFIX + key, value);
}
public static long expire(String key, int seconds) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().expire(PREFIX + key, seconds);
}
public static void pushList(String key, String value, String flag) throws Exception {
if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
logger.error("key or flag is null");
throw new Exception("key or flag is null");
}
/**
* key is the name of the list. A Redis list is a double-ended linked list: lpush inserts at the head, rpush at the tail
*/
if (flag.equalsIgnoreCase("L")) {
getDefaultJedis().lpush(PREFIX + key, value);
} else if (flag.equalsIgnoreCase("R")) {
getDefaultJedis().rpush(PREFIX + key, value);
} else {
logger.error("unknown flag");
throw new Exception("unknown flag");
}
}
public static String popList(String key, String flag) throws Exception {
if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
logger.error("key or flag is null");
throw new Exception("key or flag is null");
}
if (flag.equalsIgnoreCase("L")) {
return getDefaultJedis().lpop(PREFIX + key);
} else if (flag.equalsIgnoreCase("R")) {
return getDefaultJedis().rpop(PREFIX + key);
} else {
logger.error("unknown flag");
throw new Exception("unknown flag");
}
}
/**
* Get the elements of the list in the specified range
*/
public static List<String> getAppointedList(String key, long start, long end) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().lrange(PREFIX + key, start, end);
}
public static List<String> getList(String key) throws Exception {
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
return getDefaultJedis().lrange(PREFIX + key, 0, -1);
}
public static void sadd(String key,String value)throws Exception{
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
Jedis defaultJedis=null;
try {
defaultJedis = getDefaultJedis();
Long sadd = defaultJedis.sadd(key, value);
}catch (Exception e){
}finally {
if(defaultJedis!=null){
defaultJedis.close();
}
}
}
public static boolean sismember(String key,String value)throws Exception{
if (StringUtils.isEmpty(key)) {
logger.error("key is null");
throw new Exception("key is null");
}
Jedis defaultJedis=null;
Boolean aBoolean=false;
try {
defaultJedis = getDefaultJedis();
aBoolean = defaultJedis.sismember(key, value);
}catch (Exception e){
}finally {
returnResource(defaultJedis);
}
return aBoolean;
}
}
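The duplicate check at the top of this commit (DetailGoogleSearchThread) keys Redis entries by Constants.SOURCEADDRESS plus the URL. A hedged sketch of that dedup pattern built on JedisUtil; the one-week TTL is illustrative, not taken from the project:

import com.zzsn.cache.JedisUtil;
import com.zzsn.utility.index.Constants;

public class UrlDedupSketch {
    // Return true if the URL was already crawled; otherwise mark it with a TTL and return false.
    static boolean seenBefore(String url) {
        String key = Constants.SOURCEADDRESS + "_" + url;
        try {
            if (JedisUtil.getString(key) != null) {
                return true;                                 // duplicate, skip this URL
            }
            JedisUtil.setString(key, "1", 7 * 24 * 3600);    // illustrative one-week expiry
        } catch (Exception e) {
            // Treat a Redis failure as "not seen" so the crawl can still proceed.
        }
        return false;
    }
}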
package com.zzsn.cache;
import com.google.code.yanf4j.core.impl.StandardSocketOption;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.command.BinaryCommandFactory;
import net.rubyeye.xmemcached.transcoders.SerializingTranscoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
* Memcached client
*/
public class Memcached {
private static Logger logger = LoggerFactory.getLogger(Memcached.class);
private static MemcachedClient client = null;
static {
logger.debug("memcached initialize...");
Properties prop = getConfig();
String server = prop.getProperty("memcached.server");
if (server == null || server.isEmpty()) {
throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
}
String[] servers = server.split(",");
int[] weights = new int[servers.length];
List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
for (int i = 0; i < servers.length; i++) {
String[] addr = servers[i].split(":");
addressList.add(new InetSocketAddress(addr[0], Integer.parseInt(addr[1])));
String weight = prop.getProperty("memcached.server"+(i+1)+".weight");
if (weight == null || weight.isEmpty()) {
weights[i] = 1;
} else {
weights[i] = Integer.parseInt(weight);
}
}
XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList, weights);
String poolSize = prop.getProperty("memcached.connectionPoolSize");
if (poolSize != null && !poolSize.isEmpty()) {
builder.setConnectionPoolSize(Integer.parseInt(poolSize));
}
String failureMode = prop.getProperty("memcached.failureMode");
if (failureMode != null && !failureMode.isEmpty()) {
builder.setFailureMode(Boolean.parseBoolean(failureMode));
}
String connTimeout = prop.getProperty("memcached.connectTimeout");
if (connTimeout != null && !connTimeout.isEmpty()) {
builder.setConnectTimeout(Integer.parseInt(connTimeout));
}
String opTimeout = prop.getProperty("memcached.opTimeout");
if (opTimeout != null && !opTimeout.isEmpty()) {
builder.setOpTimeout(Integer.parseInt(opTimeout));
}
String enableHealSession = prop.getProperty("memcached.enableHealSession");
if (enableHealSession != null && !enableHealSession.isEmpty()) {
builder.setEnableHealSession(Boolean.parseBoolean(enableHealSession));//Enable or disable connection (session) healing
}
String statistics = prop.getProperty("memcached.statistics");
if (statistics != null && !statistics.isEmpty()) {
builder.getConfiguration().setStatisticsServer(Boolean.parseBoolean(statistics));
}
String binary = prop.getProperty("memcached.binaryCommand");
if (binary != null && "true".equals(binary)) {
builder.setCommandFactory(new BinaryCommandFactory());
}
builder.setTranscoder(new SerializingTranscoder());
builder.setSocketOption(StandardSocketOption.SO_RCVBUF, 32* 1024);// Set the receive buffer to 32 KB (default 16 KB)
builder.setSocketOption(StandardSocketOption.SO_SNDBUF,16 *1024); // Set the send buffer to 16 KB (default 8 KB)
builder.setSocketOption(StandardSocketOption.TCP_NODELAY,true); // TCP_NODELAY: disable Nagle's algorithm to cut latency (off by default)
String sessionIdleTimeout = prop.getProperty("memcahced.sessionIdleTimeout");
if (sessionIdleTimeout != null && !sessionIdleTimeout.isEmpty()) {
builder.getConfiguration().setSessionIdleTimeout(Integer.parseInt(sessionIdleTimeout)*1000); // A connection with no IO for this many seconds is treated as idle and a heartbeat check is sent
}
try {
client = builder.build();
String optimizeMergeBuffer = prop.getProperty("memcached.optimizeMergeBuffer");
if (optimizeMergeBuffer != null && !optimizeMergeBuffer.isEmpty()) {
client.setOptimizeMergeBuffer(Boolean.parseBoolean(optimizeMergeBuffer));
}
String mergeFactor = prop.getProperty("memcached.mergeFactor");
if (mergeFactor != null && !mergeFactor.isEmpty()) {
client.setMergeFactor(Integer.parseInt(mergeFactor));
}
logger.debug("memcached initialize completed!");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private Memcached() {
}
public static MemcachedClient getClient() {
return client;
}
public static void shutdown(MemcachedClient client) {
if (client != null) {
try {
client.shutdown();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static Properties getConfig() {
Properties properties = new Properties();
InputStream is = null;
String location = "memcached.properties";
try {
Resource resource = new DefaultResourceLoader().getResource(location);
is = resource.getInputStream();
properties.load(is);
logger.debug("memcached config: {}", properties.toString());
} catch (IOException ex) {
logger.error("Could not load property file:" + location, ex);
} finally {
try {
if (is != null) {
is.close();
}
} catch (IOException ioe) {
// ignore
}
}
return properties;
}
}
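A quick, hedged usage sketch for the wrapper above; the key name and the one-hour expiry are illustrative:

import com.zzsn.cache.Memcached;
import net.rubyeye.xmemcached.MemcachedClient;

public class MemcachedUsageSketch {
    public static void main(String[] args) throws Exception {
        MemcachedClient client = Memcached.getClient();   // built once by the static initializer
        client.set("demo_key", 3600, "demo_value");       // value expires after one hour
        String value = client.get("demo_key");
        System.out.println(value);
        Memcached.shutdown(client);
    }
}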
package com.zzsn.cache;
import net.rubyeye.xmemcached.KeyIterator;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
/**
* @Author Sugar
* @Version 2018/6/8 13:22
*/
public class MemcachedAdmin {
private static Logger logger = LoggerFactory.getLogger(MemcachedAdmin.class);
private static MemcachedClient client = null;
private MemcachedAdmin() {
}
private static MemcachedClient getClient() {
if (client != null) {
return client;
}
Properties prop = Memcached.getConfig();
String server = prop.getProperty("memcached.server");
if (server == null || server.isEmpty()) {
throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
}
String[] servers = server.split(",");
List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
for (int i = 0; i < servers.length; i++) {
String[] addr = servers[i].split(":");
addressList.add(new InetSocketAddress(addr[0], Integer.parseInt(addr[1])));
}
XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList);
try {
client = builder.build();
} catch (IOException e) {
e.printStackTrace();
}
return client;
}
@Deprecated
public static long deleteAll(String keyPrefix) {
AtomicLong count = new AtomicLong();
MemcachedClient client = getClient();
client.getAvailableServers().forEach(inet -> {
try {
KeyIterator iterator = client.getKeyIterator(inet);
while (iterator.hasNext()) {
String key = iterator.next();
if (key.startsWith(keyPrefix)) {
boolean result = client.delete(key);
long i = count.incrementAndGet();
if (logger.isDebugEnabled()) {
logger.debug("[{}] Delete key[{}]: {}={}", inet, i, key, result);
}
}
}
} catch (MemcachedException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (TimeoutException e) {
e.printStackTrace();
}
});
logger.info("Delete a total of {} keys starting with {}", count.get(), keyPrefix);
return count.get();
}
@Deprecated
public static List<String> getAllKey(String keyPrefix) {
MemcachedClient client = getClient();
List<String> keys = new ArrayList<>();
client.getAvailableServers().forEach(inet -> {
try {
KeyIterator iterator = client.getKeyIterator(inet);
while (iterator.hasNext()) {
String key = iterator.next();
if (key.startsWith(keyPrefix)) {
keys.add(key);
}
}
} catch (MemcachedException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (TimeoutException e) {
e.printStackTrace();
}
});
return keys;
}
/**
* Dynamically adds a memcached server with the default weight of 1.
* @param host
* @param port
* @return true if the server was added
*/
public static boolean addServer(String host, int port) {
return addServer(host, port, 1);
}
/**
* Dynamically adds a memcached server with an explicit weight.
* @param host
* @param port
* @param weight
* @return true if the server was added
*/
public static boolean addServer(String host, int port, int weight) {
try {
Memcached.getClient().addServer(host, port, weight);
return true;
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
}
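/*
 * Editor's sketch (assumption, not part of the original source): typical use of the admin
 * helpers above. deleteAll/getAllKey are marked @Deprecated, presumably because the key
 * iterator relies on memcached's "stats cachedump", which newer servers no longer support.
 *
 *   List<String> templateKeys = MemcachedAdmin.getAllKey("domainUri_");
 *   long removed = MemcachedAdmin.deleteAll("tempSite");
 *   boolean added = MemcachedAdmin.addServer("127.0.0.1", 11212, 1);
 */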
package com.zzsn.cache;
import io.protostuff.*;
import io.protostuff.runtime.RuntimeSchema;
import org.springframework.objenesis.Objenesis;
import org.springframework.objenesis.ObjenesisStd;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @Author Sugar
* @Version 2018/3/16 13:32
*/
public class ProtostuffUtil {
public static final Objenesis objenesis = new ObjenesisStd(true);
private static final String ERR_TRUNCATED_MESSAGE =
"While parsing a protocol message, the input ended unexpectedly " +
"in the middle of a field. This could mean either than the " +
"input has been truncated or that an embedded message " +
"misreported its own length.";
/**
* Serializes an object to a byte array using its runtime Protostuff schema.
*
* @param obj
* @return
*/
public static <T> byte[] serializer(T obj) {
LinkedBuffer buffer = LinkedBuffer.allocate(LinkedBuffer.DEFAULT_BUFFER_SIZE);
try {
Schema<T> schema = RuntimeSchema.getSchema((Class<T>) obj.getClass());
return ProtostuffIOUtil.toByteArray(obj, schema, buffer);
} catch (Exception e) {
throw new IllegalStateException("序列化对象失败:" + obj, e);
} finally {
buffer.clear();
}
}
/**
* Deserializes a byte array back into an instance of the given class.
*
* @param data
* @param clazz
* @return
*/
public static <T> T deserializer(byte[] data, Class<T> clazz) {
T obj = null;
try {
Schema<T> schema = RuntimeSchema.getSchema(clazz);
// obj = schema.newMessage();
obj = objenesis.newInstance(clazz);
ProtostuffIOUtil.mergeFrom(data, obj, schema);
} catch (Exception e) {
throw new IllegalStateException("反序列化对象失败:class=" + clazz + ", data=" + new String(data), e);
}
return obj;
}
public static <T> byte[] serializeList(List<T> list) {
@SuppressWarnings("unchecked")
Schema<T> schema = (Schema<T>) RuntimeSchema.getSchema(list.get(0).getClass());
LinkedBuffer buffer = LinkedBuffer.allocate(1024 * 1024);
ByteArrayOutputStream bos = null;
try {
bos = new ByteArrayOutputStream();
ProtostuffIOUtil.writeListTo(bos, list, schema, buffer);
return bos.toByteArray();
} catch (Exception e) {
throw new IllegalStateException("序列化对象列表失败:" + list, e);
} finally {
buffer.clear();
try {
if (bos != null) {
bos.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* Deserializes a list of objects.
*
* @param data
* @param clazz
* @param <T>
* @return
*/
public static <T> List<T> deserializeList(byte[] data, Class<T> clazz) {
Schema<T> schema = RuntimeSchema.getSchema(clazz);
List<T> result = null;
try {
result = parseListFrom(new ByteArrayInputStream(data), schema, clazz);
} catch (IOException e) {
throw new IllegalStateException("反序列化对象列表失败:class=" + clazz + ", data=" + new String(data), e);
}
return result;
}
private static <T> List<T> parseListFrom(final InputStream in, final Schema<T> schema, Class<T> clazz)
throws IOException {
int size = in.read();
if (size == -1) {
return Collections.emptyList();
}
if (size > 0x7f) {
size = readRawVarint32(in, size);
}
final ArrayList<T> list = new ArrayList<T>(size);
final CodedInput input = new CodedInput(in, true);
for (int i = 0; i < size; i++) {
// final T message = schema.newMessage();
final T message = objenesis.newInstance(clazz);//use Objenesis instead of schema.newMessage() so no default constructor is required
list.add(message);
schema.mergeFrom(input, message);
input.checkLastTagWas(0);
}
assert in.read() == -1;
return list;
}
/**
* Reads a varint from the input one byte at a time, so that it does not read any bytes after the end of the varint.
* If you simply wrapped the stream in a CodedInput and used readRawVarint32(InputStream) then you would
* probably end up reading past the end of the varint since CodedInput buffers its input.
*/
private static int readRawVarint32(final InputStream input, final int firstByte) throws IOException {
int result = firstByte & 0x7f;
int offset = 7;
for (; offset < 32; offset += 7) {
final int b = input.read();
if (b == -1) {
throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
}
result |= (b & 0x7f) << offset;
if ((b & 0x80) == 0) {
return result;
}
}
// Keep reading up to 64 bits.
for (; offset < 64; offset += 7) {
final int b = input.read();
if (b == -1) {
throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
}
if ((b & 0x80) == 0) {
return result;
}
}
throw new ProtobufException(
"CodedInput encountered a malformed varint.");
}
}
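/*
 * Editor's sketch (assumption): a minimal round trip through the helpers above, using the
 * DocInfo class defined later in this commit purely as an example payload.
 *
 *   DocInfo in = new DocInfo();
 *   in.setTitle("hello");
 *   byte[] bytes = ProtostuffUtil.serializer(in);
 *   DocInfo out = ProtostuffUtil.deserializer(bytes, DocInfo.class);
 *
 *   byte[] listBytes = ProtostuffUtil.serializeList(java.util.Arrays.asList(in));
 *   List<DocInfo> listOut = ProtostuffUtil.deserializeList(listBytes, DocInfo.class);
 */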
package com.zzsn.conf;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.web.servlet.MultipartAutoConfiguration;
import org.springframework.boot.web.servlet.MultipartConfigFactory;
import org.springframework.context.annotation.Bean;
import javax.servlet.MultipartConfigElement;
import java.io.File;
@SpringBootApplication(exclude = {MultipartAutoConfiguration.class})
public class SpringBootFileuploadApplication {
@Bean
MultipartConfigElement multipartConfigElement() {
MultipartConfigFactory factory = new MultipartConfigFactory();
String location = System.getProperty("user.dir") + "/data/tmp";
File tmpFile = new File(location);
if (!tmpFile.exists()) {
tmpFile.mkdirs();
}
factory.setLocation(location);
return factory.createMultipartConfig();
}
}
package com.zzsn.conf;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
@Configuration
@EnableAsync
public class ThreadExecutorConfig {
@Bean(value = "asyncTaskExecutor")
public Executor executor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//minimum number of threads kept in the pool
executor.setMaxPoolSize(1);//maximum number of threads in the pool
executor.setQueueCapacity(5000);//task queue capacity
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "asyncTaskExecutorSelenium")
public Executor executorSelenium() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(2);//minimum number of threads kept in the pool
executor.setMaxPoolSize(4);//maximum number of threads in the pool
executor.setQueueCapacity(100000);//task queue capacity
executor.setThreadNamePrefix("selenium-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "webExecutor")
public Executor webExecutor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//minimum number of threads kept in the pool
executor.setMaxPoolSize(1);//maximum number of threads in the pool
executor.setQueueCapacity(5000);//task queue capacity
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "detailExecutor")
public Executor detailExecutor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//minimum number of threads kept in the pool
executor.setMaxPoolSize(1);//maximum number of threads in the pool
executor.setQueueCapacity(5000);//task queue capacity
executor.setThreadNamePrefix("ssmsExecutor-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "asyncexecutorService")
public Executor executorService() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(1);//minimum number of threads kept in the pool
executor.setMaxPoolSize(1);//maximum number of threads in the pool
executor.setQueueCapacity(100000);//task queue capacity
executor.setThreadNamePrefix("selenium-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "asyncexecutorServiceWebBaidu")
public Executor executorServiceWebBaidu() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//minimum number of threads kept in the pool
executor.setMaxPoolSize(5);//maximum number of threads in the pool
executor.setQueueCapacity(100000);//task queue capacity
executor.setThreadNamePrefix("selenium-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
@Bean(value = "asyncexecutorServiceDetailUrl")
public Executor asyncexecutorServiceDetailUrl() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(5);//minimum number of threads kept in the pool
executor.setMaxPoolSize(5);//maximum number of threads in the pool
executor.setQueueCapacity(100000);//task queue capacity
executor.setThreadNamePrefix("selenium-");
/**
* Rejection policy: when the pool has reached max size,
CALLER_RUNS runs the task in the submitting thread rather than in a new one.
*/
executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
executor.setKeepAliveSeconds(60*60);//idle time allowed before surplus threads are reclaimed
executor.initialize();
return executor;
}
}
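/*
 * Editor's sketch (assumption): how a bean would target one of the named executors defined
 * above; the service class and method are hypothetical, only the executor name is real.
 *
 *   @Component
 *   public class CrawlService {
 *       @Async("asyncTaskExecutorSelenium")
 *       public void crawl(String url) {
 *           // runs on the "selenium-" pool; with CallerRunsPolicy the scheduling thread
 *           // executes the task itself once the queue and pool are saturated
 *       }
 *   }
 */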
\ No newline at end of file
package com.zzsn.docinfo;
import java.io.Serializable;
import java.util.Map;
/**
* Data transfer object for the crawler's document interface.
* Author: Li Dongliang
* Created: 2016-4-6 3:44 PM
* Company: Zhengzhou Shuneng Software Technology Co., Ltd.
* @version 1.0
*
*/
public class DocInfo implements Serializable{
private String contentType;
private Long orgId;
private Long sid;
//News, BBS (forum), Blog, MicroBlog (weibo), WeChat, Video, Other
private String sourceType;
private String lastModified;
private String charset;
private String sourceaddress;
private String lang;
private String title;
private String author;
private String publishDate;
private String origin;
private String keywords;
private String summary;
private String contentWithTag;
private String contentNoTag;
private String contentImgCvtTag;
private String fileDownLoadPath;
private Map<String,String> otherParams;
public Long getOrgId() {
return orgId;
}
public void setOrgId(Long orgId) {
this.orgId = orgId;
}
public String getContentType() {
return contentType;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
public Long getSid() {
return sid;
}
public void setSid(Long sid) {
this.sid = sid;
}
public String getSourceType() {
return sourceType;
}
public void setSourceType(String sourceType) {
this.sourceType = sourceType;
}
public String getLastModified() {
return lastModified;
}
public void setLastModified(String lastModified) {
this.lastModified = lastModified;
}
public String getCharset() {
return charset;
}
public void setCharset(String charset) {
this.charset = charset;
}
public String getSourceaddress() {
return sourceaddress;
}
public void setSourceaddress(String sourceaddress) {
this.sourceaddress = sourceaddress;
}
public String getLang() {
return lang;
}
public void setLang(String lang) {
this.lang = lang;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getPublishDate() {
return publishDate;
}
public void setPublishDate(String publishDate) {
this.publishDate = publishDate;
}
public String getOrigin() {
return origin;
}
public void setOrigin(String origin) {
this.origin = origin;
}
public String getKeywords() {
return keywords;
}
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getContentWithTag() {
return contentWithTag;
}
public void setContentWithTag(String contentWithTag) {
this.contentWithTag = contentWithTag;
}
public String getContentNoTag() {
return contentNoTag;
}
public void setContentNoTag(String contentNoTag) {
this.contentNoTag = contentNoTag;
}
public Map<String, String> getOtherParams() {
return otherParams;
}
public void setOtherParams(Map<String, String> otherParams) {
this.otherParams = otherParams;
}
public String getFileDownLoadPath() {
return fileDownLoadPath;
}
public void setFileDownLoadPath(String fileDownLoadPath) {
this.fileDownLoadPath = fileDownLoadPath;
}
public String getContentImgCvtTag() {
return contentImgCvtTag;
}
public void setContentImgCvtTag(String contentImgCvtTag) {
this.contentImgCvtTag = contentImgCvtTag;
}
}
package com.zzsn.entity;
import java.io.Serializable;
public class SiteTemplate implements Serializable {
private static final long serialVersionUID = 1L;
String sourceId;
String siteName;
String siteHost;
String siteLang;
String matchUrl;
String matchTitle;
String matchSummary;
String matchContent;
String matchPublishDate;
String matchOrigin;
String matchAuthor;
String excloume1;
String excloume2;
public SiteTemplate() {
super();
}
public SiteTemplate(String siteName, String siteHost, String siteLang, String matchUrl, String matchTitle,
String matchSummary, String matchContent, String matchPublishDate, String matchOrigin, String matchAuthor,
String excloume1, String excloume2) {
super();
this.siteName = siteName;
this.siteHost = siteHost;
this.siteLang = siteLang;
this.matchUrl = matchUrl;
this.matchTitle = matchTitle;
this.matchSummary = matchSummary;
this.matchContent = matchContent;
this.matchPublishDate = matchPublishDate;
this.matchOrigin = matchOrigin;
this.matchAuthor = matchAuthor;
this.excloume1 = excloume1;
this.excloume2 = excloume2;
}
public String getSourceId() {
return sourceId;
}
public void setSourceId(String sourceId) {
this.sourceId = sourceId;
}
public String getSiteName() {
return siteName;
}
public void setSiteName(String siteName) {
this.siteName = siteName;
}
public String getSiteHost() {
return siteHost;
}
public void setSiteHost(String siteHost) {
this.siteHost = siteHost;
}
public String getSiteLang() {
return siteLang;
}
public void setSiteLang(String siteLang) {
this.siteLang = siteLang;
}
public String getMatchUrl() {
return matchUrl;
}
public void setMatchUrl(String matchUrl) {
this.matchUrl = matchUrl;
}
public String getMatchTitle() {
return matchTitle;
}
public void setMatchTitle(String matchTitle) {
this.matchTitle = matchTitle;
}
public String getMatchSummary() {
return matchSummary;
}
public void setMatchSummary(String matchSummary) {
this.matchSummary = matchSummary;
}
public String getMatchContent() {
return matchContent;
}
public void setMatchContent(String matchContent) {
this.matchContent = matchContent;
}
public String getMatchPublishDate() {
return matchPublishDate;
}
public void setMatchPublishDate(String matchPublishDate) {
this.matchPublishDate = matchPublishDate;
}
public String getMatchOrigin() {
return matchOrigin;
}
public void setMatchOrigin(String matchOrigin) {
this.matchOrigin = matchOrigin;
}
public String getMatchAuthor() {
return matchAuthor;
}
public void setMatchAuthor(String matchAuthor) {
this.matchAuthor = matchAuthor;
}
@Override
public String toString() {
return "SiteTemplate [sourceId=" + sourceId + ", siteName=" + siteName + ", siteHost=" + siteHost
+ ", siteLang=" + siteLang + ", matchUrl=" + matchUrl + ", matchTitle=" + matchTitle + ", matchSummary="
+ matchSummary + ", matchContent=" + matchContent + ", matchPublishDate=" + matchPublishDate
+ ", matchOrigin=" + matchOrigin + ", matchAuthor=" + matchAuthor + ", excloume1=" + excloume1
+ ", excloume2=" + excloume2 + "]";
}
}
package com.zzsn.job;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
* Blocking thread pool.
* execute() blocks the submitter once the pool has grown to its maximum thread count,
* which makes it suitable for multi-threaded consumption of MQ messages.
* Because submission blocks, no rejection handler needs to be overridden.
*/
public class BlockThreadPoolExecute extends ThreadPoolExecutor {
private ReentrantLock lock = new ReentrantLock();
private Condition condition = this.lock.newCondition();
public BlockThreadPoolExecute(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit, BlockingQueue<Runnable> workQueue) {
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
}
@Override
public void execute(Runnable command) {
//acquire the lock so submission and the "pool full" check happen atomically
this.lock.lock();
super.execute(command);
try {
//if the pool has already grown to its maximum size, suspend the submitting thread
if (getPoolSize() == getMaximumPoolSize()) {
this.condition.await();
}
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
this.lock.unlock();
}
}
@Override
protected void afterExecute(Runnable r, Throwable t) {
lock.lock();
try{
this.condition.signal();
}finally {
this.lock.unlock();
}
}
}
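/*
 * Editor's sketch (assumption): using the blocking pool for MQ-style work. execute() first
 * hands the task to the pool and only then blocks the submitter while the pool is at its
 * maximum size; afterExecute wakes one waiting submitter per completed task.
 *
 *   BlockThreadPoolExecute pool = new BlockThreadPoolExecute(
 *           4, 4, 60L, TimeUnit.SECONDS, new java.util.concurrent.LinkedBlockingQueue<Runnable>(100));
 *   for (Runnable task : tasks) {
 *       pool.execute(task); // blocks once the pool has grown to maximumPoolSize
 *   }
 *   pool.shutdown();
 */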
\ No newline at end of file
package com.zzsn.job;
import com.google.gson.Gson;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.googleThread.DetailGoogleSearchThread;
import com.zzsn.search.googleThread.WebGoogleSearchThread;
import com.zzsn.search.yaooThread.DetailYahooSearchThread;
import com.zzsn.search.yaooThread.WebYahooSearchThread;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@Component
@EnableScheduling
@Slf4j
public class KafkaConsumerYahooTask {
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//offset reset behaviour when no committed offset exists
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//max poll interval set to 1 hour
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
//When packaging, switch which scheduled task is active: enable consumerKeyword's @Scheduled for keyword search, or consumerDetailUrl's for content parsing
@Scheduled(cron = "0 0/5 * * * ?")
@Async("webExecutor")
public void consumerKeyword (){
log.info("定时获取mq消息");
//1. create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
try{
while(true){
//the consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread breaks out of the loop
//poll(0) waits 0 ms for the broker: the timeout controls how long poll may block before returning, whether or not data is available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
KeywordMsg keywordMsg = new Gson().fromJson(record.value().toString(), KeywordMsg.class);
WebYahooSearchThread metaSearchThread=new WebYahooSearchThread();
metaSearchThread.keywordMsg=keywordMsg;
metaSearchThread.crawler();
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
// @Scheduled(cron = "0 0/5 * * * ?")
@Async("detailExecutor")
public void consumerDetailUrl (){
log.info("定时获取mq2消息");
//1. create the consumer
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
ExecutorService poolExecuter = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
try{
while(true){
//the consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread breaks out of the loop
//poll(0) waits 0 ms for the broker: the timeout controls how long poll may block before returning, whether or not data is available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
CatchWebByMetaSearch catchWebByMetaSearch = new Gson().fromJson(record.value().toString(), CatchWebByMetaSearch.class);
DetailYahooSearchThread detailYahooSearchThread=new DetailYahooSearchThread();
detailYahooSearchThread.catchWebByMetaSearch=catchWebByMetaSearch;
detailYahooSearchThread.crawler();
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
}
}
}
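/*
 * Editor's sketch (assumption): the comments inside the poll loops note that
 * consumer.wakeup() can be called from another thread to break out of the loop. A minimal
 * shutdown hook doing that would look like the following; the consumer reference and the
 * hook wiring are hypothetical and not present in this class.
 *
 *   Runtime.getRuntime().addShutdownHook(new Thread(() -> consumer.wakeup()));
 *   // poll() then throws WakeupException, which the surrounding catch (Exception e) picks up
 */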
package com.zzsn.job;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.core.ProducerFactory;
import java.util.HashMap;
import java.util.Map;
/**
* Kafka producer configuration.
* @author Java小白
*
*/
@Configuration
@EnableKafka
public class KafkaProducerConfig {
@Value("${kafka.producer.servers}")
private String servers;
@Value("${kafka.producer.retries}")
private int retries;
@Value("${kafka.producer.batch.size}")
private int batchSize;
@Value("${kafka.producer.linger}")
private int linger;
@Value("${kafka.producer.buffer.memory}")
private int bufferMemory;
/**
* Producer configuration properties.
* @return
*/
public Map<String, Object> producerConfigs() {
Map<String, Object> props = new HashMap<>();
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, servers);
props.put(ProducerConfig.RETRIES_CONFIG, retries);
props.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize);
props.put(ProducerConfig.LINGER_MS_CONFIG, linger);
props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, bufferMemory);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
return props;
}
/**
* Producer factory.
* @return
*/
public ProducerFactory<String, String> producerFactory() {
return new DefaultKafkaProducerFactory<>(producerConfigs());
}
/**
* KafkaTemplate helper used to send messages.
* @return
*/
@Bean
public KafkaTemplate<String, String> kafkaTemplate() {
//must be given the producer factory
return new KafkaTemplate<String, String>(producerFactory());
}
}
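/*
 * Editor's sketch (assumption): sending a message with the template defined above. The
 * topic constant is taken from com.zzsn.utility.index.Constants; the surrounding method is
 * hypothetical.
 *
 *   @Autowired
 *   private KafkaTemplate<String, String> kafkaTemplate;
 *
 *   public void publish(String json) {
 *       kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, json);
 *   }
 */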
\ No newline at end of file
package com.zzsn.job;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
public class PropertyUtil {
//load the property file from the classpath into a Properties object
public static Properties loadProperties(String propertyFile) {
Properties properties = new Properties();
try {
InputStream is = PropertyUtil.class.getClassLoader().getResourceAsStream(propertyFile);
if(is == null){
is = PropertyUtil.class.getClassLoader().getResourceAsStream("properties/" + propertyFile);
}
properties.load(is);
} catch (IOException e) {
e.printStackTrace();
}
return properties;
}
/**
* Returns the value for the given key from the given property file.
*
* @param key
* @return
*/
public static String getValue(String propertyFile, String key) {
Properties properties = loadProperties(propertyFile);
return properties.getProperty(key);
}
}
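/*
 * Editor's sketch (assumption): both the file name and key below are illustrative, not
 * taken from a real configuration file in this commit.
 *
 *   String servers = PropertyUtil.getValue("constants.properties", "KAFKA_CONSUMER_SERVERS");
 */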
package com.zzsn.paser;
import com.zzsn.utility.index.Constants;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
public class SeleniumTime {
public ChromeOptions chromeOptions =new ChromeOptions() ;
public ChromeDriver driver;
public SeleniumTime(){
// System.setProperty("webdriver.chrome.driver", "E:\\cmd\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "D:\\cmdvip\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "E:\\chrome\\chromedriver.exe");
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
chromeOptions.addArguments("blink-settings=imagesEnabled=false");
// chromeOptions.addArguments("--headless");
driver = new ChromeDriver(chromeOptions);
}
/**
* Fetches the page HTML for the given URL via the Chrome driver.
* @param url
* @return
*/
public String getScopehtml(String url){
//=====================================================================================================
// ChromeOptions chromeOptions =new ChromeOptions();
//// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// System.setProperty("webdriver.chrome.driver", "D:\\project\\cmd\\chromedriver.exe");
// //System.setProperty("webdriver.chrome.bin", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //chromeOptions.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe
// //C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
// //chromeOptions.addArguments("--headless");
// ChromeDriver driver = new ChromeDriver(chromeOptions);
//=====================================================================================================
try{
driver.get(url);
WebElement webElement = driver.findElement(By.xpath("/html"));
try{
Thread.sleep(3000l);
String html = webElement.getAttribute("outerHTML");
Thread.sleep(5000l);
driver.quit();
// System.out.println(html);
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
}catch(Exception e){
System.out.println("动态爬取方式一出现+"+"org.openqa.selenium.StaleElementReferenceException异常"
+"可能原因为过快的执行没有找到指定的页面元素");
System.out.println("=============执行方法二==============");
Thread.sleep(3000l);
String html = driver.getPageSource();
Thread.sleep(5000l);
driver.quit();
if(url.contains("http://www.flw.ph")){
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
if(html.contains(a)&&html.contains(b)){
String[] split = html.split(a);
String sa = split[0];
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
String sab = sa + substring ;
return sab;
}
}
return html;
}
// Thread.sleep(3000l);
// String source = driver.getPageSource();
// //if(source.length()!=0){
// driver.quit();
// return source;
//}
// String html = webElement.getAttribute("outerHTML");
// //System.out.println(html);
// driver.quit();
// return html;
//==========================================================================
// driver.get(url);
// // 休眠1s,为了让js执行完
// Thread.sleep(1000l);
// // 网页源码
// String source = driver.getPageSource();
// System.out.println("进入SeleniumTime中的getScopehtml方法获取相应的html");
// driver.quit();
// return source;
}catch(Exception e){
try {
Thread.sleep(5000l);
} catch (InterruptedException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
driver.quit();
e.printStackTrace();
}
try {
Thread.sleep(5000l);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
driver.quit();
return null;
}
public static void main(String[] args) {
//strip the unwanted tags from the html
/**
* Most advice online says a plain regex does not handle html well;
* attempts to delete the relevant div content with a regex failed here, so the result is obtained by splitting the string instead.
*/
SeleniumTime s = new SeleniumTime();
String scopehtml = s.getScopehtml("http://www.flw.ph/thread-869016-1-1.html");
String a = "<div class=\"attach_nopermission attach_tips\">";
String b = "<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>";
System.out.println("开始");
if(scopehtml.contains(a)){
System.out.println("包含a");
}
if(scopehtml.contains(b)){
System.out.println("包含b");
}
System.out.println("结束");
String[] split = scopehtml.split(a);
String sa = split[0];
System.out.println("首次截取的长度"+split.length);
String[] split2 = split[1].split(b);
String sb = split2[1];
String substring = sb.substring(7);
System.out.println("再次截取的长度"+split2.length);
String sab = sa + substring ;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File file = new File("D:/123.txt");
try {
PrintStream ps = new PrintStream(new FileOutputStream(file));
ps.println(sab);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
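/*
 * Editor's sketch (alternative approach, assumption): main() above strips the
 * "attach_nopermission attach_tips" div by splitting strings because a plain regex proved
 * unreliable; Jsoup, already used elsewhere in this project, can remove the same element
 * structurally:
 *
 *   org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(scopehtml);
 *   doc.select("div.attach_nopermission.attach_tips").remove();
 *   String cleaned = doc.body().html();
 */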
package com.zzsn.paser;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.Site;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.DefaultMsg;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utility.util.Utility;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
public class SourceTemplateByTag {
// fetch the site template for this URL's domain from the cache
public SiteTemplate getSiteTemp(String infourl){
SiteTemplate sTemplate=new SiteTemplate();
try {
String domain=new URL(infourl).getHost();
Site site=(Site) MemcachedUtils.get("domainUri_"+domain);
if (null != site.getMatchTitle()) {
sTemplate.setMatchTitle(site.getMatchTitle());
}
if (null != site.getMatchSummary()) {
sTemplate.setMatchSummary(site.getMatchSummary());
}
if (null != site.getMatchContent()) {
sTemplate.setMatchContent(site.getMatchContent());
}
if (null != site.getMatchAuthor()) {
sTemplate.setMatchAuthor(site.getMatchAuthor());
}
if (null != site.getMatchOrigin()) {
sTemplate.setMatchOrigin(site.getMatchOrigin());
}
if (null != site.getMatchPublishDate()) {
sTemplate.setMatchPublishDate(site.getMatchPublishDate());
}
}catch (Exception e){
return null;
}
return sTemplate;
}
static List<Site> sList=new ArrayList<Site>();
//record sites that have no usable template
public static void saveNoTempSite( CatchWebByMetaSearch cwbm ){
try {
String infourl = cwbm.getSourceaddress();
String domainurl = new URL(infourl).getHost();
Site site = new Site();
site.setDomainUri(domainurl);
site.setUri(infourl);
site.setName(cwbm.getSourcesite());
Object cacheObj=MemcachedUtils.get("tempSite");
if (null==cacheObj ) {
List<Site> s2List=new ArrayList<Site>();
sList=s2List;
}
sList.add(site);
MemcachedUtils.set("tempSite", sList,60*60*24);
System.out.println("保存成功!!");
}catch (Exception e){
e.printStackTrace();
System.out.println("保存失败");
}
}
public static DocInfo doPaserByTag(String htmlContent, DocInfo docInfo, SiteTemplate siteTemplate){
DefaultMsg dm = new DefaultMsg();
Document doc = Jsoup.parse(htmlContent);
System.out.println("===========doPaserByTag");
if(null!=siteTemplate.getMatchTitle()&&siteTemplate.getMatchTitle().length()>0) {
//title
String title =paseElementByCSS(doc,siteTemplate.getMatchTitle());
if (StringUtils.isNotEmpty(title)) {
docInfo.setTitle(title.replace("...", ""));
}
}
if(null!=siteTemplate.getMatchContent()&&siteTemplate.getMatchContent().length()>0) {
Elements elementsByTag = doc.select(siteTemplate.getMatchContent());
String contentWithTag = Utility.RemoveUselessHTMLTagX(elementsByTag.html());
System.out.println("==========="+elementsByTag);
// String contentWithTag =paseElementByCSS(doc,siteTemplate.getMatchContent());
if (contentWithTag == null || contentWithTag.trim().length() == 0) {
return docInfo;
}
docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentWithTag));
docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag).replaceAll("\\n",""));
}
if(null!=siteTemplate.getMatchAuthor()&&siteTemplate.getMatchAuthor().length()>0) {
String author=paseElementByCSS(doc,siteTemplate.getMatchAuthor());
if(author.length()>0) {
docInfo.setAuthor(author);
}
}
if(null!=siteTemplate.getMatchPublishDate()&&siteTemplate.getMatchPublishDate().length()>0) {
String publishDate=paseElementByCSS(doc,siteTemplate.getMatchPublishDate());
if(publishDate.length()>0) {
docInfo.setPublishDate(DateUtil.getPublishDate(publishDate));
}
}
if(null!=siteTemplate.getMatchSummary()&&siteTemplate.getMatchSummary().length()>0) {
String summary=paseElementByCSS(doc,siteTemplate.getMatchSummary());
if(summary.length()>0) {
docInfo.setSummary(summary);
}
}
if(null!=siteTemplate.getMatchOrigin()&&siteTemplate.getMatchOrigin().length()>0) {
String origin=paseElementByCSS(doc,siteTemplate.getMatchOrigin());
if(origin.length()>0) {
docInfo.setOrigin(origin);
}
}
// this.buildProcessItem(docInfo);
return docInfo;
}
public static String paseElementByCSS(Document doc,String tag){
String msg="";
try {
Elements elements = doc.select(tag);
if (elements.size() > 0) {
msg = elements.get(0).text().trim();
}
}catch (Exception e){
e.printStackTrace();
}
return msg;
}
}
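/*
 * Editor's sketch (assumption): driving doPaserByTag with a hand-built template; the CSS
 * selectors shown are placeholders, the real ones come from the cached Site objects.
 *
 *   SiteTemplate t = new SiteTemplate();
 *   t.setMatchTitle("h1.article-title");
 *   t.setMatchContent("div.article-body");
 *   t.setMatchPublishDate("span.pub-time");
 *   DocInfo parsed = SourceTemplateByTag.doPaserByTag(html, new DocInfo(), t);
 */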
package com.zzsn.search;
import java.io.*;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class FileUtil {
public static List<String> getFileLines(File file, String encoding)
throws IOException
{
List<String> lines = new ArrayList<String>();
if (encoding == null)
{
encoding = "utf-8";
}
FileInputStream stream = new FileInputStream(file);
InputStreamReader reader = new InputStreamReader(stream, encoding);
BufferedReader bufferedReader = new BufferedReader(reader);
String line = null;
while ((line = bufferedReader.readLine()) != null)
{
lines.add(line);
}
bufferedReader.close();
reader.close();
stream.close();
return lines;
}
//export to a pipe-delimited csv file
public static void array2CSV(ArrayList<ArrayList<String>> data, String path)
{
try {
BufferedWriter out =new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path),"UTF-8"));
for (int i = 0; i < data.size(); i++)
{
ArrayList<String> onerow=data.get(i);
for (int j = 0; j < onerow.size(); j++)
{
out.write(DelQuota(onerow.get(j)));
out.write("|");
}
out.newLine();
}
out.flush();
out.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public static String DelQuota(String str)
{
String result = str;
// String[] strQuota = { "~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "`", ";", "'", ",", ".", "/", ":", "/,", "<", ">", "?" };
String[] strQuota = { "|" };
for (int i = 0; i < strQuota.length; i++)
{
if (result.indexOf(strQuota[i]) > -1)
result = result.replace(strQuota[i], "");
}
return result;
}
public static void outMsg(String fileName, String content) {
File files = new File(fileName);
FileWriter writer = null;
try {
if (!files.exists()) {
files.createNewFile();
}
// open a file writer; the second constructor argument (true) means append mode
writer = new FileWriter(files, true);
writer.write(content);
writer.write("\n");
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if(writer != null){
writer.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static String getFileBody(File file, String encoding)
{
StringBuffer buffer = new StringBuffer();
try {
List<String> lines = getFileLines(file, encoding);
for (String line : lines)
{
buffer.append(line);
buffer.append("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
return buffer.toString();
}
public static String getFileBody(String path, String encoding)
{
File file = new File(path);
if (!file.exists())
{
return null;
}
return getFileBody(file, encoding);
}
public static void mergeFile(String dir, int depth,
String outPath, String encoding)
{
File dirFile = new File(dir);
List<File> filesForMerge = collectFiles(dirFile, depth, -1);
StringBuffer buffer = new StringBuffer();
for (File file : filesForMerge)
{
String temp = getFileBody(file, encoding);
buffer.append(temp);
buffer.append("\n");
}
try {
PrintWriter pw = new PrintWriter(outPath);
pw.write(buffer.toString());
pw.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static List<File> collectFiles(File dirFile, int depth, int sampleNum)
{
List<File> filesForMerge = new ArrayList<File>();
if (1 == depth)
{
File[] subFiles = dirFile.listFiles();
for (File file : subFiles)
{
if (file.isDirectory())
{
continue;
}
filesForMerge.add(file);
}
filesForMerge = sampleFiles(filesForMerge, sampleNum);
}
else
{
File[] subFiles = dirFile.listFiles();
for (File file : subFiles)
{
if (!file.isDirectory())
{
continue;
}
List<File> files = collectFiles(file, depth-1, sampleNum);
if (files.size() > 0)
{
filesForMerge.addAll(files);
}
}
}
return filesForMerge;
}
private static List<File> sampleFiles(List<File> origFiles, int sampleNum)
{
List<File> sampleFiles = new ArrayList<File>();
int totalSize = origFiles.size();
if (sampleNum >= 0 && totalSize > sampleNum)
{
if (sampleNum < totalSize / 10)
{
sampleNum = totalSize / 10;
}
Random random = new Random();
int count = 0;
while (count < sampleNum)
{
int pos = random.nextInt(totalSize);
File nextFile = origFiles.get(pos);
if (sampleFiles.contains(nextFile))
{
continue;
}
sampleFiles.add(nextFile);
count ++;
}
return sampleFiles;
}
return origFiles;
}
/**
* eg: cmread-books\dushi\16
* */
public static String getDifferPath(File subFile, File ancestorFile)
{
File parentFile = subFile.getParentFile();
String dirpath = "";
while (!parentFile.getPath().equalsIgnoreCase(ancestorFile.getPath()))
{
dirpath = String.format("%s\\%s", parentFile.getName(), dirpath);
parentFile = parentFile.getParentFile();
}
return dirpath;
}
public static void makeDir(String path)
{
File dirFile = new File(path);
if (!dirFile.exists())
{
dirFile.mkdirs();
}
}
/**
* find file/dir toFindFileName from ancestors of searchDirName
* */
public static File findDecestorFile(File searchDir, String toFindFileName)
{
if (searchDir.getName().equalsIgnoreCase(toFindFileName))
{
return searchDir;
}
if (searchDir.isDirectory())
{
File[] subFiles = searchDir.listFiles();
for (File subFile : subFiles)
{
File temp = findDecestorFile(subFile, toFindFileName);
if (temp != null)
{
return temp;
}
}
}
return null;
}
public static File findFile(String fileName, File parentFile, int compType)
{
File[] files = parentFile.listFiles();
switch (compType) {
case 0: //equal
for (File file : files)
{
if (file.getName().equalsIgnoreCase(fileName))
{
return file;
}
}
break;
case 1: //start with
for (File file : files)
{
if (file.getName().toLowerCase().startsWith(fileName.toLowerCase()))
{
return file;
}
}
break;
case 2: //contains
for (File file : files)
{
if (file.getName().toLowerCase().indexOf(fileName.toLowerCase()) != -1)
{
return file;
}
}
break;
default:
break;
}
return null;
}
private static final char BOM = '\uFEFF';
public static ArrayList<String> readTxtFile(String fileName,String encoding) {
ArrayList<String> rtn = new ArrayList<String>();
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), encoding));
String line = null;
boolean isFirstLine = true;
while ((line = reader.readLine()) != null) {
if (isFirstLine && line.length() > 0 && line.charAt(0) == BOM) {
line = line.substring(1);
isFirstLine = false;
}
rtn.add(line);
}
} catch (Exception e) {
String errMsg = String.format("Exception while reading file: %s", fileName, e.getMessage());
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
String errMsg = String.format("Exception while closing file [%s]: %s", fileName, e.getMessage());
e.printStackTrace();
}
}
}
return rtn;
}
public static void copyFile(File source, File dest) throws IOException {
FileChannel inputChannel = null;
FileChannel outputChannel = null;
try {
inputChannel = new FileInputStream(source).getChannel();
outputChannel = new FileOutputStream(dest).getChannel();
outputChannel.transferFrom(inputChannel, 0, inputChannel.size());
} finally {
if (inputChannel != null) {
inputChannel.close();
}
if (outputChannel != null) {
outputChannel.close();
}
}
}
public static List<String> findFiles(String ph) {
List<String> filesList = new ArrayList<String>();
File file = new File(ph);
if (!file.exists()) {
return null;
}
File tempFile;
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++) {
tempFile = files[i];
if (tempFile.isFile()) {
filesList.add(tempFile.getName().toString());
}
}
return filesList;
}
}
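/*
 * Editor's sketch (assumption): array2CSV joins every cell with '|' (after DelQuota has
 * stripped any '|' inside the cell), so the output is pipe-delimited rather than
 * comma-delimited; the path below is illustrative.
 *
 *   ArrayList<ArrayList<String>> rows = new ArrayList<>();
 *   rows.add(new ArrayList<>(java.util.Arrays.asList("title", "url")));
 *   FileUtil.array2CSV(rows, "D:/export/out.csv");
 */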
package com.zzsn.search.db;
import java.sql.Connection;
import java.sql.SQLException;
/**
* MySQL database connection layer.
* @author Administrator
*
*/
public class DBConnection {
/**
* Opens a database connection.
* @return
*/
public static Connection getDBConnection()
{
// 1. register the JDBC driver
try {
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// obtain the database connection
try {
Connection conn = java.sql.DriverManager.getConnection("jdbc:mysql://localhost:3306/clb_project?useUnicode=true&characterEncoding=utf8", "root", "root");
return conn;
} catch (SQLException e1) {
e1.printStackTrace();
}
return null;
}
}
\ No newline at end of file
package com.zzsn.search.db;
import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
/**
* Low-level MySQL database helper.
* @author Administrator
*
*/
public class DBManager {
private PreparedStatement pstmt;
private Connection conn;
private ResultSet rs;
/**
* Opens the database connection.
*/
public DBManager() {
conn = DBConnection.getDBConnection();
}
/**
* Executes an insert or update statement.
* @param coulmn
* @param type
* @param sql
* @return
* @throws SQLException
*/
public boolean updateOrAdd(String[] coulmn, int[] type, String sql) throws SQLException
{
if(!setPstmtParam(coulmn, type, sql))
return false;
boolean flag = pstmt.executeUpdate()>0?true:false;
closeDB();
return flag;
}
/**
* Runs a query and returns the result set as a DataTable.
* @param coulmn
* @param type
* @param sql
* @throws SQLException
*/
public DataTable getResultData(String[] coulmn, int[] type, String sql) throws SQLException
{
DataTable dt = new DataTable();
ArrayList<HashMap<String, String>>list = new ArrayList<HashMap<String, String>>();
if(!setPstmtParam(coulmn, type, sql))
return null;
rs = pstmt.executeQuery();
ResultSetMetaData rsmd = rs.getMetaData();//read the column names from the database
int numberOfColumns = rsmd.getColumnCount();
while(rs.next())
{
HashMap<String, String> rsTree = new HashMap<String, String>();
for(int r=1;r<numberOfColumns+1;r++)
{
rsTree.put(rsmd.getColumnName(r),rs.getObject(r).toString());
}
list.add(rsTree);
}
closeDB();
dt.setDataTable(list);
return dt;
}
/**
* Binds the prepared-statement parameters.
* @param coulmn
* @param type
* @throws SQLException
* @throws NumberFormatException
*/
private boolean setPstmtParam(String[] coulmn, int[] type, String sql) throws NumberFormatException, SQLException
{
if(sql== null) return false;
pstmt = conn.prepareStatement(sql);
if(coulmn != null && type != null && coulmn.length !=0 && type.length !=0 )
{
for (int i = 0; i<type.length; i++) {
switch (type[i]) {
case Types.INTEGER:
pstmt.setInt(i+1, Integer.parseInt(coulmn[i]));
break;
case Types.BOOLEAN:
pstmt.setBoolean(i+1, Boolean.parseBoolean(coulmn[i]));
break;
case Types.CHAR:
pstmt.setString(i+1, coulmn[i]);
break;
case Types.DOUBLE:
pstmt.setDouble(i+1, Double.parseDouble(coulmn[i]));
break;
case Types.FLOAT:
pstmt.setFloat(i+1, Float.parseFloat(coulmn[i]));
break;
default:
break;
}
}
}
return true;
}
/**
* Closes the result set, statement and connection.
* @throws SQLException
*/
private void closeDB() throws SQLException
{
if(rs != null)
{
rs.close();
}
if(pstmt != null)
{
pstmt.close();
}
if(conn != null)
{
conn.close();
}
}
}
\ No newline at end of file
package com.zzsn.search.db;
import java.util.*;
/**
* Result-set wrapper.
* @author Administrator
*/
public class DataTable {
public String[] column;//column names
public String[][] row; //row values
public int rowCount = 0;//number of rows
public int colCoun = 0;//number of columns
public DataTable() {
super();
}
public DataTable(String[] column, String[][] row, int rowCount, int colCoun) {
super();
this.column = column;
this.row = row;
this.rowCount = rowCount;
this.colCoun = colCoun;
}
public void setDataTable(ArrayList<HashMap<String, String>> list) {
rowCount = list.size();
colCoun = list.get(0).size();
column = new String[colCoun];
row = new String[rowCount][colCoun];
for (int i = 0; i < rowCount; i++) {
Set<Map.Entry<String, String>> set = list.get(i).entrySet();
int j = 0;
for (Iterator<Map.Entry<String, String>> it = set.iterator(); it
.hasNext();) {
Map.Entry<String, String> entry = (Map.Entry<String, String>) it
.next();
row[i][j] = entry.getValue();
if (i == rowCount - 1) {
column[j] = entry.getKey();
}
j++;
}
}
}
public String[] getColumn() {
return column;
}
public void setColumn(String[] column) {
this.column = column;
}
public String[][] getRow() {
return row;
}
public void setRow(String[][] row) {
this.row = row;
}
public int getRowCount() {
return rowCount;
}
public void setRowCount(int rowCount) {
this.rowCount = rowCount;
}
public int getColCoun() {
return colCoun;
}
public void setColCoun(int colCoun) {
this.colCoun = colCoun;
}
}
\ No newline at end of file
package com.zzsn.search.db;
import lombok.extern.slf4j.Slf4j;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
/**
* @author xub
* @Description: snowflake ID generator
* @date 2019/8/14 8:22 PM
*/
@Slf4j
public class SnowIdUtils {
/**
* Private static inner class.
*/
private static class SnowFlake {
/**
* Singleton instance of the inner class.
*/
private static final SnowFlake SNOW_FLAKE = new SnowFlake();
/**
* Starting timestamp (epoch offset).
*/
private final long START_TIMESTAMP = 1557489395327L;
/**
* Number of bits used by the sequence.
*/
private final long SEQUENCE_BIT = 12;
/**
* Number of bits used by the machine id.
*/
private final long MACHINE_BIT = 10;
/**
* Left shift applied to the timestamp.
*/
private final long TIMESTAMP_LEFT = SEQUENCE_BIT + MACHINE_BIT;
/**
* Maximum sequence number (4095).
*/
private final long MAX_SEQUENCE = ~(-1L << SEQUENCE_BIT);
/**
* Maximum machine id (1023).
*/
private final long MAX_MACHINE_ID = ~(-1L << MACHINE_BIT);
/**
* Machine-id part of the generated id.
*/
private long machineIdPart;
/**
* Sequence number.
*/
private long sequence = 0L;
/**
* Timestamp of the previous id.
*/
private long lastStamp = -1L;
/**
* The constructor initialises the machine id.
*/
private SnowFlake() {
//stand-in for obtaining the local machine id
long localIp = 4321;
//localIp & MAX_MACHINE_ID never exceeds 1023; it is then shifted left by 12 bits
machineIdPart = (localIp & MAX_MACHINE_ID) << SEQUENCE_BIT;
}
/**
* Generates the next snowflake ID.
*/
public synchronized long nextId() {
long currentStamp = timeGen();
//guard against the machine clock moving backwards
if (currentStamp < lastStamp) {
//the server clock was adjusted; the ID generator refuses to serve
throw new RuntimeException(String.format("Clock moved backwards. Refusing to generate id for %d milliseconds", lastStamp - currentStamp));
}
if (currentStamp == lastStamp) {
// increment within the same millisecond
sequence = (sequence + 1) & MAX_SEQUENCE;
// sequence overflowed within this millisecond
if (sequence == 0) {
// block until the next millisecond and take a fresh timestamp
currentStamp = getNextMill();
}
} else {
//a different millisecond: reset the sequence to 0
sequence = 0L;
}
lastStamp = currentStamp;
//timestamp part + machine-id part + sequence part
return (currentStamp - START_TIMESTAMP) << TIMESTAMP_LEFT | machineIdPart | sequence;
}
/**
* Blocks until the next millisecond, i.e. until a newer timestamp is obtained.
*/
private long getNextMill() {
long mill = timeGen();
while (mill <= lastStamp) {
mill = timeGen();
}
return mill;
}
/**
* Returns the current time in milliseconds.
*/
protected long timeGen() {
return System.currentTimeMillis();
}
}
/**
* Returns a snowflake ID as a long.
*/
public static long uniqueLong() {
return SnowFlake.SNOW_FLAKE.nextId();
}
/**
* Returns a snowflake ID as a zero-padded hex String.
*/
public static String uniqueLongHex() {
return String.format("%016x", uniqueLong());
}
/**
* Test entry point.
*/
public static void main(String[] args) throws InterruptedException {
//start of timing
long start = System.currentTimeMillis();
//run 100 threads concurrently
final CountDownLatch latch = new CountDownLatch(100);
//used to check whether any of the 200,000 generated IDs are duplicates
final Map<Long, Integer> map = new ConcurrentHashMap();
for (int i = 0; i < 100; i++) {
//create 100 threads
new Thread(() -> {
for (int s = 0; s < 2000; s++) {
long snowID = SnowIdUtils.uniqueLong();
log.info("生成雪花ID={}",snowID);
Integer put = map.put(snowID, 1);
if (put != null) {
throw new RuntimeException("主键重复");
}
}
latch.countDown();
}).start();
}
//wait for the 100 threads above to finish before printing the summary below
latch.await();
log.info("生成20万条雪花ID总用时={}", System.currentTimeMillis() - start);
}
}
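/*
 * Editor's sketch (assumption): how a 64-bit ID produced above can be decomposed back into
 * its parts, using the same bit widths (12-bit sequence, 10-bit machine id, timestamp
 * relative to START_TIMESTAMP).
 *
 *   long id = SnowIdUtils.uniqueLong();
 *   long sequence = id & 0xFFF;                   // low 12 bits
 *   long machine  = (id >> 12) & 0x3FF;           // next 10 bits
 *   long millis   = (id >> 22) + 1557489395327L;  // epoch millis when the id was generated
 */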
package com.zzsn.search.db;
import java.sql.SQLException;
import java.sql.Types;
/**
* Test demo.
* @author Administrator
*/
public class TestBusIness{
static String searchSql = "SELECT sid,title,summary,keywords,content,sourceaddress,publish_date,origin from cis_ans_processitem where sid='2022021506' limit 10 ";
static String insertSql = "insert into score(name, age, score)values(?,?,?)";
static String deleteSql = "delete from score where id = ?";
static String updateSql = "update score set name = ? where id = ?";
static String searchSql2="select count(*) from cis_ans_processitem where sid='202202150000' and ";
public static void main(String[] args) {
// intsertData();
searchData();
}
private static void intsertData()
{
DBManager dm = new DBManager();
String[] coulmn = new String[]{"wyf2", "23", "89.5"};
int[] type = new int[]{Types.CHAR, Types.INTEGER, Types.DOUBLE};
try {
boolean flag = dm.updateOrAdd(coulmn, type, insertSql);
if(flag)
System.out.println("插入成功");
} catch (SQLException e) {
e.printStackTrace();
}
}
private static void searchData()
{
DBManager dm = new DBManager();
String[] coulmn = null;
int[] type = null;
try {
DataTable dt = dm.getResultData(coulmn, type, searchSql);
if(dt != null && dt.getRowCount()> 0){
for(int i = 0; i<dt.getRowCount(); i++)
{
for(int j = 0; j<dt.getColCoun(); j++)
System.out.printf(dt.getRow()[i][j]+"\t");
System.out.println();
}
}
else
System.out.println("查询失败");
} catch (SQLException e) {
e.printStackTrace();
}
}
}
\ No newline at end of file
package com.zzsn.search.entity;
import lombok.Data;
import java.util.List;
@Data
public class ClbAnsProcessitem {
/**primary key*/
private String id;
/**source id*/
private String sid;
/**team id*/
private String tid;
/**title*/
private String title;
/**summary*/
private String summary;
/**keywords*/
private String keyWords;
/**body text*/
private String content;
private String contentWithtag;
/**unknown*/
private String hash;
/**author*/
private String author;
/**source site*/
private String sourceSite;
/**source address*/
private String sourceAddress;
/**unknown*/
private String currentProcess;
/**category*/
private String type;
/**unknown*/
private String withTagFile;
/**publish date*/
private String publishDate;
/**created by*/
private String createBy;
/**creation time*/
private String createDate;
/**charset*/
private String charset;
/**unknown*/
private Integer processResult;
/**last modified time*/
private String lastModified;
/**organization id*/
private String orgId;
/**words*/
private String words;
/**origin*/
private String origin;
/**unknown*/
private String orientation;
/**where it came from*/
private String fromWhere;
/**origin id*/
private String fromId;
/**source type*/
private String sourceType;
/**unknown*/
private String featureWords;
/**file download path*/
private String fileDownloadPath;
private String contentImgCvtTag;
/**related places*/
private String relatePlaces;
/**related persons*/
private String relatePerson;
/**related organizations*/
private String relateOrg;
/**related events*/
private String relateEvent;
/**related dates*/
private String relateDate;
/**unknown*/
private Integer relevance1;
/**unknown*/
private String relevance;
/**language*/
private String lang;
/**organizations*/
private String orgs;
//collection source (e.g. generic, custom, WeChat official accounts)
private String source;
/**(temporary) ids of the related subjects*/
private List<String> subjectIds;
}
\ No newline at end of file
package com.zzsn.search.entity;
import lombok.Data;
import java.util.List;
@Data
public class KeywordMsg {
/**primary key*/
private String id;
/**word-group code*/
private String wordsCode;
/**word-group name*/
private String wordsName;
/**keywords*/
private String keyWord;
/**exclusion words*/
private String exclusionWord;
/**status*/
private Integer status;
/**holder used when querying the database*/
private String subjectId;
/**subjects this group belongs to*/
private List<String> subjectIds;
private Long startTime;
private Long endTime;
}
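/*
 * Editor's sketch (assumption): the JSON shape that KafkaConsumerYahooTask deserializes
 * into this class with Gson; all field values are illustrative.
 *
 *   {"id":"1","wordsCode":"KW0001","wordsName":"example group",
 *    "keyWord":"artificial intelligence","exclusionWord":"recruitment",
 *    "status":1,"subjectIds":["s1","s2"],
 *    "startTime":1650000000000,"endTime":1650086400000}
 */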
package com.zzsn.search.extractor;
import com.zzsn.utility.model.FileTag;
import com.zzsn.utility.util.DateUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Extracts images and attachments referenced in the article body.
* Author: Li Dongliang
* Created: 2016-8-30 5:25 PM
* Company: Zhengzhou Shuneng Software Technology Co., Ltd.
* @version 1.0
*
*/
public class ContentFileFinder {
/**
* Returns the parent path of the given path.
* Author: Li Dongliang
* Created: 2015-7-6 3:17 PM
* @version 1.0
* @param path
* @return
*/
public static String getDirPath(String path) {
path = path.substring(0, path.lastIndexOf("/")) ;
return path;
}
/**
* Resolves an image path against the current page URL, removing ./ and ../ segments.
* Author: Li Dongliang
* Created: 2015-7-6 3:43 PM
* @version 1.0
* @param currentPageURL
* @param imgPath
* @return
*/
public static String formatPath(String currentPageURL,String imgPath) {
String start="";
if(currentPageURL.indexOf("http://")!=-1){
start = "http://";
}else if(currentPageURL.indexOf("https://")!=-1){
start = "https://";
}
//absolute path (starts with /)
if(imgPath.startsWith("/")){
currentPageURL = currentPageURL.replace(start, "");
int subIndex = currentPageURL.indexOf("/");
if(subIndex==-1){
subIndex = currentPageURL.length();
}
String domain = currentPageURL.substring(0, subIndex);
return start+domain+imgPath;
}
//relative path
String path = currentPageURL+"/"+imgPath;
path = path.replaceAll(start, "D:/");
File f = new File(path);
String filePath="";
try {
filePath = f.getCanonicalPath();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String result = filePath.replaceAll("D:\\\\", start);
result = result.replaceAll("\\\\", "/");
return result;
}
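/*
 * Editor's sketch (assumption): formatPath resolves a relative image path by temporarily
 * mapping the URL scheme onto a fake "D:/" drive and letting File.getCanonicalPath()
 * collapse "./" and "../" segments, so on Windows (where the D:/ trick resolves as
 * intended) one would expect, for example:
 *
 *   formatPath("http://example.com/news/2020", "../img/a.png")
 *     -> "http://example.com/news/img/a.png"
 *   formatPath("http://example.com/news/2020", "/img/a.png")
 *     -> "http://example.com/img/a.png"
 */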
/**
* Generates the save path for an image file.
* Author: Li Dongliang
* Created: 2016-3-23 2:50 PM
* @version 1.0
* @return
*/
private static String genImgFileName(String suffix){
String dir = DateUtil.format(new Date(), "yyyy-MM-dd");
String uuid = UUID.randomUUID().toString();
return dir+"/"+uuid+suffix;
}
/**
* Ensures the img tag has a src attribute pointing at the real image address.
* Author: Li Dongliang
* Created: 2016-6-6 1:46 PM
* @version 1.0
* @param imgTag
* @return
*/
public static Element ensureSrc(Element imgTag){
// Document doc = Jsoup.parseBodyFragment(rawTag);
String firstSrcAtt=null;
if(imgTag.hasAttr("original")){
firstSrcAtt = "original";
}else if(imgTag.hasAttr("data-src")){
firstSrcAtt = "data-src";
}else if(imgTag.hasAttr("_src")){
firstSrcAtt = "_src";
}else if(imgTag.hasAttr("src")){
firstSrcAtt ="src";
}
if(firstSrcAtt==null){
return null;
}
imgTag.attr("src", imgTag.attr(firstSrcAtt));
return imgTag;
}
/**
* Returns the absolute path of the image.
* Author: Li Dongliang
* Created: 2016-6-6 2:05 PM
* @version 1.0
* @param element
* @param uri
* @return
*/
public static String getAbsolutePath(Element element,String uri,String linkAtt){
String absolutePath = element.attr(linkAtt);
if(absolutePath.startsWith("data:image")){
return null;
}
if(absolutePath.startsWith("file:")){
return null;
}
if (absolutePath.matches("(?i)^javascript.*|#")) {
return null;
}
if(absolutePath!=null && absolutePath.trim().length()>0 && !absolutePath.startsWith("http://")&&!absolutePath.startsWith("https://")&&uri!=null){
String puriDir = getDirPath(uri);
absolutePath = formatPath(puriDir,absolutePath);
}
return absolutePath;
}
/**
* Gets the file name suffix (extension)
* Author: 李东亮
* Created: 2016-08-30 5:00:39 PM
* @version 1.0
* @param uri
* @return
*/
public static String getSuffix(String uri){
uri = uri.replaceAll("http://|https://", "");
Pattern p = Pattern.compile("/.+(\\.\\w{1,4})$");
Matcher m = p.matcher(uri);
if(m.find()){
return m.group(1);
}
return "";
}
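// Illustrative sketch: getSuffix extracts the trailing extension from a URL path (hypothetical URLs).
public static void getSuffixDemo() {
System.out.println(getSuffix("http://example.com/files/report.pdf")); // .pdf
System.out.println(getSuffix("http://example.com/article/123")); // empty string, no extension
}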
/**
* Gets the file tags in the content, including inline images and attachments
* Author: 李东亮
* Created: 2016-09-08 3:01:09 PM
* @version 1.0
* @param content
* @param sourceaddress
* @return
*/
public static Map<String,FileTag> getContentFileTag(String content,String sourceaddress){
Map<String,FileTag> imgMap = new HashMap<String,FileTag>();
if(content==null||content.length()==0){
return imgMap;
}
String rawTag;
String absolutePath;
FileTag fileTag;
String savePath;
Document doc = Jsoup.parse(content);
Elements imgTags = doc.select("img");
Element imgTag;
String suffix = "";
String filePathAttr;
String preFixPath;
for (Iterator<Element> iterator = imgTags.iterator(); iterator.hasNext();) {
fileTag = new FileTag();
imgTag = iterator.next();
rawTag = imgTag.outerHtml().replaceAll("\"", "'").replaceAll("'>", "' \\/>");
if(imgTag.tagName().toLowerCase().equals("img")){
filePathAttr = "src";
//make src point to the attribute that actually holds the image URL
imgTag = ensureSrc(imgTag);
if(imgTag==null){
continue;
}
preFixPath="IMG_SERVER/";
}else
{
filePathAttr="href";
fileTag.setFileName(imgTag.text());
preFixPath="FILE_SERVER/";
}
//resolve the absolute path of the image and point the attribute to it
absolutePath = getAbsolutePath(imgTag,sourceaddress,filePathAttr);
if(absolutePath==null){
continue;
}
imgTag.attr(filePathAttr,absolutePath);
fileTag.setAbsolutePath(absolutePath);
fileTag.setAbsoluteTag(imgTag.outerHtml());
//local save path for the image
suffix = ContentFileFinder.getSuffix(absolutePath);
savePath = genImgFileName(suffix);
fileTag.setSavePath(savePath);
//tag rewritten to point at the save path
imgTag.attr(filePathAttr,preFixPath+fileTag.getSavePath());
fileTag.setSaveTag(imgTag.outerHtml());
//key is the original (raw) tag string
imgMap.put(rawTag, fileTag);
}
return imgMap;
}
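// Illustrative sketch: getContentFileTag maps each (normalized) raw img tag in the content to a
// FileTag holding the resolved download URL and a rewritten tag pointing at the save path.
// The HTML and URL below are hypothetical; getAbsolutePath()/getSaveTag() are assumed to be
// plain bean getters on FileTag, matching the setters used above.
public static void getContentFileTagDemo() {
String content = "<p>text<img src=\"images/a.jpg\"></p>";
String pageUrl = "http://example.com/news/2021/index.html";
Map<String, FileTag> tags = getContentFileTag(content, pageUrl);
for (Map.Entry<String, FileTag> entry : tags.entrySet()) {
System.out.println("raw tag:   " + entry.getKey());
System.out.println("download:  " + entry.getValue().getAbsolutePath());
System.out.println("rewritten: " + entry.getValue().getSaveTag());
}
}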
/**
* Gets the image addresses contained in a web page
* @param htmlStr the HTML string
*/
public static Set<String> getImgStr(String htmlStr) {
Set<String> pics = new HashSet<String>();
String img = "";
Pattern p_image;
Matcher m_image;
String regEx_img = "<img.*src\\s*=\\s*(.*?)[^>]*?>";
p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
m_image = p_image.matcher(htmlStr);
while (m_image.find()) {
// the matched <img /> tag
img = m_image.group();
System.out.println(img);
// extract the src value from the <img> tag
Matcher m = Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)").matcher(img);
while (m.find()) {
pics.add(m.group(1));
}
}
return pics;
}
public static String rmHtmlImgOrAtag(String htmlStr) {
//regular expression matching img tags and <a> tags
String regxpForImgTag = "<img\\s[^>]+/>|<img.*>(.*?)</img>|<a.*>(.*?)</a>|<a.*/>";
Pattern pattern = Pattern.compile(regxpForImgTag);
Matcher matcher = pattern.matcher(htmlStr);
while (matcher.find()) {
String temp = matcher.group();
htmlStr = htmlStr.replace(temp, " ");
}
return htmlStr;
}
public static void main(String[] args) {
String str = "<span></span><img data-src=\"http://static.tianyaui.com/img/static/2011/imgloading.gif\" title=\"点击图片查看幻灯模式\" original2=\"http://img3.laibafile.cn/p/l/246500759.jpg\" /> "
+ "<div>hello<div/><img data-src=\"http://static.tianyaui.com/img/static/2011/imgloading.gif\" title=\\\"点击图片查看幻灯模式\\\" original2=\\\"http://img3.laibafile.cn/p/l/246500759.jpg\\\" >qew</img>";
// System.out.println(ContentFileFinder.getSuffix("http://www.baidu.com//a.xls"));
// getContentFileTag(str,"");
str=rmHtmlImgOrAtag(str);
System.out.println(str);
String aString=str+"<div class=\"_3L5YSq\" title=\"vue兼容ie11的解决方法 2021年最新解决方案\">"
+ "<a class=\"_1-HJSV _1OhGeD\" href=\"/p/4718213d9373\" target=\"_blank\" rel=\"noopener noreferrer\">vue兼容ie11的解决方法 2021年最新解决方案</a>"
+ "<span>qwqe<a href='qwe'/></span</div>";
aString=rmHtmlImgOrAtag(aString);
System.out.println(aString);
}
}
package com.zzsn.search.extractor;
/**
* Default response object
* Author: 李东亮
* Created: 2015-05-13 10:23:02 AM
* Company: 郑州数能软件科技有限公司
* @version 1.0
*
*/
public class DefaultMsg {
private int success = 1;
private String msg;
public DefaultMsg()
{
}
public DefaultMsg(int success)
{
this.success = success;
}
public int getSuccess() {
return success;
}
public void setSuccess(int success) {
this.success = success;
}
public String getMsg() {
return msg;
}
public void setMsg(String msg) {
this.msg = msg;
}
}
package com.zzsn.search.extractor;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.TimeoutException;
public class MemcachedFactory {
private static final Logger Log = LoggerFactory.getLogger(MemcachedFactory.class);
private static ApplicationContext factory;
protected static MemcachedClient client;
/**
* Loads the Spring container
* Author: 李东亮 MemcachedFactory
* Created: 2015-05-30 11:39:31 AM
* @version 1.0
*/
public static void init(){
factory = new ClassPathXmlApplicationContext("conf/spring-memcached.xml");
client = (MemcachedClient )factory.getBean("memcachedClient");
}
/**
* Gets the cached value for a key
* @param key
* @return
*/
public static Object getKey(String key) {
try {
Object obj = client.get(key);
if("null".equals(obj)){
return null;
}
return obj;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return null;
}
/**
* Sets a cache entry with a 10-day expiry
* @param key
* @return
*/
public static boolean setExpireskey(String key, Object value) {
boolean result = false;
try {
if(value==null){
value="null";
}
if (client.get(key) != null ) {
result = client.replace(key, 3600*24*10, value);
} else {
result = client.set(key, 3600*24*10, value);
}
} catch (TimeoutException | InterruptedException | MemcachedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
/**
* Sets a cache entry that never expires (memcached will still expire it after about a month)
* @param key
* @return
*/
public static boolean setKey(String key, Object value) {
boolean result = false;
try {
if(value==null){
value="null";
}
if (client.get(key) != null ) {
result = client.replace(key, 0, value);
} else {
result = client.set(key, 0, value);
}
} catch (TimeoutException | InterruptedException | MemcachedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
/* if (!client.isShutdown()) {
try {
client.shutdown();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}*/
}
return result;
}
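// Illustrative sketch: a simple round trip through the cache helpers above. The key/value are
// hypothetical example values, and running it requires the memcached instance configured in
// conf/spring-memcached.xml, which init() loads.
public static void cacheDemo() {
MemcachedFactory.init();
MemcachedFactory.setExpireskey("demo_key", "demo_value"); // stored with the 10-day TTL
System.out.println(MemcachedFactory.getKey("demo_key"));
}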
public static void setLset(String key,String value){
// client
}
public static void main(String[] args) throws UnsupportedEncodingException {
MemcachedFactory.init();
// DBServiceFactory.init();
Object object=MemcachedFactory.getKey("domainUri_www.ceconline.com");
System.out.println(object);
// List<Site> sites = DBServiceFactory.getCrawlerService().getSnSites();
/* //clear the memcached cache
Map<String,String> map = new HashMap<String,String>();
for(int i=0;i<1000000;i++){
map.put(i+"","adadadadadadadadadadadadadadader");
}
MemcachedFactory.setKey("aa", map);
Map<String,String> m = (Map<String, String>) MemcachedFactory.getKey("aa");
System.out.println(m.size());*/
//MemcachedFactory.flushAll();
}
}
package com.zzsn.search.google;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.search.util.PublishDateUtil;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.util.ChromeUtil;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class GoogleRecorderUtil {
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> CatchWebOfGoogle(List<String> urlList, String charset, Long orgId, Long tid) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(1000*5);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
String docstr= ChromeUtil.getChromeDoc(urlList.get(i),((i)*20+180)+"");
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("div[class=g]");
//if there are no results, stop looping
if(firstElementsLink.size()==0){
break;
}
String info = doc.toString();
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
Elements e=firstElementsLink.get(j).select("h3");
Elements a=firstElementsLink.get(j).select("a");
Elements timespan=firstElementsLink.get(j).select("span[class=f]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
if(timespan.size()>0){
System.out.println(timespan.get(0).text());
if(timespan.get(0).text().contains("hours")) {
catchWebByMetaSearch.setPublishDate(DatePaserUtil.getCreateDate());
}else if(timespan.get(0).text().contains("day")) {
int day=DatePaserUtil.getIntstr(timespan.get(0).text());
catchWebByMetaSearch.setPublishDate(DatePaserUtil.getDateBeforesomdat(day));
}
else if(timespan.get(0).text().contains("ago")) {
catchWebByMetaSearch.setPublishDate(DatePaserUtil.getCreateDate());
}
else{
String date= PublishDateUtil.getPublishDate(timespan.get(0).text());
System.out.println(date);
catchWebByMetaSearch.setPublishDate(date);
}
System.out.println(catchWebByMetaSearch.getPublishDate());
}
metaSearchList.add(catchWebByMetaSearch);
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> catchWebOfGoogleList(
List<String> urlList, String charset, Long orgId, Long tid,KafkaTemplate kafkaTemplate) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(2000);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
System.out.println(urlList.get(i));
// String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20)+"");
String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//if there are no results, stop looping
if(firstElementsLink.size()==0){
break;
}
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
//title
// Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//link
Elements a=firstElementsLink.get(j).select("a");
//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
//catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//origin (source site)
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
for (CatchWebByMetaSearch catchWebByMetaSearch:catchWebByMetaSearchList){
ObjectMapper mapper = new ObjectMapper();
try {
String docjson = mapper.writeValueAsString(catchWebByMetaSearch);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
log.info("发送到kafka成功。");
}catch (Exception e){
log.info(e.getMessage());
}
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
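// Illustrative sketch: applies the same Jsoup selection used above to a minimal, hypothetical HTML
// fragment mirroring the Google News card markup this parser expects (the g-card element and class
// names are whatever Google served at the time and may change).
public static void parseCardDemo() {
String html = "<g-card class=\"ftSUBd\">"
+ "<a href=\"http://example.com/article\">"
+ "<div class=\"mCBkyc y355M JQe2Ld nDgy9d\">Example title</div>"
+ "<div class=\"CEMjEf NUnG9d\">Example Source</div>"
+ "</a></g-card>";
Document doc = Jsoup.parse(html);
Elements cards = doc.select("g-card[class=ftSUBd]");
for (int j = 0; j < cards.size(); j++) {
String title = cards.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]").text();
String link = cards.get(j).select("a").get(0).attr("href");
String origin = cards.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
System.out.println(title + " | " + link + " | " + origin);
}
}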
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> CatchWebOfGoogle1(
List<String> urlList, String charset, Long orgId, Long tid) {
try {
List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
for (int i = 0; i < urlList.size(); i++) {
Thread.sleep(1000*5);
CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
Document doc = null;
System.out.println(urlList.get(i));
String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20+0)+"");
if(docstr==null){
continue;
}
doc=Jsoup.parse(docstr);
Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
//if there are no results, stop looping
if(firstElementsLink.size()==0){
break;
}
String info = doc.toString();
for (int j = 0; j < firstElementsLink.size(); j++) {
catchWebByMetaSearch= new CatchWebByMetaSearch();
// System.out.println(firstElementsLink.get(j).toString());
//title
// Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
//link
Elements a=firstElementsLink.get(j).select("a");
//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
System.out.println(e.get(0).text());
System.out.println(a.get(0).attr("href"));
catchWebByMetaSearch.setTid(tid);
catchWebByMetaSearch.setSummary(urlList.get(i));
//catchWebByMetaSearch.setOrgId(orgId);
catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
catchWebByMetaSearch.setTitle(e.get(0).text());
//origin (source site)
String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
catchWebByMetaSearch.setSourcesite(origin);
metaSearchList.add(catchWebByMetaSearch);
}
catchWebByMetaSearchList.addAll(metaSearchList);
}
return catchWebByMetaSearchList;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
}
package com.zzsn.search.google;
import com.zzsn.search.FileUtil;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Google search
* 1. Request the search results page for each keyword.
* 2. Save the captured content to the local database table.
*/
public class GoogleSearch {
public static void main(String[] args) throws IOException {
// String filepath=args[0];
String filepath="E:\\crawlerWorkspace\\metaSearch\\data\\googlekeyword.txt";
File f = new File(filepath);
List<String> allLines = FileUtil.getFileLines(f, "utf-8");
paser(allLines);
}
public static void paser(List<String> keywords){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<GoogleSearchThread> workers = new Vector<GoogleSearchThread>();
int index = 0;
try {
for (List<String> keywordList : splitList) {
// BaiduSearchThread worker = new BaiduSearchThread();
GoogleSearchThread baiduSearchThread = new GoogleSearchThread();
baiduSearchThread.setThreadId(index++);
baiduSearchThread.setKeywords(keywordList);
workers.add(baiduSearchThread);
threadPool.execute(baiduSearchThread);
Thread.sleep(1000);
}
}catch (Exception e){
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
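// Illustrative sketch: an equivalent way to wait for the pool used above to finish, using
// awaitTermination instead of the polling loop; the 1-hour bound is an arbitrary example value.
public static void awaitPool(ExecutorService threadPool) {
threadPool.shutdown();
try {
if (!threadPool.awaitTermination(1, java.util.concurrent.TimeUnit.HOURS)) {
System.out.println("pool did not finish in time");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}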
// split the list into sublists of at most len elements
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
}
package com.zzsn.search.googleThread;
import com.google.gson.Gson;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Google search
* 1. Request the search results page for each keyword.
* 2. Save the captured content to the local database table.
*/
public class DetailGoogleSearch {
public static void main(String[] args) throws IOException {
String filepath= Constants.META_SEARCH_KEYWORDPATH;
//1. create the Kafka consumer
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
try{
while(true){
//The consumer is a long-running program that keeps polling Kafka for data; calling consumer.wakeup() from another thread exits the loop
//Wait 0ms for the broker to return data; the timeout controls how long poll may block before returning, whether or not data is available
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
CatchWebByMetaSearch catchWebByMetaSearch = new Gson().fromJson(record.value().toString(), CatchWebByMetaSearch.class);
DetailGoogleSearchThread detailBaiduSearchThread=new DetailGoogleSearchThread();
detailBaiduSearchThread.catchWebByMetaSearch=catchWebByMetaSearch;
detailBaiduSearchThread.crawler();
//create a thread pool with a fixed number of threads
// TimeUnit.SECONDS.sleep(120);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
}
public static String dateToStamp(String s) {
String res="";
//date format pattern
try {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
Date date = simpleDateFormat.parse(s);
long ts = date.getTime()/1000;
res = String.valueOf(ts);
}catch (Exception e){
}
return res;
}
public static void paser(List<String> keywords,String startTime,String endTime){
List<List<String>> splitList = splitList(keywords,5000);
ExecutorService threadPool = Executors.newFixedThreadPool(1);
Vector<WebGoogleSearchThread> workers = new Vector<WebGoogleSearchThread>();
int index = 0;
try {
for (List<String> keywordList : splitList) {
// BaiduSearchThread worker = new BaiduSearchThread();
WebGoogleSearchThread baiduSearchThread = new WebGoogleSearchThread();
baiduSearchThread.setThreadId(index++);
baiduSearchThread.setKeywords(keywordList);
baiduSearchThread.setStartTime(startTime);
baiduSearchThread.setEndTime(endTime);
workers.add(baiduSearchThread);
threadPool.execute(baiduSearchThread);
Thread.sleep(1000);
}
}catch (Exception e){
System.out.println(e.getMessage());
}
threadPool.shutdown();
while (true) {
boolean isfinished = threadPool.isTerminated();
if (isfinished)
break;
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
System.out.println(e.getMessage());
}
}
}
// split the list into sublists of at most len elements
public static <T> List<List<T>> splitList(List<T> list, int len) {
if (list == null || list.size() == 0 || len < 1) {
return null;
}
List<List<T>> result = new ArrayList<List<T>>();
int size = list.size();
int count = (size + len - 1) / len;
for (int i = 0; i < count; i++) {
List<T> subList = list.subList(i * len, ((i + 1) * len > size ? size : len * (i + 1)));
result.add(subList);
}
return result;
}
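// Illustrative sketch: splitList partitions a list into consecutive sublists of at most len
// elements; e.g. 7 keywords with len 3 yield groups of sizes 3, 3 and 1 (example values only).
public static void splitListDemo() {
List<String> keywords = Arrays.asList("a", "b", "c", "d", "e", "f", "g");
List<List<String>> groups = splitList(keywords, 3);
for (List<String> group : groups) {
System.out.println(group);
}
}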
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
System.out.println(Constants.KAFKA_CONSUMER_SERVERS);
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//how data is read from Kafka (offset reset policy)
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//set the max poll interval to 1 hour
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
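// Illustrative sketch: the usual manual-commit loop shape for a consumer built with createConsumer --
// poll, process every record, then commitSync -- so offsets are committed only after the records
// have been handled. The topic and poll timeout below are example values.
public static void consumeLoopSketch() {
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC));
while (true) {
ConsumerRecords<String, String> records = consumer.poll(1000);
for (ConsumerRecord<String, String> record : records) {
System.out.println(record.value());
}
consumer.commitSync();
}
}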
}