Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
M
meta_crawler
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
刘伟刚
meta_crawler
Commits
7053d936
提交
7053d936
authored
9月 06, 2022
作者:
liuweigang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
采集代码更新5
上级
80a9df55
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
314 行增加
和
0 行删除
+314
-0
MetaBaiduSearchThread.java
.../src/main/java/com/zzsn/search/MetaBaiduSearchThread.java
+204
-0
RecorderUtil.java
...arch/src/main/java/com/zzsn/search/util/RecorderUtil.java
+110
-0
没有找到文件。
baidu_search/src/main/java/com/zzsn/search/MetaBaiduSearchThread.java
浏览文件 @
7053d936
...
...
@@ -398,6 +398,210 @@ public class MetaBaiduSearchThread implements Runnable {
}
return
repeat
;
}
// Fetch news content
/**
 * Downloads the detail page for every Baidu search hit in {@code catchWebList},
 * extracts the article (using a cached per-domain site template when one exists,
 * otherwise the generic {@code StandardWebExtractorHandler}), converts it to a
 * {@code ClbAnsProcessitem} and publishes the JSON to Kafka.
 *
 * <p>URLs already present in the Redis set {@code "baidu::" + wordsCode} are
 * counted as duplicates and skipped; PDF/download links are skipped without
 * being counted.
 *
 * <p>NOTE(review): reads the instance field {@code keywordMsg} — assumes it is
 * populated before this method runs; confirm with callers.
 *
 * @param catchWebList search hits (title/URL/date/...) to download and process
 * @param keyword      keyword string stored on each produced DocInfo
 * @return number of hits skipped as duplicates (already in the Redis set)
 */
public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList, String keyword) {
    int repeat = 0;
    try {
        // Number of items that passed title/content validation this run.
        int count = 0;
        for (int i = 0; i < catchWebList.size(); i++) {
            try {
                CatchWebByMetaSearch cwbm = catchWebList.get(i);
                // Check whether this URL already exists in the cache pool.
                String orgId = String.valueOf(keywordMsg.getWordsCode());
                try {
                    boolean sismember = JedisUtil.sismember("baidu::" + orgId, cwbm.getSourceaddress());
                    if (sismember) {
                        log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress());
                        repeat++;
                        continue;
                    }
                } catch (Exception e) {
                    // Redis lookup failed: treat the URL as new and keep going.
                    log.info("redis获取信息失败");
                }
                // try {
                //     String urlflag = JedisUtil.getString( Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress());
                //     if (!org.apache.commons.lang3.StringUtils.isEmpty(urlflag)) {
                //         log.info(cwbm.getSourceaddress()+" 数据重复");
                //         repeat++;
                //         continue;
                //     }
                // }catch (Exception e){
                //     log.info("redis获取信息失败");
                // }
                String infourl = cwbm.getSourceaddress();
                String infodata = "";
                String charset = "";
                System.out.println(cwbm.getTitle() + "==" + infourl);
                // Skip blank URLs and direct file links (PDF / download).
                if (infourl == null || infourl.contains(".pdf") || infourl.trim().length() == 0
                        || infourl.contains(".PDF") || infourl.contains("download")) {
                    continue;
                }
                infodata = getContentByUrl(infourl);
                // Test fetching the content through a simulated browser:
                // infodata= ChromeUtil.getChromeDoc(infourl);
                // String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
                // Fallback 1: fetch via Selenium when the plain request came back empty.
                if (StringUtils.isEmpty(infodata)) {
                    SeleniumTime seleniumTime = new SeleniumTime();
                    infodata = seleniumTime.getScopehtml(infourl);
                }
                // Fallback 2: wait 5s, then retry Selenium once more.
                if (StringUtils.isEmpty(infodata)) {
                    try {
                        Thread.sleep(1000 * 5);
                        SeleniumTime seleniumTime2 = new SeleniumTime();
                        infodata = seleniumTime2.getScopehtml(infourl);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                // Toutiao pages need a dedicated request path.
                if (infourl.contains("toutiao.com") && (null == infodata || infodata.length() < 50)) {
                    infodata = RequestUtil.getTaotiaoData(infourl);
                }
                if (StringUtils.isEmpty(infodata)) {
                    System.out.println("122222222222222222222222/为空,则爬取下一个");
                    // Empty — crawl the next one.
                    continue;
                }
                String contentCharset = Utility.getWebEncodingByStr(infodata);
                String content = null;
                if (infodata != null && charset != null && contentCharset != null) {
                    // content = Utility.convertCharset(infodata, charset,charset);
                    content = infodata;
                }
                if (content != null) {
                    cwbm.setCharset(charset);
                    cwbm.setLastModify("");
                    cwbm.setContent(content);
                }
                // Build the DocInfo carrier from the search-hit metadata.
                DocInfo docInfo = new DocInfo();
                docInfo.setContentType("HTML");
                docInfo.setOrgId(cwbm.getOrgId());
                docInfo.setSid(cwbm.getSid());
                docInfo.setSourceType("News");
                docInfo.setLastModified(cwbm.getLastModify());
                docInfo.setCharset("utf-8");
                docInfo.setSourceaddress(cwbm.getSourceaddress());
                docInfo.setTitle(cwbm.getTitle().replace("...", ""));
                docInfo.setAuthor(cwbm.getAuthor());
                docInfo.setPublishDate(cwbm.getPublishDate());
                docInfo.setOrigin(cwbm.getSourcesite());
                docInfo.setKeywords(keyword);
                docInfo.setSummary(cwbm.getSummary());
                StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
                try {
                    // Check whether a parsing template exists for this domain.
                    if (infourl.contains("qq.com") && !infourl.contains("://new.qq.com")) {
                        infourl = transqqURl(infourl);
                    }
                    String domainurl = new URL(infourl).getHost();
                    Object siteTempObj = MemcachedUtils.get("domainUri_" + domainurl);
                    SiteTemplate siteTemplate = new SiteTemplate();
                    if (siteTempObj != null && !"null".equals(siteTempObj)) {
                        Site site = (Site) siteTempObj;
                        siteTemplate.setMatchTitle(site.getMatchTitle());
                        siteTemplate.setMatchAuthor(site.getMatchAuthor());
                        siteTemplate.setMatchContent(site.getMatchContent());
                        siteTemplate.setMatchOrigin(site.getMatchOrigin());
                        siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
                        siteTemplate.setMatchSummary(site.getMatchSummary());
                        System.out.println("1++++++++doPaserByTag");
                        docInfo = SourceTemplateByTag.doPaserByTag(content, docInfo, siteTemplate);
                    }
                    if (null != docInfo.getContentWithTag()) {
                        System.out.println("使用模板解析内容成功" + domainurl);
                        log.info("使用模板解析内容成功" + domainurl);
                    }
                    // Template missing or produced nothing: record the site as
                    // template-less and fall back to the generic extractor.
                    if (null == docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
                        SourceTemplateByTag.saveNoTempSite(cwbm);
                        swe.doHandler(content, docInfo);
                    }
                } catch (Exception e1) {
                    log.info("模板解析异常" + e1.getMessage());
                    SourceTemplateByTag.saveNoTempSite(cwbm);
                    swe.doHandler(content, docInfo);
                }
                System.out.println(docInfo.getTitle() + "---" + docInfo.getSourceaddress());
                docInfo.setFileDownLoadPath(null);
                Map<String, String> params = new HashMap<String, String>();
                params.put("fromWhere", "百度元搜索");
                if (null != cwbm.getTid()) {
                    params.put("tid", String.valueOf(cwbm.getTid()));
                }
                docInfo.setOtherParams(params);
                // Only forward items that have both a title and extracted text.
                if (docInfo.getTitle() != null && docInfo.getTitle().trim().length() > 0
                        && docInfo.getContentNoTag() != null && docInfo.getContentNoTag().trim().length() > 0) {
                    if (docInfo.getPublishDate() != null && docInfo.getPublishDate().trim().length() > 0) {
                        ContentFileResult contentFileResult = new ContentFileResult();
                        try {
                            // Rewrite embedded images/anchors inside the extracted content.
                            contentFileResult = getContentFile(docInfo.getContentWithTag(), docInfo.getSourceaddress());
                            // docInfo.setContentWithTag(contentFileResult.getContentImgCvtTag());
                            docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                            docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
                            Map<String, FileTag> imgMap = contentFileResult.getFileMap();
                            for (String keyImg : imgMap.keySet()) {
                                FileTag fileTag = imgMap.get(keyImg);
                                String savePath = fileTag.getSavePath();
                                InputStream is = getImg(fileTag.getAbsolutePath());
                                if (is != null) {
                                    int size = is.available();
                                    if (size > 0) {
                                        // mqSender.sendFile(is, true, savePath);
                                    }
                                }
                            }
                        } catch (Exception e) {
                            // TODO Auto-generated catch block
                            // e.printStackTrace();
                            log.info(e.getMessage());
                        }
                        System.out.println(docInfo.getTitle() + "---" + docInfo.getSourceaddress());
                        log.info("title:" + docInfo.getTitle() + "|address:" + docInfo.getSourceaddress()
                                + "|content:" + (docInfo.getContentNoTag() == null ? "" : docInfo.getContentNoTag().length() + ""));
                        // intsertData(docInfo);
                        // Convert to the transport item and publish to Kafka.
                        ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
                        ObjectMapper mapper = new ObjectMapper();
                        String docjson = mapper.writeValueAsString(processitem);
                        System.out.println(docjson);
                        kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
                        log.info("发送成功到kafka");
                    } else {
                        log.info("资讯发布时间:" + docInfo.getPublishDate());
                    }
                    count++;
                } else {
                    log.info("资讯内容:" + docInfo.getContentNoTag());
                    // System.out.println(docInfo.getContentNoTag());
                }
                // Add to the cache pool. NOTE(review): the -1 presumably means
                // "no expiry" — confirm against JedisUtil.setString semantics.
                JedisUtil.setString(Constants.SOURCEADDRESS + "_" + orgId + "_" + cwbm.getSourceaddress(), "1", -1);
                System.out.println("加入缓存池");
                Thread.sleep(5000);
            } catch (Exception e) {
                // Per-item failure: log and move on to the next hit.
                log.info("访问出错" + e.getMessage());
                continue;
            }
        }
        System.out.println("本次成功件数:" + count);
        log.info("本次成功件数:" + count);
    } catch (Exception e) {
        log.info("访问出错" + e.getMessage());
    }
    return repeat;
}
// Parameterized SQL template (11 '?' placeholders) for inserting a processed
// item into the cis_ans_processitem table.
static String insertSql = "insert into cis_ans_processitem "
        + " (id,sid, title,summary,publish_date,origin,author, content,words,keywords,sourceaddress) "
        + " values(?,?,?,?,?,?,?,?,?,?,?)";
...
...
baidu_search/src/main/java/com/zzsn/search/util/RecorderUtil.java
浏览文件 @
7053d936
...
...
@@ -412,6 +412,116 @@ public class RecorderUtil {
}
// return null;
}
/**
 * Crawls each Baidu result-page URL in {@code urlList}, parses the result
 * entries (title, source URL, publish date, origin site), immediately crawls
 * each page's detail links via {@link MetaBaiduSearchThread#CatchWebNews},
 * and returns all parsed entries.
 *
 * <p>Pages are fetched either through a proxy (when {@code Constants.PROXY}
 * equals {@code "1"}) or with Selenium; an empty response is retried once
 * after a 5s delay, and an empty result list is retried once after 1s.
 * Crawling stops early when more than 60% of a page's entries were already
 * seen (duplicates reported by the detail crawler).
 *
 * @param urlList  Baidu search result page URLs to crawl
 * @param charset  unused here; kept for interface compatibility
 * @param orgId    organisation id copied onto every parsed entry
 * @param tid      task id copied onto every entry (also stored as sid)
 * @param keywords keyword string passed through to the detail crawler
 * @return all entries parsed from the crawled pages (possibly empty)
 */
public static List<CatchWebByMetaSearch> catchWebOfBaiduList(List<String> urlList, String charset,
        Long orgId, Long tid, String keywords) {
    List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
    try {
        for (int i = 0; i < urlList.size(); i++) {
            String docstr = "";
            if (Constants.PROXY.equals("1")) {
                // Proxy mode: normalize/encode the URL before the proxied request.
                log.info("使用代理访问百度链接");
                try {
                    URL url = new URL(urlList.get(i));
                    URI uri = null;
                    String uri_code = "";
                    try {
                        uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
                        // Undo double-encoding artifacts and encode spaces as '+'.
                        uri_code = Utility.encodURI(uri.toString())
                                .replaceAll("%2520", "+").replaceAll("%25", "%")
                                .replaceAll("%20", "+");
                    } catch (URISyntaxException e) {
                        log.info("url处理异常!");
                    }
                    docstr = proxyRequest(uri_code);
                    log.info("请求内容:" + docstr);
                } catch (Exception e) {
                    log.info("使用代理请求异常");
                }
            } else {
                // Direct mode: render the page with Selenium.
                SeleniumTime seleniumTime = new SeleniumTime();
                docstr = seleniumTime.getScopehtml(urlList.get(i));
            }
            // Retry once after 5s when the page came back empty.
            if (StringUtils.isEmpty(docstr)) {
                try {
                    Thread.sleep(1000 * 5);
                    SeleniumTime seleniumTime2 = new SeleniumTime();
                    docstr = seleniumTime2.getScopehtml(urlList.get(i));
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt(); // preserve interrupt status
                    e.printStackTrace();
                }
            }
            Document doc = Jsoup.parse(docstr);
            System.out.println("----百度搜索----" + urlList.get(i));
            Elements firstElementsLink = doc.select("div[id=\"content_left\"]>div.new-pmd");
            List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
            CatchWebByMetaSearch catchWebByMetaSearch = null;
            // No result entries found: re-render once and re-select.
            if (firstElementsLink.size() < 1) {
                try {
                    Thread.sleep(1000);
                    SeleniumTime seleniumTime2 = new SeleniumTime();
                    docstr = seleniumTime2.getScopehtml(urlList.get(i));
                    doc = Jsoup.parse(docstr);
                    firstElementsLink = doc.select("div[id=\"content_left\"]>div.new-pmd");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            for (int m = 0; m < firstElementsLink.size(); m++) {
                try {
                    catchWebByMetaSearch = new CatchWebByMetaSearch();
                    Elements dateEle = firstElementsLink.get(m).select("span[class=\"c-color-gray2\"]");
                    Elements orgain = firstElementsLink.get(m).select("span.c-color-gray");
                    // Publish date
                    if (dateEle.size() > 0) {
                        String publishDate = DateUtil.getPublishDate(dateEle.get(0).text());
                        catchWebByMetaSearch.setPublishDate(publishDate);
                    }
                    // Origin site
                    if (orgain.size() > 0) {
                        catchWebByMetaSearch.setSourcesite(orgain.get(0).text());
                    }
                    Elements titleAndUrl = firstElementsLink.get(m).select("a[data-click]");
                    if (titleAndUrl.size() > 0) {
                        // Title
                        String title = titleAndUrl.get(0).text().trim();
                        catchWebByMetaSearch.setTitle(title);
                        // Source address: prefer the "mu" attribute, fall back to href.
                        String addressurl = firstElementsLink.get(m).attr("mu");
                        if (StringUtils.isEmpty(addressurl)) {
                            addressurl = titleAndUrl.attr("href");
                        }
                        catchWebByMetaSearch.setSourceaddress(addressurl);
                        System.out.println(addressurl);
                        if (StringUtils.isNotEmpty(addressurl)) {
                            catchWebByMetaSearch.setOrgId(orgId);
                            catchWebByMetaSearch.setTid(tid);
                            catchWebByMetaSearch.setSid(tid);
                            metaSearchList.add(catchWebByMetaSearch);
                        }
                    }
                } catch (Exception e) {
                    // Skip malformed result entries.
                    continue;
                }
            }
            // Crawl the detail pages of the entries parsed from this list page.
            MetaBaiduSearchThread baiduSearchThread = new MetaBaiduSearchThread();
            int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords);
            // FIX: was `repeat / metaSearchList.size() > 0.6` — integer division
            // made the 60% duplicate threshold behave as 100%, and an empty
            // metaSearchList threw an uncaught ArithmeticException.
            if (!metaSearchList.isEmpty() && (double) repeat / metaSearchList.size() > 0.6) {
                break; // mostly duplicates: stop paging
            }
            catchWebByMetaSearchList.addAll(metaSearchList);
        }
        return catchWebByMetaSearchList;
    } catch (ParseException e) {
        // Date parsing failed: return whatever was collected so far.
        return catchWebByMetaSearchList;
    }
}
// 提取百度新闻列表及详情URL
@SuppressWarnings
(
"deprecation"
)
public
static
List
<
CatchWebByMetaSearch
>
CatchWebDetailOfBaidu
(
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论