提交 7053d936 作者: liuweigang

采集代码更新5

上级 80a9df55
......@@ -412,7 +412,117 @@ public class RecorderUtil {
}
// return null;
}
/**
 * Extracts the Baidu news result list and the detail URL of every entry.
 *
 * <p>For each URL in {@code urlList} the page HTML is fetched (through
 * {@code proxyRequest} when {@code Constants.PROXY} equals {@code "1"}, otherwise
 * through {@code SeleniumTime.getScopehtml}), the result cards are parsed into
 * {@link CatchWebByMetaSearch} objects, and the parsed batch is handed to
 * {@code MetaBaiduSearchThread.CatchWebNews}. When the value returned by that call
 * exceeds 60% of the batch size, the loop stops and that batch is not added to
 * the result.
 *
 * @param urlList  Baidu result-page URLs to visit, in order; {@code null} yields an empty list
 * @param charset  not used by this method; kept for interface compatibility
 * @param orgId    value stored on every collected entry via {@code setOrgId}
 * @param tid      value stored on every collected entry via {@code setTid} and {@code setSid}
 * @param keywords passed through unchanged to {@code CatchWebNews}
 * @return the entries collected from all accepted pages; never {@code null}
 */
public static List<CatchWebByMetaSearch> catchWebOfBaiduList(
        List<String> urlList, String charset, Long orgId, Long tid, String keywords) {
    // Selector for a single result card inside the Baidu result column.
    final String resultSelector = "div[id=\"content_left\"]>div.new-pmd";
    List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
    if (urlList == null) {
        return catchWebByMetaSearchList;
    }
    try {
        for (int i = 0; i < urlList.size(); i++) {
            String pageUrl = urlList.get(i);
            String docstr = "";
            if (Constants.PROXY.equals("1")) { // proxy mode enabled
                log.info("使用代理访问百度链接");
                try {
                    URL url = new URL(pageUrl);
                    String uri_code = "";
                    try {
                        URI uri = new URI(url.getProtocol(), url.getHost(),
                                url.getPath(), url.getQuery(), null);
                        uri_code = Utility.encodURI(uri.toString())
                                .replaceAll("%2520", "+").replaceAll("%25", "%")
                                .replaceAll("%20", "+");
                    } catch (URISyntaxException e) {
                        log.info("url处理异常!");
                    }
                    docstr = proxyRequest(uri_code);
                    log.info("请求内容:" + docstr);
                } catch (Exception e) {
                    log.info("使用代理请求异常");
                }
            } else {
                docstr = new SeleniumTime().getScopehtml(pageUrl);
            }

            // First fallback: nothing came back, wait 5 seconds and fetch again with Selenium.
            if (StringUtils.isEmpty(docstr)) {
                try {
                    Thread.sleep(1000 * 5);
                    docstr = new SeleniumTime().getScopehtml(pageUrl);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and stop crawling; return what was collected.
                    Thread.currentThread().interrupt();
                    log.info("采集线程被中断,停止采集");
                    return catchWebByMetaSearchList;
                }
            }

            // Jsoup.parse rejects null, so treat a missing page as an empty document.
            Document doc = Jsoup.parse(docstr == null ? "" : docstr);
            System.out.println("----百度搜索----" + pageUrl);
            Elements firstElementsLink = doc.select(resultSelector);

            // Second fallback: page has no result cards, wait 1 second and fetch once more.
            if (firstElementsLink.size() < 1) {
                try {
                    Thread.sleep(1000);
                    docstr = new SeleniumTime().getScopehtml(pageUrl);
                    doc = Jsoup.parse(docstr == null ? "" : docstr);
                    firstElementsLink = doc.select(resultSelector);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    log.info("采集线程被中断,停止采集");
                    return catchWebByMetaSearchList;
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
            for (int m = 0; m < firstElementsLink.size(); m++) {
                try {
                    org.jsoup.nodes.Element card = firstElementsLink.get(m);
                    CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

                    // Publish time
                    Elements dateEle = card.select("span[class=\"c-color-gray2\"]");
                    if (dateEle.size() > 0) {
                        String publishDate = DateUtil.getPublishDate(dateEle.get(0).text());
                        catchWebByMetaSearch.setPublishDate(publishDate);
                    }
                    // Source site
                    Elements orgain = card.select("span.c-color-gray");
                    if (orgain.size() > 0) {
                        catchWebByMetaSearch.setSourcesite(orgain.get(0).text());
                    }

                    Elements titleAndUrl = card.select("a[data-click]");
                    if (titleAndUrl.size() > 0) {
                        // Title
                        String title = titleAndUrl.get(0).text().trim();
                        catchWebByMetaSearch.setTitle(title);
                        // Source address: the card's "mu" attribute, else the link's href.
                        String addressurl = card.attr("mu");
                        if (StringUtils.isEmpty(addressurl)) {
                            addressurl = titleAndUrl.attr("href");
                        }
                        catchWebByMetaSearch.setSourceaddress(addressurl);
                        System.out.println(addressurl);
                        // Only entries with a usable address are kept.
                        if (StringUtils.isNotEmpty(addressurl)) {
                            catchWebByMetaSearch.setOrgId(orgId);
                            catchWebByMetaSearch.setTid(tid);
                            catchWebByMetaSearch.setSid(tid);
                            metaSearchList.add(catchWebByMetaSearch);
                        }
                    }
                } catch (Exception e) {
                    // A single broken card must not abort the page; record it and move on.
                    log.info("解析百度结果条目异常:" + e);
                }
            }

            // Nothing parsed from this page: skip it. This also avoids dividing by zero below.
            if (metaSearchList.isEmpty()) {
                continue;
            }

            // Parse the details of each list right after the list itself is collected.
            MetaBaiduSearchThread baiduSearchThread = new MetaBaiduSearchThread();
            int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords);
            // Compare as a ratio; the former int/int division only fired at 100%.
            if (repeat > metaSearchList.size() * 0.6) {
                break;
            }
            catchWebByMetaSearchList.addAll(metaSearchList);
        }
        return catchWebByMetaSearchList;
    } catch (ParseException e) {
        return catchWebByMetaSearchList;
    }
}
// 提取百度新闻列表及详情URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> CatchWebDetailOfBaidu(
List<String> urlList, String charset, Long orgId, Long tid,String keyword) {
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论