提交 7053d936 作者: liuweigang

采集代码更新5

上级 80a9df55
......@@ -398,6 +398,210 @@ public class MetaBaiduSearchThread implements Runnable {
}
return repeat;
}
/**
 * Crawls the detail page behind every search hit, parses it into a
 * {@code DocInfo} and publishes the result to Kafka.
 *
 * <p>For each entry the method:
 * <ol>
 *   <li>skips it (and counts it as a repeat) when its source address is a member of the
 *       Redis set {@code "baidu::" + keywordMsg.getWordsCode()};</li>
 *   <li>skips it when the address is null, blank, or contains ".pdf", ".PDF" or "download";</li>
 *   <li>downloads the page HTML (see {@link #fetchDetailHtml(String)});</li>
 *   <li>parses it with the site template cached under {@code "domainUri_" + host} when one
 *       exists, otherwise (or on failure) with {@code StandardWebExtractorHandler};</li>
 *   <li>sends the item as JSON to {@code Constants.KAFKA_PRODUCT_TOPIC} when title, tag-free
 *       content and publish date are all non-blank;</li>
 *   <li>writes a Redis marker for the address and sleeps five seconds.</li>
 * </ol>
 *
 * <p>A failure on one entry is logged and the loop moves on. If the thread is interrupted
 * the interrupt flag is restored and the loop stops early.
 *
 * @param catchWebList search hits to crawl
 * @param keyword      keyword stored on every produced document
 * @return number of entries skipped because they were already in the Redis set
 */
public int CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) {
    int repeat=0;
    try {
        int count = 0;
        // One mapper per call is enough; building one per item is wasted work.
        ObjectMapper mapper = new ObjectMapper();
        for (int i = 0; i < catchWebList.size(); i++) {
            try {
                CatchWebByMetaSearch cwbm = catchWebList.get(i);
                String orgId = String.valueOf(keywordMsg.getWordsCode());
                // Duplicate check against the Redis set; a Redis failure must not stop the crawl.
                // NOTE(review): the check reads the set "baidu::<orgId>" but the marker written
                // at the end of this loop is a plain string key (SOURCEADDRESS_<orgId>_<url>).
                // Unless another component fills the set, freshly crawled URLs are never
                // recognised as duplicates here - confirm.
                try {
                    if (JedisUtil.sismember("baidu::"+orgId, cwbm.getSourceaddress())) {
                        log.info("百度采集信息重复:" + cwbm.getTitle() + " :" + cwbm.getSourceaddress());
                        repeat++;
                        continue;
                    }
                } catch (Exception e) {
                    log.info("redis获取信息失败");
                }

                String infourl = cwbm.getSourceaddress();
                System.out.println(cwbm.getTitle()+"=="+infourl);
                if (infourl == null || infourl.contains(".pdf") || infourl.trim().length()==0|| infourl.contains(".PDF")||infourl.contains("download")) {
                    continue;
                }

                String infodata = fetchDetailHtml(infourl);
                if(StringUtils.isEmpty(infodata)){
                    System.out.println("122222222222222222222222/为空,则爬取下一个");
                    // Nothing could be downloaded: move on to the next hit.
                    continue;
                }

                // The page is only accepted as content when an encoding could be detected for it.
                String contentCharset = Utility.getWebEncodingByStr(infodata);
                String content = null;
                if (contentCharset != null) {
                    content = infodata;
                    cwbm.setCharset("");
                    cwbm.setLastModify("");
                    cwbm.setContent(content);
                }

                DocInfo docInfo = new DocInfo();
                docInfo.setContentType("HTML");
                docInfo.setOrgId(cwbm.getOrgId());
                docInfo.setSid(cwbm.getSid());
                docInfo.setSourceType("News");
                docInfo.setLastModified(cwbm.getLastModify());
                docInfo.setCharset("utf-8");
                docInfo.setSourceaddress(cwbm.getSourceaddress());
                docInfo.setTitle(cwbm.getTitle().replace("...", ""));
                docInfo.setAuthor(cwbm.getAuthor());
                docInfo.setPublishDate(cwbm.getPublishDate());
                docInfo.setOrigin(cwbm.getSourcesite());
                docInfo.setKeywords(keyword);
                docInfo.setSummary(cwbm.getSummary());

                StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
                try {
                    // qq.com links outside new.qq.com are rewritten before the host lookup.
                    if(infourl.contains("qq.com") && !infourl.contains("://new.qq.com")){
                        infourl= transqqURl(infourl);
                    }
                    // Look for a parsing template registered for this host.
                    String domainurl = new URL(infourl).getHost();
                    Object siteTempObj = MemcachedUtils.get("domainUri_"+domainurl);
                    SiteTemplate siteTemplate=new SiteTemplate();
                    if (siteTempObj != null && !"null".equals(siteTempObj)) {
                        Site site=(Site)siteTempObj;
                        siteTemplate.setMatchTitle(site.getMatchTitle());
                        siteTemplate.setMatchAuthor(site.getMatchAuthor());
                        siteTemplate.setMatchContent(site.getMatchContent());
                        siteTemplate.setMatchOrigin(site.getMatchOrigin());
                        siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
                        siteTemplate.setMatchSummary(site.getMatchSummary());
                        System.out.println("1++++++++doPaserByTag");
                        docInfo= SourceTemplateByTag.doPaserByTag(content, docInfo, siteTemplate);
                    }
                    if(null!=docInfo.getContentWithTag()) {
                        System.out.println("使用模板解析内容成功"+domainurl);
                        log.info("使用模板解析内容成功"+domainurl);
                    }
                    // No usable template result: record the site and use the generic extractor.
                    if(null==docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
                        SourceTemplateByTag.saveNoTempSite(cwbm);
                        swe.doHandler(content, docInfo);
                    }
                } catch (Exception e1) {
                    log.info("模板解析异常"+e1.getMessage());
                    SourceTemplateByTag.saveNoTempSite(cwbm);
                    swe.doHandler(content, docInfo);
                }

                System.out.println(docInfo.getTitle()+"---"+docInfo.getSourceaddress());
                docInfo.setFileDownLoadPath(null);
                Map<String, String> params = new HashMap<String, String>();
                params.put("fromWhere", "百度元搜索");
                if (null!=cwbm.getTid()) {
                    params.put("tid", String.valueOf(cwbm.getTid()));
                }
                docInfo.setOtherParams(params);

                boolean hasTitle = docInfo.getTitle() != null
                        && docInfo.getTitle().trim().length() > 0;
                boolean hasContent = docInfo.getContentNoTag() != null
                        && docInfo.getContentNoTag().trim().length() > 0;
                if (hasTitle && hasContent) {
                    if (docInfo.getPublishDate()!=null && docInfo.getPublishDate().trim().length()>0) {
                        try {
                            ContentFileResult contentFileResult = getContentFile(docInfo.getContentWithTag(),docInfo.getSourceaddress());
                            docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                            docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());
                            probeContentImages(contentFileResult.getFileMap());
                        } catch (Exception e) {
                            // Image handling is best effort; the article is still published.
                            log.info(e.getMessage());
                        }
                        System.out.println(docInfo.getTitle()+"---"+docInfo.getSourceaddress());
                        log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
                                "|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));
                        // Convert to the downstream item format and publish it.
                        ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
                        String docjson = mapper.writeValueAsString(processitem);
                        System.out.println(docjson);
                        kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
                        log.info("发送成功到kafka");
                    }else {
                        log.info("资讯发布时间:"+docInfo.getPublishDate());
                    }
                    // NOTE(review): counted even when the item was not sent because the publish
                    // date is missing - confirm this is the intended meaning of "success".
                    count++;
                }else {
                    log.info("资讯内容:"+docInfo.getContentNoTag());
                }

                // Mark the address as processed.
                JedisUtil.setString(Constants.SOURCEADDRESS+"_"+orgId+"_"+cwbm.getSourceaddress(),"1",-1);
                System.out.println("加入缓存池");
                // Throttle between detail pages.
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                // Restore the flag so the owner of this thread can see the interrupt, and stop.
                Thread.currentThread().interrupt();
                log.info("访问出错"+e.getMessage());
                break;
            } catch (Exception e) {
                log.info("访问出错"+e.getMessage());
            }
        }
        System.out.println("本次成功件数:" + count);
        log.info("本次成功件数:" + count);
    } catch (Exception e) {
        log.info("访问出错"+e.getMessage());
    }
    return repeat;
}

/**
 * Downloads the HTML of a detail page, trying in order: {@code getContentByUrl},
 * {@code SeleniumTime}, {@code SeleniumTime} again after a five second pause, and, for
 * toutiao.com addresses whose result is null or shorter than 50 characters,
 * {@code RequestUtil.getTaotiaoData}.
 *
 * @param infourl non-null page address
 * @return the downloaded HTML; may be null or empty when every attempt failed
 * @throws Exception whatever the underlying fetchers throw, including
 *                   {@link InterruptedException} from the pause between attempts
 */
private String fetchDetailHtml(String infourl) throws Exception {
    String infodata = getContentByUrl(infourl);
    if (StringUtils.isEmpty(infodata)) {
        infodata = new SeleniumTime().getScopehtml(infourl);
    }
    if (StringUtils.isEmpty(infodata)) {
        Thread.sleep(1000*5);
        infodata = new SeleniumTime().getScopehtml(infourl);
    }
    if (infourl.contains("toutiao.com") && (null == infodata || infodata.length() < 50)) {
        infodata = RequestUtil.getTaotiaoData(infourl);
    }
    return infodata;
}

/**
 * Opens every image referenced by the article and closes it again.
 * Uploading the image is currently disabled, so the stream is only probed; closing it
 * here keeps the crawl from leaking one open stream per image.
 *
 * @param imgMap images found in the article content
 * @throws Exception when an image cannot be opened, probed or closed
 */
private void probeContentImages(Map<String, FileTag> imgMap) throws Exception {
    for (FileTag fileTag : imgMap.values()) {
        try (InputStream is = getImg(fileTag.getAbsolutePath())) {
            if (is != null && is.available() > 0) {
                // Upload disabled: mqSender.sendFile(is, true, fileTag.getSavePath());
            }
        }
    }
}
/**
 * Parameterized INSERT statement for the {@code cis_ans_processitem} table.
 * It lists eleven columns (id, sid, title, summary, publish_date, origin, author,
 * content, words, keywords, sourceaddress) and eleven positional {@code ?} placeholders,
 * which must be bound in that order.
 */
static String insertSql = "insert into cis_ans_processitem " +
" (id,sid, title,summary,publish_date,origin,author, content,words,keywords,sourceaddress) " +
" values(?,?,?,?,?,?,?,?,?,?,?)";
......
......@@ -412,6 +412,116 @@ public class RecorderUtil {
}
// return null;
}
/**
 * Walks a list of Baidu result-page URLs, extracts the hits on each page and immediately
 * crawls their detail pages through {@code MetaBaiduSearchThread.CatchWebNews}.
 *
 * <p>Each page is fetched through {@code proxyRequest} when {@code Constants.PROXY} is
 * {@code "1"}, otherwise through {@code SeleniumTime}; an empty result is retried after
 * five seconds, and a page without any {@code div#content_left > div.new-pmd} block is
 * retried once more after one second. Paging stops as soon as more than 60% of a page's
 * hits were reported as duplicates; the hits of that page are not added to the result.
 * Pages without hits are skipped. If the thread is interrupted while waiting, the
 * interrupt flag is restored and the hits collected so far are returned.
 *
 * @param urlList  result-page URLs to visit, in order
 * @param charset  not used by this method
 * @param orgId    organisation id stored on every hit
 * @param tid      id stored on every hit as both {@code tid} and {@code sid}
 * @param keywords keyword passed on to the detail crawl
 * @return the hits of all pages processed before paging stopped; never null
 */
public static List<CatchWebByMetaSearch> catchWebOfBaiduList(
        List<String> urlList, String charset, Long orgId, Long tid, String keywords) {
    List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
    try {
        for (int i = 0; i < urlList.size(); i++) {
            String pageUrl = urlList.get(i);
            String docstr="";
            // Constant first, so a missing PROXY setting means "no proxy" instead of an NPE.
            if("1".equals(Constants.PROXY)) {
                log.info("使用代理访问百度链接");
                try {
                    URL url = new URL(pageUrl);
                    String uri_code = "";
                    try {
                        URI uri = new URI(url.getProtocol(), url.getHost(),
                                url.getPath(), url.getQuery(), null);
                        uri_code = Utility.encodURI(uri.toString())
                                .replaceAll("%2520", "+").replaceAll("%25", "%")
                                .replaceAll("%20", "+");
                    } catch (URISyntaxException e) {
                        log.info("url处理异常!");
                    }
                    docstr = proxyRequest(uri_code);
                    log.info("请求内容:"+docstr);
                } catch (Exception e) {
                    log.info("使用代理请求异常");
                }
            }else {
                docstr = new SeleniumTime().getScopehtml(pageUrl);
            }

            // First retry: nothing came back at all.
            if(StringUtils.isEmpty(docstr)){
                try {
                    Thread.sleep(1000*5);
                    docstr=new SeleniumTime().getScopehtml(pageUrl);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    log.info("采集线程被中断,停止采集");
                    break;
                }
            }

            // Jsoup rejects null input, so a failed download is parsed as an empty document.
            Document doc = Jsoup.parse(docstr == null ? "" : docstr);
            System.out.println("----百度搜索----" + pageUrl);
            Elements firstElementsLink = doc.select("div[id=\"content_left\"]>div.new-pmd");
            List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();

            // Second retry: the page loaded but contains no result blocks.
            if(firstElementsLink.size()<1){
                try {
                    Thread.sleep(1000);
                    docstr=new SeleniumTime().getScopehtml(pageUrl);
                    doc = Jsoup.parse(docstr);
                    firstElementsLink = doc.select("div[id=\"content_left\"]>div.new-pmd");
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    log.info("采集线程被中断,停止采集");
                    break;
                } catch (Exception e){
                    log.info("百度列表页重试失败:"+e.getMessage());
                }
            }

            for (int m=0;m<firstElementsLink.size();m++) {
                try {
                    CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
                    Elements dateEle = firstElementsLink.get(m).select("span[class=\"c-color-gray2\"]");
                    Elements orgain = firstElementsLink.get(m).select("span.c-color-gray");
                    // Publish date
                    if(dateEle.size()>0) {
                        String publishDate = DateUtil.getPublishDate(dateEle.get(0).text());
                        catchWebByMetaSearch.setPublishDate(publishDate);
                    }
                    // Source site
                    if(orgain.size()>0) {
                        catchWebByMetaSearch.setSourcesite(orgain.get(0).text());
                    }
                    Elements titleAndUrl = firstElementsLink.get(m).select("a[data-click]");
                    if (titleAndUrl.size() > 0) {
                        // Title
                        String title = titleAndUrl.get(0).text().trim();
                        catchWebByMetaSearch.setTitle(title);
                        // Source address: the block's "mu" attribute, else the link's href.
                        String addressurl= firstElementsLink.get(m).attr("mu");
                        if(StringUtils.isEmpty(addressurl)) {
                            addressurl = titleAndUrl.attr("href");
                        }
                        catchWebByMetaSearch.setSourceaddress(addressurl);
                        System.out.println(addressurl);
                        if (StringUtils.isNotEmpty(addressurl)) {
                            catchWebByMetaSearch.setOrgId(orgId);
                            catchWebByMetaSearch.setTid(tid);
                            catchWebByMetaSearch.setSid(tid);
                            metaSearchList.add(catchWebByMetaSearch);
                        }
                    }
                }catch (Exception e){
                    // One malformed hit must not discard the rest of the page.
                    log.info("解析百度搜索结果异常:"+e.getMessage());
                }
            }

            // A page without hits has nothing to crawl, and would divide by zero below.
            if (metaSearchList.isEmpty()) {
                continue;
            }
            // Crawl the details of this page before moving on to the next one.
            MetaBaiduSearchThread baiduSearchThread=new MetaBaiduSearchThread();
            int repeat = baiduSearchThread.CatchWebNews(metaSearchList, keywords);
            // Floating-point ratio: with integer division the 0.6 threshold only fired at 100%.
            if((double) repeat / metaSearchList.size() > 0.6){
                break;
            }
            catchWebByMetaSearchList.addAll(metaSearchList);
        }
        return catchWebByMetaSearchList;
    } catch (ParseException e) {
        return catchWebByMetaSearchList;
    }
}
// 提取百度新闻列表及详情URL
@SuppressWarnings("deprecation")
public static List<CatchWebByMetaSearch> CatchWebDetailOfBaidu(
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论