提交 16c70511 作者: liuweigang

爬虫修改4

上级 e43e2f42
...@@ -6,8 +6,6 @@ ...@@ -6,8 +6,6 @@
5.对抽取到的信息链接进行访问和信息抽取。 5.对抽取到的信息链接进行访问和信息抽取。
微信爬虫流程: 微信爬虫流程:
1.从kafka获取微信公众号信息。 1.从kafka获取微信公众号信息。
2.从链接中获取微信公众号id。 2.从链接中获取微信公众号id。
......
...@@ -7,10 +7,13 @@ import com.google.gson.Gson; ...@@ -7,10 +7,13 @@ import com.google.gson.Gson;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.WeixinSiteThread; import com.zzsn.crawler.WeixinSiteThread;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.job.JedisUtil;
import com.zzsn.job.KafkaConsumerJob; import com.zzsn.job.KafkaConsumerJob;
import com.zzsn.job.SouGouSearch; import com.zzsn.job.SouGouSearch;
import com.zzsn.util.Constants; import com.zzsn.util.Constants;
import com.zzsn.util.FileUtil;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.kafka.clients.CommonClientConfigs; import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecord;
...@@ -25,10 +28,11 @@ import org.springframework.boot.autoconfigure.SpringBootApplication; ...@@ -25,10 +28,11 @@ import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder; import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import java.io.File;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.*;
import java.util.Date; import java.util.concurrent.ExecutorService;
import java.util.Properties; import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
//@SpringBootApplication //@SpringBootApplication
...@@ -45,19 +49,102 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl ...@@ -45,19 +49,102 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl
@Override @Override
public void run(String... args) throws Exception { public void run(String... args) throws Exception {
try {
loadSiteAll();
}catch (Exception e){
loadSiteAll();
} finally {
}
// try { // try {
// loadSiteMsg(); // loadSiteMsg();
// }catch (Exception e){ // }catch (Exception e){
// loadSiteMsg(); // loadSiteMsg();
// } finally { // } finally {
// } // }
// try {
// loadSiteFitler();
// }catch (Exception e){
// loadSiteFitler();
// } finally {
// }
} }
/**
 * Consumes WeChat official-account messages from Kafka and fans each polled batch out to the
 * configured sender accounts ({@code Constants.WXSENDNAME}, comma separated), one worker
 * thread per account. The method blocks until every task of a batch has finished before it
 * polls again.
 *
 * <p>A per-account send counter is kept in Redis under the account name with a 24 hour
 * expiry. The counter is re-read for every record and defaults to 0 when the key is missing,
 * so an expired key resets the quota instead of inheriting a stale in-memory value.
 *
 * <p>The loop never terminates normally. On an unexpected failure the error is printed, the
 * thread pauses for 30 seconds, the consumer is closed and the method returns.
 */
public void loadSiteAll(){
    KafkaConsumer<String, String> consumer = null;
    try{
        KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
        System.out.println("进入定时获取mq消息");
        // 1. Create the consumer and subscribe to the configured topic.
        consumer = kafkaConsumerJob.createConsumer();
        consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
        // One sender account per worker thread.
        String[] wxsize = Constants.WXSENDNAME.split(",");
        int taskSize=wxsize.length;
        while(true){
            // poll() returns after at most 300 ms whether or not data is available.
            ConsumerRecords<String, String> records = consumer.poll(300);
            if (records == null || records.count() == 0) {
                continue;
            }
            // Create the pool only when there is work, so idle polls do not allocate an
            // executor that is never shut down.
            ExecutorService threadPool = Executors.newFixedThreadPool(taskSize);
            try {
                int k=0;
                for (ConsumerRecord<String, String> record : records) {
                    if (k >= wxsize.length) {
                        // More records than sender accounts: nothing left to assign them to.
                        System.out.println("no free sender account left, skipping the rest of this batch");
                        break;
                    }
                    String account = wxsize[k];
                    try {
                        // Read the number of messages already sent through this account.
                        int count = 0;
                        try {
                            String rediscount = JedisUtil.getString(account);
                            if(StringUtils.isNotEmpty(rediscount)){
                                count=Integer.parseInt(rediscount);
                            }
                        }catch (Exception ee){
                            System.out.println("failed to read send counter for " + account + ": " + ee.getMessage());
                        }
                        SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value(), SiteMsgTemple.class);
                        System.out.println("kafka消息:"+record.value());
                        WeixinSiteThread siteThread = new WeixinSiteThread();
                        siteThread.siteMsgTemple = siteMsgTemple;
                        siteThread.wxname=account;
                        count++;
                        // NOTE(review): the per-account counter is divided by the number of
                        // accounts before being compared with 199 - confirm this is the intended quota.
                        if(count/(wxsize.length)>199){
                            // Quota reached: drop the record and keep the same account index.
                            continue;
                        }
                        JedisUtil.setString(account,count+"",60 * 60 * 24);
                        k++;
                        threadPool.execute(siteThread);
                    }catch (Exception e){
                        System.out.println("failed to dispatch kafka record: " + e.getMessage());
                    }
                }
            } finally {
                threadPool.shutdown();
            }
            // Wait for the whole batch to finish before polling again.
            while (!threadPool.awaitTermination(1, TimeUnit.SECONDS)) {
                // still running
            }
        }
    }catch (Exception e){
        System.out.println(e.getMessage());
        System.out.println("程序异常+++++");
        try {
            Thread.sleep(30000);
        } catch (InterruptedException ex) {
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
        }
    } finally {
        if (consumer != null) {
            try {
                consumer.close();
            } catch (Exception closeError) {
                System.out.println("failed to close kafka consumer: " + closeError.getMessage());
            }
        }
    }
}
public void loadSiteMsg(){ public void loadSiteMsg(){
try{ try{
KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class); KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
System.out.println("进入定时获取mq消息"); System.out.println("进入定时获取mq消息");
//1.创建消费者 //1.创建消费者
int count=0;
KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer(); KafkaConsumer<String, String> consumer = kafkaConsumerJob.createConsumer();
// 消费某个主题的某个分区数据 // 消费某个主题的某个分区数据
ArrayList<TopicPartition> topicPartitions = new ArrayList<>(); ArrayList<TopicPartition> topicPartitions = new ArrayList<>();
...@@ -71,8 +158,22 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl ...@@ -71,8 +158,22 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环 //消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回 //在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(300); ConsumerRecords<String, String> records = consumer.poll(300);
try {
String rediscount = JedisUtil.getString(Constants.REDISCOUNT);
if(StringUtils.isNotEmpty(rediscount)){
count=Integer.parseInt(rediscount);
}
}catch (Exception ee){
}
if (records != null && records.count() > 0) { if (records != null && records.count() > 0) {
for (ConsumerRecord record : records) { for (ConsumerRecord record : records) {
count++;
if(count>200){
continue;
}else {
JedisUtil.setString(Constants.REDISCOUNT,count+"",60 * 60 * 24);
}
try { try {
System.out.println("kafka消息:"+record.value().toString()); System.out.println("kafka消息:"+record.value().toString());
SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class); SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value().toString(), SiteMsgTemple.class);
...@@ -83,8 +184,8 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl ...@@ -83,8 +184,8 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl
}catch (Exception e){ }catch (Exception e){
msgTime=0; msgTime=0;
} }
if(msgTime>7){//微信的资讯时间超过4小时的不采集 if(msgTime>8){//微信的资讯时间超过12小时的不采集
log.info("资讯发布时间超过5小时的公众号:"+siteName); log.info("资讯发布时间超过12小时的公众号:"+siteName);
System.out.println("资讯发布时间超过5小时的公众号:"+siteName); System.out.println("资讯发布时间超过5小时的公众号:"+siteName);
try { try {
Thread.sleep(1000*10); Thread.sleep(1000*10);
...@@ -208,6 +309,51 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl ...@@ -208,6 +309,51 @@ public class WeixinCrawlerApplication extends SpringBootServletInitializer impl
} }
} }
/**
 * Consumes WeChat official-account messages from Kafka and crawls only those whose
 * {@code infoSourceCode} is listed (one code per line) in the file named by
 * {@code Constants.JWYQJC_INFILE_URL}. Matching messages are crawled synchronously on the
 * calling thread, with a 10 second pause after each one.
 *
 * <p>The loop never terminates normally. On an unexpected failure the error is printed, the
 * thread pauses for 30 seconds, the consumer is closed and the method returns.
 */
public void loadSiteFitler(){
    KafkaConsumer<String, String> consumer = null;
    try{
        String filepath= Constants.JWYQJC_INFILE_URL;
        System.out.println(filepath);
        File f = new File(filepath);
        // A hash set gives constant-time membership checks for every message.
        Set<String> allowedCodes = new HashSet<String>(FileUtil.getFileLines(f, "utf-8"));
        KafkaConsumerJob kafkaConsumerJob= SpringContextUtil.getBean(KafkaConsumerJob.class);
        System.out.println("进入定时获取mq消息");
        // 1. Create the consumer and subscribe to the configured topic.
        consumer = kafkaConsumerJob.createConsumer();
        consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
        while(true){
            // poll() returns after at most 300 ms whether or not data is available.
            ConsumerRecords<String, String> records = consumer.poll(300);
            if (records == null || records.count() == 0) {
                continue;
            }
            for (ConsumerRecord<String, String> record : records) {
                try {
                    SiteMsgTemple siteMsgTemple = new Gson().fromJson(record.value(), SiteMsgTemple.class);
                    if (siteMsgTemple == null || !allowedCodes.contains(siteMsgTemple.getInfoSourceCode())) {
                        continue;
                    }
                    System.out.println("kafka消息:"+record.value());
                    WeixinSiteThread siteThread = new WeixinSiteThread();
                    siteThread.siteMsgTemple = siteMsgTemple;
                    siteThread.crawler();
                    Thread.sleep(1000 * 10);
                }catch (Exception e){
                    // Keep consuming, but do not hide the failure.
                    System.out.println("failed to process kafka record: " + e.getMessage());
                }
            }
        }
    }catch (Exception e){
        System.out.println(e.getMessage());
        System.out.println("程序异常+++++");
        try {
            Thread.sleep(30000);
        } catch (InterruptedException ex) {
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
        }
    } finally {
        if (consumer != null) {
            try {
                consumer.close();
            } catch (Exception closeError) {
                System.out.println("failed to close kafka consumer: " + closeError.getMessage());
            }
        }
    }
}
public static KafkaConsumer<String, String> createConsumer() { public static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties(); Properties properties = new Properties();
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS); properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
......
...@@ -42,8 +42,8 @@ import java.util.regex.Pattern; ...@@ -42,8 +42,8 @@ import java.util.regex.Pattern;
*/ */
@Slf4j @Slf4j
@Controller @Controller
@RequestMapping("/wxt") //@RequestMapping("/wxt")
//@RequestMapping("/aawxt") @RequestMapping("/aawxt")
public class WeixinController { public class WeixinController {
// http://localhost:8079/wxt/dofiddlerback?wxurl=1 // http://localhost:8079/wxt/dofiddlerback?wxurl=1
......
...@@ -61,7 +61,6 @@ public class SiteService { ...@@ -61,7 +61,6 @@ public class SiteService {
sendurl(url); sendurl(url);
try { try {
//将信息缓存到redis 以便后续查询使用 //将信息缓存到redis 以便后续查询使用
Thread.sleep(1000*30); Thread.sleep(1000*30);
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
...@@ -72,6 +71,7 @@ public class SiteService { ...@@ -72,6 +71,7 @@ public class SiteService {
public synchronized void sendurl (String url){ public synchronized void sendurl (String url){
try { try {
WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a", WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
"ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0"); "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
Thread.sleep(10000); Thread.sleep(10000);
......
package com.zzsn.awx.service;
import com.alibaba.fastjson.JSON;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.DateUtil;
import com.zzsn.util.WeixinUtil;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* 爬虫service
* 创建人:李东亮
* 创建时间:2016-4-13 下午2:52:20
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Service
public class SiteServiceAll {
private static final Logger Log = LoggerFactory.getLogger(SiteServiceAll.class);
private static Long id=0L;
/**
 * Caches the site message in Redis under {@code ":" + __biz} (no expiry), deletes the Redis
 * key equal to the bare {@code __biz} value, and then sends the account URL to the given
 * WeChat user. There is a 30 second pause before and after the send.
 *
 * <p>When the URL carries no {@code __biz} parameter nothing is cached, so no entry is
 * written under the meaningless key {@code ":null"}; the URL is still sent.
 *
 * @param siteMsgTemple message describing the official account; its site URI is the link sent
 * @param wxname        WeChat user that receives the link
 */
public void sendUrlToweixin(SiteMsgTemple siteMsgTemple,String wxname){
    String url=siteMsgTemple.getSiteUri();
    String weixinhaoid=null;
    boolean hasId=false;
    try {
        weixinhaoid = getParam(url);
        hasId = weixinhaoid!=null&&weixinhaoid.trim().length()>0;
        if (hasId) {
            String msg= JSON.toJSONString(siteMsgTemple);
            JedisUtil.setString(":"+weixinhaoid,msg,0);
        } else {
            System.out.println("no __biz parameter in url, message not cached: " + url);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    if(hasId){
        clearweixinhaoid(weixinhaoid);
    }
    // Pause before sending the link.
    try {
        Thread.sleep(1000*30);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
    sendurl(url,wxname);
    // Pause again after sending.
    try {
        Thread.sleep(1000*30);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * Sends a "click the link" message containing {@code url} to the given WeChat user and then
 * pauses for 10 seconds. Failures are printed and not propagated.
 *
 * <p>NOTE(review): the agent id, corp id and secret are hard-coded here; they look like
 * credentials and probably belong in configuration.
 *
 * @param url    link to send
 * @param wxname WeChat user that receives the message
 */
public void sendurl (String url,String wxname){
    try {
        System.out.println("发送的微信号: "+wxname);
        WeixinUtil.sendWxMessage(wxname, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        Thread.sleep(10000);
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Serialized variant of the link sender: sends the "click the link" message to the given
 * WeChat user and waits 20 seconds. If sending (or the wait) fails, it waits another
 * 20 seconds and then prints the stack trace.
 *
 * @param url    link to send
 * @param wxname WeChat user that receives the message
 */
public synchronized void sendurl1 (String url,String wxname){
    Exception failure = null;
    try {
        WeixinUtil.sendWxMessage(wxname, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        Thread.sleep(20*1000);
    } catch (Exception sendError) {
        failure = sendError;
    }
    if (failure == null) {
        return;
    }
    // Back off after a failure before reporting it.
    try {
        Thread.sleep(20*1000);
    } catch (InterruptedException ignored) {
        // the original ignores an interrupted back-off as well
    }
    failure.printStackTrace();
}
/**
 * Best-effort removal of the Redis entry stored under the given account id. A failure is
 * reported on standard output and never propagated.
 *
 * @param weixinhaoid Redis key to delete
 */
public static void clearweixinhaoid(String weixinhaoid){
    try {
        JedisUtil.del(weixinhaoid);
    }catch (Exception e){
        // Deliberately non-fatal, but no longer silent.
        System.out.println("failed to delete redis key " + weixinhaoid + ": " + e.getMessage());
    }
}
/**
 * Splits the query string of {@code url} into a name/value map.
 *
 * <p>The text between the first and second {@code ?} is split on {@code &}, and each piece
 * on {@code =}. Only the segment directly after the first {@code =} is kept as the value, so
 * a value such as {@code abc==#frag} is stored as {@code abc}. This long-standing behaviour
 * is preserved because the result is used to build Redis keys. Pieces without a value
 * ({@code flag}, {@code z=}) are skipped instead of raising
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @param url URL to inspect; may be {@code null} or blank
 * @return a mutable map of the parsed parameters, empty when there are none; never {@code null}
 */
public static Map<String,String> parse(String url) {
    Map<String,String> map=new HashMap<String,String>();
    if (url == null) {
        return map;
    }
    String trimmed = url.trim();
    if (trimmed.isEmpty()) {
        return map;
    }
    String[] urlParts = trimmed.split("\\?");
    // Length 0 happens for a URL that is just "?", length 1 when there is no query string.
    if (urlParts.length < 2) {
        return map;
    }
    for (String param : urlParts[1].split("&")) {
        String[] keyValue = param.split("=");
        if (keyValue.length < 2) {
            // No value present: nothing to store.
            continue;
        }
        map.put(keyValue[0], keyValue[1]);
    }
    return map;
}
/**
 * Returns the {@code __biz} query parameter of {@code url}.
 *
 * @param url URL to inspect
 * @return the parameter value, or {@code null} when it is absent or the URL could not be
 *         parsed (in which case the URL is printed to standard output)
 */
public static String getParam(String url) {
    Map<String, String> query;
    try {
        query = parse(url);
    } catch (Exception parseFailure) {
        System.out.println(url);
        return null;
    }
    return query.get("__biz");
}
/**
 * Returns the query parameter called {@code name} from {@code url}.
 *
 * @param url  URL to inspect
 * @param name parameter name to look up
 * @return the parameter value, or {@code null} when the URL has no such parameter
 */
public static String getParambyname(String url,String name) {
    return parse(url).get(name);
}
/**
 * Extracts the text between {@code __biz=} and the next {@code #wechat} marker.
 *
 * <p>A match at the very start of the string is accepted, and the terminator is searched
 * for only after the value begins, so an earlier {@code #wechat} can no longer cause a
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s text (normally a URL) to inspect; may be {@code null}
 * @return the extracted id, or {@code null} when either marker is missing
 */
public static String getweixinId(String s) {
    if (s == null) {
        return null;
    }
    final String marker = "__biz=";
    int start = s.indexOf(marker);
    if (start < 0) {
        return null;
    }
    int valueStart = start + marker.length();
    int end = s.indexOf("#wechat", valueStart);
    if (end < 0) {
        return null;
    }
    return s.substring(valueStart, end);
}
/**
 * Extracts the text between {@code __biz=} and the next {@code &mid=} marker.
 *
 * <p>A match at the very start of the string is accepted, and the terminator is searched
 * for only after the value begins, so an earlier {@code &mid=} can no longer cause a
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s text (normally a URL) to inspect; may be {@code null}
 * @return the extracted id, or {@code null} when either marker is missing
 */
public static String getweixinId1(String s) {
    if (s == null) {
        return null;
    }
    final String marker = "__biz=";
    int start = s.indexOf(marker);
    if (start < 0) {
        return null;
    }
    int valueStart = start + marker.length();
    int end = s.indexOf("&mid=", valueStart);
    if (end < 0) {
        return null;
    }
    return s.substring(valueStart, end);
}
/**
 * Extracts a WeChat article from the message's site URI, fills a DocInfo with its metadata
 * and HTML, and marks the URL as processed in Redis.
 *
 * NOTE(review): this method looks unfinished. As written, any URL that is not already
 * flagged in Redis ends in a NullPointerException, because the iterator driving the final
 * loop is hard-coded to null. The populated DocInfo is not passed to anything within this
 * method.
 *
 * @param siteMsgTemple message whose site URI is the article to crawl
 * @throws Exception propagated from the extraction helpers
 */
public void crawlerweixin(SiteMsgTemple siteMsgTemple) throws Exception{
String weixinurl=siteMsgTemple.getSiteUri();
// Check whether this URL has already been crawled (a non-empty Redis value means yes).
String urlflag=JedisUtil.getString(weixinurl);
if(!StringUtils.isEmpty(urlflag)){
System.out.println("已爬取1"+weixinurl);
return;
}
// Look up the organisation data stored in Redis under the account id (__biz).
String weixinid=getParam(weixinurl);
String organdtids=JedisUtil.getString(weixinid);
WeiXinDispatch wx=new WeiXinDispatch();
ExtEntity extEntity=wx.getExtractorElement(weixinurl);
String contentNoTag = null;
// NOTE(review): the second argument is a fixed article URL, not weixinurl - confirm intent.
Map<String, FileTag> imgDataMap= ContentFileFinder.getContentFileTag(extEntity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
System.out.println(extEntity.getContentWithTag());
String formatImgContent=extEntity.getContentWithTag();
for (String key : imgDataMap.keySet()) {
while (formatImgContent.contains(key)) {
// Original note: "convert to absolute path".
// NOTE(review): the code replaces each key with an empty string, i.e. removes it.
formatImgContent = formatImgContent.replace(key, "");
}
}
extEntity.setContentWithTag(formatImgContent);
// NOTE(review): contentWithTag is always "", so contentNoTag is derived from an empty
// string rather than from the extracted article content.
String contentWithTag = "";
contentNoTag = ContentUtility.TransferHTML2Text(contentWithTag);
DocInfo docInfo=new DocInfo();
// Overwritten with "News" a few lines below.
docInfo.setSourceType("WeChat");
// docInfo.setLastModified(lastModified);
docInfo.setSourceaddress(weixinurl);
docInfo.setLang("zh_CN");
docInfo.setContentType("HTML");
docInfo.setSourceType("News");
docInfo.setCharset("utf-8");
docInfo.setTitle(extEntity.getTitle());
docInfo.setAuthor(extEntity.getAuthor());
docInfo.setPublishDate(extEntity.getPublishDate());
docInfo.setOrigin("微信公众号-"+extEntity.getAuthor());
// docInfo.setKeywords(extEntity.getKeywords());
//docInfo.setSummary(extEntity.getSummary());
// Wrap the cleaned article body in a minimal HTML document.
StringBuffer sb = new StringBuffer();
sb.append("<html><head>");
sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
sb.append("<title></title></head><body>");
sb.append(extEntity.getContentWithTag());
sb.append("</body></html>");
docInfo.setContentWithTag(sb.toString());
docInfo.setContentNoTag(contentNoTag);
docInfo.setContentImgCvtTag(sb.toString());
// NOTE(review): "it" is null, so it.hasNext() below throws NullPointerException. The
// commented-out alternative cannot compile because organdtids is a String.
Iterator<String> it =null;
// Iterator<String> it = organdtids.iterator();
while (it.hasNext()) {
String str = it.next();
System.out.println(str);
// Each entry is expected to be "orgid-tid-sid"; anything else ends the whole method.
String[] ss= str.split("-");
if(ss.length!=3){
return;
}else{
System.out.println(str);
// Prints the array's identity string, not its contents.
System.out.println(ss.toString());
}
String orgid=ss[0];
String tid=ss[1];
String sid=ss[2];
docInfo.setOrgId(Long.valueOf(orgid));
docInfo.setSid(Long.valueOf(sid));
Map<String, String> params = new HashMap<String, String>();
params.put("fromWhere", "weixincraw");
if (null!=tid&&!"null".equals(tid)) {
params.put("tid", tid);
}
docInfo.setOtherParams(params);
// Cutoff used for the publish-date comparison; presumably "two days ago" - verify
// against DateUtil.getDateBeforeDays.
String week = DateUtil.getDateBeforeDays(new Date() , 2);
if(docInfo.getTitle()==null){
if(StringUtils.isEmpty(contentNoTag)){
// No title and no text: leave the URL unflagged so it is crawled again.
}else{
JedisUtil.setString(weixinurl, 1+"",0);
}
}else if(docInfo.getPublishDate().compareTo(week)<0){
// Original note: older items are not pushed for the foreign-affairs-office project.
System.out.println("时间过期"+docInfo.getPublishDate());
JedisUtil.setString(weixinurl, 1+"",0);
// Original note: other projects still push them.
}else{
JedisUtil.setString(weixinurl, 1+"",0);
}
}
}
/**
 * Ad-hoc manual check of the URL helpers: prints the {@code __biz} value of a sample profile
 * URL three ways, a date-string comparison, and the lookup of a parameter that is absent.
 */
public static void main(String[] args) {
    String profileUrl="https://mp.weixin.qq.com/mp/profile_ext?action=home&scene=114&__biz=MzAwODE2OTAwNg==#wechat_redirect";
    System.out.println(getParam(profileUrl));

    // Same extraction done by hand with index arithmetic.
    int bizIndex=profileUrl.indexOf("__biz=");
    int fragmentIndex=profileUrl.indexOf("#wechat");
    String extracted = (bizIndex>0 && fragmentIndex>0)
            ? profileUrl.substring(bizIndex+6, fragmentIndex)
            : null;
    System.out.println(extracted);

    extracted=getParam(profileUrl);
    System.out.println(extracted);

    // Lexicographic comparison of a fixed timestamp with the helper's date string.
    String fixedTime="2019-11-18 12:20:23";
    String cutoff = DateUtil.getDateBeforeDays(new Date() , 1);
    System.out.println(fixedTime.compareTo(cutoff));

    // "signature" is not part of this URL.
    String articleUrl="http://mp.weixin.qq.com/s?__biz=MzUxMzEzNjg1Ng==&mid=2247484003&idx=1&sn=965ca574850ab65be466c443bf8e2a3b&scene=0965ca574850ab65be466c443bf8e2a3b";
    String signature=getParambyname(articleUrl, "signature");
    System.out.println(signature);
}
}
\ No newline at end of file
package com.zzsn.crawler; package com.zzsn.crawler;
import com.zzsn.awx.service.SiteServiceAll;
import com.zzsn.configuration.SpringContextUtil; import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.SiteMsgTemple; import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.awx.service.SiteService; import com.zzsn.awx.service.SiteService;
...@@ -11,6 +12,7 @@ import org.springframework.kafka.core.KafkaTemplate; ...@@ -11,6 +12,7 @@ import org.springframework.kafka.core.KafkaTemplate;
public class WeixinSiteThread extends Thread{ public class WeixinSiteThread extends Thread{
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple(); public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public String wxname="";
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class); public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
...@@ -19,16 +21,29 @@ public class WeixinSiteThread extends Thread{ ...@@ -19,16 +21,29 @@ public class WeixinSiteThread extends Thread{
crawler(); crawler();
} }
public synchronized void crawler(){ public void crawler(){
//发送公众号链接到手机微信 //发送公众号链接到手机微信
try { try {
SiteService sites= SpringContextUtil.getBean(SiteService.class); // SiteService sites= SpringContextUtil.getBean(SiteService.class);
sites.sendUrlToweixin(siteMsgTemple); SiteServiceAll sites= SpringContextUtil.getBean(SiteServiceAll.class);
sites.sendUrlToweixin(siteMsgTemple,wxname);
// Thread.sleep(1000*10);
Thread.sleep(1000*90); Thread.sleep(1000*90);
}catch (Exception e){ }catch (Exception e){
} }
} }
// public synchronized void crawler(){
// //发送公众号链接到手机微信
// try {
// SiteService sites= SpringContextUtil.getBean(SiteService.class);
// sites.sendUrlToweixin(siteMsgTemple);
// Thread.sleep(1000*90);
// }catch (Exception e){
//
// }
//
// }
} }
...@@ -49,7 +49,12 @@ public class KafkaConsumerJob { ...@@ -49,7 +49,12 @@ public class KafkaConsumerJob {
// latest earliest // latest earliest
//时间间隔设置为1h //时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000); properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1); // properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
String wxsendname = Constants.WXSENDNAME;
String[] wxsize = wxsendname.split(",");
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, wxsize.length);
properties.put("batch-listener", true );//是否批量消费
// properties.put("batch-listener", false );//是否批量消费
return new KafkaConsumer<>(properties); return new KafkaConsumer<>(properties);
} }
......
...@@ -89,7 +89,7 @@ public class SouGouSearch { ...@@ -89,7 +89,7 @@ public class SouGouSearch {
System.out.println(siteName + "————资讯最新发布时间:" + sd); System.out.println(siteName + "————资讯最新发布时间:" + sd);
hours = SouGouSearch.getBeforeTime(latestNewsTime); hours = SouGouSearch.getBeforeTime(latestNewsTime);
} }
return 0; return hours;
} }
public static int getMsgTime(String wxname){ public static int getMsgTime(String wxname){
int hours=0; int hours=0;
...@@ -101,22 +101,18 @@ public class SouGouSearch { ...@@ -101,22 +101,18 @@ public class SouGouSearch {
Elements elements; Elements elements;
try { try {
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li"); elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
System.out.println(wxname+"搜狗搜索到内容个数:"+elements.size()); System.out.println(wxname+"搜狗搜索到公众号个数:"+elements.size());
if(elements.size()<1){ if(elements.size()<1){
Thread.sleep(10000); Thread.sleep(10000);
getCookie3(); getCookie3();
String connect = connect(url); String connect = get(url);
document = Jsoup.parse(connect); document = Jsoup.parse(connect);
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li"); elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
} }
}catch (Exception e){ }catch (Exception e){
String connect = get(url); hours=getTime(wxname);
document = Jsoup.parse(connect);
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
if(elements.size()<1){
return hours; return hours;
} }
}
try { try {
for (Element element :elements) { for (Element element :elements) {
String name = element.select("p[class=\"tit\"]>a>em").text(); String name = element.select("p[class=\"tit\"]>a>em").text();
...@@ -129,7 +125,6 @@ public class SouGouSearch { ...@@ -129,7 +125,6 @@ public class SouGouSearch {
Matcher matcher = pattern.matcher(index); Matcher matcher = pattern.matcher(index);
if(matcher.find()){ if(matcher.find()){
time=matcher.group(); time=matcher.group();
} }
if(name.equals(wxname)) { if(name.equals(wxname)) {
hours=getBeforeTime(time); hours=getBeforeTime(time);
...@@ -239,7 +234,8 @@ public class SouGouSearch { ...@@ -239,7 +234,8 @@ public class SouGouSearch {
response = client.execute(httpget); response = client.execute(httpget);
} catch (UnknownHostException hostEx) { } catch (UnknownHostException hostEx) {
hostEx.printStackTrace(); System.out.println("代理请求访问异常");
return content;
} }
//2、获取实体 //2、获取实体
HttpEntity entity = response.getEntity(); HttpEntity entity = response.getEntity();
...@@ -284,19 +280,7 @@ public class SouGouSearch { ...@@ -284,19 +280,7 @@ public class SouGouSearch {
try { try {
// 创建httpget. // 创建httpget.
HttpGet httpget = new HttpGet(url); HttpGet httpget = new HttpGet(url);
// httpget.addHeader("token"," token");
// httpget.addHeader("Connection","keep-alive");
// httpget.addHeader("Cache-Control","max-age=0");
// httpget.addHeader("Upgrade-Insecure-Requests","1");
// httpget.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
// httpget.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// httpget.addHeader("Sec-Fetch-Site","none");
// httpget.addHeader("Sec-Fetch-Mode","navigate");
// httpget.addHeader("Sec-Fetch-User","?1");
// httpget.addHeader("Sec-Fetch-Dest","document");
// httpget.addHeader("Accept-Language","zh-CN,zh;q=0.9");
// httpget.addHeader("Cookie","IPLOC=CN4101; SUV=001D46FC7D29AE7D61163051AB952530; SUID=D381343DC830A40A0000000061163173; GOTO=Af121011; SNUID=FD28FF2CF0F53E5DB8F35ED0F18CD145; ABTEST=0|1634614242|v1; weixinIndexVisited=1; JSESSIONID=aaaa12QKYfTMweWDxHhUx; ld=Kyllllllll2Pb6lklllllpVmyjllllllKvum1kllll9lllll9Zlll5@@@@@@@@@@");
//
httpget.addHeader("Accept"," text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); httpget.addHeader("Accept"," text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
httpget.addHeader("Accept-Encoding"," gzip, deflate, br"); httpget.addHeader("Accept-Encoding"," gzip, deflate, br");
httpget.addHeader("Accept-Language"," zh-CN,zh;q=0.9"); httpget.addHeader("Accept-Language"," zh-CN,zh;q=0.9");
...@@ -314,7 +298,12 @@ public class SouGouSearch { ...@@ -314,7 +298,12 @@ public class SouGouSearch {
httpget.addHeader("Sec-Fetch-User"," ?1"); httpget.addHeader("Sec-Fetch-User"," ?1");
httpget.addHeader("Upgrade-Insecure-Requests"," 1"); httpget.addHeader("Upgrade-Insecure-Requests"," 1");
httpget.addHeader("User-Agent"," Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"); httpget.addHeader("User-Agent"," Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36");
// System.out.println("executing request " + httpget.getURI()); try {
String wxcookies = JedisUtil.getString("wxcookies");
httpget.addHeader("Cookie", wxcookies);
}catch (Exception e){
httpget.addHeader("Cookie", "ABTEST=0|1660717633|v1; IPLOC=CN1100; SUID=31A373722583A20A0000000062FC8A41; SUID=31A373727050A00A0000000062FC8A41; JSESSIONID=aaaTAV1wUSPUH8_i9Czjy; SUV=00B7E8BD7273A33162FC8A437C13A967; PHPSESSID=4kh8ifemsbo3o9picgb43uot52; weixinIndexVisited=1; ariaDefaultTheme=undefined; seccodeErrorCount=1|Thu, 18 Aug 2022 03:51:16 GMT; SNUID=2FBD6E6F1D18FB66CD8E1E051E83C0CB; seccodeRight=success; successCount=1|Thu, 18 Aug 2022 03:51:28 GMT; refresh=1");
}
// 执行get请求. // 执行get请求.
CloseableHttpResponse response = httpclient.execute(httpget); CloseableHttpResponse response = httpclient.execute(httpget);
try { try {
...@@ -433,17 +422,14 @@ public class SouGouSearch { ...@@ -433,17 +422,14 @@ public class SouGouSearch {
conn.setRequestMethod("POST"); conn.setRequestMethod("POST");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
}finally{
} }
String sessionId = ""; String sessionId = "";
String cookieVal = ""; String cookieVal = "";
String key = null; String key = null;
Map<String, List<String>> map2 = conn.getHeaderFields();
// for (String key1 : map2.keySet()) {
// System.out.println(key1 + "--->" + map2.get(key1));
// }
//取cookie //取cookie
Map<String,String> map=new HashMap<>();
for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){ for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){
if(key.equalsIgnoreCase("set-cookie")){ if(key.equalsIgnoreCase("set-cookie")){
cookieVal = conn.getHeaderField(i); cookieVal = conn.getHeaderField(i);
......
...@@ -207,4 +207,6 @@ public class Constants { ...@@ -207,4 +207,6 @@ public class Constants {
public static final String HUAWEICLOUD_AK= prop.getProperty("HUAWEICLOUD_AK"); public static final String HUAWEICLOUD_AK= prop.getProperty("HUAWEICLOUD_AK");
public static final String HUAWEICLOUD_SK= prop.getProperty("HUAWEICLOUD_SK"); public static final String HUAWEICLOUD_SK= prop.getProperty("HUAWEICLOUD_SK");
public static final String REDISCOUNT= prop.getProperty("REDISCOUNT");
} }
...@@ -207,4 +207,27 @@ public class FileUtil { ...@@ -207,4 +207,27 @@ public class FileUtil {
file.delete(); file.delete();
} }
} }
/**
 * Reads a text file into a list with one entry per line, without line terminators.
 *
 * <p>All streams are opened in a try-with-resources statement, so they are closed even when
 * the charset is unsupported or reading fails part-way through.
 *
 * @param file     file to read
 * @param encoding charset name used to decode the file; {@code null} means {@code "utf-8"}
 * @return the file's lines in order; empty for an empty file
 * @throws IOException if the file cannot be opened or read, or the charset is not supported
 */
public static List<String> getFileLines(File file, String encoding)
        throws IOException
{
    String charsetName = (encoding == null) ? "utf-8" : encoding;
    List<String> lines = new ArrayList<String>();
    try (FileInputStream stream = new FileInputStream(file);
         InputStreamReader reader = new InputStreamReader(stream, charsetName);
         BufferedReader bufferedReader = new BufferedReader(reader))
    {
        String line;
        while ((line = bufferedReader.readLine()) != null)
        {
            lines.add(line);
        }
    }
    return lines;
}
} }
...@@ -43,6 +43,11 @@ public class ObsUpload { ...@@ -43,6 +43,11 @@ public class ObsUpload {
} }
/**
 * Manual smoke test: downloads one sample image and wraps the returned bytes in an
 * in-memory stream. The stream is not used any further.
 */
public static void main(String[] args) {
    final String sampleImageUrl = "http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
    byte[] imageData = getImageFromNetByUrl(sampleImageUrl);
    InputStream imageStream = new ByteArrayInputStream(imageData);
}
/** /**
* 根据地址获得数据的字节流 * 根据地址获得数据的字节流
* *
......
...@@ -22,12 +22,13 @@ KEYWORDS_EXCLE_URL=D\://typeword.xls ...@@ -22,12 +22,13 @@ KEYWORDS_EXCLE_URL=D\://typeword.xls
RESULT_EXCLE_URL=D\://data//subject//\u56FD\u4F01\u516C\u76CA\u5206\u7C7B// RESULT_EXCLE_URL=D\://data//subject//\u56FD\u4F01\u516C\u76CA\u5206\u7C7B//
THREAD_NUM=10 THREAD_NUM=10
SUBJECT_MEMCACHED_DAYS=0 SUBJECT_MEMCACHED_DAYS=0
JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
JWYQJC_MEMCACHED_DAYS=10 JWYQJC_MEMCACHED_DAYS=10
TITLE_SIMILARITY_RATE=0.8 TITLE_SIMILARITY_RATE=0.8
MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
CACHE_UPDATE=1 CACHE_UPDATE=1
#待采集的redis编码
JWYQJC_INFILE_URL=E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\aa.txt
PROXY=0 PROXY=0
PROXYID=1 PROXYID=1
...@@ -46,7 +47,8 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092 ...@@ -46,7 +47,8 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
KAFKA_CONSUMER_TOPIC = weChatInfo KAFKA_CONSUMER_TOPIC = weChatInfo
#消费信息组 #消费信息组
KAFKA_CONSUMER_GROUP_ID=wx-info #KAFKA_CONSUMER_GROUP_ID=wx-info
KAFKA_CONSUMER_GROUP_ID=wx-infotest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest #KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息保存的topic #信息保存的topic
...@@ -62,7 +64,8 @@ KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7 ...@@ -62,7 +64,8 @@ KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7
#微信账号名称 #微信账号名称
#WXSENDNAME= LiuWeiGang #WXSENDNAME= LiuWeiGang
WXSENDNAME= lwg #WXSENDNAME= lwg
WXSENDNAME= lwg,lwg2,lwg3,lwg4
path= E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\static\\wechat-processor.templete path= E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\static\\wechat-processor.templete
...@@ -104,7 +107,7 @@ redis.maxTotal=600 ...@@ -104,7 +107,7 @@ redis.maxTotal=600
redis.maxWaitMillis=1000 redis.maxWaitMillis=1000
redis.testOnBorrow=false redis.testOnBorrow=false
REDISCOUNT=wx1
......
package com.zzsn.caipan;
/**
 * Placeholder entry-point class; its {@code main} method currently does nothing.
 */
public class JudgeSenium {

    /**
     * No-op entry point.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // intentionally empty
    }
}
package com.zzsn.caipan;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.conf.SpringContextUtil;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.extrator.ExtEntity;
import com.zzsn.extrator.ProcessorReader;
import com.zzsn.extrator.WebExtractorImplforweixin;
import com.zzsn.orc.BaiduOCR;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.kafka.core.KafkaTemplate;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
@Slf4j
public class JundgeChromeUtil {
/** Prefix used when this class builds Redis keys (prefix + orgid + keyword). */
public static String redis_name="judge";
/** Chrome options passed to every ChromeDriver this class creates; no options are currently set. */
static ChromeOptions options1=new ChromeOptions() ;
/**
 * Browser session shared by all static methods of this class. Methods replace it with a new
 * ChromeDriver when their first navigation throws.
 */
static WebDriver driver;
// Runs once at class-load time: publishes the driver/browser locations as system properties
// and starts the shared Chrome session.
static{
System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);// location of the chromedriver executable
System.setProperty("webdriver.chrome.bin", Constants.CHROMEBIN);
// Disabled alternatives: explicit browser binary and headless mode.
// options1.setBinary(Constants.CHROMEBIN);
//options1.addArguments("--headless");
// Disabled: preference map that sets the "images" content setting to 2.
/* Map<String, Object> contentSettings = new HashMap<String, Object>();
contentSettings.put("images", 2);
Map<String, Object> preferences = new HashMap<String, Object>();
preferences.put("profile.default_content_settings", contentSettings);*/
driver =new ChromeDriver(options1);
}
/**
 * Types the given text into the Google search box with the shared browser, submits the search,
 * re-opens the result URL with the suffix {@code &hl=en&lr=lang_en&tbm=nws&num=20&&tbs=qdr:d}
 * appended, and returns that page's source.
 *
 * <p>The browser is left open afterwards. If the first navigation fails, a new ChromeDriver
 * replaces the shared one and the navigation is retried once.
 *
 * @param url text typed into the search box (despite the name it is used as the query)
 * @return the page source of the final result page, or {@code null} if any step after the
 *         initial navigation throws
 */
public static String getChromeDoc(String url){
    final String googleHome = "https://www.google.com";
    try {
        driver.get(googleHome);
    } catch (Exception e1) {
        // The shared session could not navigate: start a new browser and retry once.
        driver = new ChromeDriver(options1);
        driver.get(googleHome);
    }

    String pageSource = null;
    try {
        driver.findElement(By.name("q")).sendKeys(url);
        Thread.sleep(2000);
        driver.findElement(By.name("btnK")).click();
        Thread.sleep(2000);

        // Re-request the result page with the extra query parameters appended.
        String filteredUrl = driver.getCurrentUrl() + "&hl=en&lr=lang_en&tbm=nws&num=20&&tbs=qdr:d";
        driver.get(filteredUrl);
        Thread.sleep(2000);
        pageSource = driver.getPageSource();
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Neither close() (current window only) nor quit() (whole session) is called here.
    return pageSource;
}
/**
 * Searches weixin.sogou.com for {@code key}, clicks through the search tools to the link
 * labelled "一天内", opens the first result and extracts every window other than the search
 * window into an {@link ExtEntity}.
 *
 * <p>On success the browser session is quit before returning. If a step throws, the stack
 * trace is printed, the session is left open and whatever was collected so far is returned.
 *
 * @param key search keyword typed into the Sogou query box
 * @return the extracted articles; never {@code null}, possibly empty
 */
public static List<ExtEntity> getSougouweixin(String key){
    List<ExtEntity> entitys = new ArrayList<ExtEntity>();
    final String sogouHome = "https://weixin.sogou.com/";
    try {
        driver.get(sogouHome);
    } catch (Exception e1) {
        // The shared session could not navigate: start a new browser and retry once.
        driver = new ChromeDriver(options1);
        driver.get(sogouHome);
    }

    try {
        driver.findElement(By.id("query")).sendKeys(key);
        Thread.sleep(2000);

        // Search button, tool bar toggle, time-filter button, then the "一天内" entry.
        By[] clickPath = {
            By.className("swz"),
            By.id("tool_show"),
            By.className("btn-time"),
            By.linkText("一天内")
        };
        for (By locator : clickPath) {
            driver.findElement(locator).click();
            Thread.sleep(2000);
        }

        List<WebElement> titles = driver.findElement(By.className("news-list")).findElements(By.tagName("h3"));
        if (!titles.isEmpty()) {
            System.out.println(titles.size());
            System.out.println(driver.getCurrentUrl());
            // Only the first result is opened.
            WebElement firstTitle = titles.get(0);
            String titlelike = firstTitle.getText();
            firstTitle.click();
            Thread.sleep(3000);
        }

        Set<String> openHandles = driver.getWindowHandles();
        System.out.println(openHandles.toString());
        String searchHandle = driver.getWindowHandle();
        for (String handle : openHandles) {
            if (!handle.equals(searchHandle)) {
                WebDriver articleWindow = driver.switchTo().window(handle);
                System.out.println(articleWindow.getCurrentUrl());
                System.out.println(articleWindow.getPageSource().length());
                String html = articleWindow.getPageSource();

                List processors = ProcessorReader.readWeChatProcessors();
                WebExtractorImplforweixin extractor = new WebExtractorImplforweixin(processors, html);
                ExtEntity article = new ExtEntity();
                extractor.process(article);
                System.out.println(article.getTitle());
                article.setUri(articleWindow.getCurrentUrl());
                entitys.add(article);
            }
        }
        Thread.sleep(2000);
        // quit() ends the whole session (every window); close() would only close the current one.
        driver.quit();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return entitys;
}
/**
 * Searches weixin.sogou.com for {@code key} and, for every title on the first result page,
 * clicks it, extracts each newly opened window into an {@link ExtEntity} and closes that window.
 *
 * <p>No time filter is applied here (those steps are disabled). On success the browser session
 * is quit before returning. If a step throws, the stack trace is printed, the session is left
 * open and the articles collected so far are returned.
 *
 * @param key   search keyword typed into the Sogou query box
 * @param orgid organisation id; only appended to the title in the console output
 * @return the extracted articles; never {@code null}, possibly empty
 */
public static List<ExtEntity> getSougouweixinForOrgid(String key,Long orgid){
    List<ExtEntity> entitys = new ArrayList<ExtEntity>();
    final String sogouHome = "https://weixin.sogou.com/";
    try {
        driver.get(sogouHome);
    } catch (Exception e1) {
        // The shared session could not navigate: start a new browser and retry once.
        driver = new ChromeDriver(options1);
        driver.get(sogouHome);
    }

    try {
        driver.findElement(By.id("query")).sendKeys(key);
        Thread.sleep(2000);
        driver.findElement(By.className("swz")).click();
        Thread.sleep(2000);

        // Result titles on the current page.
        List<WebElement> titles = driver.findElement(By.className("news-list")).findElements(By.tagName("h3"));
        if (!titles.isEmpty()) {
            System.out.println(titles.size());
            System.out.println(driver.getCurrentUrl());

            // Handles that must not be processed again: the result list plus every handled article.
            Set<String> seenHandles = new HashSet<String>();
            String listHandle = driver.getWindowHandle();
            seenHandles.add(listHandle);

            for (WebElement title : titles) {
                // Clicks only work from the result-list window, so return to it first.
                driver.switchTo().window(listHandle);
                String titlelike = title.getText();
                System.out.println(titlelike + orgid);

                // Opens the article in a new window.
                title.click();
                Thread.sleep(5000);

                Set<String> openHandles = driver.getWindowHandles();
                System.out.println(openHandles.toString());
                for (String handle : openHandles) {
                    if (seenHandles.contains(handle)) {
                        System.out.println(handle);
                        continue;
                    }
                    WebDriver articleWindow = driver.switchTo().window(handle);
                    seenHandles.add(articleWindow.getWindowHandle());
                    System.out.println(articleWindow.getCurrentUrl());
                    String html = articleWindow.getPageSource();

                    List processors = ProcessorReader.readWeChatProcessors();
                    WebExtractorImplforweixin extractor = new WebExtractorImplforweixin(processors, html);
                    ExtEntity article = new ExtEntity();
                    extractor.process(article);
                    System.out.println(article.getTitle());
                    article.setUri(articleWindow.getCurrentUrl());
                    article.setTilteLike(titlelike);
                    entitys.add(article);
                    articleWindow.close();
                }
                Thread.sleep(2000);
            }
        }
        // quit() ends the whole session (every window); close() would only close the current one.
        driver.quit();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return entitys;
}
/**
 * Searches weixin.sogou.com for {@code key} and walks result pages 1 to 10. For each page it
 * runs the captcha handler three times, resets the page zoom, crawls the listed articles and
 * hands every crawled article to {@link #sendExtentity}.
 *
 * <p>The browser session is kept open. Leftover article windows are cleaned up exactly once
 * through {@link #driverint}, on both the success and the failure path. Previously the cleanup
 * ran twice when an exception occurred.
 *
 * @param key   search keyword typed into the Sogou query box
 * @param orgid organisation id forwarded to the crawl and send steps
 * @param tid   task/source id forwarded to {@link #sendExtentity}; may be {@code null}
 * @return the articles of the last page that was started (the list is reset for every page);
 *         never {@code null}
 */
public static List<ExtEntity> getSougouweixinForOrgid1(String key,Long orgid,Long tid){
    List<ExtEntity> entitys = new ArrayList<ExtEntity>();
    final String sogouHome = "https://weixin.sogou.com/";
    try {
        driver.get(sogouHome);
    } catch (Exception e1) {
        // The shared session could not navigate: start a new browser and retry once.
        driver = new ChromeDriver(options1);
        driver.get(sogouHome);
    }
    // Handle of the search/result window; everything else is a temporary article window.
    String curenthandle = driver.getWindowHandle();

    try {
        driver.findElement(By.id("query")).sendKeys(key);
        Thread.sleep(2000);
        driver.findElement(By.className("swz")).click();
        Thread.sleep(2000);

        final String idname = "sogou_page_";
        for (int i = 1; i <= 10; i++) {
            // Each page is sent as soon as it has been crawled, so start with a fresh list.
            entitys = new ArrayList<ExtEntity>();
            // Clicks only work from the result-list window, so return to it first.
            driver.switchTo().window(curenthandle);
            Thread.sleep(3000);
            if (i > 1) {
                // Page 1 is already displayed; later pages are reached through their pager link.
                driver.findElement(By.id(idname + i)).click();
                Thread.sleep(5000);
            }

            // Up to three captcha attempts; sendYzm() is a no-op (internally caught failure)
            // when no captcha form is shown.
            for (int j = 0; j < 3; j++) {
                sendYzm();
            }

            // Undo the zoom that the captcha handler may have left behind.
            ((JavascriptExecutor) driver).executeScript("document.getElementsByTagName('body')[0].style.zoom=1;");

            crawler(key, orgid + "", entitys);
            for (ExtEntity entity : entitys) {
                sendExtentity(entity, key, orgid, tid);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Single cleanup point for leftover article windows (the session itself stays open).
        driverint(driver, curenthandle);
    }
    return entitys;
}
/**
 * Resets the browser to a single window: closes every window except {@code curenthandle} and
 * returns focus to that window.
 *
 * <p>{@code WebDriver.close()} always closes the currently focused window, so each extra window
 * is focused before it is closed. The previous version called {@code close()} without
 * switching, which closed whichever window happened to be focused (possibly the main one).
 *
 * @param driver       the browser session to clean up
 * @param curenthandle handle of the window that must stay open
 */
public static void driverint(WebDriver driver,String curenthandle) {
    Set<String> openHandles = driver.getWindowHandles();
    System.out.println(openHandles.toString());
    for (String handle : openHandles) {
        if (curenthandle.equals(handle)) {
            continue;
        }
        // Focus the extra window first; close() acts on the focused window only.
        driver.switchTo().window(handle).close();
    }
    // Closing a window leaves the session without a valid focus; go back to the main window
    // if it is still open so that later commands have a target.
    if (openHandles.contains(curenthandle)) {
        driver.switchTo().window(curenthandle);
    }
}
/**
 * Crawls the result page currently shown in the shared browser: clicks the link inside every
 * {@code h3} of the {@code news-list} element, extracts each newly opened window into an
 * {@link ExtEntity}, appends it to {@code entitys} and closes that window.
 *
 * <p>Any exception stops the crawl of this page; the stack trace is printed and the articles
 * collected so far remain in {@code entitys}.
 *
 * @param keyword search keyword; not used by the active code (the title de-duplication that
 *                used it is disabled)
 * @param orgid   organisation id; only appended to the title in the console output
 * @param entitys output list that receives the extracted articles
 */
public static void crawler (String keyword,String orgid,List<ExtEntity> entitys) {
    try {
        List<WebElement> titles = driver.findElement(By.className("news-list")).findElements(By.tagName("h3"));
        if (titles.isEmpty()) {
            return;
        }
        System.out.println(titles.size());
        System.out.println(driver.getCurrentUrl());

        // Handles that must not be processed again: the result list plus every handled article.
        Set<String> seenHandles = new HashSet<String>();
        String listHandle = driver.getWindowHandle();
        seenHandles.add(listHandle);

        for (WebElement title : titles) {
            // Clicks only work from the result-list window, so return to it first.
            driver.switchTo().window(listHandle);
            String titlelike = title.getText();
            System.out.println(titlelike + orgid);
            // De-duplication of already clicked titles (cache lookup keyed by
            // redis_name + orgid + keyword) is currently disabled.

            // Opens the article in a new window.
            title.findElement(By.tagName("a")).click();
            Thread.sleep(2000);

            Set<String> openHandles = driver.getWindowHandles();
            System.out.println(openHandles.toString());
            for (String handle : openHandles) {
                if (seenHandles.contains(handle)) {
                    System.out.println(handle);
                    continue;
                }
                WebDriver articleWindow = driver.switchTo().window(handle);
                // Queried for a notification message that is currently disabled.
                String currentUrl = articleWindow.getCurrentUrl();
                seenHandles.add(articleWindow.getWindowHandle());
                System.out.println(articleWindow.getCurrentUrl());
                String html = articleWindow.getPageSource();

                List processors = ProcessorReader.readWeChatProcessors();
                WebExtractorImplforweixin extractor = new WebExtractorImplforweixin(processors, html);
                ExtEntity article = new ExtEntity();
                extractor.process(article);
                System.out.println(article.getTitle());
                article.setUri(articleWindow.getCurrentUrl());
                article.setTilteLike(titlelike);
                // Image storing/stripping is disabled, so the tagged content is written back unchanged.
                article.setContentWithTag(article.getContentWithTag());
                entitys.add(article);
                articleWindow.close();
            }
            Thread.sleep(1000);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Detects a captcha form on the current page and, if one is present, reads the captcha with
 * OCR, types the recognised text and submits the form.
 *
 * <p>Every failure is swallowed and reported as "buyongyanzm" on stdout. That includes the
 * element lookup failing because no captcha form is shown, which is the expected case when no
 * captcha is required.
 */
public static void sendYzm() {
try {
String script="document.getElementsByTagName('body')[0].style.zoom=1.5;";
Thread.sleep(1000L);
// Throws when the captcha form is absent, which sends control straight to the catch block.
WebElement ele1 = driver.findElement(By.id("seccodeForm")).findElement(By.className("s1"));
// point1 is not used afterwards.
Point point1 = ele1.getLocation();
// Zoom the page to 1.5 before the screenshot is taken.
((JavascriptExecutor) driver).executeScript(script);
Thread.sleep(5000L);
File screenshot = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
WebElement ele = driver.findElement(By.id("seccodeImage"));
String imgurl= ele.getAttribute("src");
System.out.println(imgurl);
//File file= ele .getScreenshotAs(OutputType.FILE);
BufferedImage fullImg = ImageIO.read(screenshot);
Point point = ele.getLocation();
// eleWidth / eleHeight are read but not used; the crop below uses a fixed size.
int eleWidth = ele.getSize().getWidth();
int eleHeight = ele.getSize().getHeight();
// Scale the element position by the same 1.5 factor as the zoom set above.
int xlocation=(int) (point.x*1.5);
int ylocation=(int) (point.y*1.5);
System.out.println("x--------"+xlocation);
System.out.println("y--------"+ylocation);
// Fixed 150x50 crop, shifted 2px down from the scaled position.
// NOTE(review): presumably tuned to the captcha image size at this zoom level - confirm.
BufferedImage eleScreenshot= fullImg.getSubimage(xlocation, ylocation+2,
150, 50);
// savePath is only referenced by the disabled code below.
String savePath = "D:/1111.png";
String savePath1 = "D:/1111222.png";
String code="";
String ocrBady = BaiduOCR.imageBasicGeneralOcr(imageToBytes(eleScreenshot));
List<String> msgList = BaiduOCR.pasermsgJson(ocrBady);
// The last OCR line longer than two characters is taken as the captcha text.
for (String msg:
msgList) {
if(msg.length()>2){
code=msg;
}
System.out.println(msg);
}
// Disabled alternative: post the base64-encoded crop to Constants.yzmApi and read "message".
// ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
// ImageIO.write(eleScreenshot, "png", new File(savePath));
// ImageIO.write(eleScreenshot, "png", outputStream);
// String base64Img = Base64.encode(outputStream.toByteArray());
//
// JSONObject json =new JSONObject();
// json.put("image", base64Img);
// String sendurl=Constants.yzmApi;
// String re=post(json, sendurl);
// System.out.println(re);
// JSONObject jsStr = JSONObject.parseObject(re);
// String code=jsStr.getString("message");
// Restore the normal zoom before interacting with the form.
String script1="document.getElementsByTagName('body')[0].style.zoom=1;";
((JavascriptExecutor) driver).executeScript(script1);
Thread.sleep(4000);
WebElement sendyzm = driver.findElement(By.id("seccodeInput"));
WebElement submit = driver.findElement(By.id("submit"));
sendyzm.sendKeys(code);
Thread.sleep(4000);
submit.click();
Thread.sleep(4000);
// Keeps a copy of the full screenshot.
// NOTE(review): hard-coded Windows path; this fails (silently, via the catch) where D: does not exist.
FileUtils.copyFile(screenshot, new File(savePath1));
} catch (Exception e) {
// Deliberately quiet: any failure, including "no captcha form on the page", ends up here.
//e.printStackTrace();
System.out.println("buyongyanzm");
}
}
/**
 * Encodes an image as PNG and returns the encoded bytes.
 *
 * <p>An {@link IOException} raised while encoding is ignored; in that case the bytes written
 * up to that point (possibly none) are returned.
 *
 * @param bImage image to encode
 * @return the PNG bytes held by the in-memory buffer
 */
private static byte[] imageToBytes(BufferedImage bImage) {
    ByteArrayOutputStream pngBuffer = new ByteArrayOutputStream();
    try {
        ImageIO.write(bImage, "png", pngBuffer);
    } catch (IOException ignored) {
        // Best effort: the caller receives whatever was written before the failure.
    }
    return pngBuffer.toByteArray();
}
/**
 * Sends {@code json} as a UTF-8 {@code application/json} POST request and returns the response
 * body.
 *
 * <p>The HTTP client and the response reader are closed on every path (they used to leak), and
 * a response without a body no longer causes a {@code NullPointerException}.
 *
 * @param json request payload; serialised with {@code toString()}
 * @param url  target URL
 * @return the response body with each line terminated by {@code '\n'} (empty when the response
 *         has no body), or the literal {@code "服务器异常"} when the status code is not 200
 * @throws RuntimeException wrapping any failure while sending the request or reading the response
 */
public static String post(JSONObject json, String url){
    String result = "";
    HttpPost post = new HttpPost(url);
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        post.setHeader("Content-Type","application/json;charset=utf-8");
        post.setEntity(new StringEntity(json.toString(),"utf-8"));

        HttpResponse response = httpClient.execute(post);
        if (response.getEntity() != null) {
            // Closing the reader also closes the underlying content stream.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(response.getEntity().getContent(), "utf-8"))) {
                StringBuilder strber = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    strber.append(line).append('\n');
                }
                result = strber.toString();
            }
        }
        // A non-200 answer replaces whatever body was read with a fixed marker.
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            result = "服务器异常";
        }
    } catch (Exception e) {
        System.out.println("请求异常");
        throw new RuntimeException(e);
    } finally {
        post.abort();
    }
    return result;
}
/**
 * Turns a crawled WeChat article into a {@link ClbAnsProcessitem}, serialises it to JSON and
 * records the article URL as handled in Redis under {@code <uri>_<tid>}.
 *
 * <p>Fixes compared with the previous version:
 * <ul>
 *   <li>The origin is now {@code "微信公众号-" + (origin, or author when origin is null)}. The old
 *       expression compared the concatenated string with {@code null}, so the prefix and the
 *       author fallback were never applied.</li>
 *   <li>Articles without a title or content are skipped, as the printed message states, instead
 *       of still being marked as handled in Redis.</li>
 *   <li>Failures are logged instead of being silently discarded.</li>
 * </ul>
 *
 * <p>NOTE(review): the Kafka publish is still disabled, so the serialised JSON is not sent
 * anywhere even though the success message is logged.
 *
 * @param extEntity crawled article
 * @param key       search keyword the article was found with; only used in the failure log
 * @param orgid     organisation id stored on the document
 * @param tid       task/source id stored as the document sid; may be {@code null}
 */
public static void sendExtentity(ExtEntity extEntity,String key,Long orgid,Long tid) {
    try {
        String weixinurl = extEntity.getUri();
        String contentWithTag = extEntity.getContentWithTag();
        String contentNoTag = ContentUtility.TransferHTML2Text(contentWithTag);

        DocInfo docInfo = new DocInfo();
        docInfo.setSid(tid);
        docInfo.setSourceType("WeChat");
        docInfo.setSourceaddress(weixinurl);
        docInfo.setLang("zh_CN");
        docInfo.setContentType("HTML");
        docInfo.setCharset("utf-8");
        docInfo.setTitle(extEntity.getTitle());
        docInfo.setAuthor(extEntity.getAuthor());
        docInfo.setPublishDate(extEntity.getPublishDate());
        // Parenthesised on purpose: '+' binds tighter than '==' and '?:'.
        String originName = extEntity.getOrigin() == null ? extEntity.getAuthor() : extEntity.getOrigin();
        docInfo.setOrigin("微信公众号-" + originName);

        // Wrap the article body in a minimal HTML document.
        StringBuilder sb = new StringBuilder();
        sb.append("<html><head>");
        sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
        sb.append("<title></title></head><body>");
        sb.append(contentWithTag);
        sb.append("</body></html>");
        String html = sb.toString();
        docInfo.setContentWithTag(html);
        docInfo.setContentNoTag(contentNoTag);
        docInfo.setContentImgCvtTag(html);
        docInfo.setOrgId(orgid);

        Map<String, String> params = new HashMap<String, String>();
        params.put("fromWhere", "weixinsearch");
        if (null != tid) {
            params.put("tid", tid + "");
        }
        docInfo.setOtherParams(params);

        if (docInfo.getTitle() == null) {
            // Nothing was extracted; leave the URL unmarked so it can be crawled again.
            return;
        }

        ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
        processitem.setSource("搜狗微信爬虫");
        if (StringUtils.isEmpty(processitem.getTitle()) || StringUtils.isEmpty(processitem.getContent())) {
            System.out.println("资讯的信息不全没有发送");
            return;
        }

        // Payload for the (currently disabled) Kafka publish; serialising it here still ensures
        // that an item which cannot be serialised is not marked as handled.
        String docjson = new ObjectMapper().writeValueAsString(processitem);
        log.info("发送到kafka成功。");
        JedisUtil.setString(weixinurl + "_" + docInfo.getSid(), "1", -1);
    } catch (Exception e) {
        log.error("sendExtentity failed, key={}, orgid={}, tid={}", key, orgid, tid, e);
    }
}
/**
 * Copies a {@link DocInfo} into a new {@link ClbAnsProcessitem} and resolves its source address.
 *
 * <p>The source address is looked up in Redis under the document's original address, polling
 * once per second for at most ten attempts. When no value is found, or the lookup fails or is
 * interrupted, the original address is used. Previously a failure left the source address
 * unset and an interrupt was swallowed.
 *
 * @param docInfo document to convert
 * @return the populated item; its source address is always set
 */
public static ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
    ClbAnsProcessitem clbAnsProcessitem = new ClbAnsProcessitem();
    clbAnsProcessitem.setSid(docInfo.getSid()+"");
    clbAnsProcessitem.setTitle(docInfo.getTitle());
    clbAnsProcessitem.setContent(docInfo.getContentNoTag());
    clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
    clbAnsProcessitem.setSummary(docInfo.getSummary());
    clbAnsProcessitem.setAuthor(docInfo.getAuthor());
    clbAnsProcessitem.setOrigin(docInfo.getOrigin());
    clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());

    String newURL = "";
    try {
        // Poll until a value appears or ten attempts have been made.
        for (int attempt = 0; attempt < 10 && StringUtils.isEmpty(newURL); attempt++) {
            Thread.sleep(1000);
            newURL = JedisUtil.getString(docInfo.getSourceaddress());
        }
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller's thread handling.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Fall back to the original address when nothing usable was resolved.
    clbAnsProcessitem.setSourceAddress(
            StringUtils.isNotEmpty(newURL) ? newURL : docInfo.getSourceaddress());
    return clbAnsProcessitem;
}
/**
 * Manual entry point. It currently does nothing: the sample invocations in the body are
 * commented out (and refer to a class named ChromeUtil rather than this class).
 *
 * @param args command-line arguments; ignored
 */
public static void main(String[] args) {
// MemcachedFactory.init();
//ChromeUtil.getSougouweixin("外交部");
// ChromeUtil.getSougouweixinForOrgid1("伊朗",1L,1L);
}
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论