提交 16c70511 作者: liuweigang

爬虫修改4

上级 e43e2f42
......@@ -6,8 +6,6 @@
5.对抽取到的信息链接进行访问和信息抽取。
微信爬虫流程:
1.从kafka获取微信公众号信息。
2.从链接中获取微信公众号id。
......
......@@ -42,8 +42,8 @@ import java.util.regex.Pattern;
*/
@Slf4j
@Controller
@RequestMapping("/wxt")
//@RequestMapping("/aawxt")
//@RequestMapping("/wxt")
@RequestMapping("/aawxt")
public class WeixinController {
// http://localhost:8079/wxt/dofiddlerback?wxurl=1
......
......@@ -61,7 +61,6 @@ public class SiteService {
sendurl(url);
try {
//将信息缓存到redis 以便后续查询使用
Thread.sleep(1000*30);
} catch (Exception e) {
// TODO Auto-generated catch block
......@@ -72,7 +71,8 @@ public class SiteService {
public synchronized void sendurl (String url){
try {
WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
"ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
Thread.sleep(10000);
} catch (Exception e) {
......
package com.zzsn.awx.service;
import com.alibaba.fastjson.JSON;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.DateUtil;
import com.zzsn.util.WeixinUtil;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* 爬虫service
* 创建人:李东亮
* 创建时间:2016-4-13 下午2:52:20
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Service
public class SiteServiceAll {
private static final Logger Log = LoggerFactory.getLogger(SiteServiceAll.class);
private static Long id=0L;
/**
 * Caches the crawl task in Redis and pushes the official-account link to a WeChat user.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Extract the {@code __biz} account id from the task's site URI.</li>
 *   <li>If an id was found, store the task (serialized as JSON) under the Redis key
 *       {@code ":" + id} and delete the Redis key {@code id}.</li>
 *   <li>Wait 30 seconds, send the link via {@link #sendurl(String, String)}, then wait
 *       another 30 seconds.</li>
 * </ol>
 *
 * <p>Unlike the previous version, nothing is written to Redis when the URL carries no
 * {@code __biz} parameter (the old code stored the task under the key {@code ":null"}),
 * and an interrupt received while waiting is no longer swallowed: the thread's
 * interrupt flag is restored so callers can observe it.
 *
 * @param siteMsgTemple the crawl task; its site URI is the link that gets sent
 * @param wxname        the WeChat account name that receives the link
 */
public void sendUrlToweixin(SiteMsgTemple siteMsgTemple,String wxname){
    String url = siteMsgTemple.getSiteUri();
    String weixinhaoid = getParam(url);
    boolean hasAccountId = weixinhaoid != null && weixinhaoid.trim().length() > 0;
    if (hasAccountId) {
        try {
            // Cache the task so it can be looked up again by account id later.
            // JedisUtil.setString(":"+weixinhaoid+"_"+siteMsgTemple.getId(),msg,0);
            JedisUtil.setString(":" + weixinhaoid, JSON.toJSONString(siteMsgTemple), 0);
        } catch (Exception e) {
            // Caching is best-effort: the link is still sent when Redis is unavailable.
            e.printStackTrace();
        }
        clearweixinhaoid(weixinhaoid);
    }
    // Pace the messages: wait 30 seconds before and after sending the link.
    pauseQuietly(1000 * 30);
    sendurl(url, wxname);
    pauseQuietly(1000 * 30);
}

/** Sleeps for the given number of milliseconds, restoring the interrupt flag if interrupted. */
private static void pauseQuietly(long millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * Sends the link to the given WeChat account and then waits 10 seconds.
 *
 * <p>Failures while sending are printed and otherwise ignored. If the thread is
 * interrupted during the wait, the interrupt flag is restored instead of being
 * swallowed.
 *
 * @param url    the link to send
 * @param wxname the WeChat account name that receives the message
 */
public void sendurl (String url,String wxname){
    try {
        System.out.println("发送的微信号: "+wxname);
        // NOTE(review): the corp id and secret below are hard-coded credentials;
        // they should be moved to configuration and rotated.
        WeixinUtil.sendWxMessage(wxname, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        Thread.sleep(10000);
    } catch (InterruptedException e) {
        // Preserve the interruption for the caller rather than hiding it.
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Sends the link to the given WeChat account and then waits 20 seconds, whether or
 * not sending succeeded. Calls on the same instance are serialized.
 *
 * <p>A send failure is printed and otherwise ignored. If the thread is interrupted
 * during the wait, the wait ends and the interrupt flag is restored (the previous
 * version swallowed the interrupt and started a second 20-second sleep).
 *
 * @param url    the link to send
 * @param wxname the WeChat account name that receives the message
 */
public synchronized void sendurl1 (String url,String wxname){
    try {
        // NOTE(review): the corp id and secret below are hard-coded credentials;
        // they should be moved to configuration and rotated.
        WeixinUtil.sendWxMessage(wxname, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
    } catch (Exception e) {
        e.printStackTrace();
    }
    try {
        // Same pause on success and on failure.
        Thread.sleep(20*1000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * Deletes the Redis entry stored under the given account id.
 *
 * <p>This is a best-effort operation: any failure raised by the delete is ignored.
 *
 * @param weixinhaoid the Redis key to delete
 */
public static void clearweixinhaoid(String weixinhaoid){
    try {
        JedisUtil.del(weixinhaoid);
    } catch (Exception ignored) {
        // Intentionally ignored: a failed delete must not stop the caller.
    }
}
/**
 * Parses the query string of a URL into a key/value map.
 *
 * <p>The query string is the text between the first and the second {@code ?}.
 * Parameters are separated by {@code &} and each one is split on {@code =}.
 *
 * <p>Parameters without a value ({@code flag} or {@code flag=}) are stored with an
 * empty string, and empty parameters ({@code a=1&&b=2}, a lone {@code =}) are skipped.
 * The previous version threw {@code ArrayIndexOutOfBoundsException} in those cases,
 * and also for the URL {@code "?"}.
 *
 * <p>NOTE: a value is cut at its second {@code =}, so trailing base64 padding is
 * dropped ({@code __biz=abc==} yields {@code "abc"}). This legacy behavior is kept
 * on purpose because values returned here are used as Redis keys.
 *
 * @param url the URL to parse; may be {@code null} or blank
 * @return a mutable map of the parameters; empty when there is no query string
 */
public static Map<String,String> parse(String url) {
    Map<String,String> map = new HashMap<String,String>();
    if (url == null) {
        return map;
    }
    String trimmed = url.trim();
    if (trimmed.isEmpty()) {
        return map;
    }
    String[] urlParts = trimmed.split("\\?");
    // No query string (also covers "?" alone, which splits into an empty array).
    if (urlParts.length < 2) {
        return map;
    }
    for (String param : urlParts[1].split("&")) {
        if (param.isEmpty()) {
            continue;
        }
        String[] keyValue = param.split("=");
        if (keyValue.length == 0) {
            // The parameter consisted only of '=' characters.
            continue;
        }
        String value = keyValue.length > 1 ? keyValue[1] : "";
        map.put(keyValue[0], value);
    }
    return map;
}
/**
 * Returns the value of the {@code __biz} query parameter of the given URL.
 *
 * <p>If parsing the URL fails, the URL is printed to standard output and
 * {@code null} is returned.
 *
 * @param url the URL to inspect
 * @return the {@code __biz} value, or {@code null} when it is absent or parsing failed
 */
public static String getParam(String url) {
    Map<String, String> query;
    try {
        query = parse(url);
    } catch (Exception e) {
        // Parsing failed: report the offending URL and treat it as "no id".
        System.out.println(url);
        return null;
    }
    return query.get("__biz");
}
/**
 * Returns the value of the named query parameter of the given URL.
 *
 * @param url  the URL to inspect
 * @param name the query parameter name
 * @return the parameter value, or {@code null} when the parameter is absent
 */
public static String getParambyname(String url,String name) {
    return parse(url).get(name);
}
/**
 * Extracts the text between {@code __biz=} and the following {@code #wechat} marker.
 *
 * <p>The value is returned verbatim, including any base64 padding. Compared with the
 * previous version, a {@code __biz=} at the very start of the string is now accepted,
 * the {@code #wechat} marker is searched only after the value starts (so a marker
 * appearing earlier no longer causes {@code StringIndexOutOfBoundsException}), and a
 * {@code null} input returns {@code null}.
 *
 * @param s the URL or URL fragment to inspect; may be {@code null}
 * @return the account id, or {@code null} when either marker is missing
 */
public static String getweixinId(String s) {
    if (s == null) {
        return null;
    }
    int keyIndex = s.indexOf("__biz=");
    if (keyIndex < 0) {
        return null;
    }
    // 6 == "__biz=".length()
    int valueStart = keyIndex + 6;
    int valueEnd = s.indexOf("#wechat", valueStart);
    if (valueEnd < 0) {
        return null;
    }
    return s.substring(valueStart, valueEnd);
}
/**
 * Extracts the text between {@code __biz=} and the following {@code &mid=} parameter.
 *
 * <p>The value is returned verbatim, including any base64 padding. Compared with the
 * previous version, a {@code __biz=} at the very start of the string is now accepted,
 * {@code &mid=} is searched only after the value starts (so an earlier occurrence no
 * longer causes {@code StringIndexOutOfBoundsException}), and a {@code null} input
 * returns {@code null}.
 *
 * @param s the URL or URL fragment to inspect; may be {@code null}
 * @return the account id, or {@code null} when either marker is missing
 */
public static String getweixinId1(String s) {
    if (s == null) {
        return null;
    }
    int keyIndex = s.indexOf("__biz=");
    if (keyIndex < 0) {
        return null;
    }
    // 6 == "__biz=".length()
    int valueStart = keyIndex + 6;
    int valueEnd = s.indexOf("&mid=", valueStart);
    if (valueEnd < 0) {
        return null;
    }
    return s.substring(valueStart, valueEnd);
}
/**
 * Crawls a single WeChat article URL, builds a {@code DocInfo} from the extracted
 * content and records the URL in Redis as processed.
 *
 * NOTE(review): as written this method cannot complete normally once it gets past
 * the "already crawled" check: the iterator {@code it} is initialized to null and
 * dereferenced immediately, which throws NullPointerException. The commented-out
 * line suggests it was meant to iterate over {@code organdtids}, but that value is
 * now a String read from Redis and its format is not visible here, so the intended
 * loop cannot be reconstructed from this file alone.
 *
 * @param siteMsgTemple the crawl task; its site URI is the article URL
 * @throws Exception declared by the signature; see the review note above
 */
public void crawlerweixin(SiteMsgTemple siteMsgTemple) throws Exception{
String weixinurl=siteMsgTemple.getSiteUri();
// Check whether this URL has already been crawled (a non-empty flag is stored under the URL).
String urlflag=JedisUtil.getString(weixinurl);
if(!StringUtils.isEmpty(urlflag)){
System.out.println("已爬取1"+weixinurl);
return;
}
// Look up the organization info stored in Redis under the account id.
String weixinid=getParam(weixinurl);
// NOTE(review): organdtids is read but never used below.
String organdtids=JedisUtil.getString(weixinid);
WeiXinDispatch wx=new WeiXinDispatch();
ExtEntity extEntity=wx.getExtractorElement(weixinurl);
String contentNoTag = null;
// NOTE(review): the second argument is a fixed article URL rather than weixinurl — confirm this is intended.
Map<String, FileTag> imgDataMap= ContentFileFinder.getContentFileTag(extEntity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
System.out.println(extEntity.getContentWithTag());
String formatImgContent=extEntity.getContentWithTag();
for (String key : imgDataMap.keySet()) {
// NOTE(review): an empty key would make this loop spin forever, since contains("") is always true.
while (formatImgContent.contains(key)) {
// Original comment said "convert to absolute path", but the code removes every occurrence of the key.
formatImgContent = formatImgContent.replace(key, "");
}
}
extEntity.setContentWithTag(formatImgContent);
// NOTE(review): contentWithTag is always the empty string here, so contentNoTag is
// derived from empty input rather than from the extracted article content.
String contentWithTag = "";
contentNoTag = ContentUtility.TransferHTML2Text(contentWithTag);
DocInfo docInfo=new DocInfo();
// This source type is set again to "News" a few lines below.
docInfo.setSourceType("WeChat");
// docInfo.setLastModified(lastModified);
docInfo.setSourceaddress(weixinurl);
docInfo.setLang("zh_CN");
docInfo.setContentType("HTML");
docInfo.setSourceType("News");
docInfo.setCharset("utf-8");
docInfo.setTitle(extEntity.getTitle());
docInfo.setAuthor(extEntity.getAuthor());
docInfo.setPublishDate(extEntity.getPublishDate());
docInfo.setOrigin("微信公众号-"+extEntity.getAuthor());
// docInfo.setKeywords(extEntity.getKeywords());
//docInfo.setSummary(extEntity.getSummary());
// Wrap the cleaned article body in a minimal HTML document.
StringBuffer sb = new StringBuffer();
sb.append("<html><head>");
sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
sb.append("<title></title></head><body>");
sb.append(extEntity.getContentWithTag());
sb.append("</body></html>");
docInfo.setContentWithTag(sb.toString());
docInfo.setContentNoTag(contentNoTag);
docInfo.setContentImgCvtTag(sb.toString());
// NOTE(review): `it` is null, so the hasNext() call below throws NullPointerException.
Iterator<String> it =null;
// Iterator<String> it = organdtids.iterator();
while (it.hasNext()) {
String str = it.next();
System.out.println(str);
// Parse the entry; it is expected to have three '-'-separated parts: orgId-tid-sid.
String[] ss= str.split("-");
if(ss.length!=3){
// A malformed entry ends the whole method, not just this iteration.
return;
}else{
System.out.println(str);
// Prints the array's default toString(), not its elements.
System.out.println(ss.toString());
}
String orgid=ss[0];
String tid=ss[1];
String sid=ss[2];
docInfo.setOrgId(Long.valueOf(orgid));
docInfo.setSid(Long.valueOf(sid));
Map<String, String> params = new HashMap<String, String>();
params.put("fromWhere", "weixincraw");
if (null!=tid&&!"null".equals(tid)) {
params.put("tid", tid);
}
docInfo.setOtherParams(params);
// Cut-off date string obtained from DateUtil with an argument of 2 (days); compared lexicographically below.
String week = DateUtil.getDateBeforeDays(new Date() , 2);
if(docInfo.getTitle()==null){
if(StringUtils.isEmpty(contentNoTag)){
// No title and no content: leave the URL unmarked so it is crawled again.
}else{
// Content present but no title: mark the URL as processed.
JedisUtil.setString(weixinurl, 1+"",0);
}
}else if(docInfo.getPublishDate().compareTo(week)<0){
// Original comment: articles older than 1 day are not pushed for the foreign-affairs-office project.
// NOTE(review): the cut-off above is computed with 2, not 1; getPublishDate() is dereferenced without a null check.
System.out.println("时间过期"+docInfo.getPublishDate());
JedisUtil.setString(weixinurl, 1+"",0);
// Original comment: other projects are still pushed.
}else{
JedisUtil.setString(weixinurl, 1+"",0);
}
}
}
/**
 * Ad-hoc manual check of the URL helpers and the date comparison; prints results
 * to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String profileUrl="https://mp.weixin.qq.com/mp/profile_ext?action=home&scene=114&__biz=MzAwODE2OTAwNg==#wechat_redirect";
    System.out.println(getParam(profileUrl));

    // Inline extraction of the text between "__biz=" and "#wechat".
    int bizIndex = profileUrl.indexOf("__biz=");
    int hashIndex = profileUrl.indexOf("#wechat");
    String extracted = (bizIndex > 0 && hashIndex > 0)
            ? profileUrl.substring(bizIndex + 6, hashIndex)
            : null;
    System.out.println(extracted);

    extracted = getParam(profileUrl);
    System.out.println(extracted);

    // String comparison of a fixed timestamp against the date string from DateUtil.
    String fixedTime = "2019-11-18 12:20:23";
    String cutoff = DateUtil.getDateBeforeDays(new Date() , 1);
    System.out.println(fixedTime.compareTo(cutoff));

    // Looking up a parameter that is not present in the URL.
    String articleUrl = "http://mp.weixin.qq.com/s?__biz=MzUxMzEzNjg1Ng==&mid=2247484003&idx=1&sn=965ca574850ab65be466c443bf8e2a3b&scene=0965ca574850ab65be466c443bf8e2a3b";
    String signature = getParambyname(articleUrl, "signature");
    System.out.println(signature);
}
}
\ No newline at end of file
package com.zzsn.crawler;
import com.zzsn.awx.service.SiteServiceAll;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.awx.service.SiteService;
......@@ -11,6 +12,7 @@ import org.springframework.kafka.core.KafkaTemplate;
public class WeixinSiteThread extends Thread{
public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();
public String wxname="";
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
......@@ -19,16 +21,29 @@ public class WeixinSiteThread extends Thread{
crawler();
}
public synchronized void crawler(){
public void crawler(){
//发送公众号链接到手机微信
try {
SiteService sites= SpringContextUtil.getBean(SiteService.class);
sites.sendUrlToweixin(siteMsgTemple);
// SiteService sites= SpringContextUtil.getBean(SiteService.class);
SiteServiceAll sites= SpringContextUtil.getBean(SiteServiceAll.class);
sites.sendUrlToweixin(siteMsgTemple,wxname);
// Thread.sleep(1000*10);
Thread.sleep(1000*90);
}catch (Exception e){
}
}
// public synchronized void crawler(){
// //发送公众号链接到手机微信
// try {
// SiteService sites= SpringContextUtil.getBean(SiteService.class);
// sites.sendUrlToweixin(siteMsgTemple);
// Thread.sleep(1000*90);
// }catch (Exception e){
//
// }
//
// }
}
......@@ -49,7 +49,12 @@ public class KafkaConsumerJob {
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
// properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
String wxsendname = Constants.WXSENDNAME;
String[] wxsize = wxsendname.split(",");
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, wxsize.length);
properties.put("batch-listener", true );//是否批量消费
// properties.put("batch-listener", false );//是否批量消费
return new KafkaConsumer<>(properties);
}
......
......@@ -89,7 +89,7 @@ public class SouGouSearch {
System.out.println(siteName + "————资讯最新发布时间:" + sd);
hours = SouGouSearch.getBeforeTime(latestNewsTime);
}
return 0;
return hours;
}
public static int getMsgTime(String wxname){
int hours=0;
......@@ -101,21 +101,17 @@ public class SouGouSearch {
Elements elements;
try {
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
System.out.println(wxname+"搜狗搜索到内容个数:"+elements.size());
System.out.println(wxname+"搜狗搜索到公众号个数:"+elements.size());
if(elements.size()<1){
Thread.sleep(10000);
getCookie3();
String connect = connect(url);
String connect = get(url);
document = Jsoup.parse(connect);
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
}
}catch (Exception e){
String connect = get(url);
document = Jsoup.parse(connect);
elements = document.select("div[class=\"news-box\"]>ul[class=\"news-list2\"]>li");
if(elements.size()<1){
return hours;
}
hours=getTime(wxname);
return hours;
}
try {
for (Element element :elements) {
......@@ -129,7 +125,6 @@ public class SouGouSearch {
Matcher matcher = pattern.matcher(index);
if(matcher.find()){
time=matcher.group();
}
if(name.equals(wxname)) {
hours=getBeforeTime(time);
......@@ -239,7 +234,8 @@ public class SouGouSearch {
response = client.execute(httpget);
} catch (UnknownHostException hostEx) {
hostEx.printStackTrace();
System.out.println("代理请求访问异常");
return content;
}
//2、获取实体
HttpEntity entity = response.getEntity();
......@@ -284,19 +280,7 @@ public class SouGouSearch {
try {
// 创建httpget.
HttpGet httpget = new HttpGet(url);
// httpget.addHeader("token"," token");
// httpget.addHeader("Connection","keep-alive");
// httpget.addHeader("Cache-Control","max-age=0");
// httpget.addHeader("Upgrade-Insecure-Requests","1");
// httpget.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
// httpget.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// httpget.addHeader("Sec-Fetch-Site","none");
// httpget.addHeader("Sec-Fetch-Mode","navigate");
// httpget.addHeader("Sec-Fetch-User","?1");
// httpget.addHeader("Sec-Fetch-Dest","document");
// httpget.addHeader("Accept-Language","zh-CN,zh;q=0.9");
// httpget.addHeader("Cookie","IPLOC=CN4101; SUV=001D46FC7D29AE7D61163051AB952530; SUID=D381343DC830A40A0000000061163173; GOTO=Af121011; SNUID=FD28FF2CF0F53E5DB8F35ED0F18CD145; ABTEST=0|1634614242|v1; weixinIndexVisited=1; JSESSIONID=aaaa12QKYfTMweWDxHhUx; ld=Kyllllllll2Pb6lklllllpVmyjllllllKvum1kllll9lllll9Zlll5@@@@@@@@@@");
//
httpget.addHeader("Accept"," text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
httpget.addHeader("Accept-Encoding"," gzip, deflate, br");
httpget.addHeader("Accept-Language"," zh-CN,zh;q=0.9");
......@@ -314,7 +298,12 @@ public class SouGouSearch {
httpget.addHeader("Sec-Fetch-User"," ?1");
httpget.addHeader("Upgrade-Insecure-Requests"," 1");
httpget.addHeader("User-Agent"," Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36");
// System.out.println("executing request " + httpget.getURI());
try {
String wxcookies = JedisUtil.getString("wxcookies");
httpget.addHeader("Cookie", wxcookies);
}catch (Exception e){
httpget.addHeader("Cookie", "ABTEST=0|1660717633|v1; IPLOC=CN1100; SUID=31A373722583A20A0000000062FC8A41; SUID=31A373727050A00A0000000062FC8A41; JSESSIONID=aaaTAV1wUSPUH8_i9Czjy; SUV=00B7E8BD7273A33162FC8A437C13A967; PHPSESSID=4kh8ifemsbo3o9picgb43uot52; weixinIndexVisited=1; ariaDefaultTheme=undefined; seccodeErrorCount=1|Thu, 18 Aug 2022 03:51:16 GMT; SNUID=2FBD6E6F1D18FB66CD8E1E051E83C0CB; seccodeRight=success; successCount=1|Thu, 18 Aug 2022 03:51:28 GMT; refresh=1");
}
// 执行get请求.
CloseableHttpResponse response = httpclient.execute(httpget);
try {
......@@ -433,17 +422,14 @@ public class SouGouSearch {
conn.setRequestMethod("POST");
} catch (Exception e) {
e.printStackTrace();
}finally{
}
String sessionId = "";
String cookieVal = "";
String key = null;
Map<String, List<String>> map2 = conn.getHeaderFields();
// for (String key1 : map2.keySet()) {
// System.out.println(key1 + "--->" + map2.get(key1));
// }
//取cookie
Map<String,String> map=new HashMap<>();
for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){
if(key.equalsIgnoreCase("set-cookie")){
cookieVal = conn.getHeaderField(i);
......
......@@ -207,4 +207,6 @@ public class Constants {
public static final String HUAWEICLOUD_AK= prop.getProperty("HUAWEICLOUD_AK");
public static final String HUAWEICLOUD_SK= prop.getProperty("HUAWEICLOUD_SK");
public static final String REDISCOUNT= prop.getProperty("REDISCOUNT");
}
......@@ -207,4 +207,27 @@ public class FileUtil {
file.delete();
}
}
/**
 * Reads a text file and returns its lines in order.
 *
 * <p>The stream and readers are opened with try-with-resources, so they are closed
 * even when reading fails or the charset name is not supported (the previous version
 * leaked them in those cases).
 *
 * @param file     the file to read
 * @param encoding the charset name used to decode the file; {@code null} means "utf-8"
 * @return the lines of the file without line terminators; empty for an empty file
 * @throws IOException if the file cannot be opened or read, or the charset name is
 *                     not supported
 */
public static List<String> getFileLines(File file, String encoding)
        throws IOException
{
    String charsetName = (encoding == null) ? "utf-8" : encoding;
    List<String> lines = new ArrayList<String>();
    try (FileInputStream stream = new FileInputStream(file);
         InputStreamReader reader = new InputStreamReader(stream, charsetName);
         BufferedReader bufferedReader = new BufferedReader(reader))
    {
        String line;
        while ((line = bufferedReader.readLine()) != null)
        {
            lines.add(line);
        }
    }
    return lines;
}
}
......@@ -43,6 +43,11 @@ public class ObsUpload {
}
/**
 * Manual check: downloads a sample image and wraps the bytes in an in-memory stream.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String sampleImageUrl = "http://www.cggc.ceec.net.cn/picture/0/s_14f1a1a063434205bd17b8769e0746f0.jpg";
    byte[] imageBytes = getImageFromNetByUrl(sampleImageUrl);
    // The stream is created but not used any further.
    InputStream inputStream = new ByteArrayInputStream(imageBytes);
}
/**
* 根据地址获得数据的字节流
*
......
......@@ -22,12 +22,13 @@ KEYWORDS_EXCLE_URL=D\://typeword.xls
RESULT_EXCLE_URL=D\://data//subject//\u56FD\u4F01\u516C\u76CA\u5206\u7C7B//
THREAD_NUM=10
SUBJECT_MEMCACHED_DAYS=0
JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
JWYQJC_MEMCACHED_DAYS=10
TITLE_SIMILARITY_RATE=0.8
MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
CACHE_UPDATE=1
#待采集的redis编码
JWYQJC_INFILE_URL=E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\aa.txt
PROXY=0
PROXYID=1
......@@ -46,7 +47,8 @@ KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
KAFKA_CONSUMER_TOPIC = weChatInfo
#消费信息组
KAFKA_CONSUMER_GROUP_ID=wx-info
#KAFKA_CONSUMER_GROUP_ID=wx-info
KAFKA_CONSUMER_GROUP_ID=wx-infotest
KAFKA_CONSUMER_AUTO_OFFSET_RESET=latest
#KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息保存的topic
......@@ -62,7 +64,8 @@ KAFKA_CONSUMER_PARTITION=0,1,2,3,4,5,6,7
#微信账号名称
#WXSENDNAME= LiuWeiGang
WXSENDNAME= lwg
#WXSENDNAME= lwg
WXSENDNAME= lwg,lwg2,lwg3,lwg4
path= E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\static\\wechat-processor.templete
......@@ -104,7 +107,7 @@ redis.maxTotal=600
redis.maxWaitMillis=1000
redis.testOnBorrow=false
REDISCOUNT=wx1
......
package com.zzsn.caipan;
/**
 * Placeholder class; it currently contains only an empty entry point.
 */
public class JudgeSenium {

    /**
     * Entry point; does nothing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论