Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
M
meta_crawler
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
刘伟刚
meta_crawler
Commits
af30a040
提交
af30a040
authored
7月 27, 2022
作者:
liuweigang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
通用采集代码更新
上级
649ac47c
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
22 个修改的文件
包含
529 行增加
和
68 行删除
+529
-68
doc.txt
comm_crawler/doc.txt
+2
-0
CrawlerStaticApplication.java
...wler/src/main/java/com/zzsn/CrawlerStaticApplication.java
+0
-0
PaserSiteDownload.java
...ler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
+1
-0
SiteThread.java
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
+65
-3
WebContentPaserByRegular.java
...java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
+25
-13
HttpgetUtil.java
...src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
+129
-0
SeleniumTime.java
...rc/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
+1
-0
SeleniumTime4.java
...c/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
+231
-0
WebContentPaserByXpath.java
...a/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
+2
-2
JedisUtil.java
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
+2
-1
KafkaConsumerJob.java
..._crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
+1
-1
ChromeTest.java
comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
+1
-0
HttpClientTester.java
...crawler/src/main/java/com/zzsn/test/HttpClientTester.java
+4
-1
WebTest.java
comm_crawler/src/main/java/com/zzsn/test/WebTest.java
+5
-13
ContentUtility.java
comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
+9
-7
DriverUtil.java
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
+17
-6
Utility.java
comm_crawler/src/main/java/com/zzsn/util/Utility.java
+0
-0
WindowsProcess.java
comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
+17
-12
aa.txt
comm_crawler/src/main/resources/aa.txt
+0
-0
application.properties
comm_crawler/src/main/resources/application.properties
+4
-4
constants.properties
comm_crawler/src/main/resources/constants.properties
+7
-3
redis.properties
comm_crawler/src/main/resources/redis.properties
+6
-2
没有找到文件。
comm_crawler/doc.txt
浏览文件 @
af30a040
...
...
@@ -11,3 +11,5 @@
comm_crawler/src/main/java/com/zzsn/CrawlerStaticApplication.java
浏览文件 @
af30a040
差异被折叠。
点击展开。
comm_crawler/src/main/java/com/zzsn/crawler/PaserSiteDownload.java
浏览文件 @
af30a040
...
...
@@ -469,6 +469,7 @@ public class PaserSiteDownload {
}
public
static
String
getHtml
(
String
url
,
String
charset
)
{
java
.
security
.
Security
.
setProperty
(
"networkaddress.cache.ttl"
,
"0"
);
String
html
=
""
;
CloseableHttpClient
httpClient
=
CreateSSLClientDefault
.
createSSLClientDefault
();
HttpGet
httpgeturl
=
new
HttpGet
(
url
);
// Get请求
...
...
comm_crawler/src/main/java/com/zzsn/crawler/SiteThread.java
浏览文件 @
af30a040
...
...
@@ -2,6 +2,7 @@ package com.zzsn.crawler;
import
cn.hutool.core.date.DateTime
;
import
cn.hutool.core.date.DateUtil
;
import
cn.hutool.core.io.FileUtil
;
import
com.fasterxml.jackson.core.JsonProcessingException
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
com.zzsn.configuration.SpringContextUtil
;
...
...
@@ -11,6 +12,9 @@ import com.zzsn.crawler.paser.WebContentPaserByRegular;
import
com.zzsn.crawler.paser.WebContentPaserByXpath
;
import
com.zzsn.crawler.uriparser.HisURIConfig
;
import
com.zzsn.crawler.uriparser.HisURIParser
;
import
com.zzsn.crawler.uriparser.HttpgetUtil
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.entity.*
;
import
com.zzsn.generation.Constants
;
import
lombok.extern.slf4j.Slf4j
;
...
...
@@ -33,16 +37,74 @@ public class SiteThread implements Runnable{
public
PaserSiteDownload
paserSiteDownload
=
new
PaserSiteDownload
();
public
SiteMsgTemple
siteMsgTemple
=
new
SiteMsgTemple
();
public
KafkaTemplate
kafkaTemplate
=
SpringContextUtil
.
getBean
(
KafkaTemplate
.
class
);
//
public KafkaTemplate kafkaTemplate=SpringContextUtil.getBean(KafkaTemplate.class);
@Override
public
void
run
()
{
crawler
();
}
public
static
PageDownloader
pageDownload
=
new
PageDownloader
();
/**
 * Probes the configured list page of this information source and records
 * sources whose page could not be fetched or came back suspiciously short.
 *
 * <p>The page is fetched with the first strategy that applies:
 * <ol>
 *   <li>if the template carries custom headers, a header-aware download;</li>
 *   <li>otherwise a plain static request via {@code HttpgetUtil.getHtml},
 *       falling back to {@code pageDownload.downloadWithStr} if that throws;</li>
 *   <li>if the static result is empty and the template has
 *       {@code ynDynamicCrawl == 1}, the shared browser driver via
 *       {@code SeleniumTime.getScopehtml}.</li>
 * </ol>
 *
 * <p>When the resulting body is missing or shorter than 1000 characters the
 * source code of the template is appended to a local report file. A missing
 * (null) body is treated as "short" instead of raising a
 * NullPointerException, since a failed download is exactly what the report
 * is meant to capture.
 *
 * <p>NOTE(review): this method only probes the list page; the list/detail
 * parsing that previously lived here appears to continue in
 * {@code crawler2()} — confirm which one {@code run()} is meant to call.
 */
public void crawler(){
    String listUrl = siteMsgTemple.getSiteUri();
    String charset = "utf-8";
    log.info("信息源名称:"+siteMsgTemple.getSiteName()+" 信息源采集开始时间:"+DateTime.now());

    String body;
    if (StringUtils.isNotEmpty(siteMsgTemple.getHeaders())) {
        body = pageDownload.downloadWithStrAddHeader(listUrl, charset, false, false, siteMsgTemple.getHeaders());
    } else {
        try {
            // Try the plain static request first.
            body = HttpgetUtil.getHtml(listUrl);
        } catch (Exception e) {
            log.info(e.getMessage());
            body = pageDownload.downloadWithStr(listUrl, charset, false, false);
        }
        // An empty static response is taken to mean the page is rendered
        // dynamically, so retry through the simulated browser when allowed.
        if (StringUtils.isEmpty(body) && siteMsgTemple.getYnDynamicCrawl() == 1) {
            body = SeleniumTime.getScopehtml(listUrl);
        }
    }

    // Every download path may yield null; treat that the same as a short page.
    if (body == null || body.length() < 1000) {
        // NOTE(review): the report path is hard-coded to a Windows drive and the
        // terminator is "\n\r" (not "\r\n"); both are kept as-is so existing
        // report files stay consistent — consider moving them to configuration.
        FileUtil.appendString(siteMsgTemple.getInfoSourceCode()+"\n\r", "D:\\jingwai.txt", "utf-8");
    }
}
public
void
crawler2
(){
//获取栏目链接以及翻页的链接
List
<
String
>
urlList
=
getPageListUrl
(
siteMsgTemple
);
//兼容就平台的历史链接方法
HisURIParser
hisURIParser
=
new
HisURIParser
();
...
...
@@ -130,7 +192,7 @@ public class SiteThread implements Runnable{
siteMsgRecord
.
setCollectTime
(
collectTime
);
String
docjson
=
mapper
.
writeValueAsString
(
siteMsgRecord
);
kafkaTemplate
.
send
(
Constants
.
KAFKA_COLLECT_TOPIC
,
"key"
,
docjson
);
//
kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
log
.
info
(
"发送到kafka成功。"
);
}
catch
(
JsonProcessingException
e
)
{
// e.printStackTrace();
...
...
comm_crawler/src/main/java/com/zzsn/crawler/paser/WebContentPaserByRegular.java
浏览文件 @
af30a040
...
...
@@ -5,9 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import
com.zzsn.configuration.SpringContextUtil
;
import
com.zzsn.crawler.PaserSiteDownload
;
import
com.zzsn.crawler.outlinkfinder.DefaultOutlinkFinder
;
import
com.zzsn.crawler.uriparser.HtmlPageParser
;
import
com.zzsn.crawler.uriparser.SeleniumTime
;
import
com.zzsn.crawler.uriparser.WebPageScreenShot
;
import
com.zzsn.crawler.uriparser.*
;
import
com.zzsn.download.PageBuilderParser
;
import
com.zzsn.download.PageDownloader
;
import
com.zzsn.entity.*
;
...
...
@@ -61,13 +59,17 @@ public class WebContentPaserByRegular {
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
try
{
//先使用静态网络请求获取列表内容
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
body
=
HttpgetUtil
.
getHtml
(
uri_code
);
// body = pageDownload.downloadWithStr(uri_code, charset, false, false);
}
catch
(
Exception
e
){
log
.
info
(
e
.
getMessage
());
body
=
paserSiteDownload
.
getHtml
(
uri_code
,
charset
);
body
=
pageDownload
.
downloadWithStr
(
uri_code
,
charset
,
false
,
false
);
// body = paserSiteDownload.getHtml(uri_code, charset);
}
//请求返回为空时判断为动态请求使用模拟浏览器的方式
if
(
StringUtils
.
isEmpty
(
body
)
&&
siteMsgTemple
.
getYnDynamicCrawl
()==
1
)
{
// SeleniumTime seleniumTime=new SeleniumTime();
// body = seleniumTime.getScopehtml(uri_code);
body
=
SeleniumTime
.
getScopehtml
(
uri_code
);
}
if
(
StringUtils
.
isEmpty
(
body
)
||
pageDownload
.
isBadDownloadPage
(
body
))
{
...
...
@@ -270,13 +272,17 @@ public class WebContentPaserByRegular {
String
content
=
""
;
try
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
// SeleniumTime seleniumTime=new SeleniumTime();
// content = seleniumTime.getScopehtml(cwbm.getSourceaddress());
content
=
SeleniumTime
.
getScopehtml
(
cwbm
.
getSourceaddress
());
}
else
{
try
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
null
,
false
,
false
);
content
=
HttpgetUtil
.
getHtml
(
cwbm
.
getSourceaddress
());
// content = pageDownload.downloadWithStr(cwbm.getSourceaddress(), null, false, false);
}
catch
(
Exception
e
)
{
content
=
pageDownload
.
downloadWithStr
(
cwbm
.
getSourceaddress
(),
null
,
false
,
false
);
log
.
info
(
e
.
getMessage
());
content
=
paserSiteDownload
.
getHtml
(
cwbm
.
getSourceaddress
(),
null
);
//
content = paserSiteDownload.getHtml(cwbm.getSourceaddress(), null);
}
}
//超时,失效连接反馈,selenium驱动关闭不掉,信息源分类
...
...
@@ -307,11 +313,12 @@ public class WebContentPaserByRegular {
docInfo
.
setTitle
(
cwbm
.
getTitle
()
==
null
?
""
:
cwbm
.
getTitle
().
replace
(
"..."
,
""
));
docInfo
.
setAuthor
(
cwbm
.
getAuthor
());
docInfo
.
setPublishDate
(
cwbm
.
getPublishDate
());
if
(
cwbm
.
getSourceaddress
()
!=
null
)
{
docInfo
.
setOrigin
(
cwbm
.
getSourcesite
());
}
else
{
docInfo
.
setOrigin
(
siteMsgTemple
.
getSiteName
());
}
// if (cwbm.getSourceaddress() != null) {
// docInfo.setOrigin(cwbm.getSourcesite());
// } else {
// docInfo.setOrigin(siteMsgTemple.getSiteName());
// }
docInfo
.
setOrigin
(
siteMsgTemple
.
getSiteName
());
docInfo
.
setSummary
(
cwbm
.
getSummary
());
//封装解析的docinfo对象
try
{
...
...
@@ -533,7 +540,7 @@ public class WebContentPaserByRegular {
}
docInfo
.
setContentWithTag
(
contentWithTag
);
docInfo
.
setContentNoTag
(
Utility
.
TransferHTML2Text
(
contentWithTag
).
replaceAll
(
"\\n"
,
""
));
docInfo
.
setContentNoTag
(
Content
Utility
.
TransferHTML2Text
(
contentWithTag
).
replaceAll
(
"\\n"
,
""
));
}
//作者
...
...
@@ -567,8 +574,13 @@ public class WebContentPaserByRegular {
origin
=
paseElementByCSS
(
doc
,
siteTemplate
.
getDetailExpressionSource
());
if
(
StringUtils
.
isNotEmpty
(
origin
))
{
docInfo
.
setOrigin
(
origin
);
}
else
{
docInfo
.
setOrigin
(
siteTemplate
.
getSiteName
());
}
}
else
{
docInfo
.
setOrigin
(
siteTemplate
.
getSiteName
());
}
return
docInfo
;
}
...
...
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/HttpgetUtil.java
0 → 100644
浏览文件 @
af30a040
package
com
.
zzsn
.
crawler
.
uriparser
;
import
java.io.IOException
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
com.zzsn.download.CreateSSLClientDefault
;
import
com.zzsn.util.Utility
;
import
org.apache.commons.httpclient.params.HttpMethodParams
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.ParseException
;
import
org.apache.http.client.ClientProtocolException
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.params.CoreConnectionPNames
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Connection
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
/**
 * Static helpers for fetching a page over HTTP GET and for guessing the
 * character encoding a page declares in its {@code <meta>} tags.
 */
public class HttpgetUtil {

    /** Pause applied before each network operation, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 500L;

    /** Connect and socket timeout for the GET request, in milliseconds. */
    private static final int HTTP_TIMEOUT_MS = 60000;

    /** User-Agent sent so the request looks like it comes from a browser. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);";

    /** Matches a whole {@code <meta ...>} tag. */
    private static final Pattern META_TAG = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Matches {@code charset} plus whatever follows up to whitespace, quote, ';' or '>'. */
    private static final Pattern CHARSET_UNQUOTED = Pattern.compile("charset[^\\s||\"||;||'||>]*");

    /** Matches {@code charset="value} (double-quoted form, closing quote excluded). */
    private static final Pattern CHARSET_QUOTED = Pattern.compile("charset=\"[^\\s||\"||;||>]*");

    /** Length of the literal {@code charset=} prefix. */
    private static final int UNQUOTED_PREFIX_LENGTH = 8;

    /** Length of the literal {@code charset="} prefix. */
    private static final int QUOTED_PREFIX_LENGTH = 9;

    /**
     * Downloads {@code url} with an HTTP GET and returns the response body as text.
     *
     * <p>The charset is taken from the response's Content-Type when present,
     * otherwise it is guessed with {@link #LocateCharSet(String)}; the result is
     * then passed through {@code Utility.charsetcheck}.
     *
     * @param url address to fetch
     * @return the decoded body, or an empty string if the body could not be read
     * @throws IllegalStateException if the request itself fails or the response
     *         carries no entity. Previously a failed request was swallowed and
     *         surfaced as a NullPointerException a few lines later; callers that
     *         catch {@code Exception} to fall back to another downloader keep working.
     */
    public static String getHtml(String url) {
        // NOTE(review): the client is not closed here because it is not visible
        // whether createSSLClientDefault() returns a shared instance — confirm.
        CloseableHttpClient httpClient = CreateSSLClientDefault.createSSLClientDefault();
        HttpGet httpgeturl = new HttpGet(url);
        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, HTTP_TIMEOUT_MS);
        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, HTTP_TIMEOUT_MS);
        // Present the request as coming from a browser.
        httpgeturl.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        httpgeturl.setHeader("User-Agent", BROWSER_USER_AGENT);
        httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");

        try {
            HttpResponse httprespse = execute(httpClient, httpgeturl, url);
            HttpEntity entitydata = httprespse.getEntity();
            if (entitydata == null) {
                throw new IllegalStateException("No response entity returned for " + url);
            }

            String charstype = EntityUtils.getContentCharSet(entitydata);
            String charset = charstype != null ? charstype : LocateCharSet(url);
            charset = Utility.charsetcheck(charset);

            return readBody(entitydata, charset);
        } finally {
            // Always hand the connection back, including on failure paths.
            httpgeturl.releaseConnection();
        }
    }

    /** Sends the request after the standard delay, wrapping failures with the URL and cause. */
    private static HttpResponse execute(CloseableHttpClient httpClient, HttpGet request, String url) {
        try {
            Thread.sleep(REQUEST_DELAY_MS);
            return httpClient.execute(request);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted before requesting " + url, e);
        } catch (IOException e) {
            throw new IllegalStateException("HTTP GET failed for " + url, e);
        }
    }

    /** Decodes the entity with {@code charset}; returns an empty string if decoding fails. */
    private static String readBody(HttpEntity entitydata, String charset) {
        String infodata = "";
        try {
            Thread.sleep(REQUEST_DELAY_MS);
            infodata = EntityUtils.toString(entitydata, charset);
        } catch (InterruptedException e) {
            // Keep the best-effort empty result but do not lose the interrupt.
            Thread.currentThread().interrupt();
        } catch (Exception e1) {
            e1.printStackTrace();
        }
        return infodata;
    }

    /**
     * Fetches {@code url} with Jsoup and returns the charset declared by the first
     * {@code <meta>} tag that mentions one.
     *
     * @param url address of the page to inspect
     * @return the declared charset; {@code "GB2312"} when a declaration is present
     *         but empty; {@code "gb2312"} when no declaration is found or the page
     *         cannot be fetched
     */
    public static String LocateCharSet(String url) {
        String encoding = "gb2312";
        try {
            Thread.sleep(REQUEST_DELAY_MS);
            Connection conn = Jsoup.connect(url);
            // Present the request as coming from a browser.
            conn.header("User-Agent", BROWSER_USER_AGENT);
            Document doc = conn.ignoreContentType(true).timeout(10000).get();
            String declared = extractCharset(doc.toString());
            if (declared != null) {
                encoding = declared;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.out.println("获取出错编码方式");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("获取出错编码方式");
        }
        return encoding;
    }

    /**
     * Returns the charset declared in the first {@code <meta>} tag of {@code html}
     * that contains the word {@code charset}, or {@code null} if there is none.
     */
    private static String extractCharset(String html) {
        Matcher metaTags = META_TAG.matcher(html);
        while (metaTags.find()) {
            String tag = metaTags.group();
            Matcher unquoted = CHARSET_UNQUOTED.matcher(tag);
            if (!unquoted.find()) {
                continue;
            }
            String token = unquoted.group();
            // A bare "charset" (7 chars) used to make substring(8) throw; treat it as empty.
            String found = token.length() >= UNQUOTED_PREFIX_LENGTH
                    ? token.substring(UNQUOTED_PREFIX_LENGTH)
                    : "";
            if (found.trim().length() == 0) {
                // charset="utf-8": the unquoted pattern stops at the quote, so retry quoted.
                Matcher quoted = CHARSET_QUOTED.matcher(tag);
                if (quoted.find()) {
                    found = quoted.group().substring(QUOTED_PREFIX_LENGTH);
                }
                if (found.trim().length() == 0) {
                    found = "GB2312";
                }
            }
            return found;
        }
        return null;
    }
}
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime.java
浏览文件 @
af30a040
...
...
@@ -32,6 +32,7 @@ public class SeleniumTime {
public
static
String
getScopehtml
(
String
url
)
{
String
html
=
""
;
try
{
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
try
{
Duration
duration
=
Duration
.
of
(
100
,
ChronoUnit
.
SECONDS
);
...
...
comm_crawler/src/main/java/com/zzsn/crawler/uriparser/SeleniumTime4.java
0 → 100644
浏览文件 @
af30a040
package
com
.
zzsn
.
crawler
.
uriparser
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileOutputStream
;
import
java.io.PrintStream
;
import
com.zzsn.generation.Constants
;
import
org.openqa.selenium.By
;
import
org.openqa.selenium.WebElement
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.chrome.ChromeOptions
;
public
class
SeleniumTime4
{
public
ChromeOptions
chromeOptions
=
new
ChromeOptions
()
;
public
ChromeDriver
driver
;
public
SeleniumTime4
(){
// System.setProperty("webdriver.chrome.driver", "E:\\cmd\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "D:\\cmdvip\\chromedriver.exe");
// System.setProperty("webdriver.chrome.driver", "E:\\chrome\\chromedriver.exe");
System
.
setProperty
(
"webdriver.chrome.driver"
,
Constants
.
CHROMEDRIVE
);
// System.setProperty("webdriver.chrome.bin", "C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe");
chromeOptions
.
addArguments
(
"blink-settings=imagesEnabled=false"
);
// chromeOptions.addArguments("user-data-dir=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\User Data\\Default");
// chromeOptions.addArguments("--start-maximized");
// chromeOptions.addArguments("--headless");
driver
=
new
ChromeDriver
(
chromeOptions
);
}
/**
* 根据网址获取网页html信息
* @param url
* @return
*/
public
String
getScopehtml
(
String
url
){
//=====================================================================================================
// ChromeOptions chromeOptions =new ChromeOptions();
//// System.setProperty("webdriver.chrome.driver", Constants.CHROMEDRIVE);
// System.setProperty("webdriver.chrome.driver", "D:\\project\\cmd\\chromedriver.exe");
// //System.setProperty("webdriver.chrome.bin", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //chromeOptions.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
// //C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe
// //C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
// //chromeOptions.addArguments("--headless");
// ChromeDriver driver = new ChromeDriver(chromeOptions);
//=====================================================================================================
try
{
driver
.
get
(
url
);
WebElement
webElement
=
driver
.
findElement
(
By
.
xpath
(
"/html"
));
try
{
Thread
.
sleep
(
3000
l
);
String
html
=
webElement
.
getAttribute
(
"outerHTML"
);
Thread
.
sleep
(
5000
l
);
driver
.
quit
();
// System.out.println(html);
if
(
url
.
contains
(
"http://www.flw.ph"
)){
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
if
(
html
.
contains
(
a
)&&
html
.
contains
(
b
)){
String
[]
split
=
html
.
split
(
a
);
String
sa
=
split
[
0
];
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
String
sab
=
sa
+
substring
;
return
sab
;
}
}
return
html
;
}
catch
(
Exception
e
){
System
.
out
.
println
(
"动态爬取方式一出现+"
+
"org.openqa.selenium.StaleElementReferenceException异常"
+
"可能原因为过快的执行没有找到指定的页面元素"
);
System
.
out
.
println
(
"=============执行方法二=============="
);
Thread
.
sleep
(
3000
l
);
String
html
=
driver
.
getPageSource
();
Thread
.
sleep
(
5000
l
);
driver
.
quit
();
if
(
url
.
contains
(
"http://www.flw.ph"
)){
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
if
(
html
.
contains
(
a
)&&
html
.
contains
(
b
)){
String
[]
split
=
html
.
split
(
a
);
String
sa
=
split
[
0
];
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
String
sab
=
sa
+
substring
;
return
sab
;
}
}
return
html
;
}
// Thread.sleep(3000l);
// String source = driver.getPageSource();
// //if(source.length()!=0){
// driver.quit();
// return source;
//}
// String html = webElement.getAttribute("outerHTML");
// //System.out.println(html);
// driver.quit();
// return html;
//==========================================================================
// driver.get(url);
// // 休眠1s,为了让js执行完
// Thread.sleep(1000l);
// // 网页源码
// String source = driver.getPageSource();
// System.out.println("进入SeleniumTime中的getScopehtml方法获取相应的html");
// driver.quit();
// return source;
}
catch
(
Exception
e
){
try
{
Thread
.
sleep
(
5000
l
);
}
catch
(
InterruptedException
e1
)
{
// TODO Auto-generated catch block
e1
.
printStackTrace
();
}
driver
.
quit
();
e
.
printStackTrace
();
}
try
{
Thread
.
sleep
(
5000
l
);
}
catch
(
InterruptedException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
}
driver
.
quit
();
return
null
;
}
public
static
void
main
(
String
[]
args
)
{
//去除html中的相关标签
/**
* 网上大多是说明直接使用正则表达式不能很好的适用于html
* 经过尝试我无法删除先关div中内容,只能自己通过字符串切割的形式获取
*/
SeleniumTime4
s
=
new
SeleniumTime4
();
String
scopehtml
=
s
.
getScopehtml
(
"http://www.flw.ph/thread-869016-1-1.html"
);
String
a
=
"<div class=\"attach_nopermission attach_tips\">"
;
String
b
=
"<span class=\"atips_close\" onclick=\"this.parentNode.style.display='none'\">x</span>"
;
System
.
out
.
println
(
"开始"
);
if
(
scopehtml
.
contains
(
a
)){
System
.
out
.
println
(
"包含a"
);
}
if
(
scopehtml
.
contains
(
a
)){
System
.
out
.
println
(
"包含b"
);
}
System
.
out
.
println
(
"结束"
);
String
[]
split
=
scopehtml
.
split
(
a
);
String
sa
=
split
[
0
];
System
.
out
.
println
(
"首次截取的长度"
+
split
.
length
);
String
[]
split2
=
split
[
1
].
split
(
b
);
String
sb
=
split2
[
1
];
String
substring
=
sb
.
substring
(
7
);
System
.
out
.
println
(
"再次截取的长度"
+
split2
.
length
);
String
sab
=
sa
+
substring
;
// //解决方式 正则匹配删除标签
// // *.div[class="t_fsz"]
// String regex = "<div class=\"attach_nopermission attach_tips\">(.*?)</div>";
// //String regex = "<div.*?>(.*?)</div>";
// //String regex = "*.div[class="+"attach_nopermission attach_tips"+"]";
//
//// boolean isMatch = regex.matches(scopehtml);
//// System.out.println("字符串中是否包含了 'runoob' 子字符串? " + isMatch);
////
// // 创建 Pattern 对象
// Pattern r = Pattern.compile(regex);
//
// // 现在创建 matcher 对象
// Matcher m = r.matcher(scopehtml);
// if (m.find( )) {
// System.out.println("Found value: " + m.group(0) );
// System.out.println("Found value: " + m.group(1) );
// System.out.println("Found value: " + m.group(2) );
// System.out.println("Found value: " + m.group(3) );
// } else {
// System.out.println("NO MATCH");
// }
//
//
File
file
=
new
File
(
"D:/123.txt"
);
try
{
PrintStream
ps
=
new
PrintStream
(
new
FileOutputStream
(
file
));
ps
.
println
(
sab
);
}
catch
(
FileNotFoundException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
}
}
}
comm_crawler/src/main/java/com/zzsn/crawlerOther/paser/WebContentPaserByXpath.java
浏览文件 @
af30a040
...
...
@@ -89,8 +89,8 @@ public class WebContentPaserByXpath {
body
=
pageDownload
.
downloadWithStrAddHeader
(
uri_code
,
charset
,
false
,
false
,
siteMsgTemple
.
getHeaders
());
}
else
{
if
(
siteMsgTemple
.
getYnDynamicCrawl
()
==
1
)
{
seleniumTime
=
new
SeleniumTime
();
body
=
s
eleniumTime
.
getScopehtml
(
uri_code
);
//
seleniumTime=new SeleniumTime();
body
=
S
eleniumTime
.
getScopehtml
(
uri_code
);
TimeUnit
.
SECONDS
.
sleep
(
5
);
seleniumTime
.
close
();
}
else
{
...
...
comm_crawler/src/main/java/com/zzsn/job/JedisUtil.java
浏览文件 @
af30a040
...
...
@@ -165,7 +165,8 @@ public class JedisUtil {
throw
new
Exception
(
"key is null"
);
}
jedis
=
getDefaultJedis
();
value
=
jedis
.
get
(
PREFIX
+
key
);
// value = jedis.get(PREFIX + key);
value
=
jedis
.
get
(
key
);
}
catch
(
Exception
e
){
}
finally
{
...
...
comm_crawler/src/main/java/com/zzsn/job/KafkaConsumerJob.java
浏览文件 @
af30a040
...
...
@@ -62,7 +62,7 @@ public class KafkaConsumerJob {
// , Constants.THREAD_SIZE , 0, TimeUnit.SECONDS, new ArrayBlockingQueue<>(1));
@Scheduled
(
cron
=
"0 0/2 * * * ?"
)
//
@Scheduled(cron = "0 0/2 * * * ?")
// @Async("asyncTaskExecutor")
public
void
consumer
(){
// ExecutorService threadPool = Executors.newFixedThreadPool(Constants.THREAD_SIZE);
...
...
comm_crawler/src/main/java/com/zzsn/test/ChromeTest.java
浏览文件 @
af30a040
...
...
@@ -98,6 +98,7 @@ public class ChromeTest {
// 可复用驱动使用Demo
public
static
void
main
(
String
[]
args
)
throws
Exception
{
ReuseWebDriver
driver
=
DriverUtil
.
getChromeDriver
();
if
(
driver
==
null
)
{
// 从缓存取出SessionId为空才时,驱动会返回null,可参考工具类重新设置缓存
...
...
comm_crawler/src/main/java/com/zzsn/test/HttpClientTester.java
浏览文件 @
af30a040
package
com
.
zzsn
.
test
;
import
com.zzsn.crawler.uriparser.HttpgetUtil
;
import
com.zzsn.download.PageBuilderParser
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.NameValuePair
;
...
...
@@ -41,7 +42,9 @@ import java.util.List;
public
class
HttpClientTester
{
private
static
PageBuilderParser
builderParser
=
null
;
public
static
void
main
(
String
[]
args
)
{
get
(
"https://www.cas.cn/zjs/"
);
// get("https://edition.cnn.com/world");
String
html
=
HttpgetUtil
.
getHtml
(
"https://edition.cnn.com/world"
);
System
.
out
.
println
(
html
);
// post();
}
...
...
comm_crawler/src/main/java/com/zzsn/test/WebTest.java
浏览文件 @
af30a040
package
com
.
zzsn
.
test
;
import
com.zzsn.crawler.PaserSiteDownload
;
import
com.zzsn.download.PageDownloader
;
import
java.io.IOException
;
...
...
@@ -17,21 +18,12 @@ import java.io.InputStream;
public
class
WebTest
{
public
static
void
main
(
String
[]
args
)
{
//
String url="https://www.teriin.org/opinion";
String
url
=
"https://www.teriin.org/opinion"
;
// PageDownloader pageDownload=new PageDownloader();
// String body = pageDownload.downloadWithStr(url, "utf-8", false, false);
// System.out.println(body);
try
{
Runtime
mt
=
Runtime
.
getRuntime
();
String
cmd
=
"taskkill /F /im chrome.exe"
;
Process
pro
=
mt
.
exec
(
cmd
);
InputStream
ers
=
pro
.
getErrorStream
();
pro
.
waitFor
();
System
.
out
.
println
(
"++++++++ taskkill /F /im chromedriver.exe"
);
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
catch
(
InterruptedException
e
)
{
// TODO Auto-generated catch block
}
PaserSiteDownload
paserSiteDownload
=
new
PaserSiteDownload
();
String
html
=
paserSiteDownload
.
getHtml
(
"https://edition.cnn.com/world"
,
"utf-8"
);
System
.
out
.
println
(
html
);
}
}
comm_crawler/src/main/java/com/zzsn/util/ContentUtility.java
浏览文件 @
af30a040
...
...
@@ -287,15 +287,17 @@ public class ContentUtility {
if
(
htmlText
==
null
){
return
null
;
}
String
text
=
ContentUtility
.
HTMLDecode
(
ContentUtility
.
RemoveHTMLCode
(
ContentUtility
.
RemoveStyleCode
(
Content
Utility
.
RemoveHTMLReturnCode
(
htmlText
))));
text
=
text
.
replaceAll
(
" "
,
"\r\n"
);
String
text
=
Utility
.
HTMLDecode
(
Utility
.
RemoveHTMLCode
(
Utility
.
RemoveStyleCode
(
Utility
.
RemoveHTMLReturnCode
(
htmlText
))));
text
=
text
.
replaceAll
(
" "
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +\r\n"
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" +\r\n"
,
"\r\n"
);
text
=
text
.
replaceAll
(
" +"
,
" "
);
text
=
text
.
replaceAll
(
"[\\u00A0\\u3000]"
,
""
);
text
=
text
.
replaceAll
(
" "
,
""
);
text
=
text
.
replaceAll
(
" \n"
,
"\n"
);
text
=
text
.
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
).
replaceAll
(
"\n\n"
,
"\n"
);
return
text
;
return
text
;
}
...
...
comm_crawler/src/main/java/com/zzsn/util/DriverUtil.java
浏览文件 @
af30a040
...
...
@@ -58,10 +58,10 @@ public class DriverUtil {
}
public
static
ReuseWebDriver
connectChrome
(
String
sessionId
,
String
serverUrl
)
throws
Exception
{
if
(
serverUrl
==
null
||
""
.
equals
(
serverUrl
)
||
sessionId
==
null
||
""
.
equals
(
sessionId
))
{
log
.
error
(
"未获取到驱动服务地址、sessionId"
);
return
null
;
}
//
if (serverUrl == null || "".equals(serverUrl) || sessionId == null || "".equals(sessionId)) {
//
log.error("未获取到驱动服务地址、sessionId");
//
return null;
//
}
ReuseWebDriver
driver
=
new
ReuseWebDriver
(
serverUrl
,
sessionId
);
if
(
driver
.
connectTestFail
())
{
...
...
@@ -89,10 +89,21 @@ public class DriverUtil {
* @date 2022/7/25 15:07
*/
public
static
ReuseWebDriver
getChromeDriver
()
throws
Exception
{
String
cacheInfo
=
JedisUtil
.
getString
(
Constants
.
SELENIUM_DRIVER_CACHE
);
Map
<
String
,
String
>
map
=
JSON
.
parseObject
(
cacheInfo
,
Map
.
class
);
Map
<
String
,
String
>
map
=
getSessionInfo
();
String
sessionId
=
map
.
get
(
"sessionId"
);
String
serverUrl
=
map
.
get
(
"serverUrl"
);
return
connectChrome
(
sessionId
,
serverUrl
);
}
/**
 * Reads the cached browser-driver session description.
 *
 * <p>The cache entry under {@code Constants.SELENIUM_DRIVER_CACHE} is parsed
 * as a JSON object. When it is missing or empty, a placeholder entry is built,
 * written back to the cache and returned instead.
 *
 * @return a map holding the {@code sessionId} and {@code serverUrl} entries
 * @throws Exception declared for callers; the visible body only delegates to
 *         {@code JedisUtil} and {@code JSON}
 */
public static Map<String, String> getSessionInfo() throws Exception {
    Map<String, String> cached =
            JSON.parseObject(JedisUtil.getString(Constants.SELENIUM_DRIVER_CACHE), Map.class);
    if (cached != null && !cached.isEmpty()) {
        return cached;
    }

    Map<String, String> placeholder = new HashMap<>(2);
    placeholder.put("sessionId", "sessionId");
    placeholder.put("serverUrl", "https://www.baidu.com/");
    // Store the browser driver info in the cache.
    String serialized = JSON.toJSONString(placeholder);
    JedisUtil.setString(Constants.SELENIUM_DRIVER_CACHE, serialized, -1);
    return placeholder;
}
}
comm_crawler/src/main/java/com/zzsn/util/Utility.java
浏览文件 @
af30a040
差异被折叠。
点击展开。
comm_crawler/src/main/java/com/zzsn/util/WindowsProcess.java
浏览文件 @
af30a040
...
...
@@ -2,6 +2,8 @@ package com.zzsn.util;
import
com.zzsn.crawler.ReuseWebDriver
;
import
lombok.extern.slf4j.Slf4j
;
import
org.springframework.scheduling.annotation.EnableScheduling
;
import
org.springframework.scheduling.annotation.Scheduled
;
import
java.io.BufferedReader
;
import
java.io.InputStreamReader
;
...
...
@@ -13,10 +15,12 @@ import java.util.regex.Pattern;
*/
@Slf4j
@SuppressWarnings
(
"all"
)
@EnableScheduling
public
class
WindowsProcess
{
private
static
Pattern
TASK_LIST_PATTERN
=
Pattern
.
compile
(
"^(.+?)\\s+(\\d+)\\s+(.+?)\\s+\\d+\\s+([0-9,]+)\\s+K$"
);
private
static
String
DRIVER_NAME
=
"chrome.exe"
;
private
static
String
CHROME_NAME
=
"chrome.exe"
;
private
static
String
DRIVER_NAME
=
"chromedriver.exe"
;
public
static
void
main
(
String
[]
args
)
{
WindowsProcess
process
=
new
WindowsProcess
();
...
...
@@ -28,6 +32,7 @@ public class WindowsProcess {
* @author andylau
* @date 2022/7/26 11:23
*/
// @Scheduled(cron = "0 0 1 * * ?")
private
void
killProcess
()
{
try
{
String
line
;
...
...
@@ -35,14 +40,14 @@ public class WindowsProcess {
BufferedReader
input
=
new
BufferedReader
(
new
InputStreamReader
(
p
.
getInputStream
()));
while
((
line
=
input
.
readLine
())
!=
null
)
{
if
(
line
.
contains
(
DRIVER_NAME
)
)
{
if
(
line
.
contains
(
CHROME_NAME
)||
line
.
contains
(
DRIVER_NAME
)
)
{
Matcher
matcher
=
TASK_LIST_PATTERN
.
matcher
(
line
);
if
(
matcher
.
find
())
{
//
String serviceName = matcher.group(1);
String
pid
=
matcher
.
group
(
2
);
//
String sessionName = matcher.group(3);
//
String size = matcher.group(4).replace(",", "") + "K";
//
log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
//
String serviceName = matcher.group(1);
String
pid
=
matcher
.
group
(
2
);
//
String sessionName = matcher.group(3);
//
String size = matcher.group(4).replace(",", "") + "K";
//
log.info("正在关闭服务:\n服务名:{}\nPid:{}\n会话名:{}\n内存使用:{}\n", serviceName, pid, sessionName, size);
Runtime
.
getRuntime
().
exec
(
"taskkill /pid "
+
pid
);
}
}
...
...
@@ -51,11 +56,11 @@ public class WindowsProcess {
log
.
error
(
"浏览器驱动关闭异常..."
);
}
finally
{
// 定时任务关闭驱动后,重新打开驱动
try
{
reopenChromeDriver
();
}
catch
(
Exception
e
)
{
log
.
error
(
"驱动打开异常..."
);
}
//
try {
//
reopenChromeDriver();
//
} catch (Exception e) {
//
log.error("驱动打开异常...");
//
}
}
}
...
...
comm_crawler/src/main/resources/aa.txt
0 → 100644
浏览文件 @
af30a040
差异被折叠。
点击展开。
comm_crawler/src/main/resources/application.properties
浏览文件 @
af30a040
...
...
@@ -5,9 +5,9 @@ spring.profiles.active:=dev
server.port
=
8081
spring.http.encoding.force
=
true
spring.http.encoding.charset
=
UTF-8
spring.http.encoding.enabled
=
true
#
spring.http.encoding.force=true
#
spring.http.encoding.charset=UTF-8
#
spring.http.encoding.enabled=true
spring.thymeleaf.cache
=
false
spring.thymeleaf.enabled
=
false
...
...
@@ -47,7 +47,7 @@ boiler.timeout.readTimeout=6000
logging.level.root
=
info
logging.level.org.springframework.web
=
info
logging
levelorg
hibernate
=
info
logging
.level.org.
hibernate
=
info
logging.config
=
classpath:logback-spring.xml
kafka.consumer.task
=
0 0/2 * * * ?
...
...
comm_crawler/src/main/resources/constants.properties
浏览文件 @
af30a040
...
...
@@ -35,7 +35,7 @@ PROXYID=1
#线程池大小
THREAD_SIZE
=
1
#
CHROMEDRIVE
=
D
:
\\
chrome
\\
chromedriver.exe
CHROMEDRIVE
=
E
:
\\
chrome
\\
chromedriver.exe
CHROMEBIN
=
C:
\\
Program Files
\\
Google
\\
Chrome
\\
Application
\\
chrome.exe
USER_DATA_DIR
=
C:
\\
Users
\\
WIN10
\\
AppData
\\
Local
\\
Google
\\
Chrome
\\
User Data
\\
Default
...
...
@@ -72,6 +72,9 @@ KAFKA_PRODUCT_PARTITION=0
redis.host
=
114.116.26.150
redis.port
=
6379
redis.pass
=
zzsn9988
#redis.host=114.115.236.206
#redis.port=6379
#redis.pass=clbzzsn
#redis.host=8.130.30.33
#redis.port=9010
#redis.pass=wxadS&jklim
...
...
@@ -89,9 +92,10 @@ HUAWEICLOUD_BUCKET_NAME= zzsn
HUAWEICLOUD_AK
=
VEHN7D0TJ9316H8AHCAV
HUAWEICLOUD_SK
=
heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY
IMGPATH
=
E:
\\
chrome
\\
img
\\
shot.png
#
IMGPATH= E:\\chrome\\img\\shot.png
IMGPATH
=
E:
\\
ideaWorkerspace
\\
meta_crawler
\\
comm_crawler
\\
src
\\
main
\\
resources
\\
aa.txt
selenium.driver.cache
=
comm_selenium_driver_cache_1
...
...
comm_crawler/src/main/resources/redis.properties
浏览文件 @
af30a040
# Redis settings
redis.host
=
1
27.0.0.1
redis.host
=
1
14.115.236.206
redis.port
=
6379
redis.pass
=
xxxxxx
redis.pass
=
clbzzsn
redis.timeout
=
10000
#redis.host=127.0.0.1
#redis.port=6379
#redis.pass=xxxxxx
#redis.timeout=10000
redis.maxIdle
=
300
redis.maxTotal
=
600
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论