Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
M
meta_crawler
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
刘伟刚
meta_crawler
Commits
80a9df55
提交
80a9df55
authored
9月 06, 2022
作者:
liuweigang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
采集代码更新4
上级
9f957d5a
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
9 行增加
和
8 行删除
+9
-8
MetaBaiduSearchThread.java
.../src/main/java/com/zzsn/search/MetaBaiduSearchThread.java
+6
-6
RecorderUtil.java
...arch/src/main/java/com/zzsn/search/util/RecorderUtil.java
+3
-2
没有找到文件。
baidu_search/src/main/java/com/zzsn/search/MetaBaiduSearchThread.java
浏览文件 @
80a9df55
...
@@ -132,7 +132,7 @@ public class MetaBaiduSearchThread implements Runnable {
...
@@ -132,7 +132,7 @@ public class MetaBaiduSearchThread implements Runnable {
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
log
.
info
(
"url:"
+
url
);
log
.
info
(
"url:"
+
url
);
String
charset
=
"utf-8"
;
String
charset
=
"utf-8"
;
Long
orgId
=
Long
.
parseLong
(
keywordMsg
.
get
WordsCode
());
//关键词组编码
Long
orgId
=
Long
.
parseLong
(
keywordMsg
.
get
Id
());
//关键词组编码
Long
tid
=
Long
.
parseLong
(
keywordMsg
.
getId
());
//关键词组id
Long
tid
=
Long
.
parseLong
(
keywordMsg
.
getId
());
//关键词组id
for
(
int
i
=
0
;
i
<
6
;
i
++)
{
for
(
int
i
=
0
;
i
<
6
;
i
++)
{
String
urla
=
url1
.
replace
(
"[keyword]"
,
kWord
);
String
urla
=
url1
.
replace
(
"[keyword]"
,
kWord
);
...
@@ -141,7 +141,7 @@ public class MetaBaiduSearchThread implements Runnable {
...
@@ -141,7 +141,7 @@ public class MetaBaiduSearchThread implements Runnable {
urla
=
urla
.
replace
(
"[pn]"
,
i
*
10
+
""
);
urla
=
urla
.
replace
(
"[pn]"
,
i
*
10
+
""
);
urlList
.
add
(
urla
);
urlList
.
add
(
urla
);
}
}
List
<
CatchWebByMetaSearch
>
catchWebByMetaSearches
=
RecorderUtil
.
catchWebOfBaiduList
(
urlList
,
charset
,
orgId
,
tid
,
keyWord
);
List
<
CatchWebByMetaSearch
>
catchWebByMetaSearches
=
RecorderUtil
.
catchWebOfBaiduList
(
urlList
,
charset
,
orgId
,
tid
,
keyWord
,
keywordMsg
);
try
{
try
{
//对关键词进行缓存判断 开始时间和结束时间
//对关键词进行缓存判断 开始时间和结束时间
JedisUtil
.
sadd
(
keyid
,
kWord
);
JedisUtil
.
sadd
(
keyid
,
kWord
);
...
@@ -197,7 +197,7 @@ public class MetaBaiduSearchThread implements Runnable {
...
@@ -197,7 +197,7 @@ public class MetaBaiduSearchThread implements Runnable {
}
}
// 抓取新闻内容
// 抓取新闻内容
public
int
CatchWebNews
(
List
<
CatchWebByMetaSearch
>
catchWebList
,
String
keyword
)
{
public
int
CatchWebNews
(
List
<
CatchWebByMetaSearch
>
catchWebList
,
String
keyword
,
KeywordMsg
keywordMsg
)
{
int
repeat
=
0
;
int
repeat
=
0
;
try
{
try
{
int
count
=
0
;
int
count
=
0
;
...
@@ -205,7 +205,7 @@ public class MetaBaiduSearchThread implements Runnable {
...
@@ -205,7 +205,7 @@ public class MetaBaiduSearchThread implements Runnable {
try
{
try
{
CatchWebByMetaSearch
cwbm
=
catchWebList
.
get
(
i
);
CatchWebByMetaSearch
cwbm
=
catchWebList
.
get
(
i
);
// 判断该网址是否存在于缓存池中
// 判断该网址是否存在于缓存池中
String
orgId
=
String
.
valueOf
(
cwbm
.
getOrgId
());
String
orgId
=
String
.
valueOf
(
keywordMsg
.
getWordsCode
());
try
{
try
{
boolean
sismember
=
JedisUtil
.
sismember
(
"baidu::"
+
orgId
,
cwbm
.
getSourceaddress
());
boolean
sismember
=
JedisUtil
.
sismember
(
"baidu::"
+
orgId
,
cwbm
.
getSourceaddress
());
if
(
sismember
)
{
if
(
sismember
)
{
...
@@ -366,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable {
...
@@ -366,13 +366,13 @@ public class MetaBaiduSearchThread implements Runnable {
log
.
info
(
"title:"
+
docInfo
.
getTitle
()+
"|address:"
+
docInfo
.
getSourceaddress
()+
log
.
info
(
"title:"
+
docInfo
.
getTitle
()+
"|address:"
+
docInfo
.
getSourceaddress
()+
"|content:"
+(
docInfo
.
getContentNoTag
()==
null
?
""
:
docInfo
.
getContentNoTag
().
length
()+
""
));
"|content:"
+(
docInfo
.
getContentNoTag
()==
null
?
""
:
docInfo
.
getContentNoTag
().
length
()+
""
));
intsertData
(
docInfo
);
//
intsertData(docInfo);
//信息转换
//信息转换
ClbAnsProcessitem
processitem
=
docInfoTrans2Processitem
(
docInfo
);
ClbAnsProcessitem
processitem
=
docInfoTrans2Processitem
(
docInfo
);
ObjectMapper
mapper
=
new
ObjectMapper
();
ObjectMapper
mapper
=
new
ObjectMapper
();
String
docjson
=
mapper
.
writeValueAsString
(
processitem
);
String
docjson
=
mapper
.
writeValueAsString
(
processitem
);
System
.
out
.
println
(
docjson
);
System
.
out
.
println
(
docjson
);
//
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, docjson);
kafkaTemplate
.
send
(
Constants
.
KAFKA_PRODUCT_TOPIC
,
docjson
);
log
.
info
(
"发送成功到kafka"
);
log
.
info
(
"发送成功到kafka"
);
}
else
{
}
else
{
log
.
info
(
"资讯发布时间:"
+
docInfo
.
getPublishDate
());
log
.
info
(
"资讯发布时间:"
+
docInfo
.
getPublishDate
());
...
...
baidu_search/src/main/java/com/zzsn/search/util/RecorderUtil.java
浏览文件 @
80a9df55
...
@@ -3,6 +3,7 @@ package com.zzsn.search.util;
...
@@ -3,6 +3,7 @@ package com.zzsn.search.util;
import
cn.hutool.core.util.RandomUtil
;
import
cn.hutool.core.util.RandomUtil
;
import
com.zzsn.search.BaiduSearchThread
;
import
com.zzsn.search.BaiduSearchThread
;
import
com.zzsn.search.MetaBaiduSearchThread
;
import
com.zzsn.search.MetaBaiduSearchThread
;
import
com.zzsn.search.entity.KeywordMsg
;
import
com.zzsn.search.oracledb.OracleDBManager
;
import
com.zzsn.search.oracledb.OracleDBManager
;
import
com.zzsn.search.oracledb.OracleDataTable
;
import
com.zzsn.search.oracledb.OracleDataTable
;
import
com.zzsn.utility.index.Constants
;
import
com.zzsn.utility.index.Constants
;
...
@@ -303,7 +304,7 @@ public class RecorderUtil {
...
@@ -303,7 +304,7 @@ public class RecorderUtil {
// 提取百度新闻列表URL
// 提取百度新闻列表URL
@SuppressWarnings
(
"deprecation"
)
@SuppressWarnings
(
"deprecation"
)
public
static
List
<
CatchWebByMetaSearch
>
catchWebOfBaiduList
(
public
static
List
<
CatchWebByMetaSearch
>
catchWebOfBaiduList
(
List
<
String
>
urlList
,
String
charset
,
Long
orgId
,
Long
tid
,
String
keywords
)
{
List
<
String
>
urlList
,
String
charset
,
Long
orgId
,
Long
tid
,
String
keywords
,
KeywordMsg
keywordMsg
)
{
List
<
CatchWebByMetaSearch
>
catchWebByMetaSearchList
=
new
ArrayList
<
CatchWebByMetaSearch
>();
List
<
CatchWebByMetaSearch
>
catchWebByMetaSearchList
=
new
ArrayList
<
CatchWebByMetaSearch
>();
try
{
try
{
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
...
@@ -398,7 +399,7 @@ public class RecorderUtil {
...
@@ -398,7 +399,7 @@ public class RecorderUtil {
}
}
//对采集一个列表解析一个列表的详情
//对采集一个列表解析一个列表的详情
MetaBaiduSearchThread
baiduSearchThread
=
new
MetaBaiduSearchThread
();
MetaBaiduSearchThread
baiduSearchThread
=
new
MetaBaiduSearchThread
();
int
repeat
=
baiduSearchThread
.
CatchWebNews
(
metaSearchList
,
keywords
);
int
repeat
=
baiduSearchThread
.
CatchWebNews
(
metaSearchList
,
keywords
,
keywordMsg
);
if
(
repeat
/
metaSearchList
.
size
()>
0.6
){
if
(
repeat
/
metaSearchList
.
size
()>
0.6
){
break
;
break
;
}
}
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论