Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
K
know-base
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
张京坤
know-base
Commits
310640c1
提交
310640c1
authored
7月 11, 2024
作者:
yanxin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
整合专题数据
编辑逻辑优化
上级
ea4d4ea6
隐藏空白字符变更
内嵌
并排
正在显示
22 个修改的文件
包含
1599 行增加
和
96 行删除
+1599
-96
aspose-words-15.12.0-jdk16.jar
lib/aspose-words-15.12.0-jdk16.jar
+0
-0
aspose-words-21.1-jdk17.jar
lib/aspose-words-21.1-jdk17.jar
+0
-0
pom.xml
pom.xml
+2
-2
Constants.java
src/main/java/com/zzsn/knowbase/constant/Constants.java
+3
-3
KbKnowledges.java
src/main/java/com/zzsn/knowbase/entity/KbKnowledges.java
+30
-1
Knowledge.java
src/main/java/com/zzsn/knowbase/entity/Knowledge.java
+11
-8
IntelligenceInfo.java
...va/com/zzsn/knowbase/entity/subject/IntelligenceInfo.java
+79
-0
Label.java
src/main/java/com/zzsn/knowbase/entity/subject/Label.java
+30
-0
IKnowledgeService.java
...ain/java/com/zzsn/knowbase/service/IKnowledgeService.java
+2
-0
KbKnowledgesService.java
...n/java/com/zzsn/knowbase/service/KbKnowledgesService.java
+3
-0
KbAuthorizedUserServiceImpl.java
...sn/knowbase/service/impl/KbAuthorizedUserServiceImpl.java
+42
-37
KbKnowledgeProjectServiceImpl.java
.../knowbase/service/impl/KbKnowledgeProjectServiceImpl.java
+2
-2
KbKnowledgesServiceImpl.java
...m/zzsn/knowbase/service/impl/KbKnowledgesServiceImpl.java
+12
-0
KnowledgeServiceImpl.java
.../com/zzsn/knowbase/service/impl/KnowledgeServiceImpl.java
+40
-25
LocalFileServiceImpl.java
.../com/zzsn/knowbase/service/impl/LocalFileServiceImpl.java
+1
-9
SubjectDataSyncTask.java
...main/java/com/zzsn/knowbase/task/SubjectDataSyncTask.java
+272
-0
ContentUtility.java
src/main/java/com/zzsn/knowbase/util/ContentUtility.java
+372
-0
EsOpUtil.java
src/main/java/com/zzsn/knowbase/util/EsOpUtil.java
+41
-1
HtmlUtil.java
src/main/java/com/zzsn/knowbase/util/HtmlUtil.java
+633
-0
KnowledgeParam.java
src/main/java/com/zzsn/knowbase/vo/KnowledgeParam.java
+1
-0
KnowledgeVO.java
src/main/java/com/zzsn/knowbase/vo/KnowledgeVO.java
+21
-6
application.yml
src/main/resources/application.yml
+2
-2
没有找到文件。
lib/aspose-words-15.12.0-jdk16.jar
0 → 100644
浏览文件 @
310640c1
File added
lib/aspose-words-21.1-jdk17.jar
deleted
100644 → 0
浏览文件 @
ea4d4ea6
File deleted
pom.xml
浏览文件 @
310640c1
...
...
@@ -187,10 +187,10 @@
<dependency>
<groupId>
com.aspose
</groupId>
<artifactId>
aspose-words
</artifactId>
<version>
21.1
.0
</version>
<version>
15.12
.0
</version>
<type>
jar
</type>
<scope>
system
</scope>
<systemPath>
${project.basedir}/lib/aspose-words-
21.1-jdk17
.jar
</systemPath>
<systemPath>
${project.basedir}/lib/aspose-words-
15.12.0-jdk16
.jar
</systemPath>
</dependency>
<dependency>
...
...
src/main/java/com/zzsn/knowbase/constant/Constants.java
浏览文件 @
310640c1
...
...
@@ -20,8 +20,8 @@ public class Constants {
//新领导讲话索引(22.08.24)
public
final
static
String
LEADER_SPEECH_BASE_DATA
=
"leaderspeech_new"
;
//
新专题库(22.04.24)
public
final
static
String
ES_SUBJECT_DATA
=
"subjectdata"
;
//
专题库
public
final
static
String
ES_SUBJECT_DATA
=
"subjectdata
base
"
;
//审计对接旧索引(废弃)
public
final
static
String
ES_SUBJECT_DEV_DATA
=
"subjectdatabase_dev"
;
...
...
@@ -29,7 +29,7 @@ public class Constants {
//处理后的专题资讯信息存储索引。
public
final
static
String
ES_DATA_FOR_SUBJECT
=
"subjectdatabase_2023"
;
//知识库
public
final
static
String
ES_DATA_FOR_KNOWLEDGE
=
"knowledgedatabase"
;
public
final
static
String
ES_DATA_FOR_KNOWLEDGE
=
"knowledgedatabase
_nt
"
;
/*
* 专题内容主次关系索引
*/
...
...
src/main/java/com/zzsn/knowbase/entity/KbKnowledges.java
浏览文件 @
310640c1
...
...
@@ -31,6 +31,35 @@ public class KbKnowledges extends Model<KbKnowledges> {
*/
@TableField
(
"name"
)
private
String
name
;
/**
* 绑定专题,多个用逗号分隔
*/
@TableField
(
"subject_id"
)
private
String
subjectId
;
/**
* 同步数据状态,多个用逗号分隔,为空同步所有状态
*/
@TableField
(
"subject_status"
)
private
String
subjectStatus
;
/**
* 同步数据发布状态,多个用逗号分隔,为空同步所有状态
*/
@TableField
(
"subject_publish_status"
)
private
String
subjectPublishStatus
;
/**
* 二级分类
*/
@TableField
(
"project_id"
)
private
String
projectId
;
/**
* 一级分类
*/
@TableField
(
"type_id"
)
private
String
typeId
;
/**
* 同步状态 1:启用 0:停用
*/
@TableField
(
"sync_status"
)
private
Integer
syncStatus
;
}
src/main/java/com/zzsn/knowbase/entity/Knowledge.java
浏览文件 @
310640c1
...
...
@@ -54,18 +54,17 @@ public class Knowledge implements Serializable {
@ApiModelProperty
(
value
=
"创建日期"
)
private
String
createTime
;
/**
*
更新人
*
专题id
*/
@ApiModelProperty
(
value
=
"更新人"
)
private
String
updateBy
;
private
String
subjectId
;
/**
*
更新日期
*
专题id
*/
private
String
updateTime
;
private
String
contentWithTag
;
/**
*
状态(1启用 0不启用)
*
更新日期
*/
private
Integer
status
;
private
String
updateDate
;
/**
* 来源
*/
...
...
@@ -104,6 +103,10 @@ public class Knowledge implements Serializable {
private
String
kbKnowledgeId
;
private
String
contentAll
;
/**
* 链接地址
*/
private
String
sourceAddress
;;
/**
* 类型
*/
private
String
typeId
;
...
...
@@ -113,7 +116,7 @@ public class Knowledge implements Serializable {
private
List
<
KnowFile
>
files
;
private
Integer
score
;
/**
* 0word 1excel 2ppt 3pdf
* 0word 1excel
&subject
2ppt 3pdf
*/
private
Integer
importData
;
...
...
src/main/java/com/zzsn/knowbase/entity/subject/IntelligenceInfo.java
0 → 100644
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
entity
.
subject
;
import
lombok.Data
;
import
java.util.List
;
// Plain data holder for one subject-library (special-topic) document.
// Lombok's @Data generates the accessors, equals/hashCode and toString.
// NOTE(review): the "*Raw" variants are undocumented in the original; they look like
// unprocessed copies of the neighbouring field — confirm against the index mapping.
@Data
public class IntelligenceInfo {
    // Author
    private String author;
    private String authorRaw;
    // Review status (0: not reviewed, 1: approved, 2: rejected, 3: tentative, 4: duplicate; default 0)
    private Integer checkStatus;
    // Publish status, only meaningful when checkStatus = 1
    // (0/null: waiting to be published, 1: published, 2: taken down; default null)
    private Integer publishStatus;
    // Body text
    private String content;
    private String contentRaw;
    // Body text with tags (markup)
    private String contentWithTag;
    private String contentWithTagRaw;
    // Creation time
    private String createDate;
    // Delete flag
    private Integer deleteFlag;
    // Document id
    private String id;
    // Keywords
    private String keyWords;
    // Language
    private String lang;
    // Source (origin)
    private String origin;
    private String originRaw;
    // Publish time
    private String publishDate;
    // Score
    private Double score;
    // Information-source id
    private String sid;
    // Source address
    private String sourceAddress;
    // Subject (special topic) id
    private String subjectId;
    // Summary
    private String summary;
    private String summaryRaw;
    // Title
    private String title;
    private String titleRaw;
    // Pinned-to-top value
    private Integer topNum;
    private String type;
    private List<Label> labels;
    // Positive/negative orientation
    private String orientation;
    // Update time
    private String updateDate;
    // Image processing status
    private String imgDisposeStatus;
    // Data type — qbyw: intelligence headlines, qbnc: internal intelligence reference, cgbg: regular report
    private String dataType;
    // ABI report URL
    private String abiUrl;
    // ABI report id
    private String abiId;
    // Column (section) list; corresponds to the "code" field of column management
    private List<String> programaIds;
    // Resource catalog; corresponds to the "code" field of resource-catalog management
    private String resourceCatalogId;
    // Keyword list
    private List<String> keyWordsList;
    // Time the record entered the subject library
    private String processDate;
}
src/main/java/com/zzsn/knowbase/entity/subject/Label.java
0 → 100644
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
entity
.
subject
;
import
lombok.Data
;
/**
 * Entity describing a label that has been attached to a basic information record.
 */
@Data
public class Label {
    // Hit marker
    private String hitRemarks;
    // Label marker
    private String labelMark;
    // Label remarks
    private String labelRemarks;
    // Project label id
    private String projectLabelId;
    // Related label id
    private String relationId;
    // Related label name
    private String relationName;
    // Review status
    private Integer status;
}
src/main/java/com/zzsn/knowbase/service/IKnowledgeService.java
浏览文件 @
310640c1
...
...
@@ -34,6 +34,8 @@ public interface IKnowledgeService {
*/
void
deleteKnowledge
(
List
<
Knowledge
>
knowledgeList
);
void
deleteForPython
(
String
id
,
String
knowledgeBaseId
);
/**
* 分页检索
*
...
...
src/main/java/com/zzsn/knowbase/service/KbKnowledgesService.java
浏览文件 @
310640c1
...
...
@@ -3,6 +3,8 @@ package com.zzsn.knowbase.service;
import
com.baomidou.mybatisplus.extension.service.IService
;
import
com.zzsn.knowbase.entity.KbKnowledges
;
import
java.util.List
;
/**
* <p>
* 服务类
...
...
@@ -13,4 +15,5 @@ import com.zzsn.knowbase.entity.KbKnowledges;
*/
public interface KbKnowledgesService extends IService<KbKnowledges> {
    /**
     * Lists the knowledge bases that take part in subject-data synchronisation.
     * The implementation shipped with this change selects rows whose subject id is
     * neither null nor the empty string and whose sync status equals 1.
     *
     * @return the matching knowledge-base configurations
     */
    List<KbKnowledges> syncSubjectConf();
}
src/main/java/com/zzsn/knowbase/service/impl/KbAuthorizedUserServiceImpl.java
浏览文件 @
310640c1
...
...
@@ -831,17 +831,19 @@ public class KbAuthorizedUserServiceImpl extends ServiceImpl<KbAuthorizedUserMap
boolean
success
=
res
.
isSuccess
();
if
(
200
==
res
.
getCode
()
&&
success
){
/**校验成功,获取到第三方用户信息*/
KbAuthorizedUser
bean
=
JSONUtil
.
toBean
(
JSONUtil
.
toJsonStr
(
res
.
getResult
()),
KbAuthorizedUser
.
class
);
/**校验第三方用户是否进行了授权*/
KbAuthorizedUser
one
=
JSONUtil
.
toBean
(
JSONUtil
.
toJsonStr
(
res
.
getResult
()),
KbAuthorizedUser
.
class
);
one
.
setIsAll
(
0
);
one
.
setRoleId
(
"1742844597970673665"
);
/**校验第三方用户是否进行了授权*//*
List<KbAuthorizedUser> list = super.list(Wrappers.<KbAuthorizedUser>lambdaQuery().eq(KbAuthorizedUser::getUserId, bean.getUserId()));
if (CollectionUtil.isEmpty(list)){
HttpServletResponse httpServletResponse = SpringContextUtils.getHttpServletResponse();
httpServletResponse.setStatus(401);
return Result.error("该用户还未进行授权访问,请联系管理员进行授权");
}
/**查询授权用户的角色和权限*/
*//**查询授权用户的角色和权限*//*
UserInfoVo userInfoVo = new UserInfoVo();
/**查询授权用户,“所有”节点的角色优先*/
*//**查询授权用户,“所有”节点的角色优先*//*
QueryWrapper<KbAuthorizedUser> query = new QueryWrapper<>();
query.eq("a.user_id",bean.getUserId());
query.eq("b.knowledge_project_id","0");
...
...
@@ -852,7 +854,7 @@ public class KbAuthorizedUserServiceImpl extends ServiceImpl<KbAuthorizedUserMap
one = authorizedUserMapper.getOne(query1);
}
/**是否是所有节点的管理员*/
*//**是否是所有节点的管理员*//*
List<KbAuthuserKnowledgeprojectMap> list1 = authuserKnowledgeprojectMapService.list(Wrappers.<KbAuthuserKnowledgeprojectMap>lambdaQuery()
.eq(KbAuthuserKnowledgeprojectMap::getAuthUserId, one.getId())
.eq(KbAuthuserKnowledgeprojectMap::getKnowledgeProjectId, "0")
...
...
@@ -863,7 +865,7 @@ public class KbAuthorizedUserServiceImpl extends ServiceImpl<KbAuthorizedUserMap
one.setIsAll(0);
}else {
one.setIsAll(1);
}
}
*/
List
<
KbRole
>
kbRoles
=
roleService
.
listByIds
(
Arrays
.
asList
(
one
.
getRoleId
().
split
(
","
)));
...
...
@@ -877,6 +879,7 @@ public class KbAuthorizedUserServiceImpl extends ServiceImpl<KbAuthorizedUserMap
// e.setPermissions(permissionsService.list(Wrappers.<KbPermissions>lambdaQuery().in(KbPermissions::getId,longs)));
// }
// });
UserInfoVo
userInfoVo
=
new
UserInfoVo
();
if
(
CollectionUtil
.
isNotEmpty
(
permissionMaps
))
{
List
<
KbPermissions
>
permissions
=
permissionsService
.
list
(
Wrappers
.<
KbPermissions
>
lambdaQuery
().
in
(
KbPermissions:
:
getId
,
permissionMaps
.
stream
().
map
(
KbRolePermissionMap:
:
getPermissionId
).
collect
(
Collectors
.
toList
())));
userInfoVo
.
setPermissions
(
permissions
);
...
...
@@ -942,42 +945,44 @@ public class KbAuthorizedUserServiceImpl extends ServiceImpl<KbAuthorizedUserMap
}
@Override
public
Result
<?>
doCheckAndGetUser
(
String
token
)
{
/**调用第三方系统校验接口*/
/**调用第三方系统校验接口*/
Result
res
=
doCheck
(
token
);
boolean
success
=
res
.
isSuccess
();
if
(
200
==
res
.
getCode
()
&&
success
)
{
/**校验成功,获取到第三方用户信息*/
KbAuthorizedUser
bean
=
JSONUtil
.
toBean
(
JSONUtil
.
toJsonStr
(
res
.
getResult
()),
KbAuthorizedUser
.
class
);
/**校验第三方用户是否进行了授权*/
List
<
KbAuthorizedUser
>
list
=
super
.
list
(
Wrappers
.<
KbAuthorizedUser
>
lambdaQuery
().
eq
(
KbAuthorizedUser:
:
getUserId
,
bean
.
getUserId
()));
if
(
CollectionUtil
.
isEmpty
(
list
))
{
HttpServletResponse
httpServletResponse
=
SpringContextUtils
.
getHttpServletResponse
();
httpServletResponse
.
setStatus
(
401
);
return
Result
.
error
(
"该用户还未进行授权访问,请联系管理员进行授权"
);
}
/**查询授权用户,“所有”节点的角色优先*/
QueryWrapper
<
KbAuthorizedUser
>
query
=
new
QueryWrapper
<>();
query
.
eq
(
"a.user_id"
,
bean
.
getUserId
());
query
.
eq
(
"b.knowledge_project_id"
,
"0"
);
KbAuthorizedUser
one
=
authorizedUserMapper
.
getOne
(
query
);
if
(
ObjectUtil
.
isEmpty
(
one
)
||
StringUtils
.
isBlank
(
one
.
getRoleId
())){
QueryWrapper
<
KbAuthorizedUser
>
query1
=
new
QueryWrapper
<>();
query1
.
eq
(
"a.user_id"
,
bean
.
getUserId
());
one
=
authorizedUserMapper
.
getOne
(
query1
);
}
/**是否是所有节点的管理员*/
List
<
KbAuthuserKnowledgeprojectMap
>
list1
=
authuserKnowledgeprojectMapService
.
list
(
Wrappers
.<
KbAuthuserKnowledgeprojectMap
>
lambdaQuery
()
.
eq
(
KbAuthuserKnowledgeprojectMap:
:
getAuthUserId
,
one
.
getId
())
.
eq
(
KbAuthuserKnowledgeprojectMap:
:
getKnowledgeProjectId
,
"0"
)
.
eq
(
KbAuthuserKnowledgeprojectMap:
:
getSign
,
0
)
);
if
(
CollectionUtil
.
isNotEmpty
(
list1
)){
one
.
setIsAll
(
0
);
}
else
{
one
.
setIsAll
(
1
);
}
return
Result
.
OK
(
one
);
// /**校验第三方用户是否进行了授权*/
// List<KbAuthorizedUser> list = super.list(Wrappers.<KbAuthorizedUser>lambdaQuery().eq(KbAuthorizedUser::getUserId, bean.getUserId()));
// if (CollectionUtil.isEmpty(list)) {
// HttpServletResponse httpServletResponse = SpringContextUtils.getHttpServletResponse();
// httpServletResponse.setStatus(401);
// return Result.error("该用户还未进行授权访问,请联系管理员进行授权");
// }
// /**查询授权用户,“所有”节点的角色优先*/
// QueryWrapper<KbAuthorizedUser> query = new QueryWrapper<>();
// query.eq("a.user_id",bean.getUserId());
// query.eq("b.knowledge_project_id","0");
// KbAuthorizedUser one = authorizedUserMapper.getOne(query);
// if (ObjectUtil.isEmpty(one) || StringUtils.isBlank(one.getRoleId())){
// QueryWrapper<KbAuthorizedUser> query1 = new QueryWrapper<>();
// query1.eq("a.user_id",bean.getUserId());
// one = authorizedUserMapper.getOne(query1);
// }
// /**是否是所有节点的管理员*/
// List<KbAuthuserKnowledgeprojectMap> list1 = authuserKnowledgeprojectMapService.list(Wrappers.<KbAuthuserKnowledgeprojectMap>lambdaQuery()
// .eq(KbAuthuserKnowledgeprojectMap::getAuthUserId, one.getId())
// .eq(KbAuthuserKnowledgeprojectMap::getKnowledgeProjectId, "0")
// .eq(KbAuthuserKnowledgeprojectMap::getSign, 0)
// );
//
// if (CollectionUtil.isNotEmpty(list1)){
// one.setIsAll(0);
// }else {
// one.setIsAll(1);
// }
bean
.
setIsAll
(
0
);
bean
.
setRoleId
(
"1742844597970673665"
);
return
Result
.
OK
(
bean
);
}
else
{
return
Result
.
error
(
"第三方系统未通过用户校验"
);
}
...
...
src/main/java/com/zzsn/knowbase/service/impl/KbKnowledgeProjectServiceImpl.java
浏览文件 @
310640c1
...
...
@@ -193,7 +193,7 @@ public class KbKnowledgeProjectServiceImpl extends ServiceImpl<KbKnowledgeProjec
e
.
setRoles
(
list
);
e
.
setPermissions
(
permissions
);
});
//查询此用户,在其他节点有无特殊权限
/
*/
/查询此用户,在其他节点有无特殊权限
List<UserKnowPermissionVo> res = authorizedUserMapper.getUserPermission(userInfo.getId());
List<UserKnowPermissionVo> collect = res.stream().filter(f -> !"0".equals(f.getKnowledgeProjectId())).collect(Collectors.toList());
if (CollectionUtil.isNotEmpty(collect)){
...
...
@@ -225,7 +225,7 @@ public class KbKnowledgeProjectServiceImpl extends ServiceImpl<KbKnowledgeProjec
e.setPermissions(permissionsList);
}
});
}
}
*/
return
Result
.
OK
(
kbKnowledgeProjects
);
}
else
{
...
...
src/main/java/com/zzsn/knowbase/service/impl/KbKnowledgesServiceImpl.java
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
service
.
impl
;
import
com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper
;
import
com.baomidou.mybatisplus.core.toolkit.Wrappers
;
import
com.baomidou.mybatisplus.extension.service.impl.ServiceImpl
;
import
com.zzsn.knowbase.entity.KbKnowledges
;
import
com.zzsn.knowbase.mapper.KbKnowledgesMapper
;
import
com.zzsn.knowbase.service.KbKnowledgesService
;
import
org.springframework.stereotype.Service
;
import
java.util.List
;
/**
* <p>
* 服务实现类
...
...
@@ -18,4 +22,12 @@ import org.springframework.stereotype.Service;
@Service
public
class
KbKnowledgesServiceImpl
extends
ServiceImpl
<
KbKnowledgesMapper
,
KbKnowledges
>
implements
KbKnowledgesService
{
@Override
public
List
<
KbKnowledges
>
syncSubjectConf
()
{
LambdaQueryWrapper
<
KbKnowledges
>
queryWrapper
=
Wrappers
.
lambdaQuery
();
queryWrapper
.
isNotNull
(
KbKnowledges:
:
getSubjectId
);
queryWrapper
.
ne
(
KbKnowledges:
:
getSubjectId
,
""
);
queryWrapper
.
eq
(
KbKnowledges:
:
getSyncStatus
,
1
);
return
this
.
list
(
queryWrapper
);
}
}
src/main/java/com/zzsn/knowbase/service/impl/KnowledgeServiceImpl.java
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
service
.
impl
;
import
cn.hutool.http.HttpUtil
;
import
cn.hutool.json.JSONUtil
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
...
...
@@ -20,7 +21,6 @@ import com.zzsn.knowbase.vo.Result;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.collections4.CollectionUtils
;
import
org.apache.commons.collections4.ListUtils
;
import
org.apache.commons.io.FileUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.lucene.search.join.ScoreMode
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
...
...
@@ -62,8 +62,6 @@ import java.io.*;
import
java.util.*
;
import
java.util.concurrent.CompletableFuture
;
import
java.util.stream.Collectors
;
import
java.util.zip.ZipEntry
;
import
java.util.zip.ZipOutputStream
;
/**
* @Description: 知识
...
...
@@ -302,6 +300,20 @@ class KnowledgeServiceImpl implements IKnowledgeService {
}
BeanUtils
.
copyProperties
(
knowledge
,
knowledgeMessage
);
knowledgeMessage
.
setType
(
knowledge
.
getTypeId
());
if
(
StringUtils
.
isNotEmpty
(
knowledge
.
getContentWithTag
())){
//片段切分
List
<
Content
>
contents
=
new
ArrayList
<>();
List
<
String
>
splitContents
=
HtmlUtil
.
splitContents
(
knowledge
.
getContentWithTag
());
for
(
String
content
:
splitContents
)
{
System
.
out
.
println
(
content
.
length
());
System
.
out
.
println
(
content
);
contents
.
add
(
Content
.
builder
()
.
contentId
(
codeGenerateUtil
.
geneIdNo
(
Constants
.
FINANCE
,
8
))
.
content
(
content
)
.
build
());
}
knowledge
.
setContents
(
contents
);
}
//id为空表示新增
if
(
null
==
knowledge
.
getId
())
{
knowledge
.
setId
(
codeGenerateUtil
.
geneIdNo
(
Constants
.
FINANCE
,
8
));
...
...
@@ -345,7 +357,10 @@ class KnowledgeServiceImpl implements IKnowledgeService {
@Override
public
void
deleteKnowledge
(
List
<
Knowledge
>
knowledgeList
)
{
for
(
Knowledge
knowledge
:
knowledgeList
)
{
esOpUtil
.
docDeleteById
(
Constants
.
ES_DATA_FOR_KNOWLEDGE
,
knowledge
.
getId
());
knowledge
.
setDeleteFlag
(
1
);
//esOpUtil.docDeleteById(Constants.ES_DATA_FOR_KNOWLEDGE, knowledge.getId());
//修改为标记删除
esOpUtil
.
docUpdateById
(
Constants
.
ES_DATA_FOR_KNOWLEDGE
,
knowledge
.
getId
(),
JSON
.
toJSONString
(
knowledge
));
}
CompletableFuture
.
runAsync
(()
->
{
...
...
@@ -353,10 +368,9 @@ class KnowledgeServiceImpl implements IKnowledgeService {
deleteForPython
(
knowledge
.
getId
(),
knowledge
.
getKbKnowledgeId
());
}
});
}
@Override
public
void
deleteForPython
(
String
id
,
String
knowledgeBaseId
)
{
JSONObject
params
=
new
JSONObject
();
List
<
String
>
lidList
=
new
ArrayList
<>();
...
...
@@ -364,11 +378,8 @@ class KnowledgeServiceImpl implements IKnowledgeService {
params
.
put
(
"id"
,
lidList
);
params
.
put
(
"knowledge_base_id"
,
knowledgeBaseId
);
log
.
info
(
"delete param:{}"
,
params
);
try
{
HttpUtil
.
doPost
(
deleteUrl
,
params
,
120000
);
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
String
res
=
HttpUtil
.
post
(
deleteUrl
,
params
.
toJSONString
(),
5000
);
log
.
info
(
"delete res:{}"
,
res
);
}
...
...
@@ -523,6 +534,7 @@ class KnowledgeServiceImpl implements IKnowledgeService {
if
(
StringUtils
.
isNotBlank
(
knowledgeParam
.
getVerifyEndTime
()))
{
boolQuery
.
filter
(
QueryBuilders
.
rangeQuery
(
"verifyTime"
).
lte
(
EsDateUtil
.
esFieldDateFormat
(
knowledgeParam
.
getEndTime
())));
}
boolQuery
.
mustNot
(
QueryBuilders
.
termQuery
(
"deleteFlag"
,
1
));
outer
.
should
(
boolQuery
);
searchSourceBuilder
.
query
(
outer
);
searchRequest
.
source
(
searchSourceBuilder
);
...
...
@@ -604,12 +616,7 @@ class KnowledgeServiceImpl implements IKnowledgeService {
if
(
knowledgeParam
.
getSearchScope
().
equals
(
2
))
{
params
.
put
(
"question"
,
knowledgeParam
.
getSearchInfo
());
}
String
result
=
null
;
try
{
result
=
HttpUtil
.
doPost
(
searchUrl
,
params
,
120000
);
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
String
result
=
HttpUtil
.
post
(
searchUrl
,
params
.
toJSONString
(),
120000
);
if
(
result
==
null
||
!
result
.
contains
(
"result"
))
{
return
null
;
}
...
...
@@ -831,11 +838,19 @@ class KnowledgeServiceImpl implements IKnowledgeService {
continue
;
}
if
(
StringUtils
.
isNotEmpty
(
info
.
get
(
1
)))
{
specialInformation
.
setContentAll
(
info
.
get
(
1
));
// //段落切分
// specialInformation.setContents(Collections.singletonList(Content.builder()
// .contentId(codeGenerateUtil.geneIdNo(Constants.FINANCE, 8))
// .content(info.get(1)).build()));
specialInformation
.
setContentWithTag
(
info
.
get
(
1
));
//片段切分
List
<
Content
>
contents
=
new
ArrayList
<>();
List
<
String
>
splitContents
=
HtmlUtil
.
splitContents
(
specialInformation
.
getContentWithTag
());
for
(
String
content
:
splitContents
)
{
System
.
out
.
println
(
content
.
length
());
System
.
out
.
println
(
content
);
contents
.
add
(
Content
.
builder
()
.
contentId
(
codeGenerateUtil
.
geneIdNo
(
Constants
.
FINANCE
,
8
))
.
content
(
content
)
.
build
());
}
specialInformation
.
setContents
(
contents
);
}
else
{
log
.
error
(
"上传的数据{}正文为空,此条数据忽略"
,
info
.
get
(
0
));
continue
;
...
...
@@ -850,13 +865,13 @@ class KnowledgeServiceImpl implements IKnowledgeService {
specialInformation
.
setPublishDate
(
EsDateUtil
.
esFieldDateFormat
(
info
.
get
(
4
)));
}
specialInformation
.
setDeleteFlag
(
0
);
specialInformation
.
setCreateTime
(
cn
.
hutool
.
core
.
date
.
DateUtil
.
format
(
new
Date
(),
"yyyy-MM-dd'T'HH:mm:ss"
));
specialInformation
.
setCreateTime
(
EsDateUtil
.
esFieldDateFormat
(
cn
.
hutool
.
core
.
date
.
DateUtil
.
formatDateTime
(
new
Date
())
));
esOpUtil
.
docSavaByEntity
(
Constants
.
ES_DATA_FOR_KNOWLEDGE
,
specialInformation
.
getId
(),
specialInformation
);
KnowledgeMessage
knowledgeMessage
=
new
KnowledgeMessage
();
BeanUtils
.
copyProperties
(
specialInformation
,
knowledgeMessage
);
knowledgeMessage
.
setType
(
specialInformation
.
getTypeId
());
knowledgeMessage
.
setContents
(
specialInformation
.
getContents
()
);
produceInfo
.
sendKnowledge
Excel
Contents
(
knowledgeMessage
);
//produceInfo.sendKnowledgeExcelContents(knowledgeMessage
);
produceInfo
.
sendKnowledgeContents
(
knowledgeMessage
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"处理异常"
);
}
...
...
src/main/java/com/zzsn/knowbase/service/impl/LocalFileServiceImpl.java
浏览文件 @
310640c1
...
...
@@ -311,18 +311,10 @@ public class LocalFileServiceImpl implements ILocalFileService {
String
publish
=
EsDateUtil
.
esFieldDateFormat
(
knowledge
.
getPublishDate
());
knowledge
.
setPublishDate
(
publish
);
KbAuthorizedUser
userInfo
=
SpringContextUtils
.
getUserInfo
();
knowledge
.
setUpdateBy
(
userInfo
.
getName
());
knowledge
.
setUpdateTime
(
cn
.
hutool
.
core
.
date
.
DateUtil
.
formatDateTime
(
new
Date
()).
replace
(
" "
,
"T"
));
Integer
verifyStatus
=
knowledge
.
getVerifyStatus
();
if
(
verifyStatus
!=
null
&&
(
verifyStatus
==
1
||
verifyStatus
==
2
))
knowledge
.
setVerifyTime
(
cn
.
hutool
.
core
.
date
.
DateUtil
.
formatDateTime
(
new
Date
()).
replace
(
" "
,
"T"
));
knowledge
.
setVerifyTime
(
EsDateUtil
.
esFieldDateFormat
(
cn
.
hutool
.
core
.
date
.
DateUtil
.
formatDateTime
(
new
Date
())));
knowledge
.
setVerifierId
(
userInfo
.
getUserId
());
knowledge
.
setVerifierName
(
userInfo
.
getUsername
());
if
(
Integer
.
valueOf
(
"0"
).
equals
(
knowledge
.
getImportData
())){
List
<
KnowFile
>
files
=
knowledge
.
getFiles
();
KnowFile
knowFile
=
files
.
get
(
0
);
String
filePath
=
knowFile
.
getFilePath
();
...
...
src/main/java/com/zzsn/knowbase/task/SubjectDataSyncTask.java
0 → 100644
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
task
;
import
com.alibaba.fastjson.JSON
;
import
com.zzsn.knowbase.constant.Constants
;
import
com.zzsn.knowbase.entity.Content
;
import
com.zzsn.knowbase.entity.KbKnowledges
;
import
com.zzsn.knowbase.entity.Knowledge
;
import
com.zzsn.knowbase.entity.subject.IntelligenceInfo
;
import
com.zzsn.knowbase.kafka.message.KnowledgeMessage
;
import
com.zzsn.knowbase.kafka.producer.ProduceInfo
;
import
com.zzsn.knowbase.service.IKnowledgeService
;
import
com.zzsn.knowbase.service.KbKnowledgesService
;
import
com.zzsn.knowbase.util.*
;
import
com.zzsn.knowbase.vo.KnowledgeVO
;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.lang3.StringUtils
;
import
org.elasticsearch.index.query.BoolQueryBuilder
;
import
org.elasticsearch.index.query.QueryBuilders
;
import
org.elasticsearch.search.builder.SearchSourceBuilder
;
import
org.elasticsearch.search.sort.SortOrder
;
import
org.springframework.beans.BeanUtils
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.beans.factory.annotation.Value
;
import
org.springframework.scheduling.annotation.Scheduled
;
import
org.springframework.web.bind.annotation.GetMapping
;
import
org.springframework.web.bind.annotation.RequestMapping
;
import
org.springframework.web.bind.annotation.RestController
;
import
java.util.*
;
import
java.util.stream.Collectors
;
/**
* 同步专题数据到知识库
*/
@Slf4j
@RestController
@RequestMapping
(
"subjectDataSyncTask"
)
public
class
SubjectDataSyncTask
{
@Autowired
private
KbKnowledgesService
kbKnowledgesService
;
@Autowired
private
IKnowledgeService
knowledgeService
;
@Autowired
private
EsOpUtil
esOpUtil
;
@Autowired
private
CodeGenerateUtil
codeGenerateUtil
;
@Autowired
private
ProduceInfo
produceInfo
;
/**
 * Runs the subject-to-knowledge-base synchronisation for every enabled configuration.
 * Triggered by the cron expression below (second 0 of every 10th minute) and also
 * exposed as the GET endpoint {@code subjectDataSyncTask/startTask}.
 *
 * <p>Each configuration is synchronised independently: a failure in one knowledge base
 * is logged and does not prevent the remaining ones from being processed.
 */
@Scheduled(cron = "0 */10 * * * ?")
@GetMapping("startTask")
public void startTask() {
    List<KbKnowledges> confList = kbKnowledgesService.syncSubjectConf();
    for (KbKnowledges kbKnowledges : confList) {
        try {
            sync(kbKnowledges);
        } catch (Exception e) {
            // Task boundary: keep going so one broken configuration cannot starve the others.
            log.error("subject data sync failed for knowledge base id:{}, name:{}",
                    kbKnowledges.getId(), kbKnowledges.getName(), e);
        }
    }
}
/**
 * Synchronises a single knowledge base on demand (GET {@code subjectDataSyncTask/syncById}).
 *
 * <p>Unlike the scheduled run, the configuration is loaded directly by id, so it has not
 * passed the "subject id present" filter. Unknown ids and knowledge bases without a bound
 * subject are therefore rejected here instead of failing with a NullPointerException
 * further down in the sync.
 *
 * @param id primary key of the knowledge base to synchronise
 */
@GetMapping("syncById")
public void syncById(String id) {
    KbKnowledges kbKnowledges = kbKnowledgesService.getById(id);
    if (kbKnowledges == null) {
        log.warn("syncById: no knowledge base found for id:{}", id);
        return;
    }
    if (StringUtils.isBlank(kbKnowledges.getSubjectId())) {
        log.warn("syncById: knowledge base id:{} has no bound subject, nothing to sync", id);
        return;
    }
    sync(kbKnowledges);
}
/**
 * Synchronises subject (special-topic) documents into one knowledge base.
 *
 * <p>Steps:
 * <ol>
 *   <li>collect the subject-side entries grouped by review status and by publish status;</li>
 *   <li>keep the entries that match BOTH configured status filters (an unset filter matches all);</li>
 *   <li>collect the entries currently present on the knowledge-base side;</li>
 *   <li>entries only on the subject side are added/updated, entries only on the
 *       knowledge-base side are flagged as deleted;</li>
 *   <li>every change is mirrored to the vector store (Kafka message for upserts,
 *       {@code deleteForPython} for removals and before re-sending an update).</li>
 * </ol>
 *
 * <p>NOTE(review): list entries appear to be {@code "id,..."} strings — only the part before
 * the first comma is used as the document id here; confirm against {@code groupMap}.
 *
 * @param kbKnowledges knowledge-base configuration (bound subject ids, status filters, target ids)
 * @throws NumberFormatException if a configured status token is not an integer
 */
public void sync(KbKnowledges kbKnowledges) {
    // Subject side: entries grouped by review status / publish status, plus soft-deleted ones.
    Map<Integer, List<String>> checkStatusMap = new HashMap<>();
    Map<Integer, List<String>> publishStatusMap = new HashMap<>();
    List<String> deleteList = new ArrayList<>();
    getSubjectIdGroup(kbKnowledges, checkStatusMap, publishStatusMap, deleteList);

    // Valid subject data = intersection of the two status selections.
    List<String> subjectValidList = selectByConfiguredStatus(kbKnowledges.getSubjectStatus(), checkStatusMap);
    subjectValidList.retainAll(selectByConfiguredStatus(kbKnowledges.getSubjectPublishStatus(), publishStatusMap));

    // Knowledge-base side: valid entries and already deleted entries.
    List<String> knowValidList = new ArrayList<>();
    List<String> knowDeleteList = new ArrayList<>();
    getKnowId(kbKnowledges, knowValidList, knowDeleteList);

    // To add or update: valid on the subject side, not (in that exact form) on the knowledge-base side.
    List<String> hasAddList = new ArrayList<>();
    if (!subjectValidList.isEmpty()) {
        hasAddList = new ArrayList<>(subjectValidList);
        hasAddList.removeAll(knowValidList);
    }
    // To delete: valid on the knowledge-base side but no longer within the configured subject statuses.
    List<String> hasDeleteList = new ArrayList<>();
    if (!knowValidList.isEmpty()) {
        hasDeleteList = new ArrayList<>(knowValidList);
        hasDeleteList.removeAll(subjectValidList);
        // Entries being updated in this run must not be deleted.
        hasDeleteList.removeAll(hasAddList);
    }
    log.info("知识库:{},需要新增或更新的数据:{}", kbKnowledges.getName(), hasAddList);
    log.info("知识库:{},需要删除的数据:{}", kbKnowledges.getName(), hasDeleteList);

    // Upserts
    for (String item : hasAddList) {
        String id = item.split(",")[0];
        Map<String, Object> mapItem = esOpUtil.searchDoc(Constants.ES_SUBJECT_DATA, id);
        if (mapItem == null || mapItem.isEmpty()) {
            // The document vanished between the id query and this lookup; nothing to copy.
            log.warn("subject document id:{} not found in index {}, skipped", id, Constants.ES_SUBJECT_DATA);
            continue;
        }
        IntelligenceInfo subjectItem = JSON.parseObject(JSON.toJSONString(mapItem), IntelligenceInfo.class);
        if (subjectItem == null) {
            log.warn("subject document id:{} could not be parsed, skipped", id);
            continue;
        }
        Knowledge knowledge = buildKnowledgeFromSubject(id, subjectItem, kbKnowledges);

        if (esOpUtil.docExists(Constants.ES_DATA_FOR_KNOWLEDGE, id)) {
            // Update in place, then drop the stale vectors before they are re-sent below.
            esOpUtil.docUpdateById(Constants.ES_DATA_FOR_KNOWLEDGE, id, JSON.toJSONString(knowledge));
            log.info("知识库数据更新id:{},title:{}", knowledge.getId(), knowledge.getTitle());
            knowledgeService.deleteForPython(id, kbKnowledges.getId());
        } else {
            esOpUtil.docSaveByJson(Constants.ES_DATA_FOR_KNOWLEDGE, id, JSON.toJSONString(knowledge));
            log.info("知识库数据新增id:{},title:{}", knowledge.getId(), knowledge.getTitle());
        }

        // Push to the vector store through Kafka.
        KnowledgeMessage knowledgeMessage = new KnowledgeMessage();
        BeanUtils.copyProperties(knowledge, knowledgeMessage);
        knowledgeMessage.setType(knowledge.getTypeId());
        produceInfo.sendKnowledgeContents(knowledgeMessage);
        log.info("通过kafka同步到向量库id:{},title:{}", knowledge.getId(), knowledge.getTitle());
    }

    // Deletions (flag only; the document itself is kept)
    for (String item : hasDeleteList) {
        String id = item.split(",")[0];
        Knowledge knowledge = new Knowledge();
        knowledge.setId(id);
        knowledge.setDeleteFlag(1);
        esOpUtil.docUpdateById(Constants.ES_DATA_FOR_KNOWLEDGE, id, JSON.toJSONString(knowledge));
        knowledgeService.deleteForPython(id, kbKnowledges.getId());
        log.info("知识库数据删除id:{}", knowledge.getId());
    }
}

/**
 * Picks the entries whose status is listed in a comma-separated configuration value.
 * A blank configuration means "all statuses". Tokens are trimmed and empty tokens ignored.
 */
private static List<String> selectByConfiguredStatus(String configuredStatuses, Map<Integer, List<String>> grouped) {
    List<String> selected = new ArrayList<>();
    if (StringUtils.isBlank(configuredStatuses)) {
        for (List<String> ids : grouped.values()) {
            selected.addAll(ids);
        }
        return selected;
    }
    for (String token : configuredStatuses.split(",")) {
        String status = token.trim();
        if (status.isEmpty()) {
            continue;
        }
        List<String> ids = grouped.get(Integer.parseInt(status));
        if (ids != null) {
            selected.addAll(ids);
        }
    }
    return selected;
}

/** Maps one subject document onto a knowledge entry of the given knowledge base. */
private Knowledge buildKnowledgeFromSubject(String id, IntelligenceInfo subjectItem, KbKnowledges kbKnowledges) {
    Knowledge knowledge = new Knowledge();
    knowledge.setUpdateDate(subjectItem.getUpdateDate());
    knowledge.setPublishDate(subjectItem.getPublishDate());
    knowledge.setAuthor(subjectItem.getAuthor());
    knowledge.setContentWithTag(subjectItem.getContentWithTag());
    knowledge.setId(id);
    knowledge.setDeleteFlag(0);
    knowledge.setCreateTime(EsDateUtil.esFieldDateFormat(cn.hutool.core.date.DateUtil.formatDateTime(new Date())));
    knowledge.setTitle(subjectItem.getTitle());
    knowledge.setSubjectId(kbKnowledges.getSubjectId());
    knowledge.setVerifyStatus(1);
    knowledge.setOrigin(subjectItem.getOrigin());
    knowledge.setSourceAddress(subjectItem.getSourceAddress());
    knowledge.setKbKnowledgeId(getId(kbKnowledges));
    knowledge.setKnowledgeProjectId(kbKnowledges.getProjectId());
    knowledge.setType("专题数据");
    knowledge.setTypeId(kbKnowledges.getTypeId());
    knowledge.setImportData(1);
    knowledge.setContents(splitIntoContents(id, subjectItem.getContentWithTag()));
    return knowledge;
}

/** Splits tagged body text into fragments and gives each fragment a freshly generated content id. */
private List<Content> splitIntoContents(String id, String contentWithTag) {
    List<Content> contents = new ArrayList<>();
    for (String content : HtmlUtil.splitContents(contentWithTag)) {
        contents.add(Content.builder()
                .contentId(codeGenerateUtil.geneIdNo(Constants.FINANCE, 8))
                .content(content)
                .build());
    }
    // Replaces the former System.out dump of every fragment.
    log.debug("subject document id:{} split into {} fragments", id, contents.size());
    return contents;
}
/**
 * Reads the primary key of a knowledge-base configuration.
 *
 * @param kbKnowledges the configuration to read from
 * @return the value of {@code kbKnowledges.getId()}
 */
private static String getId(KbKnowledges kbKnowledges) {
    final String knowledgeBaseId = kbKnowledges.getId();
    return knowledgeBaseId;
}
/**
 * Queries the subject index for the documents of the bound subjects whose update date
 * lies within the last seven days and sorts them into the supplied collections.
 *
 * <p>Documents whose delete flag reads "1" are appended to {@code deleteList} as
 * {@code id + "," + updateDate}; every other document is handed to {@code groupMap}
 * once with its review status and once with its publish status.
 *
 * @param kbKnowledges     configuration providing the comma-separated subject ids
 * @param checkStatusMap   receives the entries grouped by review status
 * @param publishStatusMap receives the entries grouped by publish status
 * @param deleteList       receives the soft-deleted entries
 */
public void getSubjectIdGroup(KbKnowledges kbKnowledges,
                              Map<Integer, List<String>> checkStatusMap,
                              Map<Integer, List<String>> publishStatusMap,
                              List<String> deleteList) {
    String[] subjectIds = kbKnowledges.getSubjectId().split(",");
    // Only documents updated during the last week are considered.
    long oneWeekMillis = 7 * 24 * 60 * 60 * 1000L;
    Date since = new Date(System.currentTimeMillis() - oneWeekMillis);

    BoolQueryBuilder subjectFilter = QueryBuilders.boolQuery()
            .must(QueryBuilders.termsQuery("subjectId", subjectIds))
            .filter(QueryBuilders.rangeQuery("updateDate").gte(DateUtil.getStringDate(since)));

    // Newest first; fetch only the fields needed for grouping.
    SearchSourceBuilder source = new SearchSourceBuilder()
            .sort("updateDate", SortOrder.DESC)
            .trackTotalHits(true)
            .query(subjectFilter)
            .fetchSource(new String[]{"id", "deleteFlag", "checkStatus", "publishStatus", "updateDate"}, null);

    Map<String, Object> response = esOpUtil.searchByQuery(Constants.ES_SUBJECT_DATA, 0, 10000, source);
    @SuppressWarnings("unchecked")
    List<Map<String, Object>> hits = (List<Map<String, Object>>) response.get("data");
    if (hits == null || hits.isEmpty()) {
        return;
    }

    for (Map<String, Object> hit : hits) {
        Object deleteFlag = hit.get("deleteFlag");
        boolean deleted = deleteFlag != null && "1".equals(deleteFlag.toString());
        if (deleted) {
            deleteList.add(hit.get("id").toString() + "," + hit.get("updateDate").toString());
        } else {
            groupMap(checkStatusMap, hit, hit.get("checkStatus"));
            groupMap(publishStatusMap, hit, hit.get("publishStatus"));
        }
    }
}
/**
 * Queries the knowledge index for documents of the record's subjects that were updated
 * during the last seven days and splits their ids into valid and deleted entries.
 *
 * <p>Every collected entry has the form {@code "<id>,<updateDate>"}. A document whose
 * {@code deleteFlag} is "1" goes to {@code deleteList}; every other document goes to
 * {@code validList}. At most 10000 hits are fetched, newest first. When the search
 * itself fails (the search helper returns {@code null}) both lists are left untouched.
 *
 * @param kbKnowledges record whose comma-separated {@code subjectId} selects the subjects
 * @param validList    output: entries of documents that are not flagged as deleted
 * @param deleteList   output: entries of documents flagged as deleted
 */
public void getKnowId(KbKnowledges kbKnowledges, List<String> validList, List<String> deleteList) {
    // Restrict the search to the subjects bound to this record.
    BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
    boolQuery.must(QueryBuilders.termsQuery("subjectId", kbKnowledges.getSubjectId().split(",")));
    // Only look at documents updated within the last week.
    Date oneWeekAgo = new Date(System.currentTimeMillis() - 7 * 24 * 60 * 60 * 1000L);
    boolQuery.filter(QueryBuilders.rangeQuery("updateDate").gte(DateUtil.getStringDate(oneWeekAgo)));

    SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
    // Newest updates first.
    searchSourceBuilder.sort("updateDate", SortOrder.DESC);
    searchSourceBuilder.trackTotalHits(true);
    searchSourceBuilder.query(boolQuery);
    // Fetch only the fields needed below.
    searchSourceBuilder.fetchSource(new String[]{"id", "deleteFlag", "verifyStatus", "updateDate"}, null);

    Map<String, Object> map = esOpUtil.searchByQuery(Constants.ES_DATA_FOR_KNOWLEDGE, 0, 10000, searchSourceBuilder);
    if (map == null) {
        // The search helper reports an I/O failure by returning null.
        log.warn("Knowledge data search failed, nothing collected. index:{}", Constants.ES_DATA_FOR_KNOWLEDGE);
        return;
    }
    @SuppressWarnings("unchecked") // the search helper stores a List<Map<String, Object>> under "data"
    List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("data");
    // The previous "list != null || !list.isEmpty()" dereferenced a null list.
    if (list == null || list.isEmpty()) {
        return;
    }
    for (Map<String, Object> mapItem : list) {
        String entry = mapItem.get("id").toString() + "," + mapItem.get("updateDate").toString();
        Object deleteFlag = mapItem.get("deleteFlag");
        if (deleteFlag != null && "1".equals(deleteFlag.toString())) {
            deleteList.add(entry);
        } else {
            // Anything not flagged as deleted counts as valid.
            validList.add(entry);
        }
    }
}
/**
 * Files one document under its status value.
 *
 * <p>The stored entry is {@code "<id>,<updateDate>"}, built from the document's
 * {@code id} and {@code updateDate} fields. A {@code null} status is treated as 0;
 * any other status is parsed from its string form.
 *
 * @param statusMap target map from status value to the entries having that status
 * @param mapItem   document source fields; must contain {@code id} and {@code updateDate}
 * @param status    raw status value of the document, may be {@code null}
 * @throws NumberFormatException if the status text is not an integer
 */
private void groupMap(Map<Integer, List<String>> statusMap, Map<String, Object> mapItem, Object status) {
    Object effectiveStatus = (status == null) ? Integer.valueOf(0) : status;
    Integer statusKey = Integer.valueOf(effectiveStatus.toString());
    String entry = mapItem.get("id").toString() + "," + mapItem.get("updateDate").toString();

    if (statusMap.containsKey(statusKey)) {
        statusMap.get(statusKey).add(entry);
        return;
    }
    // First document seen with this status: start a new bucket.
    List<String> bucket = new ArrayList<>();
    bucket.add(entry);
    statusMap.put(statusKey, bucket);
}
}
src/main/java/com/zzsn/knowbase/util/ContentUtility.java
0 → 100644
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
util
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
 * Static helpers for cleaning crawled HTML: stripping scripts, styles and layout tags,
 * decoding a small set of HTML entities, converting HTML to plain text and extracting
 * domain names from URLs.
 *
 * <p>All methods are stateless. Unless stated otherwise a {@code null} argument is not
 * accepted.
 */
public class ContentUtility {

    /** Compiles a case-insensitive pattern. */
    private static Pattern ci(String regex) {
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    static Pattern divNoneP = ci("(?s)<div[^>]*display:none[^>]*>.*?</div>");
    static Pattern divP = ci("<div>");
    static Pattern divRP = ci("</div>");
    static Pattern brP = ci("<br />");
    static Pattern br2P = ci("<br>");
    // The non-breaking-space entity (the literal had lost its entity name).
    static Pattern spaceP = ci("&nbsp;");
    static Pattern strongP = ci("<strong>");
    static Pattern strongRP = ci("</strong>");
    static Pattern pP = ci("<p>");
    static Pattern pRP = ci("</p>");
    static Pattern centerP = ci("<center[^>]*>");
    static Pattern centerRP = ci("</center>");
    static Pattern removeAttrP = ci("<([a-zA-Z0-9]+)[^>]*>");
    static Pattern commentP = ci("(?s)<!--[^>]*>.*?<![^>]*-->");
    static Pattern inputP = ci("<input[^>]*>");
    static Pattern formP = ci("<form[^>]*>");
    static Pattern formRP = ci("</form>");
    static Pattern buttonP = ci("(?s)<button[^>]*>.*?</button>");
    static Pattern iframeP = ci("(?s)<iframe[^>]*>.*?</iframe>");
    static Pattern noscriptP = ci("(?s)<noscript>.*?</noscript>");
    static Pattern objectP = ci("(?s)<object[^>]*>.*?</object>");
    static Pattern linkP = ci("(?s)<link[^>]*>");
    static Pattern imgReplaceP = ci("<img([^>]*)>");
    static Pattern imgRevReplaceP = ci("<_img([^>]*)>");
    static Pattern imgP = ci("<img[^>]*>");
    static Pattern imgRP = ci("</img>");
    public static Pattern aRemoveP = ci("(?s)<a[^>]*>.*?</a>");
    static Pattern legendRemoveP = ci("(?s)<legend[^>]*>.*?</legend>");
    static Pattern aP = ci("<a[^>]*>");
    static Pattern aRP = ci("</a>");
    static Pattern fontP = ci("<font[^>]*>");
    static Pattern fontRP = ci("</font>");
    static Pattern hP = ci("<h\\d[^>]*>");
    static Pattern hRP = ci("</h\\d>");
    static Pattern ulRP = ci("</ul>");
    static Pattern liRP = ci("</li>");
    static Pattern trRP = ci("</tr>");
    static Pattern tdRP = ci("</td>");
    static Pattern textareaRemoveP = ci("(?s)<textarea[^>]*>.*?</textarea>");
    static Pattern selectRemoveP = ci("(?s)<select[^>]*>.*?</select>");
    static Pattern optionRemoveP = ci("(?s)<option[^>]*>.*?</option>");
    static Pattern labelRemoveP = ci("(?s)<label[^>]*>.*?</label>");

    static String regHTMLNumcode = "&#(\\d{4,5});";
    static Pattern patHTMLNumCode = Pattern.compile(regHTMLNumcode);

    /** A 4-5 digit decimal character reference, with or without the closing semicolon. */
    private static final Pattern NUMERIC_REF = Pattern.compile("&#(\\d{4,5})(?!\\d);?");
    private static final Pattern PROCESSING_INSTRUCTION = Pattern.compile("<\\?[^>]*>");
    private static final Pattern TAG_GAP = Pattern.compile("(<[^>]*>)\\s*(<[^>]*>)");
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

    private static final Pattern SCRIPT_BLOCK = ci("(?s)<script\\s*.*?>(.*?)</script>");
    private static final Pattern STYLE_BLOCK = ci("(?s)<style\\s*.*?>(.*?)</style>");
    private static final Pattern SCRIPT_SELF_CLOSED = ci("(?s)<script\\s*.*?/>");
    private static final Pattern STYLE_SELF_CLOSED = ci("(?s)<style\\s*.*?/>");
    private static final Pattern IMG_SELF_CLOSED = ci("(?s)<img\\s*.*?/>");
    private static final Pattern HTML_COMMENT = Pattern.compile("(?s)<!--.*?-->");

    /** Containers that are dropped when they hold nothing but whitespace (case-sensitive). */
    private static final Pattern[] EMPTY_CONTAINERS = {
        Pattern.compile("(?s)<ul[^>]*>\\s*</ul>"),
        Pattern.compile("(?s)<div[^>]*>\\s*</div>"),
        Pattern.compile("(?s)<p[^>]*>\\s*</p>"),
        Pattern.compile("(?s)<li[^>]*>\\s*</li>"),
        Pattern.compile("(?s)<canvas[^>]*>\\s*</canvas>"),
    };

    /** Opening-tag prefixes whose attributes {@link #removeUselessAtt} keeps. */
    private static final String[] ATTRIBUTE_KEEPERS = {"<td ", "<th ", "<img ", "<a "};

    /** Entity to character table, applied in this order by {@link #HTMLDecode}. */
    private static final String[][] ENTITIES = {
        {"&quot;", "\""},
        {"&nbsp;", " "},
        {"&middot;", "\u00B7"},
        {"&amp;", "&"},
        {"&ldquo;", "\u201C"},
        {"&rdquo;", "\u201D"},
        {"&gt;", ">"},
        {"&lt;", "<"},
        {"&raquo;", "\u00BB"},
        {"&times;", "\u00D7"},
        {"&ccedil;", "\u00E7"},
        {"&atilde;", "\u00E3"},
        {"&ecirc;", "\u00EA"},
    };

    private static final Pattern DOMAIN = ci("(?<=//|)((\\w)+\\.)+[\\s\\S]+?(?=\\/|\\:|\\?)");

    // Dots are escaped so that only a real '.' separates the name from its suffix.
    private static final Pattern REGISTRABLE_DOMAIN = ci(
        "[a-zA-Z0-9-]+\\.(cn|com|cdt|com\\.mo|nl|us|biz|de|org\\.sa|info|ee|org\\.zw|co\\.uk|ie|com\\.sg"
            + "|co\\.ke|be|eu|com\\.cn|gov\\.cn|co\\.kr|sh\\.cn|cssn\\.cn|org|ac\\.cn|co|org\\.cn|net|org\\.uk"
            + "|hk|fr|no|se|org\\.sg|bg|org\\.pl|cz|at|org\\.nz|or\\.jp|mu|org\\.pe|com\\.hk|net\\.cn|mil|edu"
            + "|edu\\.cn|cas\\.cn|hk|tw|tv|me|cc|COM|ORG|NET|MIL|EDU)$");

    /** Removes every match of each pattern, in the given order. */
    private static String strip(String text, Pattern... patterns) {
        for (Pattern pattern : patterns) {
            text = pattern.matcher(text).replaceAll("");
        }
        return text;
    }

    /** Replaces every match of each pattern with a blank line, in the given order. */
    private static String toBlankLine(String text, Pattern... patterns) {
        for (Pattern pattern : patterns) {
            text = pattern.matcher(text).replaceAll("\n\n");
        }
        return text;
    }

    /**
     * Removes markup that carries no article content: scripts, styles, hidden divs, form
     * controls, comments, iframes, objects, center tags, cufon tags and empty containers.
     * Attributes are stripped from all tags except td, th, img and a. Links and images
     * themselves are kept.
     *
     * @param htmlText HTML to clean; {@code null} yields {@code null}
     * @return the cleaned HTML
     */
    public static String RemoveUselessHTMLTagX(String htmlText) {
        if (htmlText == null) {
            return null;
        }
        htmlText = RemoveStyleCode(htmlText);
        htmlText = htmlText.replace("&nbsp;", " ");
        htmlText = strip(htmlText, divNoneP, textareaRemoveP, selectRemoveP, optionRemoveP,
            labelRemoveP, inputP, formP, buttonP, formRP);
        htmlText = removeUselessAtt(htmlText);
        htmlText = strip(htmlText, commentP, legendRemoveP, iframeP, noscriptP, objectP,
            centerP, centerRP);
        htmlText = htmlText.replace("<cufontext>", "").replace("</cufontext>", "")
            .replace("<cufon>", "").replace("</cufon>", "");
        return strip(htmlText, EMPTY_CONTAINERS);
    }

    /**
     * Turns HTML into text by mapping block-level tags to blank lines and deleting every
     * other tag. Entities other than the non-breaking space and the bullet are left for
     * {@link #HTMLDecode}.
     *
     * @param src HTML fragment
     * @return the trimmed text
     */
    public static String RemoveHTMLCode(String src) {
        // Drop whitespace between adjacent tags so it does not become stray text.
        src = TAG_GAP.matcher(src).replaceAll("$1$2");
        src = toBlankLine(src, divP, divRP, brP, br2P);
        src = spaceP.matcher(src).replaceAll(" ");
        src = src.replace("&bull;", "\u2022");
        src = strip(src, strongP, strongRP);
        src = toBlankLine(src, pP, pRP);
        src = strip(src, aP, aRP, imgP, fontP, fontRP);
        src = toBlankLine(src, hRP, ulRP, liRP, trRP, tdRP);
        return ANY_TAG.matcher(src).replaceAll("").trim();
    }

    /**
     * Strips the attributes from every opening tag except td, th, img and a
     * (matched case-insensitively, and only when the tag name is followed by a space).
     *
     * @param htmlText HTML fragment
     * @return the HTML with attribute-free tags
     */
    public static String removeUselessAtt(String htmlText) {
        Matcher tag = removeAttrP.matcher(htmlText);
        StringBuffer out = new StringBuffer(htmlText.length());
        while (tag.find()) {
            String whole = tag.group();
            String replacement = keepsAttributes(whole) ? whole : "<" + tag.group(1) + ">";
            // quoteReplacement: attribute values may contain '$' or '\'.
            tag.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        tag.appendTail(out);
        return out.toString();
    }

    /** Tells whether the opening tag is one whose attributes must survive. */
    private static boolean keepsAttributes(String tag) {
        for (String prefix : ATTRIBUTE_KEEPERS) {
            if (tag.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decodes a fixed set of named entities and 4-5 digit decimal character references,
     * and removes {@code <?...>} processing instructions.
     *
     * <p>Text that is not part of a character reference, including semicolons and plain
     * digit runs, is left untouched.
     *
     * @param str text to decode
     * @return the decoded text
     */
    public static String HTMLDecode(String str) {
        for (String[] entity : ENTITIES) {
            str = str.replace(entity[0], entity[1]);
        }
        str = PROCESSING_INSTRUCTION.matcher(str).replaceAll("");

        Matcher ref = NUMERIC_REF.matcher(str);
        StringBuffer out = new StringBuffer(str.length());
        while (ref.find()) {
            // toChars handles code points above 65535 that a plain char cast would truncate.
            String decoded = new String(Character.toChars(Integer.parseInt(ref.group(1))));
            ref.appendReplacement(out, Matcher.quoteReplacement(decoded));
        }
        ref.appendTail(out);
        return out.toString();
    }

    /**
     * Deletes all carriage returns and line feeds.
     *
     * @param src text to flatten
     * @return the text on a single line
     */
    public static String RemoveHTMLReturnCode(String src) {
        return src.replace("\r", "").replace("\n", "");
    }

    /**
     * Extracts the plain text of an HTML string: line breaks are removed from the source,
     * scripts and styles are dropped, tags are converted by {@link #RemoveHTMLCode} and
     * entities are decoded by {@link #HTMLDecode}; the result is then whitespace-normalised.
     *
     * @param htmlText HTML to convert; {@code null} yields {@code null}
     * @return the extracted text
     */
    public static String TransferHTML2Text(String htmlText) {
        if (htmlText == null) {
            return null;
        }
        String text = HTMLDecode(RemoveHTMLCode(RemoveStyleCode(RemoveHTMLReturnCode(htmlText))));
        // NOTE(review): the first and last literal below are reproduced as they appear in the
        // reviewed copy (a single space); they may have been a different whitespace character
        // or entity in the repository - confirm before merging.
        text = text.replaceAll(" ", "\r\n");
        text = text.replaceAll(" +\r\n", "\r\n");
        text = text.replaceAll(" +", " ");
        text = text.replaceAll("[\\u00A0\\u3000]", "");
        text = text.replaceAll(" ", "");
        return text;
    }

    /**
     * Removes script and style blocks, self-closed script/style/img tags, noscript, object
     * and link tags, and HTML comments.
     *
     * @param content HTML to clean; {@code null} yields {@code null}
     * @return the cleaned HTML
     */
    public static String RemoveStyleCode(String content) {
        if (content == null) {
            return null;
        }
        return strip(content, SCRIPT_BLOCK, STYLE_BLOCK, SCRIPT_SELF_CLOSED, STYLE_SELF_CLOSED,
            noscriptP, objectP, linkP, IMG_SELF_CLOSED, HTML_COMMENT);
    }

    /**
     * Returns the host part of a URL, for example {@code www.baidu.com}.
     *
     * <p>The host is only recognised when it is followed by {@code /}, {@code :} or
     * {@code ?}.
     *
     * @param sourceAddress URL to inspect; may be {@code null} or blank
     * @return the host, or an empty string when none is found
     */
    public static String domainURL(String sourceAddress) {
        if (sourceAddress == null || sourceAddress.trim().length() == 0) {
            return "";
        }
        Matcher host = DOMAIN.matcher(sourceAddress);
        return host.find() ? host.group() : "";
    }

    /**
     * Drops the leading sub-domain labels of a host name, for example
     * {@code finance.sina.com.cn} becomes {@code sina.com.cn}.
     *
     * @param domainStr host name; may be {@code null}
     * @return the shortened domain, or an empty string when the suffix is not recognised
     */
    public static String cutDomainPrefix(String domainStr) {
        if (domainStr == null) {
            return "";
        }
        Matcher domain = REGISTRABLE_DOMAIN.matcher(domainStr);
        return domain.find() ? domain.group() : "";
    }
}
src/main/java/com/zzsn/knowbase/util/EsOpUtil.java
浏览文件 @
310640c1
...
...
@@ -578,7 +578,47 @@ public class EsOpUtil {
return
null
;
}
}
/**
 * Runs a prepared search against one index and returns the hits as plain maps.
 *
 * <p>Exact hit counting is always switched on. Paging is applied only when both
 * {@code start} and {@code size} are non-negative; otherwise the builder's own
 * from/size settings are used.
 *
 * @param index               name of the index to search
 * @param start               offset of the first hit, or a negative value to leave paging untouched
 * @param size                maximum number of hits, or a negative value to leave paging untouched
 * @param searchSourceBuilder query, sorting and source filtering; modified by this call
 * @return a map with {@code "total"} (the total hit count as {@code Long}) and {@code "data"}
 *         (a {@code List<Map<String, Object>>} of hit sources), or {@code null} when the
 *         search fails with an {@code IOException}
 */
public Map<String, Object> searchByQuery(String index, int start, int size, SearchSourceBuilder searchSourceBuilder) {
    SearchRequest request = new SearchRequest(index);
    // Without this the reported total is capped at 10000.
    searchSourceBuilder.trackTotalHits(true);
    boolean pagingRequested = start >= 0 && size >= 0;
    if (pagingRequested) {
        searchSourceBuilder.from(start);
        searchSourceBuilder.size(size);
    }
    request.source(searchSourceBuilder);

    final SearchResponse response;
    try {
        response = client.search(request, RequestOptions.DEFAULT);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    SearchHits hits = response.getHits();
    SearchHit[] found = hits.getHits();
    // Only the stored source of each hit is returned; the score is not included.
    List<Map<String, Object>> sources = new ArrayList<>(found.length);
    for (int i = 0; i < found.length; i++) {
        sources.add(found[i].getSourceAsMap());
    }

    Map<String, Object> resultMap = new HashMap<>();
    Long total = hits.getTotalHits().value;
    resultMap.put("total", total);
    resultMap.put("data", sources);
    return resultMap;
}
public
boolean
existBySourceAddress
(
String
index
,
String
sourceAddress
)
{
SearchRequest
searchRequest
=
new
SearchRequest
(
index
);
SearchSourceBuilder
searchSourceBuilder
=
new
SearchSourceBuilder
();
...
...
src/main/java/com/zzsn/knowbase/util/HtmlUtil.java
0 → 100644
浏览文件 @
310640c1
package
com
.
zzsn
.
knowbase
.
util
;
import
com.alibaba.fastjson.JSON
;
import
org.apache.commons.lang3.StringUtils
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
public
class
HtmlUtil
{
/**
 * Normalises HTML into indented paragraphs without filtering any paragraph by keyword.
 * Equivalent to calling the three-argument overload with both keyword lists set to
 * {@code null}.
 *
 * @param html HTML to format
 * @return the formatted HTML
 */
public static String formatHtml(String html) {
    final String noKeywordFilter = null;
    return formatHtml(html, noKeywordFilter, noKeywordFilter);
}
/**
 * Normalises HTML into a flat sequence of {@code <p>} paragraphs.
 *
 * <p>Steps: line breaks are removed; {@code <figure>} and {@code <table>} elements are set
 * aside behind placeholders and restored untouched at the end; div, html, body, head, em,
 * u, li, span and p tags are flattened so that every paragraph boundary becomes a
 * {@code <br />}; each non-blank segment is wrapped in a centred paragraph when it starts
 * with {@code "<img "} and in a 2em-indented paragraph otherwise; finally anchor tags and
 * script-related tags are removed.
 *
 * <p>NOTE(review): the two space-stripping literals were unreadable in the reviewed copy
 * (both appeared as a single ASCII space, which would make the {@code "<img "} check below
 * unreachable). They are written here as the {@code &nbsp;} entity and the ideographic
 * space U+3000 - confirm against the repository.
 *
 * @param html        HTML to format
 * @param cleanInKeys JSON array of strings, or empty/{@code null} for no filtering; a segment
 *                    is dropped when it contains every {@code +}-separated part of any entry
 * @param cleanEqKeys JSON array of strings, or empty/{@code null} for no filtering; a segment
 *                    is dropped when its trimmed text equals any entry
 * @return the formatted HTML
 */
public static String formatHtml(String html, String cleanInKeys, String cleanEqKeys) {
    html = html.replaceAll("\r", "").replaceAll("\n", "");

    // Set figures aside so they are not reformatted.
    List<String> figureStrList = new ArrayList<>();
    Matcher figureMatcher = Pattern.compile("<figure.*?</figure>").matcher(html);
    while (figureMatcher.find()) {
        figureStrList.add(figureMatcher.group());
    }
    for (int i = 0; i < figureStrList.size(); i++) {
        html = html.replace(figureStrList.get(i), "current_figure_wlan" + i + "current_figure_wlan");
    }

    // Set tables aside so they are not reformatted.
    List<String> tableStrList = new ArrayList<>();
    Matcher tableMatcher = Pattern.compile("<table.*?</table>").matcher(html);
    while (tableMatcher.find()) {
        tableStrList.add(tableMatcher.group());
    }
    for (int i = 0; i < tableStrList.size(); i++) {
        html = html.replace(tableStrList.get(i), "current_table_wlan" + i + "current_table_wlan");
    }

    html = html.replaceAll("<div", "<p");
    html = html.replaceAll("</div>", "</p>");
    String[] removedMarkup = {
        "<html[^>]*>", "</html>", "<body[^>]*>", "</body>", "<head[^>]*>", "</head>",
        "<em[^>]*>", "</em>", "<u[^>]*>", "</u>", "<li[^>]*>", "</li>",
        "<span[^>]*>", "</span>", "&nbsp;", "\u3000", "<p></p>"
    };
    for (String regex : removedMarkup) {
        html = html.replaceAll(regex, "");
    }
    html = html.replaceAll("<a", "<a rel=\"nofollow\"");
    // <br data-cke-filler="true"> marks an empty line; drop any attributes on br.
    html = html.replaceAll("<br[^>]*>", "<br />");
    html = html.replaceAll("</br[^>]*>", "<br />");
    // Turn paragraph boundaries into <br />.
    html = html.replaceAll("<p[^>]*>", "");
    html = html.replaceAll("</p>", "<br />");
    html = html.replaceAll("<br /><br />", "<br />");
    html = html.replaceAll("\n", "<br />");
    html = html.replaceAll("\r", "");

    // Keyword lists are parsed on first use and then reused for every segment.
    List<String> inKeyGroups = null;
    List<String> eqKeys = null;
    StringBuilder formatted = new StringBuilder();
    for (String segment : html.split("<br[^>]*>")) {
        if (StringUtils.isEmpty(segment) || StringUtils.isEmpty(segment.trim())) {
            continue;
        }
        // Drop segments that contain all parts of any keyword combination.
        if (StringUtils.isNotEmpty(cleanInKeys)) {
            if (inKeyGroups == null) {
                inKeyGroups = JSON.parseArray(cleanInKeys, String.class);
            }
            boolean drop = false;
            for (String group : inKeyGroups) {
                boolean allPartsPresent = false;
                for (String part : group.split("\\+")) {
                    allPartsPresent = segment.contains(part);
                    if (!allPartsPresent) {
                        break;
                    }
                }
                if (allPartsPresent) {
                    drop = true;
                    break;
                }
            }
            if (drop) {
                continue;
            }
        }
        // Drop segments that are exactly one of the keywords.
        if (StringUtils.isNotEmpty(cleanEqKeys)) {
            if (eqKeys == null) {
                eqKeys = JSON.parseArray(cleanEqKeys, String.class);
            }
            if (eqKeys.contains(segment.trim())) {
                continue;
            }
        }
        String body = segment.trim();
        String openTag = body.startsWith("<img ")
            ? "<p style=\"text-align:center;\">"
            : "<p style=\"text-indent:2em;\">";
        formatted.append(openTag).append(body).append("</p>");
    }
    html = formatted.toString();

    html = removeLink(html);
    html = removeJavascript(html);

    // Remove paragraphs that ended up empty.
    html = html.replaceAll("<p> </p>", "");
    html = html.replaceAll("<p></p>", "");
    html = html.replaceAll("<p style=\"text-indent:2em;\"> </p>", "");
    html = html.replaceAll("<p style=\"text-indent:2em;\"></p>", "");
    html = html.replaceAll("<p style=\"text-indent:2em;\"><title></title></p>", "");

    // Put the tables back, unwrapping a placeholder that became a paragraph of its own.
    for (int i = 0; i < tableStrList.size(); i++) {
        String placeholder = "current_table_wlan" + i + "current_table_wlan";
        html = html.replace("<p style=\"text-indent:2em;\">" + placeholder + "</p>", tableStrList.get(i));
        html = html.replace(placeholder, tableStrList.get(i));
    }
    // Put the figures back the same way.
    for (int i = 0; i < figureStrList.size(); i++) {
        String placeholder = "current_figure_wlan" + i + "current_figure_wlan";
        html = html.replace("<p style=\"text-indent:2em;\">" + placeholder + "</p>", figureStrList.get(i));
        html = html.replace(placeholder, figureStrList.get(i));
    }
    return html;
}
/**
 * Removes every opening and closing anchor tag while keeping the text between them.
 *
 * @param html HTML fragment
 * @return the HTML without {@code <a ...>} and {@code </a>} tags
 */
public static String removeLink(String html) {
    String withoutOpeningTags = html.replaceAll("<a[^>]*>", "");
    return withoutOpeningTags.replaceAll("</a[^>]*>", "");
}
/**
 * Removes meta, script, iframe and frame tags (the text between them is kept) and
 * neutralises {@code javascript:} URLs.
 *
 * <p>NOTE(review): in the reviewed copy the last replacement mapped {@code javascript:}
 * to itself, i.e. did nothing. It is written here with a full-width colon (U+FF1A) so the
 * scheme no longer parses - confirm the intended replacement against the repository.
 *
 * @param html HTML fragment
 * @return the HTML without the script-related tags
 */
public static String removeJavascript(String html) {
    String[] removedTags = {
        "<meta[^>]*>", "<script[^>]*>", "</script[^>]*>",
        "<iframe[^>]*>", "</iframe[^>]*>", "<frame[^>]*>"
    };
    for (String regex : removedTags) {
        html = html.replaceAll(regex, "");
    }
    return html.replace("javascript:", "javascript\uFF1A");
}
/**
 * Removes every {@code <table>...</table>} element, including tables whose markup spans
 * several lines.
 *
 * @param html HTML fragment
 * @return the HTML without tables
 */
public static String removeTabel(String html) {
    // (?s) lets '.' cross line breaks; without it a multi-line table is never matched.
    return html.replaceAll("(?s)<table.*?</table>", "");
}
/**
 * Splits an HTML document into plain-text chunks.
 *
 * <p>The HTML is first normalised with {@code formatHtml}, tables are removed and the
 * remaining markup is flattened so that every paragraph boundary becomes a
 * {@code <br />}. The text of consecutive paragraphs is accumulated and emitted as
 * follows: once the buffer exceeds 500 characters it is cut at the Chinese full stop into
 * pieces of more than 150 characters (plus one remainder); once it exceeds 30 characters
 * it is emitted as is; shorter text is carried over and merged with the next paragraph.
 *
 * <p>NOTE(review): the two space-stripping literals were unreadable in the reviewed copy
 * (both appeared as the same single ASCII space). They are written here as the
 * {@code &nbsp;} entity and the ideographic space U+3000 - confirm against the repository.
 *
 * @param html HTML to split
 * @return the text chunks in document order
 */
public static List<String> splitContents(String html) {
    List<String> list = new ArrayList<>();
    html = formatHtml(html);
    html = html.replaceAll("\n|\r", "");
    html = html.replaceAll("<table.*?</table>", "");
    html = html.replaceAll("</div>", "</p>");
    html = html.replaceAll("<div", "<p");
    String[] removedMarkup = {
        "<figure[^>]*>", "</figure>", "<strong[^>]*>", "</strong>",
        "<html[^>]*>", "</html>", "<body[^>]*>", "</body>", "<head[^>]*>", "</head>",
        "<em[^>]*>", "</em>", "<u[^>]*>", "</u>", "<li[^>]*>", "</li>",
        "<span[^>]*>", "</span>", "&nbsp;", "\u3000", "<p></p>"
    };
    for (String regex : removedMarkup) {
        html = html.replaceAll(regex, "");
    }
    html = html.replaceAll("<a", "<a rel=\"nofollow\"");
    // <br data-cke-filler="true"> marks an empty line; drop any attributes on br.
    html = html.replaceAll("<br[^>]*>", "<br />");
    html = html.replaceAll("</br[^>]*>", "<br />");
    // Turn paragraph boundaries into <br />.
    html = html.replaceAll("<p[^>]*>", "");
    html = html.replaceAll("</p>", "<br />");
    html = html.replaceAll("<br /><br />", "<br />");
    html = html.replaceAll("\n", "<br />");
    html = html.replaceAll("\r", "");

    StringBuilder pending = new StringBuilder();
    for (String paragraph : html.split("<br[^>]*>")) {
        pending.append(ContentUtility.TransferHTML2Text(paragraph.trim()));
        if (pending.length() > 500) {
            // Too long for one chunk: regroup its sentences into pieces of more than 150 characters.
            StringBuilder piece = new StringBuilder();
            for (String sentence : pending.toString().split("。")) {
                piece.append(sentence.trim()).append("。");
                if (piece.length() > 150) {
                    list.add(piece.toString());
                    piece = new StringBuilder();
                }
            }
            if (piece.length() > 0) {
                list.add(piece.toString());
            }
            pending = new StringBuilder();
        } else if (pending.length() > 30) {
            // Between 31 and 500 characters: emit unchanged.
            list.add(pending.toString());
            pending = new StringBuilder();
        }
    }
    // Whatever is left was too short to be emitted on its own.
    if (pending.length() > 0) {
        list.add(pending.toString());
    }
    return list;
}
public
static
void
main
(
String
[]
args
)
{
System
.
out
.
println
(
formatHtml
(
"<p>标题</p><p>摘要</p><p>正搜索文</p>"
,
"[\"正+文\",\"正文\"]"
,
"[\"标题\",\"要\"]"
));
System
.
out
.
println
(
formatHtml
(
"<html>\n"
+
" <head>111</head>\n"
+
" <body> \n"
+
" <div> \n"
+
" <div> \n"
+
" <table> \n"
+
" <tbody> \n"
+
" <tr> \n"
+
" <td style=\"\" width=\"54\">品名</td> \n"
+
" <td style=\"\" width=\"54\">规格(mm)</td> \n"
+
" <td style=\"\" width=\"54\">材质</td> \n"
+
" <td style=\"\" width=\"54\">钢厂/产地</td> \n"
+
" <td style=\"\" width=\"54\">价格(元/吨)</td> \n"
+
" <td style=\"\" width=\"54\">涨跌</td> \n"
+
" <td style=\"\" width=\"54\">备注</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td align=\"right\">6</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">5050</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td align=\"right\">8</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4860</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td align=\"right\">10</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4760</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">经销</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td align=\"right\">12</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4290</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td align=\"right\">14</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4250</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>16-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4210</td> \n"
+
" <td>-</td> \n"
+
" <td><br></td> \n"
+
" <td style=\"min-width: 30px\">代理</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>酒钢</td> \n"
+
" <td align=\"right\">4190</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>柳钢</td> \n"
+
" <td align=\"right\">4190</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>临钢</td> \n"
+
" <td align=\"right\">4190</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>萍钢</td> \n"
+
" <td align=\"right\">4190</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>南钢</td> \n"
+
" <td align=\"right\">4190</td> \n"
+
" <td>-</td> \n"
+
" <td>无货</td> \n"
+
" <td style=\"min-width: 30px\">代理</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普中板</td> \n"
+
" <td>22-30</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4330</td> \n"
+
" <td>-</td> \n"
+
" <td><br></td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td>32-40</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4320</td> \n"
+
" <td>-</td> \n"
+
" <td><br></td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td align=\"right\">30</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>酒钢</td> \n"
+
" <td align=\"right\">4290</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td align=\"right\">40</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>酒钢</td> \n"
+
" <td align=\"right\">4270</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td align=\"right\">40</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>萍钢</td> \n"
+
" <td align=\"right\">4290</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td align=\"right\">50</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4360</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">普厚板</td> \n"
+
" <td align=\"right\">60</td> \n"
+
" <td>Q235B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4410</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td align=\"right\">6</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4980</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td align=\"right\">8</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4830</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td align=\"right\">10</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4750</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td align=\"right\">12</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4490</td> \n"
+
" <td>-</td> \n"
+
" <td>无货</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td align=\"right\">14</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4420</td> \n"
+
" <td>-</td> \n"
+
" <td>无货</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>16-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4400</td> \n"
+
" <td>-</td> \n"
+
" <td><br></td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>酒钢</td> \n"
+
" <td align=\"right\">4380</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>萍钢</td> \n"
+
" <td align=\"right\">4380</td> \n"
+
" <td>-</td> \n"
+
" <td>货少</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>临钢</td> \n"
+
" <td align=\"right\">4380</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>柳钢</td> \n"
+
" <td align=\"right\">4370</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金中板</td> \n"
+
" <td>14-20</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>南钢</td> \n"
+
" <td align=\"right\">4370</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金厚板</td> \n"
+
" <td>22-30</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>重钢</td> \n"
+
" <td align=\"right\">4450</td> \n"
+
" <td>-</td> \n"
+
" <td><br></td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" <tr> \n"
+
" <td style=\"\">低合金厚板</td> \n"
+
" <td align=\"right\">30</td> \n"
+
" <td>Q355B</td> \n"
+
" <td>酒钢</td> \n"
+
" <td align=\"right\">4430</td> \n"
+
" <td>-</td> \n"
+
" <td>无货gl</td> \n"
+
" <td style=\"min-width: 30px\">商家</td> \n"
+
" </tr> \n"
+
" </tbody> \n"
+
" </table> \n"
+
" <p style=\"text-indent:2rem\"><br></p> \n"
+
" </div> \n"
+
" </div> \n"
+
" </body>\n"
+
"</html>"
));
}
/**
 * Wraps matches of the given keys in a red {@code <span>} so they render highlighted;
 * intended for titles and summaries.
 *
 * <p>{@code keys} is broken up with {@code split("")}, i.e. into one-character strings,
 * and every distinct piece is highlighted wherever it occurs in the text.
 *
 * @param text original text
 * @param keys data to highlight; when empty (per {@code StringUtils.isEmpty}) the text is
 *             returned unchanged
 * @return the text with each match wrapped in {@code <span style='color: #f73131;'>}
 */
public static String gaoLiang(String text, String keys) {
    // Nothing to highlight: hand the input back untouched.
    if (StringUtils.isEmpty(keys)) {
        return text;
    }

    final String openTag = "<span style='color: #f73131;'>";
    final String closeTag = "</span>";
    // Stand-ins for the highlight tags while a key is being replaced.
    final String openMark = "☛";
    final String closeMark = "☚";

    // The set removes duplicate pieces so each one is processed only once.
    Set<String> distinctKeys = new HashSet<>(Arrays.asList(keys.split("")));

    // Per the original comment this strips pre-existing tags from the text
    // (helper lives in ContentUtility; behavior not visible here).
    String result = ContentUtility.TransferHTML2Text(text);

    for (String key : distinctKeys) {
        // Mask highlight tags inserted by earlier iterations so that a key which
        // happens to occur inside the tag markup itself is not replaced there.
        String masked = result.replace(openTag, openMark).replace(closeTag, closeMark);
        String highlighted = masked.replace(key, openTag + key + closeTag);
        // Restore the masked tags.
        result = highlighted.replace(openMark, openTag).replace(closeMark, closeTag);
    }
    return result;
}
}
src/main/java/com/zzsn/knowbase/vo/KnowledgeParam.java
浏览文件 @
310640c1
...
...
@@ -78,5 +78,6 @@ public class KnowledgeParam {
*/
private
String
verifyEndTime
;
private
String
kbKnowledgeId
;
private
String
subjectId
;
}
src/main/java/com/zzsn/knowbase/vo/KnowledgeVO.java
浏览文件 @
310640c1
...
...
@@ -20,25 +20,40 @@ import java.util.List;
/**
 * Field holder for a knowledge entry: plain private fields only, no methods are declared here.
 *
 * NOTE(review): this listing comes from a diff view and declares contentAll, verifyTime,
 * verifyStatus and verifierName twice -- presumably removed and added lines shown together;
 * confirm against the real file before relying on the field order below.
 */
@NoArgsConstructor
@AllArgsConstructor
public
class
KnowledgeVO
{
private
String
content
;
private
String
id
;
private
String
title
;
private
String
verifyTime
;
private
String
publishDate
;
private
String
updateDate
;
private
String
createTime
;
private
String
kbKnowledgeId
;
private
String
KnowledgeProjectId
;
private
String
type
;
private
String
typeId
;
private
String
verifierName
;
private
String
origin
;
private
String
author
;
private
String
contentAll
;
private
Integer
score
;
private
Integer
verifyStatus
;
private
Integer
importData
;
private
String
subjectId
;
private
String
contentWithTag
;
private
String
contentAll
;
private
List
<
Content
>
contents
;
private
List
<
KnowFile
>
files
;
/**
* Review time
*/
private
String
verifyTime
;
/**
* Review status (0 = not reviewed, 1 = review passed, 2 = review rejected)
*/
private
Integer
verifyStatus
;
/**
* Reviewer id
*/
private
String
verifierId
;
/**
* Reviewer name
*/
private
String
verifierName
;
private
Integer
deleteFlag
;
}
src/main/resources/application.yml
浏览文件 @
310640c1
...
...
@@ -61,8 +61,8 @@ document:
host
:
http://114.116.116.241:9088
# host: http://192.168.1.71:9088
files
:
storage
:
/storage/know/
# storage:
C:
/know/
storage
:
/
zzsn/nt/
storage/know/
# storage:
D:/storage
/know/
docservice
:
url
:
site
:
http://114.116.116.241:80/
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论