继续操作前请注册或者登录。
提交 a53dafc5 作者: zgz

数据返回副标题

上级 aa655624
...@@ -90,7 +90,7 @@ public class InformationServiceImpl implements InformationService { ...@@ -90,7 +90,7 @@ public class InformationServiceImpl implements InformationService {
private SysDictItemService sysDictItemService; private SysDictItemService sysDictItemService;
@Autowired @Autowired
private PythonUtil pythonUtil; private PythonUtil pythonUtil;
private String subjectId = "1898653164373065730";//中外智库专栏对应专题id
@Override @Override
public IPage<EventDataVO> collectPageList(InfoDataSearchCondition eventDataCondition) { public IPage<EventDataVO> collectPageList(InfoDataSearchCondition eventDataCondition) {
...@@ -235,6 +235,35 @@ public class InformationServiceImpl implements InformationService { ...@@ -235,6 +235,35 @@ public class InformationServiceImpl implements InformationService {
info.setIndex(num + "."); info.setIndex(num + ".");
dataList.add(info); dataList.add(info);
} }
//中外智库专栏-数据需过滤
if(StringUtils.isNotBlank(searchCondition.getSubjectId()) && searchCondition.getSubjectId().equals(subjectId)){
ArrayList<DisplayInfo> saveList = new ArrayList<>(list.size());
List<String> titles = new ArrayList<>(list.size());
for(DisplayInfo s: dataList){
String title = s.getTitle().trim();
int tem = 0;
for(String t:titles){
double simforcatl = SimilarityUtil.simforcatl(title, t);
if(simforcatl>0.83){
log.info("getArticleNode 获取文章列表中标题为[{}],和标题为[{}],的相似度大于80%过滤后者",title,t);
tem=1;
break;
}
}
if(tem==0){
titles.add(title);
}else{
continue;
}
saveList.add(s);
}
for(DisplayInfo info: saveList){
if(StringUtils.isNotBlank(info.getSubtitle())){
info.setTitle(info.getSubtitle()+info.getTitle());
}
}
return Result.OK(saveList);
}
return Result.OK(dataList); return Result.OK(dataList);
} }
} }
......
package com.zzsn.event.util;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * String-similarity helpers used to compare short texts such as titles.
 *
 * <p>Two families of scores are provided:
 * <ul>
 *   <li>{@link #ld} / {@link #sim}: classic Levenshtein edit distance and a
 *       similarity derived from it;</li>
 *   <li>{@link #ldforcatlcn} / {@link #ldforcatlen} / {@link #simforcatl}: a cheap
 *       overlap score (shared characters for Chinese text, shared letter-words
 *       for ASCII text).</li>
 * </ul>
 *
 * @author jianpo.mo
 */
public class SimilarityUtil {

    /** Matches a run of characters in the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+");

    /** Separator between "words": any run of characters that are not ASCII letters. */
    private static final Pattern NON_LETTER_RUN = Pattern.compile("[^a-zA-Z]+");

    /**
     * Computes the Levenshtein edit distance between two strings.
     *
     * @param str1 first string, must not be {@code null}
     * @param str2 second string, must not be {@code null}
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions needed to turn {@code str1} into {@code str2}
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int ld(String str1, String str2) {
        int n = str1.length();
        int m = str2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // Only the previous and the current row of the DP matrix are ever read,
        // so two rows are enough.
        int[] previous = new int[m + 1];
        int[] current = new int[m + 1];
        for (int j = 0; j <= m; j++) {
            previous[j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char ch1 = str1.charAt(i - 1);
            current[0] = i;
            for (int j = 1; j <= m; j++) {
                int cost = (ch1 == str2.charAt(j - 1)) ? 0 : 1;
                // deletion (above + 1), insertion (left + 1), substitution (diagonal + cost)
                int best = Math.min(previous[j] + 1, current[j - 1] + 1);
                current[j] = Math.min(best, previous[j - 1] + cost);
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[m];
    }

    /**
     * Edit-distance based similarity: {@code 1 - ld / min(len1, len2)}, limited to
     * the range {@code [0, 1]}.
     *
     * <p>Empty and {@code null} inputs are handled explicitly because the formula
     * would otherwise divide by zero: two empty strings score {@code 1.0}, an empty
     * string against a non-empty one scores {@code 0.0}, and a {@code null} argument
     * scores {@code 0.0}. Results that the formula would make negative (edit distance
     * larger than the shorter string) are reported as {@code 0.0}.
     *
     * @param str1 first string, may be {@code null}
     * @param str2 second string, may be {@code null}
     * @return similarity in {@code [0, 1]}
     */
    public static double sim(String str1, String str2) {
        if (str1 == null || str2 == null) {
            return 0.0;
        }
        int shorter = Math.min(str1.length(), str2.length());
        if (shorter == 0) {
            return str1.length() == str2.length() ? 1.0 : 0.0;
        }
        double score = 1 - (double) ld(str1, str2) / shorter;
        return Math.max(0.0, score);
    }

    /**
     * Character-overlap count for two strings (used for Chinese text).
     *
     * <p>Counts how many character positions of {@code str1} hold a character that
     * occurs anywhere in {@code str2}, does the same in the other direction, and
     * returns the smaller of the two counts.
     *
     * @param str1 first string, may be {@code null}
     * @param str2 second string, may be {@code null}
     * @return the overlap count; {@code 0} if either argument is {@code null} or empty
     */
    public static int ldforcatlcn(String str1, String str2) {
        if (str1 == null || str2 == null) {
            return 0;
        }
        if (str1.isEmpty() || str2.isEmpty()) {
            return 0;
        }
        int firstInSecond = countCharsContained(str1, str2);
        int secondInFirst = countCharsContained(str2, str1);
        return Math.min(firstInSecond, secondInFirst);
    }

    /**
     * Word-overlap ratio for two strings (used for ASCII text).
     *
     * <p>Each string is split into words on runs of non-letters. A word counts as
     * shared when it occurs as a substring of the other (raw) string. The result is
     * {@code min(shared1, shared2) / min(words1, words2)}.
     *
     * <p>Empty tokens produced by a leading separator are discarded: an empty string
     * is a substring of everything and would otherwise inflate the score.
     *
     * @param str1 first string, may be {@code null}
     * @param str2 second string, may be {@code null}
     * @return ratio in {@code [0, 1]}; {@code 0} if either argument is {@code null}
     *         or contains no letter-words
     */
    public static double ldforcatlen(String str1, String str2) {
        if (str1 == null || str2 == null) {
            return 0;
        }
        List<String> words1 = letterWords(str1);
        List<String> words2 = letterWords(str2);
        int n = words1.size();
        int m = words2.size();
        if (n == 0 || m == 0) {
            return 0;
        }
        int firstInSecond = countWordsContained(words1, str2);
        int secondInFirst = countWordsContained(words2, str1);
        return (double) Math.min(firstInSecond, secondInFirst) / Math.min(m, n);
    }

    /**
     * Overlap similarity that picks the strategy from the kind of text.
     *
     * <ul>
     *   <li>{@code null} argument: {@code 0};</li>
     *   <li>identical strings: {@code 1.0} (also covers texts without any letters,
     *       e.g. purely numeric ones);</li>
     *   <li>both contain Chinese characters: {@link #ldforcatlcn} divided by the
     *       shorter length;</li>
     *   <li>both are pure ASCII: {@link #ldforcatlen};</li>
     *   <li>anything else: {@code 0}.</li>
     * </ul>
     *
     * @param str1 first string, may be {@code null}
     * @param str2 second string, may be {@code null}
     * @return similarity in {@code [0, 1]}
     */
    public static double simforcatl(String str1, String str2) {
        if (str1 == null || str2 == null) {
            return 0;
        }
        if (str1.equals(str2)) {
            return 1.0;
        }
        if (isChinese(str1) && isChinese(str2)) {
            // Both strings are non-empty here because each matched at least one character.
            int overlap = ldforcatlcn(str1, str2);
            return (double) overlap / Math.min(str1.length(), str2.length());
        }
        if (isEnglish(str1) && isEnglish(str2)) {
            return ldforcatlen(str1, str2);
        }
        return 0;
    }

    /**
     * Tells whether a string consists solely of ASCII characters.
     *
     * <p>The check looks at the characters directly instead of comparing an encoded
     * byte length, so the answer does not depend on the platform default charset.
     *
     * @param charaString the text to test, may be {@code null}
     * @return {@code true} if every character is below U+0080 (an empty string
     *         qualifies); {@code false} for {@code null}
     */
    public static boolean isEnglish(String charaString) {
        if (charaString == null) {
            return false;
        }
        for (int i = 0; i < charaString.length(); i++) {
            if (charaString.charAt(i) > 0x7F) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a string contains at least one character in U+4E00..U+9FA5.
     *
     * @param str the text to test, may be {@code null}
     * @return {@code true} if such a character is present; {@code false} otherwise
     *         or for {@code null}
     */
    public static boolean isChinese(String str) {
        return str != null && CHINESE_PATTERN.matcher(str).find();
    }

    /** Counts the positions of {@code source} whose character occurs somewhere in {@code target}. */
    private static int countCharsContained(String source, String target) {
        int hits = 0;
        for (int i = 0; i < source.length(); i++) {
            if (target.indexOf(source.charAt(i)) >= 0) {
                hits++;
            }
        }
        return hits;
    }

    /** Splits {@code text} into its non-empty runs of ASCII letters. */
    private static List<String> letterWords(String text) {
        List<String> words = new ArrayList<>();
        for (String token : NON_LETTER_RUN.split(text)) {
            if (!token.isEmpty()) {
                words.add(token);
            }
        }
        return words;
    }

    /** Counts the entries of {@code words} that occur as a substring of {@code text}. */
    private static int countWordsContained(List<String> words, String text) {
        int hits = 0;
        for (String word : words) {
            if (text.contains(word)) {
                hits++;
            }
        }
        return hits;
    }

    /**
     * Small manual demo that prints overlap similarities for a few sample titles.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str1 = "【观察】 三年行动 国企改革再升级";
        String str2 = "三年行动 国企改革再升级";
        String str3 = "【国企改革】三年行动, 国企改革再升级";
        System.out.println("sim12="+simforcatl(str1, str2));
        System.out.println("sim13="+simforcatl(str1, str3));
        System.out.println("sim23="+simforcatl(str2, str3));
        String str4="混合所有制改革";
        String str5="混改";
        System.out.println("sim45="+simforcatl(str4, str5));
        System.out.println("sim55="+simforcatl(str5, str5));
        String str6="【国企混改】国企混合所有制改革实务(上)";
        String str7="【国企混改】国企混合所有制改革实务(下)";
        // The label says 6/7, so compare str6 with str7 (str5 was compared with itself before).
        System.out.println("sim67="+simforcatl(str6, str7));
    }
}
\ No newline at end of file
...@@ -46,6 +46,7 @@ public class DisplayInfo { ...@@ -46,6 +46,7 @@ public class DisplayInfo {
//标题 //标题
private String title; private String title;
private String titleRaw; private String titleRaw;
private String subtitle;
//采集来源(如通用、定制、微信公众号等) //采集来源(如通用、定制、微信公众号等)
private String source; private String source;
//附加字段 //附加字段
......
...@@ -46,6 +46,7 @@ public class SpecialInformation { ...@@ -46,6 +46,7 @@ public class SpecialInformation {
//标题 //标题
private String title; private String title;
private String titleRaw; private String titleRaw;
private String subtitle;
//采集来源(如通用、定制、微信公众号等) //采集来源(如通用、定制、微信公众号等)
private String source; private String source;
//附加字段 //附加字段
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论