Commit 7b78e206 Author: 925993793@qq.com

Switch recommended-keyword extraction to the Python model

Parent 0fa19054
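In outline, this change drops the HanlpUtil-based keyword extraction and instead asks a Python keyword-extraction service (via the new PythonUtil) for candidate keywords, then counts their occurrences locally. A minimal sketch of the new flow, using only names that appear in the diff below (text stands for the collected article/ES content, not a variable from the source):

// Sketch (not part of the diff): how the new recommendation flow fits together
List<String> keywords = pythonUtil.extractKeyword(text, 10); // Python model returns candidate keywords
Map<String, Integer> hits = HanlpUtil.getHitWordsAndTimes(keywords, text); // count occurrences of each keyword in the text
List<StatisticsKeyWordVo> recommended = hits.entrySet().stream()
.map(e -> { StatisticsKeyWordVo vo = new StatisticsKeyWordVo(); vo.setName(e.getKey()); vo.setValue(e.getValue()); return vo; })
.sorted(Comparator.comparing(StatisticsKeyWordVo::getValue).reversed()) // highest frequency first
.collect(Collectors.toList());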
package com.zzsn.event.controller;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSONObject;
......@@ -22,6 +23,9 @@ import com.zzsn.event.xxljob.service.IXxlJobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.springframework.beans.BeanUtils;
......@@ -85,6 +89,8 @@ public class SubjectManageController {
private ObsUtil obsUtil;
@Resource
private KafkaTemplate<String, String> kafkaTemplate;
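// client for the Python keyword-extraction service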
@Autowired
private PythonUtil pythonUtil;
@Value("${kafka.topic.subject.run:}")
......@@ -171,7 +177,12 @@ public class SubjectManageController {
if (!flag) {
return Result.FAIL("包含不支持的文件类型");
}
List<StatisticsKeyWordVo> statisticsKeyWordVos = articleWords(files);
List<StatisticsKeyWordVo> statisticsKeyWordVos = new ArrayList<>();
try {
statisticsKeyWordVos = articleWords(files);
} catch (Exception e) {
log.error("failed to extract keywords from the uploaded files", e);
}
jsonObject.put("article", statisticsKeyWordVos);
}
if (StringUtils.isNotBlank(words)) {
......@@ -642,14 +653,14 @@ public class SubjectManageController {
*/
@PostMapping("/bindInfoSourceLabel")
public Result<?> bindInfoSourceLabel(@RequestBody SubjectSourceTagVO subjectSourceTagVO) {
List<InfoSourceLabelVO> labelList = subjectSourceTagVO.getLabelList();
if (CollectionUtils.isEmpty(labelList)) {
return Result.OK();
}
String subjectId = subjectSourceTagVO.getSubjectId();
if (StringUtils.isEmpty(subjectId)) {
return Result.FAIL("专题id不能为空");
}
List<InfoSourceLabelVO> labelList = subjectSourceTagVO.getLabelList();
if (CollectionUtils.isEmpty(labelList)) {
return Result.FAIL("信息源标签数据不能为空");
}
List<SubjectInfoSourceMap> dataList = new ArrayList<>();
for (InfoSourceLabelVO infoSourceLabelVO : labelList) {
List<InfoSourceLabelItemVO> infoSourceLabelItemList = infoSourceLabelVO.getInfoSourceLabelItemList();
......@@ -1083,6 +1094,8 @@ public class SubjectManageController {
searchWord.setSearchInfo(words);
searchWordList.add(searchWord);
searchCondition.setSearchWordList(searchWordList);
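// sort the matched documents by the score field, descending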
searchCondition.setColumn("score");
searchCondition.setOrder("desc");
try {
IPage<SpecialInformation> page = esService.pageListByCondition(searchCondition, null);
if (page.getTotal() > 0) {
......@@ -1091,7 +1104,9 @@ public class SubjectManageController {
for (SpecialInformation information : records) {
text.append(information.getTitle()).append(information.getContent());
}
List<Map.Entry<String, Integer>> extractKeyWordsByText = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
List<String> wordsList = pythonUtil.extractKeyword(text.toString(), 10);
wordList = formatWordInfo(text.toString(), wordsList);
/*List<Map.Entry<String, Integer>> extractKeyWordsByText = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
if (CollectionUtils.isNotEmpty(extractKeyWordsByText)) {
for (Map.Entry<String, Integer> entry : extractKeyWordsByText) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
......@@ -1099,12 +1114,12 @@ public class SubjectManageController {
statisticsKeyWordVo.setValue(entry.getValue());
wordList.add(statisticsKeyWordVo);
}
}
}*/
}
} catch (IOException e) {
e.printStackTrace();
}
return wordList;
return wordList.stream().sorted(Comparator.comparing(StatisticsKeyWordVo::getValue).reversed()).collect(Collectors.toList());
}
/**
......@@ -1120,13 +1135,17 @@ public class SubjectManageController {
String originalFilename = file.getOriginalFilename();
if (originalFilename.endsWith(".txt")) {
parseTxt(text, file);
} else {
parseWord(text, file);
} else if (originalFilename.endsWith(".docx")){
parseWordDocx(text, file);
} else if (originalFilename.endsWith(".doc")) {
parseWordDoc(text, file);
}
}
List<StatisticsKeyWordVo> articleWordList = new ArrayList<>();
if (StringUtils.isNotEmpty(text)) {
List<Map.Entry<String, Integer>> keywordsList = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
List<String> wordsList = pythonUtil.extractKeyword(text.toString(), 10);
articleWordList = formatWordInfo(text.toString(), wordsList);
/*List<Map.Entry<String, Integer>> keywordsList = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
if (CollectionUtils.isNotEmpty(keywordsList)) {
for (Map.Entry<String, Integer> entry : keywordsList) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
......@@ -1134,26 +1153,59 @@ public class SubjectManageController {
statisticsKeyWordVo.setValue(entry.getValue());
articleWordList.add(statisticsKeyWordVo);
}
}*/
}
return articleWordList.stream().sorted(Comparator.comparing(StatisticsKeyWordVo::getValue).reversed()).collect(Collectors.toList());
}
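/**
* Wrap the model's keywords as StatisticsKeyWordVo entries, using HanlpUtil
* to count how many times each keyword occurs in the source text
*
* @param text      the source text
* @param wordsList keywords returned by the Python model
*/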
private List<StatisticsKeyWordVo> formatWordInfo(String text, List<String> wordsList) {
List<StatisticsKeyWordVo> articleWordList = new ArrayList<>();
Map<String, Integer> hitWordsAndTimes = HanlpUtil.getHitWordsAndTimes(wordsList, text);
for (Map.Entry<String, Integer> entry : hitWordsAndTimes.entrySet()) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
statisticsKeyWordVo.setName(entry.getKey());
statisticsKeyWordVo.setValue(entry.getValue());
articleWordList.add(statisticsKeyWordVo);
}
return articleWordList;
}
/**
* Parse a Word document and extract its plain-text content
* Parse a Word (.docx) document and extract its plain-text content
*
* @param text buffer that collects the extracted text
* @param file the uploaded Word file
* @author lkg
* @date 2025/1/3
*/
private void parseWord(StringBuilder text, MultipartFile file) {
private void parseWordDocx(StringBuilder text, MultipartFile file) {
try {
InputStream inputStream = file.getInputStream();
XWPFDocument doc = new XWPFDocument(inputStream);
for (XWPFParagraph paragraph : doc.getParagraphs()) {
text.append(paragraph.getText());
XWPFDocument docx = new XWPFDocument(inputStream);
XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
text.append(extractor.getText());
inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Parse a Word (.doc) document and extract its plain-text content
*
* @param text buffer that collects the extracted text
* @param file the uploaded Word file
* @author lkg
* @date 2025/1/3
*/
private void parseWordDoc(StringBuilder text, MultipartFile file) {
// try-with-resources ensures the stream is closed even if parsing fails
try (InputStream inputStream = file.getInputStream()) {
HWPFDocument doc = new HWPFDocument(inputStream);
WordExtractor wordExtractor = new WordExtractor(doc);
text.append(wordExtractor.getText());
} catch (IOException e) {
log.error("failed to parse .doc file: {}", file.getOriginalFilename(), e);
}
......
......@@ -27,7 +27,6 @@ import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.*;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.aggregations.Aggregation;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.BucketOrder;
......@@ -41,7 +40,6 @@ import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.Cardinality;
import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.collapse.CollapseBuilder;
import org.elasticsearch.search.sort.SortBuilders;
......
......@@ -93,7 +93,7 @@ public class HanlpUtil {
* @date 2020/9/3 18:41
* @Version 1.0
*/
private static Map<String, Integer> getHitWordsAndTimes(Collection<String> srcList, String text){
public static Map<String, Integer> getHitWordsAndTimes(Collection<String> srcList, String text){
Map<String, Integer> map = new HashMap<>();
if(srcList==null || StringUtils.isEmpty(text)){
return map;
......
package com.zzsn.event.util;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Utility class for calling the Python keyword-extraction service
*
* @author lkg
* @date 2025/1/23
*/
@Service
public class PythonUtil {
@Value("${python.keyWordsExtractUrl}")
private String keywordExtractUrl;
/**
* Extract keywords from the given text
*
* @param content text content
* @param number  number of keywords to extract
* @return the extracted keywords (empty list if the content is blank or the service returns no data)
* @author lkg
* @date 2025/1/23
*/
public List<String> extractKeyword(String content, Integer number) {
List<String> wordsList = new ArrayList<>();
if (StringUtils.isNotBlank(content)) {
Map<String, Object> params = new HashMap<>();
params.put("text", content);
params.put("name", "phrase");
params.put("topK", number.toString());
// call the Python keyword-extraction service via a form POST (60s timeout)
String result = HttpUtil.doPostForm(keywordExtractUrl, params, 60000);
if (StringUtils.isNotBlank(result)) {
JSONObject jsonObject = JSONObject.parseObject(result);
JSONObject resultData = jsonObject.getJSONObject("resultData");
if (resultData != null) {
JSONArray data = resultData.getJSONArray("data");
if (data != null) {
wordsList = JSON.parseArray(data.toString(), String.class);
}
}
}
}
return wordsList;
}
}
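For reference, a minimal caller sketch for the utility above (illustrative only, not part of the commit; the KeywordDemo class is hypothetical and assumes PythonUtil is registered as the Spring bean shown and that python.keyWordsExtractUrl is configured):

import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

// Hypothetical caller showing how a component would use PythonUtil
@Service
public class KeywordDemo {
    @Autowired
    private PythonUtil pythonUtil;

    public List<String> topKeywords(String text) {
        // delegate to the Python model; an empty list comes back when the text
        // is blank or the service returns no data
        return pythonUtil.extractKeyword(text, 10);
    }
}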
......@@ -5,13 +5,6 @@ import lombok.Data;
@Data
public class StatisticsKeyWordVo {
//subject id
private String subjectId;
// keyword id
private String kid;
//word frequency
private Integer value;
......
......@@ -49,6 +49,8 @@ public class DisplayInfo {
private String type;
//label information
private List<Label> labels;
//model score information
private List<ModelScore> modelScores;
//video download URL
private String downLoadUrl;
//video URL (original link, web version)
......
......@@ -52,6 +52,8 @@ public class SpecialInformation {
private String type;
//label information
private List<Label> labels;
//model score information
private List<ModelScore> modelScores;
//video download URL
private String downLoadUrl;
//video URL (original link, web version)
......