提交 7b78e206 作者: 925993793@qq.com

推荐关键词改用python模型

上级 0fa19054
package com.zzsn.event.controller; package com.zzsn.event.controller;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.ObjectUtil; import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSONObject; import com.alibaba.fastjson2.JSONObject;
...@@ -22,6 +23,9 @@ import com.zzsn.event.xxljob.service.IXxlJobInfoService; ...@@ -22,6 +23,9 @@ import com.zzsn.event.xxljob.service.IXxlJobInfoService;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.springframework.beans.BeanUtils; import org.springframework.beans.BeanUtils;
...@@ -85,6 +89,8 @@ public class SubjectManageController { ...@@ -85,6 +89,8 @@ public class SubjectManageController {
private ObsUtil obsUtil; private ObsUtil obsUtil;
@Resource @Resource
private KafkaTemplate<String, String> kafkaTemplate; private KafkaTemplate<String, String> kafkaTemplate;
@Autowired
private PythonUtil pythonUtil;
@Value("${kafka.topic.subject.run:}") @Value("${kafka.topic.subject.run:}")
...@@ -171,7 +177,12 @@ public class SubjectManageController { ...@@ -171,7 +177,12 @@ public class SubjectManageController {
if (!flag) { if (!flag) {
return Result.FAIL("包含不支持的文件类型"); return Result.FAIL("包含不支持的文件类型");
} }
List<StatisticsKeyWordVo> statisticsKeyWordVos = articleWords(files); List<StatisticsKeyWordVo> statisticsKeyWordVos = new ArrayList<>();
try {
statisticsKeyWordVos = articleWords(files);
} catch (Exception e) {
e.printStackTrace();
}
jsonObject.put("article", statisticsKeyWordVos); jsonObject.put("article", statisticsKeyWordVos);
} }
if (StringUtils.isNotBlank(words)) { if (StringUtils.isNotBlank(words)) {
...@@ -642,14 +653,14 @@ public class SubjectManageController { ...@@ -642,14 +653,14 @@ public class SubjectManageController {
*/ */
@PostMapping("/bindInfoSourceLabel") @PostMapping("/bindInfoSourceLabel")
public Result<?> bindInfoSourceLabel(@RequestBody SubjectSourceTagVO subjectSourceTagVO) { public Result<?> bindInfoSourceLabel(@RequestBody SubjectSourceTagVO subjectSourceTagVO) {
List<InfoSourceLabelVO> labelList = subjectSourceTagVO.getLabelList();
if (CollectionUtils.isEmpty(labelList)) {
return Result.OK();
}
String subjectId = subjectSourceTagVO.getSubjectId(); String subjectId = subjectSourceTagVO.getSubjectId();
if (StringUtils.isEmpty(subjectId)) { if (StringUtils.isEmpty(subjectId)) {
return Result.FAIL("专题id不能为空"); return Result.FAIL("专题id不能为空");
} }
List<InfoSourceLabelVO> labelList = subjectSourceTagVO.getLabelList();
if (CollectionUtils.isEmpty(labelList)) {
return Result.FAIL("信息源标签数据不能为空");
}
List<SubjectInfoSourceMap> dataList = new ArrayList<>(); List<SubjectInfoSourceMap> dataList = new ArrayList<>();
for (InfoSourceLabelVO infoSourceLabelVO : labelList) { for (InfoSourceLabelVO infoSourceLabelVO : labelList) {
List<InfoSourceLabelItemVO> infoSourceLabelItemList = infoSourceLabelVO.getInfoSourceLabelItemList(); List<InfoSourceLabelItemVO> infoSourceLabelItemList = infoSourceLabelVO.getInfoSourceLabelItemList();
...@@ -1083,6 +1094,8 @@ public class SubjectManageController { ...@@ -1083,6 +1094,8 @@ public class SubjectManageController {
searchWord.setSearchInfo(words); searchWord.setSearchInfo(words);
searchWordList.add(searchWord); searchWordList.add(searchWord);
searchCondition.setSearchWordList(searchWordList); searchCondition.setSearchWordList(searchWordList);
searchCondition.setColumn("score");
searchCondition.setOrder("desc");
try { try {
IPage<SpecialInformation> page = esService.pageListByCondition(searchCondition, null); IPage<SpecialInformation> page = esService.pageListByCondition(searchCondition, null);
if (page.getTotal() > 0) { if (page.getTotal() > 0) {
...@@ -1091,7 +1104,9 @@ public class SubjectManageController { ...@@ -1091,7 +1104,9 @@ public class SubjectManageController {
for (SpecialInformation information : records) { for (SpecialInformation information : records) {
text.append(information.getTitle()).append(information.getContent()); text.append(information.getTitle()).append(information.getContent());
} }
List<Map.Entry<String, Integer>> extractKeyWordsByText = HanlpUtil.extractKeyWordsByText(text.toString(), 10); List<String> wordsList = pythonUtil.extractKeyword(text.toString(), 10);
wordList = formatWordInfo(text.toString(), wordsList);
/*List<Map.Entry<String, Integer>> extractKeyWordsByText = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
if (CollectionUtils.isNotEmpty(extractKeyWordsByText)) { if (CollectionUtils.isNotEmpty(extractKeyWordsByText)) {
for (Map.Entry<String, Integer> entry : extractKeyWordsByText) { for (Map.Entry<String, Integer> entry : extractKeyWordsByText) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo(); StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
...@@ -1099,12 +1114,12 @@ public class SubjectManageController { ...@@ -1099,12 +1114,12 @@ public class SubjectManageController {
statisticsKeyWordVo.setValue(entry.getValue()); statisticsKeyWordVo.setValue(entry.getValue());
wordList.add(statisticsKeyWordVo); wordList.add(statisticsKeyWordVo);
} }
} }*/
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
return wordList; return wordList.stream().sorted(Comparator.comparing(StatisticsKeyWordVo::getValue).reversed()).collect(Collectors.toList());
} }
/** /**
...@@ -1120,13 +1135,17 @@ public class SubjectManageController { ...@@ -1120,13 +1135,17 @@ public class SubjectManageController {
String originalFilename = file.getOriginalFilename(); String originalFilename = file.getOriginalFilename();
if (originalFilename.endsWith(".txt")) { if (originalFilename.endsWith(".txt")) {
parseTxt(text, file); parseTxt(text, file);
} else { } else if (originalFilename.endsWith(".docx")){
parseWord(text, file); parseWordDocx(text, file);
} else if (originalFilename.endsWith(".doc")) {
parseWordDoc(text, file);
} }
} }
List<StatisticsKeyWordVo> articleWordList = new ArrayList<>(); List<StatisticsKeyWordVo> articleWordList = new ArrayList<>();
if (StringUtils.isNotEmpty(text)) { if (StringUtils.isNotEmpty(text)) {
List<Map.Entry<String, Integer>> keywordsList = HanlpUtil.extractKeyWordsByText(text.toString(), 10); List<String> wordsList = pythonUtil.extractKeyword(text.toString(), 10);
articleWordList = formatWordInfo(text.toString(), wordsList);
/*List<Map.Entry<String, Integer>> keywordsList = HanlpUtil.extractKeyWordsByText(text.toString(), 10);
if (CollectionUtils.isNotEmpty(keywordsList)) { if (CollectionUtils.isNotEmpty(keywordsList)) {
for (Map.Entry<String, Integer> entry : keywordsList) { for (Map.Entry<String, Integer> entry : keywordsList) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo(); StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
...@@ -1134,26 +1153,59 @@ public class SubjectManageController { ...@@ -1134,26 +1153,59 @@ public class SubjectManageController {
statisticsKeyWordVo.setValue(entry.getValue()); statisticsKeyWordVo.setValue(entry.getValue());
articleWordList.add(statisticsKeyWordVo); articleWordList.add(statisticsKeyWordVo);
} }
} }*/
}
return articleWordList.stream().sorted(Comparator.comparing(StatisticsKeyWordVo::getValue).reversed()).collect(Collectors.toList());
}
private List<StatisticsKeyWordVo> formatWordInfo(String text,List<String> wordsList){
List<StatisticsKeyWordVo> articleWordList = new ArrayList<>();
Map<String, Integer> hitWordsAndTimes = HanlpUtil.getHitWordsAndTimes(wordsList, text);
for (Map.Entry<String, Integer> entry : hitWordsAndTimes.entrySet()) {
StatisticsKeyWordVo statisticsKeyWordVo = new StatisticsKeyWordVo();
statisticsKeyWordVo.setName(entry.getKey());
statisticsKeyWordVo.setValue(entry.getValue());
articleWordList.add(statisticsKeyWordVo);
} }
return articleWordList; return articleWordList;
} }
/** /**
* 解析word文档,获取纯文本内容 * 解析word文档,docx后缀,获取纯文本内容
* *
* @param text 内容 * @param text 内容
* @param file word文件 * @param file word文件
* @author lkg * @author lkg
* @date 2025/1/3 * @date 2025/1/3
*/ */
private void parseWord(StringBuilder text, MultipartFile file) { private void parseWordDocx(StringBuilder text, MultipartFile file) {
try { try {
InputStream inputStream = file.getInputStream(); InputStream inputStream = file.getInputStream();
XWPFDocument doc = new XWPFDocument(inputStream); XWPFDocument docx = new XWPFDocument(inputStream);
for (XWPFParagraph paragraph : doc.getParagraphs()) { XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
text.append(paragraph.getText()); text.append(extractor.getText());
} inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 解析word文档,doc后缀,获取纯文本内容
*
* @param text 内容
* @param file word文件
* @author lkg
* @date 2025/1/3
*/
private void parseWordDoc(StringBuilder text, MultipartFile file) {
try {
InputStream inputStream = file.getInputStream();
HWPFDocument doc = new HWPFDocument(inputStream);
WordExtractor wordExtractor = new WordExtractor(doc);
text.append(wordExtractor.getText());
inputStream.close();
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
......
...@@ -27,7 +27,6 @@ import org.elasticsearch.client.RestHighLevelClient; ...@@ -27,7 +27,6 @@ import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.*; import org.elasticsearch.index.query.*;
import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.aggregations.Aggregation;
import org.elasticsearch.search.aggregations.AggregationBuilders; import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.BucketOrder;
...@@ -41,7 +40,6 @@ import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude; ...@@ -41,7 +40,6 @@ import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
import org.elasticsearch.search.aggregations.bucket.terms.Terms; import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.Cardinality; import org.elasticsearch.search.aggregations.metrics.Cardinality;
import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.collapse.CollapseBuilder; import org.elasticsearch.search.collapse.CollapseBuilder;
import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortBuilders;
......
...@@ -93,7 +93,7 @@ public class HanlpUtil { ...@@ -93,7 +93,7 @@ public class HanlpUtil {
* @创建时间 2020/9/3 18:41 * @创建时间 2020/9/3 18:41
* @Version 1.0 * @Version 1.0
*/ */
private static Map<String, Integer> getHitWordsAndTimes(Collection<String> srcList, String text){ public static Map<String, Integer> getHitWordsAndTimes(Collection<String> srcList, String text){
Map<String, Integer> map = new HashMap<>(); Map<String, Integer> map = new HashMap<>();
if(srcList==null || StringUtils.isEmpty(text)){ if(srcList==null || StringUtils.isEmpty(text)){
return map; return map;
......
package com.zzsn.event.util;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Utility for calling the Python keyword-extraction HTTP service.
 *
 * @author lkg
 * @date 2025/1/23
 */
@Service
public class PythonUtil {

    /** URL of the keyword-extraction endpoint, read from {@code python.keyWordsExtractUrl}. */
    @Value("${python.keyWordsExtractUrl}")
    private String keywordExtractUrl;

    /**
     * Extracts keywords from the given text by posting it to the Python service.
     *
     * <p>The response is expected to be a JSON object of the form
     * {@code {"resultData": {"data": [...]}}}. Any missing level (blank body,
     * no {@code resultData}, no {@code data}) yields an empty list instead of a
     * {@link NullPointerException}. A body that is not valid JSON still
     * propagates whatever exception the JSON parser raises.
     *
     * @param content text to analyse; blank content returns an empty list without a request
     * @param number  number of keywords to request (sent as {@code topK});
     *                {@code null} returns an empty list without a request
     * @return the extracted keywords, never {@code null}
     * @author lkg
     * @date 2025/1/23
     */
    public List<String> extractKeyword(String content, Integer number) {
        List<String> wordsList = new ArrayList<>();
        if (StringUtils.isBlank(content) || number == null) {
            return wordsList;
        }
        Map<String, Object> params = new HashMap<>();
        params.put("text", content);
        params.put("name", "phrase");
        params.put("topK", number.toString());
        // NOTE(review): 60000 is presumably the timeout in milliseconds - confirm against HttpUtil.doPostForm
        String result = HttpUtil.doPostForm(keywordExtractUrl, params, 60000);
        if (StringUtils.isBlank(result)) {
            return wordsList;
        }
        JSONObject jsonObject = JSONObject.parseObject(result);
        if (jsonObject == null) {
            return wordsList;
        }
        JSONObject resultData = jsonObject.getJSONObject("resultData");
        if (resultData == null) {
            return wordsList;
        }
        JSONArray data = resultData.getJSONArray("data");
        if (data == null) {
            return wordsList;
        }
        List<String> parsed = JSON.parseArray(data.toString(), String.class);
        return parsed != null ? parsed : wordsList;
    }
}
...@@ -5,13 +5,6 @@ import lombok.Data; ...@@ -5,13 +5,6 @@ import lombok.Data;
@Data @Data
public class StatisticsKeyWordVo { public class StatisticsKeyWordVo {
//专题id
private String subjectId;
// 关键词id
private String kid;
//词频 //词频
private Integer value; private Integer value;
......
...@@ -49,6 +49,8 @@ public class DisplayInfo { ...@@ -49,6 +49,8 @@ public class DisplayInfo {
private String type; private String type;
//标签信息 //标签信息
private List<Label> labels; private List<Label> labels;
//模型打分信息
private List<ModelScore> modelScores;
//视频下载链接 //视频下载链接
private String downLoadUrl; private String downLoadUrl;
//视频链接(原链接 网页版) //视频链接(原链接 网页版)
......
...@@ -52,6 +52,8 @@ public class SpecialInformation { ...@@ -52,6 +52,8 @@ public class SpecialInformation {
private String type; private String type;
//标签信息 //标签信息
private List<Label> labels; private List<Label> labels;
//模型打分信息
private List<ModelScore> modelScores;
//视频下载链接 //视频下载链接
private String downLoadUrl; private String downLoadUrl;
//视频链接(原链接 网页版) //视频链接(原链接 网页版)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论