package com.zzsn.event.util;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.commons.lang3.StringUtils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * HanLP-based keyword extraction and keyword-counting helpers.
 *
 * @author kongliufeng
 * @create 2020-08-31 18:50
 * @Version 1.0
 */
public class HanlpUtil {

    /** Matches a run of characters in the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+");

    /** Phrases shorter than this many chars are discarded as keywords. */
    private static final int MIN_KEYWORD_LENGTH = 3;

    /**
     * Extracts keywords from a single text and ranks them by how often they occur in it.
     *
     * <p>Candidates come from {@code HanLP.extractKeyword}; at most 30 candidates are requested
     * per 100 characters of text (and at least one). Candidates are then filtered by
     * {@link #isCandidateKeyword(String)}, de-duplicated, counted against {@code text} and
     * ordered by {@code SortUtil.sortMap}.
     *
     * @param text    the text to analyse
     * @param limitNo maximum number of keywords to return; values below zero are treated as zero
     * @return the first {@code limitNo} entries of the sorted (keyword, occurrence count) list
     *         (the whole list if it is shorter); {@code null} when {@code text} is empty or HanLP
     *         returns no candidate list — callers must null-check
     * @author lkg
     * @date 2024/1/9
     */
    public static List<Map.Entry<String, Integer>> extractKeyWordsByText(String text, int limitNo) {
        if (StringUtils.isEmpty(text)) {
            return null;
        }
        // At most 30 keywords per 100 characters, never fewer than 1.
        // The product is computed as long so very large texts cannot overflow int.
        int keySize = (int) Math.max(1L, (long) text.length() * 30 / 100);
        List<String> phraseList = HanLP.extractKeyword(text, keySize);
        if (phraseList == null) {
            return null;
        }
        // Filter into a fresh list instead of removing from HanLP's list in place,
        // so this does not depend on the returned list being modifiable.
        List<String> distinctList = phraseList.stream()
                .filter(HanlpUtil::isCandidateKeyword)
                .distinct()
                .collect(Collectors.toList());
        Map<String, Integer> map = getHitWordsAndTimes(distinctList, text);
        // Order by occurrence count.
        List<Map.Entry<String, Integer>> list = SortUtil.sortMap(map);
        // A negative limit would make subList throw; clamp it to zero.
        int end = Math.max(limitNo, 0);
        if (end > list.size()) {
            return list;
        }
        return list.subList(0, end);
    }

    /**
     * Decides whether a phrase produced by HanLP should be kept as a keyword.
     *
     * <p>A phrase is rejected when it is null, shorter than {@link #MIN_KEYWORD_LENGTH}, contains
     * no Chinese character, is recognised as an entity by {@code CompanyUtil.entityAll}, or
     * segments into exactly one term whose part-of-speech tag neither contains {@code "n"} nor
     * equals {@code "gi"}.
     */
    private static boolean isCandidateKeyword(String phrase) {
        if (phrase == null || phrase.length() < MIN_KEYWORD_LENGTH || !isChinese(phrase)) {
            return false;
        }
        // Per the original comment, entityAll yields person / place / organization names;
        // any hit means the phrase is a named entity and is dropped.
        List<String> entities = CompanyUtil.entityAll(phrase);
        if (entities != null && !entities.isEmpty()) {
            return false;
        }
        // The part-of-speech check only applies when the phrase segments into a single tagged term;
        // multi-term phrases are kept as they are.
        List<Term> termList = HanLP.segment(phrase);
        if (termList != null && termList.size() == 1 && termList.get(0).nature != null) {
            String nature = termList.get(0).nature.toString();
            return nature != null && (nature.contains("n") || nature.equals("gi"));
        }
        return true;
    }

    /**
     * Tells whether a string contains at least one Chinese character (U+4E00..U+9FA5).
     *
     * <p>Note that this is a "contains" check, not an "entirely Chinese" check: a mixed string
     * such as Latin letters followed by one Chinese character also yields {@code true}.
     *
     * @param str the string to test
     * @return {@code true} if a Chinese character is found; {@code false} for null, empty, or
     *         strings without any such character
     */
    public static boolean isChinese(String str) {
        if (StringUtils.isEmpty(str)) {
            return false;
        }
        return CHINESE_PATTERN.matcher(str).find();
    }

    /**
     * Returns the words of {@code srcList} that occur in {@code text}, mapped to their
     * occurrence counts as computed by {@link #countKeyWordInContent(String, String)}.
     *
     * @param srcList candidate words; may be null
     * @param text    the text to search; may be null or empty
     * @return a new map containing only words that occur at least once; empty when
     *         {@code srcList} is null or {@code text} is empty
     * @author kongliufeng
     */
    public static Map<String, Integer> getHitWordsAndTimes(Collection<String> srcList, String text) {
        Map<String, Integer> map = new HashMap<>();
        if (srcList == null || StringUtils.isEmpty(text)) {
            return map;
        }
        for (String word : srcList) {
            int hits = countKeyWordInContent(word, text);
            if (hits > 0) {
                map.put(word, hits);
            }
        }
        return map;
    }

    /**
     * Counts the non-overlapping occurrences of a keyword in a text, scanning left to right.
     *
     * <p>For example {@code "aa"} occurs once in {@code "aaa"}.
     *
     * @param keyword    the word to look for; null or whitespace-only yields 0
     * @param srcContent the text to search; null yields 0
     * @return the number of non-overlapping occurrences
     * @author kongliufeng
     */
    public static int countKeyWordInContent(String keyword, String srcContent) {
        if (keyword == null || keyword.trim().isEmpty() || srcContent == null) {
            return 0;
        }
        int count = 0;
        int from = 0;
        int idx;
        while ((idx = srcContent.indexOf(keyword, from)) >= 0) {
            count++;
            // Resume right after this match so occurrences never overlap.
            from = idx + keyword.length();
        }
        return count;
    }
}
