package com.zzsn.generation.test;


import java.util.*;

/**
 * Picks keywords out of an already-segmented word list by combining a term
 * count with a per-word IDF weight.
 *
 * <p>Term counts are accumulated in a {@code TokenTf}, IDF weights are looked
 * up through {@code IDFHash}, and the final ordering of the result is
 * whatever {@code SortMap.sortMap} produces.
 */
public class KeyExtractor {

    /**
     * Size of the counting window: each anchor word is counted together with
     * the words that follow it, up to this many words in total.
     */
    private static final int MAX_WORD_NUM = 4;

    /** Upper bound on the number of keywords returned, regardless of the requested count. */
    private static final int MAX_KEYWORDS = 200;

    /** Bonus added to the IDF weight of two-character words, which are favoured as keywords. */
    private static final double TWO_CHAR_BONUS = 0.5;

    /**
     * Scores every counted word and returns the best-scoring ones.
     *
     * <p>The score of a word is {@code (idf + bonus) * ttf.countTf(word)},
     * where the bonus is {@value #TWO_CHAR_BONUS} for two-character words and
     * zero otherwise. Words of length 0 or 1 are never returned.
     *
     * @param words  segmented words of the document; {@code null} is treated
     *               as an empty list and {@code null} entries are skipped
     * @param lang   {@code "cn"} selects {@code IDFHash.ChiWordIDF}; any other
     *               value, including {@code null}, selects
     *               {@code IDFHash.EngWordIDF}
     * @param maxNum maximum number of keywords wanted; values below zero are
     *               treated as zero and values above {@value #MAX_KEYWORDS}
     *               are capped at {@value #MAX_KEYWORDS}
     * @return keyword (trimmed) mapped to its score, as ordered by
     *         {@code SortMap.sortMap}
     */
    public LinkedHashMap<String, Double> keyExtract(List<String> words, String lang, int maxNum) {
        TokenTf ttf = new TokenTf();
        if (words != null) {
            countWindowedTerms(words, ttf);
        }

        // Clamping first avoids both the int overflow of "maxNum * 2" and the
        // out-of-range removals a negative limit used to trigger. Bounding the
        // list directly at the final size yields the same keywords as keeping
        // twice as many candidates and truncating afterwards, because the
        // list is kept sorted at all times.
        int limit = Math.min(Math.max(maxNum, 0), MAX_KEYWORDS);
        boolean chinese = "cn".equals(lang);

        // Parallel lists, kept in descending score order and never longer than limit.
        List<String> keyWords = new ArrayList<String>();
        List<Double> keyWordValues = new ArrayList<Double>();

        String word;
        while ((word = ttf.next()) != null) {
            if (word.length() <= 1) {
                continue;
            }

            double tfidf;
            if (chinese) {
                tfidf = IDFHash.ChiWordIDF(word);
            } else {
                tfidf = IDFHash.EngWordIDF(word);
            }
            // Favour two-character words.
            if (word.length() == 2) {
                tfidf += TWO_CHAR_BONUS;
            }
            tfidf = tfidf * ttf.countTf(word);

            insertRanked(keyWords, keyWordValues, word, tfidf, limit);
        }

        LinkedHashMap<String, Double> ranked = new LinkedHashMap<String, Double>();
        for (int i = 0; i < keyWords.size(); i++) {
            ranked.put(keyWords.get(i).trim(), keyWordValues.get(i));
        }
        LinkedHashMap<String, Double> sorted = SortMap.sortMap(ranked);
        return sorted;
    }

    /**
     * Convenience wrapper around {@link #keyExtract(List, String, int)} that
     * returns only the keywords, without their scores.
     *
     * @param words  segmented words of the document
     * @param lang   language selector, see {@link #keyExtract(List, String, int)}
     * @param maxNum maximum number of keywords wanted
     * @return the key set of the map returned by {@code keyExtract}
     */
    public Set<String> KeyExtractForString(List<String> words, String lang, int maxNum) {
        Map<String, Double> map = this.keyExtract(words, lang, maxNum);
        return map.keySet();
    }

    /**
     * Feeds the word list into {@code ttf}. Every word longer than one
     * character acts as an anchor: it is registered once, and so is each of
     * the following {@code MAX_WORD_NUM - 1} words unless that follower is
     * exactly one character long.
     */
    private static void countWindowedTerms(List<String> words, TokenTf ttf) {
        int total = words.size();
        for (int i = 0; i < total; i++) {
            String anchor = words.get(i);
            // Empty and single-character words are generally not keywords.
            if (anchor == null || anchor.length() <= 1) {
                continue;
            }
            ttf.formhm(anchor);

            int windowEnd = Math.min(total, i + MAX_WORD_NUM);
            for (int k = i + 1; k < windowEnd; k++) {
                String follower = words.get(k);
                // Only length-1 followers are skipped; an empty follower is
                // still registered, exactly as the original loop did.
                if (follower != null && follower.length() != 1) {
                    ttf.formhm(follower);
                }
            }
        }
    }

    /**
     * Inserts {@code word} into the descending-sorted parallel lists, ahead of
     * the first entry with a strictly smaller score, and drops the last entry
     * if the lists would exceed {@code limit}. Entries with equal scores keep
     * their arrival order.
     */
    private static void insertRanked(List<String> keyWords, List<Double> keyWordValues,
                                     String word, double score, int limit) {
        int pos = 0;
        // "!(a > b)" rather than "a <= b" so a NaN score sinks to the end.
        while (pos < keyWordValues.size() && !(score > keyWordValues.get(pos))) {
            pos++;
        }
        if (pos >= limit) {
            return; // would land beyond the last slot that is kept
        }
        keyWords.add(pos, word);
        keyWordValues.add(pos, score);
        if (keyWords.size() > limit) {
            int last = keyWords.size() - 1;
            keyWords.remove(last);
            keyWordValues.remove(last);
        }
    }

    /**
     * Manual smoke test: performs a single Chinese IDF lookup and discards
     * the result.
     */
    public static void main(String[] args) throws Exception {
        IDFHash.ChiWordIDF("你好");
    }
}
