/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.test.seg;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.FolderWalker;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.phrase.MutualInformationEntropyPhraseExtractor;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import junit.framework.TestCase;

/**
 * Manual tests that run phrase extraction and segmentation over a local corpus.
 *
 * <p>Every test reads from {@link #FOLDER}, a hard-coded absolute Windows path, so
 * these tests can only succeed on a machine where that corpus directory exists.
 */
public class TestPhrase
extends TestCase {
    /** Root directory of the corpus; ends with a path separator so file names can be appended directly. */
    static final String FOLDER = "D:\\Doc\\\u8bed\u6599\u5e93\\\u4e0a\u6d77\u9759\u5b89\\";

    /** Destination of the phrase table written by {@link #testExtract()}, relative to the working directory. */
    private static final String OUTPUT_PATH = "data/phrase.txt";

    /**
     * Second argument handed to the extractor in every call here.
     * NOTE(review): presumably the number of phrases to return per document; confirm against the extractor API.
     */
    private static final int PHRASE_ARG = 3;

    /**
     * Extracts phrases from every file found under {@link #FOLDER}, echoes progress to
     * standard output, and writes one {@code phrase<TAB>absolute-file-path} line per
     * distinct phrase to {@link #OUTPUT_PATH}.
     *
     * <p>Phrases are kept in a sorted map keyed by phrase, so when the same phrase is
     * produced by several files the last file processed is the one recorded.
     *
     * @throws Exception if the corpus cannot be read or the output file cannot be written
     */
    public void testExtract() throws Exception {
        List<File> fileList = FolderWalker.open(FOLDER);
        Map<String, String> phraseMap = new TreeMap<String, String>();
        int i = 0;
        for (File file : fileList) {
            System.out.print(++i + " / " + fileList.size() + " " + file.getName() + " ");
            String path = file.getAbsolutePath();
            List<String> phraseList = MutualInformationEntropyPhraseExtractor.extract(IOUtil.readTxt(path), PHRASE_ARG);
            System.out.print(phraseList);
            for (String phrase : phraseList) {
                phraseMap.put(phrase, path);
            }
            System.out.println();
        }

        // The phrases are Chinese text, so the encoding is pinned to UTF-8 instead of
        // relying on whatever the platform default happens to be.
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(OUTPUT_PATH), "UTF-8"));
        try {
            for (Map.Entry<String, String> entry : phraseMap.entrySet()) {
                bw.write(entry.getKey() + "\t" + entry.getValue());
                bw.newLine();
            }
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            bw.close();
        }
    }

    /**
     * Enables HanLP debug output and prints the phrases extracted from a single corpus file.
     *
     * @throws Exception if the file cannot be read or extraction fails
     */
    public void testSingle() throws Exception {
        HanLP.Config.enableDebug();
        String path = FOLDER + "\u9759\u5b89\u533a\u5168\u5e02\u9996\u63a8\u201c\u60c5\u8bd7\u8868\u767d\u201d\u7ed3\u5a5a\u9881\u8bc1.txt";
        System.out.println(MutualInformationEntropyPhraseExtractor.extract(IOUtil.readTxt(path), PHRASE_ARG));
    }

    /**
     * Prints the result of segmenting a single corpus file with {@link StandardTokenizer}.
     *
     * @throws Exception if the file cannot be read or segmentation fails
     */
    public void testSeg() throws Exception {
        String path = FOLDER + "\u5357\u897f\u793e\u533a\u6691\u671f\u5b66\u751f\u6d3b\u52a8\u7b80\u8baf  2010\u5e74\u7b2c1\u671f.txt";
        System.out.println(StandardTokenizer.segment(IOUtil.readTxt(path)));
    }
}

