/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.test.corpus;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.dictionary.DictionaryMaker;
import com.hankcs.hanlp.corpus.dictionary.EasyDictionary;
import com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker;
import com.hankcs.hanlp.corpus.document.CorpusLoader;
import com.hankcs.hanlp.corpus.document.Document;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment;
import com.hankcs.hanlp.seg.common.Term;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;
import junit.framework.TestCase;

/**
 * Manual, file-driven utilities (written as JUnit 3 test methods) that turn a
 * list of company names into a bracketed corpus, split that corpus into parts,
 * feed the parts to an {@link NTDictionaryMaker} and combine dictionaries.
 *
 * <p>All methods work on fixed file paths and make no assertions; they are
 * meant to be run individually.
 */
public class TestMakeCompanyCorpus
extends TestCase {
    /** Number of corpus lines written to each part file by {@link #testSplitLargeFile()}. */
    private static final int LINES_PER_PART = 1000;

    /**
     * Reads company names line by line, segments each one and writes it as
     * {@code [term term ... term]/ntc} with the last term's nature forced to
     * {@link Nature#nis}.
     *
     * <p>A line is skipped when it ends with a full-width right parenthesis
     * (U+FF09), is shorter than four characters, contains one of the two
     * filtered keywords, or segments to an empty term list.
     *
     * @throws Exception if reading the source list or writing the corpus fails
     */
    public void testMake() throws Exception {
        DijkstraSegment segment = new DijkstraSegment();
        // NOTE(review): both streams use the platform default charset; confirm it
        // matches the encoding of company.dic and of whatever reads company.txt.
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("D:\\Doc\\\u8bed\u6599\u5e93\\company.dic")));
        try {
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("data/test/nt/company.txt")));
            try {
                // Cap on the number of input lines consumed; lower it to produce a sample.
                int limit = Integer.MAX_VALUE;
                String line;
                while ((line = reader.readLine()) != null && limit-- > 0) {
                    if (line.endsWith("\uff09") || line.length() < 4 || line.contains("\u4e2a\u4f53") || line.contains("\u4e2a\u4eba")) {
                        continue;
                    }
                    List<Term> termList = segment.seg(line);
                    if (termList.isEmpty()) {
                        continue;
                    }
                    // Re-tag the final term before the terms are printed.
                    termList.get(termList.size() - 1).nature = Nature.nis;
                    writer.write("[");
                    for (int i = 0; i < termList.size(); ++i) {
                        if (i > 0) {
                            writer.write(" ");
                        }
                        writer.write(termList.get(i).toString());
                    }
                    writer.write("]/ntc");
                    writer.newLine();
                }
            } finally {
                // close() flushes the buffer, so no per-line flush is needed.
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Walks every document under {@code data/test/nt/part/}, passes its complex
     * sentence list to an {@link NTDictionaryMaker} and saves the result.
     *
     * @throws Exception if loading the dictionary, walking the corpus or saving fails
     */
    public void testParse() throws Exception {
        EasyDictionary dictionary = EasyDictionary.create("data/dictionary/2014_dictionary.txt");
        final NTDictionaryMaker ntDictionaryMaker = new NTDictionaryMaker(dictionary);
        CorpusLoader.walk("data/test/nt/part/", new CorpusLoader.Handler(){

            @Override
            public void handle(Document document) {
                ntDictionaryMaker.compute(document.getComplexSentenceList());
            }
        });
        // NOTE(review): absolute, machine-specific output location; testCombine reads
        // "data/dictionary/organization/outerNT.txt" - confirm both refer to the same file.
        ntDictionaryMaker.saveTxtTo("D:\\JavaProjects\\HanLP\\data\\dictionary\\organization\\outerNT");
    }

    /**
     * Splits {@code data/test/nt/company.txt} into consecutive part files
     * {@code data/test/nt/part/1.txt}, {@code 2.txt}, ... of
     * {@link #LINES_PER_PART} lines each (the last part may be shorter).
     *
     * <p>{@code 1.txt} is always created, even when the input is empty.
     *
     * @throws Exception if reading the corpus or writing a part file fails
     */
    public void testSplitLargeFile() throws Exception {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("data/test/nt/company.txt")));
        BufferedWriter writer = null;
        try {
            int id = 1;
            int count = 0;
            writer = openPart(id);
            String line;
            while ((line = reader.readLine()) != null) {
                if (count == LINES_PER_PART) {
                    writer.close();
                    // Advance the id BEFORE opening, otherwise the part that was just
                    // finished would be reopened and truncated.
                    writer = openPart(++id);
                    count = 0;
                }
                writer.write(line);
                writer.newLine();
                ++count;
            }
        } finally {
            try {
                // The last part must be closed too, or its buffered tail is never written.
                if (writer != null) {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }
    }

    /** Opens (creating or truncating) the part file with the given 1-based id. */
    private static BufferedWriter openPart(int id) throws Exception {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream("data/test/nt/part/" + id + ".txt")));
    }

    /**
     * Prints the segmentation of one sample sentence with debug output and
     * organization recognition enabled.
     *
     * @throws Exception if segmentation fails
     */
    public void testCase() throws Exception {
        HanLP.Config.enableDebug();
        DijkstraSegment segment = new DijkstraSegment();
        segment.enableOrganizationRecognize(true);
        System.out.println(segment.seg("\u9ed1\u9f99\u6c5f\u5efa\u7b51\u804c\u4e1a\u6280\u672f\u5b66\u9662\u8fd1\u767e\u5b66\u751f\u53d1\u751f\u51b2\u7a81"));
    }

    /**
     * Combines {@code nt.txt} with {@code outerNT.txt} and saves the result back
     * to the {@code nt.txt} path.
     *
     * @throws Exception if reading either dictionary or saving the result fails
     */
    public void testCombine() throws Exception {
        DictionaryMaker.combine("data/dictionary/organization/nt.txt", "data/dictionary/organization/outerNT.txt").saveTxtTo("data/dictionary/organization/nt.txt");
    }
}

