/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.share.mccallum.ner;

import cc.mallet.fst.CRF;
import cc.mallet.fst.CRFTrainerByLabelLikelihood;
import cc.mallet.fst.MultiSegmentationEvaluator;
import cc.mallet.fst.Transducer;
import cc.mallet.fst.ViterbiWriter;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintTokenSequenceFeatures;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.iterator.LineGroupIterator;
import cc.mallet.pipe.tsf.FeaturesInWindow;
import cc.mallet.pipe.tsf.FeaturesOfFirstMention;
import cc.mallet.pipe.tsf.OffsetConjunctions;
import cc.mallet.pipe.tsf.RegexMatches;
import cc.mallet.pipe.tsf.TokenText;
import cc.mallet.pipe.tsf.TokenTextCharNGrams;
import cc.mallet.pipe.tsf.TrieLexiconMembership;
import cc.mallet.share.mccallum.ner.ConllNer2003Sentence2TokenSequence;
import cc.mallet.share.mccallum.ner.TokenSequenceDocHeader;
import cc.mallet.types.Alphabet;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.Random;
import java.util.regex.Pattern;

public class TUI {
    static CommandOption.Double gaussianVarianceOption = new CommandOption.Double(TUI.class, "gaussian-variance", "DECIMAL", true, 10.0, "The gaussian prior variance used for training.", null);
    static CommandOption.Double hyperbolicSlopeOption = new CommandOption.Double(TUI.class, "hyperbolic-slope", "DECIMAL", true, 0.2, "The hyperbolic prior slope used for training.", null);
    static CommandOption.Double hyperbolicSharpnessOption = new CommandOption.Double(TUI.class, "hyperbolic-sharpness", "DECIMAL", true, 10.0, "The hyperbolic prior sharpness used for training.", null);
    static CommandOption.File crfInputFileOption = new CommandOption.File(TUI.class, "crf-input-file", "FILENAME", true, null, "The name of the file to write the CRF after training.", null);
    static CommandOption.Integer randomSeedOption = new CommandOption.Integer(TUI.class, "random-seed", "INTEGER", true, 0, "The random seed for randomly selecting a proportion of the instance list for training", null);
    static CommandOption.Integer labelGramOption = new CommandOption.Integer(TUI.class, "label-gram", "INTEGER", true, 1, "Markov order of labels: 1, 2, 3", null);
    static CommandOption.Integer wordWindowFeatureOption = new CommandOption.Integer(TUI.class, "word-window-size", "INTEGER", true, 0, "Size of window of words as features: 0=none, 10, 20...", null);
    static CommandOption.Boolean useTestbOption = new CommandOption.Boolean(TUI.class, "use-testb", "true|false", true, false, "Use testb, final test set", null);
    static CommandOption.Boolean useHyperbolicPriorOption = new CommandOption.Boolean(TUI.class, "use-hyperbolic-prior", "true|false", true, false, "Use hyperbolic prior", null);
    static CommandOption.Boolean useFeatureInductionOption = new CommandOption.Boolean(TUI.class, "use-feature-induction", "true|false", true, false, "Not use or use feature induction", null);
    static CommandOption.Boolean clusterFeatureInductionOption = new CommandOption.Boolean(TUI.class, "cluster-feature-induction", "true|false", true, false, "Cluster in feature induction", null);
    static CommandOption.Boolean useFirstMentionFeatureOption = new CommandOption.Boolean(TUI.class, "use-firstmention-feature", "true|false", true, false, "Don't use first-mention feature", null);
    static CommandOption.Boolean useDocHeaderFeatureOption = new CommandOption.Boolean(TUI.class, "use-docheader-feature", "true|false", true, false, "", null);
    static CommandOption.Boolean includeConllLexiconsOption = new CommandOption.Boolean(TUI.class, "include-conll-lexicons", "true|false", true, false, "", null);
    static CommandOption.Boolean charNGramsOption = new CommandOption.Boolean(TUI.class, "char-ngrams", "true|false", true, false, "", null);
    static CommandOption.String offsetsOption = new CommandOption.String(TUI.class, "offsets", "e.g. [[0,0],[1]]", true, "[[-2],[-1],[1],[2]]", "Offset conjunctions", null);
    static CommandOption.String capOffsetsOption = new CommandOption.String(TUI.class, "cap-offsets", "e.g. [[0,0],[0,1]]", true, "", "Offset conjunctions applied to features that are [A-Z]*", null);
    static CommandOption.String viterbiFilePrefixOption = new CommandOption.String(TUI.class, "viterbi-file", "FILE", true, "TUI", "Filename in which to store most recent Viterbi output", null);
    static final CommandOption.List commandOptions = new CommandOption.List("Training, testing and running a Chinese word segmenter.", new CommandOption[]{gaussianVarianceOption, hyperbolicSlopeOption, hyperbolicSharpnessOption, randomSeedOption, labelGramOption, wordWindowFeatureOption, useHyperbolicPriorOption, useFeatureInductionOption, clusterFeatureInductionOption, useFirstMentionFeatureOption, useDocHeaderFeatureOption, includeConllLexiconsOption, offsetsOption, capOffsetsOption, viterbiFilePrefixOption, useTestbOption});
    int numEvaluations = 0;
    static int iterationsBetweenEvals = 16;
    static boolean doingFeatureInduction = true;
    static boolean doingClusteredFeatureInduction = false;
    private static String CAPS = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039]";
    private static String LOW = "[a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b8]";
    private static String CAPSNUM = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u20390-9]";
    private static String ALPHA = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b8]";
    private static String ALPHANUM = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b80-9]";
    private static String PUNT = "[,\\.;:?!()]";
    private static String QUOTE = "[\"`']";

    public static void main(String[] args) throws FileNotFoundException, Exception {
        commandOptions.process(args);
        String homedir = System.getProperty("HOME");
        String lexdir = homedir + "/research/data/resources/";
        String offsetsString = TUI.offsetsOption.value.replace('[', '{').replace(']', '}');
        int[][] offsets = (int[][])CommandOption.getInterpreter().eval("new int[][] " + offsetsString);
        String capOffsetsString = TUI.capOffsetsOption.value.replace('[', '{').replace(']', '}');
        int[][] capOffsets = null;
        if (capOffsetsString.length() > 0) {
            capOffsets = (int[][])CommandOption.getInterpreter().eval("new int[][] " + capOffsetsString);
        }
        SerialPipes conllLexiconsPipe = null;
        if (TUI.includeConllLexiconsOption.value) {
            conllLexiconsPipe = new SerialPipes(new Pipe[]{new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOPER")), new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOLOC")), new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOORG")), new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOMISC"))});
        }
        SerialPipes p = new SerialPipes(new Pipe[]{new ConllNer2003Sentence2TokenSequence(), new RegexMatches("INITCAP", Pattern.compile(CAPS + ".*")), new RegexMatches("CAPITALIZED", Pattern.compile(CAPS + LOW + "*")), new RegexMatches("ALLCAPS", Pattern.compile(CAPS + "+")), new RegexMatches("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")), new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")), new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")), new RegexMatches("NUMERICAL", Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+")), new RegexMatches("MULTIDOTS", Pattern.compile("\\.\\.+")), new RegexMatches("ENDSINDOT", Pattern.compile("[^\\.]+.*\\.")), new RegexMatches("CONTAINSDASH", Pattern.compile(ALPHANUM + "+-" + ALPHANUM + "*")), new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")), new RegexMatches("LONELYINITIAL", Pattern.compile(CAPS + "\\.")), new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)), new RegexMatches("CAPLETTER", Pattern.compile("[A-Z]")), new RegexMatches("PUNC", Pattern.compile(PUNT)), new RegexMatches("QUOTE", Pattern.compile(QUOTE)), TUI.includeConllLexiconsOption.value ? conllLexiconsPipe : new Noop(), new TokenText("W="), new OffsetConjunctions(offsets), capOffsets != null ? new OffsetConjunctions(capOffsets) : new Noop(), !TUI.useFirstMentionFeatureOption.value ? new Noop() : new FeaturesOfFirstMention("FIRSTMENTION=", Pattern.compile(CAPS + ".*"), Pattern.compile("W=[^@&]+"), false), !TUI.useDocHeaderFeatureOption.value ? new Noop() : new TokenSequenceDocHeader(), TUI.wordWindowFeatureOption.value > 0 ? new FeaturesInWindow("WINDOW=", -TUI.wordWindowFeatureOption.value, TUI.wordWindowFeatureOption.value, Pattern.compile("WORD=.*"), true) : new Noop(), TUI.charNGramsOption.value ? new TokenTextCharNGrams("CHARNGRAM=", new int[]{2, 3, 4}) : new Noop(), new PrintTokenSequenceFeatures(), new TokenSequence2FeatureVectorSequence(true, true)});
        args = TUI.useTestbOption.value ? new String[]{homedir + "/research/data/ie/ner2003/eng.train", homedir + "/research/data/ie/ner2003/eng.testb"} : new String[]{homedir + "/research/data/ie/ner2003/eng.train", homedir + "/research/data/ie/ner2003/eng.testa"};
        InstanceList trainingData = new InstanceList(p);
        trainingData.addThruPipe(new LineGroupIterator(new FileReader(new File(args[0])), Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
        System.out.println("Read " + trainingData.size() + " training instances");
        InstanceList testingData = null;
        if (args.length > 1) {
            testingData = new InstanceList(p);
            testingData.addThruPipe(new LineGroupIterator(new FileReader(new File(args[1])), Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
        }
        if (testingData == null) {
            Random r = new Random(1L);
            InstanceList[] trainingLists = trainingData.split(r, new double[]{0.2, 0.1, 0.7});
            trainingData = trainingLists[0];
            if (testingData != null) {
                InstanceList[] testingLists = testingData.split(r, new double[]{0.5, 0.5});
                testingData = testingLists[0];
                testingLists = null;
            } else {
                testingData = trainingLists[1];
            }
            trainingLists = null;
            assert (testingData != null);
        }
        Alphabet targets = p.getTargetAlphabet();
        System.out.print("State labels:");
        for (int i = 0; i < targets.size(); ++i) {
            System.out.print(" " + targets.lookupObject(i));
        }
        System.out.println("");
        System.out.println("Number of features = " + p.getDataAlphabet().size());
        CRF crf = new CRF(p, null);
        if (TUI.labelGramOption.value == 1) {
            crf.addStatesForLabelsConnectedAsIn(trainingData);
        } else if (TUI.labelGramOption.value == 2) {
            crf.addStatesForBiLabelsConnectedAsIn(trainingData);
        } else {
            throw new IllegalStateException("label-gram must be 1, 2, or 3, not " + TUI.labelGramOption.value);
        }
        CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood(crf);
        if (TUI.useHyperbolicPriorOption.value) {
            crft.setUseHyperbolicPrior(true);
            crft.setHyperbolicPriorSlope(TUI.hyperbolicSlopeOption.value);
            crft.setHyperbolicPriorSharpness(TUI.hyperbolicSharpnessOption.value);
        } else {
            crft.setGaussianPriorVariance(TUI.gaussianVarianceOption.value);
        }
        for (int i = 0; i < crf.numStates(); ++i) {
            Transducer.State s = crf.getState(i);
            if (s.getName().charAt(0) != 'I') continue;
            s.setInitialWeight(Double.POSITIVE_INFINITY);
        }
        System.out.println("Training on " + trainingData.size() + " training instances, " + testingData.size() + " testing instances...");
        MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator(new InstanceList[]{trainingData, testingData}, new String[]{"Training", "Testing"}, (Object[])new String[]{"B-PER", "B-LOC", "B-ORG", "B-MISC"}, (Object[])new String[]{"I-PER", "I-LOC", "I-ORG", "I-MISC"});
        ViterbiWriter vw = new ViterbiWriter(TUI.viterbiFilePrefixOption.value, new InstanceList[]{trainingData, testingData}, new String[]{"Training", "Testing"});
        if (TUI.useFeatureInductionOption.value) {
            if (TUI.clusterFeatureInductionOption.value) {
                crft.trainWithFeatureInduction(trainingData, null, testingData, eval, 99999, 10, 99, 200, 0.5, true, new double[]{0.1, 0.2, 0.5, 0.7});
            } else {
                crft.trainWithFeatureInduction(trainingData, null, testingData, eval, 99999, 10, 99, 1000, 0.5, false, new double[]{0.1, 0.2, 0.5, 0.7});
            }
        } else {
            double[] trainingProportions = new double[]{0.1, 0.2, 0.5, 0.7};
            for (int i = 0; i < trainingProportions.length; ++i) {
                crft.train(trainingData, 3, new double[]{trainingProportions[i]});
                eval.evaluate(crft);
                vw.evaluate(crft);
            }
            while (crft.train(trainingData, 3)) {
                eval.evaluate(crft);
                vw.evaluate(crft);
            }
            eval.evaluate(crft);
            vw.evaluate(crft);
        }
    }
}

