/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.share.weili.ner.enron;

import cc.mallet.fst.CRF;
import cc.mallet.fst.CRFTrainerByLabelLikelihood;
import cc.mallet.fst.MultiSegmentationEvaluator;
import cc.mallet.fst.ViterbiWriter;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintTokenSequenceFeatures;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.pipe.tsf.LexiconMembership;
import cc.mallet.pipe.tsf.OffsetConjunctions;
import cc.mallet.pipe.tsf.RegexMatches;
import cc.mallet.pipe.tsf.TrieLexiconMembership;
import cc.mallet.share.upenn.ner.NEPipes;
import cc.mallet.share.weili.ner.enron.EnronMessage2TokenSequence;
import cc.mallet.types.Alphabet;
import cc.mallet.types.InstanceList;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import java.util.regex.Pattern;

/**
 * Command-line driver that trains a linear-chain CRF named-entity tagger on
 * Enron e-mail messages.
 *
 * <p>The program builds a feature pipeline (message tokenization, place
 * lexicons, CoNLL / Google-sets / fixed / IDF lexicon membership features and
 * +/-1 offset conjunctions), loads every file under the hard-coded data
 * directory, splits the instances 80/20 into train/test with a fixed random
 * seed, trains the CRF by label likelihood and finally serializes the model.
 *
 * <p>Usage:
 * <pre>
 *   TUI NoFeatureInduction &lt;model-output-file&gt; [&lt;viterbi-output-prefix&gt;]
 * </pre>
 * The mode {@code FeatureInduction} is recognized but not supported and is
 * rejected with an {@link IllegalStateException}.
 */
public class TUI {
    // Character classes used by the spelling-feature regexes. The non-ASCII
    // escapes are reproduced verbatim from the original source.
    // NOTE(review): they look like accented letters that went through a wrong
    // charset conversion -- confirm against the upstream file before relying on them.
    private static final String CAPS = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039]";
    private static final String LOW = "[a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b8]";
    private static final String CAPSNUM = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u20390-9]";
    private static final String ALPHA = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b8]";
    private static final String ALPHANUM = "[A-Z\u00a1\u2026\u00d5\u201d\u2044\u00bf\u00bb\u00c3\u201c\u0178\u00ab\u2014\u0153\u2039a-z\u2021\u00cb\u00cf\u00da\u02d8\u00b7\u00c8\u00cc\u00db\u02d9\u00c1\u00d2\u00d4\u00b80-9]";
    private static final String PUNT = "[,\\.;:?!()]";
    private static final String QUOTE = "[\"`']";

    /** Value of {@code args[0]} selecting the (unsupported) feature-induction mode. */
    private static final String MODE_FEATURE_INDUCTION = "FeatureInduction";
    /** Value of {@code args[0]} selecting plain training without feature induction. */
    private static final String MODE_NO_FEATURE_INDUCTION = "NoFeatureInduction";

    /**
     * Trains and serializes the Enron NER model.
     *
     * @param args {@code args[0]} is the training mode ({@code "NoFeatureInduction"});
     *             {@code args[1]} is the file the trained CRF is written to;
     *             {@code args[2]} (optional) is the file-name prefix handed to the
     *             {@link ViterbiWriter} that is run every fifth training iteration
     * @throws IOException declared by the original entry point; propagated from
     *             the underlying file handling
     * @throws IllegalStateException if {@code args[0]} is {@code "FeatureInduction"}
     */
    public static void main(String[] args) throws IOException {
        // Fail fast: reject bad command lines before the expensive lexicon
        // loading, corpus loading and training steps below.
        validateArguments(args);
        String modelFile = args[1];
        String viterbiPrefix = args.length > 2 ? args[2] : null;

        String datadir = "/usr/can/tmp3/weili/NER/Enron/data";
        String conlllexdir = "/usr/col/tmp1/weili/Resource/conllDict/";
        String idflexdir = "/usr/col/tmp1/weili/Resource/idfDict/";
        String placelexdir = "/usr/col/tmp1/weili/Resource/places";

        SerialPipes conllLexiconsPipe = new SerialPipes(trieLexicons(conlllexdir, new String[]{
            "conll/CONLLTWOPER",
            "conll/CONLLTWOLOC",
            "conll/CONLLTWOORG",
            "conll/CONLLTWOMISC"
        }));

        SerialPipes googleLexiconsPipe = new SerialPipes(trieLexicons(conlllexdir, new String[]{
            "googlesets/ORGSOCCER",
            "googlesets/ORGGOVT",
            "googlesets/ORGNGO",
            "googlesets/ORGMILITARY",
            "googlesets/ORGCOMPANY",
            "googlesets/ORGBANK",
            "googlesets/ORGTRADE",
            "googlesets/ORGNEWS",
            "googlesets/ORGOPERATINGSYSTEM",
            "googlesets/ORGPOLITICALPARTY",
            "googlesets/ORGTRAVEL",
            "googlesets/ORGBASEBALLTEAMAUGF",
            "googlesets/ORGCARMODEL",
            "googlesets/ORGCARCOMPANY",
            "googlesets/ORGENGLISHCOUNTYAUG",
            "googlesets/ORGUNIVERSITY",
            "googlesets/MISCNATIONALITYAUGF",
            "googlesets/MISCDISEASEAUG",
            "googlesets/MISCTIME",
            "googlesets/MISCAWARDS",
            "googlesets/MISCMOVIESAUGF",
            "googlesets/MISCPOLITICALPARTY",
            "googlesets/MISCRELIGION",
            "googlesets/MISCGOVT",
            "googlesets/MISCWAR",
            "googlesets/MISCCURRENCY",
            "googlesets/LOC",
            "googlesets/PERFL",
            "googlesets/MISCF",
            "googlesets/ORGFRAWEDITEDSORTED"
        }));

        SerialPipes fixedLexiconsPipe = new SerialPipes(new Pipe[]{
            new LexiconMembership("FIRSTHIGHEST", new File(conlllexdir + "personname/ssdi.prfirsthighest"), true),
            new LexiconMembership("FIRSTHIGH", new File(conlllexdir + "personname/ssdi.prfirsthigh"), true),
            new LexiconMembership("FIRSTMED", new File(conlllexdir + "personname/ssdi.prfirstmed"), true),
            new LexiconMembership("FIRSTLOW", new File(conlllexdir + "personname/ssdi.prfirstlow"), true),
            new LexiconMembership("LASTHIGHEST", new File(conlllexdir + "personname/ssdi.prlasthighest"), true),
            new LexiconMembership("LASTHIGH", new File(conlllexdir + "personname/ssdi.prlasthigh"), true),
            new LexiconMembership("LASTMED", new File(conlllexdir + "personname/ssdi.prlastmed"), true),
            new LexiconMembership("LASTLOW", new File(conlllexdir + "personname/ssdi.prlastlow"), true),
            new LexiconMembership("HONORIFIC", new File(conlllexdir + "personname/honorifics"), true),
            new LexiconMembership("NAMESUFFIX", new File(conlllexdir + "personname/namesuffixes"), true),
            new LexiconMembership("NAMEPARTICLE", new File(conlllexdir + "personname/name-particles"), true),
            new LexiconMembership("DAY", new File(conlllexdir + "days"), true),
            new LexiconMembership("MONTH", new File(conlllexdir + "months"), true),
            new LexiconMembership("PLACESUFFIX", new File(conlllexdir + "place-suffixes"), true),
            new TrieLexiconMembership("COUNTRY", new File(conlllexdir + "countries"), true),
            new TrieLexiconMembership("COUNTRYCAPITAL", new File(conlllexdir + "country-capitals"), true),
            new TrieLexiconMembership("USSTATE", new File(conlllexdir + "US-states"), true),
            new TrieLexiconMembership("COMPANYNAME", new File(conlllexdir + "company-names"), true),
            new TrieLexiconMembership("COMPANYSUFFIX", new File(conlllexdir + "company-suffixes"), true),
            new TrieLexiconMembership("CONTINENT", new File(conlllexdir + "continents"), true),
            new LexiconMembership("STOPWORD", new File(conlllexdir + "stopwords"), true),
            new TrieLexiconMembership(new File(conlllexdir + "biz.yahoo/COMPANYNAME.ABBREV")),
            new TrieLexiconMembership(new File(conlllexdir + "utexas/UNIVERSITIES"))
        });

        SerialPipes idfLexiconsPipe = new SerialPipes(new Pipe[]{
            new TrieLexiconMembership("IDF_DES", new File(idflexdir + "designator.data"), true),
            new TrieLexiconMembership("IDF_FIR", new File(idflexdir + "firstnames.data"), true),
            new TrieLexiconMembership("IDF_LOC", new File(idflexdir + "locations.data"), true),
            new TrieLexiconMembership("IDF_NAT", new File(idflexdir + "nations.data"), true),
            new TrieLexiconMembership("IDF_ABB", new File(idflexdir + "non-final-abbrevs.data"), true),
            new TrieLexiconMembership("IDF_ORG", new File(idflexdir + "organization.data"), true),
            new TrieLexiconMembership("IDF_PER", new File(idflexdir + "person.data"), true)
        });

        // NOTE(review): this pipe is constructed but is not part of the
        // pipeline assembled below, so none of these spelling features reach
        // the model. Kept unchanged; confirm whether it should be wired in.
        SerialPipes spellingFeaturesPipe = new SerialPipes(new Pipe[]{
            new RegexMatches("INITCAP", Pattern.compile(CAPS + ".*")),
            new RegexMatches("CAPITALIZED", Pattern.compile(CAPS + LOW + "*")),
            new RegexMatches("ALLCAPS", Pattern.compile(CAPS + "+")),
            new RegexMatches("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")),
            new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")),
            new RegexMatches("NUMERICAL", Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+")),
            new RegexMatches("MULTIDOTS", Pattern.compile("\\.\\.+")),
            new RegexMatches("ENDSINDOT", Pattern.compile("[^\\.]+.*\\.")),
            new RegexMatches("CONTAINSDASH", Pattern.compile(ALPHANUM + "+-" + ALPHANUM + "*")),
            new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
            new RegexMatches("LONELYINITIAL", Pattern.compile(CAPS + "\\.")),
            new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)),
            new RegexMatches("CAPLETTER", Pattern.compile("[A-Z]")),
            new RegexMatches("PUNC", Pattern.compile(PUNT)),
            new RegexMatches("QUOTE", Pattern.compile(QUOTE))
        });

        // Full instance pipeline: raw message -> tokens -> lexicon features ->
        // neighbour conjunctions -> feature vector sequence.
        SerialPipes p = new SerialPipes(new Pipe[]{
            new EnronMessage2TokenSequence(),
            new NEPipes(new File(placelexdir)),
            conllLexiconsPipe,
            googleLexiconsPipe,
            fixedLexiconsPipe,
            idfLexiconsPipe,
            new OffsetConjunctions(new int[][]{{-1}, {1}}),
            new PrintTokenSequenceFeatures(),
            new TokenSequence2FeatureVectorSequence(true, true)
        });

        InstanceList ilist = new InstanceList(p);
        ilist.addThruPipe(new FileIterator(datadir, FileIterator.STARTING_DIRECTORIES));

        // Fixed seed keeps the 80/20 train/test split reproducible across runs.
        Random r = new Random(1L);
        InstanceList[] ilists = ilist.split(r, new double[]{0.8, 0.2});
        InstanceList trainingData = ilists[0];
        InstanceList testingData = ilists[1];

        Alphabet targets = p.getTargetAlphabet();
        System.out.print("State labels:");
        for (int i = 0; i < targets.size(); ++i) {
            System.out.print(" " + targets.lookupObject(i));
        }
        System.out.println("");
        System.out.println("Number of features = " + p.getDataAlphabet().size());

        CRF crf = new CRF(p, null);
        crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingData);
        CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood(crf);
        crft.setGaussianPriorVariance(100.0);

        // Give every state an initial weight of -infinity, then reset "O" to
        // 0.0 so that it is the only state left with a finite initial weight.
        for (int i = 0; i < crf.numStates(); ++i) {
            crf.getState(i).setInitialWeight(Double.NEGATIVE_INFINITY);
        }
        crf.getState("O").setInitialWeight(0.0);

        System.out.println("Training on " + trainingData.size() + " training instances.");
        MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator(
            new InstanceList[]{trainingData, testingData},
            new String[]{"train", "test"},
            (Object[])new String[]{"B-DATE", "B-TIME", "B-LOCATION", "B-PERSON", "B-ORGANIZATION", "B-ACRONYM", "B-PHONE", "B-MONEY", "B-PERCENT"},
            (Object[])new String[]{"I-DATE", "I-TIME", "I-LOCATION", "I-PERSON", "I-ORGANIZATION", "I-ACRONYM", "I-PHONE", "I-MONEY", "I-PERCENT"});

        // validateArguments() guarantees the mode is NoFeatureInduction here.
        crft.train(trainingData, 5, new double[]{0.1, 0.2, 0.5, 0.7});
        while (!crft.trainIncremental(trainingData)) {
            eval.evaluate(crft);
            if (viterbiPrefix != null && crft.getIteration() % 5 == 0) {
                // A writer that is only constructed is discarded without
                // producing output; it has to be run against the trainer,
                // in the same way as the segmentation evaluator above.
                new ViterbiWriter(viterbiPrefix, trainingData, "train", testingData, "test").evaluate(crft);
            }
        }
        crf.write(new File(modelFile));
    }

    /**
     * Checks the command line before any expensive work is started.
     *
     * <p>Prints a diagnostic and terminates the JVM with status 1 when the
     * mode is missing or unknown, or when the model output file is missing.
     *
     * @param args the raw command-line arguments passed to {@link #main}
     * @throws IllegalStateException if the unsupported feature-induction mode
     *             was requested
     */
    private static void validateArguments(String[] args) {
        if (args == null || args.length == 0) {
            System.err.println("Feature induction or not? Give me a choice.");
            printUsage();
            System.exit(1);
        } else if (MODE_FEATURE_INDUCTION.equals(args[0])) {
            throw new IllegalStateException("Feature induction not yet supported.");
        } else if (!MODE_NO_FEATURE_INDUCTION.equals(args[0])) {
            System.err.println("Feature induction or not? Give me a choice.");
            System.exit(1);
        } else if (args.length < 2) {
            System.err.println("Missing model output file.");
            printUsage();
            System.exit(1);
        }
    }

    /** Prints the expected command-line shape to standard error. */
    private static void printUsage() {
        System.err.println("Usage: TUI " + MODE_NO_FEATURE_INDUCTION
            + " <model-output-file> [<viterbi-output-prefix>]");
    }

    /**
     * Builds one {@link TrieLexiconMembership} pipe per lexicon file, in the
     * order given.
     *
     * @param baseDir directory prefix that each relative path is appended to
     *                (plain string concatenation, so it must end with a separator)
     * @param relativePaths lexicon file paths relative to {@code baseDir}
     * @return the pipes, in the same order as {@code relativePaths}
     */
    private static Pipe[] trieLexicons(String baseDir, String[] relativePaths) {
        Pipe[] pipes = new Pipe[relativePaths.length];
        for (int i = 0; i < relativePaths.length; ++i) {
            pipes[i] = new TrieLexiconMembership(new File(baseDir + relativePaths[i]));
        }
        return pipes;
    }
}

