/*
 * Decompiled with CFR 0.152.
 */
package edu.umass.cs.dex.ie;

import edu.umass.cs.dex.DexRuntimeException;
import edu.umass.cs.dex.types.ContactRecord;
import edu.umass.cs.dex.types.CountedString;
import edu.umass.cs.dex.types.InformationGain;
import edu.umass.cs.dex.types.People;
import edu.umass.cs.dex.types.Person;
import edu.umass.cs.dex.web.WebPage;
import edu.umass.cs.mallet.base.fst.CRF4;
import edu.umass.cs.mallet.base.fst.MultiSegmentationEvaluator;
import edu.umass.cs.mallet.base.fst.Segment;
import edu.umass.cs.mallet.base.fst.Transducer;
import edu.umass.cs.mallet.base.fst.confidence.ConstrainedForwardBackwardConfidenceEstimator;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.iterator.SegmentIterator;
import edu.umass.cs.mallet.base.types.FeatureVectorSequence;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.types.Sequence;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import edu.umass.cs.mallet.base.util.CommandOption;
import edu.umass.cs.mallet.base.util.MalletLogger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Logger;

public class ContactRecordExtractor {
    /** Logger for this class, obtained through MALLET's logging helper. */
    private static Logger logger = MalletLogger.getLogger(ContactRecordExtractor.class.getName());
    /** CRF model deserialized from {@link #crfFile} by {@code readCRF()}. */
    private CRF4 crf;
    /** Input pipe of the loaded CRF; every raw page text is run through it to build an {@code Instance}. */
    private SerialPipes inputPipe;
    /** Allowed field names prefixed with "B-"; filled in by the constructor. */
    private Object[] startTags;
    /** Allowed field names prefixed with "I-"; filled in by the constructor, in the same order as {@link #startTags}. */
    private Object[] inTags;
    /** Label that marks a token as background, i.e. not part of any contact field. */
    private String bg = "O";
    /** Only created when {@link #confidencePrediction} is true; stays {@code null} otherwise. */
    private ConstrainedForwardBackwardConfidenceEstimator confidenceEstimator = null;
    /** Serialized CRF object file to load. */
    File crfFile;
    /** Whether a confidence value is logged for each extracted segment. */
    boolean confidencePrediction;
    /** Destination of the intermediate VCF dumps; dumps are skipped when this or {@link #htmlFile} is null. */
    File vcfFile;
    /** Destination of the intermediate HTML dumps; dumps are skipped when this or {@link #vcfFile} is null. */
    File htmlFile;
    /** Directory whose absolute path is handed to {@code InformationGain} at each intermediate dump. */
    File keywordDir;
    /** An intermediate dump is attempted whenever the running person counter is a multiple of this value. */
    int peopleBetweenVCFPrint = 20;
    /** Lower-cased words that are never used as keywords (consulted in {@code addWords}). */
    HashSet stopList;
    /** Platform line separator; substituted for literal {@code <br>} when page text is read. */
    public static String newline = System.getProperty("line.separator");
    // Command-line options read by main(). NOTE(review): the crf-file default below is a hard-coded,
    // machine-specific absolute path.
    static CommandOption.File crfFileOption = new CommandOption.File(ContactRecordExtractor.class, "crf-file", "FILE", true, new File("/usr/col/tmp1/culotta/mallet/exp/address/crf.obj"), "CRF object file to perform contact record extraction", null);
    static CommandOption.Boolean confidencePredictionOption = new CommandOption.Boolean(ContactRecordExtractor.class, "confidence-prediction", "true|false", false, false, "predict confidence of each extracted field?", null);
    static CommandOption.Boolean evalOption = new CommandOption.Boolean(ContactRecordExtractor.class, "eval", "true|false", false, false, "evaluate performance on labeled input file?", null);
    static CommandOption.File inputOption = new CommandOption.File(ContactRecordExtractor.class, "input", "FILE", true, null, "file to extract from", null);
    static CommandOption.File outputOption = new CommandOption.File(ContactRecordExtractor.class, "output", "FILE", true, null, "file to print extractions", null);
    // NOTE(review): keyword-dir is registered below, but main() never reads its value
    // (it passes null for the keyword directory).
    static CommandOption.File keywordDirOption = new CommandOption.File(ContactRecordExtractor.class, "keyword-dir", "FILE", true, null, "dir to print extracted keywords", null);
    /** All options above, bundled so that main() can parse the argument array in one call. */
    static final CommandOption.List commandOptions = new CommandOption.List("Extract contact information from text/html.", new CommandOption[]{crfFileOption, confidencePredictionOption, inputOption, outputOption, evalOption, keywordDirOption});

    /**
     * Creates an extractor with confidence prediction switched off.
     * Delegates to the six-argument constructor, which loads the CRF from {@code _crfFile}.
     *
     * @param _crfFile    serialized CRF object file
     * @param _vcfFile    target for intermediate VCF dumps (may be null)
     * @param _htmlFile   target for intermediate HTML dumps (may be null)
     * @param _keywordDir directory handed to the keyword (information gain) step
     * @param _stopList   lower-cased words to exclude from keywords
     */
    public ContactRecordExtractor(File _crfFile, File _vcfFile, File _htmlFile, File _keywordDir, HashSet _stopList) {
        this(_crfFile, _vcfFile, _htmlFile, _keywordDir, _stopList, false);
    }

    /**
     * Creates an extractor: records the configuration, loads the CRF from disk,
     * derives the "B-"/"I-" tag arrays from the allowed field names and, when
     * requested, creates the confidence estimator on top of the loaded CRF.
     *
     * @param _crfFile              serialized CRF object file
     * @param _vcfFile              target for intermediate VCF dumps (may be null)
     * @param _htmlFile             target for intermediate HTML dumps (may be null)
     * @param _keywordDir           directory handed to the keyword (information gain) step
     * @param _stopList             lower-cased words to exclude from keywords
     * @param _confidencePrediction whether to log a confidence for every extracted segment
     * @throws DexRuntimeException if the CRF file cannot be read or deserialized
     */
    public ContactRecordExtractor(File _crfFile, File _vcfFile, File _htmlFile, File _keywordDir, HashSet _stopList, boolean _confidencePrediction) {
        // Plain configuration first; readCRF() below depends on crfFile being set.
        this.confidencePrediction = _confidencePrediction;
        this.stopList = _stopList;
        this.keywordDir = _keywordDir;
        this.htmlFile = _htmlFile;
        this.vcfFile = _vcfFile;
        this.crfFile = _crfFile;
        this.readCRF();

        // Both tag arrays come from one snapshot of the set, so index k in
        // startTags and inTags always refers to the same field name.
        HashSet fieldNames = new HashSet();
        ContactRecordExtractor.fillAllowedTags(fieldNames);
        Object[] fieldNameSnapshot = fieldNames.toArray();
        this.startTags = ContactRecordExtractor.prepend("B-", fieldNameSnapshot);
        this.inTags = ContactRecordExtractor.prepend("I-", fieldNameSnapshot);

        if (!this.confidencePrediction) {
            return;
        }
        this.confidenceEstimator = new ConstrainedForwardBackwardConfidenceEstimator((Transducer)this.crf);
    }

    /**
     * Loads the serialized CRF from {@link #crfFile}, remembers its input pipe and
     * freezes the CRF's input alphabet.
     *
     * <p>The stream is closed on every path, including when deserialization fails.
     *
     * <p>SECURITY NOTE(review): this uses Java native deserialization. Only point
     * {@code crfFile} at model files from a trusted source.
     *
     * @throws DexRuntimeException if the file cannot be read or a serialized class is missing
     */
    private void readCRF() {
        logger.info("Loading CRF in object file " + this.crfFile + " for ContactRecordExtractor");
        FileInputStream fileIn = null;
        ObjectInputStream ois = null;
        try {
            fileIn = new FileInputStream(this.crfFile);
            ois = new ObjectInputStream(fileIn);
            this.crf = (CRF4)ois.readObject();
            logger.info("CRF loaded successfully!");
        }
        catch (IOException e) {
            String msg = "Exception reading file: " + this.crfFile;
            logger.severe(msg + " : " + e);
            throw new DexRuntimeException(msg, e);
        }
        catch (ClassNotFoundException cnfe) {
            String msg = "Cound not find class reading in object for file: " + this.crfFile;
            logger.severe(msg + " : " + cnfe);
            throw new DexRuntimeException(msg, cnfe);
        }
        finally {
            // Closing the object stream also closes the file stream; fall back to the
            // raw file stream when wrapping it failed (e.g. corrupt stream header).
            try {
                if (ois != null) {
                    ois.close();
                } else if (fileIn != null) {
                    fileIn.close();
                }
            }
            catch (IOException closeFailure) {
                // A close failure must not mask the outcome of the load itself.
                logger.warning("Could not close " + this.crfFile + " : " + closeFailure);
            }
        }
        this.inputPipe = (SerialPipes)this.crf.getInputPipe();
        // No new features may be added to the alphabet while extracting.
        this.crf.getInputAlphabet().stopGrowth();
    }

    /**
     * Labels the text of {@code fin} with the CRF's Viterbi path and writes the
     * token/label listing to {@code fout}.
     *
     * @param fin  text/HTML file to extract from
     * @param fout file that receives the labeled tokens
     */
    public void test(File fin, File fout) {
        Pipe pipe = this.inputPipe;
        Object pageText = this.fileToStringBuffer(fin).toString();
        Instance instance = new Instance(pageText, null, null, null, pipe);

        Sequence features = (FeatureVectorSequence)instance.getData(pipe);
        Sequence predictedLabels = this.crf.viterbiPath(features).output();

        this.printLabeledFile(predictedLabels, instance, fout.getAbsolutePath());
    }

    /**
     * Runs the segmentation evaluator on the (labeled) text of {@code fin}, then
     * labels the same text with the CRF's Viterbi path and writes the token/label
     * listing to {@code fout}.
     *
     * @param fin  labeled input file
     * @param fout file that receives the labeled tokens
     */
    public void eval(File fin, File fout) {
        Pipe pipe = this.inputPipe;
        Object pageText = this.fileToStringBuffer(fin).toString();
        Instance instance = new Instance(pageText, null, null, null, pipe);

        // The evaluator works on instance lists, so wrap the single instance.
        InstanceList singleton = new InstanceList(instance.getPipe());
        singleton.add(instance);
        MultiSegmentationEvaluator evaluator = new MultiSegmentationEvaluator(this.startTags, this.inTags);
        evaluator.test((Transducer)this.crf, singleton, "testing", null);

        Sequence features = (FeatureVectorSequence)instance.getData(pipe);
        Sequence predictedLabels = this.crf.viterbiPath(features).output();
        this.printLabeledFile(predictedLabels, instance, fout.getAbsolutePath());
    }

    /**
     * Reads {@code f} line by line and returns its text with markup removed.
     *
     * <p>Per line: literal {@code <br>} becomes {@link #newline}, {@code &nbsp;}
     * becomes a space, and every character from a {@code '<'} up to and including
     * the next {@code '>'} is dropped. The "inside a tag" state carries over from
     * one line to the next, so tags spanning several lines are removed as well.
     * Each input line is terminated with {@code "\n"} in the result.
     *
     * <p>Best effort: an I/O error is logged and whatever was read so far is
     * returned. The reader is closed on every path.
     *
     * @param f file to read
     * @return the tag-free text (possibly partial or empty after an I/O error)
     */
    private StringBuffer fileToStringBuffer(File f) {
        StringBuffer sb = new StringBuffer();
        BufferedReader rd = null;
        try {
            rd = new BufferedReader(new FileReader(f));
            boolean inTag = false;
            String line;
            while ((line = rd.readLine()) != null) {
                // Literal (non-regex) replacement: the separator text is inserted verbatim.
                line = line.replace("<br>", newline).replace("&nbsp;", " ");
                for (int ci = 0; ci < line.length(); ++ci) {
                    char c = line.charAt(ci);
                    if (c == '<') {
                        inTag = true;
                    } else if (c == '>') {
                        inTag = false;
                    } else if (!inTag) {
                        sb.append(c);
                    }
                }
                sb.append("\n");
            }
        }
        catch (IOException e) {
            logger.severe("Exception reaiding file: " + e);
        }
        finally {
            if (rd != null) {
                try {
                    rd.close();
                }
                catch (IOException closeFailure) {
                    logger.warning("Could not close " + f + " : " + closeFailure);
                }
            }
        }
        return sb;
    }

    /**
     * Extracts contact records for every person in {@code people}; shorthand for
     * the two-argument overload with a starting index of 0.
     *
     * @param people the people whose pages should be processed
     * @return the same {@code people} object
     */
    public People extractContactRecordsFor(People people) {
        final int startingIndex = 0;
        return this.extractContactRecordsFor(people, startingIndex);
    }

    /**
     * Runs contact record extraction over the pages of every person that has not
     * been processed yet and attaches the resulting record to the person.
     *
     * <p>Whenever the running person counter is a multiple of
     * {@link #peopleBetweenVCFPrint} (and both output files are configured) the
     * current state of {@code people} is dumped to the HTML and VCF files.
     *
     * <p>NOTE(review): {@code startingIndex} is currently ignored; people are
     * skipped only through their {@code processedForContactInformation} flag.
     * Confirm with callers whether index-based resumption is expected.
     *
     * @param people        the people whose pages should be processed
     * @param startingIndex currently unused (see note above)
     * @return the same {@code people} object
     * @throws IllegalArgumentException if no fields were extracted for a person
     *         whose existing contact record is nevertheless non-empty
     */
    public People extractContactRecordsFor(People people, int startingIndex) {
        CharSequenceLexer lexer = new CharSequenceLexer(CharSequenceLexer.LEX_WORDS);
        Iterator piter = people.iterator();
        int i = 0;
        while (piter.hasNext()) {
            if (this.isIntermediateDumpDue(i)) {
                this.writeIntermediateResults(people);
            }
            ++i;
            Person p = (Person)piter.next();
            if (p.processedForContactInformation) continue;

            // One record per person, accumulated over all of that person's pages.
            ContactRecord cr = new ContactRecord();
            Iterator iter = p.pageIterator();
            int pagei = 0;
            while (iter.hasNext()) {
                this.extractFromPage((WebPage)iter.next(), pagei, p, cr, lexer);
                ++pagei;
            }

            if (cr.numberFields() > 0) {
                p.setContactRecord(cr);
                logger.fine(cr.toString());
                continue;
            }
            logger.fine(">>No contact info found for person: " + p.getFirstName());
            if (p.getContactRecord().size() != 0) {
                throw new IllegalArgumentException("ContactRecord should be size 0, instead it's\n" + p.getContactRecord());
            }
        }
        return people;
    }

    /** Tells whether an intermediate dump should happen before handling the person at position {@code i}. */
    private boolean isIntermediateDumpDue(int i) {
        // An interval of 0 would otherwise raise ArithmeticException; treat it as "never dump".
        return this.peopleBetweenVCFPrint != 0
                && i % this.peopleBetweenVCFPrint == 0
                && this.vcfFile != null
                && this.htmlFile != null;
    }

    /** Runs the keyword step (when a keyword directory is configured) and writes the HTML and VCF dumps. */
    private void writeIntermediateResults(People people) {
        logger.info("printing intermediate VCF data to " + this.vcfFile + " and HTML data to " + this.htmlFile);
        if (this.keywordDir != null) {
            // The object itself is not needed afterwards; presumably the constructor
            // does the keyword work as a side effect - TODO confirm against InformationGain.
            new InformationGain(people, this.keywordDir.getAbsolutePath());
        } else {
            logger.fine("No keyword directory configured; skipping keyword step");
        }
        people.writeHTML(this.htmlFile);
        people.writeVCF(this.vcfFile);
    }

    /** Labels one web page, folds its names and fields into {@code cr}, writes the tagged file and collects keywords for {@code p}. */
    private void extractFromPage(WebPage webPage, int pagei, Person p, ContactRecord cr, CharSequenceLexer lexer) {
        String fname = webPage.fileName;
        logger.info("Extracting contact records for " + fname);
        StringBuffer sb = this.fileToStringBuffer(new File(fname));
        Instance inst = new Instance((Object)sb.toString(), null, null, null, (Pipe)this.inputPipe);
        FeatureVectorSequence fvs = (FeatureVectorSequence)inst.getData((Pipe)this.inputPipe);
        Sequence ls = this.crf.viterbiPath((Sequence)fvs).output();

        cr.addNames(this.getNames(ls, inst, pagei));

        // Only segments that start inside a detected contact-record region are kept.
        HashSet contactRecordIndices = this.getContactRecordIndices(ls, inst);
        SegmentIterator segIter = new SegmentIterator((Sequence)fvs, ls, ls, this.startTags, this.inTags);
        this.augmentContactRecord(cr, segIter, inst, contactRecordIndices);

        this.printLabeledFile(ls, inst, ContactRecordExtractor.htmlFileToLabelFile(fname));
        this.addWords(p, ls, inst, lexer, true);
    }

    /**
     * Collects keyword bigrams for {@code p} from the background-labeled tokens of
     * one page.
     *
     * <p>Each background token is split into words by {@code lexer}. Two
     * consecutive accepted words (lower-cased) form a bigram keyword. The chain is
     * broken by a non-background token, by a word containing a digit, or by a stop
     * word. An {@code "ENDLINE"} word is skipped without breaking the chain.
     *
     * <p>NOTE(review): when {@code bigram} is false nothing is added at all; there
     * is no unigram path. Confirm that this is intended.
     *
     * @param p      person receiving the keywords
     * @param ls     predicted label per token
     * @param inst   instance whose source is the page's token sequence
     * @param lexer  word lexer, re-targeted for every token
     * @param bigram whether bigram keywords are added
     */
    private void addWords(Person p, Sequence ls, Instance inst, CharSequenceLexer lexer, boolean bigram) {
        TokenSequence input = (TokenSequence)inst.getSource();
        String prevWord = null;
        for (int i = 0; i < input.size(); ++i) {
            // Compare the label's string form so the check does not depend on the
            // runtime type of the label objects.
            if (!this.bg.equals(ls.get(i).toString())) {
                prevWord = null;
                continue;
            }
            lexer.setCharSequence((CharSequence)input.getToken(i).getText());
            while (lexer.hasNext()) {
                String s = (String)lexer.next();
                // A missing stop list simply means "no stop words".
                if (s.matches(".*[0-9].*") || (this.stopList != null && this.stopList.contains(s.toLowerCase()))) {
                    prevWord = null;
                    continue;
                }
                if (s.equals("ENDLINE")) continue;
                s = s.toLowerCase();
                if (bigram && prevWord != null) {
                    p.addKeyWord(prevWord + " " + s);
                }
                prevWord = s;
            }
        }
    }

    /**
     * Derives the name of the labeled-output file that belongs to a page file.
     *
     * @param f name of the page file
     * @return {@code f} with {@code ".tagged"} appended
     */
    private static String htmlFileToLabelFile(String f) {
        final String taggedSuffix = ".tagged";
        return f + taggedSuffix;
    }

    /**
     * Writes the tokens of {@code inst} together with their predicted labels to
     * {@code fname}, as {@code "token (label) "} entries; an {@code "ENDLINE"}
     * token starts a new output line instead.
     *
     * <p>Failures are logged rather than propagated. The output stream is closed
     * on every path, also when a runtime exception interrupts the loop.
     *
     * @param ls    predicted label per token
     * @param inst  instance whose source is the token sequence to print
     * @param fname path of the file to (over)write
     */
    private void printLabeledFile(Sequence ls, Instance inst, String fname) {
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(new FileOutputStream(new File(fname)));
            TokenSequence input = (TokenSequence)inst.getSource();
            for (int i = 0; i < input.size(); ++i) {
                String term = input.getToken(i).getText();
                if (term.equals("ENDLINE")) {
                    pw.println("");
                    continue;
                }
                pw.print(term + " (" + ls.get(i) + ") ");
            }
            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (pw.checkError()) {
                logger.severe("Exception writing file: " + fname);
            }
        }
        catch (IOException e) {
            logger.severe("Exception writing file: " + e);
        }
        finally {
            if (pw != null) {
                pw.close();
            }
        }
    }

    /**
     * Collects person names from one labeled page.
     *
     * <p>Recognized label patterns (each on consecutive tokens):
     * <ul>
     *   <li>{@code B-FirstName B-MiddleName B-LastName}</li>
     *   <li>{@code B-FirstName B-LastName}</li>
     *   <li>{@code B-LastName "," B-FirstName [B-MiddleName]}, emitted in
     *       "first [middle] last" order</li>
     * </ul>
     * Names are built from the tokens' text and labels are compared by their
     * string form. Names containing a digit are rejected by {@code addName}.
     *
     * @param ls    predicted label per token
     * @param inst  instance whose source is the page's token sequence
     * @param pagei index of the page, stored with each name
     * @return list of {@code CountedString} names found on the page
     */
    private ArrayList getNames(Sequence ls, Instance inst, int pagei) {
        ArrayList names = new ArrayList();
        TokenSequence input = (TokenSequence)inst.getSource();
        int size = input.size();
        for (int i = 0; i < size; ++i) {
            String name;
            String l = ls.get(i).toString();
            if (l.equalsIgnoreCase("B-FirstName")) {
                if (i + 2 < size && ls.get(i + 1).toString().equals("B-MiddleName") && ls.get(i + 2).toString().equals("B-LastName")) {
                    name = input.getToken(i).getText() + " " + input.getToken(i + 1).getText() + " " + input.getToken(i + 2).getText();
                    this.addName(name, names, pagei);
                } else if (i + 1 < size && ls.get(i + 1).toString().equals("B-LastName")) {
                    name = input.getToken(i).getText() + " " + input.getToken(i + 1).getText();
                    this.addName(name, names, pagei);
                }
                continue;
            }

            // "Last, First [Middle]": needs at least a comma token and a first name after the last name.
            if (!l.equalsIgnoreCase("B-LastName") || i + 2 >= size) continue;
            // Compare the token's text (a token object itself never equals a String).
            if (!",".equals(input.getToken(i + 1).getText())) continue;
            if (!ls.get(i + 2).toString().equals("B-FirstName")) continue;

            String last = input.getToken(i).getText();
            String first = input.getToken(i + 2).getText();
            if (i + 3 < size && ls.get(i + 3).toString().equals("B-MiddleName")) {
                name = first + " " + input.getToken(i + 3).getText() + " " + last;
            } else {
                name = first + " " + last;
            }
            this.addName(name, names, pagei);
        }
        return names;
    }

    /**
     * Adds the cleaned form of {@code n} to {@code names} unless it contains a digit.
     *
     * @param n     raw name text
     * @param names list receiving a {@code CountedString} for the name
     * @param pagei page index stored with the name
     * @return {@code true} if the name was added, {@code false} if it was rejected
     */
    private boolean addName(String n, ArrayList names, int pagei) {
        boolean accepted = !n.matches(".*[0-9].*");
        if (accepted) {
            String cleaned = this.cleanName(n);
            names.add(new CountedString(cleaned, pagei));
        }
        return accepted;
    }

    /**
     * Normalizes a name: punctuation becomes spaces, surrounding whitespace is
     * trimmed, runs of whitespace collapse to one space, and the result is
     * lower-cased.
     *
     * @param n raw name
     * @return normalized name
     */
    private String cleanName(String n) {
        String withoutPunctuation = n.replaceAll("\\p{Punct}", " ");
        String collapsed = withoutPunctuation.trim().replaceAll("\\s{2,}", " ");
        return collapsed.toLowerCase();
    }

    /**
     * Logs, for every token, the predicted label, the target label, the token's
     * text and (on a second line) the token object itself.
     *
     * @param ls     predicted label per token
     * @param source the tokens
     * @param target reference label per token
     */
    private void printViterbi(Sequence ls, TokenSequence source, Sequence target) {
        int position = 0;
        while (position < source.size()) {
            StringBuffer entry = new StringBuffer();
            entry.append("(").append(ls.get(position)).append(") (");
            entry.append(target.get(position)).append(") ");
            entry.append(source.getToken(position).getText()).append("\n");
            entry.append((Object)source.get(position));
            logger.info(entry.toString());
            ++position;
        }
    }

    /**
     * Finds the token positions that belong to the page's contact record.
     *
     * <p>A candidate record starts at the first {@code B-} label seen and collects
     * the distinct {@code B-} labels that follow. Background tokens (other than
     * {@code "ENDLINE"}) are counted since the candidate started:
     * <ul>
     *   <li>once more than {@code bgThresh} background tokens were counted, a
     *       candidate with fewer than {@code recordThresh} distinct tags is
     *       discarded and the search restarts;</li>
     *   <li>otherwise the candidate is accepted and the method returns at once,
     *       so at most one record per page is reported;</li>
     *   <li>at end of input, a pending candidate is accepted when it has at least
     *       {@code recordThresh} distinct tags or contains {@code B-PostalCode}.</li>
     * </ul>
     * The end-of-input rule uses the same "at least {@code recordThresh}" bound as
     * the in-loop rule.
     *
     * <p>NOTE(review): a postal code rescues a short candidate only at end of
     * input; mid-page short candidates are always discarded. Confirm whether that
     * asymmetry is intended.
     *
     * @param ls   predicted label per token
     * @param inst instance whose source is the page's token sequence
     * @return set of {@code Integer} token indices covered by the record (possibly empty)
     * @throws IllegalArgumentException on a label that is neither background,
     *         {@code B-...} nor {@code I-...}
     */
    private HashSet getContactRecordIndices(Sequence ls, Instance inst) {
        final int bgThresh = 15;
        final int recordThresh = 5;
        HashSet h = new HashSet();
        TokenSequence input = (TokenSequence)inst.getSource();
        int numbg = 0;
        int startRecord = -1;
        HashSet<String> seenTags = new HashSet<String>();
        for (int i = 0; i < input.size(); ++i) {
            String label = ls.get(i).toString();
            if (label.equals(this.bg)) {
                if (!input.getToken(i).getText().equals("ENDLINE")) {
                    ++numbg;
                }
                if (numbg <= bgThresh) continue;
                if (seenTags.size() < recordThresh) {
                    // Too few fields before the background run: not a record, start over.
                    numbg = 0;
                    startRecord = -1;
                    seenTags = new HashSet<String>();
                    continue;
                }
                if (startRecord == -1) {
                    throw new IllegalArgumentException("Adding record without setting startRecord.");
                }
                // First complete record wins; the rest of the page is not inspected.
                return this.addIntegers(startRecord, i - 1, h);
            }
            if (label.startsWith("B-")) {
                if (startRecord == -1) {
                    numbg = 0;
                    startRecord = i;
                    seenTags = new HashSet<String>();
                }
                seenTags.add(label);
                continue;
            }
            if (!label.startsWith("I-")) {
                throw new IllegalArgumentException("Invalid label: " + label);
            }
        }
        if (seenTags.size() >= recordThresh || seenTags.contains("B-PostalCode")) {
            if (startRecord == -1) {
                throw new IllegalArgumentException("Adding record without setting startRecord.");
            }
            h = this.addIntegers(startRecord, input.size() - 1, h);
        }
        return h;
    }

    /**
     * Adds every integer in the inclusive range {@code [from, to]} to {@code h}.
     * Nothing is added when {@code from > to}.
     *
     * @param from first value to add
     * @param to   last value to add
     * @param h    set to extend (modified in place)
     * @return {@code h}
     */
    private HashSet addIntegers(int from, int to, HashSet h) {
        for (int i = from; i <= to; ++i) {
            // valueOf replaces the deprecated Integer constructor and may reuse cached instances.
            h.add(Integer.valueOf(i));
        }
        return h;
    }

    /**
     * Joins the text of the tokens at positions {@code from..to} (inclusive),
     * each followed by a single space.
     *
     * @param from first token index
     * @param to   last token index
     * @param ts   token sequence to read from
     * @return the joined text (with a trailing space), or "" when the range is empty
     */
    private String getStringFromIndices(int from, int to, TokenSequence ts) {
        StringBuilder joined = new StringBuilder();
        int position = from;
        while (position <= to) {
            joined.append(ts.getToken(position).getText()).append(" ");
            ++position;
        }
        return joined.toString();
    }

    /**
     * Joins the text of the tokens covered by {@code seg} (start through end,
     * inclusive), each followed by a single space.
     *
     * @param seg segment providing the start and end token indices
     * @param ts  token sequence to read from
     * @return the joined text (with a trailing space)
     */
    private String getStringFromSegment(Segment seg, TokenSequence ts) {
        StringBuilder joined = new StringBuilder();
        int position = seg.getStart();
        // The end index is re-read on every pass.
        while (position <= seg.getEnd()) {
            joined.append(ts.getToken(position).getText()).append(" ");
            ++position;
        }
        return joined.toString();
    }

    /**
     * Copies every non-background segment that starts inside the contact-record
     * region into {@code cr}. The field name is the segment's start tag without
     * its "B-" style prefix; the value is the segment's token text joined by
     * spaces. When confidence prediction is on, the estimated confidence of each
     * kept segment is logged first.
     *
     * @param cr             record to fill (modified in place)
     * @param segIter        iterator over the predicted segments of the page
     * @param inst           instance whose source is the page's token sequence
     * @param contactIndices token indices that belong to the contact record
     * @return {@code cr}
     */
    private ContactRecord augmentContactRecord(ContactRecord cr, SegmentIterator segIter, Instance inst, HashSet contactIndices) {
        while (segIter.hasNext()) {
            Segment seg = segIter.nextSegment();
            if (seg.getStartTag().equals(this.bg)) continue;
            if (!contactIndices.contains(Integer.valueOf(seg.getStart()))) continue;
            if (this.confidencePrediction) {
                logger.info("Confidence: " + this.confidenceEstimator.estimateConfidenceFor(seg));
            }
            String fieldValue = this.getStringFromSegment(seg, (TokenSequence)inst.getSource());
            cr.setFieldValue(ContactRecordExtractor.getFieldFromLabel((String)seg.getStartTag()), fieldValue);
        }
        return cr;
    }

    /**
     * Strips everything up to and including the first {@code '-'} from a label,
     * e.g. {@code "B-City1"} becomes {@code "City1"}. A label without a dash is
     * returned unchanged.
     *
     * @param l label text
     * @return the field name part of the label
     */
    private static String getFieldFromLabel(String l) {
        int firstDash = l.indexOf('-');
        return l.substring(firstDash + 1);
    }

    /**
     * Adds the names of all contact fields the extractor recognizes to {@code h}.
     * Both {@code "Nickname"} and {@code "nickname"} are included.
     *
     * @param h set to extend (modified in place)
     */
    private static void fillAllowedTags(HashSet h) {
        String[] fieldNames = {
            "FirstName", "MiddleName", "Nickname", "nickname", "Suffix", "LastName",
            "Title", "JobTitle", "CompanyName", "Department",
            "AddressLine", "City1", "City2", "State", "Country", "PostalCode",
            "HomePhoneNumber", "FaxNumber", "CompanyPhoneNumber", "DirectPhoneNumber",
            "MobilePhoneNumber", "PagerNumber",
            "WebPageURL", "Email", "InstantMessagingAddress", "VoiceMail"
        };
        for (int k = 0; k < fieldNames.length; ++k) {
            h.add(fieldNames[k]);
        }
    }

    /**
     * Builds a new array in which every element of {@code a} (each expected to be
     * a {@code String}) is prefixed with {@code prefix}; {@code a} is left untouched.
     *
     * @param prefix text to put in front of every element
     * @param a      array of strings
     * @return new array of the same length and order
     * @throws ClassCastException if an element of {@code a} is not a {@code String}
     */
    private static String[] prepend(String prefix, Object[] a) {
        String[] prefixed = new String[a.length];
        int slot = 0;
        for (Object element : a) {
            prefixed[slot] = prefix + (String)element;
            ++slot;
        }
        return prefixed;
    }

    /**
     * Command-line entry point: parses the options, loads the CRF, then either
     * evaluates on the labeled input file ({@code --eval}) or just labels it,
     * writing the labeled tokens to the output file. No VCF/HTML/keyword outputs
     * and no stop list are configured in this mode.
     *
     * @param args command-line arguments, see {@code commandOptions}
     */
    public static void main(String[] args) {
        commandOptions.process(args);

        File model = ContactRecordExtractor.crfFileOption.value;
        boolean withConfidence = ContactRecordExtractor.confidencePredictionOption.value;
        ContactRecordExtractor extractor = new ContactRecordExtractor(model, null, null, null, null, withConfidence);

        boolean evaluate = ContactRecordExtractor.evalOption.value;
        File source = ContactRecordExtractor.inputOption.value;
        File destination = ContactRecordExtractor.outputOption.value;
        if (!evaluate) {
            extractor.test(source, destination);
            return;
        }
        extractor.eval(source, destination);
    }
}

