cc.mallet.topics
Class PolylingualTopicModel

java.lang.Object
  extended by cc.mallet.topics.PolylingualTopicModel
All Implemented Interfaces:
java.io.Serializable

public class PolylingualTopicModel
extends java.lang.Object
implements java.io.Serializable

An implementation of Latent Dirichlet Allocation for loosely parallel corpora in arbitrary languages.

Author:
David Mimno, Andrew McCallum
See Also:
Serialized Form

Nested Class Summary
 class PolylingualTopicModel.TopicAssignment
           
 
Field Summary
protected  double[] alpha
           
protected  Alphabet[] alphabets
           
protected  double alphaSum
           
protected  double[] betas
           
protected  double[] betaSums
           
 int burninPeriod
           
protected  java.util.ArrayList<PolylingualTopicModel.TopicAssignment> data
           
static double DEFAULT_BETA
           
protected  int[] docLengthCounts
           
protected  java.text.NumberFormat formatter
           
protected  int iterationsSoFar
           
protected  double[][] languageCachedCoefficients
           
protected  int[] languageMaxTypeCounts
           
protected  double[] languageSmoothingOnlyMasses
           
protected  int[][] languageTokensPerTopic
           
protected  int[][][] languageTypeTopicCounts
           
protected  java.lang.String modelFilename
           
 int numIterations
           
protected  int numStopwords
           
protected  int numTopics
           
protected  int[] oneDocTopicCounts
           
 int optimizeInterval
           
protected  boolean printLogLikelihood
           
protected  Randoms random
           
protected  int saveModelInterval
           
 int saveSampleInterval
           
protected  int saveStateInterval
           
 int showTopicsInterval
           
protected  java.lang.String stateFilename
           
protected  LabelAlphabet topicAlphabet
           
protected  int topicBits
           
protected  int[][] topicDocCounts
           
protected  int topicMask
           
protected  int[] vocabularySizes
           
 int wordsPerTopic
           
 
Constructor Summary
PolylingualTopicModel(int numberOfTopics)
           
PolylingualTopicModel(int numberOfTopics, double alphaSum)
           
PolylingualTopicModel(int numberOfTopics, double alphaSum, Randoms random)
           
PolylingualTopicModel(LabelAlphabet topicAlphabet, double alphaSum, Randoms random)
           
 
Method Summary
 void addInstances(InstanceList[] training)
           
 void estimate()
           
 void estimate(int iterationsThisRound)
           
 java.util.ArrayList<PolylingualTopicModel.TopicAssignment> getData()
           
 TopicInferencer getInferencer(int language)
          Returns a tool for estimating topic distributions for new documents.
 int getNumTopics()
           
 LabelAlphabet getTopicAlphabet()
           
 void loadTestingIDs(java.io.File testingIDFile)
           
static void main(java.lang.String[] args)
           
 double modelLogLikelihood()
           
 void optimizeBetas()
           
 void printDocumentTopics(java.io.File f)
           
 void printDocumentTopics(java.io.PrintWriter pw)
           
 void printDocumentTopics(java.io.PrintWriter pw, double threshold, int max)
           
 void printState(java.io.File f)
           
 void printState(java.io.PrintStream out)
           
 void printTopWords(java.io.File file, int numWords, boolean useNewLines)
           
 void printTopWords(java.io.PrintStream out, int numWords, boolean usingNewLines)
           
static PolylingualTopicModel read(java.io.File f)
           
protected  void sampleTopicsForOneDoc(PolylingualTopicModel.TopicAssignment topicAssignment, boolean shouldSaveState)
           
 void setBurninPeriod(int burninPeriod)
           
 void setModelOutput(int interval, java.lang.String filename)
           
 void setNumIterations(int numIterations)
           
 void setOptimizeInterval(int interval)
           
 void setRandomSeed(int seed)
           
 void setSaveState(int interval, java.lang.String filename)
          Defines how often and where to save the state.
 void setTopicDisplay(int interval, int n)
           
 void write(java.io.File serializedModelFile)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

data

protected java.util.ArrayList<PolylingualTopicModel.TopicAssignment> data

topicAlphabet

protected LabelAlphabet topicAlphabet

numStopwords

protected int numStopwords

numTopics

protected int numTopics

topicMask

protected int topicMask

topicBits

protected int topicBits

alphabets

protected Alphabet[] alphabets

vocabularySizes

protected int[] vocabularySizes

alpha

protected double[] alpha

alphaSum

protected double alphaSum

betas

protected double[] betas

betaSums

protected double[] betaSums

languageMaxTypeCounts

protected int[] languageMaxTypeCounts

DEFAULT_BETA

public static final double DEFAULT_BETA
See Also:
Constant Field Values

languageSmoothingOnlyMasses

protected double[] languageSmoothingOnlyMasses

languageCachedCoefficients

protected double[][] languageCachedCoefficients

oneDocTopicCounts

protected int[] oneDocTopicCounts

languageTypeTopicCounts

protected int[][][] languageTypeTopicCounts

languageTokensPerTopic

protected int[][] languageTokensPerTopic

docLengthCounts

protected int[] docLengthCounts

topicDocCounts

protected int[][] topicDocCounts

iterationsSoFar

protected int iterationsSoFar

numIterations

public int numIterations

burninPeriod

public int burninPeriod

saveSampleInterval

public int saveSampleInterval

optimizeInterval

public int optimizeInterval

showTopicsInterval

public int showTopicsInterval

wordsPerTopic

public int wordsPerTopic

saveModelInterval

protected int saveModelInterval

modelFilename

protected java.lang.String modelFilename

saveStateInterval

protected int saveStateInterval

stateFilename

protected java.lang.String stateFilename

random

protected Randoms random

formatter

protected java.text.NumberFormat formatter

printLogLikelihood

protected boolean printLogLikelihood
Constructor Detail

PolylingualTopicModel

public PolylingualTopicModel(int numberOfTopics)

PolylingualTopicModel

public PolylingualTopicModel(int numberOfTopics,
                             double alphaSum)

PolylingualTopicModel

public PolylingualTopicModel(int numberOfTopics,
                             double alphaSum,
                             Randoms random)

PolylingualTopicModel

public PolylingualTopicModel(LabelAlphabet topicAlphabet,
                             double alphaSum,
                             Randoms random)
Method Detail

loadTestingIDs

public void loadTestingIDs(java.io.File testingIDFile)
                    throws java.io.IOException
Throws:
java.io.IOException

getTopicAlphabet

public LabelAlphabet getTopicAlphabet()

getNumTopics

public int getNumTopics()

getData

public java.util.ArrayList<PolylingualTopicModel.TopicAssignment> getData()

setNumIterations

public void setNumIterations(int numIterations)

setBurninPeriod

public void setBurninPeriod(int burninPeriod)

setTopicDisplay

public void setTopicDisplay(int interval,
                            int n)

setRandomSeed

public void setRandomSeed(int seed)

setOptimizeInterval

public void setOptimizeInterval(int interval)

setModelOutput

public void setModelOutput(int interval,
                           java.lang.String filename)

setSaveState

public void setSaveState(int interval,
                         java.lang.String filename)
Defines how often and where to save the state.

Parameters:
interval - The number of iterations between successive saves of the state.
filename - The file to save the state to; the iteration number is appended as a suffix.

addInstances

public void addInstances(InstanceList[] training)

estimate

public void estimate()
              throws java.io.IOException
Throws:
java.io.IOException

estimate

public void estimate(int iterationsThisRound)
              throws java.io.IOException
Throws:
java.io.IOException

optimizeBetas

public void optimizeBetas()

sampleTopicsForOneDoc

protected void sampleTopicsForOneDoc(PolylingualTopicModel.TopicAssignment topicAssignment,
                                     boolean shouldSaveState)

printTopWords

public void printTopWords(java.io.File file,
                          int numWords,
                          boolean useNewLines)
                   throws java.io.IOException
Throws:
java.io.IOException

printTopWords

public void printTopWords(java.io.PrintStream out,
                          int numWords,
                          boolean usingNewLines)

printDocumentTopics

public void printDocumentTopics(java.io.File f)
                         throws java.io.IOException
Throws:
java.io.IOException

printDocumentTopics

public void printDocumentTopics(java.io.PrintWriter pw)

printDocumentTopics

public void printDocumentTopics(java.io.PrintWriter pw,
                                double threshold,
                                int max)
Parameters:
pw - A print writer
threshold - Only print topics with a proportion greater than this number.
max - Print no more than this many topics.

printState

public void printState(java.io.File f)
                throws java.io.IOException
Throws:
java.io.IOException

printState

public void printState(java.io.PrintStream out)

modelLogLikelihood

public double modelLogLikelihood()

getInferencer

public TopicInferencer getInferencer(int language)
Returns a tool for estimating topic distributions for new documents.


write

public void write(java.io.File serializedModelFile)

read

public static PolylingualTopicModel read(java.io.File f)
                                  throws java.lang.Exception
Throws:
java.lang.Exception

main

public static void main(java.lang.String[] args)
                 throws java.io.IOException
Throws:
java.io.IOException