cc.mallet.topics
Class SimpleLDA

java.lang.Object
  extended by cc.mallet.topics.SimpleLDA
All Implemented Interfaces:
java.io.Serializable

public class SimpleLDA
extends java.lang.Object
implements java.io.Serializable

A simple implementation of Latent Dirichlet Allocation (LDA) using Gibbs sampling. This code is slower than the regular MALLET LDA implementation, but it provides a better starting place for understanding how sampling works and for building new topic models.

Author:
David Mimno, Andrew McCallum
See Also:
Serialized Form

Field Summary
protected  double alpha
           
protected  Alphabet alphabet
           
protected  double alphaSum
           
protected  double beta
           
protected  double betaSum
           
protected  java.util.ArrayList<TopicAssignment> data
           
static double DEFAULT_BETA
           
protected  java.text.NumberFormat formatter
           
protected  int numTopics
           
protected  int numTypes
           
protected  int[] oneDocTopicCounts
           
protected  boolean printLogLikelihood
           
protected  Randoms random
           
 int showTopicsInterval
           
protected  int[] tokensPerTopic
           
protected  LabelAlphabet topicAlphabet
           
protected  int[][] typeTopicCounts
           
 int wordsPerTopic
           
 
Constructor Summary
SimpleLDA(int numberOfTopics)
           
SimpleLDA(int numberOfTopics, double alphaSum, double beta)
           
SimpleLDA(int numberOfTopics, double alphaSum, double beta, Randoms random)
           
SimpleLDA(LabelAlphabet topicAlphabet, double alphaSum, double beta, Randoms random)
           
 
Method Summary
 void addInstances(InstanceList training)
           
 Alphabet getAlphabet()
           
 java.util.ArrayList<TopicAssignment> getData()
           
 int getNumTopics()
           
 LabelAlphabet getTopicAlphabet()
           
 int[] getTopicTotals()
           
 int[][] getTypeTopicCounts()
           
static void main(java.lang.String[] args)
           
 double modelLogLikelihood()
           
 void printDocumentTopics(java.io.File file, double threshold, int max)
           
 void printState(java.io.File f)
           
 void printState(java.io.PrintStream out)
           
 void sample(int iterations)
           
protected  void sampleTopicsForOneDoc(FeatureSequence tokenSequence, FeatureSequence topicSequence)
           
 void setRandomSeed(int seed)
           
 void setTopicDisplay(int interval, int n)
           
 java.lang.String topWords(int numWords)
           
 void write(java.io.File f)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

data

protected java.util.ArrayList<TopicAssignment> data

alphabet

protected Alphabet alphabet

topicAlphabet

protected LabelAlphabet topicAlphabet

numTopics

protected int numTopics

numTypes

protected int numTypes

alpha

protected double alpha

alphaSum

protected double alphaSum

beta

protected double beta

betaSum

protected double betaSum

DEFAULT_BETA

public static final double DEFAULT_BETA
See Also:
Constant Field Values

oneDocTopicCounts

protected int[] oneDocTopicCounts

typeTopicCounts

protected int[][] typeTopicCounts

tokensPerTopic

protected int[] tokensPerTopic

showTopicsInterval

public int showTopicsInterval

wordsPerTopic

public int wordsPerTopic

random

protected Randoms random

formatter

protected java.text.NumberFormat formatter

printLogLikelihood

protected boolean printLogLikelihood
Constructor Detail

SimpleLDA

public SimpleLDA(int numberOfTopics)

SimpleLDA

public SimpleLDA(int numberOfTopics,
                 double alphaSum,
                 double beta)

SimpleLDA

public SimpleLDA(int numberOfTopics,
                 double alphaSum,
                 double beta,
                 Randoms random)

SimpleLDA

public SimpleLDA(LabelAlphabet topicAlphabet,
                 double alphaSum,
                 double beta,
                 Randoms random)
Method Detail

getAlphabet

public Alphabet getAlphabet()

getTopicAlphabet

public LabelAlphabet getTopicAlphabet()

getNumTopics

public int getNumTopics()

getData

public java.util.ArrayList<TopicAssignment> getData()

setTopicDisplay

public void setTopicDisplay(int interval,
                            int n)

setRandomSeed

public void setRandomSeed(int seed)

getTypeTopicCounts

public int[][] getTypeTopicCounts()

getTopicTotals

public int[] getTopicTotals()

addInstances

public void addInstances(InstanceList training)

sample

public void sample(int iterations)
            throws java.io.IOException
Throws:
java.io.IOException

sampleTopicsForOneDoc

protected void sampleTopicsForOneDoc(FeatureSequence tokenSequence,
                                     FeatureSequence topicSequence)

modelLogLikelihood

public double modelLogLikelihood()

topWords

public java.lang.String topWords(int numWords)

printDocumentTopics

public void printDocumentTopics(java.io.File file,
                                double threshold,
                                int max)
                         throws java.io.IOException
Parameters:
file - The file to print to
threshold - Only print topics with proportion greater than this number
max - Print no more than this many topics
Throws:
java.io.IOException

printState

public void printState(java.io.File f)
                throws java.io.IOException
Throws:
java.io.IOException

printState

public void printState(java.io.PrintStream out)

write

public void write(java.io.File f)

main

public static void main(java.lang.String[] args)
                 throws java.io.IOException
Throws:
java.io.IOException