|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object cc.mallet.topics.ParallelTopicModel
public class ParallelTopicModel
Simple parallel threaded implementation of LDA, following Newman, Asuncion, Smyth and Welling, "Distributed Algorithms for Topic Models", JMLR (2009), with SparseLDA sampling scheme and data structure from Yao, Mimno and McCallum, "Efficient Methods for Topic Model Inference on Streaming Document Collections", KDD (2009).
Field Summary | |
---|---|
double[] |
alpha
|
Alphabet |
alphabet
|
double |
alphaSum
|
double |
beta
|
double |
betaSum
|
int |
burninPeriod
|
java.util.ArrayList<TopicAssignment> |
data
|
static double |
DEFAULT_BETA
|
int[] |
docLengthCounts
|
java.text.NumberFormat |
formatter
|
static java.util.logging.Logger |
logger
|
java.lang.String |
modelFilename
|
int |
numIterations
|
int |
numTopics
|
int |
numTypes
|
int |
optimizeInterval
|
boolean |
printLogLikelihood
|
int |
randomSeed
|
int |
saveModelInterval
|
int |
saveSampleInterval
|
int |
saveStateInterval
|
int |
showTopicsInterval
|
java.lang.String |
stateFilename
|
int |
temperingInterval
|
int[] |
tokensPerTopic
|
LabelAlphabet |
topicAlphabet
|
int |
topicBits
|
int[][] |
topicDocCounts
|
int |
topicMask
|
int |
totalTokens
|
int[][] |
typeTopicCounts
|
static int |
UNASSIGNED_TOPIC
|
boolean |
usingSymmetricAlpha
|
int |
wordsPerTopic
|
Constructor Summary | |
---|---|
ParallelTopicModel(int numberOfTopics)
|
|
ParallelTopicModel(int numberOfTopics,
double alphaSum,
double beta)
|
|
ParallelTopicModel(LabelAlphabet topicAlphabet,
double alphaSum,
double beta)
|
Method Summary | |
---|---|
void |
addInstances(InstanceList training)
|
void |
buildInitialTypeTopicCounts()
|
java.lang.String |
displayTopWords(int numWords,
boolean usingNewLines)
|
void |
estimate()
|
Alphabet |
getAlphabet()
|
java.util.ArrayList<TopicAssignment> |
getData()
|
TopicInferencer |
getInferencer()
Return a tool for estimating topic distributions for new documents |
int |
getNumTopics()
|
MarginalProbEstimator |
getProbEstimator()
Return a tool for evaluating the marginal probability of new documents under this model |
java.util.ArrayList<java.util.TreeSet<IDSorter>> |
getSortedWords()
Return an array of sorted sets (one set per topic). |
LabelAlphabet |
getTopicAlphabet()
|
double[] |
getTopicProbabilities(int instanceID)
Get the smoothed distribution over topics for a training instance. |
double[] |
getTopicProbabilities(LabelSequence topics)
Get the smoothed distribution over topics for a topic sequence, which may be from the training set or from a new instance with topics assigned by an inferencer. |
java.lang.Object[][] |
getTopWords(int numWords)
Return an array (one element for each topic) of arrays of words, which are the most probable words for that topic in descending order. |
void |
initializeFromState(java.io.File stateFile)
|
static void |
main(java.lang.String[] args)
|
double |
modelLogLikelihood()
|
void |
optimizeAlpha(WorkerRunnable[] runnables)
|
void |
optimizeBeta(WorkerRunnable[] runnables)
|
void |
printDocumentTopics(java.io.File file)
|
void |
printDocumentTopics(java.io.PrintWriter out)
|
void |
printDocumentTopics(java.io.PrintWriter out,
double threshold,
int max)
|
void |
printState(java.io.File f)
|
void |
printState(java.io.PrintStream out)
|
void |
printTopicWordWeights(java.io.File file)
|
void |
printTopicWordWeights(java.io.PrintWriter out)
Print an unnormalized weight for every word in every topic. |
void |
printTopWords(java.io.File file,
int numWords,
boolean useNewLines)
|
void |
printTopWords(java.io.PrintStream out,
int numWords,
boolean usingNewLines)
|
void |
printTypeTopicCounts(java.io.File file)
Write the internal representation of type-topic counts (count/topic pairs in descending order by count) to a file. |
static ParallelTopicModel |
read(java.io.File f)
|
void |
setBurninPeriod(int burninPeriod)
|
void |
setNumIterations(int numIterations)
|
void |
setNumThreads(int threads)
|
void |
setOptimizeInterval(int interval)
Interval for optimizing Dirichlet hyperparameters |
void |
setRandomSeed(int seed)
|
void |
setSaveSerializedModel(int interval,
java.lang.String filename)
Define how often and where to save a serialized model. |
void |
setSaveState(int interval,
java.lang.String filename)
Define how often and where to save a text representation of the current state. |
void |
setSymmetricAlpha(boolean b)
|
void |
setTemperingInterval(int interval)
|
void |
setTopicDisplay(int interval,
int n)
|
void |
sumTypeTopicCounts(WorkerRunnable[] runnables)
|
void |
temperAlpha(WorkerRunnable[] runnables)
|
void |
topicPhraseXMLReport(java.io.PrintWriter out,
int numWords)
|
void |
topicXMLReport(java.io.PrintWriter out,
int numWords)
|
void |
write(java.io.File serializedModelFile)
|
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
public static final int UNASSIGNED_TOPIC
public static java.util.logging.Logger logger
public java.util.ArrayList<TopicAssignment> data
public Alphabet alphabet
public LabelAlphabet topicAlphabet
public int numTopics
public int topicMask
public int topicBits
public int numTypes
public int totalTokens
public double[] alpha
public double alphaSum
public double beta
public double betaSum
public boolean usingSymmetricAlpha
public static final double DEFAULT_BETA
public int[][] typeTopicCounts
public int[] tokensPerTopic
public int[] docLengthCounts
public int[][] topicDocCounts
public int numIterations
public int burninPeriod
public int saveSampleInterval
public int optimizeInterval
public int temperingInterval
public int showTopicsInterval
public int wordsPerTopic
public int saveStateInterval
public java.lang.String stateFilename
public int saveModelInterval
public java.lang.String modelFilename
public int randomSeed
public java.text.NumberFormat formatter
public boolean printLogLikelihood
Constructor Detail |
---|
public ParallelTopicModel(int numberOfTopics)
public ParallelTopicModel(int numberOfTopics, double alphaSum, double beta)
public ParallelTopicModel(LabelAlphabet topicAlphabet, double alphaSum, double beta)
Method Detail |
---|
public Alphabet getAlphabet()
public LabelAlphabet getTopicAlphabet()
public int getNumTopics()
public java.util.ArrayList<TopicAssignment> getData()
public void setNumIterations(int numIterations)
public void setBurninPeriod(int burninPeriod)
public void setTopicDisplay(int interval, int n)
public void setRandomSeed(int seed)
public void setOptimizeInterval(int interval)
public void setSymmetricAlpha(boolean b)
public void setTemperingInterval(int interval)
public void setNumThreads(int threads)
public void setSaveState(int interval, java.lang.String filename)
Parameters:
interval - Save a copy of the state every interval iterations.
filename - Save the state to this file, with the iteration number as a suffix.
public void setSaveSerializedModel(int interval, java.lang.String filename)
Parameters:
interval - Save a serialized model every interval iterations.
filename - Save to this file, with the iteration number as a suffix.
public void addInstances(InstanceList training)
public void initializeFromState(java.io.File stateFile) throws java.io.IOException
Throws: java.io.IOException
public void buildInitialTypeTopicCounts()
public void sumTypeTopicCounts(WorkerRunnable[] runnables)
public void optimizeAlpha(WorkerRunnable[] runnables)
public void temperAlpha(WorkerRunnable[] runnables)
public void optimizeBeta(WorkerRunnable[] runnables)
public void estimate() throws java.io.IOException
Throws: java.io.IOException
public void printTopWords(java.io.File file, int numWords, boolean useNewLines) throws java.io.IOException
Throws: java.io.IOException
public java.util.ArrayList<java.util.TreeSet<IDSorter>> getSortedWords()
public java.lang.Object[][] getTopWords(int numWords)
Parameters:
numWords - The maximum length of each topic's array of words (may be less).
public void printTopWords(java.io.PrintStream out, int numWords, boolean usingNewLines)
public java.lang.String displayTopWords(int numWords, boolean usingNewLines)
public void topicXMLReport(java.io.PrintWriter out, int numWords)
public void topicPhraseXMLReport(java.io.PrintWriter out, int numWords)
public void printTypeTopicCounts(java.io.File file) throws java.io.IOException
Throws: java.io.IOException
public void printTopicWordWeights(java.io.File file) throws java.io.IOException
Throws: java.io.IOException
public void printTopicWordWeights(java.io.PrintWriter out) throws java.io.IOException
Throws: java.io.IOException
public double[] getTopicProbabilities(int instanceID)
public double[] getTopicProbabilities(LabelSequence topics)
public void printDocumentTopics(java.io.File file) throws java.io.IOException
Throws: java.io.IOException
public void printDocumentTopics(java.io.PrintWriter out)
public void printDocumentTopics(java.io.PrintWriter out, double threshold, int max)
Parameters:
out - A print writer.
threshold - Only print topics with proportion greater than this number.
max - Print no more than this many topics.
public void printState(java.io.File f) throws java.io.IOException
Throws: java.io.IOException
public void printState(java.io.PrintStream out)
public double modelLogLikelihood()
public TopicInferencer getInferencer()
public MarginalProbEstimator getProbEstimator()
public void write(java.io.File serializedModelFile)
public static ParallelTopicModel read(java.io.File f) throws java.lang.Exception
Throws: java.lang.Exception
public static void main(java.lang.String[] args)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |