edx / nltk · Commits

Commit 80a66033, authored Dec 01, 2011 by Steven Bird

    adopted new version contributed by Dan Blanchard; cf issue 673

parent 55c41438

Showing 1 changed file with 89 additions and 36 deletions.

nltk/model/ngram.py  (+89, -36)
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2011 NLTK Project
-# Author: Steven Bird <sb@csse.unimelb.edu.au>
+# Authors: Steven Bird <sb@csse.unimelb.edu.au>
+#          Daniel Blanchard <dan.blanchard@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import random

from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ingrams
from nltk.model.api import ModelI


def _estimator(fdist, bins):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return SimpleGoodTuringProbDist(fdist)


class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.

+    >>> from nltk.corpus import brown
+    >>> from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    >>> import textwrap
+    >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+    >>> # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
+    >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+    >>> text = lm.generate(100)
+    >>> print '\n'.join(textwrap.wrap(' '.join(text)))
    """

    # add cutoff
-    def __init__(self, n, train, estimator=None):
+    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
...
@@ -48,16 +40,27 @@ class NgramModel(ModelI):
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
-        :type train: list(str)
+        :type train: list of string
        :param estimator: a function for generating a probability distribution
-        :type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
+        :type estimator: a function that takes a ConditionalFreqDist and
+            returns a ConditionalProbDist
+        :param estimator_args: Extra arguments for estimator.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+            Note: For backward-compatibility, if no arguments are specified, the
+            number of bins in the underlying ConditionalFreqDist are passed to
+            the estimator as an argument.
+        :type estimator_args: (any)
+        :param estimator_kw_args: Extra keyword arguments for estimator.
+        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)
...
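What the new *estimator_args / **estimator_kw_args parameters buy in practice: an estimator that needs extra arguments no longer has to be wrapped in a lambda. A minimal usage sketch, not part of the commit, assuming NLTK 2.x where the class is exposed as nltk.model.NgramModel:

    from nltk.corpus import brown
    from nltk.model import NgramModel
    from nltk.probability import LidstoneProbDist

    # Old style: close over the extra argument (here the Lidstone gamma, 0.2).
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), est)

    # New style: pass the factory and its extra arguments directly; NgramModel
    # forwards them through ConditionalProbDist to each condition's estimator.
    lm = NgramModel(3, brown.words(categories='news'), LidstoneProbDist, 0.2)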
@@ -68,26 +71,30 @@ class NgramModel(ModelI):
            token = ngram[-1]
            cfd[context].inc(token)

-        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        if (not estimator_args) and (not estimator_kw_args):
+            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        else:
+            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
+                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
-            self._backoff = NgramModel(n-1, train, estimator)
+            self._backoff = NgramModel(n-1, train, estimator,
+                                       *estimator_args, **estimator_kw_args)

    # Katz Backoff probability
    def prob(self, word, context):
        """
-        Evaluate the probability of this word in this context.
+        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """

        context = tuple(context)
-        if context + (word,) in self._ngrams:
+        if (context + (word,) in self._ngrams) or (self._n == 1):
            return self[context].prob(word)
-        elif self._n > 1:
-            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
-            raise RuntimeError("No probability mass assigned to word %s in "
-                               "context %s" % (word, ' '.join(context)))
+            return self._alpha(context) * self._backoff.prob(word, context[1:])

    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])
...
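In symbols, the rewritten prob() computes P(w | c) as the estimator's discounted P_est(w | c) when the full n-gram c + (w,) was seen in training (or when n == 1), and alpha(c) * P_backoff(w | c[1:]) otherwise, where _alpha(c) = _beta(c) / _backoff._beta(c[1:]) rescales the probability mass the estimator held back for unseen events. A short probe, not part of the commit; the words are hypothetical and assume the lm from the class docstring:

    # If ('County', 'Grand', 'Jury') occurred in the training text, this is the
    # discounted trigram estimate; otherwise it is
    # lm._alpha(('County', 'Grand')) times the backoff bigram probability.
    print lm.prob('Jury', ('County', 'Grand'))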
@@ -101,18 +108,37 @@ class NgramModel(ModelI):
    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
        """

        return -log(self.prob(word, context), 2)

    def choose_random_word(self, context):
-        '''Randomly select a word that is likely to appear in this context.'''
+        '''
+        Randomly select a word that is likely to appear in this context.
+
+        :param context: the context the word is in
+        :type context: list(str)
+        '''

        return self.generate(1, context)[-1]

    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
-        '''Generate random text based on the language model.'''
+        '''
+        Generate random text based on the language model.
+
+        :param num_words: number of words to generate
+        :type num_words: int
+        :param context: initial words in generated string
+        :type context: list(str)
+        '''

        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
...
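As the "NB" comment notes, text generated from the default empty context tends to open the same way, because training prepends the same empty-string prefix to the single training text. A small sketch, not part of the commit, with hypothetical seed words:

    print lm.generate(5)                     # opens with the model's usual first words
    print lm.generate(5, ('The', 'Fulton'))  # seed generation with an explicit context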
@@ -130,16 +156,33 @@ class NgramModel(ModelI):
    def entropy(self, text):
        """
-        Evaluate the total entropy of a text with respect to the model.
-        This is the sum of the log probability of each word in the message.
+        Calculate the approximate cross-entropy of the n-gram model for a
+        given evaluation text.
+        This is the average log probability of each word in the text.
+
+        :param text: words to use for evaluation
+        :type text: list(str)
        """

        e = 0.0
-        for i in range(self._n - 1, len(text)):
+        # Add prefix to front to correctly handle first n-1 words
+        text = list(self._prefix) + text
+        for i in range(len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
-        return e
+        return e / float(len(text) - (self._n - 1))
+
+    def perplexity(self, text):
+        """
+        Calculates the perplexity of the given text.
+        This is simply 2 ** cross-entropy for the text.
+
+        :param text: words to calculate perplexity of
+        :type text: list(str)
+        """
+
+        return pow(2.0, self.entropy(text))

    def __contains__(self, item):
        return tuple(item) in self._model
...
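The evaluation semantics change here: entropy() used to return the total negative log probability of the text, while the new version prepends the training prefix and averages, giving cross-entropy in bits per word; perplexity() is then simply 2 ** entropy. A hedged sketch, not part of the commit, assuming the lm above and that every test word is scorable under the smoothed model:

    test = list(brown.words(categories='editorial'))[:100]
    print lm.entropy(test)     # average -log2 P(word | context), in bits
    print lm.perplexity(test)  # equals pow(2.0, lm.entropy(test))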
@@ -151,6 +194,16 @@ class NgramModel(ModelI):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


+if __name__ == "__main__":
+    import doctest
+    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
-
-def demo():
-    from nltk.corpus import brown
-    from nltk.probability import LidstoneProbDist, WittenBellProbDist
-    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
-    lm = NgramModel(3, brown.words(categories='news'), estimator)
-    print lm
-    #print lm.entropy(sent)
-    text = lm.generate(100)
-    import textwrap
-    print '\n'.join(textwrap.wrap(' '.join(text)))
-
-if __name__ == '__main__':
-    demo()