Commit 98c2476a by Steven Bird

resolved merge conflicts

parents c83a82b5 c54edec6
@@ -8,26 +8,22 @@
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
from itertools import chain
from math import log
from nltk.probability import (FreqDist,
ConditionalProbDist,
ConditionalFreqDist,
LidstoneProbDist)
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, LidstoneProbDist
from nltk.util import ngrams
from nltk.model.api import ModelI
from nltk import compat
def _estimator(fdist, *estimator_args, **estimator_kwargs):
def _estimator(fdist, **estimator_kwargs):
"""
Default estimator function using a SimpleGoodTuringProbDist.
Default estimator function using a LidstoneProbDist.
"""
# can't be an instance method of NgramModel as they
# can't be pickled either.
return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
return LidstoneProbDist(fdist, 0.001, **estimator_kwargs)
@compat.python_2_unicode_compatible
@@ -37,27 +33,22 @@ class NgramModel(ModelI):
"""
def __init__(self, n, train, pad_left=True, pad_right=False,
estimator=None, *estimator_args, **estimator_kwargs):
estimator=None, **estimator_kwargs):
"""
Create an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
from the text and may allow generation of ngrams not seen during
training.
training. See model.doctest for more detailed testing.
>>> from nltk.corpus import brown
>>> from nltk.probability import LidstoneProbDist
>>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
>>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
>>> lm = NgramModel(3, brown.words(categories='news'))
>>> lm
<NgramModel with 91603 3-grams>
>>> lm._backoff
<NgramModel with 62888 2-grams>
>>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
... 'primary', 'election', 'produced', '``', 'no', 'evidence',
... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
>>> lm.entropy(brown.words(categories='humor'))
... # doctest: +ELLIPSIS
0.5776...
12.0399...
:param n: the order of the language model (ngram size)
:type n: int
@@ -70,14 +61,6 @@ class NgramModel(ModelI):
:param estimator: a function for generating a probability distribution
:type estimator: a function that takes a ConditionalFreqDist and
returns a ConditionalProbDist
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
Note: For backward-compatibility, if no arguments are specified, the
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
:param estimator_kwargs: Extra keyword arguments for the estimator
:type estimator_kwargs: (any)
"""
@@ -87,6 +70,9 @@ class NgramModel(ModelI):
assert(isinstance(pad_left, bool))
assert(isinstance(pad_right, bool))
self._lpad = ('',) * (n - 1) if pad_left else ()
self._rpad = ('',) * (n - 1) if pad_right else ()
# make sure n is greater than zero; if it is not, report its value in the assertion error
assert (n > 0), n
@@ -110,22 +96,30 @@ class NgramModel(ModelI):
if (train is not None) and isinstance(train[0], compat.string_types):
train = [train]
# we need to keep track of the number of word types we encounter
vocabulary = set()
for sent in train:
raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
for ngram in raw_ngrams:
self._ngrams.add(ngram)
context = tuple(ngram[:-1])
token = ngram[-1]
cfd[(context, token)] += 1
cfd[context][token] += 1
vocabulary.add(token)
self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)
# Unless number of bins is explicitly passed, we should use the number
# of word types encountered during training as the bins value.
# If right padding is on, this includes the padding symbol.
if 'bins' not in estimator_kwargs:
estimator_kwargs['bins'] = len(vocabulary)
self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
# recursively construct the lower-order models
if not self.is_unigram_model:
self._backoff = NgramModel(n-1, train,
pad_left, pad_right,
estimator,
*estimator_args,
**estimator_kwargs)
self._backoff_alphas = dict()
@@ -240,7 +234,12 @@ class NgramModel(ModelI):
return text
def _generate_one(self, context):
<<<<<<< HEAD
context = (self._lpad + tuple(context))[- self._n + 1:]
=======
context = (self._lpad + tuple(context))[-self._n + 1:]
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
if context in self:
return self[context].generate()
elif self._n > 1:
@@ -258,13 +257,20 @@ class NgramModel(ModelI):
:type text: list(str)
"""
e = 0.0
H = 0.0 # entropy is conventionally denoted by "H"
text = list(self._lpad) + text + list(self._rpad)
for i in range(self._n - 1, len(text)):
<<<<<<< HEAD
context = tuple(text[i - self._n + 1:i])
token = text[i]
e += self.logprob(token, context)
return e / float(len(text) - (self._n - 1))
=======
context = tuple(text[(i - self._n + 1):i])
token = text[i]
H += self.logprob(token, context)
return H / float(len(text) - (self._n - 1))
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
def perplexity(self, text):
"""
@@ -278,19 +284,25 @@ class NgramModel(ModelI):
return pow(2.0, self.entropy(text))
def __contains__(self, item):
<<<<<<< HEAD
return tuple(item) in self._probdist.freqdist
def __getitem__(self, item):
return self._probdist[tuple(item)]
=======
if not isinstance(item, tuple):
item = (item,)
return item in self._model
def __getitem__(self, item):
if not isinstance(item, tuple):
item = (item,)
return self._model[item]
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
def __repr__(self):
return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
def teardown_module(module=None):
from nltk.corpus import brown
brown._unload()
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
@@ -659,7 +659,7 @@ class LidstoneProbDist(ProbDistI):
likelihood estimate of the resulting frequency distribution.
"""
SUM_TO_ONE = False
def __init__(self, freqdist, gamma, bins=None, override_N=None):
def __init__(self, freqdist, gamma, bins=None):
"""
Use the Lidstone estimate to create a probability distribution
for the experiment used to generate ``freqdist``.
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
===========
NGram Model
===========
>>> import nltk
>>> from nltk.model import NgramModel
################
A Simple Example
################
The purpose of this example is to demonstrate the correctness of the current
NgramModel implementation. For that reason we train on a small corpus so that
calculating probabilities by hand is tractable. We will compare the probabilities
we compute by hand to the ones the model yields; they should match.
Setup
-----
Below is a very small corpus, borrowed from one of the comments in this thread:
https://github.com/nltk/nltk/issues/367
>>> word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
This corpus has a property that will be important to us later: it has a
different number of word tokens than word types. The latter (also referred to
as the vocabulary) is the set of unique words in the text.
Let's save it to a variable.
>>> word_types = set(word_seq)
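
To make the distinction concrete (plain Python, nothing model-specific):

>>> len(word_seq)
6
>>> len(word_types)
3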
Next we need to choose a probability estimator (aka smoothing algorithm).
Once again, for the sake of simplicity let's use LaplaceProbDist.
>>> from nltk.probability import LaplaceProbDist as estimator
We are ready to initialize our ngram language model. For this example, let's
make it a trigram model.
>>> lm = NgramModel(3, word_seq, estimator=estimator, bins=len(word_types))
Please note the last argument to the NgramModel constructor. In NLTK parlance
this is called the ``bins`` parameter and it is passed on to the LaplaceProbDist
estimator. Failing to provide this argument currently almost always leads to
incorrect probability scores.
Testing Probability Scores
--------------------------
Now that we have the language model set up, let's see what probability it produces
for a trigram seen during training.
>>> lm.prob('foo', ('foo', 'foo'))
0.5
To make sure we're on the right track, let's compute this probability by hand.
Since the trigram was seen, P(foo | foo, foo) simply translates into:
(count(foo, foo, foo) + 1) / (count(foo, foo) + bins * 1)
If we plug in numbers we have:
(2 + 1) / (3 + 3) = 3/6 = 0.5
So far our model is on track!
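
We can double-check that arithmetic with plain Python; the standard ``fractions``
module used below is just a convenience for exact arithmetic and is not part of NLTK:

>>> from fractions import Fraction
>>> Fraction(2 + 1, 3 + 3)
Fraction(1, 2)

This is also where the ``bins`` argument earns its keep: had the estimator been
given only the two outcomes actually observed after "foo foo" as its number of
bins, the same estimate would have come out as (2 + 1) / (3 + 2) = 0.6 rather
than 0.5.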
But what if we plug in a trigram that wasn't in the training corpus?
>>> lm.prob('baz', ('foo', 'foo'))
0.16666...
Let's verify this result by hand. The current implementation of NgramModel uses
Katz backoff, which means that P(baz | foo, foo) becomes:
alpha(foo, foo) * P(baz | foo)
where alpha(foo, foo)
= (1 - sum(P(w | foo, foo) for w in W)) / (1 - sum(P(w | foo) for w in W))
where W is all the words that followed the bigram "foo foo", namely the list [foo, bar].
Thus the sum in the numerator will be:
P(foo | foo, foo) + P(bar | foo, foo)
We already know the first term of this sum, and if we plug in the analogous
counts for P(bar | foo, foo), we arrive at:
3/6 + 2/6 = 5/6
We subtract this from 1 to arrive at the numerator of 1/6.
Next we do the same for the denominator, with the difference that this time we're
conditioning on the context "foo" instead of "foo foo".
P(foo | foo) + P(bar | foo) = 4/7 + 2/7 = 6/7
1 - 6/7 = 1/7
If we combine the numerator with the denominator we get 7/6.
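
Checking those steps with the ``Fraction`` helper imported above:

>>> 1 - (Fraction(3, 6) + Fraction(2, 6))   # numerator of alpha(foo, foo)
Fraction(1, 6)
>>> 1 - (Fraction(4, 7) + Fraction(2, 7))   # denominator of alpha(foo, foo)
Fraction(1, 7)
>>> Fraction(1, 6) / Fraction(1, 7)         # alpha(foo, foo)
Fraction(7, 6)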
This is alpha(foo, foo). Now all we need is P(baz | foo).
However, since our training text contains no instances of the bigram "foo baz",
we have to perform the same backoff calculation as we just did for the trigram
"foo foo baz", i.e.
P(baz | foo) = alpha(foo) * P(baz)
The alpha this time is equal to:
(1 - (P(foo | foo) + P(bar | foo))) / (1 - (P(foo) + P(bar)))
We already have the numerator from the previous calculation: it's 1/7.
The denominator comes from the unigram probabilities for "foo" and "bar", making it:
1 - (5/9 + 2/9) = 2/9
Thus we have alpha(foo) = 9/14
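
Again as a quick check:

>>> Fraction(1, 7) / Fraction(2, 9)         # alpha(foo)
Fraction(9, 14)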
Combine this with the unigram probability P(baz) = (1 + 1) / (6 + 3) = 2/9 and we get:
P(baz | foo) = 1/7
Then we combine this with alpha(foo, foo) to arrive at:
P(baz | foo, foo) = 7/6 * 1/7 = 1/6 = 0.16666...
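
The last two multiplications, checked the same way:

>>> Fraction(9, 14) * Fraction(2, 9)        # P(baz | foo)
Fraction(1, 7)
>>> Fraction(7, 6) * Fraction(1, 7)         # P(baz | foo, foo)
Fraction(1, 6)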
Our model is correct again!
Pickling and unpickling
-----------------------
We currently don't have a doctest for this because NgramModel comparison doesn't
work. One will be added as soon as that's fixed.