Commit 98c2476a by Steven Bird

resolved merge conflicts

parents c83a82b5 c54edec6
@@ -8,26 +8,22 @@
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
from itertools import chain
from math import log
from nltk.probability import (FreqDist,
ConditionalProbDist,
ConditionalFreqDist,
LidstoneProbDist)
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, LidstoneProbDist
from nltk.util import ngrams
from nltk.model.api import ModelI
from nltk import compat
def _estimator(fdist, *estimator_args, **estimator_kwargs):
def _estimator(fdist, **estimator_kwargs):
"""
Default estimator function using a SimpleGoodTuringProbDist.
Default estimator function using a LidstoneProbDist.
"""
# can't be an instance method of NgramModel as they
# can't be pickled either.
return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
return LidstoneProbDist(fdist, 0.001, **estimator_kwargs)
@compat.python_2_unicode_compatible
@@ -37,27 +33,22 @@ class NgramModel(ModelI):
"""
def __init__(self, n, train, pad_left=True, pad_right=False,
estimator=None, *estimator_args, **estimator_kwargs):
estimator=None, **estimator_kwargs):
"""
Create an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
from the text and may allow generation of ngrams not seen during
training.
training. See model.doctest for more detailed testing.
>>> from nltk.corpus import brown
>>> from nltk.probability import LidstoneProbDist
>>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
>>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
>>> lm = NgramModel(3, brown.words(categories='news'))
>>> lm
<NgramModel with 91603 3-grams>
>>> lm._backoff
<NgramModel with 62888 2-grams>
>>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
... 'primary', 'election', 'produced', '``', 'no', 'evidence',
... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
>>> lm.entropy(brown.words(categories='humor'))
... # doctest: +ELLIPSIS
0.5776...
12.0399...
:param n: the order of the language model (ngram size)
:type n: int
@@ -70,14 +61,6 @@ class NgramModel(ModelI):
:param estimator: a function for generating a probability distribution
:type estimator: a function that takes a ConditionalFreqDist and
returns a ConditionalProbDist
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
Note: For backward-compatibility, if no arguments are specified, the
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
:param estimator_kwargs: Extra keyword arguments for the estimator
:type estimator_kwargs: (any)
"""
@@ -87,6 +70,9 @@ class NgramModel(ModelI):
assert(isinstance(pad_left, bool))
assert(isinstance(pad_right, bool))
self._lpad = ('',) * (n - 1) if pad_left else ()
self._rpad = ('',) * (n - 1) if pad_right else ()
# make sure n is greater than zero; if it is not, report its value in the assertion error
assert (n > 0), n
@@ -110,22 +96,30 @@ class NgramModel(ModelI):
if (train is not None) and isinstance(train[0], compat.string_types):
train = [train]
# we need to keep track of the number of word types we encounter
vocabulary = set()
for sent in train:
raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
for ngram in raw_ngrams:
self._ngrams.add(ngram)
context = tuple(ngram[:-1])
token = ngram[-1]
cfd[(context, token)] += 1
cfd[context][token] += 1
vocabulary.add(token)
self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)
# Unless number of bins is explicitly passed, we should use the number
# of word types encountered during training as the bins value.
# If right padding is on, this includes the padding symbol.
if 'bins' not in estimator_kwargs:
estimator_kwargs['bins'] = len(vocabulary)
self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
# recursively construct the lower-order models
if not self.is_unigram_model:
self._backoff = NgramModel(n-1, train,
pad_left, pad_right,
estimator,
*estimator_args,
**estimator_kwargs)
self._backoff_alphas = dict()
@@ -240,7 +234,12 @@ class NgramModel(ModelI):
return text
def _generate_one(self, context):
<<<<<<< HEAD
context = (self._lpad + tuple(context))[- self._n + 1:]
=======
context = (self._lpad + tuple(context))[-self._n + 1:]
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
if context in self:
return self[context].generate()
elif self._n > 1:
@@ -258,13 +257,20 @@ class NgramModel(ModelI):
:type text: list(str)
"""
e = 0.0
H = 0.0 # entropy is conventionally denoted by "H"
text = list(self._lpad) + text + list(self._rpad)
for i in range(self._n - 1, len(text)):
<<<<<<< HEAD
context = tuple(text[i - self._n + 1:i])
token = text[i]
e += self.logprob(token, context)
return e / float(len(text) - (self._n - 1))
=======
context = tuple(text[(i - self._n + 1):i])
token = text[i]
H += self.logprob(token, context)
return H / float(len(text) - (self._n - 1))
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
def perplexity(self, text):
"""
@@ -278,19 +284,25 @@ class NgramModel(ModelI):
return pow(2.0, self.entropy(text))
def __contains__(self, item):
<<<<<<< HEAD
return tuple(item) in self._probdist.freqdist
def __getitem__(self, item):
return self._probdist[tuple(item)]
=======
if not isinstance(item, tuple):
item = (item,)
return item in self._model
def __getitem__(self, item):
if not isinstance(item, tuple):
item = (item,)
return self._model[item]
>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
def __repr__(self):
return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
def teardown_module(module=None):
from nltk.corpus import brown
brown._unload()
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
@@ -659,7 +659,7 @@ class LidstoneProbDist(ProbDistI):
likelihood estimate of the resulting frequency distribution.
"""
SUM_TO_ONE = False
def __init__(self, freqdist, gamma, bins=None, override_N=None):
def __init__(self, freqdist, gamma, bins=None):
"""
Use the Lidstone estimate to create a probability distribution
for the experiment used to generate ``freqdist``.
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
===========
NGram Model
===========
>>> import nltk
>>> from nltk.model import NgramModel
################
A Simple Example
################
The purpose of this example is to demonstrate the correctness of the current
NgramModel implementation. For that reason we train on a small corpus so that
calculating probabilities by hand is tractable. We will compare the probabilities
we compute by hand to the ones the model yields; they should match.
Setup
-----
Below is a very small corpus, borrowed from one of the comments in this thread:
https://github.com/nltk/nltk/issues/367
>>> word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
This corpus has a property that will be important to us later: it has a
different number of word tokens than word types. The latter (also referred to
as the vocabulary) is the set of unique words in the text.
Let's save it to a variable.
>>> word_types = set(word_seq)
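
To make the distinction concrete (plain Python, nothing model-specific):

>>> len(word_seq)
6
>>> len(word_types)
3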
Next we need to choose a probability estimator (aka smoothing algorithm).
Once again, for the sake of simplicity let's use LaplaceProbDist.
>>> from nltk.probability import LaplaceProbDist as estimator
We are ready to initialize our ngram language model. For this example, let's
make it a trigram model.
>>> lm = NgramModel(3, word_seq, estimator=estimator, bins=len(word_types))
Please note the last argument to the NgramModel constructor. In NLTK parlance
this is called the ``bins`` parameter and it is passed on to the LaplaceProbDist
estimator. Failing to provide this argument currently almost always leads to
incorrect probability scores.
Testing Probability Scores
--------------------------
Now that we have the language model set up, let's see what probability it produces
for a trigram seen during training.
>>> lm.prob('foo', ('foo', 'foo'))
0.5
To make sure we're on the right track, let's compute this probability by hand.
Since the trigram was seen, P(foo | foo, foo) simply translates into:
(count(foo, foo, foo) + 1) / (count(foo, foo) + bins * 1)
If we plug in numbers we have:
(2 + 1) / (3 + 3) = 3/6 = 0.5
So far our model is on track!
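
We can double-check that arithmetic with plain Python; the standard ``fractions``
module used below is just a convenience for exact arithmetic and is not part of NLTK:

>>> from fractions import Fraction
>>> Fraction(2 + 1, 3 + 3)
Fraction(1, 2)

This is also where the ``bins`` argument earns its keep: had the estimator been
given only the two outcomes actually observed after "foo foo" as its number of
bins, the same estimate would have come out as (2 + 1) / (3 + 2) = 0.6 rather
than 0.5.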
But what if we plug in a trigram that wasn't in the training corpus?
>>> lm.prob('baz', ('foo', 'foo'))
0.16666...
Let's verify this result by hand. The current implementation of NgramModel uses
Katz backoff, which means that P(baz | foo, foo) becomes:
alpha(foo, foo) * P(baz | foo)
where alpha(foo, foo)
= (1 - sum(P(w | foo, foo) for w in W)) / (1 - sum(P(w | foo) for w in W))
where W is all the words that followed the bigram "foo foo", namely the list [foo, bar].
Thus the sum in the numerator will be:
P(foo | foo, foo) + P(bar | foo, foo)
We already know the first term of this sum, and if we plug in the analogous
counts for P(bar | foo, foo), we arrive at:
3/6 + 2/6 = 5/6
We subtract this from 1 to arrive at the numerator of 1/6.
Next we do the same for the denominator, with the difference that this time we're
conditioning on the context "foo" instead of "foo foo".
P(foo | foo) + P(bar | foo) = 4/7 + 2/7 = 6/7
1 - 6/7 = 1/7
If we combine the numerator with the denominator we get 7/6.
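
Checking those steps with the ``Fraction`` helper imported above:

>>> 1 - (Fraction(3, 6) + Fraction(2, 6))   # numerator of alpha(foo, foo)
Fraction(1, 6)
>>> 1 - (Fraction(4, 7) + Fraction(2, 7))   # denominator of alpha(foo, foo)
Fraction(1, 7)
>>> Fraction(1, 6) / Fraction(1, 7)         # alpha(foo, foo)
Fraction(7, 6)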
This is alpha(foo, foo). Now all we need is P(baz | foo).
However, since our training text contains no instances of the bigram "foo baz",
we have to perform the same backoff calculation as we just did for the trigram
"foo foo baz", i.e.
P(baz | foo) = alpha(foo) * P(baz)
The alpha this time is equal to:
(1 - (P(foo | foo) + P(bar | foo))) / (1 - (P(foo) + P(bar)))
We already have the numerator from the previous calculation: it's 1/7.
The denominator comes from the unigram probabilities for "foo" and "bar", making it:
1 - (5/9 + 2/9) = 2/9
Thus we have alpha(foo) = 9/14
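
Again as a quick check:

>>> Fraction(1, 7) / Fraction(2, 9)         # alpha(foo)
Fraction(9, 14)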
Combine this with the unigram probability P(baz) = (1 + 1) / (6 + 3) = 2/9 and we get:
P(baz | foo) = 1/7
Then we combine this with alpha(foo, foo) to arrive at:
P(baz | foo, foo) = 7/6 * 1/7 = 1/6 = 0.16666...
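
The last two multiplications, checked the same way:

>>> Fraction(9, 14) * Fraction(2, 9)        # P(baz | foo)
Fraction(1, 7)
>>> Fraction(7, 6) * Fraction(1, 7)         # P(baz | foo, foo)
Fraction(1, 6)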
Our model is correct again!
Pickling and unpickling
-----------------------
We currently don't have a doctest for this because NgramModel comparison doesn't
work. One will be added as soon as that's fixed.