Commit 80a66033 by Steven Bird

adopted new version contributed by Dan Blanchard; cf issue 673

parent 55c41438
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2011 NLTK Project
# Authors: Steven Bird <sb@csse.unimelb.edu.au>
#          Daniel Blanchard <dan.blanchard@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
import random
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              SimpleGoodTuringProbDist)
from nltk.util import ingrams
from nltk.model.api import ModelI

def _estimator(fdist, bins):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return SimpleGoodTuringProbDist(fdist)

class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.

    >>> from nltk.corpus import brown
    >>> from nltk.probability import LidstoneProbDist, WittenBellProbDist
    >>> import textwrap
    >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    >>> # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
    >>> text = lm.generate(100)
    >>> print '\n'.join(textwrap.wrap(' '.join(text)))
    """

    # add cutoff
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
@@ -48,16 +40,27 @@ class NgramModel(ModelI):
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist is passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """
        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)
@@ -68,26 +71,30 @@ class NgramModel(ModelI):
            token = ngram[-1]
            cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)

    # Katz Backoff probability
    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or (self._n == 1):
            return self[context].prob(word)
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])
    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])
@@ -101,18 +108,37 @@ class NgramModel(ModelI):
    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -log(self.prob(word, context), 2)

    def choose_random_word(self, context):
        '''
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)
        '''
        return self.generate(1, context)[-1]

    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)
        '''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
@@ -130,16 +156,33 @@ class NgramModel(ModelI):
    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.
        This is the average negative log probability of each word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """
        e = 0.0
        # Add prefix to front to correctly handle first n-1 words
        text = list(self._prefix) + text
        # Score only the original words; the padding tokens merely supply context.
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i-self._n+1:i])
            token = text[i]
            e += self.logprob(token, context)
        return e / float(len(text) - (self._n-1))
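    # In other words, entropy() returns
    #     H(text) = -(1/N) * sum_{i=1..N} log2 P(w_i | w_{i-n+1} ... w_{i-1})
    # where N is the number of words in the unpadded text and the earliest
    # contexts are filled out with the empty-string prefix added above.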

    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.
        This is simply 2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """
        return pow(2.0, self.entropy(text))
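    # e.g. a model that gives each word an average probability of 1/256
    # (a cross-entropy of 8 bits per word) has perplexity 2 ** 8 == 256.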

    def __contains__(self, item):
        return tuple(item) in self._model
@@ -151,6 +194,16 @@ class NgramModel(ModelI):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm
    #print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()
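
Beyond demo(), the following is a minimal usage sketch of the interface introduced by this commit. The estimator, corpus category, evaluation slice and gamma value are arbitrary illustrative choices, and LidstoneProbDist is assumed to accept (freqdist, gamma, bins=None) as in nltk.probability:

    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist

    train_words = brown.words(categories='news')

    # Train a trigram model, passing the Lidstone gamma through the new
    # estimator_args mechanism instead of wrapping the estimator in a lambda.
    lm = NgramModel(3, train_words, LidstoneProbDist, 0.2)

    # Probability of a word given its bigram context (the opening words of the
    # news category, so the trigram is guaranteed to have been seen).
    print lm.prob('County', ('The', 'Fulton'))

    # Per-word cross-entropy and perplexity over the first 100 training words.
    eval_words = list(train_words)[:100]
    print lm.entropy(eval_words)
    print lm.perplexity(eval_words)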