edx / nltk · Commits

Commit 80a66033, authored Dec 01, 2011 by Steven Bird

    adopted new version contributed by Dan Blanchard; cf issue 673

parent 55c41438

Showing 1 changed file with 89 additions and 36 deletions.

nltk/model/ngram.py  (+89, -36)
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2011 NLTK Project
-# Author: Steven Bird <sb@csse.unimelb.edu.au>
+# Authors: Steven Bird <sb@csse.unimelb.edu.au>
+#          Daniel Blanchard <dan.blanchard@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import random

from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist, SimpleGoodTuringProbDist)
from nltk.util import ingrams
from nltk.model.api import ModelI


def _estimator(fdist, bins):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return SimpleGoodTuringProbDist(fdist)


class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.

+    >>> from nltk.corpus import brown
+    >>> from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    >>> import textwrap
+    >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+    >>> # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
+    >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+    >>> text = lm.generate(100)
+    >>> print '\n'.join(textwrap.wrap(' '.join(text)))
    """

    # add cutoff
-    def __init__(self, n, train, estimator=None):
+    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
...
@@ -48,16 +40,27 @@ class NgramModel(ModelI):
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
-        :type train: list(str)
+        :type train: list of string
        :param estimator: a function for generating a probability distribution
-        :type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
+        :type estimator: a function that takes a ConditionalFreqDist and
+            returns a ConditionalProbDist
+        :param estimator_args: Extra arguments for estimator.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+            Note: For backward-compatibility, if no arguments are specified, the
+            number of bins in the underlying ConditionalFreqDist are passed to
+            the estimator as an argument.
+        :type estimator_args: (any)
+        :param estimator_kw_args: Extra keyword arguments for estimator.
+        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)
...
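What the new *estimator_args / **estimator_kw_args parameters buy in practice: an estimator that needs extra arguments no longer has to be wrapped in a lambda. A minimal usage sketch, not part of the commit, assuming NLTK 2.x where the class is exposed as nltk.model.NgramModel:

    from nltk.corpus import brown
    from nltk.model import NgramModel
    from nltk.probability import LidstoneProbDist

    # Old style: close over the extra argument (here the Lidstone gamma, 0.2).
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), est)

    # New style: pass the factory and its extra arguments directly; NgramModel
    # forwards them through ConditionalProbDist to each condition's estimator.
    lm = NgramModel(3, brown.words(categories='news'), LidstoneProbDist, 0.2)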
@@ -68,26 +71,30 @@ class NgramModel(ModelI):
            token = ngram[-1]
            cfd[context].inc(token)

-        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        if (not estimator_args) and (not estimator_kw_args):
+            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        else:
+            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
+                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
-            self._backoff = NgramModel(n-1, train, estimator)
+            self._backoff = NgramModel(n-1, train, estimator,
+                                       *estimator_args, **estimator_kw_args)

    # Katz Backoff probability
    def prob(self, word, context):
        """
-        Evaluate the probability of this word in this context.
+        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """

        context = tuple(context)
-        if context + (word,) in self._ngrams:
+        if (context + (word,) in self._ngrams) or (self._n == 1):
            return self[context].prob(word)
-        elif self._n > 1:
-            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
-            raise RuntimeError("No probability mass assigned to word %s in "
-                               "context %s" % (word, ' '.join(context)))
+            return self._alpha(context) * self._backoff.prob(word, context[1:])

    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])
...
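In symbols, the rewritten prob() computes P(w | c) as the estimator's discounted P_est(w | c) when the full n-gram c + (w,) was seen in training (or when n == 1), and alpha(c) * P_backoff(w | c[1:]) otherwise, where _alpha(c) = _beta(c) / _backoff._beta(c[1:]) rescales the probability mass the estimator held back for unseen events. A short probe, not part of the commit; the words are hypothetical and assume the lm from the class docstring:

    # If ('County', 'Grand', 'Jury') occurred in the training text, this is the
    # discounted trigram estimate; otherwise it is
    # lm._alpha(('County', 'Grand')) times the backoff bigram probability.
    print lm.prob('Jury', ('County', 'Grand'))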
@@ -101,18 +108,37 @@ class NgramModel(ModelI):
    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
        """

        return -log(self.prob(word, context), 2)

    def choose_random_word(self, context):
-        '''Randomly select a word that is likely to appear in this context.'''
+        '''
+        Randomly select a word that is likely to appear in this context.
+
+        :param context: the context the word is in
+        :type context: list(str)
+        '''

        return self.generate(1, context)[-1]

    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
-        '''Generate random text based on the language model.'''
+        '''
+        Generate random text based on the language model.
+
+        :param num_words: number of words to generate
+        :type num_words: int
+        :param context: initial words in generated string
+        :type context: list(str)
+        '''

        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
...
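As the "NB" comment notes, text generated from the default empty context tends to open the same way, because training prepends the same empty-string prefix to the single training text. A small sketch, not part of the commit, with hypothetical seed words:

    print lm.generate(5)                     # opens with the model's usual first words
    print lm.generate(5, ('The', 'Fulton'))  # seed generation with an explicit context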
@@ -130,16 +156,33 @@ class NgramModel(ModelI):
    def entropy(self, text):
        """
-        Evaluate the total entropy of a text with respect to the model.
-        This is the sum of the log probability of each word in the message.
+        Calculate the approximate cross-entropy of the n-gram model for a
+        given evaluation text.
+        This is the average log probability of each word in the text.
+
+        :param text: words to use for evaluation
+        :type text: list(str)
        """

        e = 0.0
-        for i in range(self._n - 1, len(text)):
+        # Add prefix to front to correctly handle first n-1 words
+        text = list(self._prefix) + text
+        for i in range(len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
-        return e
+        return e / float(len(text) - (self._n - 1))
+
+    def perplexity(self, text):
+        """
+        Calculates the perplexity of the given text.
+        This is simply 2 ** cross-entropy for the text.
+
+        :param text: words to calculate perplexity of
+        :type text: list(str)
+        """
+
+        return pow(2.0, self.entropy(text))

    def __contains__(self, item):
        return tuple(item) in self._model
...
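The evaluation semantics change here: entropy() used to return the total negative log probability of the text, while the new version prepends the training prefix and averages, giving cross-entropy in bits per word; perplexity() is then simply 2 ** entropy. A hedged sketch, not part of the commit, assuming the lm above and that every test word is scorable under the smoothed model:

    test = list(brown.words(categories='editorial'))[:100]
    print lm.entropy(test)     # average -log2 P(word | context), in bits
    print lm.perplexity(test)  # equals pow(2.0, lm.entropy(test))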
@@ -151,6 +194,16 @@ class NgramModel(ModelI):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


+if __name__ == "__main__":
+    import doctest
+    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
-
-def demo():
-    from nltk.corpus import brown
-    from nltk.probability import LidstoneProbDist, WittenBellProbDist
-    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
-    lm = NgramModel(3, brown.words(categories='news'), estimator)
-    print lm
-    #print lm.entropy(sent)
-    text = lm.generate(100)
-    import textwrap
-    print '\n'.join(textwrap.wrap(' '.join(text)))
-
-if __name__ == '__main__':
-    demo()