edx / nltk · Commits

Commit 98c2476a authored Apr 19, 2015 by Steven Bird

resolved merge conflicts

Parents: c83a82b5, c54edec6

Showing 3 changed files with 169 additions and 36 deletions:

nltk/model/ngram.py       +47  -35
nltk/probability.py        +1   -1
nltk/test/model.doctest  +121   -0
nltk/model/ngram.py

@@ -8,26 +8,22 @@
 # For license information, see LICENSE.TXT
 from __future__ import unicode_literals

 from itertools import chain
 from math import log

-from nltk.probability import (FreqDist, ConditionalProbDist, ConditionalFreqDist, LidstoneProbDist)
+from nltk.probability import ConditionalProbDist, ConditionalFreqDist, LidstoneProbDist
 from nltk.util import ngrams
 from nltk.model.api import ModelI

 from nltk import compat


-def _estimator(fdist, *estimator_args, **estimator_kwargs):
+def _estimator(fdist, **estimator_kwargs):
     """
-    Default estimator function using a SimpleGoodTuring ProbDist.
+    Default estimator function using a Lidstone ProbDist.
     """
     # can't be an instance method of NgramModel as they
     # can't be pickled either.
-    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
+    return LidstoneProbDist(fdist, 0.001, **estimator_kwargs)


 @compat.python_2_unicode_compatible
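For reference, the new default estimator is plain Lidstone smoothing with gamma hard-coded to 0.001. A minimal sketch of what that computes for a single frequency distribution (the counts and bins below are made up for illustration, not taken from the commit):

from nltk.probability import FreqDist, LidstoneProbDist

# Toy counts: "foo" seen 3 times, "bar" once; a vocabulary of 2 word types.
fd = FreqDist(['foo', 'foo', 'foo', 'bar'])

# The replacement _estimator hard-codes gamma = 0.001, i.e.
# P(w) = (count(w) + 0.001) / (N + 0.001 * bins)
pd = LidstoneProbDist(fd, 0.001, bins=2)

print(pd.prob('foo'))   # (3 + 0.001) / (4 + 0.002), roughly 0.7499
print(pd.prob('bar'))   # (1 + 0.001) / (4 + 0.002), roughly 0.2501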
@@ -37,27 +33,22 @@ class NgramModel(ModelI):
     """

     def __init__(self, n, train, pad_left=True, pad_right=False,
-                 estimator=None, *estimator_args, **estimator_kwargs):
+                 estimator=None, **estimator_kwargs):
         """
         Create an ngram language model to capture patterns in n consecutive
         words of training text. An estimator smooths the probabilities derived
         from the text and may allow generation of ngrams not seen during
-        training.
+        training. See model.doctest for more detailed testing

             >>> from nltk.corpus import brown
-            >>> from nltk.probability import LidstoneProbDist
-            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
-            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
+            >>> lm = NgramModel(3, brown.words(categories='news'))
             >>> lm
             <NgramModel with 91603 3-grams>
             >>> lm._backoff
             <NgramModel with 62888 2-grams>
-            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
-            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
-            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
-            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
+            >>> lm.entropy(brown.words(categories='humor'))
             ... # doctest: +ELLIPSIS
-            0.5776...
+            12.0399...

         :param n: the order of the language model (ngram size)
         :type n: int
@@ -70,14 +61,6 @@ class NgramModel(ModelI):
         :param estimator: a function for generating a probability distribution
         :type estimator: a function that takes a ConditionalFreqDist and
               returns a ConditionalProbDist
-        :param estimator_args: Extra arguments for estimator.
-            These arguments are usually used to specify extra
-            properties for the probability distributions of individual
-            conditions, such as the number of bins they contain.
-            Note: For backward-compatibility, if no arguments are specified, the
-            number of bins in the underlying ConditionalFreqDist are passed to
-            the estimator as an argument.
-        :type estimator_args: (any)
         :param estimator_kwargs: Extra keyword arguments for the estimator
         :type estimator_kwargs: (any)
         """
@@ -87,6 +70,9 @@ class NgramModel(ModelI):
         assert (isinstance(pad_left, bool))
         assert (isinstance(pad_right, bool))
         self._lpad = ('',) * (n - 1) if pad_left else ()
         self._rpad = ('',) * (n - 1) if pad_right else ()

+        # make sure n is greater than zero, otherwise print it
+        assert (n > 0), n
@@ -110,22 +96,30 @@ class NgramModel(ModelI):
         if (train is not None) and isinstance(train[0], compat.string_types):
             train = [train]

+        # we need to keep track of the number of word types we encounter
+        vocabulary = set()

         for sent in train:
             raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
             for ngram in raw_ngrams:
                 self._ngrams.add(ngram)
                 context = tuple(ngram[:-1])
                 token = ngram[-1]
-                cfd[(context, token)] += 1
+                cfd[context][token] += 1
+                vocabulary.add(token)

-        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)
+        # Unless number of bins is explicitly passed, we should use the number
+        # of word types encountered during training as the bins value.
+        # If right padding is on, this includes the padding symbol.
+        if 'bins' not in estimator_kwargs:
+            estimator_kwargs['bins'] = len(vocabulary)
+
+        self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)

         # recursively construct the lower-order models
         if not self.is_unigram_model:
             self._backoff = NgramModel(n - 1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        *estimator_args,
                                        **estimator_kwargs)

             self._backoff_alphas = dict()
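The reworked loop above switches to cfd[context][token] indexing and tracks the vocabulary so that, unless the caller passes bins explicitly, the number of word types becomes the bins value handed to the estimator. A minimal sketch of that pattern outside the class, with made-up (context, token) pairs and the Lidstone factory standing in for the estimator:

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

# Toy (context, token) pairs standing in for the ngrams of a training text.
pairs = [(('foo',), 'foo'), (('foo',), 'foo'), (('foo',), 'bar'), (('bar',), 'baz')]

cfd = ConditionalFreqDist()
vocabulary = set()
for context, token in pairs:
    cfd[context][token] += 1     # condition first, then sample, as in the new loop
    vocabulary.add(token)

# Default bins to the number of word types, as the merged __init__ does.
estimator_kwargs = {}
if 'bins' not in estimator_kwargs:
    estimator_kwargs['bins'] = len(vocabulary)

model = ConditionalProbDist(cfd, LidstoneProbDist, 0.001, **estimator_kwargs)
print(model[('foo',)].prob('bar'))   # (1 + 0.001) / (3 + 0.001 * 3), roughly 0.333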
@@ -240,7 +234,12 @@ class NgramModel(ModelI):
         return text

     def _generate_one(self, context):
+<<<<<<< HEAD
         context = (self._lpad + tuple(context))[-self._n + 1:]
+=======
+        context = (self._lpad + tuple(context))[-self._n + 1:]
+>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28
         if context in self:
             return self[context].generate()
         elif self._n > 1:
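Both sides of this conflict hunk compute the same thing: left-pad the supplied context and keep only its last n-1 items. A small standalone illustration of the slicing, with made-up values:

# For a trigram model (n = 3) with left padding of n-1 empty strings:
n = 3
lpad = ('',) * (n - 1)

context = ['the', 'quick', 'brown']
window = (lpad + tuple(context))[-n + 1:]   # keep the last n-1 = 2 items
print(window)   # ('quick', 'brown')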
@@ -258,13 +257,20 @@ class NgramModel(ModelI):
         :type text: list(str)
         """

-        e = 0.0
+        H = 0.0     # entropy is conventionally denoted by "H"
         text = list(self._lpad) + text + list(self._rpad)
         for i in range(self._n - 1, len(text)):
+<<<<<<< HEAD
             context = tuple(text[i - self._n + 1: i])
             token = text[i]
             e += self.logprob(token, context)
         return e / float(len(text) - (self._n - 1))
+=======
+            context = tuple(text[(i - self._n + 1): i])
+            token = text[i]
+            H += self.logprob(token, context)
+        return H / float(len(text) - (self._n - 1))
+>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28

     def perplexity(self, text):
         """
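Aside from the e-versus-H rename, both sides of this hunk average a per-token log cost over the padded text, and perplexity (below) is 2 raised to that entropy. A back-of-the-envelope sketch with made-up per-token probabilities, taking the cost of each token to be its negative base-2 log probability:

from math import log

# Hypothetical per-token probabilities assigned by some model.
token_probs = [0.5, 0.25, 0.125]

# Entropy: average negative log2 probability per token.
H = sum(-log(p, 2) for p in token_probs) / len(token_probs)
perplexity = pow(2.0, H)

print(H)            # (1 + 2 + 3) / 3 = 2.0 bits per token
print(perplexity)   # 2 ** 2.0 = 4.0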
@@ -278,19 +284,25 @@ class NgramModel(ModelI):
         return pow(2.0, self.entropy(text))

     def __contains__(self, item):
+<<<<<<< HEAD
         return tuple(item) in self._probdist.freqdist

     def __getitem__(self, item):
         return self._probdist[tuple(item)]
+=======
+        if not isinstance(item, tuple):
+            item = (item,)
+        return item in self._model
+
+    def __getitem__(self, item):
+        if not isinstance(item, tuple):
+            item = (item,)
+        return self._model[item]
+>>>>>>> c54edec6856b877dd049cea5ef4a75b842af6c28

     def __repr__(self):
         return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


 def teardown_module(module=None):
     from nltk.corpus import brown
     brown._unload()


 if __name__ == "__main__":
     import doctest
     doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
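The right-hand side of this conflict normalises a bare string to a 1-tuple before looking it up, so unigram lookups and tuple lookups hit the same key. A tiny standalone sketch of that normalisation (the helper name and example words are made up for illustration):

def _as_tuple(item):
    # Mirror the '=======' branch: wrap a bare item in a 1-tuple.
    if not isinstance(item, tuple):
        item = (item,)
    return item

print(_as_tuple('economists'))            # ('economists',)
print(_as_tuple(('said', 'economists')))  # ('said', 'economists')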
nltk/probability.py

@@ -659,7 +659,7 @@ class LidstoneProbDist(ProbDistI):
     likelihood estimate of the resulting frequency distribution.
     """
     SUM_TO_ONE = False

-    def __init__(self, freqdist, gamma, bins=None, override_N=None):
+    def __init__(self, freqdist, gamma, bins=None):
         """
         Use the Lidstone estimate to create a probability distribution
         for the experiment used to generate ``freqdist``.
nltk/test/model.doctest  (new file, mode 100644)
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
===========
NGram Model
===========
>>> import nltk
>>> from nltk.model import NgramModel
################
A Simple Example
################
The purpose of this example is to demonstrate the correctness of the current
NgramModel implementation. For that reason we train on a small corpus so that
calculating probabilities by hand is tractable. We will compare the probabilities
we compute by hand to the ones the model yields and ideally they should match.
Setup
-----
Below is a very small corpus, borrowed from one of the comments in this thread:
https://github.com/nltk/nltk/issues/367
>>> word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
This corpus has a property that will be important to us later: its number of word
tokens differs from its number of word types. The latter (also referred to as the
vocabulary) is the set of unique words in the text.
Let's save it to a variable.
>>> word_types = set(word_seq)
Next we need to choose a probability estimator (aka smoothing algorithm).
Once again, for the sake of simplicity let's use LaplaceProbDist.
>>> from nltk.probability import LaplaceProbDist as estimator
We are ready to initialize our ngram language model. For this example, let's
make it a trigram model.
>>> lm = NgramModel(3, word_seq, estimator=estimator, bins=len(word_types))
Please note the last argument to the NgramModel constructor. In NLTK parlance
this is called the ``bins`` parameter and it is passed on to the LaplaceProbDist
estimator. Failing to provide this argument currently almost always leads to
incorrect probability scores.
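To see the effect, compare Laplace estimates for the same toy counts with and without an explicit ``bins`` value. This is a sketch with made-up counts, assuming that when ``bins`` is omitted the estimator falls back to the number of observed word types:

from nltk.probability import FreqDist, LaplaceProbDist

fd = FreqDist(['foo', 'foo', 'foo', 'bar'])

# Without bins, only the 2 observed types are counted, so any unseen
# third word type is left out of the denominator.
print(LaplaceProbDist(fd).prob('foo'))          # (3 + 1) / (4 + 2), roughly 0.667
print(LaplaceProbDist(fd, bins=3).prob('foo'))  # (3 + 1) / (4 + 3), roughly 0.571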
Testing Probability Scores
--------------------------
Now that we have the language model set up, let's see what probability it produces
for a trigram seen during training.
>>> lm.prob('foo', ('foo', 'foo'))
0.5
To make sure we're on the right track, let's compute this probability by hand.
Since the trigram was seen, P(foo | foo, foo) simply translates into:
(count(foo, foo, foo) + 1) / (count(foo, foo) + bins * 1)
If we plug in numbers we have:
(2 + 1) / (3 + 3) = 3/6 = 0.5
So far our model is on track!
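That arithmetic is easy to check mechanically (a throwaway sketch, not part of the doctest):

# P(foo | foo, foo) under Laplace smoothing with bins = 3
print((2 + 1) / float(3 + 3))   # 0.5, matching lm.prob('foo', ('foo', 'foo'))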
But what if we plug in a trigram that wasn't in the training corpus?
>>> lm.prob('baz', ('foo', 'foo'))
0.16666...
Let's verify this result by hand. The current implementation of NgramModel uses
Katz backoff, which means that P(baz | foo, foo) becomes:
alpha(foo, foo) * P(baz | foo)
where alpha(foo, foo)
= (1 - sum(P(w | foo, foo) for w in W)) / (1 - sum(P(w | foo) for w in W))
where W is all the words that followed the bigram "foo foo", namely the list [foo, bar].
Thus the sum in the numerator will be:
P(foo | foo, foo) + P(bar | foo, foo)
We already know the first member of this sum and if we plug in analogous numbers
for P(bar | foo, foo), we arrive at:
3/6 + 2/6 = 5/6
We subtract this from 1 to arrive at the numerator of 1/6.
Next we do the same for the denominator, with the difference that this time we're
conditioning on the context "foo" instead of "foo foo".
P(foo | foo) + P(bar | foo) = 4/7 + 2/7 = 6/7
1 - 6/7 = 1/7
If we combine the numerator with the denominator we get 7/6.
This is alpha(foo, foo). Now all we need is P(baz | foo).
However, since our training text contains no instances of the bigram "foo baz",
we have to perform the same backoff computation as we just did for "foo foo baz", i.e.
P(baz | foo) = alpha(foo) * P(baz)
The alpha this time is equal to:
(1 - (P(foo | foo) + P(bar | foo))) / (1 - (P(foo) + P(bar)))
We already have the numerator from the previous calculation: it's 1/7.
The denominator comes from the unigram probabilities for "foo" and "bar", making it:
1 - (5/9 + 2/9) = 2/9
Thus we have alpha(foo) = 9/14
Combine this with the unigram P(baz) and we get:
P(baz | foo) = 1/7
Then we combine this with alpha(foo, foo) to arrive at:
P(baz | foo, foo) = 7/6 * 1/7 = 1/6 = 0.16666...
Our model is correct again!
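The whole backoff derivation can also be replayed with exact fractions; this is a standalone sketch mirroring the hand computation above:

from fractions import Fraction as F

# Laplace-smoothed conditionals from the hand computation above.
p_foo_given_ff, p_bar_given_ff = F(3, 6), F(2, 6)
p_foo_given_f, p_bar_given_f = F(4, 7), F(2, 7)
p_foo, p_bar, p_baz = F(5, 9), F(2, 9), F(2, 9)

alpha_ff = (1 - (p_foo_given_ff + p_bar_given_ff)) / (1 - (p_foo_given_f + p_bar_given_f))
alpha_f = (1 - (p_foo_given_f + p_bar_given_f)) / (1 - (p_foo + p_bar))

p_baz_given_f = alpha_f * p_baz            # 9/14 * 2/9 = 1/7
p_baz_given_ff = alpha_ff * p_baz_given_f  # 7/6 * 1/7 = 1/6

print(alpha_ff, alpha_f, p_baz_given_f, p_baz_given_ff)   # 7/6 9/14 1/7 1/6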
Pickling and unpickling
-----------------------
We currently don't have a doctest for this because NgramModel comparison doesn't
work. One will be added as soon as that's fixed.