Project: edx / nltk

Commit 80a66033, authored Dec 01, 2011 by Steven Bird
Parent: 55c41438

    adopted new version contributed by Dan Blanchard; cf issue 673

Showing 1 changed file (nltk/model/ngram.py) with 89 additions and 36 deletions.

nltk/model/ngram.py @ 80a66033
 # Natural Language Toolkit: Language Models
 #
 # Copyright (C) 2001-2011 NLTK Project
-# Author: Steven Bird <sb@csse.unimelb.edu.au>
+# Authors: Steven Bird <sb@csse.unimelb.edu.au>
+#          Daniel Blanchard <dan.blanchard@gmail.com>
 # URL: <http://www.nltk.org/>
 # For license information, see LICENSE.TXT

-import random
-
 from itertools import chain
 from math import log

 from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
-                              MLEProbDist, SimpleGoodTuringProbDist)
+                              SimpleGoodTuringProbDist)
 from nltk.util import ingrams
 from nltk.model.api import ModelI
 def _estimator(fdist, bins):
     """
     Default estimator function using a SimpleGoodTuringProbDist.
     """
     # can't be an instance method of NgramModel as they
     # can't be pickled either.
     return SimpleGoodTuringProbDist(fdist)
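
Aside: because `_estimator` is what the constructor falls back to when no estimator is given, a model built from just an order and a training text uses Simple Good-Turing smoothing throughout. A minimal usage sketch, assuming the NLTK 2.x-era API in this file (where `NgramModel` is exported from `nltk.model`):

    from nltk.corpus import brown
    from nltk.model import NgramModel

    # No estimator supplied, so _estimator provides SimpleGoodTuringProbDist
    # for every conditional distribution.
    lm = NgramModel(2, brown.words(categories='news'))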
 class NgramModel(ModelI):
     """
     A processing interface for assigning a probability to the next word.
+
+    >>> from nltk.corpus import brown
+    >>> from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    >>> import textwrap
+    >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+    >>> # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
+    >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
+    >>> text = lm.generate(100)
+    >>> print '\n'.join(textwrap.wrap(' '.join(text)))
     """
     # add cutoff
-    def __init__(self, n, train, estimator=None):
+    def __init__(self, n, train, estimator=None, *estimator_args,
+                 **estimator_kw_args):
"""
"""
Creates an ngram language model to capture patterns in n consecutive
Creates an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
words of training text. An estimator smooths the probabilities derived
...
@@ -48,16 +40,27 @@ class NgramModel(ModelI):
...
@@ -48,16 +40,27 @@ class NgramModel(ModelI):
:param n: the order of the language model (ngram size)
:param n: the order of the language model (ngram size)
:type n: int
:type n: int
:param train: the training text
:param train: the training text
:type train: list
(str)
:type train: list
of string
:param estimator: a function for generating a probability distribution
:param estimator: a function for generating a probability distribution
:type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
:type estimator: a function that takes a ConditionalFreqDist and
returns a ConditionalProbDist
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
Note: For backward-compatibility, if no arguments are specified, the
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
:param estimator_kw_args: Extra keyword arguments for estimator.
:type estimator_kw_args: (any)
"""
"""
         self._n = n

         if estimator is None:
             estimator = _estimator

         cfd = ConditionalFreqDist()
         self._ngrams = set()
         self._prefix = ('',) * (n - 1)
 ...
@@ -68,26 +71,30 @@ class NgramModel(ModelI):
             token = ngram[-1]
             cfd[context].inc(token)

-        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        if (not estimator_args) and (not estimator_kw_args):
+            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+        else:
+            self._model = ConditionalProbDist(cfd, estimator,
+                                              *estimator_args,
+                                              **estimator_kw_args)

         # recursively construct the lower-order models
         if n > 1:
-            self._backoff = NgramModel(n-1, train, estimator)
+            self._backoff = NgramModel(n-1, train, estimator,
+                                       *estimator_args, **estimator_kw_args)
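
Because the constructor recurses with the same estimator arguments, an order-n model carries a complete backoff chain down to unigrams. Illustrative only, poking at private attributes of the `lm` built in the sketch above:

    print lm                    # the trigram model
    print lm._backoff           # its bigram backoff model
    print lm._backoff._backoff  # the unigram model, where recursion stops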
+    # Katz Backoff probability
     def prob(self, word, context):
         """
-        Evaluate the probability of this word in this context.
+        Evaluate the probability of this word in this context using Katz Backoff.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
         """
         context = tuple(context)
-        if context + (word,) in self._ngrams:
+        if (context + (word,) in self._ngrams) or (self._n == 1):
             return self[context].prob(word)
-        else:
-            return self._alpha(context) * self._backoff.prob(word, context[1:])
+        elif self._n > 1:
+            return self._alpha(context) * self._backoff.prob(word, context[1:])
+        else:
+            raise RuntimeError("No probability mass assigned to word %s in "
+                               "context %s" % (word, ' '.join(context)))
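
Spelled out, the rewritten `prob` is Katz backoff: use the smoothed conditional distribution for n-grams that were observed in training (or whenever the model is already a unigram model), and otherwise fall back to the (n-1)-gram estimate scaled by `_alpha(context)`, which redistributes the probability mass that smoothing held back. A sketch against the `lm` built earlier (the example words are illustrative, not guaranteed to occur in Brown news):

    # P(w | c) = P_smoothed(w | c)               if c + (w,) seen, or n == 1
    #          = alpha(c) * P_backoff(w | c[1:]) otherwise
    print lm.prob('Jury', ('County', 'Grand'))   # likely a seen trigram
    print lm.prob('zebra', ('County', 'Grand'))  # unseen: backs off to bigrams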
     def _alpha(self, tokens):
         return self._beta(tokens) / self._backoff._beta(tokens[1:])
 ...
@@ -101,18 +108,37 @@ class NgramModel(ModelI):
     def logprob(self, word, context):
         """
         Evaluate the (negative) log probability of this word in this context.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
         """
         return -log(self.prob(word, context), 2)

     def choose_random_word(self, context):
-        '''Randomly select a word that is likely to appear in this context.'''
+        '''
+        Randomly select a word that is likely to appear in this context.
+
+        :param context: the context the word is in
+        :type context: list(str)
+        '''
         return self.generate(1, context)[-1]

     # NB, this will always start with same word since model
     # is trained on a single text
     def generate(self, num_words, context=()):
-        '''Generate random text based on the language model.'''
+        '''
+        Generate random text based on the language model.
+
+        :param num_words: number of words to generate
+        :type num_words: int
+        :param context: initial words in generated string
+        :type context: list(str)
+        '''
         text = list(context)
         for i in range(num_words):
             text.append(self._generate_one(text))
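
`generate` seeds the output with `context` and then repeatedly samples the next word, so it serves both free-running generation and one-word prediction via `choose_random_word`. Continuing with the `lm` from above (output is sampled, so it varies; per the NB comment, an empty context always starts from the same word):

    words = lm.generate(20, context=('There', 'is'))
    print ' '.join(words)
    print lm.choose_random_word(('There', 'is'))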
 ...
@@ -130,16 +156,33 @@ class NgramModel(ModelI):
     def entropy(self, text):
         """
-        Evaluate the total entropy of a text with respect to the model.
-        This is the sum of the log probability of each word in the message.
+        Calculate the approximate cross-entropy of the n-gram model for a
+        given evaluation text.
+        This is the average log probability of each word in the text.
+
+        :param text: words to use for evaluation
+        :type text: list(str)
         """
         e = 0.0
-        for i in range(self._n - 1, len(text)):
+        # Add prefix to front to correctly handle first n-1 words
+        text = list(self._prefix) + text
+        for i in range(len(text)):
             context = tuple(text[i - self._n + 1 : i])
             token = text[i]
             e += self.logprob(token, context)
-        return e
+        return e / float(len(text) - (self._n - 1))
+
+    def perplexity(self, text):
+        """
+        Calculates the perplexity of the given text.
+        This is simply 2 ** cross-entropy for the text.
+
+        :param text: words to calculate perplexity of
+        :type text: list(str)
+        """
+        return pow(2.0, self.entropy(text))

     def __contains__(self, item):
         return tuple(item) in self._model
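
The reworked `entropy` prepends the model's empty-string padding prefix (intended to score the first n-1 words as well) and divides by the word count, so it now reports average bits per word rather than a total; `perplexity` is then just `2 ** entropy`. A small evaluation sketch, again reusing `lm` (the sentence is made up):

    sent = ['The', 'jury', 'said', 'the', 'election', 'was', 'fair', '.']
    print lm.entropy(sent)     # average -log2 P(word | context), bits per word
    print lm.perplexity(sent)  # equals 2 ** lm.entropy(sent)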
 ...
@@ -151,6 +194,16 @@ class NgramModel(ModelI):
         return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)

-if __name__ == "__main__":
-    import doctest
-    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
+def demo():
+    from nltk.corpus import brown
+    from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+    lm = NgramModel(3, brown.words(categories='news'), estimator)
+    print lm
+    #print lm.entropy(sent)
+    text = lm.generate(100)
+    import textwrap
+    print '\n'.join(textwrap.wrap(' '.join(text)))
+
+if __name__ == '__main__':
+    demo()