Commit 8dcae605
authored Feb 04, 2015 by Steven Bird

Merge branch 'model' of https://github.com/Copper-Head/nltk into model

parents 617c2c4a 96156590

Showing 2 changed files with 140 additions and 93 deletions:

nltk/model/ngram.py    +55  -40
nltk/probability.py    +85  -53
nltk/model/ngram.py
@@ -36,7 +36,6 @@ class NgramModel(ModelI):
     A processing interface for assigning a probability to the next word.
     """

     # add cutoff
     def __init__(self, n, train, pad_left=True, pad_right=False,
                  estimator=None, *estimator_args, **estimator_kwargs):
         """
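For orientation, a rough usage sketch of this constructor (not part of the commit): it assumes the model branch is installed so that NgramModel is importable from nltk.model, and uses a Lidstone estimator callable in the style of NLTK's other probability examples.

    from nltk.corpus import brown
    from nltk.model import NgramModel
    from nltk.probability import LidstoneProbDist

    # estimator: a callable that turns a frequency distribution into a
    # probability distribution; the gamma of 0.2 is an arbitrary choice
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    # train a trigram model on tokenized sentences (left padding on by default)
    lm = NgramModel(3, brown.sents(categories='news'), estimator=est)
    print(lm.prob('jury', ['the', 'grand']))   # P(jury | the grand)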
@@ -90,42 +89,39 @@ class NgramModel(ModelI):
         # make sure n is greater than zero, otherwise print it
         assert (n > 0), n
-        self._unigram_model = (n == 1)
+        # For explicitness save the check whether this is a unigram model
+        self.is_unigram_model = (n == 1)
         # save the ngram order number
         self._n = n
         # save left and right padding
         self._lpad = ('',) * (n - 1) if pad_left else ()
         self._rpad = ('',) * (n - 1) if pad_right else ()

         if estimator is None:
             estimator = _estimator

         cfd = ConditionalFreqDist()
         # set read-only ngrams set (see property declaration below to reconfigure)
         self._ngrams = set()

         # If given a list of strings instead of a list of lists, create enclosing list
         if (train is not None) and isinstance(train[0], compat.string_types):
             train = [train]

         # we need to keep track of the number of word types we encounter
         words = set()

         for sent in train:
-            for ngram in ngrams(sent, n, pad_left, pad_right, pad_symbol=''):
+            raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
+            for ngram in raw_ngrams:
                 self._ngrams.add(ngram)
                 context = tuple(ngram[:-1])
                 token = ngram[-1]
                 cfd[context][token] += 1
                 words.add(token)

         # unless number of bins is explicitly passed, we should use the number
         # of word types encountered during training as the bins value
         if 'bins' not in estimator_kwargs:
             estimator_kwargs['bins'] = len(words)

         missed_words = (1 - int(pad_left) - int(pad_right)) * (n - 1)
         estimator_kwargs['override_N'] = cfd.N() + missed_words
                 cfd[(context, token)] += 1
-        self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
+        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)

         # recursively construct the lower-order models
-        if not self._unigram_model:
+        if not self.is_unigram_model:
             self._backoff = NgramModel(n - 1, train, pad_left, pad_right, estimator,
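The training loop above is easier to follow in isolation. The sketch below is plain Python, not NLTK code: it mimics the counting step, keying counts by (context, token) and recording the number of word types that feeds the default 'bins' value; all names are illustrative.

    from collections import Counter

    def count_ngrams(sentences, n, pad_left=True, pad_right=False, pad_symbol=''):
        counts = Counter()      # keyed by (context, token), like cfd above
        word_types = set()      # its size becomes the default 'bins' value
        for sent in sentences:
            left = [pad_symbol] * (n - 1) if pad_left else []
            right = [pad_symbol] * (n - 1) if pad_right else []
            padded = left + list(sent) + right
            for i in range(len(padded) - n + 1):
                ngram = tuple(padded[i:i + n])
                context, token = ngram[:-1], ngram[-1]
                counts[(context, token)] += 1
                word_types.add(token)
        return counts, len(word_types)

    counts, bins = count_ngrams([['the', 'cat', 'sat'], ['the', 'dog', 'sat']], n=2)
    print(counts[(('the',), 'cat')], bins)   # 1 4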
@@ -135,31 +131,38 @@ class NgramModel(ModelI):
         self._backoff_alphas = dict()
         # For each condition (or context)
         for ctxt in cfd.conditions():
-            prdist = self._model[ctxt]  # prob dist for this context
             backoff_ctxt = ctxt[1:]
             backoff_total_pr = 0.0
             total_observed_pr = 0.0
-            for word in cfd[ctxt]:
-                # this is the subset of words that we OBSERVED
-                # following this context
-                total_observed_pr += prdist.prob(word)
-                # we normalize it by the total (n-1)-gram probability of
-                # words that were observed in this n-gram context
+            # this is the subset of words that we OBSERVED following
+            # this context.
+            # i.e. Count(word | context) > 0
+            for word in self._words_following(ctxt, cfd):
+                total_observed_pr += self.prob(word, ctxt)
+                # we also need the total (n-1)-gram probability of
+                # words observed in this n-gram context
                 backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
-            assert (0 < total_observed_pr <= 1), total_observed_pr
+            assert (0 <= total_observed_pr <= 1), total_observed_pr
             # beta is the remaining probability weight after we factor out
-            # the probability of observed words
+            # the probability of observed words.
+            # As a sanity check, both total_observed_pr and backoff_total_pr
+            # must be GE 0, since probabilities are never negative
             beta = 1.0 - total_observed_pr
             # backoff total has to be less than one, otherwise we get
-            # ZeroDivision error when we try subtracting it from 1 below
-            assert (0 < backoff_total_pr < 1), backoff_total_pr
+            # an error when we try subtracting it from 1 in the denominator
+            assert (0 <= backoff_total_pr < 1), backoff_total_pr
             alpha_ctxt = beta / (1.0 - backoff_total_pr)
             self._backoff_alphas[ctxt] = alpha_ctxt

+    def _words_following(self, context, cond_freq_dist):
+        for ctxt, word in cond_freq_dist.iterkeys():
+            if ctxt == context:
+                yield word

     def prob(self, word, context):
         """
         Evaluate the probability of this word in this context using Katz Backoff.
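A small worked example of the alpha computation in this hunk, with made-up numbers: beta is the probability mass the n-gram model leaves for unseen words, and alpha rescales the backoff model so that exactly this mass is spread over those words.

    # P(w | 'the') under the bigram model, for words actually observed after 'the'
    observed_bigram_pr = {'cat': 0.5, 'dog': 0.3}
    # P(w) under the unigram (backoff) model, for the same words
    backoff_unigram_pr = {'cat': 0.2, 'dog': 0.1}

    total_observed_pr = sum(observed_bigram_pr.values())   # 0.8
    backoff_total_pr = sum(backoff_unigram_pr.values())    # 0.3

    beta = 1.0 - total_observed_pr             # 0.2, mass left for unseen words
    alpha = beta / (1.0 - backoff_total_pr)    # 0.2 / 0.7 ~= 0.2857

    # an unseen word such as 'fish' then receives alpha * P('fish') from the backoff model
    print(round(alpha, 4))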
@@ -170,15 +173,17 @@ class NgramModel(ModelI):
         :type context: list(str)
         """
         context = tuple(context)
-        if (context + (word,) in self._ngrams) or (self._unigram_model):
-            return self[context].prob(word)
+        if (context + (word,) in self._ngrams) or (self.is_unigram_model):
+            return self._probdist.prob((context, word))
         else:
             return self._alpha(context) * self._backoff.prob(word, context[1:])

     # Updated _alpha function, discarded the _beta function
     def _alpha(self, context):
         """Get the backoff alpha value for the given context
         """
+        error_message = "Alphas and backoff are not defined for unigram models"
+        assert not self.is_unigram_model, error_message
+
         if context in self._backoff_alphas:
             return self._backoff_alphas[context]
         else:
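The same control flow as prob() above, written out over toy dictionaries so the backoff step is explicit; every name and number here is hypothetical and only illustrates Katz backoff for the bigram case.

    ngram_pr = {(('the',), 'cat'): 0.5, (('the',), 'dog'): 0.3}   # observed bigrams
    unigram_pr = {'cat': 0.2, 'dog': 0.1, 'fish': 0.05}           # backoff model
    alphas = {('the',): 0.2857}                                   # from the alpha example above

    def katz_prob(word, context):
        context = tuple(context)
        if (context, word) in ngram_pr:       # the n-gram was seen in training
            return ngram_pr[(context, word)]
        # otherwise back off to the shorter context, scaled by alpha(context)
        return alphas[context] * unigram_pr[word]

    print(katz_prob('cat', ['the']))    # 0.5
    print(katz_prob('fish', ['the']))   # ~0.0143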
@@ -193,9 +198,20 @@ class NgramModel(ModelI):
         :param context: the context the word is in
         :type context: list(str)
         """
         return -log(self.prob(word, context), 2)

+    @property
+    def ngrams(self):
+        return self._ngrams
+
+    @property
+    def backoff(self):
+        return self._backoff
+
+    @property
+    def probdist(self):
+        return self._probdist
+
     def choose_random_word(self, context):
         '''
         Randomly select a word that is likely to appear in this context.
@@ -224,8 +240,7 @@ class NgramModel(ModelI):
         return text

     def _generate_one(self, context):
         context = (self._lpad + tuple(context))[-self._n + 1:]
-        # print "Context (%d): <%s>" % (self._n, ','.join(context))
         if context in self:
             return self[context].generate()
         elif self._n > 1:
@@ -245,11 +260,11 @@ class NgramModel(ModelI):
         e = 0.0
         text = list(self._lpad) + text + list(self._rpad)
         for i in range(self._n - 1, len(text)):
             context = tuple(text[i - self._n + 1:i])
             token = text[i]
             e += self.logprob(token, context)
         return e / float(len(text) - (self._n - 1))

     def perplexity(self, text):
         """
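The entropy loop in this hunk averages -log2 P(token | context) over the padded text, and perplexity (next hunk) is 2 raised to that value. A toy standalone version, using fixed per-word probabilities rather than a trained model:

    from math import log

    def entropy(word_probs):
        # average negative log2 probability per word
        return sum(-log(p, 2) for p in word_probs) / len(word_probs)

    def perplexity(word_probs):
        return pow(2.0, entropy(word_probs))

    probs = [0.25, 0.25, 0.25, 0.25]          # four words, each with probability 1/4
    print(entropy(probs), perplexity(probs))  # 2.0 4.0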
@@ -263,10 +278,10 @@ class NgramModel(ModelI):
         return pow(2.0, self.entropy(text))

     def __contains__(self, item):
-        return tuple(item) in self._model
+        return tuple(item) in self._probdist.freqdist

     def __getitem__(self, item):
-        return self._model[tuple(item)]
+        return self._probdist[tuple(item)]

     def __repr__(self):
         return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
nltk/probability.py

This diff is collapsed.