Fixed probability bug for good: removed the override_N workaround from NgramModel and LidstoneProbDist

f064f6e5 · Ilia Kurenkov · 4c2a1dbb · f064f6e5 · f064f6e5
Commit f064f6e5 authored Mar 21, 2015 by Ilia Kurenkov
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 15 deletions

nltk/model/ngram.py
+6 -8

nltk/probability.py
+2 -7

No files found.
--- a/nltk/model/ngram.py
+++ b/nltk/model/ngram.py
@@ -104,22 +104,20 @@ class NgramModel(ModelI):
            train = [train]
        # we need to keep track of the number of word types we encounter
-        words = set()
+        vocabulary = set()
        for sent in train:
            for ngram in ngrams(sent, n, pad_left, pad_right, pad_symbol=''):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
-                words.add(token)
+                vocabulary.add(token)
-        # unless number of bins is explicitly passed, we should use the number
+        # Unless number of bins is explicitly passed, we should use the number
-        # of word types encountered during training as the bins value
+        # of word types encountered during training as the bins value.
+        # If right padding is on, this includes the padding symbol.
        if 'bins' not in estimator_kwargs:
-            estimator_kwargs['bins'] = len(words)
+            estimator_kwargs['bins'] = len(vocabulary)
-        missed_words = (1 - int(pad_left) - int(pad_right)) * (n - 1)
-        estimator_kwargs['override_N'] = cfd.N() + missed_words
        self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

--- a/nltk/probability.py
+++ b/nltk/probability.py
@@ -653,7 +653,7 @@ class LidstoneProbDist(ProbDistI):
    likelihood estimate of the resulting frequency distribution.
    """
    SUM_TO_ONE = False
-    def __init__(self, freqdist, gamma, bins=None, override_N=None):
+    def __init__(self, freqdist, gamma, bins=None):
        """
        Use the Lidstone estimate to create a probability distribution
        for the experiment used to generate ``freqdist``.
@@ -688,12 +688,7 @@ class LidstoneProbDist(ProbDistI):
        self._freqdist = freqdist
        self._gamma = float(gamma)
-        # if user specifies a number of tokens explicitly, use that number
+        self._N = self._freqdist.N()
-        # instead of getting it from the frequency distribution
-        if override_N:
-            self._N = override_N
-        else:
-            self._N = self._freqdist.N()
        if bins is None:
            bins = freqdist.B()