Commit f064f6e5 by Ilia Kurenkov

fixed probability bug for good

parent 4c2a1dbb
...@@ -104,22 +104,20 @@ class NgramModel(ModelI): ...@@ -104,22 +104,20 @@ class NgramModel(ModelI):
train = [train] train = [train]
# we need to keep track of the number of word types we encounter # we need to keep track of the number of word types we encounter
words = set() vocabulary = set()
for sent in train: for sent in train:
for ngram in ngrams(sent, n, pad_left, pad_right, pad_symbol=''): for ngram in ngrams(sent, n, pad_left, pad_right, pad_symbol=''):
self._ngrams.add(ngram) self._ngrams.add(ngram)
context = tuple(ngram[:-1]) context = tuple(ngram[:-1])
token = ngram[-1] token = ngram[-1]
cfd[context][token] += 1 cfd[context][token] += 1
words.add(token) vocabulary.add(token)
# unless number of bins is explicitly passed, we should use the number # Unless number of bins is explicitly passed, we should use the number
# of word types encountered during training as the bins value # of word types encountered during training as the bins value.
# If right padding is on, this includes the padding symbol.
if 'bins' not in estimator_kwargs: if 'bins' not in estimator_kwargs:
estimator_kwargs['bins'] = len(words) estimator_kwargs['bins'] = len(vocabulary)
missed_words = (1 - int(pad_left) - int(pad_right)) * (n - 1)
estimator_kwargs['override_N'] = cfd.N() + missed_words
self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
......
...@@ -653,7 +653,7 @@ class LidstoneProbDist(ProbDistI): ...@@ -653,7 +653,7 @@ class LidstoneProbDist(ProbDistI):
likelihood estimate of the resulting frequency distribution. likelihood estimate of the resulting frequency distribution.
""" """
SUM_TO_ONE = False SUM_TO_ONE = False
def __init__(self, freqdist, gamma, bins=None, override_N=None): def __init__(self, freqdist, gamma, bins=None):
""" """
Use the Lidstone estimate to create a probability distribution Use the Lidstone estimate to create a probability distribution
for the experiment used to generate ``freqdist``. for the experiment used to generate ``freqdist``.
...@@ -688,12 +688,7 @@ class LidstoneProbDist(ProbDistI): ...@@ -688,12 +688,7 @@ class LidstoneProbDist(ProbDistI):
self._freqdist = freqdist self._freqdist = freqdist
self._gamma = float(gamma) self._gamma = float(gamma)
# if user specifies a number of tokens explicitly, use that number self._N = self._freqdist.N()
# instead of getting it from the frequency distribution
if override_N:
self._N = override_N
else:
self._N = self._freqdist.N()
if bins is None: if bins is None:
bins = freqdist.B() bins = freqdist.B()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment