Commit b912a222 by Peter Ljunglöf

Minor optimizations to align.py, and doctest fixes

parent 4722f1d5
@@ -300,11 +300,16 @@ class IBMModel1(object):
- Stage 2: Generates updated word alignments for the sentence pairs, based
on the translation probabilities from Stage 1.
.. doctest::
- >> from nltk.corpus import comtrans
- >> from nltk.align import IBMModel1
- >> ibm1 = IBMModel1(comtrans.aligned_sents())
+ >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
+ ... AlignedSent(['the', 'book'], ['das', 'Buch']),
+ ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
+ >>> ibm1 = IBMModel1(corpus)
+ >>> print "%.1f" % ibm1.probabilities['book', 'Buch']
+ 1.0
+ >>> print "%.1f" % ibm1.probabilities['book', 'das']
+ 0.0
+ >>> print "%.1f" % ibm1.probabilities['book', None]
+ 0.5
:param aligned_sents: The parallel text ``corpus.Iterable`` containing
AlignedSent instances of aligned sentence pairs from the corpus.
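For orientation while reading the diff, the EM procedure these doctests exercise can be sketched compactly, following Koehn (2010). This is a hedged, self-contained sketch: `ibm1_em` and its variable names are illustrative, not NLTK's API, and it runs a fixed number of iterations instead of the convergence test used by `_train` below.

    from collections import defaultdict

    def ibm1_em(bitext, n_iter=10):
        # bitext: list of (english_words, foreign_words) sentence pairs.
        e_vocab = set(e for es, _ in bitext for e in es)
        t = defaultdict(lambda: 1.0 / len(e_vocab))   # uniform t(e|f) start
        for _ in range(n_iter):
            count = defaultdict(float)                # expected counts c(e, f)
            total = defaultdict(float)                # expected counts c(f)
            for es, fs in bitext:
                fs = list(fs) + [None]                # NULL token, as in _train
                for e in es:
                    norm = sum(t[e, f] for f in fs)   # per-word normalization
                    for f in fs:
                        frac = t[e, f] / norm         # fractional count for (e, f)
                        count[e, f] += frac
                        total[f] += frac
            for (e, f), c in count.items():           # M-step: re-estimate t(e|f)
                t[e, f] = c / total[f]
        return t

On the three-sentence toy corpus above, this drives t('book'|'Buch') toward 1.0, matching the doctest.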
@@ -326,9 +331,9 @@ class IBMModel1(object):
def _train(self):
"""
Perform Expectation Maximization training to learn
- word-to-word translation probabilities, and return
- the number of iterations that were required for convergence.
+ word-to-word translation probabilities.
"""
logging.debug("Starting training")
# Collect up sets of all English and foreign words
english_words = set()
@@ -338,15 +343,13 @@
foreign_words.update(aligned_sent.mots)
# add the NULL token to the foreign word set.
foreign_words.add(None)
- num_probs = len(english_words)*len(foreign_words)
+ num_probs = len(english_words) * len(foreign_words)
# Initialise t(e|f) uniformly
- t = defaultdict(lambda: float(1)/len(english_words))
- s_total = defaultdict(float)
- for e in english_words:
-     for f in foreign_words:
-         z = t[e,f]
+ default_prob = 1.0 / len(english_words)
+ t = defaultdict(lambda: default_prob)
+ convergent_threshold = self.convergent_threshold
globally_converged = False
iteration_count = 0
while not globally_converged:
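The point of the change above: `defaultdict` invokes its factory on every missing key, so the old lambda re-ran `len(english_words)` (plus a `float()` call) per miss, and the deleted warm-up loop eagerly materialized all `len(english_words) * len(foreign_words)` entries just by touching them. A rough illustration of the shape of the saving (hypothetical micro-benchmark, not from the commit):

    from collections import defaultdict
    from timeit import timeit

    english_words = set(range(10000))

    # Old: len() and float() run on every defaultdict miss.
    slow = defaultdict(lambda: float(1) / len(english_words))
    # New: the constant is computed once; each miss just returns it.
    default_prob = 1.0 / len(english_words)
    fast = defaultdict(lambda: default_prob)

    # First access of each key triggers the factory exactly once.
    print(timeit(lambda: [slow[i] for i in range(10000)], number=1))
    print(timeit(lambda: [fast[i] for i in range(10000)], number=1))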
@@ -356,6 +359,7 @@
total = defaultdict(float)
for aligned_sent in self.aligned_sents:
+ s_total = {}
# Compute normalization
for e_w in aligned_sent.words:
s_total[e_w] = 0.0
@@ -375,7 +379,7 @@
for e_w in english_words:
new_prob = count[e_w, f_w] / total[f_w]
delta = abs(t[e_w, f_w] - new_prob)
- if delta < self.convergent_threshold:
+ if delta < convergent_threshold:
num_converged += 1
t[e_w, f_w] = new_prob
@@ -383,8 +387,8 @@
iteration_count += 1
if num_converged == num_probs:
globally_converged = True
logging.debug("%d/%d (%.2f%%) converged"%(
num_converged, num_probs, 100.0*num_converged/num_probs))
logging.debug("%d/%d (%.2f%%) converged" %
(num_converged, num_probs, 100.0*num_converged/num_probs))
self.probabilities = dict(t)
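Read together, the loop above stops only when every one of the `num_probs` parameters moved by less than the threshold in an iteration. A simplified restatement of that test (our helper name, not NLTK code; assumes both dicts share the same keys):

    def has_converged(old_t, new_t, threshold=0.01):
        # True iff no t(e|f) moved by threshold or more this iteration,
        # mirroring the num_converged == num_probs check in _train above.
        return all(abs(new_t[k] - old_t[k]) < threshold for k in new_t)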
......
@@ -53,13 +53,12 @@ the corresponding sentences:
...
IndexError: Alignment is outside boundary of mots
- .. in Python 2.6 version, we will support:
- als.alignment = Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])
+ You can set alignments with any sequence of tuples, so long as the first two
+ indexes of the tuple are the alignment indices:
+ als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
>>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
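The boundary check referenced above (the `IndexError` for indices outside `mots`) can also be exercised directly. A hedged sketch, assuming `AlignedSent` and `Alignment` are importable from `nltk.align` as in the surrounding doctests:

    from nltk.align import AlignedSent, Alignment

    als = AlignedSent(['the', 'house'], ['das', 'Haus'])
    als.alignment = Alignment([(0, 0), (1, 1)])   # both indices in range: accepted
    try:
        als.alignment = Alignment([(1, 4)])       # 4 is outside the two-word mots
    except IndexError as err:
        print(err)   # Alignment is outside boundary of mots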
@@ -72,36 +71,46 @@ EM for IBM Model 1
Here is an example from Koehn, 2010:
- >>> from nltk.align import IBMModel1 # doctest +SKIP
+ >>> from nltk.align import IBMModel1
>>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
... AlignedSent(['the', 'book'], ['das', 'Buch']),
- ... AlignedSent(['a', 'book'], ['ein', 'Buch'])] # doctest +SKIP
- >>> em_ibm1 = IBMModel1(corpus, 1e-3) # doctest +SKIP
- >>> print round(em_ibm1.probabilities['the', 'das'], 1) # doctest +SKIP
+ ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
+ >>> em_ibm1 = IBMModel1(corpus, 1e-3)
+ >>> print round(em_ibm1.probabilities['the', 'das'], 1)
1.0
- >>> print round(em_ibm1.probabilities['book', 'das'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'das'], 1)
0.0
- >>> print round(em_ibm1.probabilities['house', 'das'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['house', 'das'], 1)
0.0
- >>> print round(em_ibm1.probabilities['the', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['the', 'Buch'], 1)
0.0
- >>> print round(em_ibm1.probabilities['book', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'Buch'], 1)
1.0
- >>> print round(em_ibm1.probabilities['a', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['a', 'Buch'], 1)
0.0
- >>> print round(em_ibm1.probabilities['book', 'ein'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'ein'], 1)
0.0
- >>> print round(em_ibm1.probabilities['a', 'ein'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['a', 'ein'], 1)
1.0
- >>> print round(em_ibm1.probabilities['the', 'Haus'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['the', 'Haus'], 1)
0.0
- >>> print round(em_ibm1.probabilities['house', 'Haus'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['house', 'Haus'], 1)
1.0
+ >>> print round(em_ibm1.probabilities['book', None], 1)
+ 0.5
+ And using an NLTK corpus. We train on only the first 10 sentence pairs, since training is very slow:
+ >>> from nltk.corpus import comtrans
+ >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10])
+ >>> print round(com_ibm1.probabilities['bitte', 'Please'], 1)
+ 0.2
+ >>> print round(com_ibm1.probabilities['Sitzungsperiode', 'session'], 1)
+ 1.0
Get the alignments:
- >>> em_ibm1.aligned() # doctest: +SKIP
+ >>> em_ibm1.aligned() # doctest: +NORMALIZE_WHITESPACE
[AlignedSent(['the', 'house'], ['das', 'Haus'],
Alignment([(0, 0), (1, 1)])),
AlignedSent(['the', 'book'], ['das', 'Buch'],
......