Commit b912a222 by Peter Ljunglöf

Minor optimizations to align.py, and doctest fixes

parent 4722f1d5
@@ -300,11 +300,16 @@ class IBMModel1(object):
- Stage 2: Generates updated word alignments for the sentence pairs, based
on the translation probabilities from Stage 1.
.. doctest::
- >> from nltk.corpus import comtrans
- >> from nltk.align import IBMModel1
- >> ibm1 = IBMModel1(comtrans.aligned_sents())
+ >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
+ ... AlignedSent(['the', 'book'], ['das', 'Buch']),
+ ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
+ >>> ibm1 = IBMModel1(corpus)
+ >>> print "%.1f" % ibm1.probabilities['book', 'Buch']
+ 1.0
+ >>> print "%.1f" % ibm1.probabilities['book', 'das']
+ 0.0
+ >>> print "%.1f" % ibm1.probabilities['book', None]
+ 0.5
:param aligned_sents: The parallel text ``corpus.Iterable`` containing
AlignedSent instances of aligned sentence pairs from the corpus.
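For orientation while reading the diff, the EM procedure these doctests exercise can be sketched compactly, following Koehn (2010). This is a hedged, self-contained sketch: `ibm1_em` and its variable names are illustrative, not NLTK's API, and it runs a fixed number of iterations instead of the convergence test used by `_train` below.

    from collections import defaultdict

    def ibm1_em(bitext, n_iter=10):
        # bitext: list of (english_words, foreign_words) sentence pairs.
        e_vocab = set(e for es, _ in bitext for e in es)
        t = defaultdict(lambda: 1.0 / len(e_vocab))   # uniform t(e|f) start
        for _ in range(n_iter):
            count = defaultdict(float)                # expected counts c(e, f)
            total = defaultdict(float)                # expected counts c(f)
            for es, fs in bitext:
                fs = list(fs) + [None]                # NULL token, as in _train
                for e in es:
                    norm = sum(t[e, f] for f in fs)   # per-word normalization
                    for f in fs:
                        frac = t[e, f] / norm         # fractional count for (e, f)
                        count[e, f] += frac
                        total[f] += frac
            for (e, f), c in count.items():           # M-step: re-estimate t(e|f)
                t[e, f] = c / total[f]
        return t

On the three-sentence toy corpus above, this drives t('book'|'Buch') toward 1.0, matching the doctest.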
@@ -326,9 +331,9 @@ class IBMModel1(object):
def _train(self):
"""
Perform Expectation Maximization training to learn
- word-to-word translation probabilities, and return
- the number of iterations that were required for convergence.
+ word-to-word translation probabilities.
"""
logging.debug("Starting training")
# Collect up sets of all English and foreign words
english_words = set()
@@ -338,15 +343,13 @@
foreign_words.update(aligned_sent.mots)
# add the NULL token to the foreign word set.
foreign_words.add(None)
- num_probs = len(english_words)*len(foreign_words)
+ num_probs = len(english_words) * len(foreign_words)
# Initialise t(e|f) uniformly
- t = defaultdict(lambda: float(1)/len(english_words))
- s_total = defaultdict(float)
- for e in english_words:
-     for f in foreign_words:
-         z = t[e,f]
+ default_prob = 1.0 / len(english_words)
+ t = defaultdict(lambda: default_prob)
+ convergent_threshold = self.convergent_threshold
globally_converged = False
iteration_count = 0
while not globally_converged:
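The point of the change above: `defaultdict` invokes its factory on every missing key, so the old lambda re-ran `len(english_words)` (plus a `float()` call) per miss, and the deleted warm-up loop eagerly materialized all `len(english_words) * len(foreign_words)` entries just by touching them. A rough illustration of the shape of the saving (hypothetical micro-benchmark, not from the commit):

    from collections import defaultdict
    from timeit import timeit

    english_words = set(range(10000))

    # Old: len() and float() run on every defaultdict miss.
    slow = defaultdict(lambda: float(1) / len(english_words))
    # New: the constant is computed once; each miss just returns it.
    default_prob = 1.0 / len(english_words)
    fast = defaultdict(lambda: default_prob)

    # First access of each key triggers the factory exactly once.
    print(timeit(lambda: [slow[i] for i in range(10000)], number=1))
    print(timeit(lambda: [fast[i] for i in range(10000)], number=1))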
@@ -356,6 +359,7 @@
total = defaultdict(float)
for aligned_sent in self.aligned_sents:
+ s_total = {}
# Compute normalization
for e_w in aligned_sent.words:
s_total[e_w] = 0.0
@@ -375,7 +379,7 @@
for e_w in english_words:
new_prob = count[e_w, f_w] / total[f_w]
delta = abs(t[e_w, f_w] - new_prob)
- if delta < self.convergent_threshold:
+ if delta < convergent_threshold:
num_converged += 1
t[e_w, f_w] = new_prob
@@ -383,8 +387,8 @@
iteration_count += 1
if num_converged == num_probs:
globally_converged = True
logging.debug("%d/%d (%.2f%%) converged"%(
num_converged, num_probs, 100.0*num_converged/num_probs))
logging.debug("%d/%d (%.2f%%) converged" %
(num_converged, num_probs, 100.0*num_converged/num_probs))
self.probabilities = dict(t)
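Read together, the loop above stops only when every one of the `num_probs` parameters moved by less than the threshold in an iteration. A simplified restatement of that test (our helper name, not NLTK code; assumes both dicts share the same keys):

    def has_converged(old_t, new_t, threshold=0.01):
        # True iff no t(e|f) moved by threshold or more this iteration,
        # mirroring the num_converged == num_probs check in _train above.
        return all(abs(new_t[k] - old_t[k]) < threshold for k in new_t)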
......
@@ -53,13 +53,12 @@ the corresponding sentences:
...
IndexError: Alignment is outside boundary of mots
- .. in Python 2.6 version, we will support:
- als.alignment = Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])
+ You can set alignments with any sequence of tuples, so long as the first two
+ indexes of the tuple are the alignment indices:
+ als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
>>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
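The boundary check referenced above (the `IndexError` for indices outside `mots`) can also be exercised directly. A hedged sketch, assuming `AlignedSent` and `Alignment` are importable from `nltk.align` as in the surrounding doctests:

    from nltk.align import AlignedSent, Alignment

    als = AlignedSent(['the', 'house'], ['das', 'Haus'])
    als.alignment = Alignment([(0, 0), (1, 1)])   # both indices in range: accepted
    try:
        als.alignment = Alignment([(1, 4)])       # 4 is outside the two-word mots
    except IndexError as err:
        print(err)   # Alignment is outside boundary of mots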
@@ -72,36 +71,46 @@ EM for IBM Model 1
Here is an example from Koehn, 2010:
- >>> from nltk.align import IBMModel1 # doctest +SKIP
+ >>> from nltk.align import IBMModel1
>>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
... AlignedSent(['the', 'book'], ['das', 'Buch']),
- ... AlignedSent(['a', 'book'], ['ein', 'Buch'])] # doctest +SKIP
- >>> em_ibm1 = IBMModel1(corpus, 1e-3) # doctest +SKIP
- >>> print round(em_ibm1.probabilities['the', 'das'], 1) # doctest +SKIP
+ ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
+ >>> em_ibm1 = IBMModel1(corpus, 1e-3)
+ >>> print round(em_ibm1.probabilities['the', 'das'], 1)
1.0
- >>> print round(em_ibm1.probabilities['book', 'das'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'das'], 1)
0.0
- >>> print round(em_ibm1.probabilities['house', 'das'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['house', 'das'], 1)
0.0
- >>> print round(em_ibm1.probabilities['the', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['the', 'Buch'], 1)
0.0
- >>> print round(em_ibm1.probabilities['book', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'Buch'], 1)
1.0
- >>> print round(em_ibm1.probabilities['a', 'Buch'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['a', 'Buch'], 1)
0.0
- >>> print round(em_ibm1.probabilities['book', 'ein'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['book', 'ein'], 1)
0.0
- >>> print round(em_ibm1.probabilities['a', 'ein'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['a', 'ein'], 1)
1.0
- >>> print round(em_ibm1.probabilities['the', 'Haus'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['the', 'Haus'], 1)
0.0
- >>> print round(em_ibm1.probabilities['house', 'Haus'], 1) # doctest +SKIP
+ >>> print round(em_ibm1.probabilities['house', 'Haus'], 1)
1.0
+ >>> print round(em_ibm1.probabilities['book', None], 1)
+ 0.5
+ And using an NLTK corpus. We train on only the first 10 sentence pairs, since training is very slow:
+ >>> from nltk.corpus import comtrans
+ >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10])
+ >>> print round(com_ibm1.probabilities['bitte', 'Please'], 1)
+ 0.2
+ >>> print round(com_ibm1.probabilities['Sitzungsperiode', 'session'], 1)
+ 1.0
Get the alignments:
- >>> em_ibm1.aligned() # doctest: +SKIP
+ >>> em_ibm1.aligned() # doctest: +NORMALIZE_WHITESPACE
[AlignedSent(['the', 'house'], ['das', 'Haus'],
Alignment([(0, 0), (1, 1)])),
AlignedSent(['the', 'book'], ['das', 'Buch'],
......