Commit e9e0f7dd by Steven Bird

fixed doctests in align module

parent a1e4de67
...@@ -18,22 +18,19 @@ class AlignedSent(object): ...@@ -18,22 +18,19 @@ class AlignedSent(object):
Return an aligned sentence object, which encapsulates two sentences along with Return an aligned sentence object, which encapsulates two sentences along with
an ``Alignment`` between them. an ``Alignment`` between them.
.. doctest::
:options: +SKIP
>>> from nltk.align import AlignedSent >>> from nltk.align import AlignedSent
>>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
... ['the', 'house', 'is', 'small'], '1-3 2-4 3-2 4-1') ... ['the', 'house', 'is', 'small'], '0-2 1-3 2-1 3-0')
>>> algnsent.words >>> algnsent.words
['klein', 'ist', 'das', 'Haus'] ['klein', 'ist', 'das', 'Haus']
>>> algnsent.mots >>> algnsent.mots
['the', 'house', 'is', 'small'] ['the', 'house', 'is', 'small']
>>> algnsent.alignment >>> algnsent.alignment
Alignment([(1, 3), (2, 4), (3, 2), (4, 1)]) Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
>>> algnsent.precision('1-3 2-4 3-2 4-4') >>> algnsent.precision('0-2 1-3 2-1 3-3')
0.75 0.75
>>> from nltk.corpus import comtrans >>> from nltk.corpus import comtrans
>>> comtrans.aligned_sents()[54] >>> print comtrans.aligned_sents()[54]
<AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'> <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
>>> print comtrans.aligned_sents()[54].alignment >>> print comtrans.aligned_sents()[54].alignment
0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
...@@ -84,9 +81,9 @@ class AlignedSent(object): ...@@ -84,9 +81,9 @@ class AlignedSent(object):
:raise IndexError: if alignment is out of sentence boundary :raise IndexError: if alignment is out of sentence boundary
:rtype: boolean :rtype: boolean
""" """
if not all([0 <= p[0] <= len(self._words) for p in a]): if not all([0 <= p[0] < len(self._words) for p in a]):
raise IndexError("Alignment is outside boundary of words") raise IndexError("Alignment is outside boundary of words")
if not all([0 <= p[1] <= len(self._mots) for p in a]): if not all([0 <= p[1] < len(self._mots) for p in a]):
raise IndexError("Alignment is outside boundary of mots") raise IndexError("Alignment is outside boundary of mots")
return True return True
...@@ -216,19 +213,19 @@ class Alignment(frozenset): ...@@ -216,19 +213,19 @@ class Alignment(frozenset):
additional data, such as a boolean to indicate sure vs possible alignments). additional data, such as a boolean to indicate sure vs possible alignments).
>>> from nltk.align import Alignment >>> from nltk.align import Alignment
>>> a = Alignment([(1, 1), (1, 2), (2, 3), (3, 3)]) >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
>>> a.invert() >>> a.invert()
Alignment([(1, 1), (2, 1), (3, 2), (3, 3)]) Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
>>> print a.invert() >>> print a.invert()
1-1 2-1 3-2 3-3 0-0 1-0 2-1 2-2
>>> a[1] >>> a[0]
[(1, 2), (1, 1)] [(0, 1), (0, 0)]
>>> a.invert()[3] >>> a.invert()[3]
[(3, 2), (3, 3)] [(2, 1), (2, 2)]
>>> b = Alignment([(1, 1), (1, 2)]) >>> b = Alignment([(0, 0), (0, 1)])
>>> b.issubset(a) >>> b.issubset(a)
True True
>>> c = Alignment('1-1 1-2') >>> c = Alignment('0-0 0-1')
>>> b == c >>> b == c
True True
""" """
...@@ -395,7 +392,6 @@ class IBMModel1(object): ...@@ -395,7 +392,6 @@ class IBMModel1(object):
num_converged, num_probs, 100.0*num_converged/num_probs)) num_converged, num_probs, 100.0*num_converged/num_probs))
self.probabilities = dict(t) self.probabilities = dict(t)
return iteration_count
def aligned(self): def aligned(self):
""" """
...@@ -403,7 +399,7 @@ class IBMModel1(object): ...@@ -403,7 +399,7 @@ class IBMModel1(object):
IBM-Model 1. IBM-Model 1.
""" """
if self.probablities is None: if self.probabilities is None:
raise ValueError("No probabilities calculated") raise ValueError("No probabilities calculated")
aligned = [] aligned = []
......
...@@ -42,34 +42,26 @@ but they are easily inverted: ...@@ -42,34 +42,26 @@ but they are easily inverted:
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]))
We can also set new alignments, but these need to be in the correct range of We can create new alignments, but these need to be in the correct range of
the corresponding sentences: the corresponding sentences:
>>> from nltk import align >>> from nltk.align import Alignment, AlignedSent
>>> als.alignment = align.Alignment([(0, 0), (1, 4), (2, 1), (3, 3)]) >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'],
... ['Resumption', 'of', 'the', 'session'],
... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)]))
Traceback (most recent call last): Traceback (most recent call last):
... ...
IndexError: Alignment is outside boundary of mots IndexError: Alignment is outside boundary of mots
>>> als.alignment = align.Alignment([(-1, 0), (1, 2), (2, 1), (3, 3)])
Traceback (most recent call last): .. in Python 2.6 version, we will support:
... als.alignment = Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])
IndexError: Alignment is outside boundary of words
>>> als.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])
>>> als.alignment = align.Alignment([(1, 3), (3, 2), (0, 1), (2, 0)])
>>> als.alignment
Alignment([(0, 1), (1, 3), (2, 0), (3, 2)])
You can set alignments with any sequence of tuples, so long as the first two You can set alignments with any sequence of tuples, so long as the first two
indexes of the tuple are the alignment indices: indexes of the tuple are the alignment indices:
>>> als.alignment = [(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))] >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
>>> als.alignment
Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))]) Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
>>> als.alignment = ((0, 0), (1, 1), (2, 2), (3, 3))
>>> als.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])
Alignment Algorithms Alignment Algorithms
...@@ -80,11 +72,11 @@ EM for IBM Model 1 ...@@ -80,11 +72,11 @@ EM for IBM Model 1
Here is an example from Kohn, 2010: Here is an example from Kohn, 2010:
>>> corpus = [align.AlignedSent(['the', 'house'], ['das', 'Haus']), >>> from nltk.align import IBMModel1
... align.AlignedSent(['the', 'book'], ['das', 'Buch']), >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
... align.AlignedSent(['a', 'book'], ['ein', 'Buch'])] ... AlignedSent(['the', 'book'], ['das', 'Buch']),
>>> em_ibm1 = align.EMIBMModel1(corpus, 1e-3) ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
>>> iterations = em_ibm1.train() >>> em_ibm1 = IBMModel1(corpus, 1e-3)
>>> print round(em_ibm1.probabilities['the', 'das'], 1) >>> print round(em_ibm1.probabilities['the', 'das'], 1)
1.0 1.0
>>> print round(em_ibm1.probabilities['book', 'das'], 1) >>> print round(em_ibm1.probabilities['book', 'das'], 1)
...@@ -132,7 +124,7 @@ This then gives us a very clean form for defining our evaluation metrics. ...@@ -132,7 +124,7 @@ This then gives us a very clean form for defining our evaluation metrics.
Consider the following aligned sentence for evaluation: Consider the following aligned sentence for evaluation:
>>> my_als = align.AlignedSent(['Resumption', 'of', 'the', 'session'], >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'],
... ['Reprise', 'de', 'la', 'session'], ... ['Reprise', 'de', 'la', 'session'],
... [(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)]) ... [(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment