Commit f157380d by Steven Bird

Bugfix plus doctests

parent cd7b1022
# Natural Language Toolkit: Aligned Sentences # Natural Language Toolkit: Aligned Sentences
# #
# Copyright (C) 2001-2012 NLTK Project # Copyright (C) 2001-2012 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://www.nltk.org/> # URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT # For license information, see LICENSE.TXT
...@@ -8,12 +11,28 @@ import sys ...@@ -8,12 +11,28 @@ import sys
import logging import logging
from collections import defaultdict from collections import defaultdict
import nltk.metrics from nltk.metrics import precision, recall
class AlignedSent(object): class AlignedSent(object):
""" """
Aligned sentence object. Encapsulates two sentences along with Return an aligned sentence object, which encapsulates two sentences along with
an ``Alignment`` between them. an ``Alignment`` between them.
>>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
... ['the', 'house', 'is', 'small'], '1-3 2-4 3-2 4-1')
>>> algnsent.words
['klein', 'ist', 'das', 'Haus']
>>> algnsent.mots
['the', 'house', 'is', 'small']
>>> algnsent.alignment
Alignment([(1, 3), (2, 4), (3, 2), (4, 1)])
>>> algnsent.precision('1-3 2-4 3-2 4-4')
0.75
>>> from nltk.corpus import comtrans
>>> comtrans.aligned_sents()[54]
<AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
>>> print comtrans.aligned_sents()[54].alignment
0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
:param words: source language words :param words: source language words
:type words: list(str) :type words: list(str)
...@@ -55,27 +74,30 @@ class AlignedSent(object): ...@@ -55,27 +74,30 @@ class AlignedSent(object):
def _check_align(self, a): def _check_align(self, a):
""" """
Check whether the alignments are legal.
:param a: alignment to be checked :param a: alignment to be checked
:raise IndexError: if alignment is out of sentence boundary :raise IndexError: if alignment is out of sentence boundary
:return: True if passed alignment check
:rtype: boolean :rtype: boolean
""" """
if not all([0 <= p[0] < len(self._words) for p in a]): if not all([0 <= p[0] <= len(self._words) for p in a]):
raise IndexError("Alignment is outside boundary of words") raise IndexError("Alignment is outside boundary of words")
if not all([0 <= p[1] < len(self._mots) for p in a]): if not all([0 <= p[1] <= len(self._mots) for p in a]):
raise IndexError("Alignment is outside boundary of mots") raise IndexError("Alignment is outside boundary of mots")
return True return True
def __repr__(self): def __repr__(self):
""" """
:return: A string representation for this ``AlignedSent``. Return a string representation for this ``AlignedSent``.
:rtype: str :rtype: str
""" """
return "AlignedSent(%r, %r, %r)" % (self._words, self._mots, self._alignment) return "AlignedSent(%r, %r, %r)" % (self._words, self._mots, self._alignment)
def __str__(self): def __str__(self):
""" """
:return: A string representation for this ``AlignedSent``. Return a human-readable string representation for this ``AlignedSent``.
:rtype: str :rtype: str
""" """
source = " ".join(self._words)[:20] + "..." source = " ".join(self._words)[:20] + "..."
...@@ -84,46 +106,48 @@ class AlignedSent(object): ...@@ -84,46 +106,48 @@ class AlignedSent(object):
def invert(self): def invert(self):
""" """
:return: the invert object Return the aligned sentence pair, reversing the directionality
:rtype: AlignedSent :rtype: AlignedSent
""" """
return AlignedSent(self._mots, self._words, return AlignedSent(self._mots, self._words,
self._alignment.invert()) self._alignment.invert())
def precision(self, reference): def precision(self, reference):
"""Calculates the precision of an aligned sentence with respect to a """
Return the precision of an aligned sentence with respect to a
"gold standard" reference ``AlignedSent``. "gold standard" reference ``AlignedSent``.
The "possible" precision is used since it doesn't penalise for finding
an alignment that was marked as "possible".
:type reference: AlignedSent or Alignment :type reference: AlignedSent or Alignment
:param reference: A "gold standard" reference aligned sentence. :param reference: A "gold standard" reference aligned sentence.
:rtype: float or None :rtype: float or None
""" """
# Get alignments in set of 2-tuples form # Get alignments in set of 2-tuples form
# The "possible" precision is used since it doesn't penalize for finding
# an alignment that was marked as "possible" (NAACL corpus)
align = self.alignment align = self.alignment
if isinstance(reference, AlignedSent): if isinstance(reference, AlignedSent):
possible = reference.alignment possible = reference.alignment
else: else:
possible = Alignment(reference) possible = Alignment(reference)
# Call NLTKs existing functions for precision return precision(possible, align)
return nltk.metrics.scores.precision(possible, align)
def recall(self, reference): def recall(self, reference):
"""Calculates the recall of an aligned sentence with respect to a """
Return the recall of an aligned sentence with respect to a
"gold standard" reference ``AlignedSent``. "gold standard" reference ``AlignedSent``.
The "sure" recall is used so we don't penalise for missing an
alignment that was only marked as "possible".
:type reference: AlignedSent or Alignment :type reference: AlignedSent or Alignment
:param reference: A "gold standard" reference aligned sentence. :param reference: A "gold standard" reference aligned sentence.
:rtype: float or None :rtype: float or None
""" """
# Get alignments in set of 2-tuples form # Get alignments in set of 2-tuples form
# The "sure" recall is used so we don't penalize for missing an
# alignment that was only marked as "possible".
align = self.alignment align = self.alignment
if isinstance(reference, AlignedSent): if isinstance(reference, AlignedSent):
sure = reference.alignment sure = reference.alignment
...@@ -131,11 +155,12 @@ class AlignedSent(object): ...@@ -131,11 +155,12 @@ class AlignedSent(object):
sure = Alignment(reference) sure = Alignment(reference)
# Call NLTKs existing functions for recall # Call NLTKs existing functions for recall
return nltk.metrics.scores.recall(sure, align) return recall(sure, align)
def alignment_error_rate(self, reference, possible=None): def alignment_error_rate(self, reference, possible=None):
"""Calculates the Alignment Error Rate (AER) of an aligned sentence """
Return the Alignment Error Rate (AER) of an aligned sentence
with respect to a "gold standard" reference ``AlignedSent``. with respect to a "gold standard" reference ``AlignedSent``.
Return an error rate between 0.0 (perfect alignment) and 1.0 (no Return an error rate between 0.0 (perfect alignment) and 1.0 (no
...@@ -188,6 +213,8 @@ class Alignment(frozenset): ...@@ -188,6 +213,8 @@ class Alignment(frozenset):
>>> a = Alignment([(1, 1), (1, 2), (2, 3), (3, 3)]) >>> a = Alignment([(1, 1), (1, 2), (2, 3), (3, 3)])
>>> a.invert() >>> a.invert()
Alignment([(1, 1), (2, 1), (3, 2), (3, 3)]) Alignment([(1, 1), (2, 1), (3, 2), (3, 3)])
>>> print a.invert()
1-1 2-1 3-2 3-3
>>> a[1] >>> a[1]
[(1, 2), (1, 1)] [(1, 2), (1, 1)]
>>> a.invert()[3] >>> a.invert()[3]
...@@ -195,6 +222,9 @@ class Alignment(frozenset): ...@@ -195,6 +222,9 @@ class Alignment(frozenset):
>>> b = Alignment([(1, 1), (1, 2)]) >>> b = Alignment([(1, 1), (1, 2)])
>>> b.issubset(a) >>> b.issubset(a)
True True
>>> c = Alignment('1-1 1-2')
>>> b == c
True
""" """
def __new__(cls, string_or_pairs): def __new__(cls, string_or_pairs):
...@@ -258,45 +288,45 @@ class Alignment(frozenset): ...@@ -258,45 +288,45 @@ class Alignment(frozenset):
self._index[p[0]].append(p) self._index[p[0]].append(p)
class EMIBMModel1(object): class IBMModel1(object):
''' """
This class contains implementations of the Expectation Maximization This class implements the Expectation Maximization algorithm for
algorithm for IBM Model 1. The algorithm runs upon a sentence-aligned IBM Model 1. The algorithm runs upon a sentence-aligned parallel
parallel corpus and generates word alignments in aligned sentence pairs. corpus and generates word alignments in aligned sentence pairs.
The process is divided into 2 stages:
The process is divided into 2 main stages.
Stage 1: Studies word-to-word translation probabilities by collecting - Stage 1: Calculates word-to-word translation probabilities by collecting
evidence of an English word been the translation of a foreign word from evidence of an English word being the translation of a foreign word from
the parallel corpus. the parallel corpus.
- Stage 2: Generates updated word alignments for the sentence pairs, based
Stage 2: Based on the translation probabilities from Stage 1, generates on the translation probabilities from Stage 1.
word alignments for aligned sentence pairs.
''' >>> from nltk.corpus import comtrans
>>> ibm1 = IBMModel1(comtrans.aligned_sents())
:param aligned_sents: The parallel text ``corpus.Iterable`` containing
AlignedSent instances of aligned sentence pairs from the corpus.
:type aligned_sents: list(AlignedSent)
:param convergent_threshold: The threshold value of convergence. An
entry is considered converged if the delta from ``old_t`` to ``new_t``
is less than this value. The algorithm terminates when all entries
are converged. This parameter is optional, default is 0.01
:type convergent_threshold: float
"""
def __init__(self, aligned_sents, convergent_threshold=1e-2, debug=False): def __init__(self, aligned_sents, convergent_threshold=1e-2, debug=False):
'''
Initialize a new ``EMIBMModel1``.
:param aligned_sents: The parallel text ``corpus.Iterable`` containing
AlignedSent instances of aligned sentence pairs from the corpus.
:type aligned_sents: list(AlignedSent)
:param convergent_threshold: The threshold value of convergence. An
entry is considered converged if the delta from ``old_t`` to ``new_t``
is less than this value. The algorithm terminates when all entries
are converged. This parameter is optional, default is 0.01
:type convergent_threshold: float
'''
self.aligned_sents = aligned_sents self.aligned_sents = aligned_sents
self.convergent_threshold = convergent_threshold self.convergent_threshold = convergent_threshold
# Dictionary of translation probabilities t(e,f). # Dictionary of translation probabilities t(e,f).
self.probabilities = None self.probabilities = None
self._train()
def train(self): def _train(self):
''' """
Perform Expectation Maximization training to learn Perform Expectation Maximization training to learn
word-to-word translation probabilities, and return word-to-word translation probabilities, and return
the number of iterations that were required for convergence. the number of iterations that were required for convergence.
''' """
# Collect up sets of all English and foreign words # Collect up sets of all English and foreign words
english_words = set() english_words = set()
...@@ -358,10 +388,11 @@ class EMIBMModel1(object): ...@@ -358,10 +388,11 @@ class EMIBMModel1(object):
return iteration_count return iteration_count
def aligned(self): def aligned(self):
''' """
Returns a list of AlignedSents with Alignments calculated using Return a list of AlignedSents with Alignments calculated using
IBM-Model 1. IBM-Model 1.
''' """
if self.probablities is None: if self.probablities is None:
raise ValueError("No probabilities calculated") raise ValueError("No probabilities calculated")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment