Merge pull request #765 from dimazest/wsd-tests

Make nltk.wsd.lesk() deterministic.

Merge pull request #765 from dimazest/wsd-tests
Make nltk.wsd.lesk() deterministic.
481a8cec · Steven Bird · 1ec1680f · 91af7526 · 481a8cec · 481a8cec
Commit 481a8cec authored Nov 03, 2014 by Steven Bird
Show whitespace changes
Inline Side-by-side

Showing with 33 additions and 50 deletions

nltk/test/wsd.doctest
+7 -6

nltk/wsd.py
+26 -44

No files found.
--- a/nltk/test/wsd.doctest
+++ b/nltk/test/wsd.doctest
@@ -20,12 +20,13 @@ a Synset with the highest number of overlapping words between the context
 sentence and different definitions from each Synset.
    >>> from nltk.wsd import lesk
-    >>> from nltk import word_tokenize
+    >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
-    >>> sent = word_tokenize("I went to the bank to deposit money.")
-    >>> word = "bank"
+    >>> print(lesk(sent, 'bank', 'n'))
-    >>> pos = "n"
+    Synset('savings_bank.n.02')
-    >>> print(lesk(sent, word, pos))
-    Synset('depository_financial_institution.n.01')
+    >>> print(lesk(sent, 'bank'))
+    Synset('savings_bank.n.02')
 The definitions for "bank" are:

--- a/nltk/wsd.py
+++ b/nltk/wsd.py
@@ -6,59 +6,41 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-from nltk.corpus import wordnet as wn
+from nltk.corpus import wordnet
-############################################################
-# Lesk Algorithm
-############################################################
-def _compare_overlaps_greedy(context, synsets_signatures, pos=None):
+def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
-    """
+    """Return a synset for an ambiguous word.
-    Calculate overlaps between the context sentence and the synset_signature
-    and returns the synset with the highest overlap.
-    :param context: ``context_sentence`` The context sentence where the ambiguous word occurs.
+    :param context_sentence: The context sentence where the ambiguous word occurs.
-    :param synsets_signatures: ``dictionary`` A list of words that 'signifies' the ambiguous word.
+    :param ambiguous_word: The ambiguous word that requires WSD.
-    :param pos: ``pos`` A specified Part-of-Speech (POS).
+    :param pos: A specified Part-of-Speech (POS).
+    :param iter synsets: Possible synsets of the ambiguous word.
    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
-    """
-    max_overlaps = 0
-    lesk_sense = None
-    for ss in synsets_signatures:
-        if pos and str(ss.pos()) != pos: # Skips different POS.
-            continue
-        overlaps = set(synsets_signatures[ss]).intersection(context)
-        if len(overlaps) > max_overlaps:
-            lesk_sense = ss
-            max_overlaps = len(overlaps)  
-    return lesk_sense
-def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
+    This function is an implementation of the original Lesk algorithm (1986) [1].
-    """
-    This function is the implementation of the original Lesk algorithm (1986).
+    Usage example::
-    It requires a dictionary which contains the definition of the different
-    sense of each word. See http://goo.gl/8TB15w
-        >>> from nltk import word_tokenize
+        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
-        >>> sent = word_tokenize("I went to the bank to deposit money.")
-        >>> word = "bank"
-        >>> pos = "n"
-        >>> lesk(sent, word, pos)
        Synset('savings_bank.n.02')
-    :param context_sentence: The context sentence where the ambiguous word occurs.
+    [1] Lesk, Michael. "Automatic sense disambiguation using machine readable
-    :param ambiguous_word: The ambiguous word that requires WSD.
+    dictionaries: how to tell a pine cone from an ice cream cone." Proceedings
-    :param pos: A specified Part-of-Speech (POS).
+    of the 5th annual international conference on Systems documentation. ACM,
-    :param dictionary: A list of words that 'signifies' the ambiguous word.
+    1986. http://dl.acm.org/citation.cfm?id=318728
-    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
    """
-    if not dictionary:
+    context = set(context_sentence)
-        dictionary = {}
+    if not synsets:
-        for ss in wn.synsets(ambiguous_word):
+        synsets = wordnet.synsets(ambiguous_word)
-            dictionary[ss] = ss.definition().split()
-    best_sense = _compare_overlaps_greedy(context_sentence,
+    _, sense = max(
-                                       dictionary, pos)
+        (len(context.intersection(ss.definition().split())), ss)
-    return best_sense
+        for ss in synsets if pos is None or str(ss.pos()) == pos
+    )
+    return sense
 if __name__ == "__main__":