Commit 481a8cec by Steven Bird

Merge pull request #765 from dimazest/wsd-tests

Make nltk.wsd.lesk() deterministic.
parents 1ec1680f 91af7526
...@@ -13,19 +13,20 @@ Lesk Algorithm ...@@ -13,19 +13,20 @@ Lesk Algorithm
Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using
the definitions of the ambiguous word. the definitions of the ambiguous word.
Given an ambiguous word and the context in which the word occurs, Lesk returns Given an ambiguous word and the context in which the word occurs, Lesk returns
a Synset with the highest number of overlapping words between the context a Synset with the highest number of overlapping words between the context
sentence and different definitions from each Synset. sentence and different definitions from each Synset.
>>> from nltk.wsd import lesk >>> from nltk.wsd import lesk
>>> from nltk import word_tokenize >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
>>> sent = word_tokenize("I went to the bank to deposit money.")
>>> word = "bank" >>> print(lesk(sent, 'bank', 'n'))
>>> pos = "n" Synset('savings_bank.n.02')
>>> print(lesk(sent, word, pos))
Synset('depository_financial_institution.n.01') >>> print(lesk(sent, 'bank'))
Synset('savings_bank.n.02')
The definitions for "bank" are: The definitions for "bank" are:
......
...@@ -6,59 +6,41 @@ ...@@ -6,59 +6,41 @@
# URL: <http://nltk.org/> # URL: <http://nltk.org/>
# For license information, see LICENSE.TXT # For license information, see LICENSE.TXT
from nltk.corpus import wordnet as wn from nltk.corpus import wordnet
############################################################
# Lesk Algorithm
############################################################
def _compare_overlaps_greedy(context, synsets_signatures, pos=None): def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
""" """Return a synset for an ambiguous word.
Calculate overlaps between the context sentence and the synset_signature
and returns the synset with the highest overlap.
:param context: ``context_sentence`` The context sentence where the ambiguous word occurs.
:param synsets_signatures: ``dictionary`` A list of words that 'signifies' the ambiguous word.
:param pos: ``pos`` A specified Part-of-Speech (POS).
:return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
"""
max_overlaps = 0
lesk_sense = None
for ss in synsets_signatures:
if pos and str(ss.pos()) != pos: # Skips different POS.
continue
overlaps = set(synsets_signatures[ss]).intersection(context)
if len(overlaps) > max_overlaps:
lesk_sense = ss
max_overlaps = len(overlaps)
return lesk_sense
def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
"""
This function is the implementation of the original Lesk algorithm (1986).
It requires a dictionary which contains the definition of the different
sense of each word. See http://goo.gl/8TB15w
>>> from nltk import word_tokenize
>>> sent = word_tokenize("I went to the bank to deposit money.")
>>> word = "bank"
>>> pos = "n"
>>> lesk(sent, word, pos)
Synset('savings_bank.n.02')
:param context_sentence: The context sentence where the ambiguous word occurs. :param context_sentence: The context sentence where the ambiguous word occurs.
:param ambiguous_word: The ambiguous word that requires WSD. :param ambiguous_word: The ambiguous word that requires WSD.
:param pos: A specified Part-of-Speech (POS). :param pos: A specified Part-of-Speech (POS).
:param dictionary: A list of words that 'signifies' the ambiguous word. :param iter synsets: Possible synsets of the ambiguous word.
:return: ``lesk_sense`` The Synset() object with the highest signature overlaps. :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
This function is an implementation of the original Lesk algorithm (1986) [1].
Usage example::
>>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
Synset('savings_bank.n.02')
[1] Lesk, Michael. "Automatic sense disambiguation using machine readable
dictionaries: how to tell a pine cone from an ice cream cone." Proceedings
of the 5th annual international conference on Systems documentation. ACM,
1986. http://dl.acm.org/citation.cfm?id=318728
""" """
if not dictionary: context = set(context_sentence)
dictionary = {} if not synsets:
for ss in wn.synsets(ambiguous_word): synsets = wordnet.synsets(ambiguous_word)
dictionary[ss] = ss.definition().split()
best_sense = _compare_overlaps_greedy(context_sentence, _, sense = max(
dictionary, pos) (len(context.intersection(ss.definition().split())), ss)
return best_sense for ss in synsets if pos is None or str(ss.pos()) == pos
)
return sense
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment