Commit 481a8cec by Steven Bird

Merge pull request #765 from dimazest/wsd-tests

Make nltk.wsd.lesk() deterministic.
parents 1ec1680f 91af7526
...@@ -20,12 +20,13 @@ a Synset with the highest number of overlapping words between the context ...@@ -20,12 +20,13 @@ a Synset with the highest number of overlapping words between the context
sentence and different definitions from each Synset. sentence and different definitions from each Synset.
>>> from nltk.wsd import lesk >>> from nltk.wsd import lesk
>>> from nltk import word_tokenize >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
>>> sent = word_tokenize("I went to the bank to deposit money.")
>>> word = "bank" >>> print(lesk(sent, 'bank', 'n'))
>>> pos = "n" Synset('savings_bank.n.02')
>>> print(lesk(sent, word, pos))
Synset('depository_financial_institution.n.01') >>> print(lesk(sent, 'bank'))
Synset('savings_bank.n.02')
The definitions for "bank" are: The definitions for "bank" are:
......
...@@ -6,59 +6,41 @@ ...@@ -6,59 +6,41 @@
# URL: <http://nltk.org/> # URL: <http://nltk.org/>
# For license information, see LICENSE.TXT # For license information, see LICENSE.TXT
from nltk.corpus import wordnet as wn from nltk.corpus import wordnet
############################################################
# Lesk Algorithm
############################################################
def _compare_overlaps_greedy(context, synsets_signatures, pos=None): def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
""" """Return a synset for an ambiguous word.
Calculate overlaps between the context sentence and the synset_signature
and returns the synset with the highest overlap.
:param context: ``context_sentence`` The context sentence where the ambiguous word occurs. :param context_sentence: The context sentence where the ambiguous word occurs.
:param synsets_signatures: ``dictionary`` A list of words that 'signifies' the ambiguous word. :param ambiguous_word: The ambiguous word that requires WSD.
:param pos: ``pos`` A specified Part-of-Speech (POS). :param pos: A specified Part-of-Speech (POS).
:param iter synsets: Possible synsets of the ambiguous word.
:return: ``lesk_sense`` The Synset() object with the highest signature overlaps. :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
"""
max_overlaps = 0
lesk_sense = None
for ss in synsets_signatures:
if pos and str(ss.pos()) != pos: # Skips different POS.
continue
overlaps = set(synsets_signatures[ss]).intersection(context)
if len(overlaps) > max_overlaps:
lesk_sense = ss
max_overlaps = len(overlaps)
return lesk_sense
def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None): This function is an implementation of the original Lesk algorithm (1986) [1].
"""
This function is the implementation of the original Lesk algorithm (1986). Usage example::
It requires a dictionary which contains the definition of the different
sense of each word. See http://goo.gl/8TB15w
>>> from nltk import word_tokenize >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
>>> sent = word_tokenize("I went to the bank to deposit money.")
>>> word = "bank"
>>> pos = "n"
>>> lesk(sent, word, pos)
Synset('savings_bank.n.02') Synset('savings_bank.n.02')
:param context_sentence: The context sentence where the ambiguous word occurs. [1] Lesk, Michael. "Automatic sense disambiguation using machine readable
:param ambiguous_word: The ambiguous word that requires WSD. dictionaries: how to tell a pine cone from an ice cream cone." Proceedings
:param pos: A specified Part-of-Speech (POS). of the 5th annual international conference on Systems documentation. ACM,
:param dictionary: A list of words that 'signifies' the ambiguous word. 1986. http://dl.acm.org/citation.cfm?id=318728
:return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
""" """
if not dictionary: context = set(context_sentence)
dictionary = {} if not synsets:
for ss in wn.synsets(ambiguous_word): synsets = wordnet.synsets(ambiguous_word)
dictionary[ss] = ss.definition().split()
best_sense = _compare_overlaps_greedy(context_sentence, _, sense = max(
dictionary, pos) (len(context.intersection(ss.definition().split())), ss)
return best_sense for ss in synsets if pos is None or str(ss.pos()) == pos
)
return sense
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment