Commit aca50bcc by Steven Bird

WSD doesn't belong inside the sem package; may be a whole package on its own someday

parent 0f896856
...@@ -165,7 +165,7 @@ else: ...@@ -165,7 +165,7 @@ else:
from nltk import align, ccg, chunk, classify, collocations from nltk import align, ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem from nltk import misc, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, tree, treetransforms, util from nltk import tag, tbl, text, tokenize, tree, treetransforms, util
# override any accidentally imported demo # override any accidentally imported demo
......
# Natural Language Toolkit: Lesk Algorithm # Natural Language Toolkit: Word Sense Disambiguation Algorithms
# #
# Author: Liling Tan <alvations@gmail.com> # Author: Liling Tan <alvations@gmail.com>
# #
...@@ -6,10 +6,13 @@ ...@@ -6,10 +6,13 @@
# URL: <http://nltk.org/> # URL: <http://nltk.org/>
# For license information, see LICENSE.TXT # For license information, see LICENSE.TXT
from nltk import word_tokenize
from nltk.corpus import wordnet as wn from nltk.corpus import wordnet as wn
def compare_overlaps_greedy(context, synsets_signatures, pos=None): ############################################################
# Lesk Algorithm
############################################################
def _compare_overlaps_greedy(context, synsets_signatures, pos=None):
""" """
Calculate overlaps between the context sentence and the synset_signature Calculate overlaps between the context sentence and the synset_signature
and returns the synset with the highest overlap. and returns the synset with the highest overlap.
...@@ -30,11 +33,18 @@ def compare_overlaps_greedy(context, synsets_signatures, pos=None): ...@@ -30,11 +33,18 @@ def compare_overlaps_greedy(context, synsets_signatures, pos=None):
max_overlaps = len(overlaps) max_overlaps = len(overlaps)
return lesk_sense return lesk_sense
def wsd(context_sentence, ambiguous_word, pos=None, dictionary=None): def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
""" """
This function is the implementation of the original Lesk algorithm (1986). This function is the implementation of the original Lesk algorithm (1986).
It requires a dictionary which contains the definition of the different It requires a dictionary which contains the definition of the different
sense of each word. See http://goo.gl/8TB15w sense of each word. See http://goo.gl/8TB15w
>>> from nltk import word_tokenize
>>> sent = word_tokenize("I went to the bank to deposit money.")
>>> word = "bank"
>>> pos = "n"
>>> lesk(sent, word, pos)
Synset('depository_financial_institution.n.01')
:param context_sentence: The context sentence where the ambiguous word occurs. :param context_sentence: The context sentence where the ambiguous word occurs.
:param ambiguous_word: The ambiguous word that requires WSD. :param ambiguous_word: The ambiguous word that requires WSD.
...@@ -46,16 +56,11 @@ def wsd(context_sentence, ambiguous_word, pos=None, dictionary=None): ...@@ -46,16 +56,11 @@ def wsd(context_sentence, ambiguous_word, pos=None, dictionary=None):
dictionary = {} dictionary = {}
for ss in wn.synsets(ambiguous_word): for ss in wn.synsets(ambiguous_word):
dictionary[ss] = ss.definition().split() dictionary[ss] = ss.definition().split()
best_sense = compare_overlaps_greedy(word_tokenize(context_sentence), \ best_sense = _compare_overlaps_greedy(context_sentence, \
dictionary, pos) dictionary, pos)
return best_sense return best_sense
# Script entry point: run this module's doctests.
#
# This replaces the old demo() function (which printed one hard-coded
# example via the removed wsd() name) with the standard doctest runner,
# so the usage example embedded in the lesk() docstring is executed and
# verified directly.  NORMALIZE_WHITESPACE makes the output comparison
# tolerant of incidental whitespace/formatting differences.
if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.