Commit e2090e42 by Steven Bird

Merge pull request #669 from jskda/pull_request.505

Fixes to omw wordnet
parents 24e257a6 916b0be3
@@ -206,8 +206,7 @@ webtext = LazyCorpusLoader(
     'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2')
 wordnet = LazyCorpusLoader(
     'wordnet', WordNetCorpusReader,
-    LazyCorpusLoader('omw', CorpusReader, r'(?!\.).*\.tab')
-)
+    LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'))
 wordnet_ic = LazyCorpusLoader(
     'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
......
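Note on the loader change above: the OMW fileid pattern now matches the per-language data files the corpus actually ships (the regex r'.*/wn-data-.*\.tab' implies paths such as por/wn-data-por.tab, an inference from the pattern itself), and reading them as UTF-8 avoids the mojibake visible in the old doctests further down. A minimal usage sketch, assuming the wordnet and omw corpora have been downloaded:

    from nltk.corpus import wordnet as wn

    # The nested LazyCorpusLoader hands the OMW .tab files to the
    # WordNet reader, enabling language-tagged lookups:
    print(wn.synset('dog.n.01').lemma_names('ita'))  # ['cane', 'Canis_familiaris']
    print(wn.lemmas('cane', lang='ita')[0])          # Lemma('dog.n.01.cane')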
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: WordNet
 #
 # Copyright (C) 2001-2014 NLTK Project
@@ -1815,65 +1816,6 @@ def lin_similarity(synset1, synset2, ic, verbose=False):
 lin_similarity.__doc__ = Synset.lin_similarity.__doc__
-
-
-def _lcs_by_depth(synset1, synset2, verbose=False):
-    """
-    Finds the least common subsumer of two synsets in a WordNet taxonomy,
-    where the least common subsumer is defined as the ancestor node common
-    to both input synsets whose shortest path to the root node is the longest.
-
-    :type synset1: Synset
-    :param synset1: First input synset.
-    :type synset2: Synset
-    :param synset2: Second input synset.
-    :return: The ancestor synset common to both input synsets which is also the
-        LCS.
-    """
-    subsumer = None
-    max_min_path_length = -1
-
-    subsumers = synset1.common_hypernyms(synset2)
-
-    if verbose:
-        print("> Subsumers1:", subsumers)
-
-    # Eliminate those synsets which are ancestors of other synsets in the
-    # set of subsumers.
-    eliminated = set()
-    hypernym_relation = lambda s: s.hypernyms() + s.instance_hypernyms()
-    for s1 in subsumers:
-        for s2 in subsumers:
-            if s2 in s1.closure(hypernym_relation):
-                eliminated.add(s2)
-    if verbose:
-        print("> Eliminated:", eliminated)
-
-    subsumers = [s for s in subsumers if s not in eliminated]
-
-    if verbose:
-        print("> Subsumers2:", subsumers)
-
-    # Calculate the length of the shortest path to the root for each
-    # subsumer. Select the subsumer with the longest of these.
-    for candidate in subsumers:
-        paths_to_root = candidate.hypernym_paths()
-        min_path_length = -1
-
-        for path in paths_to_root:
-            if min_path_length < 0 or len(path) < min_path_length:
-                min_path_length = len(path)
-
-        if min_path_length > max_min_path_length:
-            max_min_path_length = min_path_length
-            subsumer = candidate
-
-    if verbose:
-        print("> LCS Subsumer by depth:", subsumer)
-    return subsumer
 def _lcs_ic(synset1, synset2, ic, verbose=False):
     """
     Get the information content of the least common subsumer that has
......
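The removed _lcs_by_depth helper computed the least common subsumer by pruning common hypernyms that were ancestors of other common hypernyms and keeping the deepest survivor. A minimal sketch of getting the same information through the public Synset API (assuming Synset.lowest_common_hypernyms is the surviving entry point; the expected output is an assumption based on the standard WordNet taxonomy):

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')
    cat = wn.synset('cat.n.01')
    # Deepest ancestor(s) shared by both synsets in the hypernym taxonomy:
    print(dog.lowest_common_hypernyms(cat))  # [Synset('carnivore.n.01')]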
@@ -47,29 +47,30 @@ A synset is identified with a 3-part name of the form: word.pos.nn:
 The WordNet corpus reader gives access to the Open Multilingual
 WordNet, using ISO-639 language codes.

-    >>> wn.langs()
-    [u'als', u'arb', u'cat', u'cmn', u'dan', u'eng', u'eus', u'fas',
-    u'fin', u'fre', u'glg', u'heb', u'ind', u'ita', u'jpn', u'nno',
-    u'nob', u'pol', u'por', u'spa', u'tha', u'zsm']
+    >>> sorted(wn.langs())
+    ['als', 'arb', 'cat', 'cmn', 'dan', 'eng', 'eus', 'fas',
+    'fin', 'fra', 'fre', 'glg', 'heb', 'ind', 'ita', 'jpn', 'nno',
+    'nob', 'pol', 'por', 'spa', 'tha', 'zsm']

-    >>> wn.synsets('犬', lang='jpn')
+    >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
     [Synset('dog.n.01'), Synset('spy.n.01')]

     >>> wn.synset('spy.n.01').lemma_names('jpn')
-    ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', '工作員', '廻し者', '廻者', '探', '探り',
-    '犬', '秘密捜査員', '諜報員', '諜者', '間者', '間諜', '隠密']
+    ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
+    '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
+    '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
+    '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']

     >>> wn.synset('dog.n.01').lemma_names('ita')
     ['cane', 'Canis_familiaris']
     >>> wn.lemmas('cane', lang='ita')
     [Lemma('dog.n.01.cane'), Lemma('hammer.n.01.cane'), Lemma('cramp.n.02.cane'),
     Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')]

-    >>> wn.synset('dog.n.01').lemmas(['dan', 'por'])
-    defaultdict(<class 'list'>, {'dan': [Lemma('dog.n.01.hund'),
-    Lemma('dog.n.01.køter'), Lemma('dog.n.01.vovhund'),
-    Lemma('dog.n.01.vovse')], 'por': [Lemma('dog.n.01.cachorro'),
-    Lemma('dog.n.01.cão'), Lemma('dog.n.01.cão'),
-    Lemma('dog.n.01.cães')]}
+    >>> sorted(wn.synset('dog.n.01').lemmas(['dan', 'por']).items())
+    [('dan', [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
+    Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]),
+    ('por', [Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.c\xe3o'),
+    Lemma('dog.n.01.c\xe3o'), Lemma('dog.n.01.c\xe3es')])]

-    >>> dog_lemma = wn.lemma('dog.n.01.cão', lang='por')
+    >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
     >>> dog_lemma
-    Lemma('dog.n.01.cão')
+    Lemma('dog.n.01.c\xe3o')
     >>> dog_lemma.lang()
     'por'

     >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
......
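For reference, the escape sequences introduced in the doctest above denote ordinary Unicode characters; a quick sanity check in plain Python 3, no NLTK needed:

    # Each escape in the updated doctest maps to a readable character:
    assert b'\xe7\x8a\xac'.decode('utf-8') == '\u72ac'    # 犬 ('dog')
    assert 'dog.n.01.c\xe3o' == 'dog.n.01.cão'            # \xe3 is 'ã'
    assert '\u30b9\u30d1\u30a4' == 'スパイ'                # katakana 'supai' (spy)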