Commit e2090e42 by Steven Bird

Merge pull request #669 from jskda/pull_request.505

Fixes to omw wordnet
parents 24e257a6 916b0be3
@@ -206,8 +206,7 @@ webtext = LazyCorpusLoader(
     'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2')
 wordnet = LazyCorpusLoader(
     'wordnet', WordNetCorpusReader,
-    LazyCorpusLoader('omw', CorpusReader, r'(?!\.).*\.tab')
-)
+    LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'))
 wordnet_ic = LazyCorpusLoader(
     'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
......
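Note on the loader change above: the OMW fileid pattern now matches the per-language data files the corpus actually ships (the regex r'.*/wn-data-.*\.tab' implies paths such as por/wn-data-por.tab, an inference from the pattern itself), and reading them as UTF-8 avoids the mojibake visible in the old doctests further down. A minimal usage sketch, assuming the wordnet and omw corpora have been downloaded:

    from nltk.corpus import wordnet as wn

    # The nested LazyCorpusLoader hands the OMW .tab files to the
    # WordNet reader, enabling language-tagged lookups:
    print(wn.synset('dog.n.01').lemma_names('ita'))  # ['cane', 'Canis_familiaris']
    print(wn.lemmas('cane', lang='ita')[0])          # Lemma('dog.n.01.cane')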
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: WordNet
 #
 # Copyright (C) 2001-2014 NLTK Project
@@ -1815,65 +1816,6 @@ def lin_similarity(synset1, synset2, ic, verbose=False):
 lin_similarity.__doc__ = Synset.lin_similarity.__doc__
-
-
-def _lcs_by_depth(synset1, synset2, verbose=False):
-    """
-    Finds the least common subsumer of two synsets in a WordNet taxonomy,
-    where the least common subsumer is defined as the ancestor node common
-    to both input synsets whose shortest path to the root node is the longest.
-
-    :type synset1: Synset
-    :param synset1: First input synset.
-    :type synset2: Synset
-    :param synset2: Second input synset.
-    :return: The ancestor synset common to both input synsets which is also the
-        LCS.
-    """
-    subsumer = None
-    max_min_path_length = -1
-
-    subsumers = synset1.common_hypernyms(synset2)
-
-    if verbose:
-        print("> Subsumers1:", subsumers)
-
-    # Eliminate those synsets which are ancestors of other synsets in the
-    # set of subsumers.
-    eliminated = set()
-    hypernym_relation = lambda s: s.hypernyms() + s.instance_hypernyms()
-    for s1 in subsumers:
-        for s2 in subsumers:
-            if s2 in s1.closure(hypernym_relation):
-                eliminated.add(s2)
-    if verbose:
-        print("> Eliminated:", eliminated)
-
-    subsumers = [s for s in subsumers if s not in eliminated]
-
-    if verbose:
-        print("> Subsumers2:", subsumers)
-
-    # Calculate the length of the shortest path to the root for each
-    # subsumer. Select the subsumer with the longest of these.
-    for candidate in subsumers:
-        paths_to_root = candidate.hypernym_paths()
-        min_path_length = -1
-
-        for path in paths_to_root:
-            if min_path_length < 0 or len(path) < min_path_length:
-                min_path_length = len(path)
-
-        if min_path_length > max_min_path_length:
-            max_min_path_length = min_path_length
-            subsumer = candidate
-
-    if verbose:
-        print("> LCS Subsumer by depth:", subsumer)
-    return subsumer
 def _lcs_ic(synset1, synset2, ic, verbose=False):
     """
     Get the information content of the least common subsumer that has
......
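The removed _lcs_by_depth helper computed the least common subsumer by pruning common hypernyms that were ancestors of other common hypernyms and keeping the deepest survivor. A minimal sketch of getting the same information through the public Synset API (assuming Synset.lowest_common_hypernyms is the surviving entry point; the expected output is an assumption based on the standard WordNet taxonomy):

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')
    cat = wn.synset('cat.n.01')
    # Deepest ancestor(s) shared by both synsets in the hypernym taxonomy:
    print(dog.lowest_common_hypernyms(cat))  # [Synset('carnivore.n.01')]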
@@ -47,29 +47,30 @@ A synset is identified with a 3-part name of the form: word.pos.nn:
 The WordNet corpus reader gives access to the Open Multilingual
 WordNet, using ISO-639 language codes.

-    >>> wn.langs()
-    [u'als', u'arb', u'cat', u'cmn', u'dan', u'eng', u'eus', u'fas',
-    u'fin', u'fre', u'glg', u'heb', u'ind', u'ita', u'jpn', u'nno',
-    u'nob', u'pol', u'por', u'spa', u'tha', u'zsm']
+    >>> sorted(wn.langs())
+    ['als', 'arb', 'cat', 'cmn', 'dan', 'eng', 'eus', 'fas',
+    'fin', 'fra', 'fre', 'glg', 'heb', 'ind', 'ita', 'jpn', 'nno',
+    'nob', 'pol', 'por', 'spa', 'tha', 'zsm']

-    >>> wn.synsets('犬', lang='jpn')
+    >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
     [Synset('dog.n.01'), Synset('spy.n.01')]

     >>> wn.synset('spy.n.01').lemma_names('jpn')
-    ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', '工作員', '廻し者', '廻者', '探', '探り',
-    '犬', '秘密捜査員', '諜報員', '諜者', '間者', '間諜', '隠密']
+    ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
+    '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
+    '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
+    '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']

     >>> wn.synset('dog.n.01').lemma_names('ita')
     ['cane', 'Canis_familiaris']
     >>> wn.lemmas('cane', lang='ita')
     [Lemma('dog.n.01.cane'), Lemma('hammer.n.01.cane'), Lemma('cramp.n.02.cane'),
     Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')]

-    >>> wn.synset('dog.n.01').lemmas(['dan', 'por'])
-    defaultdict(<class 'list'>, {'dan': [Lemma('dog.n.01.hund'),
-    Lemma('dog.n.01.køter'), Lemma('dog.n.01.vovhund'),
-    Lemma('dog.n.01.vovse')], 'por': [Lemma('dog.n.01.cachorro'),
-    Lemma('dog.n.01.cão'), Lemma('dog.n.01.cão'),
-    Lemma('dog.n.01.cães')]}
+    >>> sorted(wn.synset('dog.n.01').lemmas(['dan', 'por']).items())
+    [('dan', [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
+    Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]),
+    ('por', [Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.c\xe3o'),
+    Lemma('dog.n.01.c\xe3o'), Lemma('dog.n.01.c\xe3es')])]

-    >>> dog_lemma = wn.lemma('dog.n.01.cão', lang='por')
+    >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
     >>> dog_lemma
-    Lemma('dog.n.01.cão')
+    Lemma('dog.n.01.c\xe3o')
     >>> dog_lemma.lang()
     'por'

     >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
......
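For reference, the escape sequences introduced in the doctest above denote ordinary Unicode characters; a quick sanity check in plain Python 3, no NLTK needed:

    # Each escape in the updated doctest maps to a readable character:
    assert b'\xe7\x8a\xac'.decode('utf-8') == '\u72ac'    # 犬 ('dog')
    assert 'dog.n.01.c\xe3o' == 'dog.n.01.cão'            # \xe3 is 'ã'
    assert '\u30b9\u30d1\u30a4' == 'スパイ'                # katakana 'supai' (spy)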