edx / nltk · Commits

Commit e2090e42
Authored May 17, 2014 by Steven Bird
Merge pull request #669 from jskda/pull_request.505
Fixes to omw wordnet
Parents: 24e257a6, 916b0be3
Showing 3 changed files with 18 additions and 76 deletions:

    nltk/corpus/__init__.py        +1   -2
    nltk/corpus/reader/wordnet.py  +1   -59
    nltk/test/wordnet.doctest      +16  -15
nltk/corpus/__init__.py

@@ -206,8 +206,7 @@ webtext = LazyCorpusLoader(
     'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2')
 wordnet = LazyCorpusLoader(
     'wordnet', WordNetCorpusReader,
-    LazyCorpusLoader('omw', CorpusReader, r'(?!\.).*\.tab')
-    )
+    LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'))
 wordnet_ic = LazyCorpusLoader(
     'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
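The loader change above does two things: it narrows the Open Multilingual Wordnet fileids from any .tab file to the per-language data files (wn-data-*.tab), and it reads them as UTF-8 rather than the platform default encoding. A minimal sketch of what the patched loader enables, assuming a standard nltk_data installation with the wordnet and omw packages downloaded (the expected outputs are taken from the doctest changes below):

    from nltk.corpus import wordnet as wn

    # Multilingual lookups go through the nested 'omw' loader; with
    # encoding='utf8' the non-ASCII lemmas decode correctly.
    print(wn.synset('dog.n.01').lemma_names('ita'))  # ['cane', 'Canis_familiaris']
    print(wn.lemmas('cane', lang='ita'))             # [Lemma('dog.n.01.cane'), ...]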
nltk/corpus/reader/wordnet.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: WordNet
 #
 # Copyright (C) 2001-2014 NLTK Project
...
@@ -1815,65 +1816,6 @@ def lin_similarity(synset1, synset2, ic, verbose=False):
 lin_similarity.__doc__ = Synset.lin_similarity.__doc__
 
-
-def _lcs_by_depth(synset1, synset2, verbose=False):
-    """
-    Finds the least common subsumer of two synsets in a WordNet taxonomy,
-    where the least common subsumer is defined as the ancestor node common
-    to both input synsets whose shortest path to the root node is the longest.
-
-    :type synset1: Synset
-    :param synset1: First input synset.
-    :type synset2: Synset
-    :param synset2: Second input synset.
-    :return: The ancestor synset common to both input synsets which is also the
-        LCS.
-    """
-    subsumer = None
-    max_min_path_length = -1
-
-    subsumers = synset1.common_hypernyms(synset2)
-
-    if verbose:
-        print("> Subsumers1:", subsumers)
-
-    # Eliminate those synsets which are ancestors of other synsets in the
-    # set of subsumers.
-    eliminated = set()
-    hypernym_relation = lambda s: s.hypernyms() + s.instance_hypernyms()
-    for s1 in subsumers:
-        for s2 in subsumers:
-            if s2 in s1.closure(hypernym_relation):
-                eliminated.add(s2)
-    if verbose:
-        print("> Eliminated:", eliminated)
-
-    subsumers = [s for s in subsumers if s not in eliminated]
-
-    if verbose:
-        print("> Subsumers2:", subsumers)
-
-    # Calculate the length of the shortest path to the root for each
-    # subsumer. Select the subsumer with the longest of these.
-    for candidate in subsumers:
-        paths_to_root = candidate.hypernym_paths()
-        min_path_length = -1
-
-        for path in paths_to_root:
-            if min_path_length < 0 or len(path) < min_path_length:
-                min_path_length = len(path)
-
-        if min_path_length > max_min_path_length:
-            max_min_path_length = min_path_length
-            subsumer = candidate
-
-    if verbose:
-        print("> LCS Subsumer by depth:", subsumer)
-
-    return subsumer
-
-
 def _lcs_ic(synset1, synset2, ic, verbose=False):
     """
     Get the information content of the least common subsumer that has
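The deleted _lcs_by_depth helper computed the least common subsumer by depth: among all common hypernyms of the two synsets, pick the one whose shortest hypernym path from the root is longest. The same idea can be expressed through the public Synset API; the sketch below is a condensed version, not a drop-in replacement, since it skips the helper's pruning of subsumers that are ancestors of other subsumers (which can matter in multiple-inheritance corners of the taxonomy):

    from nltk.corpus import wordnet as wn

    def lcs_by_depth(synset1, synset2):
        # All shared ancestors of the two synsets.
        common = synset1.common_hypernyms(synset2)
        if not common:
            return None
        # Depth of a synset = length of its shortest hypernym path from
        # the root; the LCS by depth is the deepest shared ancestor.
        return max(common, key=lambda s: min(len(p) for p in s.hypernym_paths()))

    print(lcs_by_depth(wn.synset('dog.n.01'), wn.synset('cat.n.01')))
    # e.g. Synset('carnivore.n.01')

NLTK also exposes Synset.lowest_common_hypernyms() for the related public-facing computation, which is presumably why this private helper could be dropped.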
nltk/test/wordnet.doctest

@@ -47,29 +47,30 @@ A synset is identified with a 3-part name of the form: word.pos.nn:
 The WordNet corpus reader gives access to the Open Multilingual
 WordNet, using ISO-639 language codes.
 
-    >>> wn.langs()
-    [u'als', u'arb', u'cat', u'cmn', u'dan', u'eng', u'eus', u'fas',
-    u'fin', u'fre', u'glg', u'heb', u'ind', u'ita', u'jpn', u'nno',
-    u'nob', u'pol', u'por', u'spa', u'tha', u'zsm']
+    >>> sorted(wn.langs())
+    ['als', 'arb', 'cat', 'cmn', 'dan', 'eng', 'eus', 'fas',
+    'fin', 'fra', 'fre', 'glg', 'heb', 'ind', 'ita', 'jpn', 'nno',
+    'nob', 'pol', 'por', 'spa', 'tha', 'zsm']
 
-    >>> wn.synsets('犬', lang='jpn')
+    >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
     [Synset('dog.n.01'), Synset('spy.n.01')]
 
     >>> wn.synset('spy.n.01').lemma_names('jpn')
-    ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', '工作員', '廻し者', '廻者', '探', '探り',
-    '犬', '秘密捜査員', '諜報員', '諜者', '間者', '間諜', '隠密']
+    ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
+    '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
+    '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
+    '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']
 
     >>> wn.synset('dog.n.01').lemma_names('ita')
     ['cane', 'Canis_familiaris']
 
     >>> wn.lemmas('cane', lang='ita')
     [Lemma('dog.n.01.cane'), Lemma('hammer.n.01.cane'), Lemma('cramp.n.02.cane'),
     Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')]
 
-    >>> wn.synset('dog.n.01').lemmas(['dan', 'por'])
-    defaultdict(<class 'list'>, {'dan': [Lemma('dog.n.01.hund'),
-    Lemma('dog.n.01.køter'), Lemma('dog.n.01.vovhund'),
-    Lemma('dog.n.01.vovse')], 'por': [Lemma('dog.n.01.cachorro'),
-    Lemma('dog.n.01.cão'), Lemma('dog.n.01.cão'),
-    Lemma('dog.n.01.cães')]}
+    >>> sorted(wn.synset('dog.n.01').lemmas(['dan', 'por']).items())
+    [('dan', [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
+    Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]),
+    ('por', [Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.c\xe3o'),
+    Lemma('dog.n.01.c\xe3o'), Lemma('dog.n.01.c\xe3es')])]
 
-    >>> dog_lemma = wn.lemma('dog.n.01.cão', lang='por')
+    >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
     >>> dog_lemma
-    Lemma('dog.n.01.cão')
+    Lemma('dog.n.01.c\xe3o')
     >>> dog_lemma.lang()
     'por'
 
     >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
...
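The doctest edits follow one pattern: literal non-ASCII text is replaced with ASCII-only escapes (byte strings decoded as UTF-8 for input, \uXXXX/\xXX escapes for expected output), and unordered results are wrapped in sorted(), so the expected output is deterministic and renders identically across Python versions. A small illustration of the input idiom, reusing the Portuguese lemma from the doctest:

    # -*- coding: utf-8 -*-
    from nltk.corpus import wordnet as wn

    # b'...\xc3\xa3...' is the UTF-8 encoding of 'ã'; decoding it yields
    # the same text object on Python 2 (unicode) and Python 3 (str),
    # while the test file itself stays pure ASCII.
    dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
    assert dog_lemma.name() == u'c\xe3o'   # i.e. 'cão'
    assert dog_lemma.lang() == 'por'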