Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nltk
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
nltk
Commits
e51b026b
Commit
e51b026b
authored
Aug 16, 2014
by
Steven Bird
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #733 from lade/paice-doc
Updated Paice documentation (Sphinx-style) & improved demo.
parents
b3c1c74f
a3edcf93
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
136 additions
and
30 deletions
+136
-30
nltk/metrics/paice.py
+123
-21
nltk/test/paice.doctest
+13
-9
No files found.
nltk/metrics/paice.py
View file @
e51b026b
...
@@ -24,7 +24,15 @@ from math import sqrt
...
@@ -24,7 +24,15 @@ from math import sqrt
def
get_words_from_dictionary
(
lemmas
):
def
get_words_from_dictionary
(
lemmas
):
'''Get original set of words used for analysis.'''
'''
Get original set of words used for analysis.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:type lemmas: dict
:return: Set of words that exist as values in the dictionary
:rtype: set
'''
words
=
set
()
words
=
set
()
for
lemma
in
lemmas
:
for
lemma
in
lemmas
:
words
.
update
(
set
(
lemmas
[
lemma
]))
words
.
update
(
set
(
lemmas
[
lemma
]))
...
@@ -32,7 +40,16 @@ def get_words_from_dictionary(lemmas):
...
@@ -32,7 +40,16 @@ def get_words_from_dictionary(lemmas):
def
_truncate
(
words
,
cutlength
):
def
_truncate
(
words
,
cutlength
):
'''Group words by length truncating them at
\'
cutlength
\'
.'''
'''Group words by stems defined by truncating them at given length.
:param words: Set of words used for analysis
:param cutlength: Words are stemmed by cutting at this length.
:type words: set or list
:type cutlength: int
:return: Dictionary where keys are stems and values are sets of words
corresponding to that stem.
:rtype: dict
'''
stems
=
{}
stems
=
{}
for
word
in
words
:
for
word
in
words
:
stem
=
word
[:
cutlength
]
stem
=
word
[:
cutlength
]
...
@@ -42,9 +59,18 @@ def _truncate(words, cutlength):
...
@@ -42,9 +59,18 @@ def _truncate(words, cutlength):
stems
[
stem
]
=
set
([
word
])
stems
[
stem
]
=
set
([
word
])
return
stems
return
stems
# Reference: http://en.wikipedia.org/wiki/Line-line_intersection
# Reference: http://en.wikipedia.org/wiki/Line-line_intersection
def
_count_intersection
(
l1
,
l2
):
def
_count_intersection
(
l1
,
l2
):
'''Count intersections between two line segments defined by coordinate pairs.'''
'''Compute the intersection point of two line segments defined by coordinate pairs.
:param l1: Tuple of two coordinate pairs defining the first line segment
:param l2: Tuple of two coordinate pairs defining the second line segment
:type l1: tuple
:type l2: tuple
:return: Coordinates of the intersection
:rtype: tuple
'''
x1
,
y1
=
l1
[
0
]
x1
,
y1
=
l1
[
0
]
x2
,
y2
=
l1
[
1
]
x2
,
y2
=
l1
[
1
]
x3
,
y3
=
l2
[
0
]
x3
,
y3
=
l2
[
0
]
...
@@ -57,8 +83,8 @@ def _count_intersection(l1, l2):
...
@@ -57,8 +83,8 @@ def _count_intersection(l1, l2):
# When lines are parallel, they must be on the y-axis.
# When lines are parallel, they must be on the y-axis.
# We can ignore x-axis because we stop counting the
# We can ignore x-axis because we stop counting the
# truncation line when we get there.
# truncation line when we get there.
# There are no other options as UI
grows and OI diminishes
# There are no other options as UI
(x-axis) grows and
# when we go along the truncation line.
#
OI (y-axis) diminishes
when we go along the truncation line.
return
(
0.0
,
y4
)
return
(
0.0
,
y4
)
x
=
((
x1
*
y2
-
y1
*
x2
)
*
(
x3
-
x4
)
-
(
x1
-
x2
)
*
(
x3
*
y4
-
y3
*
x4
))
/
denominator
x
=
((
x1
*
y2
-
y1
*
x2
)
*
(
x3
-
x4
)
-
(
x1
-
x2
)
*
(
x3
*
y4
-
y3
*
x4
))
/
denominator
...
@@ -67,8 +93,13 @@ def _count_intersection(l1, l2):
...
@@ -67,8 +93,13 @@ def _count_intersection(l1, l2):
def
_get_derivative
(
coordinates
):
def
_get_derivative
(
coordinates
):
'''Get derivative of the line from (0,0) to the point defined by
'''Get the derivative (slope) of the line from (0,0) to the given coordinates.
the coordinates.'''
:param coordinates: A coordinate pair
:type coordinates: tuple
:return: Derivative; inf if x is zero
:rtype: float
'''
try
:
try
:
return
coordinates
[
1
]
/
coordinates
[
0
]
return
coordinates
[
1
]
/
coordinates
[
0
]
except
ZeroDivisionError
:
except
ZeroDivisionError
:
...
@@ -76,7 +107,17 @@ def _get_derivative(coordinates):
...
@@ -76,7 +107,17 @@ def _get_derivative(coordinates):
def
_calculate_cut
(
lemmawords
,
stems
):
def
_calculate_cut
(
lemmawords
,
stems
):
'''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.'''
'''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
:param lemmawords: Set or list of words corresponding to certain lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmawords: set or list
:type stems: dict
:return: Number of understemmed and overstemmed pairs contributed by words
that occur in both lemmawords and stems.
:rtype: tuple
'''
umt
,
wmt
=
0.0
,
0.0
umt
,
wmt
=
0.0
,
0.0
for
stem
in
stems
:
for
stem
in
stems
:
cut
=
set
(
lemmawords
)
&
set
(
stems
[
stem
])
cut
=
set
(
lemmawords
)
&
set
(
stems
[
stem
])
...
@@ -91,7 +132,20 @@ def _calculate_cut(lemmawords, stems):
...
@@ -91,7 +132,20 @@ def _calculate_cut(lemmawords, stems):
def
_calculate
(
lemmas
,
stems
):
def
_calculate
(
lemmas
,
stems
):
'''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.'''
'''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict
:type stems: dict
:return: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:rtype: tuple
'''
n
=
sum
(
len
(
lemmas
[
word
])
for
word
in
lemmas
)
n
=
sum
(
len
(
lemmas
[
word
])
for
word
in
lemmas
)
...
@@ -119,7 +173,18 @@ def _calculate(lemmas, stems):
...
@@ -119,7 +173,18 @@ def _calculate(lemmas, stems):
def
_indexes
(
gumt
,
gdmt
,
gwmt
,
gdnt
):
def
_indexes
(
gumt
,
gdmt
,
gwmt
,
gdnt
):
'''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).'''
'''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:type gumt, gdmt, gwmt, gdnt: float
:return: Understemming Index (UI),
Overstemming Index (OI) and
Stemming Weight (SW).
:rtype: tuple
'''
# Calculate Understemming Index (UI),
# Calculate Understemming Index (UI),
# Overstemming Index (OI) and Stemming Weight (SW)
# Overstemming Index (OI) and Stemming Weight (SW)
try
:
try
:
...
@@ -147,6 +212,14 @@ def _indexes(gumt, gdmt, gwmt, gdnt):
...
@@ -147,6 +212,14 @@ def _indexes(gumt, gdmt, gwmt, gdnt):
class
Paice
(
object
):
class
Paice
(
object
):
'''Class for storing lemmas, stems and evaluation metrics.'''
'''Class for storing lemmas, stems and evaluation metrics.'''
def
__init__
(
self
,
lemmas
,
stems
):
def
__init__
(
self
,
lemmas
,
stems
):
'''
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict
:type stems: dict
'''
self
.
lemmas
=
lemmas
self
.
lemmas
=
lemmas
self
.
stems
=
stems
self
.
stems
=
stems
self
.
coords
=
[]
self
.
coords
=
[]
...
@@ -169,7 +242,15 @@ class Paice(object):
...
@@ -169,7 +242,15 @@ class Paice(object):
return
''
.
join
(
text
)
return
''
.
join
(
text
)
def
_get_truncation_indexes
(
self
,
words
,
cutlength
):
def
_get_truncation_indexes
(
self
,
words
,
cutlength
):
'''Count (UI, OI) when stemming is done by truncating words at
\'
cutlength
\'
.'''
'''Count (UI, OI) when stemming is done by truncating words at
\'
cutlength
\'
.
:param words: Words used for the analysis
:param cutlength: Words are stemmed by cutting them at this length
:type words: set or list
:type cutlength: int
:return: Understemming and overstemming indexes
:rtype: tuple
'''
truncated
=
_truncate
(
words
,
cutlength
)
truncated
=
_truncate
(
words
,
cutlength
)
gumt
,
gdmt
,
gwmt
,
gdnt
=
_calculate
(
self
.
lemmas
,
truncated
)
gumt
,
gdmt
,
gwmt
,
gdnt
=
_calculate
(
self
.
lemmas
,
truncated
)
...
@@ -177,7 +258,16 @@ class Paice(object):
...
@@ -177,7 +258,16 @@ class Paice(object):
return
(
ui
,
oi
)
return
(
ui
,
oi
)
def
_get_truncation_coordinates
(
self
,
cutlength
=
0
):
def
_get_truncation_coordinates
(
self
,
cutlength
=
0
):
'''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.'''
'''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
:param cutlength: Optional parameter to start counting from the (ui, oi)
coordinates obtained by stemming at this length. Useful for speeding up
the calculations when you know the approximate location of the
intersection.
:type cutlength: int
:return: List of coordinate pairs that define the truncation line
:rtype: list
'''
words
=
get_words_from_dictionary
(
self
.
lemmas
)
words
=
get_words_from_dictionary
(
self
.
lemmas
)
maxlength
=
max
(
len
(
word
)
for
word
in
words
)
maxlength
=
max
(
len
(
word
)
for
word
in
words
)
...
@@ -207,7 +297,13 @@ class Paice(object):
...
@@ -207,7 +297,13 @@ class Paice(object):
return
coords
return
coords
def
_errt
(
self
):
def
_errt
(
self
):
'''Count Error-Rate Relative to Truncation (ERRT).'''
'''Count Error-Rate Relative to Truncation (ERRT).
:return: ERRT, length of the line from the origin to (UI, OI) divided by
the length of the line from the origin to the point defined by the same
line when extended until the truncation line.
:rtype: float
'''
# Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
# Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
self
.
coords
=
self
.
_get_truncation_coordinates
()
self
.
coords
=
self
.
_get_truncation_coordinates
()
if
(
0.0
,
0.0
)
in
self
.
coords
:
if
(
0.0
,
0.0
)
in
self
.
coords
:
...
@@ -242,14 +338,16 @@ class Paice(object):
...
@@ -242,14 +338,16 @@ class Paice(object):
def
demo
():
def
demo
():
'''Demonstration of the module.'''
'''Demonstration of the module.'''
# Some words with their real lemmas
# Some words with their real lemmas
lemmas
=
{
'
consol'
:
[
'consol'
,
'consols
'
],
lemmas
=
{
'
kneel'
:
[
'kneel'
,
'knelt
'
],
'
console'
:
[
'consoled'
,
'consoles'
,
'consoling
'
],
'
range'
:
[
'range'
,
'ranged
'
],
'
kneel'
:
[
'kneel'
,
'knelt
'
]
'
ring'
:
[
'ring'
,
'rang'
,
'rung
'
]
}
}
# Same words with stems from a stemming algorithm
# Same words with stems from a stemming algorithm
stems
=
{
'consol'
:
[
'consol'
,
'consoled'
,
'consoles'
,
'consoling'
,
'consols'
],
stems
=
{
'kneel'
:
[
'kneel'
],
'kneel'
:
[
'kneel'
],
'knelt'
:
[
'knelt'
],
'knelt'
:
[
'knelt'
]
'rang'
:
[
'rang'
,
'range'
,
'ranged'
],
'ring'
:
[
'ring'
],
'rung'
:
[
'rung'
]
}
}
print
(
'Words grouped by their lemmas:'
)
print
(
'Words grouped by their lemmas:'
)
for
lemma
in
sorted
(
lemmas
):
for
lemma
in
sorted
(
lemmas
):
...
@@ -263,8 +361,12 @@ def demo():
...
@@ -263,8 +361,12 @@ def demo():
print
(
p
)
print
(
p
)
print
()
print
()
# Let's "change" results from a stemming algorithm
# Let's "change" results from a stemming algorithm
stems
=
{
'consol'
:
[
'consol'
,
'consoled'
,
'consoles'
,
'consoling'
,
'consols'
],
stems
=
{
'kneel'
:
[
'kneel'
],
'kne'
:
[
'kneel'
,
'knelt'
],
'knelt'
:
[
'knelt'
],
'rang'
:
[
'rang'
],
'range'
:
[
'range'
,
'ranged'
],
'ring'
:
[
'ring'
],
'rung'
:
[
'rung'
]
}
}
print
(
'Counting stats after changing stemming results:'
)
print
(
'Counting stats after changing stemming results:'
)
for
stem
in
sorted
(
stems
):
for
stem
in
sorted
(
stems
):
...
...
nltk/test/paice.doctest
View file @
e51b026b
...
@@ -13,21 +13,24 @@ counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) a
...
@@ -13,21 +13,24 @@ counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) a
Understemming and Overstemming values
Understemming and Overstemming values
-------------------------------------
-------------------------------------
>>> lemmas = {'consol': ['consol', 'consols'],
>>> lemmas = {'kneel': ['kneel', 'knelt'],
... 'console': ['consoled', 'consoles', 'consoling'],
... 'range': ['range', 'ranged'],
... 'kneel': ['kneel', 'knelt']}
... 'ring': ['ring', 'rang', 'rung']}
>>> stems = {'consol': ['consol', 'consoled', 'consoles', 'consoling', 'consols'],
>>> stems = {'kneel': ['kneel'],
... 'kneel': ['kneel'],
... 'knelt': ['knelt'],
... 'knelt': ['knelt']}
... 'rang': ['rang', 'range', 'ranged'],
... 'ring': ['ring'],
... 'rung': ['rung']}
>>> p = Paice(lemmas, stems)
>>> p = Paice(lemmas, stems)
>>> p.gumt, p.gdmt, p.gwmt, p.gdnt
>>> p.gumt, p.gdmt, p.gwmt, p.gdnt
(
1.0, 5.0, 6
.0, 16.0)
(
4.0, 5.0, 2
.0, 16.0)
>>> p.ui, p.oi, p.sw
>>> p.ui, p.oi, p.sw
(0.
2, 0.375, 1.87
5)
(0.
8, 0.125, 0.1562
5)
>>> p.errt
>>> p.errt
1.0
1.0
>>> p.coords
>>> p.coords
[(0.0, 1.0), (0.0, 0.375), (0.2, 0.375)]
[(0.0, 1.0), (0.0, 0.375), (0.6, 0.125), (0.8, 0.125)]
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment