Commit e51b026b by Steven Bird

Merge pull request #733 from lade/paice-doc

Updated Paice documentation (Sphinx-style) & improved demo.
parents b3c1c74f a3edcf93
...@@ -24,7 +24,15 @@ from math import sqrt ...@@ -24,7 +24,15 @@ from math import sqrt
def get_words_from_dictionary(lemmas): def get_words_from_dictionary(lemmas):
'''Get original set of words used for analysis.''' '''
Get original set of words used for analysis.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:type lemmas: dict
:return: Set of words that exist as values in the dictionary
:rtype: set
'''
words = set() words = set()
for lemma in lemmas: for lemma in lemmas:
words.update(set(lemmas[lemma])) words.update(set(lemmas[lemma]))
...@@ -32,7 +40,16 @@ def get_words_from_dictionary(lemmas): ...@@ -32,7 +40,16 @@ def get_words_from_dictionary(lemmas):
def _truncate(words, cutlength): def _truncate(words, cutlength):
'''Group words by length truncating them at \'cutlength\'.''' '''Group words by stems defined by truncating them at given length.
:param words: Set of words used for analysis
:param cutlength: Words are stemmed by cutting at this length.
:type words: set or list
:type cutlength: int
:return: Dictionary where keys are stems and values are sets of words
corresponding to that stem.
:rtype: dict
'''
stems = {} stems = {}
for word in words: for word in words:
stem = word[:cutlength] stem = word[:cutlength]
...@@ -42,9 +59,18 @@ def _truncate(words, cutlength): ...@@ -42,9 +59,18 @@ def _truncate(words, cutlength):
stems[stem] = set([word]) stems[stem] = set([word])
return stems return stems
# Reference: http://en.wikipedia.org/wiki/Line-line_intersection # Reference: http://en.wikipedia.org/wiki/Line-line_intersection
def _count_intersection(l1, l2): def _count_intersection(l1, l2):
'''Count intersections between two line segments defined by coordinate pairs.''' '''Count intersection between two line segments defined by coordinate pairs.
:param l1: Tuple of two coordinate pairs defining the first line segment
:param l2: Tuple of two coordinate pairs defining the second line segment
:type l1: tuple
:type l2: tuple
:return: Coordinates of the intersection
:rtype: tuple
'''
x1, y1 = l1[0] x1, y1 = l1[0]
x2, y2 = l1[1] x2, y2 = l1[1]
x3, y3 = l2[0] x3, y3 = l2[0]
...@@ -57,8 +83,8 @@ def _count_intersection(l1, l2): ...@@ -57,8 +83,8 @@ def _count_intersection(l1, l2):
# When lines are parallel, they must be on the y-axis. # When lines are parallel, they must be on the y-axis.
# We can ignore x-axis because we stop counting the # We can ignore x-axis because we stop counting the
# truncation line when we get there. # truncation line when we get there.
# There are no other options as UI grows and OI diminishes # There are no other options as UI (x-axis) grows and
# when we go along the truncation line. # OI (y-axis) diminishes when we go along the truncation line.
return (0.0, y4) return (0.0, y4)
x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator
...@@ -67,8 +93,13 @@ def _count_intersection(l1, l2): ...@@ -67,8 +93,13 @@ def _count_intersection(l1, l2):
def _get_derivative(coordinates): def _get_derivative(coordinates):
'''Get derivative of the line from (0,0) to the point defined by '''Get derivative of the line from (0,0) to given coordinates.
the coordinates.'''
:param coordinates: A coordinate pair
:type coordinates: tuple
:return: Derivative; inf if x is zero
:rtype: float
'''
try: try:
return coordinates[1] / coordinates[0] return coordinates[1] / coordinates[0]
except ZeroDivisionError: except ZeroDivisionError:
...@@ -76,7 +107,17 @@ def _get_derivative(coordinates): ...@@ -76,7 +107,17 @@ def _get_derivative(coordinates):
def _calculate_cut(lemmawords, stems): def _calculate_cut(lemmawords, stems):
'''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.''' '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
:param lemmawords: Set or list of words corresponding to certain lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmawords: set or list
:type stems: dict
:return: Number of understemmed and overstemmed pairs contributed by words
existing in both lemmawords and stems.
:rtype: tuple
'''
umt, wmt = 0.0, 0.0 umt, wmt = 0.0, 0.0
for stem in stems: for stem in stems:
cut = set(lemmawords) & set(stems[stem]) cut = set(lemmawords) & set(stems[stem])
...@@ -91,7 +132,20 @@ def _calculate_cut(lemmawords, stems): ...@@ -91,7 +132,20 @@ def _calculate_cut(lemmawords, stems):
def _calculate(lemmas, stems): def _calculate(lemmas, stems):
'''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.''' '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict
:type stems: dict
:return: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:rtype: tuple
'''
n = sum(len(lemmas[word]) for word in lemmas) n = sum(len(lemmas[word]) for word in lemmas)
...@@ -119,7 +173,18 @@ def _calculate(lemmas, stems): ...@@ -119,7 +173,18 @@ def _calculate(lemmas, stems):
def _indexes(gumt, gdmt, gwmt, gdnt): def _indexes(gumt, gdmt, gwmt, gdnt):
'''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).''' '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:type gumt, gdmt, gwmt, gdnt: float
:return: Understemming Index (UI),
Overstemming Index (OI) and
Stemming Weight (SW).
:rtype: tuple
'''
# Calculate Understemming Index (UI), # Calculate Understemming Index (UI),
# Overstemming Index (OI) and Stemming Weight (SW) # Overstemming Index (OI) and Stemming Weight (SW)
try: try:
...@@ -147,6 +212,14 @@ def _indexes(gumt, gdmt, gwmt, gdnt): ...@@ -147,6 +212,14 @@ def _indexes(gumt, gdmt, gwmt, gdnt):
class Paice(object): class Paice(object):
'''Class for storing lemmas, stems and evaluation metrics.''' '''Class for storing lemmas, stems and evaluation metrics.'''
def __init__(self, lemmas, stems): def __init__(self, lemmas, stems):
'''
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict
:type stems: dict
'''
self.lemmas = lemmas self.lemmas = lemmas
self.stems = stems self.stems = stems
self.coords = [] self.coords = []
...@@ -169,7 +242,15 @@ class Paice(object): ...@@ -169,7 +242,15 @@ class Paice(object):
return ''.join(text) return ''.join(text)
def _get_truncation_indexes(self, words, cutlength): def _get_truncation_indexes(self, words, cutlength):
'''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.''' '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
:param words: Words used for the analysis
:param cutlength: Words are stemmed by cutting them at this length
:type words: set or list
:type cutlength: int
:return: Understemming and overstemming indexes
:rtype: tuple
'''
truncated = _truncate(words, cutlength) truncated = _truncate(words, cutlength)
gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated) gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
...@@ -177,7 +258,16 @@ class Paice(object): ...@@ -177,7 +258,16 @@ class Paice(object):
return (ui, oi) return (ui, oi)
def _get_truncation_coordinates(self, cutlength=0): def _get_truncation_coordinates(self, cutlength=0):
'''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.''' '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
:param cutlength: Optional parameter to start counting from (ui, oi)
coordinates obtained by stemming at this length. Useful for speeding up
the calculations when you know the approximate location of the
intersection.
:type cutlength: int
:return: List of coordinate pairs that define the truncation line
:rtype: list
'''
words = get_words_from_dictionary(self.lemmas) words = get_words_from_dictionary(self.lemmas)
maxlength = max(len(word) for word in words) maxlength = max(len(word) for word in words)
...@@ -207,7 +297,13 @@ class Paice(object): ...@@ -207,7 +297,13 @@ class Paice(object):
return coords return coords
def _errt(self): def _errt(self):
'''Count Error-Rate Relative to Truncation (ERRT).''' '''Count Error-Rate Relative to Truncation (ERRT).
:return: ERRT, length of the line from the origin to (UI, OI) divided by
the length of the line from the origin to the point defined by the same
line when extended until the truncation line.
:rtype: float
'''
# Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
self.coords = self._get_truncation_coordinates() self.coords = self._get_truncation_coordinates()
if (0.0, 0.0) in self.coords: if (0.0, 0.0) in self.coords:
...@@ -242,14 +338,16 @@ class Paice(object): ...@@ -242,14 +338,16 @@ class Paice(object):
def demo(): def demo():
'''Demonstration of the module.''' '''Demonstration of the module.'''
# Some words with their real lemmas # Some words with their real lemmas
lemmas = {'consol': ['consol', 'consols'], lemmas = {'kneel': ['kneel', 'knelt'],
'console': ['consoled', 'consoles', 'consoling'], 'range': ['range', 'ranged'],
'kneel': ['kneel', 'knelt'] 'ring': ['ring', 'rang', 'rung']
} }
# Same words with stems from a stemming algorithm # Same words with stems from a stemming algorithm
stems = {'consol': ['consol', 'consoled', 'consoles', 'consoling', 'consols'], stems = {'kneel': ['kneel'],
'kneel': ['kneel'], 'knelt': ['knelt'],
'knelt': ['knelt'] 'rang': ['rang', 'range', 'ranged'],
'ring': ['ring'],
'rung': ['rung']
} }
print('Words grouped by their lemmas:') print('Words grouped by their lemmas:')
for lemma in sorted(lemmas): for lemma in sorted(lemmas):
...@@ -263,8 +361,12 @@ def demo(): ...@@ -263,8 +361,12 @@ def demo():
print(p) print(p)
print() print()
# Let's "change" results from a stemming algorithm # Let's "change" results from a stemming algorithm
stems = {'consol': ['consol', 'consoled', 'consoles', 'consoling', 'consols'], stems = {'kneel': ['kneel'],
'kne': ['kneel', 'knelt'], 'knelt': ['knelt'],
'rang': ['rang'],
'range': ['range', 'ranged'],
'ring': ['ring'],
'rung': ['rung']
} }
print('Counting stats after changing stemming results:') print('Counting stats after changing stemming results:')
for stem in sorted(stems): for stem in sorted(stems):
......
...@@ -13,21 +13,24 @@ counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) a ...@@ -13,21 +13,24 @@ counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) a
Understemming and Overstemming values Understemming and Overstemming values
------------------------------------- -------------------------------------
>>> lemmas = {'consol': ['consol', 'consols'], >>> lemmas = {'kneel': ['kneel', 'knelt'],
... 'console': ['consoled', 'consoles', 'consoling'], ... 'range': ['range', 'ranged'],
... 'kneel': ['kneel', 'knelt']} ... 'ring': ['ring', 'rang', 'rung']}
>>> stems = {'consol': ['consol', 'consoled', 'consoles', 'consoling', 'consols'], >>> stems = {'kneel': ['kneel'],
... 'kneel': ['kneel'], ... 'knelt': ['knelt'],
... 'knelt': ['knelt']} ... 'rang': ['rang', 'range', 'ranged'],
... 'ring': ['ring'],
... 'rung': ['rung']}
>>> p = Paice(lemmas, stems) >>> p = Paice(lemmas, stems)
>>> p.gumt, p.gdmt, p.gwmt, p.gdnt >>> p.gumt, p.gdmt, p.gwmt, p.gdnt
(1.0, 5.0, 6.0, 16.0) (4.0, 5.0, 2.0, 16.0)
>>> p.ui, p.oi, p.sw >>> p.ui, p.oi, p.sw
(0.2, 0.375, 1.875) (0.8, 0.125, 0.15625)
>>> p.errt >>> p.errt
1.0 1.0
>>> p.coords >>> p.coords
[(0.0, 1.0), (0.0, 0.375), (0.2, 0.375)] [(0.0, 1.0), (0.0, 0.375), (0.6, 0.125), (0.8, 0.125)]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment