Distinction between weighted and unweighted WindowDiff

Correction of bugs in the proposed version

Distinction between weighted and unweighted WindowDiff
Correction of bugs in the proposed version
92200f1d · David Doukhan · c3e40de3 · 92200f1d
Commit 92200f1d authored Jan 17, 2013 by David Doukhan
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 14 deletions

nltk/metrics/segmentation.py
+12 -14

No files found.
--- a/nltk/metrics/segmentation.py
+++ b/nltk/metrics/segmentation.py
@@ -48,23 +48,13 @@ except ImportError:

 from nltk.compat import xrange

-def windowdiff(seg1, seg2, k, boundary="1"):
+def windowdiff(seg1, seg2, k, boundary="1", weighted=True):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

-    >>> s1 = "00000010000000001000000"
-    >>> s2 = "00000001000000010000000"
-    >>> s3 = "00010000000000000001000"
-    >>> windowdiff(s1, s1, 3)
-    0
-    >>> windowdiff(s1, s2, 3)
-    4
-    >>> windowdiff(s2, s3, 3)
-    16
-
    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
@@ -74,14 +64,22 @@ def windowdiff(seg1, seg2, k, boundary="1"):
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: int
+    :param weighted: use the weighted variant of windowdiff
+    :type weighted: bool
    """

    if len(seg1) != len(seg2):
        raise ValueError("Segmentations have unequal length")
+    if k > len(seg1):
+        raise ValueError("Window width k should be smaller or equal than segmentation lengths")
    wd = 0
-    for i in range(len(seg1) - k):
-        wd += abs(seg1[i:i+k+1].count(boundary) - seg2[i:i+k+1].count(boundary))
-    return wd
+    for i in range(len(seg1) - k + 1):
+        ndiff = abs(seg1[i:i+k].count(boundary) - seg2[i:i+k].count(boundary))
+        if weighted:
+            wd += ndiff
+        else:
+            wd += min(1, ndiff)
+    return wd / (len(seg1) - k + 1.)