Commit 37aced7e by Steven Bird

reworked tokenizer howto, as docstrings in tokenizer package

parent 637d1901
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 167, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * unigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 85.4%
Got:
Accuracy: 85.8%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 178, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * unigram_tagger_2.evaluate(brown_test))
Expected:
Accuracy: 88.0%
Got:
Accuracy: 88.4%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 205, in tag.doctest
Failed example:
print bigram_tagger.size()
Expected:
3394
Got:
3386
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 207, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * bigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 89.4%
Got:
Accuracy: 89.6%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 222, in tag.doctest
Failed example:
print trigram_tagger.size()
Expected:
1493
Got:
1502
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 224, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * trigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 88.8%
Got:
Accuracy: 89.0%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 251, in tag.doctest
Failed example:
brill_tagger = trainer.train(brown_train, max_rules=10) # doctest: +NORMALIZE_WHITESPACE
Expected:
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75359 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
c i o t | R Fixed = num tags changed incorrect -> correct
o x k h | u Broken = num tags changed correct -> incorrect
r e e e | l Other = num tags changed incorrect -> incorrect
e d n r | e
------------------+-------------------------------------------------------
354 354 0 3 | TO -> IN if the tag of the following word is 'AT'
111 173 62 3 | NN -> VB if the tag of the preceding word is 'TO'
110 110 0 4 | TO -> IN if the tag of the following word is 'NP'
83 157 74 4 | NP -> NP-TL if the tag of the following word is
| 'NN-TL'
73 77 4 0 | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
71 116 45 3 | TO -> IN if the tag of words i+1...i+2 is 'NNS'
65 65 0 3 | NN -> VB if the tag of the preceding word is 'MD'
63 63 0 0 | VBD -> VBN if the tag of words i-3...i-1 is 'HVZ'
59 62 3 2 | CS -> QL if the text of words i+1...i+3 is 'as'
55 57 2 0 | VBD -> VBN if the tag of words i-3...i-1 is 'HVD'
Got:
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75299 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
c i o t | R Fixed = num tags changed incorrect -> correct
o x k h | u Broken = num tags changed correct -> incorrect
r e e e | l Other = num tags changed incorrect -> incorrect
e d n r | e
------------------+-------------------------------------------------------
354 354 0 3 | TO -> IN if the tag of the following word is 'AT'
110 110 0 3 | TO -> IN if the tag of the following word is 'NP'
91 127 36 6 | VB -> NN if the tag of words i-2...i-1 is 'AT'
82 143 61 3 | NN -> VB if the tag of the preceding word is 'TO'
71 116 45 2 | TO -> IN if the tag of words i+1...i+2 is 'NNS'
66 69 3 0 | VBN -> VBD if the tag of the preceding word is
| 'NP'
64 131 67 6 | NP -> NP-TL if the tag of the following word is
| 'NN-TL'
59 62 3 2 | CS -> QL if the text of words i+1...i+3 is 'as'
55 55 0 1 | NN -> VB if the tag of the preceding word is 'MD'
55 59 4 0 | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 274, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * brill_tagger.evaluate(brown_test))
Expected:
Accuracy: 89.1%
Got:
Accuracy: 89.5%
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2011 NLTK Project
......@@ -7,19 +8,41 @@
# For license information, see LICENSE.TXT
"""
This package contains several *tokenizers*, which break continuous text
into a sequence of units, such as words and punctuation. Tokenizers operate on a string,
and return a sequence of strings, one per token. The decision about which
tokenizer to use often depends on the particular application.
NLTK Tokenizer Package
The most frequently used tokenizer is ``word_tokenize()``, e.g.
Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the list of sentences or words in a
string.
>>> from nltk.tokenize import word_tokenize
>>> word_tokenize("Good muffins cost $3.88 in New York.")
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
>>> from nltk import word_tokenize, wordpunct_tokenize, sent_tokenize
>>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks."
>>> wordpunct_tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> sent_tokenize(s)
['Good muffins cost $3.88\\nin New York.', 'Please buy me\\ntwo of them.', 'Thanks.']
>>> [word_tokenize(t) for t in sent_tokenize(s)]
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
For more information about tokenization, please see the tokenizer HOWTO,
or chapter 3 of the NLTK book.
Caution: only use ``word_tokenize()`` on individual sentences.
Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
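For example, a file read as raw bytes would be decoded before tokenizing
(a minimal sketch; ``corpus.txt`` is a hypothetical UTF-8 encoded file, not part of NLTK):
>>> raw = open('corpus.txt').read()              # hypothetical file of UTF-8 bytes
>>> tokens = word_tokenize(raw.decode("utf8"))   # decode to unicode, then tokenize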
NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)
>>> from nltk.tokenize import WhitespaceTokenizer
>>> list(WhitespaceTokenizer().span_tokenize(s))
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.
For further information, please see Chapter 3 of the NLTK book.
"""
from ..data import load
......
......@@ -11,9 +11,62 @@
#
# $Id: probability.py 4865 2007-07-11 22:6:07Z edloper $
"""
The Punkt sentence tokenizer. The algorithm for this tokenizer is
described in Kiss & Strunk (2006)::
r"""
Punkt Sentence Tokenizer
This tokenizer divides a text into a list of sentences,
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences. It must be
trained on a large collection of plaintext in the target language
before it can be used.
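A new model can be trained directly from raw text (a minimal sketch;
``my_corpus.txt`` is a hypothetical plaintext file in the target language,
not part of NLTK):
>>> from nltk.tokenize.punkt import PunktSentenceTokenizer
>>> train_text = open('my_corpus.txt').read()              # hypothetical training corpus
>>> custom_tokenizer = PunktSentenceTokenizer(train_text)  # trains a model from the raw text
>>> sents = custom_tokenizer.tokenize("Dr. Brown arrived. He sat down.")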
The NLTK data package includes a pre-trained Punkt tokenizer for
English.
>>> import nltk.data
>>> text = '''
... Punkt knows that the periods in Mr. Smith and Johann S. Bach
... do not mark sentence boundaries. And sometimes sentences
... can start with non-capitalized words. i is a good variable
... name.
... '''
>>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
>>> print '\n-----\n'.join(sent_detector.tokenize(text.strip()))
Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
-----
And sometimes sentences
can start with non-capitalized words.
-----
i is a good variable
name.
(Note that whitespace from the original text, including newlines, is
retained in the output.)
Punctuation following sentences can be included with the realign_boundaries
flag:
>>> text = '''
... (How does it deal with this parenthesis?) "It should be part of the
... previous sentence."
... '''
>>> print '\n-----\n'.join(
... sent_detector.tokenize(text.strip(), realign_boundaries=True))
(How does it deal with this parenthesis?)
-----
"It should be part of the
previous sentence."
:class:`.PunktWordTokenizer` uses a regular expression to divide a text into tokens,
leaving all periods attached to words, but separating off other punctuation:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> PunktWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please',
'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
The algorithm for this tokenizer is described in::
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
Boundary Detection. Computational Linguistics 32: 485-525.
......
......@@ -7,9 +7,60 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
Tokenizers that divide strings into substrings using regular
expressions that can match either tokens or separators between tokens.
r"""
Regular-Expression Tokenizers
A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
>>> tokenizer.tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
>>> tokenizer = RegexpTokenizer('\s+', gaps=True)
>>> tokenizer.tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.
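For instance, leading and trailing whitespace produces no empty strings
(a small illustrative example; the input string is arbitrary):
>>> RegexpTokenizer('\s+', gaps=True).tokenize('  leading and trailing  ')
['leading', 'and', 'trailing']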
The material between the tokens is discarded. For example,
the following tokenizer selects just the capitalized words:
>>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
>>> capword_tokenizer.tokenize(s)
['Good', 'New', 'York', 'Please', 'Thanks']
This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.
>>> # Uses '\s*\n\s*\n\s*':
>>> BlanklineTokenizer().tokenize(s)
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
'Thanks.']
All of the regular expression tokenizers are also available as functions:
>>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+')
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> wordpunct_tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> blankline_tokenize(s)
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']
Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument. This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
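For example (a short illustrative comparison; the sample strings are arbitrary):
>>> import re
>>> re.findall(r'\w+', 'pattern first, then text')       # Python's re: pattern, then text
['pattern', 'first', 'then', 'text']
>>> regexp_tokenize('text first, then pattern', r'\w+')  # NLTK: text, then pattern
['text', 'first', 'then', 'pattern']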
"""
import re
......@@ -95,10 +146,12 @@ class RegexpTokenizer(TokenizerI):
class WhitespaceTokenizer(RegexpTokenizer):
r"""
Tokenize a string on whitespace (space, tab, newline).
In general, users should use ``str.split()`` instead, e.g.:
>>> words = "lorem ipsum".split()
In general, users should use the string ``split()`` method instead.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WhitespaceTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
"""
def __init__(self):
......@@ -114,14 +167,14 @@ class BlanklineTokenizer(RegexpTokenizer):
RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
r"""
"""
Tokenize a text into a sequence of alphabetic and
non-alphabetic characters. E.g.:
>>> from nltk.tokenize.regexp import WordPunctTokenizer
>>> WordPunctTokenizer().tokenize("She said 'hello'.")
['She', 'said', "'", 'hello', "'."]
non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.
>>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks."
>>> WordPunctTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
"""
def __init__(self):
RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
......
......@@ -7,10 +7,44 @@
# For license information, see LICENSE.TXT
"""
A tokenizer that divides strings into s-expressions. E.g.:
S-Expression Tokenizer
``SExprTokenizer`` is used to find parenthesized expressions in a
string. In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:
>>> SExprTokenizer().tokenize('c) d) e (f (g')
Traceback (most recent call last):
...
ValueError: Un-matched close paren at char 1
The ``strict`` argument can be set to False to allow for
non-matching parentheses. Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
['c', ')', 'd', ')', 'e', '(f (g']
The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
['{a b {c d}}', 'e', 'f', '{g}']
The s-expression tokenizer is also available as a function:
>>> sexpr_tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
"""
import re
......@@ -28,21 +62,18 @@ class SExprTokenizer(TokenizerI):
For example, the string ``(a (b c)) d e (f)`` consists of four
s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.
By default, the characters ``(`` and ``)`` are treated as open and
close parentheses, but alternative strings may be specified.
:param parens: A two-element sequence specifying the open and close parentheses
that should be used to find sexprs. This will typically be either a
two-character string, or a list of two strings.
:type parens: C{str} or C{list}
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
"""
def __init__(self, parens='()', strict=True):
"""
Construct a new SExpr tokenizer. By default, the characters
``(`` and ``)`` are treated as open and close parentheses;
but alternative strings may be specified.
:param parens: A two-element sequence specifying the open and
close parentheses that should be used to find sexprs. This
will typically be either a two-character string, or a list
of two strings.
:type parens: C{str} or C{list}
:param strict: If true, then raise an exception when tokenizing
an ill-formed sexpr.
"""
if len(parens) != 2:
raise ValueError('parens must contain exactly two strings')
self._strict = strict
......
......@@ -6,29 +6,56 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
Tokenizers that divide strings into substrings using the string
r"""
Simple Tokenizers
These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.
These tokenizers implement the ``TokenizerI`` interface, and so
can be used with any code that expects a tokenizer, e.g.
:class:`~nltk.corpus.reader.CorpusReader`.
The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> s.split()
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>> s.split(' ')
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split('\n')
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer. For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
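For example, a corpus reader could be constructed with one of these tokenizers
(a sketch; the corpus directory and fileid are hypothetical, and ``word_tokenizer``
is the ``PlaintextCorpusReader`` parameter that accepts a tokenizer object):
>>> from nltk.corpus.reader import PlaintextCorpusReader
>>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',  # hypothetical directory
...                                word_tokenizer=SpaceTokenizer())
>>> words = reader.words('example.txt')                            # hypothetical fileid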
When tokenizing using a particular delimiter string, consider using
the string ``split()`` method directly, as this is more efficient.
"""
from .api import TokenizerI, StringTokenizer
from .util import string_span_tokenize, regexp_span_tokenize
class SpaceTokenizer(StringTokenizer):
"""Tokenize a string using the space character as a delimiter.
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
_string = ' '
class TabTokenizer(StringTokenizer):
"""Tokenize a string use the tab character as a delimiter.
r"""Tokenize a string use the tab character as a delimiter,
the same as ``s.split('\t')``.
>>> TabTokenizer().tokenize('a\tb c\n\t d')
['a', 'b c\n', ' d']
"""
_string = '\t'
......@@ -46,21 +73,28 @@ class CharTokenizer(StringTokenizer):
yield i, j
class LineTokenizer(TokenizerI):
"""Tokenize a string into its lines, optionally discarding blank lines.
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s)
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
>>> # same as [l for l in s.split('\n') if l.strip()]:
>>> LineTokenizer(blanklines='discard').tokenize(s)
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', 'Thanks.']
:param blanklines: Indicates how blank lines should be handled. Valid values are:
- ``discard``: strip blank lines out of the token list before returning it.
A line is considered blank if it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline, then do not generate
a corresponding token ``''`` after that newline.
"""
def __init__(self, blanklines='discard'):
"""
:param blanklines: Indicates how blank lines should be
handled. Valid values are:
- ``discard``: strip blank lines out of the token list
before returning it. A line is considered blank if
it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline,
then do not generate a corresponding token ``''`` after
that newline.
"""
valid_blanklines = ('discard', 'keep', 'discard-eof')
if blanklines not in valid_blanklines:
raise ValueError('Blank lines must be one of: %s' %
......
......@@ -380,8 +380,6 @@ class TokenSequence(object):
self.__dict__.update(locals())
del self.__dict__['self']
#Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
def smooth(x,window_len=11,window='flat'):
"""smooth the data using a window with requested size.
......@@ -420,15 +418,12 @@ def smooth(x,window_len=11,window='flat'):
if x.size < window_len:
raise ValueError, "Input vector needs to be bigger than window size."
if window_len<3:
return x
if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
s=numpy.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
#print(len(s))
......
......@@ -5,12 +5,25 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
A tokenizer that uses the Penn Treebank conventions:
r"""
Penn Treebank Tokenizer
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
This tokenizer performs the following steps:
- split standard contractions, e.g. ``don't`` -> ``do n't``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of a line
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
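Contraction splitting can be seen with a short sentence (expected output
based on the rules listed above):
>>> TreebankWordTokenizer().tokenize("They'll save and invest more.")
['They', "'ll", 'save', 'and', 'invest', 'more', '.']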
"""
import re
......@@ -70,4 +83,3 @@ class TreebankWordTokenizer(TokenizerI):
text = re.sub('\. *(\n|$)', ' . ', text)
return text.split()
......@@ -34,10 +34,16 @@ def string_span_tokenize(s, sep):
left = right + len(sep)
def regexp_span_tokenize(s, regexp):
"""
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples,
by splitting the string at each successive match of *regexp*.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> list(WhitespaceTokenizer().span_tokenize(s))
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
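The function can also be called directly (a small illustrative example):
>>> list(regexp_span_tokenize("a b  c", r"\s+"))
[(0, 1), (2, 3), (5, 6)]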
:param s: the string to be tokenized
:type s: str
:param regexp: regular expression that matches token separators
......@@ -53,9 +59,15 @@ def regexp_span_tokenize(s, regexp):
yield left, len(s)
def spans_to_relative(spans):
"""
r"""
Return a sequence of relative spans, given a sequence of spans.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
[(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
(1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]
:param spans: a sequence of (start, end) offsets of the tokens
:type spans: iter(tuple(int, int))
:rtype: iter(tuple(int, int))
......