Commit 37aced7e by Steven Bird

reworked tokenizer howto, as docstrings in tokenizer package

parent 637d1901
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 167, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * unigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 85.4%
Got:
Accuracy: 85.8%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 178, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * unigram_tagger_2.evaluate(brown_test))
Expected:
Accuracy: 88.0%
Got:
Accuracy: 88.4%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 205, in tag.doctest
Failed example:
print bigram_tagger.size()
Expected:
3394
Got:
3386
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 207, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * bigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 89.4%
Got:
Accuracy: 89.6%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 222, in tag.doctest
Failed example:
print trigram_tagger.size()
Expected:
1493
Got:
1502
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 224, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * trigram_tagger.evaluate(brown_test))
Expected:
Accuracy: 88.8%
Got:
Accuracy: 89.0%
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 251, in tag.doctest
Failed example:
brill_tagger = trainer.train(brown_train, max_rules=10) # doctest: +NORMALIZE_WHITESPACE
Expected:
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75359 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
c i o t | R Fixed = num tags changed incorrect -> correct
o x k h | u Broken = num tags changed correct -> incorrect
r e e e | l Other = num tags changed incorrect -> incorrect
e d n r | e
------------------+-------------------------------------------------------
354 354 0 3 | TO -> IN if the tag of the following word is 'AT'
111 173 62 3 | NN -> VB if the tag of the preceding word is 'TO'
110 110 0 4 | TO -> IN if the tag of the following word is 'NP'
83 157 74 4 | NP -> NP-TL if the tag of the following word is
| 'NN-TL'
73 77 4 0 | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
71 116 45 3 | TO -> IN if the tag of words i+1...i+2 is 'NNS'
65 65 0 3 | NN -> VB if the tag of the preceding word is 'MD'
63 63 0 0 | VBD -> VBN if the tag of words i-3...i-1 is 'HVZ'
59 62 3 2 | CS -> QL if the text of words i+1...i+3 is 'as'
55 57 2 0 | VBD -> VBN if the tag of words i-3...i-1 is 'HVD'
Got:
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75299 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
c i o t | R Fixed = num tags changed incorrect -> correct
o x k h | u Broken = num tags changed correct -> incorrect
r e e e | l Other = num tags changed incorrect -> incorrect
e d n r | e
------------------+-------------------------------------------------------
354 354 0 3 | TO -> IN if the tag of the following word is 'AT'
110 110 0 3 | TO -> IN if the tag of the following word is 'NP'
91 127 36 6 | VB -> NN if the tag of words i-2...i-1 is 'AT'
82 143 61 3 | NN -> VB if the tag of the preceding word is 'TO'
71 116 45 2 | TO -> IN if the tag of words i+1...i+2 is 'NNS'
66 69 3 0 | VBN -> VBD if the tag of the preceding word is
| 'NP'
64 131 67 6 | NP -> NP-TL if the tag of the following word is
| 'NN-TL'
59 62 3 2 | CS -> QL if the text of words i+1...i+3 is 'as'
55 55 0 1 | NN -> VB if the tag of the preceding word is 'MD'
55 59 4 0 | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
***************************************************************************
File "/Users/sb/git/nltk/nltk/test/tag.doctest", line 274, in tag.doctest
Failed example:
print 'Accuracy: %4.1f%%' % (
100.0 * brill_tagger.evaluate(brown_test))
Expected:
Accuracy: 89.1%
Got:
Accuracy: 89.5%
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2011 NLTK Project
......@@ -7,19 +8,41 @@
# For license information, see LICENSE.TXT
"""
This package contains several *tokenizers*, which break continuous text
into a sequence of units, such as words and punctuation. Tokenizers operate on a string,
and return a sequence of strings, one per token. The decision about which
tokenizer to use often depends on the particular application.
NLTK Tokenizer Package
The most frequently used tokenizer is ``word_tokenize()``, e.g.
Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the list of sentences or words in a
string.
>>> from nltk.tokenize import word_tokenize
>>> word_tokenize("Good muffins cost $3.88 in New York.")
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
>>> from nltk import word_tokenize, wordpunct_tokenize, sent_tokenize
>>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks."
>>> wordpunct_tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> sent_tokenize(s)
['Good muffins cost $3.88\\nin New York.', 'Please buy me\\ntwo of them.', 'Thanks.']
>>> [word_tokenize(t) for t in sent_tokenize(s)]
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
For more information about tokenization, please see the tokenizer HOWTO,
or chapter 3 of the NLTK book.
Caution: only use ``word_tokenize()`` on individual sentences.
Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
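For example, a file read as raw bytes would be decoded before tokenizing
(a minimal sketch; ``corpus.txt`` is a hypothetical UTF-8 encoded file, not part of NLTK):
>>> raw = open('corpus.txt').read()              # hypothetical file of UTF-8 bytes
>>> tokens = word_tokenize(raw.decode("utf8"))   # decode to unicode, then tokenize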
NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)
>>> from nltk.tokenize import WhitespaceTokenizer
>>> list(WhitespaceTokenizer().span_tokenize(s))
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.
For further information, please see Chapter 3 of the NLTK book.
"""
from ..data import load
......
......@@ -11,9 +11,62 @@
#
# $Id: probability.py 4865 2007-07-11 22:6:07Z edloper $
"""
The Punkt sentence tokenizer. The algorithm for this tokenizer is
described in Kiss & Strunk (2006)::
r"""
Punkt Sentence Tokenizer
This tokenizer divides a text into a list of sentences,
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences. It must be
trained on a large collection of plaintext in the target language
before it can be used.
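A new model can be trained directly from raw text (a minimal sketch;
``my_corpus.txt`` is a hypothetical plaintext file in the target language,
not part of NLTK):
>>> from nltk.tokenize.punkt import PunktSentenceTokenizer
>>> train_text = open('my_corpus.txt').read()              # hypothetical training corpus
>>> custom_tokenizer = PunktSentenceTokenizer(train_text)  # trains a model from the raw text
>>> sents = custom_tokenizer.tokenize("Dr. Brown arrived. He sat down.")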
The NLTK data package includes a pre-trained Punkt tokenizer for
English.
>>> import nltk.data
>>> text = '''
... Punkt knows that the periods in Mr. Smith and Johann S. Bach
... do not mark sentence boundaries. And sometimes sentences
... can start with non-capitalized words. i is a good variable
... name.
... '''
>>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
>>> print '\n-----\n'.join(sent_detector.tokenize(text.strip()))
Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
-----
And sometimes sentences
can start with non-capitalized words.
-----
i is a good variable
name.
(Note that whitespace from the original text, including newlines, is
retained in the output.)
Punctuation following sentences can be included with the realign_boundaries
flag:
>>> text = '''
... (How does it deal with this parenthesis?) "It should be part of the
... previous sentence."
... '''
>>> print '\n-----\n'.join(
... sent_detector.tokenize(text.strip(), realign_boundaries=True))
(How does it deal with this parenthesis?)
-----
"It should be part of the
previous sentence."
:class:`.PunktWordTokenizer` uses a regular expression to divide a text into tokens,
leaving all periods attached to words, but separating off other punctuation:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> PunktWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please',
'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
The algorithm for this tokenizer is described in::
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
Boundary Detection. Computational Linguistics 32: 485-525.
......
......@@ -7,9 +7,60 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
Tokenizers that divide strings into substrings using regular
expressions that can match either tokens or separators between tokens.
r"""
Regular-Expression Tokenizers
A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
>>> tokenizer.tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
>>> tokenizer = RegexpTokenizer('\s+', gaps=True)
>>> tokenizer.tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.
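For instance, leading and trailing whitespace produces no empty strings
(a small illustrative example; the input string is arbitrary):
>>> RegexpTokenizer('\s+', gaps=True).tokenize('  leading and trailing  ')
['leading', 'and', 'trailing']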
The material between the tokens is discarded. For example,
the following tokenizer selects just the capitalized words:
>>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
>>> capword_tokenizer.tokenize(s)
['Good', 'New', 'York', 'Please', 'Thanks']
This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.
>>> # Uses '\s*\n\s*\n\s*':
>>> BlanklineTokenizer().tokenize(s)
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
'Thanks.']
All of the regular expression tokenizers are also available as functions:
>>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+')
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> wordpunct_tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> blankline_tokenize(s)
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']
Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument. This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
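For example (a short illustrative comparison; the sample strings are arbitrary):
>>> import re
>>> re.findall(r'\w+', 'pattern first, then text')       # Python's re: pattern, then text
['pattern', 'first', 'then', 'text']
>>> regexp_tokenize('text first, then pattern', r'\w+')  # NLTK: text, then pattern
['text', 'first', 'then', 'pattern']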
"""
import re
......@@ -95,10 +146,12 @@ class RegexpTokenizer(TokenizerI):
class WhitespaceTokenizer(RegexpTokenizer):
r"""
Tokenize a string on whitespace (space, tab, newline).
In general, users should use ``str.split()`` instead, e.g.:
>>> words = "lorem ipsum".split()
In general, users should use the string ``split()`` method instead.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WhitespaceTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
"""
def __init__(self):
......@@ -114,14 +167,14 @@ class BlanklineTokenizer(RegexpTokenizer):
RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
r"""
"""
Tokenize a text into a sequence of alphabetic and
non-alphabetic characters. E.g.:
>>> from nltk.tokenize.regexp import WordPunctTokenizer
>>> WordPunctTokenizer().tokenize("She said 'hello'.")
['She', 'said', "'", 'hello', "'."]
non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.
>>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks."
>>> WordPunctTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
"""
def __init__(self):
RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
......
......@@ -7,10 +7,44 @@
# For license information, see LICENSE.TXT
"""
A tokenizer that divides strings into s-expressions. E.g.:
S-Expression Tokenizer
``SExprTokenizer`` is used to find parenthesized expressions in a
string. In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:
>>> SExprTokenizer().tokenize('c) d) e (f (g')
Traceback (most recent call last):
...
ValueError: Un-matched close paren at char 1
The ``strict`` argument can be set to False to allow for
non-matching parentheses. Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
['c', ')', 'd', ')', 'e', '(f (g']
The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
['{a b {c d}}', 'e', 'f', '{g}']
The s-expression tokenizer is also available as a function:
>>> sexpr_tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
"""
import re
......@@ -28,21 +62,18 @@ class SExprTokenizer(TokenizerI):
For example, the string ``(a (b c)) d e (f)`` consists of four
s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.
By default, the characters ``(`` and ``)`` are treated as open and
close parentheses, but alternative strings may be specified.
:param parens: A two-element sequence specifying the open and close parentheses
that should be used to find sexprs. This will typically be either a
two-character string, or a list of two strings.
:type parens: C{str} or C{list}
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
"""
def __init__(self, parens='()', strict=True):
"""
Construct a new SExpr tokenizer. By default, the characters
``(`` and ``)`` are treated as open and close parentheses;
but alternative strings may be specified.
:param parens: A two-element sequence specifying the open and
close parentheses that should be used to find sexprs. This
will typically be either a two-character string, or a list
of two strings.
:type parens: C{str} or C{list}
:param strict: If true, then raise an exception when tokenizing
an ill-formed sexpr.
"""
if len(parens) != 2:
raise ValueError('parens must contain exactly two strings')
self._strict = strict
......
......@@ -6,29 +6,56 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
Tokenizers that divide strings into substrings using the string
r"""
Simple Tokenizers
These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.
These tokenizers implement the ``TokenizerI`` interface, and so
can be used with any code that expects a tokenizer, e.g.
:class:`~nltk.corpus.reader.CorpusReader`.
The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> s.split()
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>> s.split(' ')
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split('\n')
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer. For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
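For example, a corpus reader could be constructed with one of these tokenizers
(a sketch; the corpus directory and fileid are hypothetical, and ``word_tokenizer``
is the ``PlaintextCorpusReader`` parameter that accepts a tokenizer object):
>>> from nltk.corpus.reader import PlaintextCorpusReader
>>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',  # hypothetical directory
...                                word_tokenizer=SpaceTokenizer())
>>> words = reader.words('example.txt')                            # hypothetical fileid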
When tokenizing using a particular delimiter string, consider using
the string ``split()`` method directly, as this is more efficient.
"""
from .api import TokenizerI, StringTokenizer
from .util import string_span_tokenize, regexp_span_tokenize
class SpaceTokenizer(StringTokenizer):
"""Tokenize a string using the space character as a delimiter.
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
_string = ' '
class TabTokenizer(StringTokenizer):
"""Tokenize a string use the tab character as a delimiter.
r"""Tokenize a string use the tab character as a delimiter,
the same as ``s.split('\t')``.
>>> TabTokenizer().tokenize('a\tb c\n\t d')
['a', 'b c\n', ' d']
"""
_string = '\t'
......@@ -46,21 +73,28 @@ class CharTokenizer(StringTokenizer):
yield i, j
class LineTokenizer(TokenizerI):
"""Tokenize a string into its lines, optionally discarding blank lines.
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s)
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
>>> # same as [l for l in s.split('\n') if l.strip()]:
>>> LineTokenizer(blanklines='discard').tokenize(s)
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', 'Thanks.']
:param blanklines: Indicates how blank lines should be handled. Valid values are:
- ``discard``: strip blank lines out of the token list before returning it.
A line is considered blank if it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline, then do not generate
a corresponding token ``''`` after that newline.
"""
def __init__(self, blanklines='discard'):
"""
:param blanklines: Indicates how blank lines should be
handled. Valid values are:
- ``discard``: strip blank lines out of the token list
before returning it. A line is considered blank if
it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline,
then do not generate a corresponding token ``''`` after
that newline.
"""
valid_blanklines = ('discard', 'keep', 'discard-eof')
if blanklines not in valid_blanklines:
raise ValueError('Blank lines must be one of: %s' %
......
......@@ -380,8 +380,6 @@ class TokenSequence(object):
self.__dict__.update(locals())
del self.__dict__['self']
#Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
def smooth(x,window_len=11,window='flat'):
"""smooth the data using a window with requested size.
......@@ -420,15 +418,12 @@ def smooth(x,window_len=11,window='flat'):
if x.size < window_len:
raise ValueError, "Input vector needs to be bigger than window size."
if window_len<3:
return x
if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
s=numpy.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
#print(len(s))
......
......@@ -5,12 +5,25 @@
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
A tokenizer that uses the Penn Treebank conventions:
r"""
Penn Treebank Tokenizer
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
This tokenizer performs the following steps:
- split standard contractions, e.g. ``don't`` -> ``do n't``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of a line
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
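Contraction splitting can be seen with a short sentence (expected output
based on the rules listed above):
>>> TreebankWordTokenizer().tokenize("They'll save and invest more.")
['They', "'ll", 'save', 'and', 'invest', 'more', '.']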
"""
import re
......@@ -70,4 +83,3 @@ class TreebankWordTokenizer(TokenizerI):
text = re.sub('\. *(\n|$)', ' . ', text)
return text.split()
......@@ -34,10 +34,16 @@ def string_span_tokenize(s, sep):
left = right + len(sep)
def regexp_span_tokenize(s, regexp):
"""
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples,
by splitting the string at each successive match of *regexp*.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> list(WhitespaceTokenizer().span_tokenize(s))
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
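The function can also be called directly (a small illustrative example):
>>> list(regexp_span_tokenize("a b  c", r"\s+"))
[(0, 1), (2, 3), (5, 6)]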
:param s: the string to be tokenized
:type s: str
:param regexp: regular expression that matches token separators
......@@ -53,9 +59,15 @@ def regexp_span_tokenize(s, regexp):
yield left, len(s)
def spans_to_relative(spans):
"""
r"""
Return a sequence of relative spans, given a sequence of spans.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
[(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
(1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]
:param spans: a sequence of (start, end) offsets of the tokens
:type spans: iter(tuple(int, int))
:rtype: iter(tuple(int, int))
......