Commit daf4d0f7 by Steven Bird

Merge branch 'develop' into brill

parents 4cc33e5e d78254b7
......@@ -138,6 +138,9 @@ mac_morpho = LazyCorpusLoader(
machado = LazyCorpusLoader(
'machado', PortugueseCategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
masc_tagged = LazyCorpusLoader(
'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
cat_file='categories.txt', tagset='wsj', encoding="ascii", sep="_")
movie_reviews = LazyCorpusLoader(
'movie_reviews', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
......
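For orientation, here is a minimal sketch of how the newly registered masc_tagged corpus is reached through the lazy loader above. It assumes the masc_tagged data package has already been installed locally; the commented download identifier is an assumption rather than something taken from this commit:

    from nltk.corpus import masc_tagged    # LazyCorpusLoader: nothing is read yet

    # nltk.download('masc_tagged')         # assumed package id; fetch the data first if needed
    print(masc_tagged.fileids()[:3])       # paths under spoken/ and written/
    print(masc_tagged.tagged_words()[:5])  # (word, tag) pairs in the corpus's WSJ-style tagset
    print(len(masc_tagged.tagged_sents()))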
......@@ -5,23 +5,29 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the British National Corpus.
"""
__docformat__ = 'epytext en'
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
class BNCCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the British National Corpus.
"""Corpus reader for the XML version of the British National Corpus.
For access to the complete XML data structure, use the ``xml()``
method. For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
You can obtain the full version of the BNC corpus at
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
"""
def __init__(self, root, fileids, lazy=True):
XMLCorpusReader.__init__(self, root, fileids)
self._lazy = lazy
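The class docstring above gives the constructor call for a full local copy of the corpus; the short sketch below merely extends it with the access methods the docstring mentions, assuming the BNC XML archive has been extracted to a directory called BNC:

    from nltk.corpus.reader.bnc import BNCCorpusReader

    bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    words = bnc.words()                  # plain word tokens
    tagged = bnc.tagged_words(c5=True)   # (word, C5 tag) tuples
    stems = bnc.sents(stem=True)         # sentences as lists of word stems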
......@@ -36,14 +42,7 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if self._lazy:
return concat([BNCWordView(fileid, False, None,
strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, False, None,
strip_space, stem)
for fileid in self.abspaths(fileids)])
return self._views(fileids, False, None, strip_space, stem)
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
......@@ -58,16 +57,8 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, False, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, False, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
tag = 'c5' if c5 else 'pos'
return self._views(fileids, False, tag, strip_space, stem)
def sents(self, fileids=None, strip_space=True, stem=False):
"""
......@@ -80,15 +71,9 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if self._lazy:
return concat([BNCWordView(fileid, True, None, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, True, None, strip_space, stem)
for fileid in self.abspaths(fileids)])
return self._views(fileids, True, None, strip_space, stem)
def tagged_sents(self, fileids=None, c5=False, strip_space=True,
stem=False):
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
......@@ -100,16 +85,13 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, True, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, True, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
tag = 'c5' if c5 else 'pos'
return self._views(fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem)
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
f = BNCWordView if self._lazy else self._words
return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)])
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
"""
......@@ -174,6 +156,16 @@ class BNCWordView(XMLCorpusView):
"""
A stream-backed corpus view specialized for use with the BNC corpus.
"""
tags_to_ignore = set(
['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
)
"""These tags are ignored. For their description refer to the
technical documentation, for example,
http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
"""
def __init__(self, fileid, sent, tag, strip_space, stem):
"""
:param fileid: The name of the underlying file.
......@@ -191,6 +183,11 @@ class BNCWordView(XMLCorpusView):
self._strip_space = strip_space
self._stem = stem
self.title = None #: Title of the document.
self.author = None #: Author of the document.
self.editor = None #: Editor
self.resps = None #: Statement of responsibility
XMLCorpusView.__init__(self, fileid, tagspec)
# Read in a tasty header.
......@@ -201,11 +198,6 @@ class BNCWordView(XMLCorpusView):
# Reset tag context.
self._tag_context = {0: ()}
title = None #: Title of the document.
author = None #: Author of the document.
editor = None #: Editor
resps = None #: Statement of responsibility
def handle_header(self, elt, context):
# Set up some metadata!
titles = elt.findall('titleStmt/title')
......@@ -251,10 +243,10 @@ class BNCWordView(XMLCorpusView):
def handle_sent(self, elt):
sent = []
for child in elt:
if child.tag == 'mw':
if child.tag in ('mw', 'hi', 'corr', 'trunc'):
sent += [self.handle_word(w) for w in child]
elif child.tag in ('w', 'c'):
sent.append(self.handle_word(child))
else:
elif child.tag not in self.tags_to_ignore:
raise ValueError('Unexpected element %s' % child.tag)
return BNCSentence(elt.attrib['n'], sent)
<bncDoc xml:id="FX8"><teiHeader><fileDesc><titleStmt><title> General practitioner's surgery: medical consultation. Sample containing about 125 words speech recorded in public context </title><respStmt><resp> Data capture and transcription </resp><name> Longman ELT </name> </respStmt></titleStmt><editionStmt><edition>BNC XML Edition, December 2006</edition></editionStmt><extent> 125 tokens; 130 w-units; 15 s-units </extent><publicationStmt><distributor>Distributed under licence by Oxford University Computing Services on behalf of the BNC Consortium.</distributor><availability> This material is protected by international copyright laws and may not be copied or redistributed in any way. Consult the BNC Web Site at http://www.natcorp.ox.ac.uk for full licencing and distribution conditions.</availability><idno type="bnc">FX8</idno><idno type="old"> 093802 </idno></publicationStmt><sourceDesc><recordingStmt><recording n="093802" type="DAT"/></recordingStmt></sourceDesc></fileDesc><encodingDesc><tagsDecl><namespace name=""><tagUsage gi="align" occurs="4"/><tagUsage gi="c" occurs="21"/><tagUsage gi="event" occurs="1"/><tagUsage gi="gap" occurs="1"/><tagUsage gi="mw" occurs="2"/><tagUsage gi="pause" occurs="2"/><tagUsage gi="s" occurs="15"/><tagUsage gi="u" occurs="9"/><tagUsage gi="unclear" occurs="16"/><tagUsage gi="w" occurs="130"/></namespace></tagsDecl></encodingDesc><profileDesc><creation date="0000">0000-00-00 Origination/creation date not known </creation><particDesc n="C126"><person ageGroup="X" xml:id="PS22T" role="unspecified" sex="m" soc="AB" dialect="NONE" firstLang="EN-GBR" educ="Ed0"><persName>Doctor</persName> <occupation>doctor</occupation> <persNote>other participants are doctors patients</persNote></person><person ageGroup="X" xml:id="FX8PS000" role="unspecified" sex="u" soc="UU" dialect="NONE"/><person ageGroup="X" dialect="NONE" n="W0000" role="other" sex="u" soc="UU" xml:id="FX8PSUNK"><persName>Unknown speaker</persName></person><person ageGroup="X" dialect="NONE" n="W000M" role="other" sex="u" soc="UU" xml:id="FX8PSUGP"><persName>Group of unknown speakers</persName></person></particDesc><settingDesc><setting n="093802" who="PS22T FX8PS000"><placeName>Strathclyde: Lanarkshire </placeName> <locale> G.P.'s surgery </locale><activity> Medical consultation </activity></setting></settingDesc><textClass><catRef targets="SPO ALLTIM3 ALLAVA0 ALLTYP2 SCGDOM3 SPOLOG2 SPOREG3"/><classCode scheme="DLEE">S consult</classCode><keywords><term> medicine </term><term> medical consultation </term></keywords></textClass></profileDesc><revisionDesc><change date="2006-10-21" who="#OUCS">Tag usage updated for BNC-XML</change><change date="2000-12-13" who="#OUCS">Last check for BNC World first release</change><change date="2000-09-01" who="#OUCS">Check all tagcounts</change><change date="2000-06-23" who="#OUCS">Resequenced s-units and added headers</change><change date="2000-01-29" who="#OUCS">Revised participant details</change><change date="2000-01-21" who="#OUCS">Added date info</change><change date="2000-01-09" who="#OUCS">Updated all catrefs</change><change date="2000-01-09" who="#OUCS">Updated REC elements to include tape number</change><change date="2000-01-08" who="#OUCS">Updated titles</change><change date="1999-12-25" who="#OUCS">corrected tagUsage</change><change date="1999-09-20" who="#UCREL">POS codes revised for BNC-2; header updated</change><change date="1994-11-27" who="#dominic">Initial accession to corpus</change></revisionDesc></teiHeader>
<stext type="OTHERSP"><u who="FX8PSUNK">
<s n="1"><w c5="ITJ" hw="ah" pos="INTERJ">Ah </w><w c5="AV0" hw="there" pos="ADV">there </w><w c5="PNP" hw="we" pos="PRON">we </w><w c5="VBB" hw="be" pos="VERB">are</w><c c5="PUN">,</c><unclear/><c c5="PUN">.</c></s>
<s n="2"><w c5="AV0" hw="right" pos="ADV">Right </w><unclear/><w c5="AJ0" hw="abdominal" pos="ADJ">abdominal </w><w c5="NN1" hw="wound" pos="SUBST">wound</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she</w><w c5="VBZ" hw="be" pos="VERB">'s </w><w c5="AT0" hw="a" pos="ART">a </w><w c5="AJ0-NN1" hw="wee" pos="ADJ">wee </w><w c5="NN1" hw="bit" pos="SUBST">bit </w><pause/><w c5="VVN-AJ0" hw="confuse" pos="VERB">confused</w><c c5="PUN">.</c></s>
<s n="3"><w c5="PNP" hw="she" pos="PRON">She </w><w c5="VDD" hw="do" pos="VERB">did</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="VVI" hw="bother" pos="VERB">bother </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="tell" pos="VERB">tell </w><w c5="PNP" hw="i" pos="PRON">me </w><w c5="CJT" hw="that" pos="CONJ">that </w><w c5="PNP" hw="she" pos="PRON">she</w><w c5="VHD" hw="have" pos="VERB">'d </w><w c5="AV0" hw="only" pos="ADV">only </w><w c5="VVN" hw="get" pos="VERB">got </w><unclear/><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="call" pos="VERB">call </w><w c5="PNP" hw="you" pos="PRON">you</w><c c5="PUN">, </c><w c5="AV0" hw="right" pos="ADV">right</w><c c5="PUN">?</c></s>
<s n="4"><w c5="UNC" hw="erm" pos="UNC">Erm </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VBD" hw="be" pos="VERB">was</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="PRP" hw="in" pos="PREP">in </w><w c5="DPS" hw="she" pos="PRON">her </w><w c5="NN1" hw="nightdress" pos="SUBST">nightdress </w><w c5="CJC" hw="but" pos="CONJ">but </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="AV0" hw="only" pos="ADV">only </w><w c5="VVD" hw="dress" pos="VERB">dressed </w><w c5="PNX" hw="herself" pos="PRON">herself</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="say" pos="VERB">said </w><align with="FX8LC001"/><unclear/><align with="FX8LC002"/></s></u><u who="PS22T">
<s n="5"><align with="FX8LC001"/><w c5="CJC" hw="and" pos="CONJ">And </w><w c5="PNP" hw="you" pos="PRON">you </w><unclear/><align with="FX8LC002"/></s></u><u who="FX8PS000">
<s n="6"><w c5="PNP" hw="she" pos="PRON">She </w><w c5="VVD" hw="say" pos="VERB">said </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="go" pos="VERB">went </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="buy" pos="VERB">buy </w><w c5="PNI" hw="something" pos="PRON">something </w><unclear/><w c5="PNX" hw="herself" pos="PRON">herself</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="phone" pos="VERB">phoned </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1" hw="clinic" pos="SUBST">clinic </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1" hw="clinic" pos="SUBST">clinic </w><unclear/><c c5="PUN">.</c></s>
<s n="7"><w c5="PNP" hw="she" pos="PRON">She</w><w c5="VBZ" hw="be" pos="VERB">'s </w><unclear/><w c5="AV0" hw="here" pos="ADV">here </w><w c5="CJC" hw="and" pos="CONJ">and </w><gap desc="name" reason="anonymization"/><w c5="VVZ" hw="say" pos="VERB">says </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VM0" hw="should" pos="VERB">should </w><w c5="VBI" hw="be" pos="VERB">be </w><unclear/><w c5="AV0" hw="fortnightly" pos="ADV">fortnightly </w><unclear/><c c5="PUN">.</c></s>
<s n="8"><pause/><w c5="AV0" hw="so" pos="ADV">So </w><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VDB" hw="do" pos="VERB">do</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="VVI" hw="know" pos="VERB">know </w><w c5="CJS" hw="whether" pos="CONJ">whether </w><w c5="PNP" hw="you" pos="PRON">you </w><w c5="VVB" hw="want" pos="VERB">want </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="go" pos="VERB">go </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="VVI" hw="see" pos="VERB">see </w><w c5="PNP" hw="she" pos="PRON">her </w><mw c5="PRP"><w c5="AV0" hw="rather" pos="ADV">rather </w><w c5="CJS" hw="than" pos="CONJ">than</w></mw><c c5="PUN">, </c><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VM0" hw="could" pos="VERB">could </w><w c5="VVI" hw="get" pos="VERB">get </w><w c5="AT0" hw="a" pos="ART">a </w><w c5="NN1" hw="doctor" pos="SUBST">doctor </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="go" pos="VERB">go </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="VVI" hw="see" pos="VERB">see </w><w c5="PNP" hw="she" pos="PRON">her </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="NN1-VVB" hw="phone" pos="SUBST">phone</w><c c5="PUN">,</c><unclear/><c c5="PUN">.</c></s></u><u who="PS22T">
<s n="9"><unclear/><w c5="PNP" hw="it" pos="PRON">it</w><w c5="VBZ" hw="be" pos="VERB">'s </w><w c5="AV0" hw="just" pos="ADV">just </w><w c5="CJT" hw="that" pos="CONJ">that </w><w c5="PNP" hw="i" pos="PRON">I</w><w c5="VBB" hw="be" pos="VERB">'m </w><w c5="AV0" hw="never" pos="ADV">never </w><w c5="VVG" hw="gon" pos="VERB">gon</w><w c5="TO0" hw="na" pos="PREP">na </w><w c5="VVI" hw="get" pos="VERB">get </w><w c5="PRP" hw="to" pos="PREP">to </w><mw c5="PRP"><w c5="AVP" hw="up" pos="ADV">up </w><w c5="PRP" hw="to" pos="PREP">to</w></mw><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="10"><unclear/><c c5="PUN">?</c></s></u><u who="PS22T">
<s n="11"><w c5="ITJ" hw="yeah" pos="INTERJ">Yeah</w><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="12"><w c5="AV0" hw="okay" pos="ADV">Okay</w><c c5="PUN">.</c></s></u><u who="PS22T">
<s n="13"><w c5="ITJ" hw="yeah" pos="INTERJ">Yeah</w><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="14"><w c5="UNC" hw="erm" pos="UNC">erm</w><c c5="PUN">, </c><w c5="ORD" hw="first" pos="ADJ">first </w><unclear/><w c5="CRD" hw="twelve" pos="ADJ">twelve </w><w c5="NN2" hw="week" pos="SUBST">weeks </w><w c5="AJ0" hw="pregnant" pos="ADJ">pregnant </w><w c5="AV0" hw="so" pos="ADV">so </w><w c5="VM0" hw="should" pos="VERB">should </w><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VVI" hw="mark" pos="VERB">mark </w><w c5="PRP" hw="at" pos="PREP">at </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1-AJ0" hw="bottom" pos="SUBST">bottom </w><w c5="CJS" hw="when" pos="CONJ">when </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVZ" hw="type" pos="VERB">types </w><unclear/><c c5="PUN">.</c></s>
<s n="15"><w c5="UNC" hw="erm" pos="UNC">Erm </w><unclear/><w c5="DT0" hw="this" pos="ADJ">this </w><w c5="PNI" hw="one" pos="PRON">one</w><c c5="PUN">.</c><event desc="recording ends"/></s></u></stext></bncDoc>
.. Copyright (C) 2001-2014 NLTK Project
.. For license information, see LICENSE.TXT
>>> from nltk.corpus.reader.bnc import BNCCorpusReader
>>> bnc = BNCCorpusReader(root='.', fileids=r'FX8.xml')
Checking the word access.
-------------------------
>>> len(bnc.words())
151
>>> bnc.words()[:6]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.words(stem=True)[:6]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_words()[:6]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_words(c5=True)[:6]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
Testing access to the sentences.
--------------------------------
>>> len(bnc.sents())
15
>>> bnc.sents()[0]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.sents(stem=True)[0]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_sents()[0]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_sents(c5=True)[0]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
A non-lazy loader.
------------------
>>> eager = BNCCorpusReader(root='.', fileids=r'FX8.xml', lazy=False)
>>> len(eager.words())
151
>>> eager.words(stem=True)[6:17]
['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
>>> eager.tagged_words()[6:11]
[('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
>>> eager.tagged_words(c5=True)[6:17]
[('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
>>> len(eager.sents())
15
......@@ -6,6 +6,7 @@
# Steven Bird <stevenbird1@gmail.com> (additions)
# Edward Loper <edloper@gmail.com> (rewrite)
# Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite)
# Arthur Darcet <arthur@darcet.fr> (fixes)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
......@@ -49,7 +50,7 @@ flag.
>>> text = '''
... (How does it deal with this parenthesis?) "It should be part of the
... previous sentence." "(And the same with this one.)" ('And this one!')
... "('(And (this)) '?)" [(and this.)]
... "('(And (this)) '?)" [(and this. )]
... '''
>>> print('\n-----\n'.join(
... sent_detector.tokenize(text.strip())))
......@@ -64,7 +65,7 @@ flag.
-----
"('(And (this)) '?)"
-----
[(and this.)]
[(and this. )]
>>> print('\n-----\n'.join(
... sent_detector.tokenize(text.strip(), realign_boundaries=False)))
(How does it deal with this parenthesis?
......@@ -1297,12 +1298,15 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
break_decision=tokens[0].sentbreak,
)
def span_tokenize(self, text):
def span_tokenize(self, text, realign_boundaries=True):
"""
Given a text, returns a list of the (start, end) spans of sentences
in the text.
"""
return [(sl.start, sl.stop) for sl in self._slices_from_text(text)]
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
return [(sl.start, sl.stop) for sl in slices]
def sentences_from_text(self, text, realign_boundaries=True):
"""
......@@ -1311,10 +1315,7 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
True, includes in the sentence closing punctuation that
follows the period.
"""
sents = [text[sl] for sl in self._slices_from_text(text)]
if realign_boundaries:
sents = self._realign_boundaries(sents)
return sents
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
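As a quick, hedged illustration of the refactored pair of methods: the sketch below uses an untrained PunktSentenceTokenizer, so the exact splits are illustrative rather than a captured doctest, but it shows how span_tokenize and sentences_from_text now go through the same realignment path:

    from nltk.tokenize.punkt import PunktSentenceTokenizer

    tokenizer = PunktSentenceTokenizer()
    text = '(Is this parenthesis handled?) "It should be part of the previous sentence."'

    for start, end in tokenizer.span_tokenize(text):   # realign_boundaries=True by default
        print((start, end), text[start:end])

With realignment, the trailing ')' and '"' stay inside the span of the sentence that precedes them, so each text[start:end] matches the corresponding element of sentences_from_text(text); passing realign_boundaries=False to either method breaks immediately after the '?' and the '.'.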
def _slices_from_text(self, text):
last_break = 0
......@@ -1330,7 +1331,7 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
last_break = match.end()
yield slice(last_break, len(text))
def _realign_boundaries(self, sents):
def _realign_boundaries(self, text, slices):
"""
Attempts to realign punctuation that falls after the period but
should otherwise be included in the same sentence.
......@@ -1344,21 +1345,21 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
["(Sent1.)", "Sent2."].
"""
realign = 0
for s1, s2 in _pair_iter(sents):
s1 = s1[realign:]
if not s2:
if s1:
yield s1
for sl1, sl2 in _pair_iter(slices):
sl1 = slice(sl1.start + realign, sl1.stop)
if not sl2:
if text[sl1]:
yield sl1
continue
m = self._lang_vars.re_boundary_realignment.match(s2)
m = self._lang_vars.re_boundary_realignment.match(text[sl2])
if m:
yield s1 + m.group(0).strip()
yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
realign = m.end()
else:
realign = 0
if s1:
yield s1
if text[sl1]:
yield sl1
def text_contains_sentbreak(self, text):
"""
......