Commit daf4d0f7 by Steven Bird

Merge branch 'develop' into brill

parents 4cc33e5e d78254b7
......@@ -138,6 +138,9 @@ mac_morpho = LazyCorpusLoader(
machado = LazyCorpusLoader(
'machado', PortugueseCategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
masc_tagged = LazyCorpusLoader(
'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
cat_file='categories.txt', tagset='wsj', encoding="ascii", sep="_")
movie_reviews = LazyCorpusLoader(
'movie_reviews', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
......
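For orientation, here is a minimal sketch of how the newly registered masc_tagged corpus is reached through the lazy loader above. It assumes the masc_tagged data package has already been installed locally; the commented download identifier is an assumption rather than something taken from this commit:

    from nltk.corpus import masc_tagged    # LazyCorpusLoader: nothing is read yet

    # nltk.download('masc_tagged')         # assumed package id; fetch the data first if needed
    print(masc_tagged.fileids()[:3])       # paths under spoken/ and written/
    print(masc_tagged.tagged_words()[:5])  # (word, tag) pairs in the corpus's WSJ-style tagset
    print(len(masc_tagged.tagged_sents()))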
......@@ -5,23 +5,29 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the British National Corpus.
"""
__docformat__ = 'epytext en'
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
class BNCCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the British National Corpus.
"""Corpus reader for the XML version of the British National Corpus.
For access to the complete XML data structure, use the ``xml()``
method. For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
You can obtain the full version of the BNC corpus at
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
"""
def __init__(self, root, fileids, lazy=True):
XMLCorpusReader.__init__(self, root, fileids)
self._lazy = lazy
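The class docstring above gives the constructor call for a full local copy of the corpus; the short sketch below merely extends it with the access methods the docstring mentions, assuming the BNC XML archive has been extracted to a directory called BNC:

    from nltk.corpus.reader.bnc import BNCCorpusReader

    bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    words = bnc.words()                  # plain word tokens
    tagged = bnc.tagged_words(c5=True)   # (word, C5 tag) tuples
    stems = bnc.sents(stem=True)         # sentences as lists of word stems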
......@@ -36,14 +42,7 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if self._lazy:
return concat([BNCWordView(fileid, False, None,
strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, False, None,
strip_space, stem)
for fileid in self.abspaths(fileids)])
return self._views(fileids, False, None, strip_space, stem)
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
......@@ -58,16 +57,8 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, False, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, False, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
tag = 'c5' if c5 else 'pos'
return self._views(fileids, False, tag, strip_space, stem)
def sents(self, fileids=None, strip_space=True, stem=False):
"""
......@@ -80,15 +71,9 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if self._lazy:
return concat([BNCWordView(fileid, True, None, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, True, None, strip_space, stem)
for fileid in self.abspaths(fileids)])
return self._views(fileids, True, None, strip_space, stem)
def tagged_sents(self, fileids=None, c5=False, strip_space=True,
stem=False):
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
......@@ -100,16 +85,13 @@ class BNCCorpusReader(XMLCorpusReader):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, True, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
else:
return concat([self._words(fileid, True, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
tag = 'c5' if c5 else 'pos'
return self._views(fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem)
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
f = BNCWordView if self._lazy else self._words
return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)])
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
"""
......@@ -174,6 +156,16 @@ class BNCWordView(XMLCorpusView):
"""
A stream-backed corpus view specialized for use with the BNC corpus.
"""
tags_to_ignore = set(
['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
)
"""These tags are ignored. For their description refer to the
technical documentation, for example,
http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
"""
def __init__(self, fileid, sent, tag, strip_space, stem):
"""
:param fileid: The name of the underlying file.
......@@ -191,6 +183,11 @@ class BNCWordView(XMLCorpusView):
self._strip_space = strip_space
self._stem = stem
self.title = None #: Title of the document.
self.author = None #: Author of the document.
self.editor = None #: Editor
self.resps = None #: Statement of responsibility
XMLCorpusView.__init__(self, fileid, tagspec)
# Read in a tasty header.
......@@ -201,11 +198,6 @@ class BNCWordView(XMLCorpusView):
# Reset tag context.
self._tag_context = {0: ()}
title = None #: Title of the document.
author = None #: Author of the document.
editor = None #: Editor
resps = None #: Statement of responsibility
def handle_header(self, elt, context):
# Set up some metadata!
titles = elt.findall('titleStmt/title')
......@@ -251,10 +243,10 @@ class BNCWordView(XMLCorpusView):
def handle_sent(self, elt):
sent = []
for child in elt:
if child.tag == 'mw':
if child.tag in ('mw', 'hi', 'corr', 'trunc'):
sent += [self.handle_word(w) for w in child]
elif child.tag in ('w', 'c'):
sent.append(self.handle_word(child))
else:
elif child.tag not in self.tags_to_ignore:
raise ValueError('Unexpected element %s' % child.tag)
return BNCSentence(elt.attrib['n'], sent)
<bncDoc xml:id="FX8"><teiHeader><fileDesc><titleStmt><title> General practitioner's surgery: medical consultation. Sample containing about 125 words speech recorded in public context </title><respStmt><resp> Data capture and transcription </resp><name> Longman ELT </name> </respStmt></titleStmt><editionStmt><edition>BNC XML Edition, December 2006</edition></editionStmt><extent> 125 tokens; 130 w-units; 15 s-units </extent><publicationStmt><distributor>Distributed under licence by Oxford University Computing Services on behalf of the BNC Consortium.</distributor><availability> This material is protected by international copyright laws and may not be copied or redistributed in any way. Consult the BNC Web Site at http://www.natcorp.ox.ac.uk for full licencing and distribution conditions.</availability><idno type="bnc">FX8</idno><idno type="old"> 093802 </idno></publicationStmt><sourceDesc><recordingStmt><recording n="093802" type="DAT"/></recordingStmt></sourceDesc></fileDesc><encodingDesc><tagsDecl><namespace name=""><tagUsage gi="align" occurs="4"/><tagUsage gi="c" occurs="21"/><tagUsage gi="event" occurs="1"/><tagUsage gi="gap" occurs="1"/><tagUsage gi="mw" occurs="2"/><tagUsage gi="pause" occurs="2"/><tagUsage gi="s" occurs="15"/><tagUsage gi="u" occurs="9"/><tagUsage gi="unclear" occurs="16"/><tagUsage gi="w" occurs="130"/></namespace></tagsDecl></encodingDesc><profileDesc><creation date="0000">0000-00-00 Origination/creation date not known </creation><particDesc n="C126"><person ageGroup="X" xml:id="PS22T" role="unspecified" sex="m" soc="AB" dialect="NONE" firstLang="EN-GBR" educ="Ed0"><persName>Doctor</persName> <occupation>doctor</occupation> <persNote>other participants are doctors patients</persNote></person><person ageGroup="X" xml:id="FX8PS000" role="unspecified" sex="u" soc="UU" dialect="NONE"/><person ageGroup="X" dialect="NONE" n="W0000" role="other" sex="u" soc="UU" xml:id="FX8PSUNK"><persName>Unknown speaker</persName></person><person ageGroup="X" dialect="NONE" n="W000M" role="other" sex="u" soc="UU" xml:id="FX8PSUGP"><persName>Group of unknown speakers</persName></person></particDesc><settingDesc><setting n="093802" who="PS22T FX8PS000"><placeName>Strathclyde: Lanarkshire </placeName> <locale> G.P.'s surgery </locale><activity> Medical consultation </activity></setting></settingDesc><textClass><catRef targets="SPO ALLTIM3 ALLAVA0 ALLTYP2 SCGDOM3 SPOLOG2 SPOREG3"/><classCode scheme="DLEE">S consult</classCode><keywords><term> medicine </term><term> medical consultation </term></keywords></textClass></profileDesc><revisionDesc><change date="2006-10-21" who="#OUCS">Tag usage updated for BNC-XML</change><change date="2000-12-13" who="#OUCS">Last check for BNC World first release</change><change date="2000-09-01" who="#OUCS">Check all tagcounts</change><change date="2000-06-23" who="#OUCS">Resequenced s-units and added headers</change><change date="2000-01-29" who="#OUCS">Revised participant details</change><change date="2000-01-21" who="#OUCS">Added date info</change><change date="2000-01-09" who="#OUCS">Updated all catrefs</change><change date="2000-01-09" who="#OUCS">Updated REC elements to include tape number</change><change date="2000-01-08" who="#OUCS">Updated titles</change><change date="1999-12-25" who="#OUCS">corrected tagUsage</change><change date="1999-09-20" who="#UCREL">POS codes revised for BNC-2; header updated</change><change date="1994-11-27" who="#dominic">Initial accession to corpus</change></revisionDesc></teiHeader>
<stext type="OTHERSP"><u who="FX8PSUNK">
<s n="1"><w c5="ITJ" hw="ah" pos="INTERJ">Ah </w><w c5="AV0" hw="there" pos="ADV">there </w><w c5="PNP" hw="we" pos="PRON">we </w><w c5="VBB" hw="be" pos="VERB">are</w><c c5="PUN">,</c><unclear/><c c5="PUN">.</c></s>
<s n="2"><w c5="AV0" hw="right" pos="ADV">Right </w><unclear/><w c5="AJ0" hw="abdominal" pos="ADJ">abdominal </w><w c5="NN1" hw="wound" pos="SUBST">wound</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she</w><w c5="VBZ" hw="be" pos="VERB">'s </w><w c5="AT0" hw="a" pos="ART">a </w><w c5="AJ0-NN1" hw="wee" pos="ADJ">wee </w><w c5="NN1" hw="bit" pos="SUBST">bit </w><pause/><w c5="VVN-AJ0" hw="confuse" pos="VERB">confused</w><c c5="PUN">.</c></s>
<s n="3"><w c5="PNP" hw="she" pos="PRON">She </w><w c5="VDD" hw="do" pos="VERB">did</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="VVI" hw="bother" pos="VERB">bother </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="tell" pos="VERB">tell </w><w c5="PNP" hw="i" pos="PRON">me </w><w c5="CJT" hw="that" pos="CONJ">that </w><w c5="PNP" hw="she" pos="PRON">she</w><w c5="VHD" hw="have" pos="VERB">'d </w><w c5="AV0" hw="only" pos="ADV">only </w><w c5="VVN" hw="get" pos="VERB">got </w><unclear/><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="call" pos="VERB">call </w><w c5="PNP" hw="you" pos="PRON">you</w><c c5="PUN">, </c><w c5="AV0" hw="right" pos="ADV">right</w><c c5="PUN">?</c></s>
<s n="4"><w c5="UNC" hw="erm" pos="UNC">Erm </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VBD" hw="be" pos="VERB">was</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="PRP" hw="in" pos="PREP">in </w><w c5="DPS" hw="she" pos="PRON">her </w><w c5="NN1" hw="nightdress" pos="SUBST">nightdress </w><w c5="CJC" hw="but" pos="CONJ">but </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="AV0" hw="only" pos="ADV">only </w><w c5="VVD" hw="dress" pos="VERB">dressed </w><w c5="PNX" hw="herself" pos="PRON">herself</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="say" pos="VERB">said </w><align with="FX8LC001"/><unclear/><align with="FX8LC002"/></s></u><u who="PS22T">
<s n="5"><align with="FX8LC001"/><w c5="CJC" hw="and" pos="CONJ">And </w><w c5="PNP" hw="you" pos="PRON">you </w><unclear/><align with="FX8LC002"/></s></u><u who="FX8PS000">
<s n="6"><w c5="PNP" hw="she" pos="PRON">She </w><w c5="VVD" hw="say" pos="VERB">said </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="go" pos="VERB">went </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="buy" pos="VERB">buy </w><w c5="PNI" hw="something" pos="PRON">something </w><unclear/><w c5="PNX" hw="herself" pos="PRON">herself</w><c c5="PUN">, </c><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVD" hw="phone" pos="VERB">phoned </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1" hw="clinic" pos="SUBST">clinic </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1" hw="clinic" pos="SUBST">clinic </w><unclear/><c c5="PUN">.</c></s>
<s n="7"><w c5="PNP" hw="she" pos="PRON">She</w><w c5="VBZ" hw="be" pos="VERB">'s </w><unclear/><w c5="AV0" hw="here" pos="ADV">here </w><w c5="CJC" hw="and" pos="CONJ">and </w><gap desc="name" reason="anonymization"/><w c5="VVZ" hw="say" pos="VERB">says </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VM0" hw="should" pos="VERB">should </w><w c5="VBI" hw="be" pos="VERB">be </w><unclear/><w c5="AV0" hw="fortnightly" pos="ADV">fortnightly </w><unclear/><c c5="PUN">.</c></s>
<s n="8"><pause/><w c5="AV0" hw="so" pos="ADV">So </w><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VDB" hw="do" pos="VERB">do</w><w c5="XX0" hw="not" pos="ADV">n't </w><w c5="VVI" hw="know" pos="VERB">know </w><w c5="CJS" hw="whether" pos="CONJ">whether </w><w c5="PNP" hw="you" pos="PRON">you </w><w c5="VVB" hw="want" pos="VERB">want </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="go" pos="VERB">go </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="VVI" hw="see" pos="VERB">see </w><w c5="PNP" hw="she" pos="PRON">her </w><mw c5="PRP"><w c5="AV0" hw="rather" pos="ADV">rather </w><w c5="CJS" hw="than" pos="CONJ">than</w></mw><c c5="PUN">, </c><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VM0" hw="could" pos="VERB">could </w><w c5="VVI" hw="get" pos="VERB">get </w><w c5="AT0" hw="a" pos="ART">a </w><w c5="NN1" hw="doctor" pos="SUBST">doctor </w><w c5="TO0" hw="to" pos="PREP">to </w><w c5="VVI" hw="go" pos="VERB">go </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="VVI" hw="see" pos="VERB">see </w><w c5="PNP" hw="she" pos="PRON">her </w><w c5="CJC" hw="and" pos="CONJ">and </w><w c5="NN1-VVB" hw="phone" pos="SUBST">phone</w><c c5="PUN">,</c><unclear/><c c5="PUN">.</c></s></u><u who="PS22T">
<s n="9"><unclear/><w c5="PNP" hw="it" pos="PRON">it</w><w c5="VBZ" hw="be" pos="VERB">'s </w><w c5="AV0" hw="just" pos="ADV">just </w><w c5="CJT" hw="that" pos="CONJ">that </w><w c5="PNP" hw="i" pos="PRON">I</w><w c5="VBB" hw="be" pos="VERB">'m </w><w c5="AV0" hw="never" pos="ADV">never </w><w c5="VVG" hw="gon" pos="VERB">gon</w><w c5="TO0" hw="na" pos="PREP">na </w><w c5="VVI" hw="get" pos="VERB">get </w><w c5="PRP" hw="to" pos="PREP">to </w><mw c5="PRP"><w c5="AVP" hw="up" pos="ADV">up </w><w c5="PRP" hw="to" pos="PREP">to</w></mw><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="10"><unclear/><c c5="PUN">?</c></s></u><u who="PS22T">
<s n="11"><w c5="ITJ" hw="yeah" pos="INTERJ">Yeah</w><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="12"><w c5="AV0" hw="okay" pos="ADV">Okay</w><c c5="PUN">.</c></s></u><u who="PS22T">
<s n="13"><w c5="ITJ" hw="yeah" pos="INTERJ">Yeah</w><c c5="PUN">.</c></s></u><u who="FX8PS000">
<s n="14"><w c5="UNC" hw="erm" pos="UNC">erm</w><c c5="PUN">, </c><w c5="ORD" hw="first" pos="ADJ">first </w><unclear/><w c5="CRD" hw="twelve" pos="ADJ">twelve </w><w c5="NN2" hw="week" pos="SUBST">weeks </w><w c5="AJ0" hw="pregnant" pos="ADJ">pregnant </w><w c5="AV0" hw="so" pos="ADV">so </w><w c5="VM0" hw="should" pos="VERB">should </w><w c5="PNP" hw="i" pos="PRON">I </w><w c5="VVI" hw="mark" pos="VERB">mark </w><w c5="PRP" hw="at" pos="PREP">at </w><w c5="AT0" hw="the" pos="ART">the </w><w c5="NN1-AJ0" hw="bottom" pos="SUBST">bottom </w><w c5="CJS" hw="when" pos="CONJ">when </w><w c5="PNP" hw="she" pos="PRON">she </w><w c5="VVZ" hw="type" pos="VERB">types </w><unclear/><c c5="PUN">.</c></s>
<s n="15"><w c5="UNC" hw="erm" pos="UNC">Erm </w><unclear/><w c5="DT0" hw="this" pos="ADJ">this </w><w c5="PNI" hw="one" pos="PRON">one</w><c c5="PUN">.</c><event desc="recording ends"/></s></u></stext></bncDoc>
.. Copyright (C) 2001-2014 NLTK Project
.. For license information, see LICENSE.TXT
>>> from nltk.corpus.reader.bnc import BNCCorpusReader
>>> bnc = BNCCorpusReader(root='.', fileids=r'FX8.xml')
Checking the word access.
-------------------------
>>> len(bnc.words())
151
>>> bnc.words()[:6]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.words(stem=True)[:6]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_words()[:6]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_words(c5=True)[:6]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
Testing access to the sentences.
--------------------------------
>>> len(bnc.sents())
15
>>> bnc.sents()[0]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.sents(stem=True)[0]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_sents()[0]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_sents(c5=True)[0]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
A non-lazy loader.
------------------
>>> eager = BNCCorpusReader(root='.', fileids=r'FX8.xml', lazy=False)
>>> len(eager.words())
151
>>> eager.words(stem=True)[6:17]
['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
>>> eager.tagged_words()[6:11]
[('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
>>> eager.tagged_words(c5=True)[6:17]
[('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
>>> len(eager.sents())
15
......@@ -6,6 +6,7 @@
# Steven Bird <stevenbird1@gmail.com> (additions)
# Edward Loper <edloper@gmail.com> (rewrite)
# Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite)
# Arthur Darcet <arthur@darcet.fr> (fixes)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
......@@ -49,7 +50,7 @@ flag.
>>> text = '''
... (How does it deal with this parenthesis?) "It should be part of the
... previous sentence." "(And the same with this one.)" ('And this one!')
... "('(And (this)) '?)" [(and this.)]
... "('(And (this)) '?)" [(and this. )]
... '''
>>> print('\n-----\n'.join(
... sent_detector.tokenize(text.strip())))
......@@ -64,7 +65,7 @@ flag.
-----
"('(And (this)) '?)"
-----
[(and this.)]
[(and this. )]
>>> print('\n-----\n'.join(
... sent_detector.tokenize(text.strip(), realign_boundaries=False)))
(How does it deal with this parenthesis?
......@@ -1297,12 +1298,15 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
break_decision=tokens[0].sentbreak,
)
def span_tokenize(self, text):
def span_tokenize(self, text, realign_boundaries=True):
"""
Given a text, returns a list of the (start, end) spans of sentences
in the text.
"""
return [(sl.start, sl.stop) for sl in self._slices_from_text(text)]
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
return [(sl.start, sl.stop) for sl in slices]
def sentences_from_text(self, text, realign_boundaries=True):
"""
......@@ -1311,10 +1315,7 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
True, includes in the sentence closing punctuation that
follows the period.
"""
sents = [text[sl] for sl in self._slices_from_text(text)]
if realign_boundaries:
sents = self._realign_boundaries(sents)
return sents
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
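As a quick, hedged illustration of the refactored pair of methods: the sketch below uses an untrained PunktSentenceTokenizer, so the exact splits are illustrative rather than a captured doctest, but it shows how span_tokenize and sentences_from_text now go through the same realignment path:

    from nltk.tokenize.punkt import PunktSentenceTokenizer

    tokenizer = PunktSentenceTokenizer()
    text = '(Is this parenthesis handled?) "It should be part of the previous sentence."'

    for start, end in tokenizer.span_tokenize(text):   # realign_boundaries=True by default
        print((start, end), text[start:end])

With realignment, the trailing ')' and '"' stay inside the span of the sentence that precedes them, so each text[start:end] matches the corresponding element of sentences_from_text(text); passing realign_boundaries=False to either method breaks immediately after the '?' and the '.'.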
def _slices_from_text(self, text):
last_break = 0
......@@ -1330,7 +1331,7 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
last_break = match.end()
yield slice(last_break, len(text))
def _realign_boundaries(self, sents):
def _realign_boundaries(self, text, slices):
"""
Attempts to realign punctuation that falls after the period but
should otherwise be included in the same sentence.
......@@ -1344,21 +1345,21 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
["(Sent1.)", "Sent2."].
"""
realign = 0
for s1, s2 in _pair_iter(sents):
s1 = s1[realign:]
if not s2:
if s1:
yield s1
for sl1, sl2 in _pair_iter(slices):
sl1 = slice(sl1.start + realign, sl1.stop)
if not sl2:
if text[sl1]:
yield sl1
continue
m = self._lang_vars.re_boundary_realignment.match(s2)
m = self._lang_vars.re_boundary_realignment.match(text[sl2])
if m:
yield s1 + m.group(0).strip()
yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
realign = m.end()
else:
realign = 0
if s1:
yield s1
if text[sl1]:
yield sl1
def text_contains_sentbreak(self, text):
"""
......