Commit 40375f49 by Steven Bird

fixes to get tokenizer doctests to work; cleaned up package level imports to avoid wildcards

parent 00ee649f
...@@ -22,29 +22,23 @@ For more information about tokenization, please see the tokenizer HOWTO, ...@@ -22,29 +22,23 @@ For more information about tokenization, please see the tokenizer HOWTO,
or chapter 3 of the NLTK book. or chapter 3 of the NLTK book.
""" """
from nltk.data import load from ..data import load
from .simple import * from .simple import SpaceTokenizer, TabTokenizer, LineTokenizer,\
from .regexp import * line_tokenize
from .punkt import * from .regexp import RegexpTokenizer, WhitespaceTokenizer, BlanklineTokenizer,\
from .sexpr import * WordPunctTokenizer, wordpunct_tokenize, regexp_tokenize,\
from .treebank import * blankline_tokenize
from .punkt import PunktSentenceTokenizer, PunktWordTokenizer
__all__ = ['WhitespaceTokenizer', 'SpaceTokenizer', 'TabTokenizer', from .sexpr import SExprTokenizer, sexpr_tokenize
'LineTokenizer', 'RegexpTokenizer', 'BlanklineTokenizer', from .treebank import TreebankWordTokenizer
'WordPunctTokenizer', 'blankline_tokenize',
'wordpunct_tokenize', 'regexp_tokenize', 'word_tokenize', try:
'SExprTokenizer', 'sexpr_tokenize', 'line_tokenize', import numpy
'PunktWordTokenizer', 'PunktSentenceTokenizer', except ImportError:
'TreebankWordTokenizer', 'sent_tokenize', 'word_tokenize', pass
]
try: import numpy
except ImportError: pass
else: else:
from .texttiling import * from .texttiling import TextTilingTokenizer
__all__ += ['TextTilingTokenizer']
# Standard sentence tokenizer. # Standard sentence tokenizer.
def sent_tokenize(text): def sent_tokenize(text):
......
...@@ -79,7 +79,7 @@ appropriate orthographic context flag.""" ...@@ -79,7 +79,7 @@ appropriate orthographic context flag."""
#{ Language-dependent variables #{ Language-dependent variables
###################################################################### ######################################################################
class _PunktLanguageVars(object): class PunktLanguageVars(object):
""" """
Stores variables, mostly regular expressions, which may be Stores variables, mostly regular expressions, which may be
language-dependent for correct application of the algorithm. language-dependent for correct application of the algorithm.
...@@ -203,7 +203,7 @@ numeric tokens are changed to ##number## and hence contain alpha.)""" ...@@ -203,7 +203,7 @@ numeric tokens are changed to ##number## and hence contain alpha.)"""
class PunktWordTokenizer(TokenizerI): class PunktWordTokenizer(TokenizerI):
# Retained for backward compatibility # Retained for backward compatibility
def __init__(self, lang_vars=_PunktLanguageVars()): def __init__(self, lang_vars=PunktLanguageVars()):
self._lang_vars = lang_vars self._lang_vars = lang_vars
def tokenize(self, text): def tokenize(self, text):
...@@ -273,10 +273,10 @@ class PunktParameters(object): ...@@ -273,10 +273,10 @@ class PunktParameters(object):
self.ortho_context[typ] |= flag self.ortho_context[typ] |= flag
###################################################################### ######################################################################
#{ _PunktToken #{ PunktToken
###################################################################### ######################################################################
class _PunktToken(object): class PunktToken(object):
"""Stores a token of text with annotations produced during """Stores a token of text with annotations produced during
sentence boundary detection.""" sentence boundary detection."""
...@@ -416,12 +416,12 @@ class _PunktToken(object): ...@@ -416,12 +416,12 @@ class _PunktToken(object):
#{ Punkt base class #{ Punkt base class
###################################################################### ######################################################################
class _PunktBaseClass(object): class PunktBaseClass(object):
""" """
Includes common components of PunktTrainer and PunktSentenceTokenizer. Includes common components of PunktTrainer and PunktSentenceTokenizer.
""" """
def __init__(self, lang_vars=_PunktLanguageVars(), token_cls=_PunktToken, def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
params=PunktParameters()): params=PunktParameters()):
self._params = params self._params = params
self._lang_vars = lang_vars self._lang_vars = lang_vars
...@@ -507,13 +507,13 @@ class _PunktBaseClass(object): ...@@ -507,13 +507,13 @@ class _PunktBaseClass(object):
###################################################################### ######################################################################
class PunktTrainer(_PunktBaseClass): class PunktTrainer(PunktBaseClass):
"""Learns parameters used in Punkt sentence boundary detection.""" """Learns parameters used in Punkt sentence boundary detection."""
def __init__(self, train_text=None, verbose=False, def __init__(self, train_text=None, verbose=False,
lang_vars=_PunktLanguageVars(), token_cls=_PunktToken): lang_vars=PunktLanguageVars(), token_cls=PunktToken):
_PunktBaseClass.__init__(self, lang_vars=lang_vars, PunktBaseClass.__init__(self, lang_vars=lang_vars,
token_cls=token_cls) token_cls=token_cls)
self._type_fdist = FreqDist() self._type_fdist = FreqDist()
...@@ -1084,7 +1084,7 @@ class PunktTrainer(_PunktBaseClass): ...@@ -1084,7 +1084,7 @@ class PunktTrainer(_PunktBaseClass):
###################################################################### ######################################################################
class PunktSentenceTokenizer(_PunktBaseClass,TokenizerI): class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
""" """
A sentence tokenizer which uses an unsupervised algorithm to build A sentence tokenizer which uses an unsupervised algorithm to build
a model for abbreviation words, collocations, and words that start a model for abbreviation words, collocations, and words that start
...@@ -1093,12 +1093,12 @@ class PunktSentenceTokenizer(_PunktBaseClass,TokenizerI): ...@@ -1093,12 +1093,12 @@ class PunktSentenceTokenizer(_PunktBaseClass,TokenizerI):
languages. languages.
""" """
def __init__(self, train_text=None, verbose=False, def __init__(self, train_text=None, verbose=False,
lang_vars=_PunktLanguageVars(), token_cls=_PunktToken): lang_vars=PunktLanguageVars(), token_cls=PunktToken):
""" """
train_text can either be the sole training text for this sentence train_text can either be the sole training text for this sentence
boundary detector, or can be a PunktParameters object. boundary detector, or can be a PunktParameters object.
""" """
_PunktBaseClass.__init__(self, lang_vars=lang_vars, PunktBaseClass.__init__(self, lang_vars=lang_vars,
token_cls=token_cls) token_cls=token_cls)
if train_text: if train_text:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment