Commit 40375f49 authored Nov 05, 2011 by Steven Bird
fixes to get tokenizer doctests to work; cleaned up package level imports to avoid wildcards
parent 00ee649f
Showing 2 changed files with 28 additions and 34 deletions:

    nltk/tokenize/__init__.py   +16 -22
    nltk/tokenize/punkt.py      +12 -12
nltk/tokenize/__init__.py

@@ -22,29 +22,23 @@ For more information about tokenization, please see the tokenizer HOWTO,
 or chapter 3 of the NLTK book.
 """
-from nltk.data import load
-from .simple import *
-from .regexp import *
-from .punkt import *
-from .sexpr import *
-from .treebank import *
-
-__all__ = ['WhitespaceTokenizer', 'SpaceTokenizer', 'TabTokenizer',
-           'LineTokenizer', 'RegexpTokenizer', 'BlanklineTokenizer',
-           'WordPunctTokenizer', 'blankline_tokenize',
-           'wordpunct_tokenize', 'regexp_tokenize', 'word_tokenize',
-           'SExprTokenizer', 'sexpr_tokenize', 'line_tokenize',
-           'PunktWordTokenizer', 'PunktSentenceTokenizer',
-           'TreebankWordTokenizer', 'sent_tokenize', 'word_tokenize',
-           ]
+from ..data import load
+from .simple import SpaceTokenizer, TabTokenizer, LineTokenizer, \
+     line_tokenize
+from .regexp import RegexpTokenizer, WhitespaceTokenizer, BlanklineTokenizer, \
+     WordPunctTokenizer, wordpunct_tokenize, regexp_tokenize, \
+     blankline_tokenize
+from .punkt import PunktSentenceTokenizer, PunktWordTokenizer
+from .sexpr import SExprTokenizer, sexpr_tokenize
+from .treebank import TreebankWordTokenizer
 
 try:
     import numpy
 except ImportError:
     pass
 else:
-    from .texttiling import *
-    __all__ += ['TextTilingTokenizer']
+    from .texttiling import TextTilingTokenizer
 
 # Standard sentence tokenizer.
 def sent_tokenize(text):
...
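With the wildcards gone, nltk.tokenize exports exactly the names its import block lists, so there is no hand-maintained __all__ inventory to keep in sync and no stray wildcard-imported names to trip up the doctests. A minimal sketch of the resulting public surface, using only names visible in this diff (the sample text is illustrative, and assumes a checkout of this revision is importable):

    # Sketch: every tokenizer below is re-exported explicitly by
    # nltk/tokenize/__init__.py after this commit.
    from nltk.tokenize import (
        PunktSentenceTokenizer,
        TreebankWordTokenizer,
        WhitespaceTokenizer,
        wordpunct_tokenize,
    )

    text = "Good muffins cost $3.88 in New York. Please buy me two of them."

    # Each tokenizer implements TokenizerI.tokenize(text) -> list of strings.
    print(WhitespaceTokenizer().tokenize(text))    # split on whitespace runs
    print(wordpunct_tokenize(text))                # split word/punct characters
    print(TreebankWordTokenizer().tokenize(text))  # Penn Treebank conventions
    print(PunktSentenceTokenizer().tokenize(text)) # sentence segmentation

TextTilingTokenizer stays out of the unconditional import block because it depends on numpy; the try/except guard now imports it explicitly as well, but only when numpy is available.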
nltk/tokenize/punkt.py

@@ -79,7 +79,7 @@ appropriate orthographic context flag."""
 #{ Language-dependent variables
 ######################################################################
-class _PunktLanguageVars(object):
+class PunktLanguageVars(object):
     """
     Stores variables, mostly regular expressions, which may be
     language-dependent for correct application of the algorithm.
@@ -203,7 +203,7 @@ numeric tokens are changed to ##number## and hence contain alpha.)"""
 class PunktWordTokenizer(TokenizerI):
     # Retained for backward compatibility
-    def __init__(self, lang_vars=_PunktLanguageVars()):
+    def __init__(self, lang_vars=PunktLanguageVars()):
         self._lang_vars = lang_vars
     def tokenize(self, text):
@@ -273,10 +273,10 @@ class PunktParameters(object):
         self.ortho_context[typ] |= flag
 ######################################################################
-#{ _PunktToken
+#{ PunktToken
 ######################################################################
-class _PunktToken(object):
+class PunktToken(object):
     """Stores a token of text with annotations produced during
     sentence boundary detection."""
@@ -416,12 +416,12 @@ class _PunktToken(object):
 #{ Punkt base class
 ######################################################################
-class _PunktBaseClass(object):
+class PunktBaseClass(object):
     """
     Includes common components of PunktTrainer and PunktSentenceTokenizer.
     """
-    def __init__(self, lang_vars=_PunktLanguageVars(), token_cls=_PunktToken,
+    def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
                  params=PunktParameters()):
         self._params = params
         self._lang_vars = lang_vars
@@ -507,13 +507,13 @@ class _PunktBaseClass(object):
 ######################################################################
-class PunktTrainer(_PunktBaseClass):
+class PunktTrainer(PunktBaseClass):
     """Learns parameters used in Punkt sentence boundary detection."""
     def __init__(self, train_text=None, verbose=False,
-                 lang_vars=_PunktLanguageVars(), token_cls=_PunktToken):
-        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
+                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
+        PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                  token_cls=token_cls)
         self._type_fdist = FreqDist()
@@ -1084,7 +1084,7 @@ class PunktTrainer(_PunktBaseClass):
 ######################################################################
-class PunktSentenceTokenizer(_PunktBaseClass, TokenizerI):
+class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
     """
     A sentence tokenizer which uses an unsupervised algorithm to build
     a model for abbreviation words, collocations, and words that start
@@ -1093,12 +1093,12 @@ class PunktSentenceTokenizer(_PunktBaseClass,TokenizerI):
     languages.
     """
     def __init__(self, train_text=None, verbose=False,
-                 lang_vars=_PunktLanguageVars(), token_cls=_PunktToken):
+                 lang_vars=PunktLanguageVars(), token_cls=PunktToken):
         """
         train_text can either be the sole training text for this sentence
         boundary detector, or can be a PunktParameters object.
        """
-        _PunktBaseClass.__init__(self, lang_vars=lang_vars,
+        PunktBaseClass.__init__(self, lang_vars=lang_vars,
                                  token_cls=token_cls)
         if train_text:
...
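On the punkt.py side, the commit drops the leading underscore from _PunktLanguageVars, _PunktToken, and _PunktBaseClass, making them public so that the explicit imports above (and downstream code) can name them directly. A minimal sketch of what that enables: the SemicolonLanguageVars subclass, its sent_end_chars override (that attribute lives in punkt.py but is not shown in this diff), and the training text are illustrative assumptions, not part of the commit; only the __init__ signatures and the train_text docstring are taken from the diff itself.

    from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer

    # PunktLanguageVars is now public, so its language-dependent settings can
    # be customised by subclassing; here we illustratively add ';' to the
    # default sentence-ending characters.
    class SemicolonLanguageVars(PunktLanguageVars):
        sent_end_chars = ('.', '?', '!', ';')

    # Per the __init__ docstring shown above, train_text may be raw training
    # text (or a PunktParameters object); lang_vars replaces the default
    # PunktLanguageVars() instance from the signature.
    training_text = "Dr. Smith arrived; the meeting began. It ran long."
    tokenizer = PunktSentenceTokenizer(training_text,
                                       lang_vars=SemicolonLanguageVars())

    # tokenize() returns a list of sentence strings; with the custom
    # lang_vars, ';' is treated as a candidate sentence boundary.
    print(tokenizer.tokenize("Mr. Brown spoke first; questions followed."))

Since PunktWordTokenizer is retained only for backward compatibility, new code is expected to construct these classes directly, which is exactly what the rename allows without reaching into private names.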