Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nltk
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
nltk
Commits
a39be2c0
Commit
a39be2c0
authored
Nov 14, 2011
by
Steven Bird
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixing up wildcard imports
parent
26082247
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
47 additions
and
54 deletions
+47
-54
nltk/chunk/__init__.py
+5
-13
nltk/chunk/named_entity.py
+36
-29
nltk/chunk/regexp.py
+1
-2
nltk/chunk/util.py
+2
-4
nltk/downloader.py
+2
-2
nltk/parse/earleychart.py
+0
-3
nltk/tag/hmm.py
+1
-1
No files found.
nltk/chunk/__init__.py
View file @
a39be2c0
...
...
@@ -152,23 +152,15 @@ zero-length assertions).
pattern is valid.
"""
from
api
import
*
from
util
import
*
from
regexp
import
*
__all__
=
[
# ChunkParser interface
'ChunkParserI'
,
# Parsers
'RegexpChunkParser'
,
'RegexpParser'
,
'ne_chunk'
,
'batch_ne_chunk'
,
]
from
api
import
ChunkParserI
from
util
import
(
ChunkScore
,
accuracy
,
tagstr2tree
,
conllstr2tree
,
tree2conlltags
,
tree2conllstr
,
tree2conlltags
)
from
regexp
import
RegexpChunkParser
,
RegexpParser
# Standard named entity chunkers (binary and multiclass models)
_BINARY_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
_MULTICLASS_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def
ne_chunk
(
tagged_tokens
,
binary
=
False
):
"""
Use NLTK's currently recommended named entity chunker to
...
...
nltk/chunk/named_entity.py
View file @
a39be2c0
...
...
@@ -11,28 +11,35 @@ Named entity chunker
import
os
,
re
,
pickle
from
xml.etree
import
ElementTree
as
ET
from
nltk.chunk.api
import
*
from
nltk.chunk.util
import
*
import
nltk
# This really shouldn't be loaded at import time. But it's used by a
# static method. Do a lazy loading?
_short_en_wordlist
=
set
(
nltk
.
corpus
.
words
.
words
(
'en-basic'
))
from
nltk.tag
import
ClassifierBasedTagger
,
pos_tag
from
nltk.classify
import
MaxentClassifier
from
nltk.tree
import
Tree
from
nltk.tokenize
import
word_tokenize
from
nltk.data
import
find
from
nltk.chunk.api
import
ChunkParserI
from
nltk.chunk.util
import
ChunkScore
class
NEChunkParserTagger
(
nltk
.
tag
.
ClassifierBasedTagger
):
class
NEChunkParserTagger
(
ClassifierBasedTagger
):
"""
The IOB tagger used by the chunk parser.
"""
def
__init__
(
self
,
train
):
nltk
.
tag
.
ClassifierBasedTagger
.
__init__
(
ClassifierBasedTagger
.
__init__
(
self
,
train
=
train
,
classifier_builder
=
self
.
_classifier_builder
)
def
_classifier_builder
(
self
,
train
):
return
nltk
.
MaxentClassifier
.
train
(
train
,
algorithm
=
'megam'
,
return
MaxentClassifier
.
train
(
train
,
algorithm
=
'megam'
,
gaussian_prior_sigma
=
1
,
trace
=
2
)
def _english_wordlist(self):
    """
    Return the set of basic English words, loading it on first use.

    The word list is read from the ``'en-basic'`` file of the NLTK
    ``words`` corpus and cached on the instance as ``_en_wordlist``,
    so the corpus is only accessed the first time this is called.

    @return: the cached set of words.
    @rtype: C{set}
    """
    # Nothing visible assigns _en_wordlist before the first call, so a
    # plain attribute read would raise AttributeError instead of
    # triggering the lazy load; fall back to None when it is missing.
    wordlist = getattr(self, '_en_wordlist', None)
    if not wordlist:
        # Imported here rather than at module level so that importing
        # this module does not require the corpus to be available.
        from nltk.corpus import words
        wordlist = set(words.words('en-basic'))
        self._en_wordlist = wordlist
    return wordlist
def
_feature_detector
(
self
,
tokens
,
index
,
history
):
word
=
tokens
[
index
][
0
]
...
...
@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
'suffix3'
:
word
[
-
3
:]
.
lower
(),
'pos'
:
pos
,
'word'
:
word
,
'en-wordlist'
:
(
word
in
_short_en_wordlist
),
# xx!
'en-wordlist'
:
(
word
in
self
.
_english_wordlist
()),
'prevtag'
:
prevtag
,
'prevpos'
:
prevpos
,
'nextpos'
:
nextpos
,
...
...
@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI):
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
sent
=
nltk
.
Tree
(
'S'
,
[])
sent
=
Tree
(
'S'
,
[])
for
(
tok
,
tag
)
in
tagged_tokens
:
if
tag
==
'O'
:
sent
.
append
(
tok
)
elif
tag
.
startswith
(
'B-'
):
sent
.
append
(
nltk
.
Tree
(
tag
[
2
:],
[
tok
]))
sent
.
append
(
Tree
(
tag
[
2
:],
[
tok
]))
elif
tag
.
startswith
(
'I-'
):
if
(
sent
and
isinstance
(
sent
[
-
1
],
Tree
)
and
sent
[
-
1
]
.
node
==
tag
[
2
:]):
sent
[
-
1
]
.
append
(
tok
)
else
:
sent
.
append
(
nltk
.
Tree
(
tag
[
2
:],
[
tok
]))
sent
.
append
(
Tree
(
tag
[
2
:],
[
tok
]))
return
sent
@staticmethod
...
...
@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI):
"""
toks
=
[]
for
child
in
sent
:
if
isinstance
(
child
,
nltk
.
Tree
):
if
isinstance
(
child
,
Tree
):
if
len
(
child
)
==
0
:
print
"Warning -- empty chunk in sentence"
continue
...
...
@@ -171,10 +178,10 @@ def simplify_pos(s):
def
postag_tree
(
tree
):
# Part-of-speech tagging.
words
=
tree
.
leaves
()
tag_iter
=
(
pos
for
(
word
,
pos
)
in
nltk
.
pos_tag
(
words
))
tag_iter
=
(
pos
for
(
word
,
pos
)
in
pos_tag
(
words
))
newtree
=
Tree
(
'S'
,
[])
for
child
in
tree
:
if
isinstance
(
child
,
nltk
.
Tree
):
if
isinstance
(
child
,
Tree
):
newtree
.
append
(
Tree
(
child
.
node
,
[]))
for
subchild
in
child
:
newtree
[
-
1
]
.
append
(
(
subchild
,
tag_iter
.
next
())
)
...
...
@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt):
# Binary distinction (NE or not NE)
if
fmt
==
'binary'
:
i
=
0
toks
=
nltk
.
Tree
(
'S'
,
[])
toks
=
Tree
(
'S'
,
[])
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
e
<=
s
:
continue
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
nltk
.
Tree
(
'NE'
,
text
[
s
:
e
]
.
split
()))
toks
.
extend
(
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
Tree
(
'NE'
,
text
[
s
:
e
]
.
split
()))
i
=
e
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:]))
toks
.
extend
(
word_tokenize
(
text
[
i
:]))
yield
toks
# Multiclass distinction (NE type)
elif
fmt
==
'multiclass'
:
i
=
0
toks
=
nltk
.
Tree
(
'S'
,
[])
toks
=
Tree
(
'S'
,
[])
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
e
<=
s
:
continue
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
nltk
.
Tree
(
typ
,
text
[
s
:
e
]
.
split
()))
toks
.
extend
(
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
Tree
(
typ
,
text
[
s
:
e
]
.
split
()))
i
=
e
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:]))
toks
.
extend
(
word_tokenize
(
text
[
i
:]))
yield
toks
else
:
...
...
@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed):
def
build_model
(
fmt
=
'binary'
):
print
'Loading training data...'
train_paths
=
[
nltk
.
data
.
find
(
'corpora/ace_data/ace.dev'
),
nltk
.
data
.
find
(
'corpora/ace_data/ace.heldout'
),
nltk
.
data
.
find
(
'corpora/ace_data/bbn.dev'
),
nltk
.
data
.
find
(
'corpora/ace_data/muc.dev'
)]
train_paths
=
[
find
(
'corpora/ace_data/ace.dev'
),
find
(
'corpora/ace_data/ace.heldout'
),
find
(
'corpora/ace_data/bbn.dev'
),
find
(
'corpora/ace_data/muc.dev'
)]
train_trees
=
load_ace_data
(
train_paths
,
fmt
)
train_data
=
[
postag_tree
(
t
)
for
t
in
train_trees
]
print
'Training...'
...
...
@@ -282,7 +289,7 @@ def build_model(fmt='binary'):
del
train_data
print
'Loading eval data...'
eval_paths
=
[
nltk
.
data
.
find
(
'corpora/ace_data/ace.eval'
)]
eval_paths
=
[
find
(
'corpora/ace_data/ace.eval'
)]
eval_trees
=
load_ace_data
(
eval_paths
,
fmt
)
eval_data
=
[
postag_tree
(
t
)
for
t
in
eval_trees
]
...
...
nltk/chunk/regexp.py
View file @
a39be2c0
...
...
@@ -11,8 +11,7 @@ import types
from
nltk.tree
import
Tree
from
nltk.chunk.api
import
*
from
nltk.chunk.util
import
*
from
nltk.chunk.api
import
ChunkParserI
##//////////////////////////////////////////////////////
## ChunkString
...
...
nltk/chunk/util.py
View file @
a39be2c0
...
...
@@ -10,9 +10,7 @@ import re
import
string
from
nltk.tree
import
Tree
import
nltk.tag.util
from
api
import
*
from
nltk.tag.util
import
str2tuple
##//////////////////////////////////////////////////////
## EVALUATION
...
...
@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
if
sep
is
None
:
stack
[
-
1
]
.
append
(
text
)
else
:
stack
[
-
1
]
.
append
(
nltk
.
tag
.
util
.
str2tuple
(
text
,
sep
))
stack
[
-
1
]
.
append
(
str2tuple
(
text
,
sep
))
if
len
(
stack
)
!=
1
:
raise
ValueError
(
'Expected ] at char
%
d'
%
len
(
s
))
...
...
nltk/downloader.py
View file @
a39be2c0
...
...
@@ -168,8 +168,8 @@ except:
try
:
TKINTER
=
True
from
Tkinter
import
*
from
tkMessageBox
import
*
from
Tkinter
import
Tk
,
Frame
,
Label
,
Entry
,
Button
,
Canvas
,
Menu
,
IntVar
from
tkMessageBox
import
showerror
from
nltk.draw.table
import
Table
from
nltk.draw
import
ShowText
except
:
...
...
nltk/parse/earleychart.py
View file @
a39be2c0
...
...
@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
#from nltk.grammar import *
#from nltk.parse.api import ParserI
from
nltk.parse.chart
import
(
Chart
,
ChartParser
,
EdgeI
,
LeafEdge
,
LeafInitRule
,
BottomUpPredictRule
,
BottomUpPredictCombineRule
,
TopDownInitRule
,
SingleEdgeFundamentalRule
,
...
...
nltk/tag/hmm.py
View file @
a39be2c0
...
...
@@ -72,7 +72,7 @@ which includes extensive demonstration code.
import
re
import
types
from
numpy
import
*
from
numpy
import
zeros
,
ones
,
float32
,
float64
,
log2
,
hstack
,
array
,
argmax
from
nltk.probability
import
(
FreqDist
,
ConditionalFreqDist
,
ConditionalProbDist
,
DictionaryProbDist
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment