Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nltk
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
nltk
Commits
a39be2c0
Commit
a39be2c0
authored
Nov 14, 2011
by
Steven Bird
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixing up wildcard imports
parent
26082247
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
47 additions
and
54 deletions
+47
-54
nltk/chunk/__init__.py
+5
-13
nltk/chunk/named_entity.py
+36
-29
nltk/chunk/regexp.py
+1
-2
nltk/chunk/util.py
+2
-4
nltk/downloader.py
+2
-2
nltk/parse/earleychart.py
+0
-3
nltk/tag/hmm.py
+1
-1
No files found.
nltk/chunk/__init__.py
View file @
a39be2c0
...
@@ -152,23 +152,15 @@ zero-length assertions).
...
@@ -152,23 +152,15 @@ zero-length assertions).
pattern is valid.
pattern is valid.
"""
"""
from
api
import
*
from
api
import
ChunkParserI
from
util
import
*
from
util
import
(
ChunkScore
,
accuracy
,
tagstr2tree
,
conllstr2tree
,
from
regexp
import
*
tree2conlltags
,
tree2conllstr
,
tree2conlltags
)
from
regexp
import
RegexpChunkParser
,
RegexpParser
__all__
=
[
# ChunkParser interface
'ChunkParserI'
,
# Parsers
'RegexpChunkParser'
,
'RegexpParser'
,
'ne_chunk'
,
'batch_ne_chunk'
,
]
# Standard treebank POS tagger
# Standard treebank POS tagger
_BINARY_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
_BINARY_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
_MULTICLASS_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
_MULTICLASS_NE_CHUNKER
=
'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def
ne_chunk
(
tagged_tokens
,
binary
=
False
):
def
ne_chunk
(
tagged_tokens
,
binary
=
False
):
"""
"""
Use NLTK's currently recommended named entity chunker to
Use NLTK's currently recommended named entity chunker to
...
...
nltk/chunk/named_entity.py
View file @
a39be2c0
...
@@ -11,28 +11,35 @@ Named entity chunker
...
@@ -11,28 +11,35 @@ Named entity chunker
import
os
,
re
,
pickle
import
os
,
re
,
pickle
from
xml.etree
import
ElementTree
as
ET
from
xml.etree
import
ElementTree
as
ET
from
nltk.chunk.api
import
*
from
nltk.chunk.util
import
*
import
nltk
# This really shouldn't be loaded at import time. But it's used by a
from
nltk.tag
import
ClassifierBasedTagger
,
pos_tag
# static method. Do a lazy loading?
from
nltk.classify
import
MaxentClassifier
_short_en_wordlist
=
set
(
nltk
.
corpus
.
words
.
words
(
'en-basic'
))
from
nltk.tree
import
Tree
from
nltk.tokenize
import
word_tokenize
from
nltk.data
import
find
from
nltk.chunk.api
import
ChunkParserI
from
nltk.chunk.util
import
ChunkScore
class
NEChunkParserTagger
(
nltk
.
tag
.
ClassifierBasedTagger
):
class
NEChunkParserTagger
(
ClassifierBasedTagger
):
"""
"""
The IOB tagger used by the chunk parser.
The IOB tagger used by the chunk parser.
"""
"""
def
__init__
(
self
,
train
):
def
__init__
(
self
,
train
):
nltk
.
tag
.
ClassifierBasedTagger
.
__init__
(
ClassifierBasedTagger
.
__init__
(
self
,
train
=
train
,
self
,
train
=
train
,
classifier_builder
=
self
.
_classifier_builder
)
classifier_builder
=
self
.
_classifier_builder
)
def
_classifier_builder
(
self
,
train
):
def
_classifier_builder
(
self
,
train
):
return
nltk
.
MaxentClassifier
.
train
(
train
,
algorithm
=
'megam'
,
return
MaxentClassifier
.
train
(
train
,
algorithm
=
'megam'
,
gaussian_prior_sigma
=
1
,
gaussian_prior_sigma
=
1
,
trace
=
2
)
trace
=
2
)
def
_english_wordlist
(
self
):
if
not
self
.
_en_wordlist
:
from
nltk.corpus
import
words
self
.
_en_wordlist
=
set
(
words
.
words
(
'en-basic'
))
return
self
.
_en_wordlist
def
_feature_detector
(
self
,
tokens
,
index
,
history
):
def
_feature_detector
(
self
,
tokens
,
index
,
history
):
word
=
tokens
[
index
][
0
]
word
=
tokens
[
index
][
0
]
...
@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
...
@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
'suffix3'
:
word
[
-
3
:]
.
lower
(),
'suffix3'
:
word
[
-
3
:]
.
lower
(),
'pos'
:
pos
,
'pos'
:
pos
,
'word'
:
word
,
'word'
:
word
,
'en-wordlist'
:
(
word
in
_short_en_wordlist
),
# xx!
'en-wordlist'
:
(
word
in
self
.
_english_wordlist
()),
'prevtag'
:
prevtag
,
'prevtag'
:
prevtag
,
'prevpos'
:
prevpos
,
'prevpos'
:
prevpos
,
'nextpos'
:
nextpos
,
'nextpos'
:
nextpos
,
...
@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI):
...
@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI):
"""
"""
Convert a list of tagged tokens to a chunk-parse tree.
Convert a list of tagged tokens to a chunk-parse tree.
"""
"""
sent
=
nltk
.
Tree
(
'S'
,
[])
sent
=
Tree
(
'S'
,
[])
for
(
tok
,
tag
)
in
tagged_tokens
:
for
(
tok
,
tag
)
in
tagged_tokens
:
if
tag
==
'O'
:
if
tag
==
'O'
:
sent
.
append
(
tok
)
sent
.
append
(
tok
)
elif
tag
.
startswith
(
'B-'
):
elif
tag
.
startswith
(
'B-'
):
sent
.
append
(
nltk
.
Tree
(
tag
[
2
:],
[
tok
]))
sent
.
append
(
Tree
(
tag
[
2
:],
[
tok
]))
elif
tag
.
startswith
(
'I-'
):
elif
tag
.
startswith
(
'I-'
):
if
(
sent
and
isinstance
(
sent
[
-
1
],
Tree
)
and
if
(
sent
and
isinstance
(
sent
[
-
1
],
Tree
)
and
sent
[
-
1
]
.
node
==
tag
[
2
:]):
sent
[
-
1
]
.
node
==
tag
[
2
:]):
sent
[
-
1
]
.
append
(
tok
)
sent
[
-
1
]
.
append
(
tok
)
else
:
else
:
sent
.
append
(
nltk
.
Tree
(
tag
[
2
:],
[
tok
]))
sent
.
append
(
Tree
(
tag
[
2
:],
[
tok
]))
return
sent
return
sent
@staticmethod
@staticmethod
...
@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI):
...
@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI):
"""
"""
toks
=
[]
toks
=
[]
for
child
in
sent
:
for
child
in
sent
:
if
isinstance
(
child
,
nltk
.
Tree
):
if
isinstance
(
child
,
Tree
):
if
len
(
child
)
==
0
:
if
len
(
child
)
==
0
:
print
"Warning -- empty chunk in sentence"
print
"Warning -- empty chunk in sentence"
continue
continue
...
@@ -171,10 +178,10 @@ def simplify_pos(s):
...
@@ -171,10 +178,10 @@ def simplify_pos(s):
def
postag_tree
(
tree
):
def
postag_tree
(
tree
):
# Part-of-speech tagging.
# Part-of-speech tagging.
words
=
tree
.
leaves
()
words
=
tree
.
leaves
()
tag_iter
=
(
pos
for
(
word
,
pos
)
in
nltk
.
pos_tag
(
words
))
tag_iter
=
(
pos
for
(
word
,
pos
)
in
pos_tag
(
words
))
newtree
=
Tree
(
'S'
,
[])
newtree
=
Tree
(
'S'
,
[])
for
child
in
tree
:
for
child
in
tree
:
if
isinstance
(
child
,
nltk
.
Tree
):
if
isinstance
(
child
,
Tree
):
newtree
.
append
(
Tree
(
child
.
node
,
[]))
newtree
.
append
(
Tree
(
child
.
node
,
[]))
for
subchild
in
child
:
for
subchild
in
child
:
newtree
[
-
1
]
.
append
(
(
subchild
,
tag_iter
.
next
())
)
newtree
[
-
1
]
.
append
(
(
subchild
,
tag_iter
.
next
())
)
...
@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt):
...
@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt):
# Binary distinction (NE or not NE)
# Binary distinction (NE or not NE)
if
fmt
==
'binary'
:
if
fmt
==
'binary'
:
i
=
0
i
=
0
toks
=
nltk
.
Tree
(
'S'
,
[])
toks
=
Tree
(
'S'
,
[])
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
e
<=
s
:
continue
if
e
<=
s
:
continue
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:
s
]))
toks
.
extend
(
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
nltk
.
Tree
(
'NE'
,
text
[
s
:
e
]
.
split
()))
toks
.
append
(
Tree
(
'NE'
,
text
[
s
:
e
]
.
split
()))
i
=
e
i
=
e
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:]))
toks
.
extend
(
word_tokenize
(
text
[
i
:]))
yield
toks
yield
toks
# Multiclass distinction (NE type)
# Multiclass distinction (NE type)
elif
fmt
==
'multiclass'
:
elif
fmt
==
'multiclass'
:
i
=
0
i
=
0
toks
=
nltk
.
Tree
(
'S'
,
[])
toks
=
Tree
(
'S'
,
[])
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
for
(
s
,
e
,
typ
)
in
sorted
(
entities
):
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
s
<
i
:
s
=
i
# Overlapping! Deal with this better?
if
e
<=
s
:
continue
if
e
<=
s
:
continue
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:
s
]))
toks
.
extend
(
word_tokenize
(
text
[
i
:
s
]))
toks
.
append
(
nltk
.
Tree
(
typ
,
text
[
s
:
e
]
.
split
()))
toks
.
append
(
Tree
(
typ
,
text
[
s
:
e
]
.
split
()))
i
=
e
i
=
e
toks
.
extend
(
nltk
.
word_tokenize
(
text
[
i
:]))
toks
.
extend
(
word_tokenize
(
text
[
i
:]))
yield
toks
yield
toks
else
:
else
:
...
@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed):
...
@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed):
def
build_model
(
fmt
=
'binary'
):
def
build_model
(
fmt
=
'binary'
):
print
'Loading training data...'
print
'Loading training data...'
train_paths
=
[
nltk
.
data
.
find
(
'corpora/ace_data/ace.dev'
),
train_paths
=
[
find
(
'corpora/ace_data/ace.dev'
),
nltk
.
data
.
find
(
'corpora/ace_data/ace.heldout'
),
find
(
'corpora/ace_data/ace.heldout'
),
nltk
.
data
.
find
(
'corpora/ace_data/bbn.dev'
),
find
(
'corpora/ace_data/bbn.dev'
),
nltk
.
data
.
find
(
'corpora/ace_data/muc.dev'
)]
find
(
'corpora/ace_data/muc.dev'
)]
train_trees
=
load_ace_data
(
train_paths
,
fmt
)
train_trees
=
load_ace_data
(
train_paths
,
fmt
)
train_data
=
[
postag_tree
(
t
)
for
t
in
train_trees
]
train_data
=
[
postag_tree
(
t
)
for
t
in
train_trees
]
print
'Training...'
print
'Training...'
...
@@ -282,7 +289,7 @@ def build_model(fmt='binary'):
...
@@ -282,7 +289,7 @@ def build_model(fmt='binary'):
del
train_data
del
train_data
print
'Loading eval data...'
print
'Loading eval data...'
eval_paths
=
[
nltk
.
data
.
find
(
'corpora/ace_data/ace.eval'
)]
eval_paths
=
[
find
(
'corpora/ace_data/ace.eval'
)]
eval_trees
=
load_ace_data
(
eval_paths
,
fmt
)
eval_trees
=
load_ace_data
(
eval_paths
,
fmt
)
eval_data
=
[
postag_tree
(
t
)
for
t
in
eval_trees
]
eval_data
=
[
postag_tree
(
t
)
for
t
in
eval_trees
]
...
...
nltk/chunk/regexp.py
View file @
a39be2c0
...
@@ -11,8 +11,7 @@ import types
...
@@ -11,8 +11,7 @@ import types
from
nltk.tree
import
Tree
from
nltk.tree
import
Tree
from
nltk.chunk.api
import
*
from
nltk.chunk.api
import
ChunkParserI
from
nltk.chunk.util
import
*
##//////////////////////////////////////////////////////
##//////////////////////////////////////////////////////
## ChunkString
## ChunkString
...
...
nltk/chunk/util.py
View file @
a39be2c0
...
@@ -10,9 +10,7 @@ import re
...
@@ -10,9 +10,7 @@ import re
import
string
import
string
from
nltk.tree
import
Tree
from
nltk.tree
import
Tree
import
nltk.tag.util
from
nltk.tag.util
import
str2tuple
from
api
import
*
##//////////////////////////////////////////////////////
##//////////////////////////////////////////////////////
## EVALUATION
## EVALUATION
...
@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
...
@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
if
sep
is
None
:
if
sep
is
None
:
stack
[
-
1
]
.
append
(
text
)
stack
[
-
1
]
.
append
(
text
)
else
:
else
:
stack
[
-
1
]
.
append
(
nltk
.
tag
.
util
.
str2tuple
(
text
,
sep
))
stack
[
-
1
]
.
append
(
str2tuple
(
text
,
sep
))
if
len
(
stack
)
!=
1
:
if
len
(
stack
)
!=
1
:
raise
ValueError
(
'Expected ] at char
%
d'
%
len
(
s
))
raise
ValueError
(
'Expected ] at char
%
d'
%
len
(
s
))
...
...
nltk/downloader.py
View file @
a39be2c0
...
@@ -168,8 +168,8 @@ except:
...
@@ -168,8 +168,8 @@ except:
try
:
try
:
TKINTER
=
True
TKINTER
=
True
from
Tkinter
import
*
from
Tkinter
import
Tk
,
Frame
,
Label
,
Entry
,
Button
,
Canvas
,
Menu
,
IntVar
from
tkMessageBox
import
*
from
tkMessageBox
import
showerror
from
nltk.draw.table
import
Table
from
nltk.draw.table
import
Table
from
nltk.draw
import
ShowText
from
nltk.draw
import
ShowText
except
:
except
:
...
...
nltk/parse/earleychart.py
View file @
a39be2c0
...
@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down
...
@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
algorithm, originally formulated by Jay Earley (1970).
"""
"""
#from nltk.grammar import *
#from nltk.parse.api import ParserI
from
nltk.parse.chart
import
(
Chart
,
ChartParser
,
EdgeI
,
LeafEdge
,
LeafInitRule
,
from
nltk.parse.chart
import
(
Chart
,
ChartParser
,
EdgeI
,
LeafEdge
,
LeafInitRule
,
BottomUpPredictRule
,
BottomUpPredictCombineRule
,
BottomUpPredictRule
,
BottomUpPredictCombineRule
,
TopDownInitRule
,
SingleEdgeFundamentalRule
,
TopDownInitRule
,
SingleEdgeFundamentalRule
,
...
...
nltk/tag/hmm.py
View file @
a39be2c0
...
@@ -72,7 +72,7 @@ which includes extensive demonstration code.
...
@@ -72,7 +72,7 @@ which includes extensive demonstration code.
import
re
import
re
import
types
import
types
from
numpy
import
*
from
numpy
import
zeros
,
ones
,
float32
,
float64
,
log2
,
hstack
,
array
,
argmax
from
nltk.probability
import
(
FreqDist
,
ConditionalFreqDist
,
from
nltk.probability
import
(
FreqDist
,
ConditionalFreqDist
,
ConditionalProbDist
,
DictionaryProbDist
,
ConditionalProbDist
,
DictionaryProbDist
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment