Commit a39be2c0 by Steven Bird

fixing up wildcard imports

parent 26082247
......@@ -152,23 +152,15 @@ zero-length assertions).
pattern is valid.
"""
from api import *
from util import *
from regexp import *
__all__ = [
# ChunkParser interface
'ChunkParserI',
# Parsers
'RegexpChunkParser', 'RegexpParser',
'ne_chunk', 'batch_ne_chunk',
]
from api import ChunkParserI
from util import (ChunkScore, accuracy, tagstr2tree, conllstr2tree,
tree2conlltags, tree2conllstr, tree2conlltags)
from regexp import RegexpChunkParser, RegexpParser
# Pickled named entity chunker models (binary and multiclass)
_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def ne_chunk(tagged_tokens, binary=False):
"""
Use NLTK's currently recommended named entity chunker to
......
......@@ -11,28 +11,35 @@ Named entity chunker
import os, re, pickle
from xml.etree import ElementTree as ET
from nltk.chunk.api import *
from nltk.chunk.util import *
import nltk
# This really shouldn't be loaded at import time. But it's used by a
# static method. Do a lazy loading?
_short_en_wordlist = set(nltk.corpus.words.words('en-basic'))
from nltk.tag import ClassifierBasedTagger, pos_tag
from nltk.classify import MaxentClassifier
from nltk.tree import Tree
from nltk.tokenize import word_tokenize
from nltk.data import find
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
class NEChunkParserTagger(ClassifierBasedTagger):
"""
The IOB tagger used by the chunk parser.
"""
def __init__(self, train):
nltk.tag.ClassifierBasedTagger.__init__(
ClassifierBasedTagger.__init__(
self, train=train,
classifier_builder=self._classifier_builder)
def _classifier_builder(self, train):
return nltk.MaxentClassifier.train(train, algorithm='megam',
return MaxentClassifier.train(train, algorithm='megam',
gaussian_prior_sigma=1,
trace=2)
def _english_wordlist(self):
if not self._en_wordlist:
from nltk.corpus import words
self._en_wordlist = set(words.words('en-basic'))
return self._en_wordlist
def _feature_detector(self, tokens, index, history):
word = tokens[index][0]
......@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
'suffix3': word[-3:].lower(),
'pos': pos,
'word': word,
'en-wordlist': (word in _short_en_wordlist), # xx!
'en-wordlist': (word in self._english_wordlist()),
'prevtag': prevtag,
'prevpos': prevpos,
'nextpos': nextpos,
......@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI):
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
sent = nltk.Tree('S', [])
sent = Tree('S', [])
for (tok,tag) in tagged_tokens:
if tag == 'O':
sent.append(tok)
elif tag.startswith('B-'):
sent.append(nltk.Tree(tag[2:], [tok]))
sent.append(Tree(tag[2:], [tok]))
elif tag.startswith('I-'):
if (sent and isinstance(sent[-1], Tree) and
sent[-1].node == tag[2:]):
sent[-1].append(tok)
else:
sent.append(nltk.Tree(tag[2:], [tok]))
sent.append(Tree(tag[2:], [tok]))
return sent
@staticmethod
......@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI):
"""
toks = []
for child in sent:
if isinstance(child, nltk.Tree):
if isinstance(child, Tree):
if len(child) == 0:
print "Warning -- empty chunk in sentence"
continue
......@@ -171,10 +178,10 @@ def simplify_pos(s):
def postag_tree(tree):
# Part-of-speech tagging.
words = tree.leaves()
tag_iter = (pos for (word, pos) in nltk.pos_tag(words))
tag_iter = (pos for (word, pos) in pos_tag(words))
newtree = Tree('S', [])
for child in tree:
if isinstance(child, nltk.Tree):
if isinstance(child, Tree):
newtree.append(Tree(child.node, []))
for subchild in child:
newtree[-1].append( (subchild, tag_iter.next()) )
......@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt):
# Binary distinction (NE or not NE)
if fmt == 'binary':
i = 0
toks = nltk.Tree('S', [])
toks = Tree('S', [])
for (s,e,typ) in sorted(entities):
if s < i: s = i # Overlapping! Deal with this better?
if e <= s: continue
toks.extend(nltk.word_tokenize(text[i:s]))
toks.append(nltk.Tree('NE', text[s:e].split()))
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree('NE', text[s:e].split()))
i = e
toks.extend(nltk.word_tokenize(text[i:]))
toks.extend(word_tokenize(text[i:]))
yield toks
# Multiclass distinction (NE type)
elif fmt == 'multiclass':
i = 0
toks = nltk.Tree('S', [])
toks = Tree('S', [])
for (s,e,typ) in sorted(entities):
if s < i: s = i # Overlapping! Deal with this better?
if e <= s: continue
toks.extend(nltk.word_tokenize(text[i:s]))
toks.append(nltk.Tree(typ, text[s:e].split()))
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree(typ, text[s:e].split()))
i = e
toks.extend(nltk.word_tokenize(text[i:]))
toks.extend(word_tokenize(text[i:]))
yield toks
else:
......@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed):
def build_model(fmt='binary'):
print 'Loading training data...'
train_paths = [nltk.data.find('corpora/ace_data/ace.dev'),
nltk.data.find('corpora/ace_data/ace.heldout'),
nltk.data.find('corpora/ace_data/bbn.dev'),
nltk.data.find('corpora/ace_data/muc.dev')]
train_paths = [find('corpora/ace_data/ace.dev'),
find('corpora/ace_data/ace.heldout'),
find('corpora/ace_data/bbn.dev'),
find('corpora/ace_data/muc.dev')]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
print 'Training...'
......@@ -282,7 +289,7 @@ def build_model(fmt='binary'):
del train_data
print 'Loading eval data...'
eval_paths = [nltk.data.find('corpora/ace_data/ace.eval')]
eval_paths = [find('corpora/ace_data/ace.eval')]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
......
......@@ -11,8 +11,7 @@ import types
from nltk.tree import Tree
from nltk.chunk.api import *
from nltk.chunk.util import *
from nltk.chunk.api import ChunkParserI
##//////////////////////////////////////////////////////
## ChunkString
......
......@@ -10,9 +10,7 @@ import re
import string
from nltk.tree import Tree
import nltk.tag.util
from api import *
from nltk.tag.util import str2tuple
##//////////////////////////////////////////////////////
## EVALUATION
......@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
if sep is None:
stack[-1].append(text)
else:
stack[-1].append(nltk.tag.util.str2tuple(text, sep))
stack[-1].append(str2tuple(text, sep))
if len(stack) != 1:
raise ValueError('Expected ] at char %d' % len(s))
......
......@@ -168,8 +168,8 @@ except:
try:
TKINTER = True
from Tkinter import *
from tkMessageBox import *
from Tkinter import Tk, Frame, Label, Entry, Button, Canvas, Menu, IntVar
from tkMessageBox import showerror
from nltk.draw.table import Table
from nltk.draw import ShowText
except:
......
......@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
#from nltk.grammar import *
#from nltk.parse.api import ParserI
from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
BottomUpPredictRule, BottomUpPredictCombineRule,
TopDownInitRule, SingleEdgeFundamentalRule,
......
......@@ -72,7 +72,7 @@ which includes extensive demonstration code.
import re
import types
from numpy import *
from numpy import zeros, ones, float32, float64, log2, hstack, array, argmax
from nltk.probability import (FreqDist, ConditionalFreqDist,
ConditionalProbDist, DictionaryProbDist,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment