Commit a39be2c0 by Steven Bird

fixing up wildcard imports

parent 26082247
...@@ -152,23 +152,15 @@ zero-length assertions). ...@@ -152,23 +152,15 @@ zero-length assertions).
pattern is valid. pattern is valid.
""" """
from api import * from api import ChunkParserI
from util import * from util import (ChunkScore, accuracy, tagstr2tree, conllstr2tree,
from regexp import * tree2conlltags, tree2conllstr, tree2conlltags)
from regexp import RegexpChunkParser, RegexpParser
__all__ = [
# ChunkParser interface
'ChunkParserI',
# Parsers
'RegexpChunkParser', 'RegexpParser',
'ne_chunk', 'batch_ne_chunk',
]
# Standard treebank POS tagger # Standard treebank POS tagger
_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle' _BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle' _MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def ne_chunk(tagged_tokens, binary=False): def ne_chunk(tagged_tokens, binary=False):
""" """
Use NLTK's currently recommended named entity chunker to Use NLTK's currently recommended named entity chunker to
......
...@@ -11,28 +11,35 @@ Named entity chunker ...@@ -11,28 +11,35 @@ Named entity chunker
import os, re, pickle import os, re, pickle
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
from nltk.chunk.api import *
from nltk.chunk.util import *
import nltk
# This really shouldn't be loaded at import time. But it's used by a from nltk.tag import ClassifierBasedTagger, pos_tag
# static method. Do a lazy loading? from nltk.classify import MaxentClassifier
_short_en_wordlist = set(nltk.corpus.words.words('en-basic')) from nltk.tree import Tree
from nltk.tokenize import word_tokenize
from nltk.data import find
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger): class NEChunkParserTagger(ClassifierBasedTagger):
""" """
The IOB tagger used by the chunk parser. The IOB tagger used by the chunk parser.
""" """
def __init__(self, train): def __init__(self, train):
nltk.tag.ClassifierBasedTagger.__init__( ClassifierBasedTagger.__init__(
self, train=train, self, train=train,
classifier_builder=self._classifier_builder) classifier_builder=self._classifier_builder)
def _classifier_builder(self, train): def _classifier_builder(self, train):
return nltk.MaxentClassifier.train(train, algorithm='megam', return MaxentClassifier.train(train, algorithm='megam',
gaussian_prior_sigma=1, gaussian_prior_sigma=1,
trace=2) trace=2)
def _english_wordlist(self):
    """
    Return the set of basic English words used for the 'en-wordlist'
    feature, loading it on first use.

    The list is read from the NLTK ``words`` corpus ('en-basic') the
    first time this method is called and is then cached on the
    instance as ``_en_wordlist``, so the corpus is not loaded at
    module import time.

    @return: The cached set built from ``words.words('en-basic')``.
    @rtype: C{set}
    """
    # The constructor does not assign _en_wordlist, so read it
    # defensively rather than raising AttributeError on the first call.
    wordlist = getattr(self, '_en_wordlist', None)
    # Compare against None (not truthiness) so that an already-loaded
    # but empty word list is not re-read on every call.
    if wordlist is None:
        # Imported here, not at module level, to keep loading lazy.
        from nltk.corpus import words
        wordlist = set(words.words('en-basic'))
        self._en_wordlist = wordlist
    return wordlist
def _feature_detector(self, tokens, index, history): def _feature_detector(self, tokens, index, history):
word = tokens[index][0] word = tokens[index][0]
...@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger): ...@@ -79,7 +86,7 @@ class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
'suffix3': word[-3:].lower(), 'suffix3': word[-3:].lower(),
'pos': pos, 'pos': pos,
'word': word, 'word': word,
'en-wordlist': (word in _short_en_wordlist), # xx! 'en-wordlist': (word in self._english_wordlist()),
'prevtag': prevtag, 'prevtag': prevtag,
'prevpos': prevpos, 'prevpos': prevpos,
'nextpos': nextpos, 'nextpos': nextpos,
...@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI): ...@@ -117,19 +124,19 @@ class NEChunkParser(ChunkParserI):
""" """
Convert a list of tagged tokens to a chunk-parse tree. Convert a list of tagged tokens to a chunk-parse tree.
""" """
sent = nltk.Tree('S', []) sent = Tree('S', [])
for (tok,tag) in tagged_tokens: for (tok,tag) in tagged_tokens:
if tag == 'O': if tag == 'O':
sent.append(tok) sent.append(tok)
elif tag.startswith('B-'): elif tag.startswith('B-'):
sent.append(nltk.Tree(tag[2:], [tok])) sent.append(Tree(tag[2:], [tok]))
elif tag.startswith('I-'): elif tag.startswith('I-'):
if (sent and isinstance(sent[-1], Tree) and if (sent and isinstance(sent[-1], Tree) and
sent[-1].node == tag[2:]): sent[-1].node == tag[2:]):
sent[-1].append(tok) sent[-1].append(tok)
else: else:
sent.append(nltk.Tree(tag[2:], [tok])) sent.append(Tree(tag[2:], [tok]))
return sent return sent
@staticmethod @staticmethod
...@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI): ...@@ -139,7 +146,7 @@ class NEChunkParser(ChunkParserI):
""" """
toks = [] toks = []
for child in sent: for child in sent:
if isinstance(child, nltk.Tree): if isinstance(child, Tree):
if len(child) == 0: if len(child) == 0:
print "Warning -- empty chunk in sentence" print "Warning -- empty chunk in sentence"
continue continue
...@@ -171,10 +178,10 @@ def simplify_pos(s): ...@@ -171,10 +178,10 @@ def simplify_pos(s):
def postag_tree(tree): def postag_tree(tree):
# Part-of-speech tagging. # Part-of-speech tagging.
words = tree.leaves() words = tree.leaves()
tag_iter = (pos for (word, pos) in nltk.pos_tag(words)) tag_iter = (pos for (word, pos) in pos_tag(words))
newtree = Tree('S', []) newtree = Tree('S', [])
for child in tree: for child in tree:
if isinstance(child, nltk.Tree): if isinstance(child, Tree):
newtree.append(Tree(child.node, [])) newtree.append(Tree(child.node, []))
for subchild in child: for subchild in child:
newtree[-1].append( (subchild, tag_iter.next()) ) newtree[-1].append( (subchild, tag_iter.next()) )
...@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt): ...@@ -227,27 +234,27 @@ def load_ace_file(textfile, fmt):
# Binary distinction (NE or not NE) # Binary distinction (NE or not NE)
if fmt == 'binary': if fmt == 'binary':
i = 0 i = 0
toks = nltk.Tree('S', []) toks = Tree('S', [])
for (s,e,typ) in sorted(entities): for (s,e,typ) in sorted(entities):
if s < i: s = i # Overlapping! Deal with this better? if s < i: s = i # Overlapping! Deal with this better?
if e <= s: continue if e <= s: continue
toks.extend(nltk.word_tokenize(text[i:s])) toks.extend(word_tokenize(text[i:s]))
toks.append(nltk.Tree('NE', text[s:e].split())) toks.append(Tree('NE', text[s:e].split()))
i = e i = e
toks.extend(nltk.word_tokenize(text[i:])) toks.extend(word_tokenize(text[i:]))
yield toks yield toks
# Multiclass distinction (NE type) # Multiclass distinction (NE type)
elif fmt == 'multiclass': elif fmt == 'multiclass':
i = 0 i = 0
toks = nltk.Tree('S', []) toks = Tree('S', [])
for (s,e,typ) in sorted(entities): for (s,e,typ) in sorted(entities):
if s < i: s = i # Overlapping! Deal with this better? if s < i: s = i # Overlapping! Deal with this better?
if e <= s: continue if e <= s: continue
toks.extend(nltk.word_tokenize(text[i:s])) toks.extend(word_tokenize(text[i:s]))
toks.append(nltk.Tree(typ, text[s:e].split())) toks.append(Tree(typ, text[s:e].split()))
i = e i = e
toks.extend(nltk.word_tokenize(text[i:])) toks.extend(word_tokenize(text[i:]))
yield toks yield toks
else: else:
...@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed): ...@@ -271,10 +278,10 @@ def cmp_chunks(correct, guessed):
def build_model(fmt='binary'): def build_model(fmt='binary'):
print 'Loading training data...' print 'Loading training data...'
train_paths = [nltk.data.find('corpora/ace_data/ace.dev'), train_paths = [find('corpora/ace_data/ace.dev'),
nltk.data.find('corpora/ace_data/ace.heldout'), find('corpora/ace_data/ace.heldout'),
nltk.data.find('corpora/ace_data/bbn.dev'), find('corpora/ace_data/bbn.dev'),
nltk.data.find('corpora/ace_data/muc.dev')] find('corpora/ace_data/muc.dev')]
train_trees = load_ace_data(train_paths, fmt) train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees] train_data = [postag_tree(t) for t in train_trees]
print 'Training...' print 'Training...'
...@@ -282,7 +289,7 @@ def build_model(fmt='binary'): ...@@ -282,7 +289,7 @@ def build_model(fmt='binary'):
del train_data del train_data
print 'Loading eval data...' print 'Loading eval data...'
eval_paths = [nltk.data.find('corpora/ace_data/ace.eval')] eval_paths = [find('corpora/ace_data/ace.eval')]
eval_trees = load_ace_data(eval_paths, fmt) eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees] eval_data = [postag_tree(t) for t in eval_trees]
......
...@@ -11,8 +11,7 @@ import types ...@@ -11,8 +11,7 @@ import types
from nltk.tree import Tree from nltk.tree import Tree
from nltk.chunk.api import * from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import *
##////////////////////////////////////////////////////// ##//////////////////////////////////////////////////////
## ChunkString ## ChunkString
......
...@@ -10,9 +10,7 @@ import re ...@@ -10,9 +10,7 @@ import re
import string import string
from nltk.tree import Tree from nltk.tree import Tree
import nltk.tag.util from nltk.tag.util import str2tuple
from api import *
##////////////////////////////////////////////////////// ##//////////////////////////////////////////////////////
## EVALUATION ## EVALUATION
...@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'): ...@@ -338,7 +336,7 @@ def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
if sep is None: if sep is None:
stack[-1].append(text) stack[-1].append(text)
else: else:
stack[-1].append(nltk.tag.util.str2tuple(text, sep)) stack[-1].append(str2tuple(text, sep))
if len(stack) != 1: if len(stack) != 1:
raise ValueError('Expected ] at char %d' % len(s)) raise ValueError('Expected ] at char %d' % len(s))
......
...@@ -168,8 +168,8 @@ except: ...@@ -168,8 +168,8 @@ except:
try: try:
TKINTER = True TKINTER = True
from Tkinter import * from Tkinter import Tk, Frame, Label, Entry, Button, Canvas, Menu, IntVar
from tkMessageBox import * from tkMessageBox import showerror
from nltk.draw.table import Table from nltk.draw.table import Table
from nltk.draw import ShowText from nltk.draw import ShowText
except: except:
......
...@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down ...@@ -28,9 +28,6 @@ The main parser class is L{EarleyChartParser}, which is a top-down
algorithm, originally formulated by Jay Earley (1970). algorithm, originally formulated by Jay Earley (1970).
""" """
#from nltk.grammar import *
#from nltk.parse.api import ParserI
from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule, from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
BottomUpPredictRule, BottomUpPredictCombineRule, BottomUpPredictRule, BottomUpPredictCombineRule,
TopDownInitRule, SingleEdgeFundamentalRule, TopDownInitRule, SingleEdgeFundamentalRule,
......
...@@ -72,7 +72,7 @@ which includes extensive demonstration code. ...@@ -72,7 +72,7 @@ which includes extensive demonstration code.
import re import re
import types import types
from numpy import * from numpy import zeros, ones, float32, float64, log2, hstack, array, argmax
from nltk.probability import (FreqDist, ConditionalFreqDist, from nltk.probability import (FreqDist, ConditionalFreqDist,
ConditionalProbDist, DictionaryProbDist, ConditionalProbDist, DictionaryProbDist,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment