Commit 67dbfa23 by Steven Bird

Merge branch 'parseri' into develop

parents 7216fb5a 00c52aa8
...@@ -119,6 +119,7 @@ from nltk.chunk import * ...@@ -119,6 +119,7 @@ from nltk.chunk import *
from nltk.classify import * from nltk.classify import *
from nltk.inference import * from nltk.inference import *
from nltk.metrics import * from nltk.metrics import *
from nltk.model import *
from nltk.parse import * from nltk.parse import *
from nltk.tag import * from nltk.tag import *
from nltk.tokenize import * from nltk.tokenize import *
...@@ -165,7 +166,7 @@ else: ...@@ -165,7 +166,7 @@ else:
from nltk import align, ccg, chunk, classify, collocations from nltk import align, ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem, wsd from nltk import misc, model, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, tree, treetransforms, util from nltk import tag, tbl, text, tokenize, tree, treetransforms, util
# override any accidentally imported demo # override any accidentally imported demo
......
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2014 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.model.ngram import NgramModel
# Natural Language Toolkit: API for Language Models
#
# Copyright (C) 2001-2014 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# should this be a subclass of ConditionalProbDistI?
class ModelI(object):
    """
    A processing interface for assigning a probability to the next word.

    Every method here, including the constructor, raises
    ``NotImplementedError``.  A concrete language model therefore has to
    provide its own ``__init__`` (without delegating to this one) and
    override each operation it supports.
    """

    def __init__(self):
        """
        Create a new language model.

        :raise NotImplementedError: always; the interface itself cannot
            be instantiated.
        """
        raise NotImplementedError()

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context.

        :param word: the word whose probability is evaluated
        :param context: the context in which ``word`` occurs
        :raise NotImplementedError: unless overridden by a subclass
        """
        raise NotImplementedError()

    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word whose log probability is evaluated
        :param context: the context in which ``word`` occurs
        :raise NotImplementedError: unless overridden by a subclass
        """
        raise NotImplementedError()

    def choose_random_word(self, context):
        """
        Randomly select a word that is likely to appear in this context.

        :param context: the context the selected word should follow
        :raise NotImplementedError: unless overridden by a subclass
        """
        raise NotImplementedError()

    def generate(self, n):
        """
        Generate n words of text from the language model.

        :param n: the number of words to generate
        :raise NotImplementedError: unless overridden by a subclass
        """
        raise NotImplementedError()

    def entropy(self, text):
        """
        Evaluate the total entropy of a message with respect to the model.

        This is the sum of the log probability of each word in the message.

        :param text: the message to evaluate
        :raise NotImplementedError: unless overridden by a subclass
        """
        raise NotImplementedError()
...@@ -32,7 +32,7 @@ class ParserI(object): ...@@ -32,7 +32,7 @@ class ParserI(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def parse(self, sent): def parse(self, sent, *args, **kwargs):
""" """
:return: An iterator that generates parse trees for the sentence. :return: An iterator that generates parse trees for the sentence.
When possible this list is sorted from most likely to least likely. When possible this list is sorted from most likely to least likely.
...@@ -42,25 +42,25 @@ class ParserI(object): ...@@ -42,25 +42,25 @@ class ParserI(object):
:rtype: iter(Tree) :rtype: iter(Tree)
""" """
if overridden(self.parse_sents): if overridden(self.parse_sents):
return next(self.parse_sents([sent])) return next(self.parse_sents([sent], *args, **kwargs))
elif overridden(self.parse_one): elif overridden(self.parse_one):
return (tree for tree in [self.parse_one(sent)] if tree is not None) return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None)
elif overridden(self.parse_all): elif overridden(self.parse_all):
return iter(self.parse_all(sent)) return iter(self.parse_all(sent, *args, **kwargs))
else: else:
raise NotImplementedError() raise NotImplementedError()
def parse_sents(self, sents): def parse_sents(self, sents, *args, **kwargs):
""" """
Apply ``self.parse()`` to each element of ``sents``. Apply ``self.parse()`` to each element of ``sents``.
:rtype: iter(iter(Tree)) :rtype: iter(iter(Tree))
""" """
return (self.parse(sent) for sent in sents) return (self.parse(sent, *args, **kwargs) for sent in sents)
def parse_all(self, sent): def parse_all(self, sent, *args, **kwargs):
""":rtype: list(Tree)""" """:rtype: list(Tree)"""
return list(self.parse(sent)) return list(self.parse(sent, *args, **kwargs))
def parse_one(self, sent): def parse_one(self, sent, *args, **kwargs):
""":rtype: Tree or None""" """:rtype: Tree or None"""
return next(self.parse(sent), None) return next(self.parse(sent, *args, **kwargs), None)
...@@ -1346,9 +1346,9 @@ class ChartParser(ParserI): ...@@ -1346,9 +1346,9 @@ class ChartParser(ParserI):
# Return the final chart. # Return the final chart.
return chart return chart
def parse_all(self, tokens, tree_class=Tree): def parse(self, tokens, tree_class=Tree):
chart = self.chart_parse(tokens) chart = self.chart_parse(tokens)
return chart.parses(self._grammar.start(), tree_class=tree_class) return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
class TopDownChartParser(ChartParser): class TopDownChartParser(ChartParser):
""" """
...@@ -1628,9 +1628,8 @@ def demo(choice=None, ...@@ -1628,9 +1628,8 @@ def demo(choice=None,
print() print()
cp = ChartParser(grammar, strategies[strategy][1], trace=trace) cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
t = time.time() t = time.time()
# parses = cp.parse_all(tokens) parses = cp.parse_all(tokens)
chart = cp.chart_parse(tokens) chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
times[strategies[strategy][0]] = time.time()-t times[strategies[strategy][0]] = time.time()-t
print("Nr edges in chart:", len(chart.edges())) print("Nr edges in chart:", len(chart.edges()))
if numparses: if numparses:
......
...@@ -87,56 +87,19 @@ class MaltParser(ParserI): ...@@ -87,56 +87,19 @@ class MaltParser(ParserI):
url='http://www.maltparser.org/', url='http://www.maltparser.org/',
verbose=verbose) verbose=verbose)
def parse_all(self, sentence, verbose=False):
"""
Use MaltParser to parse a sentence. Takes a sentence as a list of
words; it will be automatically tagged with this MaltParser instance's
tagger.
:param sentence: Input sentence to parse
:type sentence: list(str)
:return: list(DependencyGraph)
"""
return self.parse_sents([sentence], verbose)
def parse_sents(self, sentences, verbose=False): def parse_sents(self, sentences, verbose=False):
""" """
Use MaltParser to parse multiple sentence. Takes multiple sentences as a Use MaltParser to parse multiple sentences. Takes multiple sentences as a
list where each sentence is a list of words. list where each sentence is a list of words.
Each sentence will be automatically tagged with this MaltParser instance's Each sentence will be automatically tagged with this MaltParser instance's
tagger. tagger.
:param sentences: Input sentences to parse :param sentences: Input sentences to parse
:type sentence: list(list(str)) :type sentence: list(list(str))
:return: list(DependencyGraph) :return: iter(DependencyGraph)
""" """
tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences] tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
return self.tagged_parse_sents(tagged_sentences, verbose) return iter(self.tagged_parse_sents(tagged_sentences, verbose))
def parse(self, sentence, verbose=False):
"""
Use MaltParser to parse a sentence. Takes a sentence as a list of words.
The sentence will be automatically tagged with this MaltParser instance's
tagger.
:param sentence: Input sentence to parse
:type sentence: list(str)
:return: ``DependencyGraph`` the dependency graph representation of the sentence
"""
return self.parse_sents([sentence], verbose)[0]
def raw_parse(self, sentence, verbose=False):
"""
Use MaltParser to parse a sentence. Takes a sentence as a string;
before parsing, it will be automatically tokenized and tagged with this
MaltParser instance's tagger.
:param sentence: Input sentence to parse
:type sentence: str
:return: list(DependencyGraph)
"""
words = word_tokenize(sentence)
return self.parse(words, verbose)
def tagged_parse(self, sentence, verbose=False): def tagged_parse(self, sentence, verbose=False):
""" """
...@@ -146,9 +109,9 @@ class MaltParser(ParserI): ...@@ -146,9 +109,9 @@ class MaltParser(ParserI):
:param sentence: Input sentence to parse :param sentence: Input sentence to parse
:type sentence: list(tuple(str, str)) :type sentence: list(tuple(str, str))
:return: ``DependencyGraph`` the dependency graph representation of the sentence :return: iter(DependencyGraph) the possible dependency graph representations of the sentence
""" """
return self.tagged_parse_sents([sentence], verbose)[0] return next(self.tagged_parse_sents([sentence], verbose))
def tagged_parse_sents(self, sentences, verbose=False): def tagged_parse_sents(self, sentences, verbose=False):
""" """
...@@ -158,7 +121,7 @@ class MaltParser(ParserI): ...@@ -158,7 +121,7 @@ class MaltParser(ParserI):
:param sentences: Input sentences to parse :param sentences: Input sentences to parse
:type sentence: list(list(tuple(str, str))) :type sentence: list(list(tuple(str, str)))
:return: list(``DependencyGraph``) the dependency graph representation :return: iter(iter(``DependencyGraph``)) the dependency graph representation
of each sentence of each sentence
""" """
...@@ -193,7 +156,8 @@ class MaltParser(ParserI): ...@@ -193,7 +156,8 @@ class MaltParser(ParserI):
raise Exception("MaltParser parsing (%s) failed with exit " raise Exception("MaltParser parsing (%s) failed with exit "
"code %d" % (' '.join(cmd), ret)) "code %d" % (' '.join(cmd), ret))
return DependencyGraph.load(output_file.name) # Must return iter(iter(Tree))
return (iter([dep_graph]) for dep_graph in DependencyGraph.load(output_file.name))
finally: finally:
input_file.close() input_file.close()
os.remove(input_file.name) os.remove(input_file.name)
...@@ -276,8 +240,10 @@ def demo(): ...@@ -276,8 +240,10 @@ def demo():
maltParser = MaltParser() maltParser = MaltParser()
maltParser.train([dg1,dg2], verbose=verbose) maltParser.train([dg1,dg2], verbose=verbose)
print(maltParser.raw_parse('John sees Mary', verbose=verbose).tree().pprint()) maltParser.parse_one(['John','sees','Mary'], verbose=verbose).tree().pprint()
print(maltParser.raw_parse('a man runs', verbose=verbose).tree().pprint()) maltParser.parse_one(['a','man','runs'], verbose=verbose).tree().pprint()
next(maltParser.tagged_parse([('John','NNP'),('sees','VB'),('Mary','NNP')], verbose)).tree().pprint()
if __name__ == '__main__': if __name__ == '__main__':
demo() demo()
...@@ -397,10 +397,47 @@ def demo(choice=None, draw_parses=None, print_parses=None): ...@@ -397,10 +397,47 @@ def demo(choice=None, draw_parses=None, print_parses=None):
summary of the results are displayed. summary of the results are displayed.
""" """
import sys, time import sys, time
from nltk import tokenize, toy_pcfg1, toy_pcfg2 from nltk import tokenize
from nltk.parse import pchart from nltk.parse import pchart
# Define two demos. Each demo has a sentence and a grammar. # Define two demos. Each demo has a sentence and a grammar.
toy_pcfg1 = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
""")
toy_pcfg2 = PCFG.fromstring("""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
""")
demos = [('I saw John with my telescope', toy_pcfg1), demos = [('I saw John with my telescope', toy_pcfg1),
('the boy saw Jack with Bob under the table with a telescope', ('the boy saw Jack with Bob under the table with a telescope',
toy_pcfg2)] toy_pcfg2)]
......
...@@ -29,10 +29,16 @@ class StanfordParser(ParserI): ...@@ -29,10 +29,16 @@ class StanfordParser(ParserI):
>>> parser=StanfordParser( >>> parser=StanfordParser(
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
... ) ... )
>>> parser.raw_parse_sents((
>>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
>>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
... "the quick brown fox jumps over the lazy dog", ... "the quick brown fox jumps over the lazy dog",
... "the quick grey wolf jumps over the lazy fox" ... "the quick grey wolf jumps over the lazy fox"
... )) ... ))], [])
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
...@@ -40,17 +46,17 @@ class StanfordParser(ParserI): ...@@ -40,17 +46,17 @@ class StanfordParser(ParserI):
[Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
>>> parser.parse_sents(( >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
... "I 'm a dog".split(), ... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(), ... "This is my friends ' cat ( the tabby )".split(),
... )) ... ))], [])
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
[Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']), Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])] Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
>>> parser.tagged_parse_sents(( >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
... ( ... (
... ("The", "DT"), ... ("The", "DT"),
... ("quick", "JJ"), ... ("quick", "JJ"),
...@@ -63,7 +69,7 @@ class StanfordParser(ParserI): ...@@ -63,7 +69,7 @@ class StanfordParser(ParserI):
... ("dog", "NN"), ... ("dog", "NN"),
... (".", "."), ... (".", "."),
... ), ... ),
... )) ... ))],[])
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
...@@ -103,23 +109,11 @@ class StanfordParser(ParserI): ...@@ -103,23 +109,11 @@ class StanfordParser(ParserI):
cur_lines = [] cur_lines = []
for line in output_.splitlines(False): for line in output_.splitlines(False):
if line == '': if line == '':
res.append(Tree.fromstring('\n'.join(cur_lines))) res.append(iter([Tree.fromstring('\n'.join(cur_lines))]))
cur_lines = [] cur_lines = []
else: else:
cur_lines.append(line) cur_lines.append(line)
return res return iter(res)
def parse_all(self, sentence, verbose=False):
"""
Use StanfordParser to parse a sentence. Takes a sentence as a list of
words; it will be automatically tagged with this StanfordParser instance's
tagger.
:param sentence: Input sentence to parse
:type sentence: list(str)
:rtype: Tree
"""
return self.parse_sents([sentence], verbose)
def parse_sents(self, sentences, verbose=False): def parse_sents(self, sentences, verbose=False):
""" """
...@@ -132,7 +126,7 @@ class StanfordParser(ParserI): ...@@ -132,7 +126,7 @@ class StanfordParser(ParserI):
:param sentences: Input sentences to parse :param sentences: Input sentences to parse
:type sentences: list(list(str)) :type sentences: list(list(str))
:rtype: list(Tree) :rtype: iter(iter(Tree))
""" """
cmd = [ cmd = [
'edu.stanford.nlp.parser.lexparser.LexicalizedParser', 'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
...@@ -153,9 +147,9 @@ class StanfordParser(ParserI): ...@@ -153,9 +147,9 @@ class StanfordParser(ParserI):
:param sentence: Input sentence to parse :param sentence: Input sentence to parse
:type sentence: str :type sentence: str
:rtype: Tree :rtype: iter(Tree)
""" """
return self.raw_parse_sents((sentence,), verbose) return next(self.raw_parse_sents([sentence], verbose))
def raw_parse_sents(self, sentences, verbose=False): def raw_parse_sents(self, sentences, verbose=False):
""" """
...@@ -165,7 +159,7 @@ class StanfordParser(ParserI): ...@@ -165,7 +159,7 @@ class StanfordParser(ParserI):
:param sentences: Input sentences to parse :param sentences: Input sentences to parse
:type sentences: list(str) :type sentences: list(str)
:rtype: list(Tree) :rtype: iter(iter(Tree))
""" """
cmd = [ cmd = [
'edu.stanford.nlp.parser.lexparser.LexicalizedParser', 'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
...@@ -183,9 +177,9 @@ class StanfordParser(ParserI): ...@@ -183,9 +177,9 @@ class StanfordParser(ParserI):
:param sentence: Input sentence to parse :param sentence: Input sentence to parse
:type sentence: list(tuple(str, str)) :type sentence: list(tuple(str, str))
:rtype: Tree :rtype: iter(Tree)
""" """
return self.tagged_parse_sents([sentence], verbose)[0] return next(self.tagged_parse_sents([sentence], verbose))
def tagged_parse_sents(self, sentences, verbose=False): def tagged_parse_sents(self, sentences, verbose=False):
""" """
...@@ -195,7 +189,7 @@ class StanfordParser(ParserI): ...@@ -195,7 +189,7 @@ class StanfordParser(ParserI):
:param sentences: Input sentences to parse :param sentences: Input sentences to parse
:type sentences: list(list(tuple(str, str))) :type sentences: list(list(tuple(str, str)))
:rtype: Tree :rtype: iter(iter(Tree))
""" """
tag_separator = '/' tag_separator = '/'
cmd = [ cmd = [
......
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
# URL: <http://nltk.org/> # URL: <http://nltk.org/>
# For license information, see LICENSE.TXT # For license information, see LICENSE.TXT
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile import tempfile
import pickle import pickle
...@@ -23,6 +26,7 @@ except ImportError: ...@@ -23,6 +26,7 @@ except ImportError:
from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
class Configuration(object): class Configuration(object):
""" """
Class for holding configuration which is the partial analysis of the input sentence. Class for holding configuration which is the partial analysis of the input sentence.
...@@ -44,9 +48,7 @@ class Configuration(object): ...@@ -44,9 +48,7 @@ class Configuration(object):
""" """
# dep_graph.nodes contain list of token for a sentence # dep_graph.nodes contain list of token for a sentence
self.stack = [0] # The root element self.stack = [0] # The root element
self.buffer = range( self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer
1, len(
dep_graph.nodes)) # The rest is in the buffer
self.arcs = [] # empty set of arc self.arcs = [] # empty set of arc
self._tokens = dep_graph.nodes self._tokens = dep_graph.nodes
self._max_address = len(self.buffer) self._max_address = len(self.buffer)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment