Merge branch 'parseri' into develop

67dbfa23 · Steven Bird · 7216fb5a · 00c52aa8 · 67dbfa23 · 67dbfa23
Commit 67dbfa23 authored Feb 21, 2015 by Steven Bird
11 changed files
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -119,6 +119,7 @@ from nltk.chunk import *
 from nltk.classify import *
 from nltk.inference import *
 from nltk.metrics import *
+from nltk.model import *
 from nltk.parse import *
 from nltk.tag import *
 from nltk.tokenize import *
@@ -165,7 +166,7 @@ else:
 from nltk import align, ccg, chunk, classify, collocations
 from nltk import data, featstruct, grammar, help, inference, metrics
-from nltk import misc, parse, probability, sem, stem, wsd
+from nltk import misc, model, parse, probability, sem, stem, wsd
 from nltk import tag, tbl, text, tokenize, tree, treetransforms, util
 # override any accidentally imported demo

--- a/nltk/model/__init__.py
+++ b/nltk/model/__init__.py
+# Natural Language Toolkit: Language Models
+#
+# Copyright (C) 2001-2014 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from nltk.model.ngram import NgramModel
--- a/nltk/model/api.py
+++ b/nltk/model/api.py
+# Natural Language Toolkit: API for Language Models
+#
+# Copyright (C) 2001-2014 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+# TODO: Should this be a subclass of ConditionalProbDistI?
+class ModelI(object):
+    """
+    A processing interface for assigning a probability to the next word.
+    """
+    def __init__(self):
+        '''Create a new language model.'''
+        raise NotImplementedError()
+    def prob(self, word, context):
+        '''Evaluate the probability of this word in this context.'''
+        raise NotImplementedError()
+    def logprob(self, word, context):
+        '''Evaluate the (negative) log probability of this word in this context.'''
+        raise NotImplementedError()
+    def choose_random_word(self, context):
+        '''Randomly select a word that is likely to appear in this context.'''
+        raise NotImplementedError()
+    def generate(self, n):
+        '''Generate n words of text from the language model.'''
+        raise NotImplementedError()
+    def entropy(self, text):
+        '''Evaluate the total entropy of a message with respect to the model.
+        This is the sum of the (negative) log probabilities of the words in the message.'''
+        raise NotImplementedError()
--- a/nltk/model/ngram.py
+++ b/nltk/model/ngram.py
--- a/nltk/parse/api.py
+++ b/nltk/parse/api.py
@@ -32,7 +32,7 @@ class ParserI(object):
        """
        raise NotImplementedError()
-    def parse(self, sent):
+    def parse(self, sent, *args, **kwargs):
        """
        :return: An iterator that generates parse trees for the sentence.
        When possible this list is sorted from most likely to least likely.
@@ -42,25 +42,25 @@ class ParserI(object):
        :rtype: iter(Tree)
        """
        if overridden(self.parse_sents):
-            return next(self.parse_sents([sent]))
+            return next(self.parse_sents([sent], *args, **kwargs))
        elif overridden(self.parse_one):
-            return (tree for tree in [self.parse_one(sent)] if tree is not None)
+            return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None)
        elif overridden(self.parse_all):
-            return iter(self.parse_all(sent))
+            return iter(self.parse_all(sent, *args, **kwargs))
        else:
            raise NotImplementedError()
-    def parse_sents(self, sents):
+    def parse_sents(self, sents, *args, **kwargs):
        """
        Apply ``self.parse()`` to each element of ``sents``.
        :rtype: iter(iter(Tree))
        """
-        return (self.parse(sent) for sent in sents)
+        return (self.parse(sent, *args, **kwargs) for sent in sents)
-    def parse_all(self, sent):
+    def parse_all(self, sent, *args, **kwargs):
        """:rtype: list(Tree)"""
-        return list(self.parse(sent))
+        return list(self.parse(sent, *args, **kwargs))
-    def parse_one(self, sent):
+    def parse_one(self, sent, *args, **kwargs):
        """:rtype: Tree or None"""
-        return next(self.parse(sent), None)
+        return next(self.parse(sent, *args, **kwargs), None)
--- a/nltk/parse/chart.py
+++ b/nltk/parse/chart.py
@@ -1346,9 +1346,9 @@ class ChartParser(ParserI):
        # Return the final chart.
        return chart
-    def parse_all(self, tokens, tree_class=Tree):
+    def parse(self, tokens, tree_class=Tree):
        chart = self.chart_parse(tokens)
-        return chart.parses(self._grammar.start(), tree_class=tree_class)
+        return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
 class TopDownChartParser(ChartParser):
    """
@@ -1628,9 +1628,8 @@ def demo(choice=None,
        print()
        cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
        t = time.time()
-        # parses = cp.parse_all(tokens)
+        parses = cp.parse_all(tokens)
        chart = cp.chart_parse(tokens)
-        parses = list(chart.parses(grammar.start()))
        times[strategies[strategy][0]] = time.time()-t
        print("Nr edges in chart:", len(chart.edges()))
        if numparses:

--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -87,56 +87,19 @@ class MaltParser(ParserI):
            url='http://www.maltparser.org/',
            verbose=verbose)
-    def parse_all(self, sentence, verbose=False):
-        """
-        Use MaltParser to parse a sentence. Takes a sentence as a list of
-        words; it will be automatically tagged with this MaltParser instance's
-        tagger.
-        :param sentence: Input sentence to parse
-        :type sentence: list(str)
-        :return: list(DependencyGraph)
-        """
-        return self.parse_sents([sentence], verbose)
    def parse_sents(self, sentences, verbose=False):
        """
-        Use MaltParser to parse multiple sentence. Takes multiple sentences as a
+        Use MaltParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this MaltParser instance's
        tagger.
        :param sentences: Input sentences to parse
        :type sentence: list(list(str))
-        :return: list(DependencyGraph)
+        :return: iter(iter(DependencyGraph))
        """
        tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
-        return self.tagged_parse_sents(tagged_sentences, verbose)
+        return iter(self.tagged_parse_sents(tagged_sentences, verbose))
-    def parse(self, sentence, verbose=False):
-        """
-        Use MaltParser to parse a sentence. Takes a sentence as a list of words.
-        The sentence will be automatically tagged with this MaltParser instance's
-        tagger.
-        :param sentence: Input sentence to parse
-        :type sentence: list(str)
-        :return: ``DependencyGraph`` the dependency graph representation of the sentence
-        """
-        return self.parse_sents([sentence], verbose)[0]
-    def raw_parse(self, sentence, verbose=False):
-        """
-        Use MaltParser to parse a sentence. Takes a sentence as a string;
-        before parsing, it will be automatically tokenized and tagged with this
-        MaltParser instance's tagger.
-        :param sentence: Input sentence to parse
-        :type sentence: str
-        :return: list(DependencyGraph)
-        """
-        words = word_tokenize(sentence)
-        return self.parse(words, verbose)
    def tagged_parse(self, sentence, verbose=False):
        """
@@ -146,9 +109,9 @@ class MaltParser(ParserI):
        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
-        :return: ``DependencyGraph`` the dependency graph representation of the sentence
+        :return: iter(DependencyGraph) the possible dependency graph representations of the sentence
        """
-        return self.tagged_parse_sents([sentence], verbose)[0]
+        return next(self.tagged_parse_sents([sentence], verbose))
    def tagged_parse_sents(self, sentences, verbose=False):
        """
@@ -158,7 +121,7 @@ class MaltParser(ParserI):
        :param sentences: Input sentences to parse
        :type sentence: list(list(tuple(str, str)))
-        :return: list(``DependencyGraph``) the dependency graph representation
+        :return: iter(iter(``DependencyGraph``)) the dependency graph representation
                 of each sentence
        """
@@ -193,7 +156,8 @@ class MaltParser(ParserI):
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))
-            return DependencyGraph.load(output_file.name)
+            # ParserI.parse_sents is documented as iter(iter(Tree)); here each inner iterator yields one DependencyGraph
+            return (iter([dep_graph]) for dep_graph in  DependencyGraph.load(output_file.name))
        finally:
            input_file.close()
            os.remove(input_file.name)
@@ -276,8 +240,10 @@ def demo():
    maltParser = MaltParser()
    maltParser.train([dg1,dg2], verbose=verbose)
-    print(maltParser.raw_parse('John sees Mary', verbose=verbose).tree().pprint())
+    maltParser.parse_one(['John','sees','Mary'], verbose=verbose).tree().pprint()
-    print(maltParser.raw_parse('a man runs', verbose=verbose).tree().pprint())
+    maltParser.parse_one(['a','man','runs'], verbose=verbose).tree().pprint()
+    next(maltParser.tagged_parse([('John','NNP'),('sees','VB'),('Mary','NNP')], verbose)).tree().pprint()
 if __name__ == '__main__':
    demo()
--- a/nltk/parse/pchart.py
+++ b/nltk/parse/pchart.py
@@ -397,10 +397,47 @@ def demo(choice=None, draw_parses=None, print_parses=None):
    summary of the results are displayed.
    """
    import sys, time
-    from nltk import tokenize, toy_pcfg1, toy_pcfg2
+    from nltk import tokenize
    from nltk.parse import pchart
    # Define two demos.  Each demo has a sentence and a grammar.
+    toy_pcfg1 = PCFG.fromstring("""
+    S -> NP VP [1.0]
+    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
+    Det -> 'the' [0.8] | 'my' [0.2]
+    N -> 'man' [0.5] | 'telescope' [0.5]
+    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
+    V -> 'ate' [0.35] | 'saw' [0.65]
+    PP -> P NP [1.0]
+    P -> 'with' [0.61] | 'under' [0.39]
+    """)
+    toy_pcfg2 = PCFG.fromstring("""
+    S    -> NP VP         [1.0]
+    VP   -> V NP          [.59]
+    VP   -> V             [.40]
+    VP   -> VP PP         [.01]
+    NP   -> Det N         [.41]
+    NP   -> Name          [.28]
+    NP   -> NP PP         [.31]
+    PP   -> P NP          [1.0]
+    V    -> 'saw'         [.21]
+    V    -> 'ate'         [.51]
+    V    -> 'ran'         [.28]
+    N    -> 'boy'         [.11]
+    N    -> 'cookie'      [.12]
+    N    -> 'table'       [.13]
+    N    -> 'telescope'   [.14]
+    N    -> 'hill'        [.5]
+    Name -> 'Jack'        [.52]
+    Name -> 'Bob'         [.48]
+    P    -> 'with'        [.61]
+    P    -> 'under'       [.39]
+    Det  -> 'the'         [.41]
+    Det  -> 'a'           [.31]
+    Det  -> 'my'          [.28]
+    """)
    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

--- a/nltk/parse/stanford.py
+++ b/nltk/parse/stanford.py
@@ -29,10 +29,16 @@ class StanfordParser(ParserI):
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )
-    >>> parser.raw_parse_sents((
+    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
+    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), 
+    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), 
+    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
+    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
-    ... ))
+    ... ))], [])
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
@@ -40,17 +46,17 @@ class StanfordParser(ParserI):
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
-    >>> parser.parse_sents((
+    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
-    ... ))
+    ... ))], [])
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
-    >>> parser.tagged_parse_sents((
+    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
@@ -63,7 +69,7 @@ class StanfordParser(ParserI):
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
-    ... ))
+    ... ))],[])
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
@@ -103,23 +109,11 @@ class StanfordParser(ParserI):
        cur_lines = []
        for line in output_.splitlines(False):
            if line == '':
-                res.append(Tree.fromstring('\n'.join(cur_lines)))
+                res.append(iter([Tree.fromstring('\n'.join(cur_lines))]))
                cur_lines = []
            else:
                cur_lines.append(line)
-        return res
+        return iter(res)
-    def parse_all(self, sentence, verbose=False):
-        """
-        Use StanfordParser to parse a sentence. Takes a sentence as a list of
-        words; it will be automatically tagged with this StanfordParser instance's
-        tagger.
-        :param sentence: Input sentence to parse
-        :type sentence: list(str)
-        :rtype: Tree
-        """
-        return self.parse_sents([sentence], verbose)
    def parse_sents(self, sentences, verbose=False):
        """
@@ -132,7 +126,7 @@ class StanfordParser(ParserI):
        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
-        :rtype: list(Tree)
+        :rtype: iter(iter(Tree))
        """
        cmd = [
            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
@@ -153,9 +147,9 @@ class StanfordParser(ParserI):
        :param sentence: Input sentence to parse
        :type sentence: str
-        :rtype: Tree
+        :rtype: iter(Tree)
        """
-        return self.raw_parse_sents((sentence,), verbose)
+        return next(self.raw_parse_sents([sentence], verbose))
    def raw_parse_sents(self, sentences, verbose=False):
        """
@@ -165,7 +159,7 @@ class StanfordParser(ParserI):
        :param sentences: Input sentences to parse
        :type sentences: list(str)
-        :rtype: list(Tree)
+        :rtype: iter(iter(Tree))
        """
        cmd = [
            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
@@ -183,9 +177,9 @@ class StanfordParser(ParserI):
        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
-        :rtype: Tree
+        :rtype: iter(Tree)
        """
-        return self.tagged_parse_sents([sentence], verbose)[0]
+        return next(self.tagged_parse_sents([sentence], verbose))
    def tagged_parse_sents(self, sentences, verbose=False):
        """
@@ -195,7 +189,7 @@ class StanfordParser(ParserI):
        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
-        :rtype: Tree
+        :rtype: iter(iter(Tree))
        """
        tag_separator = '/'
        cmd = [

--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -6,6 +6,9 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 import tempfile
 import pickle
@@ -23,6 +26,7 @@ except ImportError:
 from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
 class Configuration(object):
    """
    Class for holding configuration which is the partial analysis of the input sentence.
@@ -44,9 +48,7 @@ class Configuration(object):
        """
        # dep_graph.nodes contain list of token for a sentence
        self.stack = [0]  # The root element
-        self.buffer = range(
+        self.buffer = list(range(1, len(dep_graph.nodes)))  # The rest is in the buffer
-            1, len(
-                dep_graph.nodes))  # The rest is in the buffer
        self.arcs = []  # empty set of arc
        self._tokens = dep_graph.nodes
        self._max_address = len(self.buffer)

--- a/nltk/probability.py
+++ b/nltk/probability.py