Merge branch 'develop' into parseri

04f8a9b6 · Steven Bird · 0d36f00f · b4a7085d · 04f8a9b6 · 04f8a9b6
Commit 04f8a9b6 authored Feb 13, 2015 by Steven Bird
15 changed files
--- a/ChangeLog
+++ b/ChangeLog
+Version 3.0.2 2015-02-08
+* make pretty-printing method names consistent
+* improvements to Portuguese stemmer
+* transition-based dependency parsers
+* code clean ups, minor bug fixes
+Thanks to the following contributors to 3.0.2:
+Long Duong, Saimadhav Heblikar, Helder, Denis Krusko,
+Felipe Madrigal, Dmitrijs Milajevs, Nathan Schneider,
+0ssifrage, kiwipi.
+Version 3.0.1 2015-01-12
+* fix setup.py for new version of setuptools
 Version 3.0.0 2014-09-07
 * minor bugfixes
 * added phrase extraction code by Liling Tan and Fredrik Hedman

--- a/nltk/VERSION
+++ b/nltk/VERSION
-3.0.0
+3.0.2
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -147,6 +147,8 @@ movie_reviews = LazyCorpusLoader(
    encoding='ascii')
 names = LazyCorpusLoader(
    'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
+nkjp = LazyCorpusLoader(
+    'nkjp', NKJPCorpusReader, r'', encoding='utf8')
 nps_chat = LazyCorpusLoader(
    'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
 pl196x = LazyCorpusLoader(

--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -93,6 +93,7 @@ from nltk.corpus.reader.framenet import *
 from nltk.corpus.reader.udhr import *
 from nltk.corpus.reader.bnc import *
 from nltk.corpus.reader.sentiwordnet import *
+from nltk.corpus.reader.nkjp import *
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -127,5 +128,6 @@ __all__ = [
    'CHILDESCorpusReader', 'AlignedCorpusReader',
    'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
    'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
-    'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset'
+    'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
+    'NKJPCorpusReader'
 ]
--- a/nltk/corpus/reader/nkjp.py
+++ b/nltk/corpus/reader/nkjp.py
--- a/nltk/parse/__init__.py
+++ b/nltk/parse/__init__.py
@@ -76,3 +76,4 @@ from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyPar
                                                      ProbabilisticNonprojectiveParser)
 from nltk.parse.malt import MaltParser
 from nltk.parse.evaluate import DependencyEvaluator
+from nltk.parse.transitionparser import TransitionParser
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -102,7 +102,7 @@ class DependencyGraph(object):
        self.nodes[head_address]['deps'].setdefault(relation,[])
        self.nodes[head_address]['deps'][relation].append(mod_address)
        #self.nodes[head_address]['deps'].append(mod_address)
    def connect_graph(self):
        """
@@ -113,7 +113,7 @@ class DependencyGraph(object):
            for node2 in self.nodes.values():
                if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
                    relation = node2['rel']
-                    node1['deps'].setdefault(relation,[]) 
+                    node1['deps'].setdefault(relation, [])
                    node1['deps'][relation].append(node2['address'])
                    #node1['deps'].append(node2['address'])
@@ -214,17 +214,21 @@ class DependencyGraph(object):
        lines = (l.rstrip() for l in input_)
        lines = (l for l in lines if l)
+        cell_number = None
        for index, line in enumerate(lines, start=1):
            cells = line.split(cell_separator)
-            nrCells = len(cells)
+            if cell_number is None:
+                cell_number = len(cells)
+            else:
+                assert cell_number == len(cells)
            if cell_extractor is None:
                try:
-                    cell_extractor = extractors[nrCells]
+                    cell_extractor = extractors[cell_number]
                except KeyError:
                    raise ValueError(
                        'Number of tab-delimited fields ({0}) not supported by '
-                        'CoNLL(10) or Malt-Tab(4) format'.format(nrCells)
+                        'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
                    )
            word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
@@ -246,6 +250,9 @@ class DependencyGraph(object):
                }
            )
+            # Make sure that the fake root node has labeled dependencies.
+            if (cell_number == 3) and (head == 0):
+                rel = 'ROOT'
            self.nodes[head]['deps'][rel].append(index)
        if not self.nodes[0]['deps']['ROOT']:
@@ -271,7 +278,7 @@ class DependencyGraph(object):
        """
        node = self.get_by_address(i)
        word = node['word']
-        deps = list(chain.from_iterable(node['deps'].values()))
+        deps = sorted(chain.from_iterable(node['deps'].values()))
        if deps:
            return Tree(word, [self._tree(dep) for dep in deps])
@@ -286,7 +293,7 @@ class DependencyGraph(object):
        node = self.root
        word = node['word']
-        deps = chain.from_iterable(node['deps'].values())
+        deps = sorted(chain.from_iterable(node['deps'].values()))
        return Tree(word, [self._tree(dep) for dep in deps])
    def triples(self, node=None):
@@ -299,7 +306,7 @@ class DependencyGraph(object):
            node = self.root
        head = (node['word'], node['ctag'])
-        for i in node['deps']:
+        for i in sorted(chain.from_iterable(node['deps'].values())):
            dep = self.get_by_address(i)
            yield (head, dep['rel'], (dep['word'], dep['ctag']))
            for triple in self.triples(node=dep):

--- a/nltk/parse/nonprojectivedependencyparser.py
+++ b/nltk/parse/nonprojectivedependencyparser.py
@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object):
                }
            )
        #print (g_graph.nodes)
        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()
        original_graph = DependencyGraph()
@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object):
        logger.debug('Betas: %s', betas)
        for node in original_graph.nodes.values():
-            # deps must be a dictionary 
+            # TODO: It's dangerous to assume that deps is a dictionary
-            #node['deps'] = []
+            # because it's a default dictionary. Ideally, here we should not
+            # be concerned how dependencies are stored inside of a dependency
+            # graph.
            node['deps'] = {}
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])
@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object):
        # Filter parses
        # ensure 1 root, every thing has 1 head
        for analysis in analyses:
-            root_count = 0
+            if analysis.count(-1) > 1:
-            root = []
+                # there are several root elements!
-            for i, cell in enumerate(analysis):
+                continue
-                if cell == -1:
-                    root_count += 1
+            graph = DependencyGraph()
-                    root = i
+            graph.root = graph.nodes[analysis.index(-1) + 1]
-            if root_count == 1:
-                graph = DependencyGraph()
+            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
-                graph.nodes[0]['deps'] = root + 1
+                head_address = head_index + 1
-                for i in range(len(tokens)):
-                    node = {'word': tokens[i], 'address': i+1}
+                node = graph.nodes[address]
-                    node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
+                node.update(
-                    graph.nodes[i + 1] = node
+                    {
-#               cycle = graph.contains_cycle()
+                        'word': token,
-#               if not cycle:
+                        'address': address,
-                yield graph
+                    }
+                )
+                if head_address == 0:
+                    rel = 'ROOT'
+                else:
+                    rel = ''
+                graph.nodes[head_index + 1]['deps'][rel].append(address)
+            # TODO: check for cycles
+            yield graph
 #################################################################

--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -8,16 +8,16 @@
 import tempfile
 import pickle
-import os
-import copy
+from os import remove
-import operator
+from copy import deepcopy
-from nltk.parse.api import ParserI
+from operator import itemgetter
-import scipy.sparse as sparse
+from scipy import sparse
-import numpy as np
+from numpy import array
 from sklearn.datasets import load_svmlight_file
 from sklearn import svm
-from nltk.parse import DependencyGraph
-from evaluate import DependencyEvaluator
+from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
 class Configuration(object):
@@ -522,7 +522,7 @@ class TransitionParser(ParserI):
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
-            os.remove(input_file.name)
+            remove(input_file.name)
    def parse(self, depgraphs, modelFile):
        """
@@ -549,9 +549,9 @@ class TransitionParser(ParserI):
                        col.append(self._dictionary[feature])
                        row.append(0)
                        data.append(1.0)
-                np_col = np.array(sorted(col))  # NB : index must be sorted
+                np_col = array(sorted(col))  # NB : index must be sorted
-                np_row = np.array(row)
+                np_row = array(row)
-                np_data = np.array(data)
+                np_data = array(data)
                x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary)))
@@ -570,7 +570,7 @@ class TransitionParser(ParserI):
                #           votes[j] +=1
                #        k +=1
                # Sort votes according to the values
-                #sorted_votes = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
+                #sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
                # We will use predict_proba instead of decision_function
                prob_dict = {}
@@ -579,7 +579,7 @@ class TransitionParser(ParserI):
                    prob_dict[i] = pred_prob[i]
                sorted_Prob = sorted(
                    prob_dict.items(),
-                    key=operator.itemgetter(1),
+                    key=itemgetter(1),
                    reverse=True)
                # Note that SHIFT is always a valid operation
@@ -609,7 +609,7 @@ class TransitionParser(ParserI):
            # Finished with operations; build the dependency graph from Conf.arcs
-            new_depgraph = copy.deepcopy(depgraph)
+            new_depgraph = deepcopy(depgraph)
            for key in new_depgraph.nodes:
                node = new_depgraph.nodes[key]
                node['rel'] = ''
@@ -727,7 +727,7 @@ def demo():
     Number of training examples : 1
     Number of valid (projective) examples : 1
    ...
-    >>> os.remove(input_file.name)
+    >>> remove(input_file.name)
    B. Check the ARC-EAGER training
@@ -743,7 +743,7 @@ def demo():
     Number of valid (projective) examples : 1
    ...
-    >>> os.remove(input_file.name)
+    >>> remove(input_file.name)
    ###################### Check The Parsing Function ########################

--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
--- a/nltk/test/dependency.doctest
+++ b/nltk/test/dependency.doctest
@@ -35,30 +35,33 @@ CoNLL Data
    ... .       .       9       VMOD
    ... """
    >>> dg = DependencyGraph(treebank_data)
-    >>> print(dg.tree().pprint())
+    >>> dg.tree().pprint()
    (will
      (Vinken Pierre , (old (years 61)) ,)
      (join (board the) (as (director a nonexecutive)) (Nov. 29) .))
-    >>> print(list(dg.triples()))
+    >>> for head, rel, dep in dg.triples():
-    [((u'will', u'MD'), u'SUB', (u'Vinken', u'NNP')),
+    ...     print(
-     ((u'Vinken', u'NNP'), u'NMOD', (u'Pierre', u'NNP')),
+    ...         '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
-     ((u'Vinken', u'NNP'), u'P', (u',', u',')),
+    ...         .format(h=head, r=rel, d=dep)
-     ((u'Vinken', u'NNP'), u'NMOD', (u'old', u'JJ')),
+    ...     )
-     ((u'old', u'JJ'), u'AMOD', (u'years', u'NNS')),
+    (will, MD), SUB, (Vinken, NNP)
-     ((u'years', u'NNS'), u'NMOD', (u'61', u'CD')),
+    (Vinken, NNP), NMOD, (Pierre, NNP)
-     ((u'Vinken', u'NNP'), u'P', (u',', u',')),
+    (Vinken, NNP), P, (,, ,)
-     ((u'will', u'MD'), u'VC', (u'join', u'VB')),
+    (Vinken, NNP), NMOD, (old, JJ)
-     ((u'join', u'VB'), u'OBJ', (u'board', u'NN')),
+    (old, JJ), AMOD, (years, NNS)
-     ((u'board', u'NN'), u'NMOD', (u'the', u'DT')),
+    (years, NNS), NMOD, (61, CD)
-     ((u'join', u'VB'), u'VMOD', (u'as', u'IN')),
+    (Vinken, NNP), P, (,, ,)
-     ((u'as', u'IN'), u'PMOD', (u'director', u'NN')),
+    (will, MD), VC, (join, VB)
-     ((u'director', u'NN'), u'NMOD', (u'a', u'DT')),
+    (join, VB), OBJ, (board, NN)
-     ((u'director', u'NN'), u'NMOD', (u'nonexecutive', u'JJ')),
+    (board, NN), NMOD, (the, DT)
-     ((u'join', u'VB'), u'VMOD', (u'Nov.', u'NNP')),
+    (join, VB), VMOD, (as, IN)
-     ((u'Nov.', u'NNP'), u'NMOD', (u'29', u'CD')),
+    (as, IN), PMOD, (director, NN)
-     ((u'join', u'VB'), u'VMOD', (u'.', u'.'))]
+    (director, NN), NMOD, (a, DT)
+    (director, NN), NMOD, (nonexecutive, JJ)
+    (join, VB), VMOD, (Nov., NNP)
+    (Nov., NNP), NMOD, (29, CD)
+    (join, VB), VMOD, (., .)
 Using the dependency-parsed version of the Penn Treebank corpus sample.
@@ -159,21 +162,22 @@ Non-Projective Dependency Parsing
      'dog' -> 'his'
    >>> dp = NonprojectiveDependencyParser(grammar)
-    >>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
+    >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
-    ...     print(g)  # doctest: +NORMALIZE_WHITESPACE
-    {0: {'address': 0,
+    >>> print(g.root['word'])
-         'ctag': 'TOP',
+    taught
-         'deps': 3,
-         'feats': None,
+    >>> for _, node in sorted(g.nodes.items()):
-         'lemma': None,
+    ...     if node['word'] is not None:
-         'rel': 'TOP',
+    ...         print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
-         'tag': 'TOP',
+    1 the: []
-         'word': None},
+    2 man: [1]
-     1: {'address': 1, 'deps': [], 'word': 'the'},
+    3 taught: [2, 7]
-     2: {'address': 2, 'deps': [1], 'word': 'man'},
+    4 his: []
-     3: {'address': 3, 'deps': [2, 7], 'word': 'taught'},
+    5 dog: [4]
-     4: {'address': 4, 'deps': [], 'word': 'his'},
+    6 to: []
-     5: {'address': 5, 'deps': [4], 'word': 'dog'},
+    7 play: [5, 6, 8]
-     6: {'address': 6, 'deps': [], 'word': 'to'},
+    8 golf: []
-     7: {'address': 7, 'deps': [5, 6, 8], 'word': 'play'},
-     8: {'address': 8, 'deps': [], 'word': 'golf'}}
+    >>> print(g.tree())
+    (taught (man the) (play (dog his) to golf))
--- a/tox.ini
+++ b/tox.ini
 [tox]
-envlist = py26,py27,py32,py33,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py26-jenkins,py32-jenkins
+envlist = py26,py27,py32,py33,py34,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py34-nodeps,py26-jenkins,py32-jenkins,py34-jenkins
 [testenv]
@@ -63,6 +63,20 @@ commands =
    ; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
    python runtests.py []
+[testenv:py34]
+deps =
+    numpy
+    nose >= 1.2.1
+    coverage
+    text-unidecode
+commands =
+    ; scipy and scikit-learn require numpy even to run setup.py so
+    ; they can't be installed in one command
+    pip install --download-cache={toxworkdir}/_download scipy scikit-learn
+    ; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
+    python runtests.py []
 [testenv:py26-nodeps]
 basepython = python2.6
@@ -84,6 +98,11 @@ basepython = python3.3
 deps = nose >= 1.2.1
 commands = python runtests.py []
+[testenv:py34-nodeps]
+basepython = python3.4
+deps = nose >= 1.2.1
+commands = python runtests.py []
 [testenv:py26-jenkins]
 basepython = python2.6
 commands = {toxinidir}/jenkins.sh
@@ -99,3 +118,11 @@ setenv =
 	STANFORD_MODELS = {homedir}/third/stanford-parser/
 	STANFORD_PARSER = {homedir}/third/stanford-parser/
 	STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
+[testenv:py34-jenkins]
+basepython = python3.4
+commands = {toxinidir}/jenkins.sh
+setenv =
+	STANFORD_MODELS = {homedir}/third/stanford-parser/
+	STANFORD_PARSER = {homedir}/third/stanford-parser/
+	STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
--- a/web/conf.py
+++ b/web/conf.py
@@ -44,7 +44,7 @@ master_doc = 'index'
 # General information about the project.
 project = 'NLTK'
-copyright = '2013, NLTK Project'
+copyright = '2015, NLTK Project'
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the

--- a/web/contribute.rst
+++ b/web/contribute.rst
@@ -5,14 +5,14 @@ The Natural Language Toolkit exists thanks to the efforts of dozens
 of voluntary developers who have contributed functionality and
 bugfixes since the project began in 2000 (`contributors <https://github.com/nltk/nltk#contributing>`_).
-In 2014 we are especially keen to improve NLTK coverage for:
+In 2015 we are especially keen to improve NLTK coverage for:
 `dependency parsing <https://github.com/nltk/nltk/wiki/Dependency-Parsing>`_,
 `machine translation <https://github.com/nltk/nltk/wiki/Machine-Translation>`_,
 `sentiment analysis <https://github.com/nltk/nltk/wiki/Sentiment-Analysis>`_,
 `twitter processing <https://github.com/nltk/nltk/wiki/Twitter-Processing>`_.
 New material in these areas will be covered in the second edition of
-the NLTK book, anticipated in 2015.
+the NLTK book, anticipated in early 2016.
 * `desired enhancements <https://github.com/nltk/nltk/issues?labels=enhancement&page=1&state=open>`_
 * `contribute a corpus <https://github.com/nltk/nltk/wiki/Adding-a-Corpus>`_
@@ -29,7 +29,6 @@ Individual packages are maintained by the following people:
 :Parsing: `Peter Ljunglöf <http://www.cse.chalmers.se/~peb/>`_, Gothenburg, Sweden (``nltk.parse, nltk.featstruct``)
 :Metrics: `Joel Nothman <http://joelnothman.com/>`_, Sydney, Australia (``nltk.metrics, nltk.tokenize.punkt``)
 :Python 3: `Mikhail Korobov <http://kmike.ru/>`_, Ekaterinburg, Russia
-:Integration: `Morten Minde Neergaard <http://8d.no/>`_, Oslo, Norway
 :Releases: `Steven Bird <http://estive.net>`_, Melbourne, Australia

--- a/web/news.rst
+++ b/web/news.rst
 NLTK News
 =========
+2015
+----
+NLTK 3.0.1 released : January 2015
+   Minor packaging update.
+2014
+----
 NLTK 3.0.0 released : September 2014
   Minor bugfixes. For full details see:
   https://github.com/nltk/nltk/blob/develop/ChangeLog
@@ -26,6 +35,9 @@ NLTK 3.0a4 released : June 2014
   https://github.com/nltk/nltk/blob/develop/ChangeLog
   http://nltk.org/nltk3-alpha/
+2013
+----
 NLTK Book Updates : October 2013
   We are updating the NLTK book for Python 3 and NLTK 3; please see
   http://nltk.org/book3/