Commit 04f8a9b6 by Steven Bird

Merge branch 'develop' into parseri

parents 0d36f00f b4a7085d
Version 3.0.2 2015-02-08
* make pretty-printing method names consistent
* improvements to Portuguese stemmer
* transition-based dependency parsers
* code cleanups, minor bug fixes
Thanks to the following contributors to 3.0.2:
Long Duong, Saimadhav Heblikar, Helder, Denis Krusko,
Felipe Madrigal, Dmitrijs Milajevs, Nathan Schneider,
0ssifrage, kiwipi.
Version 3.0.1 2015-01-12
* fix setup.py for new version of setuptools
Version 3.0.0 2014-09-07
* minor bugfixes
* added phrase extraction code by Liling Tan and Fredrik Hedman
......
......@@ -147,6 +147,8 @@ movie_reviews = LazyCorpusLoader(
encoding='ascii')
names = LazyCorpusLoader(
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
nkjp = LazyCorpusLoader(
'nkjp', NKJPCorpusReader, r'', encoding='utf8')
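For orientation, a minimal usage sketch for the newly registered loader; this is not part of the commit, and it assumes the 'nkjp' corpus data is installed under nltk_data and that NKJPCorpusReader exposes the usual words() reader method:
    # Hedged sketch only: requires the NKJP corpus to be installed locally.
    from nltk.corpus import nkjp      # resolved lazily by LazyCorpusLoader
    words = nkjp.words()              # assumes the standard words() interface
    print(words[:10])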
nps_chat = LazyCorpusLoader(
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
pl196x = LazyCorpusLoader(
......
......@@ -93,6 +93,7 @@ from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.nkjp import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
......@@ -127,5 +128,6 @@ __all__ = [
'CHILDESCorpusReader', 'AlignedCorpusReader',
'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset'
'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
'NKJPCorpusReader'
]
......@@ -76,3 +76,4 @@ from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyPar
ProbabilisticNonprojectiveParser)
from nltk.parse.malt import MaltParser
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser
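A brief sketch of how the newly exported TransitionParser is driven, based on the train/parse signatures visible later in this diff; the tiny Malt-TAB string and the model path are placeholders (a real treebank is needed for useful training), and the algorithm names follow the module's demo:
    from nltk.parse import DependencyGraph
    from nltk.parse.transitionparser import TransitionParser

    # One toy gold parse in 4-field Malt-TAB format: word, tag, head, relation.
    gold = [DependencyGraph("""the DT 2 DET
    dog NN 3 SUB
    barks VB 0 ROOT
    """)]
    parser = TransitionParser('arc-standard')       # 'arc-eager' is the alternative
    parser.train(gold, 'temp.arcstd.model')         # pickles an SVM model to disk
    parsed = parser.parse(gold, 'temp.arcstd.model')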
......@@ -102,7 +102,7 @@ class DependencyGraph(object):
self.nodes[head_address]['deps'].setdefault(relation,[])
self.nodes[head_address]['deps'][relation].append(mod_address)
#self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self):
"""
......@@ -113,7 +113,7 @@ class DependencyGraph(object):
for node2 in self.nodes.values():
if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
relation = node2['rel']
node1['deps'].setdefault(relation,[])
node1['deps'].setdefault(relation, [])
node1['deps'][relation].append(node2['address'])
#node1['deps'].append(node2['address'])
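The setdefault calls above reflect the new shape of 'deps': a mapping from relation label to a list of dependent addresses rather than a flat address list. An illustrative sketch (the node values are hypothetical, not taken from the diff):
    # Old shape:  node['deps'] = [2, 7]
    # New shape:  node['deps'] = {'NMOD': [2], 'VMOD': [7]}
    node = {'address': 3, 'word': 'taught', 'deps': {}}
    node['deps'].setdefault('NMOD', [])
    node['deps']['NMOD'].append(2)    # attach the dependent at address 2 under NMOD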
......@@ -214,17 +214,21 @@ class DependencyGraph(object):
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
cell_number = None
for index, line in enumerate(lines, start=1):
cells = line.split(cell_separator)
nrCells = len(cells)
if cell_number is None:
cell_number = len(cells)
else:
assert cell_number == len(cells)
if cell_extractor is None:
try:
cell_extractor = extractors[nrCells]
cell_extractor = extractors[cell_number]
except KeyError:
raise ValueError(
'Number of tab-delimited fields ({0}) not supported by '
'CoNLL(10) or Malt-Tab(4) format'.format(nrCells)
'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
)
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
......@@ -246,6 +250,9 @@ class DependencyGraph(object):
}
)
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = 'ROOT'
self.nodes[head]['deps'][rel].append(index)
if not self.nodes[0]['deps']['ROOT']:
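The hunk above checks that every non-blank line has the same number of fields before choosing a cell extractor, and labels arcs to the artificial root as 'ROOT' in the unlabelled 3-field case. A standalone sketch of the field-count check, not the module's exact code:
    def check_field_count(lines, cell_separator=None):
        """Return the constant per-line field count, or raise if it varies."""
        cell_number = None
        for line in lines:
            cells = line.split(cell_separator)
            if cell_number is None:
                cell_number = len(cells)
            elif cell_number != len(cells):
                raise ValueError('inconsistent field count: expected {0}, got {1}'
                                 .format(cell_number, len(cells)))
        return cell_number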
......@@ -271,7 +278,7 @@ class DependencyGraph(object):
"""
node = self.get_by_address(i)
word = node['word']
deps = list(chain.from_iterable(node['deps'].values()))
deps = sorted(chain.from_iterable(node['deps'].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
......@@ -286,7 +293,7 @@ class DependencyGraph(object):
node = self.root
word = node['word']
deps = chain.from_iterable(node['deps'].values())
deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
......@@ -299,7 +306,7 @@ class DependencyGraph(object):
node = self.root
head = (node['word'], node['ctag'])
for i in node['deps']:
for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i)
yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep):
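Because 'deps' is now a dict of lists, concatenating its values gives no meaningful order across relations, so tree() and triples() now sort dependents by address for deterministic output. A tiny sketch with a hypothetical deps value:
    from itertools import chain

    deps = {'NMOD': [2], 'P': [4], 'SUB': [1]}          # hypothetical node['deps']
    print(list(chain.from_iterable(deps.values())))     # order depends on the dict
    print(sorted(chain.from_iterable(deps.values())))   # [1, 2, 4], deterministic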
......
......@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object):
}
)
#print (g_graph.nodes)
# Fully connect non-root nodes in g_graph
g_graph.connect_graph()
original_graph = DependencyGraph()
......@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object):
logger.debug('Betas: %s', betas)
for node in original_graph.nodes.values():
# deps must be a dictionary
#node['deps'] = []
# TODO: It's dangerous to assume that deps is a dictionary
# because it's a default dictionary. Ideally, here we should not
# be concerned with how dependencies are stored inside of a dependency
# graph.
node['deps'] = {}
for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1])
......@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object):
# Filter parses
# ensure there is exactly one root and that everything has exactly one head
for analysis in analyses:
root_count = 0
root = []
for i, cell in enumerate(analysis):
if cell == -1:
root_count += 1
root = i
if root_count == 1:
graph = DependencyGraph()
graph.nodes[0]['deps'] = root + 1
for i in range(len(tokens)):
node = {'word': tokens[i], 'address': i+1}
node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
graph.nodes[i + 1] = node
# cycle = graph.contains_cycle()
# if not cycle:
yield graph
if analysis.count(-1) > 1:
# there are several root elements!
continue
graph = DependencyGraph()
graph.root = graph.nodes[analysis.index(-1) + 1]
for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
head_address = head_index + 1
node = graph.nodes[address]
node.update(
{
'word': token,
'address': address,
}
)
if head_address == 0:
rel = 'ROOT'
else:
rel = ''
graph.nodes[head_index + 1]['deps'][rel].append(address)
# TODO: check for cycles
yield graph
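For reference, the head-index encoding consumed by this loop: analysis[i] is the 0-based index of the head of token i, with -1 marking the root, and parses containing more than one -1 are skipped. A small illustrative example (values are hypothetical):
    tokens = ['the', 'man', 'walks']
    analysis = [1, 2, -1]                  # 'the' <- 'man' <- 'walks' (root)
    assert analysis.count(-1) == 1         # exactly one root passes the filter
    root_address = analysis.index(-1) + 1  # graph addresses are 1-based, so 3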
#################################################################
......
......@@ -8,16 +8,16 @@
import tempfile
import pickle
import os
import copy
import operator
from nltk.parse.api import ParserI
import scipy.sparse as sparse
import numpy as np
from os import remove
from copy import deepcopy
from operator import itemgetter
from scipy import sparse
from numpy import array
from sklearn.datasets import load_svmlight_file
from sklearn import svm
from nltk.parse import DependencyGraph
from evaluate import DependencyEvaluator
from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
class Configuration(object):
......@@ -522,7 +522,7 @@ class TransitionParser(ParserI):
# Save the model to the model file (as a pickle)
pickle.dump(model, open(modelfile, 'wb'))
finally:
os.remove(input_file.name)
remove(input_file.name)
def parse(self, depgraphs, modelFile):
"""
......@@ -549,9 +549,9 @@ class TransitionParser(ParserI):
col.append(self._dictionary[feature])
row.append(0)
data.append(1.0)
np_col = np.array(sorted(col)) # NB : index must be sorted
np_row = np.array(row)
np_data = np.array(data)
np_col = array(sorted(col)) # NB : index must be sorted
np_row = array(row)
np_data = array(data)
x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary)))
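A self-contained sketch of the one-row sparse feature vector built above; the column indices and vector width are illustrative, and the indices are sorted to match the NB note in the code:
    from numpy import array
    from scipy import sparse

    col = sorted([7, 2, 5])        # feature indices, sorted per the NB above
    row = [0] * len(col)           # everything lives in row 0
    data = [1.0] * len(col)        # binary (one-hot) features
    x_test = sparse.csr_matrix((array(data), (array(row), array(col))),
                               shape=(1, 10))
    print(x_test.toarray())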
......@@ -570,7 +570,7 @@ class TransitionParser(ParserI):
# votes[j] +=1
# k +=1
# Sort votes according to the values
#sorted_votes = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
#sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
# We will use predict_proba instead of decision_function
prob_dict = {}
......@@ -579,7 +579,7 @@ class TransitionParser(ParserI):
prob_dict[i] = pred_prob[i]
sorted_Prob = sorted(
prob_dict.items(),
key=operator.itemgetter(1),
key=itemgetter(1),
reverse=True)
# Note that SHIFT is always a valid operation
......@@ -609,7 +609,7 @@ class TransitionParser(ParserI):
# Finished with operations; build the dependency graph from Conf.arcs
new_depgraph = copy.deepcopy(depgraph)
new_depgraph = deepcopy(depgraph)
for key in new_depgraph.nodes:
node = new_depgraph.nodes[key]
node['rel'] = ''
......@@ -727,7 +727,7 @@ def demo():
Number of training examples : 1
Number of valid (projective) examples : 1
...
>>> os.remove(input_file.name)
>>> remove(input_file.name)
B. Check the ARC-EAGER training
......@@ -743,7 +743,7 @@ def demo():
Number of valid (projective) examples : 1
...
>>> os.remove(input_file.name)
>>> remove(input_file.name)
###################### Check The Parsing Function ########################
......
......@@ -35,30 +35,33 @@ CoNLL Data
... . . 9 VMOD
... """
>>> dg = DependencyGraph(treebank_data)
>>> print(dg.tree().pprint())
>>> dg.tree().pprint()
(will
(Vinken Pierre , (old (years 61)) ,)
(join (board the) (as (director a nonexecutive)) (Nov. 29) .))
>>> print(list(dg.triples()))
[((u'will', u'MD'), u'SUB', (u'Vinken', u'NNP')),
((u'Vinken', u'NNP'), u'NMOD', (u'Pierre', u'NNP')),
((u'Vinken', u'NNP'), u'P', (u',', u',')),
((u'Vinken', u'NNP'), u'NMOD', (u'old', u'JJ')),
((u'old', u'JJ'), u'AMOD', (u'years', u'NNS')),
((u'years', u'NNS'), u'NMOD', (u'61', u'CD')),
((u'Vinken', u'NNP'), u'P', (u',', u',')),
((u'will', u'MD'), u'VC', (u'join', u'VB')),
((u'join', u'VB'), u'OBJ', (u'board', u'NN')),
((u'board', u'NN'), u'NMOD', (u'the', u'DT')),
((u'join', u'VB'), u'VMOD', (u'as', u'IN')),
((u'as', u'IN'), u'PMOD', (u'director', u'NN')),
((u'director', u'NN'), u'NMOD', (u'a', u'DT')),
((u'director', u'NN'), u'NMOD', (u'nonexecutive', u'JJ')),
((u'join', u'VB'), u'VMOD', (u'Nov.', u'NNP')),
((u'Nov.', u'NNP'), u'NMOD', (u'29', u'CD')),
((u'join', u'VB'), u'VMOD', (u'.', u'.'))]
>>> for head, rel, dep in dg.triples():
... print(
... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
... .format(h=head, r=rel, d=dep)
... )
(will, MD), SUB, (Vinken, NNP)
(Vinken, NNP), NMOD, (Pierre, NNP)
(Vinken, NNP), P, (,, ,)
(Vinken, NNP), NMOD, (old, JJ)
(old, JJ), AMOD, (years, NNS)
(years, NNS), NMOD, (61, CD)
(Vinken, NNP), P, (,, ,)
(will, MD), VC, (join, VB)
(join, VB), OBJ, (board, NN)
(board, NN), NMOD, (the, DT)
(join, VB), VMOD, (as, IN)
(as, IN), PMOD, (director, NN)
(director, NN), NMOD, (a, DT)
(director, NN), NMOD, (nonexecutive, JJ)
(join, VB), VMOD, (Nov., NNP)
(Nov., NNP), NMOD, (29, CD)
(join, VB), VMOD, (., .)
Using the dependency-parsed version of the Penn Treebank corpus sample.
......@@ -159,21 +162,22 @@ Non-Projective Dependency Parsing
'dog' -> 'his'
>>> dp = NonprojectiveDependencyParser(grammar)
>>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
... print(g) # doctest: +NORMALIZE_WHITESPACE
{0: {'address': 0,
'ctag': 'TOP',
'deps': 3,
'feats': None,
'lemma': None,
'rel': 'TOP',
'tag': 'TOP',
'word': None},
1: {'address': 1, 'deps': [], 'word': 'the'},
2: {'address': 2, 'deps': [1], 'word': 'man'},
3: {'address': 3, 'deps': [2, 7], 'word': 'taught'},
4: {'address': 4, 'deps': [], 'word': 'his'},
5: {'address': 5, 'deps': [4], 'word': 'dog'},
6: {'address': 6, 'deps': [], 'word': 'to'},
7: {'address': 7, 'deps': [5, 6, 8], 'word': 'play'},
8: {'address': 8, 'deps': [], 'word': 'golf'}}
>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
>>> print(g.root['word'])
taught
>>> for _, node in sorted(g.nodes.items()):
... if node['word'] is not None:
... print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
1 the: []
2 man: [1]
3 taught: [2, 7]
4 his: []
5 dog: [4]
6 to: []
7 play: [5, 6, 8]
8 golf: []
>>> print(g.tree())
(taught (man the) (play (dog his) to golf))
......
[tox]
envlist = py26,py27,py32,py33,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py26-jenkins,py32-jenkins
envlist = py26,py27,py32,py33,py34,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py34-nodeps,py26-jenkins,py32-jenkins,py34-jenkins
[testenv]
......@@ -63,6 +63,20 @@ commands =
; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []
[testenv:py34]
deps =
numpy
nose >= 1.2.1
coverage
text-unidecode
commands =
; scipy and scikit-learn require numpy even to run setup.py, so
; they can't be installed in one command
pip install --download-cache={toxworkdir}/_download scipy scikit-learn
; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []
[testenv:py26-nodeps]
basepython = python2.6
......@@ -84,6 +98,11 @@ basepython = python3.3
deps = nose >= 1.2.1
commands = python runtests.py []
[testenv:py34-nodeps]
basepython = python3.4
deps = nose >= 1.2.1
commands = python runtests.py []
[testenv:py26-jenkins]
basepython = python2.6
commands = {toxinidir}/jenkins.sh
......@@ -99,3 +118,11 @@ setenv =
STANFORD_MODELS = {homedir}/third/stanford-parser/
STANFORD_PARSER = {homedir}/third/stanford-parser/
STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
[testenv:py34-jenkins]
basepython = python3.4
commands = {toxinidir}/jenkins.sh
setenv =
STANFORD_MODELS = {homedir}/third/stanford-parser/
STANFORD_PARSER = {homedir}/third/stanford-parser/
STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
......@@ -44,7 +44,7 @@ master_doc = 'index'
# General information about the project.
project = 'NLTK'
copyright = '2013, NLTK Project'
copyright = '2015, NLTK Project'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
......
......@@ -5,14 +5,14 @@ The Natural Language Toolkit exists thanks to the efforts of dozens
of volunteer developers who have contributed functionality and
bugfixes since the project began in 2000 (`contributors <https://github.com/nltk/nltk#contributing>`_).
In 2014 we are especially keen to improve NLTK coverage for:
In 2015 we are especially keen to improve NLTK coverage for:
`dependency parsing <https://github.com/nltk/nltk/wiki/Dependency-Parsing>`_,
`machine translation <https://github.com/nltk/nltk/wiki/Machine-Translation>`_,
`sentiment analysis <https://github.com/nltk/nltk/wiki/Sentiment-Analysis>`_,
`twitter processing <https://github.com/nltk/nltk/wiki/Twitter-Processing>`_.
New material in these areas will be covered in the second edition of
the NLTK book, anticipated in 2015.
the NLTK book, anticipated in early 2016.
* `desired enhancements <https://github.com/nltk/nltk/issues?labels=enhancement&page=1&state=open>`_
* `contribute a corpus <https://github.com/nltk/nltk/wiki/Adding-a-Corpus>`_
......@@ -29,7 +29,6 @@ Individual packages are maintained by the following people:
:Parsing: `Peter Ljunglöf <http://www.cse.chalmers.se/~peb/>`_, Gothenburg, Sweden (``nltk.parse, nltk.featstruct``)
:Metrics: `Joel Nothman <http://joelnothman.com/>`_, Sydney, Australia (``nltk.metrics, nltk.tokenize.punkt``)
:Python 3: `Mikhail Korobov <http://kmike.ru/>`_, Ekaterinburg, Russia
:Integration: `Morten Minde Neergaard <http://8d.no/>`_, Oslo, Norway
:Releases: `Steven Bird <http://estive.net>`_, Melbourne, Australia
......
NLTK News
=========
2015
----
NLTK 3.0.1 released : January 2015
Minor packaging update.
2014
----
NLTK 3.0.0 released : September 2014
Minor bugfixes. For full details see:
https://github.com/nltk/nltk/blob/develop/ChangeLog
......@@ -26,6 +35,9 @@ NLTK 3.0a4 released : June 2014
https://github.com/nltk/nltk/blob/develop/ChangeLog
http://nltk.org/nltk3-alpha/
2013
----
NLTK Book Updates : October 2013
We are updating the NLTK book for Python 3 and NLTK 3; please see
http://nltk.org/book3/
......