Commit cdcf6701 by Steven Bird

Merge branch 'parseri' into test

parents 8dcae605 04f8a9b6
Version 3.0.2 2015-02-08
* make pretty-printing method names consistent
* improvements to Portuguese stemmer
* transition-based dependency parsers
* code clean ups, minor bug fixes
Thanks to the following contributors to 3.0.2:
Long Duong, Saimadhav Heblikar, Helder, Denis Krusko,
Felipe Madrigal, Dmitrijs Milajevs, Nathan Schneider,
0ssifrage, kiwipi.
Version 3.0.1 2015-01-12
* fix setup.py for new version of setuptools
Version 3.0.0 2014-09-07
* minor bugfixes
* added phrase extraction code by Liling Tan and Fredrik Hedman
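
The pretty-printing rename called out in the 3.0.2 entry works roughly as follows (method names are taken from the nltk/tree.py and doctest hunks further down; the toy tree itself is only illustrative):

    from nltk import Tree

    t = Tree.fromstring('(S (NP the dog) (VP chased (NP the cat)))')
    t.pprint()        # now prints the bracketed form directly; pformat() returns the string
    t.pretty_print()  # the ASCII/Unicode art drawing formerly reached via pprint()
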
......
@@ -145,7 +145,7 @@ try:
except ImportError:
pass
else:
-from nltk import cluster; from .cluster import *
+from nltk import cluster
from nltk.downloader import download, download_shell
try:
......
@@ -548,7 +548,7 @@ def demo():
s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
import nltk
t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
-print(t.pprint())
+t.pprint()
print()
s = """
@@ -582,7 +582,7 @@ better JJR I-ADJP
"""
conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
-print(conll_tree.pprint())
+conll_tree.pprint()
# Demonstrate CoNLL output
print("CoNLL output:")
......
@@ -147,6 +147,8 @@ movie_reviews = LazyCorpusLoader(
encoding='ascii')
names = LazyCorpusLoader(
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
+nkjp = LazyCorpusLoader(
+'nkjp', NKJPCorpusReader, r'', encoding='utf8')
nps_chat = LazyCorpusLoader(
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
pl196x = LazyCorpusLoader(
......
@@ -93,6 +93,7 @@ from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
+from nltk.corpus.reader.nkjp import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
@@ -127,5 +128,6 @@ __all__ = [
'CHILDESCorpusReader', 'AlignedCorpusReader',
'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
-'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset'
+'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
+'NKJPCorpusReader'
]
@@ -76,3 +76,4 @@ from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyPar
ProbabilisticNonprojectiveParser)
from nltk.parse.malt import MaltParser
from nltk.parse.evaluate import DependencyEvaluator
+from nltk.parse.transitionparser import TransitionParser
@@ -32,7 +32,7 @@ class ParserI(object):
"""
raise NotImplementedError()
-def parse(self, sent):
+def parse(self, sent, *args, **kwargs):
"""
:return: An iterator that generates parse trees for the sentence.
When possible this list is sorted from most likely to least likely.
@@ -42,25 +42,25 @@ class ParserI(object):
:rtype: iter(Tree)
"""
if overridden(self.parse_sents):
-return next(self.parse_sents([sent]))
+return next(self.parse_sents([sent], *args, **kwargs))
elif overridden(self.parse_one):
-return (tree for tree in [self.parse_one(sent)] if tree is not None)
+return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None)
elif overridden(self.parse_all):
-return iter(self.parse_all(sent))
+return iter(self.parse_all(sent, *args, **kwargs))
else:
raise NotImplementedError()
-def parse_sents(self, sents):
+def parse_sents(self, sents, *args, **kwargs):
"""
Apply ``self.parse()`` to each element of ``sents``.
:rtype: iter(iter(Tree))
"""
-return (self.parse(sent) for sent in sents)
+return (self.parse(sent, *args, **kwargs) for sent in sents)
-def parse_all(self, sent):
+def parse_all(self, sent, *args, **kwargs):
""":rtype: list(Tree)"""
-return list(self.parse(sent))
+return list(self.parse(sent, *args, **kwargs))
-def parse_one(self, sent):
+def parse_one(self, sent, *args, **kwargs):
""":rtype: Tree or None"""
-return next(self.parse(sent), None)
+return next(self.parse(sent, *args, **kwargs), None)
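
The net effect of the ParserI changes above: parse() is the one method a subclass must provide, it returns an iterator of Trees, extra positional and keyword arguments are forwarded, and parse_sents()/parse_all()/parse_one() are derived from it. A minimal sketch (FlatParser is a hypothetical subclass, for illustration only):

    from nltk import Tree
    from nltk.parse.api import ParserI

    class FlatParser(ParserI):
        def parse(self, sent, *args, **kwargs):
            # Yield one flat tree over the tokens; a real parser would yield
            # candidate Trees from most to least likely.
            yield Tree('S', list(sent))

    p = FlatParser()
    print(p.parse_one(['a', 'toy', 'sentence']))   # (S a toy sentence)
    print(p.parse_all(['a', 'toy', 'sentence']))   # [Tree('S', ['a', 'toy', 'sentence'])]
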
@@ -1346,9 +1346,9 @@ class ChartParser(ParserI):
# Return the final chart.
return chart
-def parse_all(self, tokens, tree_class=Tree):
+def parse(self, tokens, tree_class=Tree):
chart = self.chart_parse(tokens)
-return chart.parses(self._grammar.start(), tree_class=tree_class)
+return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
class TopDownChartParser(ChartParser):
"""
@@ -1628,9 +1628,8 @@ def demo(choice=None,
print()
cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
t = time.time()
-# parses = cp.parse_all(tokens)
+parses = cp.parse_all(tokens)
chart = cp.chart_parse(tokens)
-parses = list(chart.parses(grammar.start()))
times[strategies[strategy][0]] = time.time()-t
print("Nr edges in chart:", len(chart.edges()))
if numparses:
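
With the ChartParser change above, parse() returns an iterator of Trees and parse_all() falls back to the ParserI default. A small sketch with a toy grammar (the grammar is illustrative, not from this commit):

    import nltk

    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> 'I' | 'him'
    VP -> V NP
    V -> 'saw'
    """)
    parser = nltk.ChartParser(grammar)
    tree = next(parser.parse(['I', 'saw', 'him']))      # first parse from the iterator
    tree.pprint()                                        # (S (NP I) (VP (V saw) (NP him)))
    all_trees = parser.parse_all(['I', 'saw', 'him'])    # list of all parses, via ParserI
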
......
@@ -102,7 +102,7 @@ class DependencyGraph(object):
self.nodes[head_address]['deps'].setdefault(relation,[])
self.nodes[head_address]['deps'][relation].append(mod_address)
#self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self):
"""
@@ -113,7 +113,7 @@ class DependencyGraph(object):
for node2 in self.nodes.values():
if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
relation = node2['rel']
-node1['deps'].setdefault(relation,[])
+node1['deps'].setdefault(relation, [])
node1['deps'][relation].append(node2['address'])
#node1['deps'].append(node2['address'])
@@ -214,17 +214,21 @@ class DependencyGraph(object):
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
+cell_number = None
for index, line in enumerate(lines, start=1):
cells = line.split(cell_separator)
-nrCells = len(cells)
+if cell_number is None:
+cell_number = len(cells)
+else:
+assert cell_number == len(cells)
if cell_extractor is None:
try:
-cell_extractor = extractors[nrCells]
+cell_extractor = extractors[cell_number]
except KeyError:
raise ValueError(
'Number of tab-delimited fields ({0}) not supported by '
-'CoNLL(10) or Malt-Tab(4) format'.format(nrCells)
+'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
)
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
@@ -246,6 +250,9 @@ class DependencyGraph(object):
}
)
+# Make sure that he fake root node has labeled dependencies.
+if (cell_number == 3) and (head == 0):
+rel = 'ROOT'
self.nodes[head]['deps'][rel].append(index)
if not self.nodes[0]['deps']['ROOT']:
@@ -271,7 +278,7 @@
"""
node = self.get_by_address(i)
word = node['word']
-deps = list(chain.from_iterable(node['deps'].values()))
+deps = sorted(chain.from_iterable(node['deps'].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
@@ -286,7 +293,7 @@
node = self.root
word = node['word']
-deps = chain.from_iterable(node['deps'].values())
+deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
@@ -299,7 +306,7 @@
node = self.root
head = (node['word'], node['ctag'])
-for i in node['deps']:
+for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i)
yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep):
@@ -458,7 +465,7 @@ Nov. NNP 9 VMOD
. . 9 VMOD
""")
tree = dg.tree()
-print(tree.pprint())
+tree.pprint()
if nx:
# currently doesn't work
import networkx as NX
@@ -483,7 +490,7 @@ def conll_demo():
"""
dg = DependencyGraph(conll_data1)
tree = dg.tree()
-print(tree.pprint())
+tree.pprint()
print(dg)
print(dg.to_conll(4))
@@ -494,7 +501,8 @@ def conll_file_demo():
for entry in conll_data2.split('\n\n') if entry]
for graph in graphs:
tree = graph.tree()
-print('\n' + tree.pprint())
+print('\n')
+tree.pprint()
def cycle_finding_demo():
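
One consequence of the DependencyGraph changes above that the later doctests do not show: with three-column Malt-TAB input, a head of 0 now gets an explicit 'ROOT' relation, so the artificial root node ends up with labeled dependencies. A hedged sketch (assuming the three-column word/tag/head variant accepted by the extractors mentioned in the error message):

    from nltk.parse import DependencyGraph

    dg = DependencyGraph("""a DT 2
    man NN 3
    runs VBZ 0
    """)
    print(dg.nodes[0]['deps']['ROOT'])   # [3], the address of 'runs'
    dg.tree().pprint()                   # (runs (man a))
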
......
@@ -87,56 +87,19 @@ class MaltParser(ParserI):
url='http://www.maltparser.org/',
verbose=verbose)
-def parse_all(self, sentence, verbose=False):
-"""
-Use MaltParser to parse a sentence. Takes a sentence as a list of
-words; it will be automatically tagged with this MaltParser instance's
-tagger.
-:param sentence: Input sentence to parse
-:type sentence: list(str)
-:return: list(DependencyGraph)
-"""
-return self.parse_sents([sentence], verbose)
def parse_sents(self, sentences, verbose=False):
"""
-Use MaltParser to parse multiple sentence. Takes multiple sentences as a
+Use MaltParser to parse multiple sentences. Takes multiple sentences as a
list where each sentence is a list of words.
Each sentence will be automatically tagged with this MaltParser instance's
tagger.
:param sentences: Input sentences to parse
:type sentence: list(list(str))
-:return: list(DependencyGraph)
+:return: iter(DependencyGraph)
"""
tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
-return self.tagged_parse_sents(tagged_sentences, verbose)
+return iter(self.tagged_parse_sents(tagged_sentences, verbose))
-def parse(self, sentence, verbose=False):
-"""
-Use MaltParser to parse a sentence. Takes a sentence as a list of words.
-The sentence will be automatically tagged with this MaltParser instance's
-tagger.
-:param sentence: Input sentence to parse
-:type sentence: list(str)
-:return: ``DependencyGraph`` the dependency graph representation of the sentence
-"""
-return self.parse_sents([sentence], verbose)[0]
-def raw_parse(self, sentence, verbose=False):
-"""
-Use MaltParser to parse a sentence. Takes a sentence as a string;
-before parsing, it will be automatically tokenized and tagged with this
-MaltParser instance's tagger.
-:param sentence: Input sentence to parse
-:type sentence: str
-:return: list(DependencyGraph)
-"""
-words = word_tokenize(sentence)
-return self.parse(words, verbose)
def tagged_parse(self, sentence, verbose=False):
"""
@@ -158,7 +121,7 @@ class MaltParser(ParserI):
:param sentences: Input sentences to parse
:type sentence: list(list(tuple(str, str)))
-:return: list(``DependencyGraph``) the dependency graph representation
+:return: iter(iter(``DependencyGraph``)) the dependency graph representation
of each sentence
"""
@@ -193,7 +156,7 @@
raise Exception("MaltParser parsing (%s) failed with exit "
"code %d" % (' '.join(cmd), ret))
-return DependencyGraph.load(output_file.name)
+return iter(DependencyGraph.load(output_file.name))
finally:
input_file.close()
os.remove(input_file.name)
@@ -276,8 +239,8 @@ def demo():
maltParser = MaltParser()
maltParser.train([dg1,dg2], verbose=verbose)
-print(maltParser.raw_parse('John sees Mary', verbose=verbose).tree().pprint())
-print(maltParser.raw_parse('a man runs', verbose=verbose).tree().pprint())
+maltParser.parse_one(['John','sees','Mary'], verbose=verbose).tree().pprint()
+maltParser.parse_one(['a','man','runs'], verbose=verbose).tree().pprint()
if __name__ == '__main__':
demo()
@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object):
}
)
#print (g_graph.nodes)
+# Fully connect non-root nodes in g_graph
g_graph.connect_graph()
original_graph = DependencyGraph()
@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object):
logger.debug('Betas: %s', betas)
for node in original_graph.nodes.values():
-# deps must be a dictionary
-#node['deps'] = []
+# TODO: It's dangerous to assume that deps it a dictionary
+# because it's a default dictionary. Ideally, here we should not
+# be concerned how dependencies are stored inside of a dependency
+# graph.
node['deps'] = {}
for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1])
@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object):
# Filter parses
# ensure 1 root, every thing has 1 head
for analysis in analyses:
-root_count = 0
-root = []
-for i, cell in enumerate(analysis):
-if cell == -1:
-root_count += 1
-root = i
-if root_count == 1:
-graph = DependencyGraph()
-graph.nodes[0]['deps'] = root + 1
-for i in range(len(tokens)):
-node = {'word': tokens[i], 'address': i+1}
-node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
-graph.nodes[i + 1] = node
-# cycle = graph.contains_cycle()
-# if not cycle:
-yield graph
+if analysis.count(-1) > 1:
+# there are several root elements!
+continue
+graph = DependencyGraph()
+graph.root = graph.nodes[analysis.index(-1) + 1]
+for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
+head_address = head_index + 1
+node = graph.nodes[address]
+node.update(
+{
+'word': token,
+'address': address,
+}
+)
+if head_address == 0:
+rel = 'ROOT'
+else:
+rel = ''
+graph.nodes[head_index + 1]['deps'][rel].append(address)
+# TODO: check for cycles
+yield graph
#################################################################
......
@@ -109,18 +109,6 @@ class StanfordParser(ParserI):
cur_lines.append(line)
return res
-def parse_all(self, sentence, verbose=False):
-"""
-Use StanfordParser to parse a sentence. Takes a sentence as a list of
-words; it will be automatically tagged with this StanfordParser instance's
-tagger.
-:param sentence: Input sentence to parse
-:type sentence: list(str)
-:rtype: Tree
-"""
-return self.parse_sents([sentence], verbose)
def parse_sents(self, sentences, verbose=False):
"""
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
......
@@ -8,16 +8,16 @@
import tempfile
import pickle
-import os
-import copy
-import operator
-from nltk.parse.api import ParserI
-import scipy.sparse as sparse
-import numpy as np
+from os import remove
+from copy import deepcopy
+from operator import itemgetter
+from scipy import sparse
+from numpy import array
from sklearn.datasets import load_svmlight_file
from sklearn import svm
-from nltk.parse import DependencyGraph
-from evaluate import DependencyEvaluator
+from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
class Configuration(object):
@@ -522,7 +522,7 @@ class TransitionParser(ParserI):
# Save the model to file name (as pickle)
pickle.dump(model, open(modelfile, 'wb'))
finally:
-os.remove(input_file.name)
+remove(input_file.name)
def parse(self, depgraphs, modelFile):
"""
@@ -549,9 +549,9 @@
col.append(self._dictionary[feature])
row.append(0)
data.append(1.0)
-np_col = np.array(sorted(col)) # NB : index must be sorted
-np_row = np.array(row)
-np_data = np.array(data)
+np_col = array(sorted(col)) # NB : index must be sorted
+np_row = array(row)
+np_data = array(data)
x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary)))
@@ -570,7 +570,7 @@
# votes[j] +=1
# k +=1
# Sort votes according to the values
-#sorted_votes = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
+#sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
# We will use predict_proba instead of decision_function
prob_dict = {}
@@ -579,7 +579,7 @@
prob_dict[i] = pred_prob[i]
sorted_Prob = sorted(
prob_dict.items(),
-key=operator.itemgetter(1),
+key=itemgetter(1),
reverse=True)
# Note that SHIFT is always a valid operation
@@ -609,7 +609,7 @@
# Finish with operations build the dependency graph from Conf.arcs
-new_depgraph = copy.deepcopy(depgraph)
+new_depgraph = deepcopy(depgraph)
for key in new_depgraph.nodes:
node = new_depgraph.nodes[key]
node['rel'] = ''
@@ -727,7 +727,7 @@ def demo():
Number of training examples : 1
Number of valid (projective) examples : 1
...
->>> os.remove(input_file.name)
+>>> remove(input_file.name)
B. Check the ARC-EAGER training
@@ -743,7 +743,7 @@ def demo():
Number of valid (projective) examples : 1
...
->>> os.remove(input_file.name)
+>>> remove(input_file.name)
###################### Check The Parsing Function ########################
......
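
Putting the new transition-based parser to work looks roughly like this (a hedged sketch: the 'arc-standard' option and the train() signature are assumptions based on this diff and its demo doctest; the model file name and one-sentence training set are illustrative):

    from nltk.parse import DependencyGraph, DependencyEvaluator
    from nltk.parse.transitionparser import TransitionParser

    # One gold sentence in four-column Malt-TAB form (word, tag, head, relation).
    gold = [DependencyGraph("""John NNP 2 SUB
    sees VBZ 0 ROOT
    Mary NNP 2 OBJ
    """)]

    tp = TransitionParser('arc-standard')            # 'arc-eager' is the other strategy
    tp.train(gold, 'temp.arcstd.model')              # extracts features, trains an SVM, pickles it
    parsed = tp.parse(gold, 'temp.arcstd.model')     # parse(depgraphs, modelFile), as in the diff
    print(DependencyEvaluator(parsed, gold).eval())  # attachment scores
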
@@ -35,30 +35,33 @@ CoNLL Data
... . . 9 VMOD
... """
>>> dg = DependencyGraph(treebank_data)
->>> print(dg.tree().pprint())
+>>> dg.tree().pprint()
(will
(Vinken Pierre , (old (years 61)) ,)
(join (board the) (as (director a nonexecutive)) (Nov. 29) .))
->>> print(list(dg.triples()))
-[((u'will', u'MD'), u'SUB', (u'Vinken', u'NNP')),
-((u'Vinken', u'NNP'), u'NMOD', (u'Pierre', u'NNP')),
-((u'Vinken', u'NNP'), u'P', (u',', u',')),
-((u'Vinken', u'NNP'), u'NMOD', (u'old', u'JJ')),
-((u'old', u'JJ'), u'AMOD', (u'years', u'NNS')),
-((u'years', u'NNS'), u'NMOD', (u'61', u'CD')),
-((u'Vinken', u'NNP'), u'P', (u',', u',')),
-((u'will', u'MD'), u'VC', (u'join', u'VB')),
-((u'join', u'VB'), u'OBJ', (u'board', u'NN')),
-((u'board', u'NN'), u'NMOD', (u'the', u'DT')),
-((u'join', u'VB'), u'VMOD', (u'as', u'IN')),
-((u'as', u'IN'), u'PMOD', (u'director', u'NN')),
-((u'director', u'NN'), u'NMOD', (u'a', u'DT')),
-((u'director', u'NN'), u'NMOD', (u'nonexecutive', u'JJ')),
-((u'join', u'VB'), u'VMOD', (u'Nov.', u'NNP')),
-((u'Nov.', u'NNP'), u'NMOD', (u'29', u'CD')),
-((u'join', u'VB'), u'VMOD', (u'.', u'.'))]
+>>> for head, rel, dep in dg.triples():
+...     print(
+...         '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
+...         .format(h=head, r=rel, d=dep)
+...     )
+(will, MD), SUB, (Vinken, NNP)
+(Vinken, NNP), NMOD, (Pierre, NNP)
+(Vinken, NNP), P, (,, ,)
+(Vinken, NNP), NMOD, (old, JJ)
+(old, JJ), AMOD, (years, NNS)
+(years, NNS), NMOD, (61, CD)
+(Vinken, NNP), P, (,, ,)
+(will, MD), VC, (join, VB)
+(join, VB), OBJ, (board, NN)
+(board, NN), NMOD, (the, DT)
+(join, VB), VMOD, (as, IN)
+(as, IN), PMOD, (director, NN)
+(director, NN), NMOD, (a, DT)
+(director, NN), NMOD, (nonexecutive, JJ)
+(join, VB), VMOD, (Nov., NNP)
+(Nov., NNP), NMOD, (29, CD)
+(join, VB), VMOD, (., .)
Using the dependency-parsed version of the Penn Treebank corpus sample.
@@ -159,21 +162,22 @@ Non-Projective Dependency Parsing
'dog' -> 'his'
>>> dp = NonprojectiveDependencyParser(grammar)
->>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
-... print(g) # doctest: +NORMALIZE_WHITESPACE
-{0: {'address': 0,
-'ctag': 'TOP',
-'deps': 3,
-'feats': None,
-'lemma': None,
-'rel': 'TOP',
-'tag': 'TOP',
-'word': None},
-1: {'address': 1, 'deps': [], 'word': 'the'},
-2: {'address': 2, 'deps': [1], 'word': 'man'},
-3: {'address': 3, 'deps': [2, 7], 'word': 'taught'},
-4: {'address': 4, 'deps': [], 'word': 'his'},
-5: {'address': 5, 'deps': [4], 'word': 'dog'},
-6: {'address': 6, 'deps': [], 'word': 'to'},
-7: {'address': 7, 'deps': [5, 6, 8], 'word': 'play'},
-8: {'address': 8, 'deps': [], 'word': 'golf'}}
+>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
+>>> print(g.root['word'])
+taught
+>>> for _, node in sorted(g.nodes.items()):
+...     if node['word'] is not None:
+...         print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
+1 the: []
+2 man: [1]
+3 taught: [2, 7]
+4 his: []
+5 dog: [4]
+6 to: []
+7 play: [5, 6, 8]
+8 golf: []
+>>> print(g.tree())
+(taught (man the) (play (dog his) to golf))
@@ -370,7 +370,7 @@ Dependency Graph to LFG f-structure
subj g:[pred 'John']]
>>> fstruct.to_depgraph().tree().pprint()
-'(sees (dog a) John)'
+(sees (dog a) John)
---------------------------------
LFG f-structure to Glue
......
@@ -41,7 +41,7 @@ tree object to one of several standard tree encodings:
There is also a fancy ASCII art representation:
->>> tree.pprint()
+>>> tree.pretty_print()
s
________|_____
| vp
@@ -52,7 +52,7 @@ There is also a fancy ASCII art representation:
| | | | |
the dog chased the cat
->>> tree.pprint(unicodelines=True, nodedist=4)
+>>> tree.pretty_print(unicodelines=True, nodedist=4)
s
┌──────────────┴────────┐
│ vp
......
@@ -685,7 +685,7 @@ class Tree(list):
from nltk.draw.tree import draw_trees
draw_trees(self)
-def pprint(self, sentence=None, highlight=(), **viz_args):
+def pretty_print(self, sentence=None, highlight=(), **viz_args):
"""
Pretty-print this tree as ASCII or Unicode art.
For explanation of the arguments, see the documentation for
@@ -734,6 +734,17 @@
def __str__(self):
return self.pformat()
+def pprint(self, **args):
+"""
+Print a string representation of this Tree to 'stream'
+"""
+if "stream" in args:
+stream = args["stream"]
+else:
+stream = None
+print(self.pformat(**args), file=stream)
def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
"""
:return: A pretty-printed string representation of this tree.
@@ -751,7 +762,7 @@ class Tree(list):
# Try writing it on one line.
s = self._pformat_flat(nodesep, parens, quotes)
-if len(s)+indent < margin:
+if len(s) + indent < margin:
return s
# If it doesn't fit on one line, then write it on multi-lines.
......
[tox]
-envlist = py26,py27,py32,py33,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py26-jenkins,py32-jenkins
+envlist = py26,py27,py32,py33,py34,pypy,py26-nodeps,py27-nodeps,py32-nodeps,py33-nodeps,py34-nodeps,py26-jenkins,py32-jenkins,py34-jenkins
[testenv]
@@ -63,6 +63,20 @@ commands =
; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []
+[testenv:py34]
+deps =
+    numpy
+    nose >= 1.2.1
+    coverage
+    text-unidecode
+commands =
+    ; scipy and scikit-learn requires numpy even to run setup.py so
+    ; they can't be installed in one command
+    pip install --download-cache={toxworkdir}/_download scipy scikit-learn
+    ; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
+    python runtests.py []
[testenv:py26-nodeps]
basepython = python2.6
@@ -84,6 +98,11 @@ basepython = python3.3
deps = nose >= 1.2.1
commands = python runtests.py []
+[testenv:py34-nodeps]
+basepython = python3.4
+deps = nose >= 1.2.1
+commands = python runtests.py []
[testenv:py26-jenkins]
basepython = python2.6
commands = {toxinidir}/jenkins.sh
@@ -99,3 +118,11 @@ setenv =
STANFORD_MODELS = {homedir}/third/stanford-parser/
STANFORD_PARSER = {homedir}/third/stanford-parser/
STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
+[testenv:py34-jenkins]
+basepython = python3.4
+commands = {toxinidir}/jenkins.sh
+setenv =
+    STANFORD_MODELS = {homedir}/third/stanford-parser/
+    STANFORD_PARSER = {homedir}/third/stanford-parser/
+    STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/
@@ -44,7 +44,7 @@ master_doc = 'index'
# General information about the project.
project = 'NLTK'
-copyright = '2013, NLTK Project'
+copyright = '2015, NLTK Project'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
......
@@ -5,14 +5,14 @@ The Natural Language Toolkit exists thanks to the efforts of dozens
of voluntary developers who have contributed functionality and
bugfixes since the project began in 2000 (`contributors <https://github.com/nltk/nltk#contributing>`_).
-In 2014 we are especially keen to improve NLTK coverage for:
+In 2015 we are especially keen to improve NLTK coverage for:
`dependency parsing <https://github.com/nltk/nltk/wiki/Dependency-Parsing>`_,
`machine translation <https://github.com/nltk/nltk/wiki/Machine-Translation>`_,
`sentiment analysis <https://github.com/nltk/nltk/wiki/Sentiment-Analysis>`_,
`twitter processing <https://github.com/nltk/nltk/wiki/Twitter-Processing>`_.
New material in these areas will be covered in the second edition of
-the NLTK book, anticipated in 2015.
+the NLTK book, anticipated in early 2016.
* `desired enhancements <https://github.com/nltk/nltk/issues?labels=enhancement&page=1&state=open>`_
* `contribute a corpus <https://github.com/nltk/nltk/wiki/Adding-a-Corpus>`_
@@ -29,7 +29,6 @@ Individual packages are maintained by the following people:
:Parsing: `Peter Ljunglöf <http://www.cse.chalmers.se/~peb/>`_, Gothenburg, Sweden (``nltk.parse, nltk.featstruct``)
:Metrics: `Joel Nothman <http://joelnothman.com/>`_, Sydney, Australia (``nltk.metrics, nltk.tokenize.punkt``)
:Python 3: `Mikhail Korobov <http://kmike.ru/>`_, Ekaterinburg, Russia
-:Integration: `Morten Minde Neergaard <http://8d.no/>`_, Oslo, Norway
:Releases: `Steven Bird <http://estive.net>`_, Melbourne, Australia
......
NLTK News
=========
+2015
+----
+NLTK 3.0.1 released : January 2015
+Minor packaging update.
+2014
+----
NLTK 3.0.0 released : September 2014
Minor bugfixes. For full details see:
https://github.com/nltk/nltk/blob/develop/ChangeLog
@@ -26,6 +35,9 @@ NLTK 3.0a4 released : June 2014
https://github.com/nltk/nltk/blob/develop/ChangeLog
http://nltk.org/nltk3-alpha/
+2013
+----
NLTK Book Updates : October 2013
We are updating the NLTK book for Python 3 and NLTK 3; please see
http://nltk.org/book3/
......