Commit 29246d9a by Dmitrijs Milajevs

Updating NonprojectiveDependencyParser.parse().

Updating the method to use the new DependencyGraph API.
parent e933cca0
...@@ -102,7 +102,7 @@ class DependencyGraph(object): ...@@ -102,7 +102,7 @@ class DependencyGraph(object):
self.nodes[head_address]['deps'].setdefault(relation,[]) self.nodes[head_address]['deps'].setdefault(relation,[])
self.nodes[head_address]['deps'][relation].append(mod_address) self.nodes[head_address]['deps'][relation].append(mod_address)
#self.nodes[head_address]['deps'].append(mod_address) #self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self): def connect_graph(self):
""" """
...@@ -113,7 +113,7 @@ class DependencyGraph(object): ...@@ -113,7 +113,7 @@ class DependencyGraph(object):
for node2 in self.nodes.values(): for node2 in self.nodes.values():
if node1['address'] != node2['address'] and node2['rel'] != 'TOP': if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
relation = node2['rel'] relation = node2['rel']
node1['deps'].setdefault(relation,[]) node1['deps'].setdefault(relation, [])
node1['deps'][relation].append(node2['address']) node1['deps'][relation].append(node2['address'])
#node1['deps'].append(node2['address']) #node1['deps'].append(node2['address'])
...@@ -214,17 +214,21 @@ class DependencyGraph(object): ...@@ -214,17 +214,21 @@ class DependencyGraph(object):
lines = (l.rstrip() for l in input_) lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l) lines = (l for l in lines if l)
cell_number = None
for index, line in enumerate(lines, start=1): for index, line in enumerate(lines, start=1):
cells = line.split(cell_separator) cells = line.split(cell_separator)
nrCells = len(cells) if cell_number is None:
cell_number = len(cells)
else:
assert cell_number == len(cells)
if cell_extractor is None: if cell_extractor is None:
try: try:
cell_extractor = extractors[nrCells] cell_extractor = extractors[cell_number]
except KeyError: except KeyError:
raise ValueError( raise ValueError(
'Number of tab-delimited fields ({0}) not supported by ' 'Number of tab-delimited fields ({0}) not supported by '
'CoNLL(10) or Malt-Tab(4) format'.format(nrCells) 'CoNLL(10) or Malt-Tab(4) format'.format(len(cells))
) )
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
...@@ -246,6 +250,9 @@ class DependencyGraph(object): ...@@ -246,6 +250,9 @@ class DependencyGraph(object):
} }
) )
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = 'ROOT'
self.nodes[head]['deps'][rel].append(index) self.nodes[head]['deps'][rel].append(index)
if not self.nodes[0]['deps']['ROOT']: if not self.nodes[0]['deps']['ROOT']:
...@@ -271,7 +278,7 @@ class DependencyGraph(object): ...@@ -271,7 +278,7 @@ class DependencyGraph(object):
""" """
node = self.get_by_address(i) node = self.get_by_address(i)
word = node['word'] word = node['word']
deps = list(chain.from_iterable(node['deps'].values())) deps = sorted(chain.from_iterable(node['deps'].values()))
if deps: if deps:
return Tree(word, [self._tree(dep) for dep in deps]) return Tree(word, [self._tree(dep) for dep in deps])
...@@ -286,7 +293,7 @@ class DependencyGraph(object): ...@@ -286,7 +293,7 @@ class DependencyGraph(object):
node = self.root node = self.root
word = node['word'] word = node['word']
deps = chain.from_iterable(node['deps'].values()) deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps]) return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None): def triples(self, node=None):
...@@ -299,7 +306,7 @@ class DependencyGraph(object): ...@@ -299,7 +306,7 @@ class DependencyGraph(object):
node = self.root node = self.root
head = (node['word'], node['ctag']) head = (node['word'], node['ctag'])
for i in node['deps']: for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i) dep = self.get_by_address(i)
yield (head, dep['rel'], (dep['word'], dep['ctag'])) yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep): for triple in self.triples(node=dep):
......
...@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object): ...@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object):
} }
) )
#print (g_graph.nodes) #print (g_graph.nodes)
# Fully connect non-root nodes in g_graph # Fully connect non-root nodes in g_graph
g_graph.connect_graph() g_graph.connect_graph()
original_graph = DependencyGraph() original_graph = DependencyGraph()
...@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object): ...@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object):
logger.debug('Betas: %s', betas) logger.debug('Betas: %s', betas)
for node in original_graph.nodes.values(): for node in original_graph.nodes.values():
# deps must be a dictionary # TODO: It's dangerous to assume that deps is a dictionary
#node['deps'] = [] # because it's a default dictionary. Ideally, here we should not
# be concerned how dependencies are stored inside of a dependency
# graph.
node['deps'] = {} node['deps'] = {}
for i in range(1, len(tokens) + 1): for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1]) original_graph.add_arc(betas[i][0], betas[i][1])
...@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object): ...@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object):
# Filter parses # Filter parses
# ensure 1 root, every thing has 1 head # ensure 1 root, every thing has 1 head
for analysis in analyses: for analysis in analyses:
root_count = 0 if analysis.count(-1) > 1:
root = [] # there are several root elements!
for i, cell in enumerate(analysis): continue
if cell == -1:
root_count += 1 graph = DependencyGraph()
root = i graph.root = graph.nodes[analysis.index(-1) + 1]
if root_count == 1:
graph = DependencyGraph() for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
graph.nodes[0]['deps'] = root + 1 head_address = head_index + 1
for i in range(len(tokens)):
node = {'word': tokens[i], 'address': i+1} node = graph.nodes[address]
node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i] node.update(
graph.nodes[i + 1] = node {
# cycle = graph.contains_cycle() 'word': token,
# if not cycle: 'address': address,
yield graph }
)
if head_address == 0:
rel = 'ROOT'
else:
rel = ''
graph.nodes[head_index + 1]['deps'][rel].append(address)
# TODO: check for cycles
yield graph
################################################################# #################################################################
......
...@@ -35,30 +35,33 @@ CoNLL Data ...@@ -35,30 +35,33 @@ CoNLL Data
... . . 9 VMOD ... . . 9 VMOD
... """ ... """
>>> dg = DependencyGraph(treebank_data) >>> dg = DependencyGraph(treebank_data)
>>> print(dg.tree().pprint()) >>> dg.tree().pprint()
(will (will
(Vinken Pierre , (old (years 61)) ,) (Vinken Pierre , (old (years 61)) ,)
(join (board the) (as (director a nonexecutive)) (Nov. 29) .)) (join (board the) (as (director a nonexecutive)) (Nov. 29) .))
>>> print(list(dg.triples())) >>> for head, rel, dep in dg.triples():
[((u'will', u'MD'), u'SUB', (u'Vinken', u'NNP')), ... print(
((u'Vinken', u'NNP'), u'NMOD', (u'Pierre', u'NNP')), ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
((u'Vinken', u'NNP'), u'P', (u',', u',')), ... .format(h=head, r=rel, d=dep)
((u'Vinken', u'NNP'), u'NMOD', (u'old', u'JJ')), ... )
((u'old', u'JJ'), u'AMOD', (u'years', u'NNS')), (will, MD), SUB, (Vinken, NNP)
((u'years', u'NNS'), u'NMOD', (u'61', u'CD')), (Vinken, NNP), NMOD, (Pierre, NNP)
((u'Vinken', u'NNP'), u'P', (u',', u',')), (Vinken, NNP), P, (,, ,)
((u'will', u'MD'), u'VC', (u'join', u'VB')), (Vinken, NNP), NMOD, (old, JJ)
((u'join', u'VB'), u'OBJ', (u'board', u'NN')), (old, JJ), AMOD, (years, NNS)
((u'board', u'NN'), u'NMOD', (u'the', u'DT')), (years, NNS), NMOD, (61, CD)
((u'join', u'VB'), u'VMOD', (u'as', u'IN')), (Vinken, NNP), P, (,, ,)
((u'as', u'IN'), u'PMOD', (u'director', u'NN')), (will, MD), VC, (join, VB)
((u'director', u'NN'), u'NMOD', (u'a', u'DT')), (join, VB), OBJ, (board, NN)
((u'director', u'NN'), u'NMOD', (u'nonexecutive', u'JJ')), (board, NN), NMOD, (the, DT)
((u'join', u'VB'), u'VMOD', (u'Nov.', u'NNP')), (join, VB), VMOD, (as, IN)
((u'Nov.', u'NNP'), u'NMOD', (u'29', u'CD')), (as, IN), PMOD, (director, NN)
((u'join', u'VB'), u'VMOD', (u'.', u'.'))] (director, NN), NMOD, (a, DT)
(director, NN), NMOD, (nonexecutive, JJ)
(join, VB), VMOD, (Nov., NNP)
(Nov., NNP), NMOD, (29, CD)
(join, VB), VMOD, (., .)
Using the dependency-parsed version of the Penn Treebank corpus sample. Using the dependency-parsed version of the Penn Treebank corpus sample.
...@@ -159,21 +162,23 @@ Non-Projective Dependency Parsing ...@@ -159,21 +162,23 @@ Non-Projective Dependency Parsing
'dog' -> 'his' 'dog' -> 'his'
>>> dp = NonprojectiveDependencyParser(grammar) >>> dp = NonprojectiveDependencyParser(grammar)
>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
>>> print(g.root['word'])
taught
>>> for _, node in sorted(g.nodes.items()):
... if node['word'] is not None:
... print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
1 the: []
2 man: [1]
3 taught: [2, 7]
4 his: []
5 dog: [4]
6 to: []
7 play: [5, 6, 8]
8 golf: []
>>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']): >>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
... print(g) # doctest: +NORMALIZE_WHITESPACE ... print(g.tree())
{0: {'address': 0, (taught (man the) (play (dog his) to golf))
'ctag': 'TOP',
'deps': 3,
'feats': None,
'lemma': None,
'rel': 'TOP',
'tag': 'TOP',
'word': None},
1: {'address': 1, 'deps': [], 'word': 'the'},
2: {'address': 2, 'deps': [1], 'word': 'man'},
3: {'address': 3, 'deps': [2, 7], 'word': 'taught'},
4: {'address': 4, 'deps': [], 'word': 'his'},
5: {'address': 5, 'deps': [4], 'word': 'dog'},
6: {'address': 6, 'deps': [], 'word': 'to'},
7: {'address': 7, 'deps': [5, 6, 8], 'word': 'play'},
8: {'address': 8, 'deps': [], 'word': 'golf'}}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment