Updating NonprojectiveDependencyParser.parse().

Updating the method to use new DependencyGraph API.

Updating NonprojectiveDependencyParser.parse().
Updating the method to use new DependencyGraph API.
29246d9a · Dmitrijs Milajevs · e933cca0 · 29246d9a · 29246d9a · 29246d9a
Commit 29246d9a authored Feb 08, 2015 by Dmitrijs Milajevs
Hide whitespace changes
Inline Side-by-side

Showing with 89 additions and 65 deletions

nltk/parse/dependencygraph.py
+15 -8

nltk/parse/nonprojectivedependencyparser.py
+32 -20

nltk/test/dependency.doctest
+42 -37

No files found.
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -102,7 +102,7 @@ class DependencyGraph(object):
        self.nodes[head_address]['deps'].setdefault(relation,[])
        self.nodes[head_address]['deps'][relation].append(mod_address)
        #self.nodes[head_address]['deps'].append(mod_address)
    def connect_graph(self):
        """
@@ -113,7 +113,7 @@ class DependencyGraph(object):
            for node2 in self.nodes.values():
                if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
                    relation = node2['rel']
-                    node1['deps'].setdefault(relation,[]) 
+                    node1['deps'].setdefault(relation, [])
                    node1['deps'][relation].append(node2['address'])
                    #node1['deps'].append(node2['address'])
@@ -214,17 +214,21 @@ class DependencyGraph(object):
        lines = (l.rstrip() for l in input_)
        lines = (l for l in lines if l)
+        cell_number = None
        for index, line in enumerate(lines, start=1):
            cells = line.split(cell_separator)
-            nrCells = len(cells)
+            if cell_number is None:
+                cell_number = len(cells)
+            else:
+                assert cell_number == len(cells)
            if cell_extractor is None:
                try:
-                    cell_extractor = extractors[nrCells]
+                    cell_extractor = extractors[cell_number]
                except KeyError:
                    raise ValueError(
                        'Number of tab-delimited fields ({0}) not supported by '
-                        'CoNLL(10) or Malt-Tab(4) format'.format(nrCells)
+                        'CoNLL(10) or Malt-Tab(4) format'.format(len(cells))
                    )
            word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
@@ -246,6 +250,9 @@ class DependencyGraph(object):
                }
            )
+            # Make sure that the fake root node has labeled dependencies.
+            if (cell_number == 3) and (head == 0):
+                rel = 'ROOT'
            self.nodes[head]['deps'][rel].append(index)
        if not self.nodes[0]['deps']['ROOT']:
@@ -271,7 +278,7 @@ class DependencyGraph(object):
        """
        node = self.get_by_address(i)
        word = node['word']
-        deps = list(chain.from_iterable(node['deps'].values()))
+        deps = sorted(chain.from_iterable(node['deps'].values()))
        if deps:
            return Tree(word, [self._tree(dep) for dep in deps])
@@ -286,7 +293,7 @@ class DependencyGraph(object):
        node = self.root
        word = node['word']
-        deps = chain.from_iterable(node['deps'].values())
+        deps = sorted(chain.from_iterable(node['deps'].values()))
        return Tree(word, [self._tree(dep) for dep in deps])
    def triples(self, node=None):
@@ -299,7 +306,7 @@ class DependencyGraph(object):
            node = self.root
        head = (node['word'], node['ctag'])
-        for i in node['deps']:
+        for i in sorted(chain.from_iterable(node['deps'].values())):
            dep = self.get_by_address(i)
            yield (head, dep['rel'], (dep['word'], dep['ctag']))
            for triple in self.triples(node=dep):

--- a/nltk/parse/nonprojectivedependencyparser.py
+++ b/nltk/parse/nonprojectivedependencyparser.py
@@ -462,8 +462,8 @@ class ProbabilisticNonprojectiveParser(object):
                }
            )
        #print (g_graph.nodes)
        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()
        original_graph = DependencyGraph()
@@ -567,8 +567,10 @@ class ProbabilisticNonprojectiveParser(object):
        logger.debug('Betas: %s', betas)
        for node in original_graph.nodes.values():
-            # deps must be a dictionary 
+            # TODO: It's dangerous to assume that deps is a dictionary
-            #node['deps'] = []
+            # because it's a default dictionary. Ideally, here we should not
+            # be concerned how dependencies are stored inside of a dependency
+            # graph.
            node['deps'] = {}
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])
@@ -701,22 +703,32 @@ class NonprojectiveDependencyParser(object):
        # Filter parses
        # ensure 1 root, everything has 1 head
        for analysis in analyses:
-            root_count = 0
+            if analysis.count(-1) > 1:
-            root = []
+                # there are several root elements!
-            for i, cell in enumerate(analysis):
+                continue
-                if cell == -1:
-                    root_count += 1
+            graph = DependencyGraph()
-                    root = i
+            graph.root = graph.nodes[analysis.index(-1) + 1]
-            if root_count == 1:
-                graph = DependencyGraph()
+            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
-                graph.nodes[0]['deps'] = root + 1
+                head_address = head_index + 1
-                for i in range(len(tokens)):
-                    node = {'word': tokens[i], 'address': i+1}
+                node = graph.nodes[address]
-                    node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
+                node.update(
-                    graph.nodes[i + 1] = node
+                    {
-#               cycle = graph.contains_cycle()
+                        'word': token,
-#               if not cycle:
+                        'address': address,
-                yield graph
+                    }
+                )
+                if head_address == 0:
+                    rel = 'ROOT'
+                else:
+                    rel = ''
+                graph.nodes[head_index + 1]['deps'][rel].append(address)
+            # TODO: check for cycles
+            yield graph
 #################################################################

--- a/nltk/test/dependency.doctest
+++ b/nltk/test/dependency.doctest
@@ -35,30 +35,33 @@ CoNLL Data
    ... .       .       9       VMOD
    ... """
    >>> dg = DependencyGraph(treebank_data)
-    >>> print(dg.tree().pprint())
+    >>> dg.tree().pprint()
    (will
      (Vinken Pierre , (old (years 61)) ,)
      (join (board the) (as (director a nonexecutive)) (Nov. 29) .))
-    >>> print(list(dg.triples()))
+    >>> for head, rel, dep in dg.triples():
-    [((u'will', u'MD'), u'SUB', (u'Vinken', u'NNP')),
+    ...     print(
-     ((u'Vinken', u'NNP'), u'NMOD', (u'Pierre', u'NNP')),
+    ...         '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
-     ((u'Vinken', u'NNP'), u'P', (u',', u',')),
+    ...         .format(h=head, r=rel, d=dep)
-     ((u'Vinken', u'NNP'), u'NMOD', (u'old', u'JJ')),
+    ...     )
-     ((u'old', u'JJ'), u'AMOD', (u'years', u'NNS')),
+    (will, MD), SUB, (Vinken, NNP)
-     ((u'years', u'NNS'), u'NMOD', (u'61', u'CD')),
+    (Vinken, NNP), NMOD, (Pierre, NNP)
-     ((u'Vinken', u'NNP'), u'P', (u',', u',')),
+    (Vinken, NNP), P, (,, ,)
-     ((u'will', u'MD'), u'VC', (u'join', u'VB')),
+    (Vinken, NNP), NMOD, (old, JJ)
-     ((u'join', u'VB'), u'OBJ', (u'board', u'NN')),
+    (old, JJ), AMOD, (years, NNS)
-     ((u'board', u'NN'), u'NMOD', (u'the', u'DT')),
+    (years, NNS), NMOD, (61, CD)
-     ((u'join', u'VB'), u'VMOD', (u'as', u'IN')),
+    (Vinken, NNP), P, (,, ,)
-     ((u'as', u'IN'), u'PMOD', (u'director', u'NN')),
+    (will, MD), VC, (join, VB)
-     ((u'director', u'NN'), u'NMOD', (u'a', u'DT')),
+    (join, VB), OBJ, (board, NN)
-     ((u'director', u'NN'), u'NMOD', (u'nonexecutive', u'JJ')),
+    (board, NN), NMOD, (the, DT)
-     ((u'join', u'VB'), u'VMOD', (u'Nov.', u'NNP')),
+    (join, VB), VMOD, (as, IN)
-     ((u'Nov.', u'NNP'), u'NMOD', (u'29', u'CD')),
+    (as, IN), PMOD, (director, NN)
-     ((u'join', u'VB'), u'VMOD', (u'.', u'.'))]
+    (director, NN), NMOD, (a, DT)
+    (director, NN), NMOD, (nonexecutive, JJ)
+    (join, VB), VMOD, (Nov., NNP)
+    (Nov., NNP), NMOD, (29, CD)
+    (join, VB), VMOD, (., .)
 Using the dependency-parsed version of the Penn Treebank corpus sample.
@@ -159,21 +162,23 @@ Non-Projective Dependency Parsing
      'dog' -> 'his'
    >>> dp = NonprojectiveDependencyParser(grammar)
+    >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
+    >>> print(g.root['word'])
+    taught
+    >>> for _, node in sorted(g.nodes.items()):
+    ...     if node['word'] is not None:
+    ...         print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
+    1 the: []
+    2 man: [1]
+    3 taught: [2, 7]
+    4 his: []
+    5 dog: [4]
+    6 to: []
+    7 play: [5, 6, 8]
+    8 golf: []
    >>> for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
-    ...     print(g)  # doctest: +NORMALIZE_WHITESPACE
+    ...     print(g.tree())
-    {0: {'address': 0,
+    (taught (man the) (play (dog his) to golf))
-         'ctag': 'TOP',
-         'deps': 3,
-         'feats': None,
-         'lemma': None,
-         'rel': 'TOP',
-         'tag': 'TOP',
-         'word': None},
-     1: {'address': 1, 'deps': [], 'word': 'the'},
-     2: {'address': 2, 'deps': [1], 'word': 'man'},
-     3: {'address': 3, 'deps': [2, 7], 'word': 'taught'},
-     4: {'address': 4, 'deps': [], 'word': 'his'},
-     5: {'address': 5, 'deps': [4], 'word': 'dog'},
-     6: {'address': 6, 'deps': [], 'word': 'to'},
-     7: {'address': 7, 'deps': [5, 6, 8], 'word': 'play'},
-     8: {'address': 8, 'deps': [], 'word': 'golf'}}