Fixed bug in 'ne_chunked' demo. Some renaming of functions and tidying up.

d566f23c · Ewan Klein · 2c24d60c · d566f23c
Commit d566f23c authored Oct 30, 2013 by Ewan Klein
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 26 deletions

nltk/sem/relextract.py
+34 -26

No files found.
--- a/nltk/sem/relextract.py
+++ b/nltk/sem/relextract.py
@@ -112,9 +112,9 @@ def list2sym(lst):
    sym = sym.replace('.', '')
    return sym
-def mk_pairs(tree):
+def _tree2semi_rel(tree):
    """
-    Group a chunk structure into a list of pairs of the form (list(str), ``Tree``)
+    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 
    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
@@ -127,23 +127,23 @@ def mk_pairs(tree):
    from nltk.tree import Tree
-    pairs = []
+    semi_rels = []
-    pair = [[], None]
+    semi_rel = [[], None]
    for dtr in tree:
        if not isinstance(dtr, Tree):
-            pair[0].append(dtr)
+            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
-            pair[1] = dtr
+            semi_rel[1] = dtr
-            pairs.append(pair)
+            semi_rels.append(semi_rel)
-            pair = [[], None]
+            semi_rel = [[], None]
-    return pairs
+    return semi_rels
-def mk_reldicts(pairs, window=5, trace=0):
+def semi_rel2reldict(pairs, window=5, trace=False):
    """
-    Converts the pairs generated by ``mk_pairs`` into a 'reldict': a dictionary which
+    Converts the pairs generated by ``_tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).
@@ -162,19 +162,20 @@ def mk_reldicts(pairs, window=5, trace=0):
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
+        reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
        reldict['objclass'] = pairs[1][1].label()
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
-            print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass']))
+            print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
        pairs = pairs[1:]
    return result
 def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
-    Filter the output of ``mk_reldicts`` according to specified NE classes and a filler pattern.
+    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
@@ -208,14 +209,15 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10
            objclass = _expand(objclass)
        else:
            raise ValueError("your value for the object type has not been recognized: %s" % objclass)
    if corpus == 'ace' or corpus == 'conll2002':
-        pairs = mk_pairs(doc)
+        pairs = _tree2semi_rel(doc)
    elif corpus == 'ieer':
-        pairs = mk_pairs(doc.text) + mk_pairs(doc.headline)
+        pairs = _tree2semi_rel(doc.text) + _tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")
-    reldicts = mk_reldicts(pairs)
+    reldicts = semi_rel2reldict(pairs)
    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
@@ -225,7 +227,7 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10
    return list(filter(relfilter, reldicts))
-def show_raw_rtuple(reldict, lcon=False, rcon=False):
+def rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.
    :param reldict: a relation dictionary
@@ -242,7 +244,7 @@ def show_raw_rtuple(reldict, lcon=False, rcon=False):
    printargs = tuple(items)
    return format % printargs
-def show_clause(reldict, relsym):
+def clause(reldict, relsym):
    """
    Print the relation in clausal form.
    :param reldict: a relation dictionary
@@ -296,7 +298,7 @@ def in_demo(trace=0, sql=True):
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
-                print(show_clause(rel, relsym='IN'))
+                print(clause(rel, relsym='IN'))
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
@@ -332,7 +334,7 @@ def roles_demo(trace=0):
    commissioner|
    counsel|
    director|
-    economist|
+    economist|       
    editor|
    executive|
    foreman|
@@ -365,7 +367,7 @@ def roles_demo(trace=0):
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
-                print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
+                print(rtuple(rel, lcon=lcon, rcon=rcon))
 ##############################################
@@ -422,7 +424,7 @@ def conllned(trace=1):
        if trace:
                lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
-            print(show_raw_rtuple(rel, lcon=True, rcon=True))
+            print(rtuple(rel, lcon=True, rcon=True))
 #############################################
 ## Spanish CONLL2002: (PER, ORG)
@@ -445,16 +447,21 @@ def conllesp():
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
-    for r in rels[:10]: print(show_clause(r, relsym='DE'))
+    for r in rels[:10]: print(clause(r, relsym='DE'))
    print()
 def ne_chunked():
-    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
+    print()
+    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
+    print("=" * 45)
+    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
-    for sent in nltk.corpus.treebank.tagged_sents()[:100]:
+    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
-        print(extract_rels('ORG', 'LOC', sent, corpus='ace', pattern = IN))
+        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
+        for rel in rels:
+            print('{0:<5}{1}'.format(i, rtuple(rel)))
 if __name__ == '__main__':
@@ -465,6 +472,7 @@ if __name__ == '__main__':
    conllned()
    conllesp()
    ieer_headlines()
+    ne_chunked()