Commit d566f23c by Ewan Klein

Fixed bug in 'ne_chunked' demo. Some renaming of functions and tidying up.

parent 2c24d60c
...@@ -112,9 +112,9 @@ def list2sym(lst): ...@@ -112,9 +112,9 @@ def list2sym(lst):
sym = sym.replace('.', '') sym = sym.replace('.', '')
return sym return sym
def mk_pairs(tree): def _tree2semi_rel(tree):
""" """
Group a chunk structure into a list of pairs of the form (list(str), ``Tree``) Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal identifies pairs whose first member is a list (possibly empty) of terminal
...@@ -127,23 +127,23 @@ def mk_pairs(tree): ...@@ -127,23 +127,23 @@ def mk_pairs(tree):
from nltk.tree import Tree from nltk.tree import Tree
pairs = [] semi_rels = []
pair = [[], None] semi_rel = [[], None]
for dtr in tree: for dtr in tree:
if not isinstance(dtr, Tree): if not isinstance(dtr, Tree):
pair[0].append(dtr) semi_rel[0].append(dtr)
else: else:
# dtr is a Tree # dtr is a Tree
pair[1] = dtr semi_rel[1] = dtr
pairs.append(pair) semi_rels.append(semi_rel)
pair = [[], None] semi_rel = [[], None]
return pairs return semi_rels
def mk_reldicts(pairs, window=5, trace=0): def semi_rel2reldict(pairs, window=5, trace=False):
""" """
Converts the pairs generated by ``mk_pairs`` into a 'reldict': a dictionary which Converts the pairs generated by ``_tree2semi_rel`` into a 'reldict': a dictionary which
stores information about the subject and object NEs plus the filler between them. stores information about the subject and object NEs plus the filler between them.
Additionally, a left and right context of length =< window are captured (within Additionally, a left and right context of length =< window are captured (within
a given input sentence). a given input sentence).
...@@ -162,19 +162,20 @@ def mk_reldicts(pairs, window=5, trace=0): ...@@ -162,19 +162,20 @@ def mk_reldicts(pairs, window=5, trace=0):
reldict['subjtext'] = _join(pairs[0][1].leaves()) reldict['subjtext'] = _join(pairs[0][1].leaves())
reldict['subjsym'] = list2sym(pairs[0][1].leaves()) reldict['subjsym'] = list2sym(pairs[0][1].leaves())
reldict['filler'] = _join(pairs[1][0]) reldict['filler'] = _join(pairs[1][0])
reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
reldict['objclass'] = pairs[1][1].label() reldict['objclass'] = pairs[1][1].label()
reldict['objtext'] = _join(pairs[1][1].leaves()) reldict['objtext'] = _join(pairs[1][1].leaves())
reldict['objsym'] = list2sym(pairs[1][1].leaves()) reldict['objsym'] = list2sym(pairs[1][1].leaves())
reldict['rcon'] = _join(pairs[2][0][:window]) reldict['rcon'] = _join(pairs[2][0][:window])
if trace: if trace:
print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])) print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass']))
result.append(reldict) result.append(reldict)
pairs = pairs[1:] pairs = pairs[1:]
return result return result
def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10): def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
""" """
Filter the output of ``mk_reldicts`` according to specified NE classes and a filler pattern. Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
The parameters ``subjclass`` and ``objclass`` can be used to restrict the The parameters ``subjclass`` and ``objclass`` can be used to restrict the
Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
...@@ -208,14 +209,15 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10 ...@@ -208,14 +209,15 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10
objclass = _expand(objclass) objclass = _expand(objclass)
else: else:
raise ValueError("your value for the object type has not been recognized: %s" % objclass) raise ValueError("your value for the object type has not been recognized: %s" % objclass)
if corpus == 'ace' or corpus == 'conll2002': if corpus == 'ace' or corpus == 'conll2002':
pairs = mk_pairs(doc) pairs = _tree2semi_rel(doc)
elif corpus == 'ieer': elif corpus == 'ieer':
pairs = mk_pairs(doc.text) + mk_pairs(doc.headline) pairs = _tree2semi_rel(doc.text) + _tree2semi_rel(doc.headline)
else: else:
raise ValueError("corpus type not recognized") raise ValueError("corpus type not recognized")
reldicts = mk_reldicts(pairs) reldicts = semi_rel2reldict(pairs)
relfilter = lambda x: (x['subjclass'] == subjclass and relfilter = lambda x: (x['subjclass'] == subjclass and
len(x['filler'].split()) <= window and len(x['filler'].split()) <= window and
...@@ -225,7 +227,7 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10 ...@@ -225,7 +227,7 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10
return list(filter(relfilter, reldicts)) return list(filter(relfilter, reldicts))
def show_raw_rtuple(reldict, lcon=False, rcon=False): def rtuple(reldict, lcon=False, rcon=False):
""" """
Pretty print the reldict as an rtuple. Pretty print the reldict as an rtuple.
:param reldict: a relation dictionary :param reldict: a relation dictionary
...@@ -242,7 +244,7 @@ def show_raw_rtuple(reldict, lcon=False, rcon=False): ...@@ -242,7 +244,7 @@ def show_raw_rtuple(reldict, lcon=False, rcon=False):
printargs = tuple(items) printargs = tuple(items)
return format % printargs return format % printargs
def show_clause(reldict, relsym): def clause(reldict, relsym):
""" """
Print the relation in clausal form. Print the relation in clausal form.
:param reldict: a relation dictionary :param reldict: a relation dictionary
...@@ -296,7 +298,7 @@ def in_demo(trace=0, sql=True): ...@@ -296,7 +298,7 @@ def in_demo(trace=0, sql=True):
print(doc.docno) print(doc.docno)
print("=" * 15) print("=" * 15)
for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN): for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
print(show_clause(rel, relsym='IN')) print(clause(rel, relsym='IN'))
if sql: if sql:
try: try:
rtuple = (rel['subjtext'], rel['objtext'], doc.docno) rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
...@@ -332,7 +334,7 @@ def roles_demo(trace=0): ...@@ -332,7 +334,7 @@ def roles_demo(trace=0):
commissioner| commissioner|
counsel| counsel|
director| director|
economist| economist|
editor| editor|
executive| executive|
foreman| foreman|
...@@ -365,7 +367,7 @@ def roles_demo(trace=0): ...@@ -365,7 +367,7 @@ def roles_demo(trace=0):
print("=" * 15) print("=" * 15)
lcon = rcon = True lcon = rcon = True
for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES): for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon)) print(rtuple(rel, lcon=lcon, rcon=rcon))
############################################## ##############################################
...@@ -422,7 +424,7 @@ def conllned(trace=1): ...@@ -422,7 +424,7 @@ def conllned(trace=1):
if trace: if trace:
lcon = rcon = True lcon = rcon = True
for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10): for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
print(show_raw_rtuple(rel, lcon=True, rcon=True)) print(rtuple(rel, lcon=True, rcon=True))
############################################# #############################################
## Spanish CONLL2002: (PER, ORG) ## Spanish CONLL2002: (PER, ORG)
...@@ -445,16 +447,21 @@ def conllesp(): ...@@ -445,16 +447,21 @@ def conllesp():
print("=" * 45) print("=" * 45)
rels = [rel for doc in conll2002.chunked_sents('esp.train') rels = [rel for doc in conll2002.chunked_sents('esp.train')
for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
for r in rels[:10]: print(show_clause(r, relsym='DE')) for r in rels[:10]: print(clause(r, relsym='DE'))
print() print()
def ne_chunked(): def ne_chunked():
IN = re.compile(r'.*\bin\b(?!\b.+ing)') print()
print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
print("=" * 45)
ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
rels = [] rels = []
for sent in nltk.corpus.treebank.tagged_sents()[:100]: for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
sent = nltk.ne_chunk(sent) sent = nltk.ne_chunk(sent)
print(extract_rels('ORG', 'LOC', sent, corpus='ace', pattern = IN)) rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
for rel in rels:
print('{0:<5}{1}'.format(i, rtuple(rel)))
if __name__ == '__main__': if __name__ == '__main__':
...@@ -465,6 +472,7 @@ if __name__ == '__main__': ...@@ -465,6 +472,7 @@ if __name__ == '__main__':
conllned() conllned()
conllesp() conllesp()
ieer_headlines() ieer_headlines()
ne_chunked()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment