Commit 851304a4 by Steven Bird

Modified tgrep API for greater consistency with NLTK:

* arguments are in same order as re.search (pattern then objects being searched)
* dropped support for searching a single tree; API requires a sequence
* return an iterator instead of a list
* use sphinx docstrings
parent d1b3da0d
......@@ -27,15 +27,15 @@ Usage
=====
>>> from nltk.tree import ParentedTree
>>> from nltk import tgrep
>>> from nltk.tgrep import tgrep_nodes, tgrep_positions
>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
>>> tgrep.tgrep_nodes(tree, 'NN')
>>> list(tgrep_nodes('NN', [tree]))
[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]
>>> tgrep.tgrep_positions(tree, 'NN')
>>> list(tgrep_positions('NN', [tree]))
[(0, 2), (2, 1)]
>>> tgrep.tgrep_nodes(tree, 'DT')
>>> list(tgrep_nodes('DT', [tree]))
[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]
>>> tgrep.tgrep_nodes(tree, 'DT $ JJ')
>>> list(tgrep_nodes('DT $ JJ', [tree]))
[ParentedTree('DT', ['the'])]
This implementation adds syntax to select nodes based on their NLTK
......@@ -48,7 +48,7 @@ valid node selectors. Example:
ParentedTree('DT', ['the'])
>>> tree[0,0].treeposition()
(0, 0)
>>> tgrep.tgrep_nodes(tree, 'N(0,0)')
>>> list(tgrep_nodes('N(0,0)', [tree]))
[ParentedTree('DT', ['the'])]
Caveats:
......@@ -879,73 +879,65 @@ def treepositions_no_leaves(tree):
prefixes.add(pos[:length])
return [pos for pos in treepositions if pos in prefixes]
def tgrep_positions(tree, tgrep_string, search_leaves = True):
'''
Return all tree positions in the given tree which match the given
`tgrep_string`.
def tgrep_positions(pattern, trees, search_leaves=True):
"""
Return the tree positions in the trees which match the given pattern.
Arguments:
- `tree`: a NLTK tree (usually a ParentedTree), or an iterable over trees
- `tgrep_string`: a tgrep search query, either as a string value,
or compiled into a lambda function with `tgrep_compile`
- `search_leaves`: Boolean flag; if this is False, the method will
not return any positions which are leaf nodes of the given tree(s)
'''
# compile tgrep_string if needed
if isinstance(tgrep_string, (binary_type, text_type)):
tgrep_string = tgrep_compile(tgrep_string)
# check if tree is iterable
tree_iter = None
if not _istree(tree):
try:
tree_iter = iter(tree)
except TypeError:
pass
if tree_iter is not None:
return [tgrep_positions(t, tgrep_string, search_leaves)
for t in tree_iter]
else:
# tree is not an iterable but a single tree
:param pattern: a tgrep search pattern
:type pattern: str or output of tgrep_compile()
:param trees: a sequence of NLTK trees (usually ParentedTrees)
:type trees: iter(ParentedTree) or iter(Tree)
:param search_leaves: whether to return matching leaf nodes
:type search_leaves: bool
:rtype: iter(tree positions)
"""
if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
if search_leaves:
search_positions = tree.treepositions()
positions = tree.treepositions()
else:
search_positions = treepositions_no_leaves(tree)
positions = treepositions_no_leaves(tree)
except AttributeError:
return []
return [position for position in search_positions
if tgrep_string(tree[position])]
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
'''
Return all tree nodes in the given tree which match the given
`tgrep_string`.
Arguments:
- `tree`: a NLTK tree (usually a ParentedTree), or an iterable over trees
- `tgrep_string`: a tgrep search query, either as a string value,
or compiled into a lambda function with `tgrep_compile`
- `search_leaves`: Boolean flag; if this is False, the method will
not return any leaf nodes of the given tree(s)
'''
# compile tgrep_string if needed
if isinstance(tgrep_string, (binary_type, text_type)):
tgrep_string = tgrep_compile(tgrep_string)
# check if tree is iterable
tree_iter = None
if not _istree(tree):
yield []
yield [position for position in positions
if pattern(tree[position])]
def tgrep_nodes(pattern, trees, search_leaves=True):
"""
Return the tree nodes in the trees which match the given pattern.
:param pattern: a tgrep search pattern
:type pattern: str or output of tgrep_compile()
:param trees: a sequence of NLTK trees (usually ParentedTrees)
:type trees: iter(ParentedTree) or iter(Tree)
:param search_leaves: whether to return matching leaf nodes
:type search_leaves: bool
:rtype: iter(tree nodes)
"""
if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
tree_iter = iter(tree)
except TypeError:
pass
if tree_iter is not None:
return [tgrep_nodes(t, tgrep_string, search_leaves) for t in tree_iter]
else:
# tree is not an iterable but a single tree
return [tree[position] for position in
tgrep_positions(tree, tgrep_string, search_leaves)]
if search_leaves:
positions = tree.treepositions()
else:
positions = treepositions_no_leaves(tree)
except AttributeError:
yield []
yield [tree[position] for position in positions
if pattern(tree[position])]
# Run this module's doctests when the file is executed directly as a script.
# NORMALIZE_WHITESPACE lets runs of whitespace (including newlines) in the
# expected output compare equal to the actual output.
if __name__ == "__main__":
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment