Commit 851304a4 by Steven Bird

Modified tgrep API for greater consistency with NLTK:

* arguments are in same order as re.search (pattern then objects being searched)
* dropped support for searching a single tree; API requires a sequence
* return an iterator instead of a list
* use sphinx docstrings
parent d1b3da0d
...@@ -27,15 +27,15 @@ Usage ...@@ -27,15 +27,15 @@ Usage
===== =====
>>> from nltk.tree import ParentedTree >>> from nltk.tree import ParentedTree
>>> from nltk import tgrep >>> from nltk.tgrep import tgrep_nodes, tgrep_positions
>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
>>> tgrep.tgrep_nodes(tree, 'NN') >>> list(tgrep_nodes('NN', [tree]))
[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])] [ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]
>>> tgrep.tgrep_positions(tree, 'NN') >>> list(tgrep_positions('NN', [tree]))
[(0, 2), (2, 1)] [(0, 2), (2, 1)]
>>> tgrep.tgrep_nodes(tree, 'DT') >>> list(tgrep_nodes('DT', [tree]))
[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])] [ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]
>>> tgrep.tgrep_nodes(tree, 'DT $ JJ') >>> list(tgrep_nodes('DT $ JJ', [tree]))
[ParentedTree('DT', ['the'])] [ParentedTree('DT', ['the'])]
This implementation adds syntax to select nodes based on their NLTK This implementation adds syntax to select nodes based on their NLTK
...@@ -48,7 +48,7 @@ valid node selectors. Example: ...@@ -48,7 +48,7 @@ valid node selectors. Example:
ParentedTree('DT', ['the']) ParentedTree('DT', ['the'])
>>> tree[0,0].treeposition() >>> tree[0,0].treeposition()
(0, 0) (0, 0)
>>> tgrep.tgrep_nodes(tree, 'N(0,0)') >>> list(tgrep_nodes('N(0,0)', [tree]))
[ParentedTree('DT', ['the'])] [ParentedTree('DT', ['the'])]
Caveats: Caveats:
...@@ -879,73 +879,65 @@ def treepositions_no_leaves(tree): ...@@ -879,73 +879,65 @@ def treepositions_no_leaves(tree):
prefixes.add(pos[:length]) prefixes.add(pos[:length])
return [pos for pos in treepositions if pos in prefixes] return [pos for pos in treepositions if pos in prefixes]
def tgrep_positions(tree, tgrep_string, search_leaves = True): def tgrep_positions(pattern, trees, search_leaves=True):
''' """
Return all tree positions in the given tree which match the given Return the tree positions in the trees which match the given pattern.
`tgrep_string`.
Arguments: :param pattern: a tgrep search pattern
- `tree`: a NLTK tree (usually a ParentedTree), or an iterable over trees :type pattern: str or output of tgrep_compile()
- `tgrep_string`: a tgrep search query, either as a string value, :param trees: a sequence of NLTK trees (usually ParentedTrees)
or compiled into a lambda function with `tgrep_compile` :type trees: iter(ParentedTree) or iter(Tree)
- `search_leaves`: Boolean flag; if this is False, the method will :param search_leaves: whether to return matching leaf nodes
not return any positions which are leaf nodes of the given tree(s) :type search_leaves: bool
''' :rtype: iter(tree positions)
# compile tgrep_string if needed """
if isinstance(tgrep_string, (binary_type, text_type)):
tgrep_string = tgrep_compile(tgrep_string) if isinstance(pattern, (binary_type, text_type)):
# check if tree is iterable pattern = tgrep_compile(pattern)
tree_iter = None
if not _istree(tree): for tree in trees:
try:
tree_iter = iter(tree)
except TypeError:
pass
if tree_iter is not None:
return [tgrep_positions(t, tgrep_string, search_leaves)
for t in tree_iter]
else:
# tree is not an iterable but a single tree
try: try:
if search_leaves: if search_leaves:
search_positions = tree.treepositions() positions = tree.treepositions()
else: else:
search_positions = treepositions_no_leaves(tree) positions = treepositions_no_leaves(tree)
except AttributeError: except AttributeError:
return [] yield []
return [position for position in search_positions
if tgrep_string(tree[position])] yield [position for position in positions
if pattern(tree[position])]
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
''' def tgrep_nodes(pattern, trees, search_leaves=True):
Return all tree nodes in the given tree which match the given """
`tgrep_ string`. Return the tree nodes in the trees which match the given pattern.
Arguments: :param pattern: a tgrep search pattern
- `tree`: a NLTK tree (usually a ParentedTree), or an iterable over trees :type pattern: str or output of tgrep_compile()
- `tgrep_string`: a tgrep search query, either as a string value, :param trees: a sequence of NLTK trees (usually ParentedTrees)
or compiled into a lambda function with `tgrep_compile` :type trees: iter(ParentedTree) or iter(Tree)
- `search_leaves`: Boolean flag; if this is False, the method will :param search_leaves: whether to return matching leaf nodes
not return any leaf nodes of the given tree(s) :type search_leaves: bool
''' :rtype: iter(tree nodes)
# compile tgrep_string if needed """
if isinstance(tgrep_string, (binary_type, text_type)):
tgrep_string = tgrep_compile(tgrep_string) if isinstance(pattern, (binary_type, text_type)):
# check if tree is iterable pattern = tgrep_compile(pattern)
tree_iter = None
if not _istree(tree): for tree in trees:
try: try:
tree_iter = iter(tree) if search_leaves:
except TypeError: positions = tree.treepositions()
pass else:
if tree_iter is not None: positions = treepositions_no_leaves(tree)
return [tgrep_nodes(t, tgrep_string, search_leaves) for t in tree_iter] except AttributeError:
else: yield []
# tree is not an iterable but a single tree
return [tree[position] for position in yield [tree[position] for position in positions
tgrep_positions(tree, tgrep_string, search_leaves)] if pattern(tree[position])]
# run module doctests # run module doctests
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment