Commit 6d74ec21 by Steven Bird

Add support to the default POS tagger for selecting a tagset, in support of nltk/nltk_book#133

parent 2c4d7627
...@@ -83,7 +83,7 @@ from nltk.data import load ...@@ -83,7 +83,7 @@ from nltk.data import load
# Standard treebank POS tagger # Standard treebank POS tagger
_POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle' _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
def pos_tag(tokens): def pos_tag(tokens, tagset=None):
""" """
Use NLTK's currently recommended part of speech tagger to Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens. tag the given list of tokens.
...@@ -101,6 +101,8 @@ def pos_tag(tokens): ...@@ -101,6 +101,8 @@ def pos_tag(tokens):
:rtype: list(tuple(str, str)) :rtype: list(tuple(str, str))
""" """
tagger = load(_POS_TAGGER) tagger = load(_POS_TAGGER)
if tagset:
return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
return tagger.tag(tokens) return tagger.tag(tokens)
def pos_tag_sents(sentences): def pos_tag_sents(sentences):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment