Add support to the default POS tagger for selecting a tagset, in support of nltk/nltk_book#133

6d74ec21 · Steven Bird · 2c4d7627 · 6d74ec21
Commit 6d74ec21 authored May 06, 2015 by Steven Bird
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 1 deletion

nltk/tag/__init__.py
+3 -1

No files found.
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -83,7 +83,7 @@ from nltk.data import load
 # Standard treebank POS tagger
 _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
-def pos_tag(tokens):
+def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
@@ -101,6 +101,8 @@ def pos_tag(tokens):
    :rtype: list(tuple(str, str))
    """
    tagger = load(_POS_TAGGER)
+    if tagset:
+        return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
    return tagger.tag(tokens)
 def pos_tag_sents(sentences):