Commit aada675c by Steven Bird

moved tagging howto docs into docstrings

parent 7afca07a
@@ -9,9 +9,45 @@
"""
NLTK Taggers

This package contains classes and interfaces for part-of-speech
tagging, or simply "tagging". This task is defined by the ``TaggerI``
interface.

A "tag" is a case-sensitive string that specifies some property of a token,
such as its part of speech. Tagged tokens are encoded as tuples
``(token, tag)``. For example, the following tagged token combines
the word ``'fly'`` with a noun part of speech tag (``'NN'``):

>>> tagged_tok = ('fly', 'NN')

An off-the-shelf tagger is available; it uses the Penn Treebank tagset:

>>> from nltk import pos_tag, word_tokenize
>>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
('.', '.')]

This package defines several taggers, which take a token list (typically a
sentence), assign a tag to each token, and return the resulting list of
tagged tokens. Most of the taggers are built automatically based on a
training corpus. For example, the unigram tagger tags each word *w*
by checking what the most frequent tag for *w* was in a training corpus:

>>> from nltk.corpus import brown
>>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> tagger.tag(['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']) # doctest: +NORMALIZE_WHITESPACE
[('Mitchell', 'NP'), ('decried', None), ('the', 'AT'), ('high', 'JJ'),
('rate', 'NN'), ('of', 'IN'), ('unemployment', None)]

Note that words that the tagger has not seen during training receive a tag
of ``None``.
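
Unseen words can be handled by supplying a backoff tagger (a sketch that is
not part of the original example; ``DefaultTagger`` is used here, and the
expected output follows from the run above with each ``None`` replaced by
``'NN'``):

>>> from nltk.tag import DefaultTagger
>>> tagger2 = UnigramTagger(brown.tagged_sents(categories='news')[:500],
...                         backoff=DefaultTagger('NN'))
>>> tagger2.tag(['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']) # doctest: +NORMALIZE_WHITESPACE
[('Mitchell', 'NP'), ('decried', 'NN'), ('the', 'AT'), ('high', 'JJ'),
('rate', 'NN'), ('of', 'IN'), ('unemployment', 'NN')]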

We evaluate a tagger on data that was not seen during training:

>>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) # doctest: +ELLIPSIS
0.734...

For more information, please consult chapter 5 of the NLTK Book.
"""
from nltk.tag.api import TaggerI
@@ -45,6 +81,17 @@ def pos_tag(tokens):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.

>>> from nltk import pos_tag, word_tokenize
>>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
('.', '.')]

:param tokens: Sequence of tokens to be tagged
:type tokens: list(str)
:return: The tagged tokens
:rtype: list(tuple(str, str))
"""
tagger = load(_POS_TAGGER)
return tagger.tag(tokens)
......
@@ -8,7 +8,56 @@
# For license information, see LICENSE.TXT
"""
Brill Tagger

The Brill Tagger is a transformational rule-based tagger: it starts by
running an initial tagger, and then improves the tagging by applying a
list of transformation rules. These transformation rules are learned
automatically from the training corpus, based on one or more "rule
templates":

>>> from nltk.tag.brill import *
>>> templates = [
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,2)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,3)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
... ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
... ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
... ]
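
The names ``brown_train``, ``brown_test`` and ``unigram_tagger_2`` are not
defined in this excerpt. A minimal setup sketch (the 100-sentence held-out
split is an assumption chosen to match the 4523 training sentences reported
below; the choice of initial tagger is likewise an assumption):

>>> from nltk.corpus import brown
>>> from nltk.tag import UnigramTagger
>>> brown_news_tagged = brown.tagged_sents(categories='news')
>>> brown_train = brown_news_tagged[100:]  # assumed training set
>>> brown_test = brown_news_tagged[:100]   # assumed held-out test set
>>> unigram_tagger_2 = UnigramTagger(brown_train)  # assumed initial tagger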
>>> trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger_2,
... templates=templates, trace=3,
... deterministic=True)
>>> brill_tagger = trainer.train(brown_train, max_rules=10) # doctest: +NORMALIZE_WHITESPACE
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75359 useful rules.
<BLANKLINE>
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
 354 354   0   3  | TO -> IN if the tag of the following word is 'AT'
 111 173  62   3  | NN -> VB if the tag of the preceding word is 'TO'
 110 110   0   4  | TO -> IN if the tag of the following word is 'NP'
  83 157  74   4  | NP -> NP-TL if the tag of the following word is
                  |   'NN-TL'
  73  77   4   0  | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
  71 116  45   3  | TO -> IN if the tag of words i+1...i+2 is 'NNS'
  65  65   0   3  | NN -> VB if the tag of the preceding word is 'MD'
  63  63   0   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVZ'
  59  62   3   2  | CS -> QL if the text of words i+1...i+3 is 'as'
  55  57   2   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVD'
>>> print 'Accuracy: %4.1f%%' % (
... 100.0 * brill_tagger.evaluate(brown_test))
Accuracy: 89.5%
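
The learned rules can be retrieved from the trained tagger; ``rules()``
returns them in the order they are applied (the count follows from
``max_rules=10`` above):

>>> len(brill_tagger.rules())
10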
"""
import bisect # for binary search through a subset of indices
......
@@ -65,6 +65,9 @@ This discussion assumes that the HMM has been trained. This is probably the
most difficult task with the model, and requires either MLE estimates of the
parameters or unsupervised learning using the Baum-Welch algorithm, a variant
of EM.

For more information, please consult the source code for this module,
which includes extensive demonstration code.
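
A supervised training sketch (an assumption; this excerpt does not show the
training API, but the module's ``HiddenMarkovModelTrainer`` class provides
both supervised training and unsupervised Baum-Welch training):

>>> from nltk.corpus import brown
>>> from nltk.tag.hmm import HiddenMarkovModelTrainer
>>> trainer = HiddenMarkovModelTrainer()
>>> hmm_tagger = trainer.train_supervised(brown.tagged_sents(categories='news')[:500])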
"""
import re
......
@@ -212,13 +212,21 @@ class ContextTagger(SequentialBackoffTagger):
class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
A tagger that assigns the same tag to every token.

>>> default_tagger = DefaultTagger('NN')
>>> default_tagger.tag('This is a test'.split())
[('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]

This tagger is recommended as a backoff tagger, in cases where
a more powerful tagger is unable to assign a tag to a word
(e.g. because the word was not seen during training).
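
For example (a sketch; the training data is an assumption):

>>> from nltk.corpus import brown
>>> from nltk.tag import UnigramTagger
>>> backed_off_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500],
...                                   backoff=DefaultTagger('NN'))
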
:param tag: The tag to assign to each token
:type tag: str
"""
yaml_tag = '!nltk.DefaultTagger'
def __init__(self, tag):
"""
Construct a new tagger that assigns *tag* to all tokens.
"""
self._tag = tag
SequentialBackoffTagger.__init__(self, None)
@@ -271,9 +279,34 @@ class NgramTagger(ContextTagger, yaml.YAMLObject):
class UnigramTagger(NgramTagger):
"""
Unigram Tagger

The UnigramTagger chooses a token's tag based on its word string:
it finds the most likely tag for each word in a training corpus,
and then uses that information to assign tags to new tokens.

>>> from nltk.corpus import brown
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> unigram_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'),
('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'),
('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'),
('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'),
('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'),
('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
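
We can evaluate this tagger against held-out data (the figure repeats the
evaluation shown in this package's module docstring for the same
500-sentence training set):

>>> unigram_tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) # doctest: +ELLIPSIS
0.734...
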
:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
"""
yaml_tag = '!nltk.UnigramTagger'
def __init__(self, train=None, model=None,
@@ -290,8 +323,18 @@ class BigramTagger(NgramTagger):
A tagger that chooses a token's tag based on its word string and on
the preceding word's tag. In particular, a tuple consisting
of the previous tag and the word is looked up in a table, and
the corresponding tag is returned.

:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
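
A training sketch (an assumption; any tagged corpus will do, and a backoff
tagger is usually supplied in practice):

>>> from nltk.corpus import brown
>>> bigram_tagger = BigramTagger(brown.tagged_sents(categories='news')[:500])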
"""
yaml_tag = '!nltk.BigramTagger'
@@ -306,8 +349,18 @@ class TrigramTagger(NgramTagger):
A tagger that chooses a token's tag based on its word string and on
the preceding two words' tags. In particular, a tuple consisting
of the previous two tags and the word is looked up in a table, and
the corresponding tag is returned.

:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
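
Taggers are commonly chained via their backoff arguments (a sketch; the
particular chain shown here is an assumption, not part of this excerpt):

>>> from nltk.corpus import brown
>>> from nltk.tag import DefaultTagger, UnigramTagger
>>> train_sents = brown.tagged_sents(categories='news')[:500]
>>> t0 = DefaultTagger('NN')
>>> t1 = UnigramTagger(train_sents, backoff=t0)
>>> t2 = TrigramTagger(train_sents, backoff=t1)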
"""
yaml_tag = '!nltk.TrigramTagger'
@@ -363,10 +416,33 @@ class AffixTagger(ContextTagger, yaml.YAMLObject):
class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
Regular Expression Tagger

The RegexpTagger assigns tags to tokens by comparing their
word strings to a series of regular expressions. The regexps are tried
in order, and the tag of the first one that matches is assigned, so the
catch-all pattern should come last. The following tagger uses word
suffixes to make guesses about the correct Brown Corpus part of
speech tag:

>>> from nltk.corpus import brown
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
... (r'.*ly$', 'RB'), # adverbs
... (r'.*s$', 'NNS'), # plural nouns
... (r'.*ing$', 'VBG'), # gerunds
... (r'.*ed$', 'VBD'), # past tense verbs
... (r'.*', 'NN') # nouns (default)
... ])
>>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
('place', 'NN'), ('.', 'NN')]

:type regexps: list(tuple(str, str))
:param regexps: A list of ``(regexp, tag)`` pairs, each of
......
@@ -12,6 +12,9 @@ def str2tuple(s, sep='/'):
corresponding tuple representation. The rightmost occurrence of
*sep* in *s* will be used to divide *s* into a word string and
a tag string. If *sep* does not occur in *s*, return (s, None).

>>> str2tuple('fly/NN')
('fly', 'NN')
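
Because the rightmost separator is used, word strings that contain the
separator survive, and an alternative separator may be supplied (both
behaviours follow from the description above):

>>> str2tuple('1/2/CD')
('1/2', 'CD')
>>> str2tuple('fly|NN', sep='|')
('fly', 'NN')
>>> str2tuple('fly')
('fly', None)
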
:type s: str
:param s: The string representation of a tagged token.
@@ -33,6 +36,10 @@ def tuple2str(tagged_token, sep='/'):
separator, followed by the token's tag. (If the tag is None,
then just return the bare word string.)

>>> tagged_token = ('fly', 'NN')
>>> tuple2str(tagged_token)
'fly/NN'
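
The None-tag case (follows from the parenthetical above):

>>> tuple2str(('fly', None))
'fly'
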
:type tagged_token: tuple(str, str)
:param tagged_token: The tuple representation of a tagged token.
:type sep: str
......