Commit aada675c by Steven Bird

moved tagging howto docs into docstrings

parent 7afca07a
@@ -9,9 +9,45 @@
"""
NLTK Taggers

This package contains classes and interfaces for part-of-speech
tagging, or simply "tagging". This task is defined by the ``TaggerI``
interface.

A "tag" is a case-sensitive string that specifies some property of a token,
such as its part of speech. Tagged tokens are encoded as tuples
``(token, tag)``. For example, the following tagged token combines
the word ``'fly'`` with a noun part of speech tag (``'NN'``):

>>> tagged_tok = ('fly', 'NN')

An off-the-shelf tagger is available; it uses the Penn Treebank tagset:

>>> from nltk import pos_tag, word_tokenize
>>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
('.', '.')]

This package defines several taggers, which take a token list (typically a
sentence), assign a tag to each token, and return the resulting list of
tagged tokens. Most of the taggers are built automatically based on a
training corpus. For example, the unigram tagger tags each word *w*
by checking what the most frequent tag for *w* was in a training corpus:

>>> from nltk.corpus import brown
>>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> tagger.tag(['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']) # doctest: +NORMALIZE_WHITESPACE
[('Mitchell', 'NP'), ('decried', None), ('the', 'AT'), ('high', 'JJ'),
('rate', 'NN'), ('of', 'IN'), ('unemployment', None)]

Note that words that the tagger has not seen during training receive a tag
of ``None``.
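
Unseen words can be handled by supplying a backoff tagger (a sketch that is
not part of the original example; ``DefaultTagger`` is used here, and the
expected output follows from the run above with each ``None`` replaced by
``'NN'``):

>>> from nltk.tag import DefaultTagger
>>> tagger2 = UnigramTagger(brown.tagged_sents(categories='news')[:500],
...                         backoff=DefaultTagger('NN'))
>>> tagger2.tag(['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']) # doctest: +NORMALIZE_WHITESPACE
[('Mitchell', 'NP'), ('decried', 'NN'), ('the', 'AT'), ('high', 'JJ'),
('rate', 'NN'), ('of', 'IN'), ('unemployment', 'NN')]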

We evaluate a tagger on data that was not seen during training:

>>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) # doctest: +ELLIPSIS
0.734...

For more information, please consult chapter 5 of the NLTK Book.
"""
from nltk.tag.api import TaggerI
@@ -45,6 +81,17 @@ def pos_tag(tokens):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.

>>> from nltk import pos_tag, word_tokenize
>>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
('.', '.')]

:param tokens: Sequence of tokens to be tagged
:type tokens: list(str)
:return: The tagged tokens
:rtype: list(tuple(str, str))
"""
tagger = load(_POS_TAGGER)
return tagger.tag(tokens)
......
@@ -8,7 +8,56 @@
# For license information, see LICENSE.TXT
"""
Brill Tagger

The Brill Tagger is a transformational rule-based tagger: it starts by
running an initial tagger, and then improves the tagging by applying a
list of transformation rules. These transformation rules are learned
automatically from the training corpus, based on one or more "rule
templates":

>>> from nltk.tag.brill import *
>>> templates = [
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,2)),
... SymmetricProximateTokensTemplate(ProximateTagsRule, (1,3)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
... SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
... ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
... ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
... ]
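
The names ``brown_train``, ``brown_test`` and ``unigram_tagger_2`` are not
defined in this excerpt. A minimal setup sketch (the 100-sentence held-out
split is an assumption chosen to match the 4523 training sentences reported
below; the choice of initial tagger is likewise an assumption):

>>> from nltk.corpus import brown
>>> from nltk.tag import UnigramTagger
>>> brown_news_tagged = brown.tagged_sents(categories='news')
>>> brown_train = brown_news_tagged[100:]  # assumed training set
>>> brown_test = brown_news_tagged[:100]   # assumed held-out test set
>>> unigram_tagger_2 = UnigramTagger(brown_train)  # assumed initial tagger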
>>> trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger_2,
... templates=templates, trace=3,
... deterministic=True)
>>> brill_tagger = trainer.train(brown_train, max_rules=10) # doctest: +NORMALIZE_WHITESPACE
Training Brill tagger on 4523 sentences...
Finding initial useful rules...
Found 75359 useful rules.
<BLANKLINE>
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
 354 354   0   3  | TO -> IN if the tag of the following word is 'AT'
 111 173  62   3  | NN -> VB if the tag of the preceding word is 'TO'
 110 110   0   4  | TO -> IN if the tag of the following word is 'NP'
  83 157  74   4  | NP -> NP-TL if the tag of the following word is
                  |   'NN-TL'
  73  77   4   0  | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
  71 116  45   3  | TO -> IN if the tag of words i+1...i+2 is 'NNS'
  65  65   0   3  | NN -> VB if the tag of the preceding word is 'MD'
  63  63   0   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVZ'
  59  62   3   2  | CS -> QL if the text of words i+1...i+3 is 'as'
  55  57   2   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVD'
>>> print 'Accuracy: %4.1f%%' % (
... 100.0 * brill_tagger.evaluate(brown_test))
Accuracy: 89.5%
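
The learned rules can be retrieved from the trained tagger; ``rules()``
returns them in the order they are applied (the count follows from
``max_rules=10`` above):

>>> len(brill_tagger.rules())
10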
"""
import bisect # for binary search through a subset of indices
......
@@ -65,6 +65,9 @@ This discussion assumes that the HMM has been trained. This is probably the
most difficult task with the model, and requires either MLE estimates of the
parameters or unsupervised learning using the Baum-Welch algorithm, a variant
of EM.

For more information, please consult the source code for this module,
which includes extensive demonstration code.
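
A supervised training sketch (an assumption; this excerpt does not show the
training API, but the module's ``HiddenMarkovModelTrainer`` class provides
both supervised training and unsupervised Baum-Welch training):

>>> from nltk.corpus import brown
>>> from nltk.tag.hmm import HiddenMarkovModelTrainer
>>> trainer = HiddenMarkovModelTrainer()
>>> hmm_tagger = trainer.train_supervised(brown.tagged_sents(categories='news')[:500])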
"""
import re
......
@@ -212,13 +212,21 @@ class ContextTagger(SequentialBackoffTagger):
class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
A tagger that assigns the same tag to every token.

>>> default_tagger = DefaultTagger('NN')
>>> default_tagger.tag('This is a test'.split())
[('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]

This tagger is recommended as a backoff tagger, in cases where
a more powerful tagger is unable to assign a tag to a word
(e.g. because the word was not seen during training).
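
For example (a sketch; the training data is an assumption):

>>> from nltk.corpus import brown
>>> from nltk.tag import UnigramTagger
>>> backed_off_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500],
...                                   backoff=DefaultTagger('NN'))
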
:param tag: The tag to assign to each token
:type tag: str
"""
yaml_tag = '!nltk.DefaultTagger'
def __init__(self, tag):
"""
Construct a new tagger that assigns *tag* to all tokens.
"""
self._tag = tag
SequentialBackoffTagger.__init__(self, None)
@@ -271,9 +279,34 @@ class NgramTagger(ContextTagger, yaml.YAMLObject):
class UnigramTagger(NgramTagger):
"""
Unigram Tagger

The UnigramTagger chooses a token's tag based on its word string:
it finds the most likely tag for each word in a training corpus,
and then uses that information to assign tags to new tokens.

>>> from nltk.corpus import brown
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> unigram_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'),
('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'),
('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'),
('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'),
('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'),
('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
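
We can evaluate this tagger against held-out data (the figure repeats the
evaluation shown in this package's module docstring for the same
500-sentence training set):

>>> unigram_tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) # doctest: +ELLIPSIS
0.734...
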
:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
"""
yaml_tag = '!nltk.UnigramTagger'
def __init__(self, train=None, model=None,
@@ -290,8 +323,18 @@ class BigramTagger(NgramTagger):
A tagger that chooses a token's tag based on its word string and on
the preceding word's tag. In particular, a tuple consisting
of the previous tag and the word is looked up in a table, and
the corresponding tag is returned.

:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
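
A training sketch (an assumption; any tagged corpus will do, and a backoff
tagger is usually supplied in practice):

>>> from nltk.corpus import brown
>>> bigram_tagger = BigramTagger(brown.tagged_sents(categories='news')[:500])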
"""
yaml_tag = '!nltk.BigramTagger'
@@ -306,8 +349,18 @@ class TrigramTagger(NgramTagger):
A tagger that chooses a token's tag based on its word string and on
the preceding two words' tags. In particular, a tuple consisting
of the previous two tags and the word is looked up in a table, and
the corresponding tag is returned.

:param train: The corpus of training data, a list of tagged sentences
:type train: list(list(tuple(str, str)))
:param model: The tagger model
:type model: dict
:param backoff: Another tagger which this tagger will consult when it is
unable to tag a word
:type backoff: TaggerI
:param cutoff: The number of instances of training data the tagger must see
in order not to use the backoff tagger
:type cutoff: int
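
Taggers are commonly chained via their backoff arguments (a sketch; the
particular chain shown here is an assumption, not part of this excerpt):

>>> from nltk.corpus import brown
>>> from nltk.tag import DefaultTagger, UnigramTagger
>>> train_sents = brown.tagged_sents(categories='news')[:500]
>>> t0 = DefaultTagger('NN')
>>> t1 = UnigramTagger(train_sents, backoff=t0)
>>> t2 = TrigramTagger(train_sents, backoff=t1)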
"""
yaml_tag = '!nltk.TrigramTagger'
@@ -363,10 +416,33 @@ class AffixTagger(ContextTagger, yaml.YAMLObject):
class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
Regular Expression Tagger

The RegexpTagger assigns tags to tokens by comparing their
word strings to a series of regular expressions. The regexps are tried
in order, and the tag of the first one that matches is assigned, so the
catch-all pattern should come last. The following tagger uses word
suffixes to make guesses about the correct Brown Corpus part of
speech tag:

>>> from nltk.corpus import brown
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
... (r'.*ly$', 'RB'), # adverbs
... (r'.*s$', 'NNS'), # plural nouns
... (r'.*ing$', 'VBG'), # gerunds
... (r'.*ed$', 'VBD'), # past tense verbs
... (r'.*', 'NN') # nouns (default)
... ])
>>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
('place', 'NN'), ('.', 'NN')]

:type regexps: list(tuple(str, str))
:param regexps: A list of ``(regexp, tag)`` pairs, each of
......
@@ -12,6 +12,9 @@ def str2tuple(s, sep='/'):
corresponding tuple representation. The rightmost occurrence of
*sep* in *s* will be used to divide *s* into a word string and
a tag string. If *sep* does not occur in *s*, return (s, None).

>>> str2tuple('fly/NN')
('fly', 'NN')
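
Because the rightmost separator is used, word strings that contain the
separator survive, and an alternative separator may be supplied (both
behaviours follow from the description above):

>>> str2tuple('1/2/CD')
('1/2', 'CD')
>>> str2tuple('fly|NN', sep='|')
('fly', 'NN')
>>> str2tuple('fly')
('fly', None)
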
:type s: str
:param s: The string representation of a tagged token.
@@ -33,6 +36,10 @@ def tuple2str(tagged_token, sep='/'):
separator, followed by the token's tag. (If the tag is None,
then just return the bare word string.)

>>> tagged_token = ('fly', 'NN')
>>> tuple2str(tagged_token)
'fly/NN'
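
The None-tag case (follows from the parenthetical above):

>>> tuple2str(('fly', None))
'fly'
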
:type tagged_token: tuple(str, str)
:param tagged_token: The tuple representation of a tagged token.
:type sep: str
......