fixed doctests

2d4d9050 · Steven Bird · aada675c · 2d4d9050
Commit 2d4d9050 authored Nov 10, 2011 by Steven Bird
Hide whitespace changes
Inline Side-by-side

Showing with 25 additions and 18 deletions

nltk/tag/brill.py
+25 -18

No files found.
--- a/nltk/tag/brill.py
+++ b/nltk/tag/brill.py
@@ -16,7 +16,11 @@ improves the tagging by applying a list of transformation rules.
 These transformation rules are automatically learned from the training
 corpus, based on one or more "rule templates."
 
-    >>> from nltk.tag.brill import *
+    >>> from nltk.corpus import brown
+    >>> from nltk.tag import UnigramTagger
+    >>> brown_train = list(brown.tagged_sents(categories='news')[:500])
+    >>> brown_test = list(brown.tagged_sents(categories='news')[500:600])
+    >>> unigram_tagger = UnigramTagger(brown_train)
    >>> templates = [
    ...     SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)),
    ...     SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)),
@@ -29,13 +33,13 @@ corpus, based on one or more "rule templates."
    ...     ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
    ...     ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
    ...     ]
-    >>> trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger_2,
+    >>> trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger,
    ...                                  templates=templates, trace=3,
    ...                                  deterministic=True)
-    >>> brill_tagger = trainer.train(brown_train, max_rules=10)  # doctest: +NORMALIZE_WHITESPACE
+    >>> brill_tagger = trainer.train(brown_train, max_rules=10)
-    Training Brill tagger on 4523 sentences...
+    Training Brill tagger on 500 sentences...
    Finding initial useful rules...
-        Found 75359 useful rules.
+        Found 10210 useful rules.
    <BLANKLINE>
               B      |     
       S   F   r   O  |        Score = Fixed - Broken
@@ -44,20 +48,23 @@ corpus, based on one or more "rule templates."
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
-     354 354   0   3  | TO -> IN if the tag of the following word is 'AT'
+      46  46   0   0  | TO -> IN if the tag of the following word is 'AT'
-     111 173  62   3  | NN -> VB if the tag of the preceding word is 'TO'
+      18  20   2   0  | TO -> IN if the tag of words i+1...i+3 is 'CD'
-     110 110   0   4  | TO -> IN if the tag of the following word is 'NP'
+      14  14   0   0  | IN -> IN-TL if the tag of the preceding word is
-      83 157  74   4  | NP -> NP-TL if the tag of the following word is
+                      |   'NN-TL', and the tag of the following word is
                      |   'NN-TL'
-      73  77   4   0  | VBD -> VBN if the tag of words i-2...i-1 is 'BEDZ'
+      11  11   0   1  | TO -> IN if the tag of the following word is 'NNS'
-      71 116  45   3  | TO -> IN if the tag of words i+1...i+2 is 'NNS'
+      10  10   0   0  | TO -> IN if the tag of the following word is 'JJ'
-      65  65   0   3  | NN -> VB if the tag of the preceding word is 'MD'
+       8   8   0   0  | , -> ,-HL if the tag of the preceding word is 'NP-
-      63  63   0   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVZ'
+                      |   HL'
-      59  62   3   2  | CS -> QL if the text of words i+1...i+3 is 'as'
+       7   7   0   1  | NN -> VB if the tag of the preceding word is 'MD'
-      55  57   2   0  | VBD -> VBN if the tag of words i-3...i-1 is 'HVD'
+       7  13   6   0  | NN -> VB if the tag of the preceding word is 'TO'
-    >>> print 'Accuracy: %4.1f%%' % (
+       7   7   0   0  | NP-TL -> NP if the tag of words i+1...i+2 is 'NNS'
-    ...     100.0 * brill_tagger.evaluate(brown_test))
+       7   7   0   0  | VBN -> VBD if the tag of the preceding word is
-    Accuracy: 89.5%
+                      |   'NP'
+    >>> brill_tagger.evaluate(brown_test) # doctest: +ELLIPSIS
+    0.742...
+
 """
 
 import bisect        # for binary search through a subset of indices