Merge remote-tracking branch 'nltk/twitter' into twitter

e3d7ff88 · lrnzcig · 0c119885 · a1d30e00 · e3d7ff88 · e3d7ff88
Commit e3d7ff88 authored Apr 29, 2015 by lrnzcig
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 11 deletions

nltk/corpus/__init__.py
+1 -1

nltk/corpus/reader/bnc.py
+1 -1

nltk/corpus/reader/tweets.py
+16 -6

nltk/twitter/__init__.py
+8 -0

nltk/twitter/twitter_demo.py
+4 -3

No files found.
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
 treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
 tweets = LazyCorpusLoader(
-    'tweets', TwitterCorpusReader)
+    'twitter', TwitterCorpusReader, '.*\.json')
 udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
 udhr2 = LazyCorpusLoader(

--- a/nltk/corpus/reader/bnc.py
+++ b/nltk/corpus/reader/bnc.py
@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
    http://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
-    instantiate the reder as::
+    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')


--- a/nltk/corpus/reader/tweets.py
+++ b/nltk/corpus/reader/tweets.py
@@ -26,6 +26,21 @@ class TwitterCorpusReader(CorpusReader):

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.
+
+    Construct a new Tweet corpus reader for a set of documents
+    located at the given root directory.
+
+    If you made your own tweet collection in a directory called
+    `twitter-files`, then you can initialise the reader as::
+
+        >>> reader = TwitterCorpusReader('twitter-files', '.*\.json') # doctest: +SKIP
+
+    The recommended approach is to use this directory as the value of the
+    environment variable `TWITTER`, and then invoke the reader as::
+
+        >>> root = os.environ['TWITTER']
+        >>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
+
    """

    CorpusView = StreamBackedCorpusView
@@ -33,15 +48,10 @@ class TwitterCorpusReader(CorpusReader):
    The corpus view class used by this reader.
    """

-    def __init__(self, root, fileids,
+    def __init__(self, root,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
-        Construct a new Tweet corpus reader for a set of documents
-        located at the given root directory.  Example usage:
-
-            >>> root = os.environ['TWITTER']
-            >>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP

        :param root: The root directory for this corpus.


--- a/nltk/twitter/__init__.py
+++ b/nltk/twitter/__init__.py
@@ -13,6 +13,14 @@ This package contains classes for retrieving Tweet documents using the
 Twitter API.

 """
+try:
+    from twython import Twython, TwythonStreamer
+except ImportError as err:
+    import textwrap
+    MSG = """The NLTK twitterclient module requires the Twython package. See\
+    https://twython.readthedocs.org/ for installation instructions."""
+    err.msg = textwrap.fill(MSG)
+    raise

 from nltk.twitter.util import credsfromfile
 from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\

--- a/nltk/twitter/twitter_demo.py
+++ b/nltk/twitter/twitter_demo.py
@@ -192,8 +192,9 @@ def corpusreader_demo():

    """
    from nltk.corpus import TwitterCorpusReader
-    root = os.environ['TWITTER']
-    reader = TwitterCorpusReader(root, '1k_sample.json')
+    #root = os.environ['TWITTER']
+    #reader = TwitterCorpusReader(root, '1k_sample.json')
+    reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
    print()
    print("Complete tweet documents")
    print(SPACER)
@@ -214,7 +215,7 @@ def corpusreader_demo():


 ALL = range(12)
-DEMOS = ALL[9:10]
+DEMOS = ALL[11:]


 if __name__ == "__main__":