updates to tweets corpus reading

10a99af5 · Ewan Klein · fbbcf3a5 · 10a99af5 · 10a99af5 · 10a99af5
Commit 10a99af5 authored Apr 29, 2015 by Ewan Klein
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 20 deletions

nltk/corpus/reader/tweets.py
+1 -1

nltk/twitter/__init__.py
+0 -9

nltk/twitter/twitter_demo.py
+13 -5

nltk/twitter/twitterclient.py
+9 -5

No files found.
--- a/nltk/corpus/reader/tweets.py
+++ b/nltk/corpus/reader/tweets.py
@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
    The corpus view class used by this reader.
    """

-    def __init__(self, root,
+    def __init__(self, root, fileids = None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """

--- a/nltk/twitter/__init__.py
+++ b/nltk/twitter/__init__.py
@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
 Twitter API.

 """
-try:
-    from twython import Twython, TwythonStreamer
-except ImportError as err:
-    import textwrap
-    MSG = """The NLTK twitterclient module requires the Twython package. See\
-    https://twython.readthedocs.org/ for installation instructions."""
-    err.msg = textwrap.fill(MSG)
-    raise
-
 from nltk.twitter.util import credsfromfile
 from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
     TweetWriter
--- a/nltk/twitter/twitter_demo.py
+++ b/nltk/twitter/twitter_demo.py
@@ -191,29 +191,37 @@ def corpusreader_demo():
    * the result of tokenising the raw strings.

    """
-    from nltk.corpus import TwitterCorpusReader
+    #from nltk.corpus import TwitterCorpusReader
+    from nltk.corpus import tweets
+    tweets.fileids()
    #root = os.environ['TWITTER']
    #reader = TwitterCorpusReader(root, '1k_sample.json')
-    reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
+    #reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
    print()
    print("Complete tweet documents")
    print(SPACER)
-    for tweet in reader.docs()[:2]:
+    for tweet in tweets.docs()[:2]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
-    for text in reader.strings()[:15]:
+    for text in tweets.strings()[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
-    for text in reader.tokenized()[:15]:
+    for text in tweets.tokenized()[:15]:
        print(text)


+#def corpusreader_demo():
+    #from nltk.corpus import brown
+    #brown.words()
+
+
+
 ALL = range(12)
 DEMOS = ALL[11:]


--- a/nltk/twitter/twitterclient.py
+++ b/nltk/twitter/twitterclient.py
@@ -6,11 +6,14 @@
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-from twython.exceptions import TwythonRateLimitError
+

 """
-NLTK Twitter client.
+NLTK Twitter client

+This module offers methods for collecting and processing tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third-party Twython library.

 If one of the methods below returns an integer, it is probably a `Twitter
 error code <https://dev.twitter.com/overview/api/response-codes>`_. For
@@ -31,6 +34,7 @@ from nltk.compat import UTC

 try:
    from twython import Twython, TwythonStreamer
+    from twython.exceptions import TwythonRateLimitError
 except ImportError as err:
    import textwrap
    MSG = """The NLTK twitterclient module requires the Twython package. See\
@@ -204,10 +208,10 @@ class Query(Twython):
        results = self.search(q=keywords, count=min(100, count), lang=lang)
        count_from_query = results['search_metadata']['count']
        self.handler.handle_chunk(results['statuses'])
-        
+
        '''
        pagination loop: keep fetching tweets until the count requested is reached,
-        dealing with twitter rate limits 
+        dealing with twitter rate limits
        '''
        while count_from_query < count:
            max_id = results['search_metadata']['max_id']
@@ -217,7 +221,7 @@ class Query(Twython):
            except TwythonRateLimitError as e:
                print("Waiting for 15 minutes -{0}".format(e))
                time.sleep(15*60) # wait 15 minutes
-                continue           
+                continue
            count_from_query += results['search_metadata']['count']
            self.handler.handle_chunk(results['statuses'])