Commit 10a99af5 by Ewan Klein

updates to tweets corpus reading

parent fbbcf3a5
......@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader.
"""
def __init__(self, root,
def __init__(self, root, fileids = None,
word_tokenizer=TweetTokenizer(),
encoding='utf8'):
"""
......
......@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
try:
from twython import Twython, TwythonStreamer
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
https://twython.readthedocs.org/ for installation instructions."""
err.msg = textwrap.fill(MSG)
raise
from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
TweetWriter
......@@ -191,29 +191,37 @@ def corpusreader_demo():
* the result of tokenising the raw strings.
"""
from nltk.corpus import TwitterCorpusReader
#from nltk.corpus import TwitterCorpusReader
from nltk.corpus import tweets
tweets.fileids()
#root = os.environ['TWITTER']
#reader = TwitterCorpusReader(root, '1k_sample.json')
reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
#reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print()
print("Complete tweet documents")
print(SPACER)
for tweet in reader.docs()[:2]:
for tweet in tweets.docs()[:2]:
print(json.dumps(tweet, indent=1, sort_keys=True))
print()
print("Raw tweet strings:")
print(SPACER)
for text in reader.strings()[:15]:
for text in tweets.strings()[:15]:
print(text)
print()
print("Tokenized tweet strings:")
print(SPACER)
for text in reader.tokenized()[:15]:
for text in tweets.tokenized()[:15]:
print(text)
#def corpusreader_demo():
#from nltk.corpus import brown
#brown.words()
ALL = range(12)
DEMOS = ALL[11:]
......
......@@ -6,11 +6,14 @@
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from twython.exceptions import TwythonRateLimitError
"""
NLTK Twitter client.
NLTK Twitter client
This module offers methods for collecting and processing tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.
If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
......@@ -31,6 +34,7 @@ from nltk.compat import UTC
try:
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonRateLimitError
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
......@@ -204,10 +208,10 @@ class Query(Twython):
results = self.search(q=keywords, count=min(100, count), lang=lang)
count_from_query = results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
'''
pagination loop: keep fetching tweets until the count requested is reached,
dealing with twitter rate limits
dealing with twitter rate limits
'''
while count_from_query < count:
max_id = results['search_metadata']['max_id']
......@@ -217,7 +221,7 @@ class Query(Twython):
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
time.sleep(15*60) # wait 15 minutes
continue
continue
count_from_query += results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment