Commit e3d7ff88 by lrnzcig

Merge remote-tracking branch 'nltk/twitter' into twitter

parents 0c119885 a1d30e00
......@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
tweets = LazyCorpusLoader(
'tweets', TwitterCorpusReader)
'twitter', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader(
'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader(
......
......@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
instantiate the reder as::
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
......
......@@ -26,6 +26,21 @@ class TwitterCorpusReader(CorpusReader):
Individual Tweets can be tokenized using the default tokenizer, or by a
custom tokenizer specified as a parameter to the constructor.
Construct a new Tweet corpus reader for a set of documents
located at the given root directory.
If you made your own tweet collection in a directory called
`twitter-files`, then you can initialise the reader as::
>>> reader = TwitterCorpusReader('twitter-files', '.*\.json') # doctest: +SKIP
The recommended approach is to use this directory as the value of the
environment variable `TWITTER`, and then invoke the reader as::
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
"""
CorpusView = StreamBackedCorpusView
......@@ -33,15 +48,10 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader.
"""
def __init__(self, root, fileids,
def __init__(self, root,
word_tokenizer=TweetTokenizer(),
encoding='utf8'):
"""
Construct a new Tweet corpus reader for a set of documents
located at the given root directory. Example usage:
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
:param root: The root directory for this corpus.
......
......@@ -13,6 +13,14 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
# Import the Twython client classes; if the package is not installed, replace
# the ImportError's message with installation guidance and re-raise it.
try:
    from twython import Twython, TwythonStreamer
except ImportError as err:
    import textwrap
    # Adjacent string literals are concatenated with an explicit trailing
    # space. The previous triple-quoted string ended its first line with a
    # backslash, which removed the line break and glued "See" directly onto
    # the URL ("Seehttps://...").
    MSG = ("The NLTK twitterclient module requires the Twython package. See "
           "https://twython.readthedocs.org/ for installation instructions.")
    # Wrap the guidance to textwrap's default width before attaching it.
    err.msg = textwrap.fill(MSG)
    raise
from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
......
......@@ -192,8 +192,9 @@ def corpusreader_demo():
"""
from nltk.corpus import TwitterCorpusReader
root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '1k_sample.json')
#root = os.environ['TWITTER']
#reader = TwitterCorpusReader(root, '1k_sample.json')
reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print()
print("Complete tweet documents")
print(SPACER)
......@@ -214,7 +215,7 @@ def corpusreader_demo():
ALL = range(12)
DEMOS = ALL[9:10]
DEMOS = ALL[11:]
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment