Commit e3d7ff88 by lrnzcig

Merge remote-tracking branch 'nltk/twitter' into twitter

parents 0c119885 a1d30e00
...@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader( ...@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
treebank_raw = LazyCorpusLoader( treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2') 'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
tweets = LazyCorpusLoader( tweets = LazyCorpusLoader(
'tweets', TwitterCorpusReader) 'twitter', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader( udhr = LazyCorpusLoader(
'udhr', UdhrCorpusReader) 'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader( udhr2 = LazyCorpusLoader(
......
...@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader): ...@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
http://www.ota.ox.ac.uk/desc/2554 http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can If you extracted the archive to a directory called `BNC`, then you can
instantiate the reder as:: instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml') BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
......
...@@ -26,6 +26,21 @@ class TwitterCorpusReader(CorpusReader): ...@@ -26,6 +26,21 @@ class TwitterCorpusReader(CorpusReader):
Individual Tweets can be tokenized using the default tokenizer, or by a Individual Tweets can be tokenized using the default tokenizer, or by a
custom tokenizer specified as a parameter to the constructor. custom tokenizer specified as a parameter to the constructor.
Construct a new Tweet corpus reader for a set of documents
located at the given root directory.
If you made your own tweet collection in a directory called
`twitter-files`, then you can initialise the reader as::
>>> reader = TwitterCorpusReader('twitter-files', '.*\.json') # doctest: +SKIP
The recommended approach is to use this directory as the value of the
environment variable `TWITTER`, and then invoke the reader as::
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
""" """
CorpusView = StreamBackedCorpusView CorpusView = StreamBackedCorpusView
...@@ -33,15 +48,10 @@ class TwitterCorpusReader(CorpusReader): ...@@ -33,15 +48,10 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader. The corpus view class used by this reader.
""" """
def __init__(self, root, fileids, def __init__(self, root,
word_tokenizer=TweetTokenizer(), word_tokenizer=TweetTokenizer(),
encoding='utf8'): encoding='utf8'):
""" """
Construct a new Tweet corpus reader for a set of documents
located at the given root directory. Example usage:
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
:param root: The root directory for this corpus. :param root: The root directory for this corpus.
......
...@@ -13,6 +13,14 @@ This package contains classes for retrieving Tweet documents using the ...@@ -13,6 +13,14 @@ This package contains classes for retrieving Tweet documents using the
Twitter API. Twitter API.
""" """
try:
    from twython import Twython, TwythonStreamer
except ImportError as err:
    # Twython is a third-party package: attach a readable installation hint
    # to the exception, then re-raise the same ImportError unchanged in type.
    import textwrap
    # Adjacent literals are joined with one explicit space. The previous
    # backslash continuation inside a triple-quoted string did not leave a
    # clean single space between "See" and the URL.
    MSG = ("The NLTK twitterclient module requires the Twython package. See "
           "https://twython.readthedocs.org/ for installation instructions.")
    err.msg = textwrap.fill(MSG)
    raise
from nltk.twitter.util import credsfromfile from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\ from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
......
...@@ -192,8 +192,9 @@ def corpusreader_demo(): ...@@ -192,8 +192,9 @@ def corpusreader_demo():
""" """
from nltk.corpus import TwitterCorpusReader from nltk.corpus import TwitterCorpusReader
root = os.environ['TWITTER'] #root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '1k_sample.json') #reader = TwitterCorpusReader(root, '1k_sample.json')
reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print() print()
print("Complete tweet documents") print("Complete tweet documents")
print(SPACER) print(SPACER)
...@@ -214,7 +215,7 @@ def corpusreader_demo(): ...@@ -214,7 +215,7 @@ def corpusreader_demo():
ALL = range(12) ALL = range(12)
DEMOS = ALL[9:10] DEMOS = ALL[11:]
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment