Commit ffc789a2 by Ewan Klein

Merge branch 'twitter' into sentiment

parents 2b9c9449 d4717752
@@ -94,6 +94,17 @@ try:
except ImportError:
    pass
+# Override missing methods on environments where it cannot be used like GAE.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+    def _fake_PIPE(*args, **kwargs):
+        raise NotImplementedError('subprocess.PIPE is not supported.')
+    subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+    def _fake_Popen(*args, **kwargs):
+        raise NotImplementedError('subprocess.Popen is not supported.')
+    subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
...
@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
tweets = LazyCorpusLoader(
-    'tweets', TwitterCorpusReader)
+    'twitter', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader(
...
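With this change the `tweets` loader points at a corpus directory named `twitter` containing JSON files. A minimal usage sketch, assuming such a collection has been installed under one of the nltk_data locations (editorial illustration, not part of the diff):

    from nltk.corpus import tweets

    # strings() yields the text of each tweet; docs() yields the full
    # JSON-decoded tweet objects.
    for text in tweets.strings()[:5]:
        print(text)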
@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
    http://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
-    instantiate the reder as::
+    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
...
@@ -26,6 +26,22 @@ class TwitterCorpusReader(CorpusReader):
    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.
+
+    Construct a new Tweet corpus reader for a set of documents
+    located at the given root directory.
+
+    If you made your own tweet collection in a directory called
+    `twitter-files`, then you can initialise the reader as::
+
+        from nltk.corpus import TwitterCorpusReader
+        reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids='.*\.json')
+
+    However, the recommended approach is to use this directory as the value of
+    the environment variable `TWITTER`, and then invoke the reader as::
+
+        root = os.environ['TWITTER']
+        reader = TwitterCorpusReader(root, '.*\.json')
    """

    CorpusView = StreamBackedCorpusView
@@ -33,15 +49,10 @@ class TwitterCorpusReader(CorpusReader):
    The corpus view class used by this reader.
    """

-    def __init__(self, root, fileids,
+    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
-        Construct a new Tweet corpus reader for a set of documents
-        located at the given root directory. Example usage:
-
-        >>> root = os.environ['TWITTER']
-        >>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP

        :param root: The root directory for this corpus.
...
@@ -73,7 +73,7 @@ path = []
# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
-if os.path.expanduser('~/') != '~/':
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser(str('~/nltk_data')))

if sys.platform.startswith('win'):
...
@@ -924,6 +924,10 @@ class Downloader(object):
        permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
        ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
        """
+        # Check if we are on GAE where we cannot write into filesystem.
+        if 'APPENGINE_RUNTIME' in os.environ:
+            return
+
        # Check if we have sufficient permissions to install in a
        # variety of system-wide locations.
        for nltkdir in nltk.data.path:
@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
    downloader.download(download_dir=options.dir,
                        quiet=options.quiet, force=options.force,
                        halt_on_error=options.halt_on_error)
@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
        if searchpath:
            msg += '\n\n Searched in:'
            msg += ''.join('\n - %s' % d for d in searchpath)
-        if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
+        if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
                        (filename, url))
        div = '='*75
        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
...
@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
-from nltk.tag.stanford import StanfordTagger
+from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
...
# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford NER-tagger
+# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
@@ -9,6 +9,12 @@
"""
A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from http://nlp.stanford.edu/software
+and the STANFORD_MODELS environment variable set (a colon-separated
+list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
    _SEPARATOR = ''
    _JAR = ''

-    def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
-                          'instantiated directly. Did you mean POS- or NERTagger?')
+                          'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)
-        self._stanford_model = find_file(path_to_model,
+        self._stanford_model = find_file(model_filename,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        self._encoding = encoding
        self.java_options = java_options
@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
            tagged_sentences.append(sentence)
        return tagged_sentences

-class POSTagger(StanfordTagger):
+class StanfordPOSTagger(StanfordTagger):
    """
    A class for pos tagging with Stanford Tagger. The input is the paths to:
    - a model trained on training data
@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
    Example:

-    >>> from nltk.tag.stanford import POSTagger
-    >>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
-    ...                '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
+    >>> from nltk.tag import StanfordPOSTagger
+    >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
    >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """
@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
    _JAR = 'stanford-postagger.jar'

    def __init__(self, *args, **kwargs):
-        super(POSTagger, self).__init__(*args, **kwargs)
+        super(StanfordPOSTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
                '-model', self._stanford_model, '-textFile',
                self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']

-class NERTagger(StanfordTagger):
+class StanfordNERTagger(StanfordTagger):
    """
-    A class for ner tagging with Stanford Tagger. The input is the paths to:
+    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified here,
@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
    Example:

-    >>> from nltk.tag.stanford import NERTagger
-    >>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
-    ...                '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
+    >>> from nltk.tag import StanfordNERTagger
+    >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
    >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
    [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
    ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
    _FORMAT = 'slashTags'

    def __init__(self, *args, **kwargs):
-        super(NERTagger, self).__init__(*args, **kwargs)
+        super(StanfordNERTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
    def parse_output(self, text):
        if self._FORMAT == 'slashTags':
-            return super(NERTagger, self).parse_output(text)
+            return super(StanfordNERTagger, self).parse_output(text)
        raise NotImplementedError
...
(One file's diff is collapsed and not shown here.)
@@ -13,7 +13,6 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""

from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
     TweetWriter
@@ -72,4 +72,6 @@ class TweetHandlerI(object):
        (default implementation should be enough in most cases)
        """
        for item in data_chunk:
-            self.handle(item)
+            if self.handle(item) == False:
+                return False
+        return True
@@ -47,7 +47,7 @@ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800'] # UserIDs co
HYDRATED = os.path.join(TWITTER, 'rehydrated.json')
DATE = (2015, 4, 20, 16, 40)

-# demo 0
+
@verbose
def twitterclass_demo():
    """
@@ -62,7 +62,7 @@ def twitterclass_demo():
    tw = Twitter()
    tw.tweets(follow=['759251', '6017542'], stream=True, limit=10) #public stream

-# demo 1
+
@verbose
def sampletoscreen_demo(limit=20):
    """
@@ -73,7 +73,7 @@ def sampletoscreen_demo(limit=20):
    client.register(TweetViewer(limit=limit))
    client.sample()

-# demo 2
+
@verbose
def tracktoscreen_demo(track="taylor swift", limit=10):
    """
@@ -84,7 +84,7 @@ def tracktoscreen_demo(track="taylor swift", limit=10):
    client.register(TweetViewer(limit=limit))
    client.filter(track=track)

-# demo 3
+
@verbose
def search_demo(keywords='nltk'):
    """
@@ -95,7 +95,7 @@ def search_demo(keywords='nltk'):
    for tweet in client.search_tweets(keywords=keywords, count=10):
        print(tweet['text'])

-# demo 4
+
@verbose
def tweets_by_user_demo(user='NLTK_org', count=200):
    oauth = credsfromfile()
@@ -103,7 +103,7 @@ def tweets_by_user_demo(user='NLTK_org', count=200):
    client.register(TweetWriter())
    client.user_tweets(user, count)

-# demo 5
+
@verbose
def lookup_by_userid_demo():
    """
@@ -118,7 +118,7 @@ def lookup_by_userid_demo():
        following = info['friends_count']
        print("{0}, followers: {1}, following: {2}".format(name, followers, following))

-# demo 6
+
@verbose
def followtoscreen_demo(limit=10):
    """
@@ -133,7 +133,7 @@ def followtoscreen_demo(limit=10):
    client.register(TweetViewer(limit=limit))
    client.statuses.filter(follow=USERIDS)

-# demo 7
+
@verbose
def streamtofile_demo(limit=20):
    """
@@ -144,7 +144,7 @@ def streamtofile_demo(limit=20):
    client.register(TweetWriter(limit=limit, repeat=False))
    client.statuses.sample()

-# demo 8
+
@verbose
def limit_by_time_demo(limit=20, date_limit=DATE):
    """
@@ -155,9 +155,9 @@ def limit_by_time_demo(limit=20, date_limit=DATE):
    client.register(TweetWriter(limit=limit, date_limit=date_limit))
    client.sample()

-# demo 9
+
@verbose
-def extract_tweetids_demo(infile, outfile):
+def extract_tweetids_demo(infile=TWEETS, outfile=IDS):
    """
    Given a list of full tweets in a file (``infile``), write just the
    tweetIDs to a new file (`outfile`)
@@ -166,9 +166,9 @@ def extract_tweetids_demo(infile, outfile):
    json2csv(infile, outfile, FIELDS)
    print("Writing ids to {0}".format(outfile))

-# demo 10
+
@verbose
-def expand_tweetids_demo(infile, outfile):
+def expand_tweetids_demo(infile=IDS, outfile=HYDRATED):
    """
    Given a list of tweetIDs in a file (``infile``), try to recover the full
    ('hydrated') tweets from the REST API and write the results to a new file (`outfile`).
@@ -180,7 +180,7 @@ def expand_tweetids_demo(infile, outfile):
    client = Query(**oauth)
    client.lookup(infile, outfile)

-# demo 11
+
@verbose
def corpusreader_demo():
    """
@@ -191,58 +191,41 @@ def corpusreader_demo():
    * the result of tokenising the raw strings.
    """
-    from nltk.corpus import TwitterCorpusReader
-    root = os.environ['TWITTER']
-    reader = TwitterCorpusReader(root, '1k_sample.json')
+    from nltk.corpus import tweets
+    #reader = TwitterCorpusReader(root, '1k_sample.json')
+    #reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')

    print()
    print("Complete tweet documents")
    print(SPACER)
-    for tweet in reader.docs()[:2]:
+    for tweet in tweets.docs()[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
-    for text in reader.strings()[:15]:
+    for text in tweets.strings()[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
-    for text in reader.tokenized()[:15]:
-        print(text)
+    for toks in tweets.tokenized()[:15]:
+        print(toks)

-ALL = range(12)
-DEMOS = ALL[9:10]
+ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
+       search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
+       streamtofile_demo, limit_by_time_demo,
+       extract_tweetids_demo, expand_tweetids_demo, corpusreader_demo]
+DEMOS = ALL[11:]
if __name__ == "__main__": if __name__ == "__main__":
"""Run selected demo functions.""" """Run selected demo functions."""
if 0 in DEMOS:
twitterclass_demo() for demo in DEMOS:
if 1 in DEMOS: demo()
sampletoscreen_demo()
if 2 in DEMOS:
tracktoscreen_demo()
if 3 in DEMOS:
search_demo()
if 4 in DEMOS:
tweets_by_user_demo()
if 5 in DEMOS:
lookup_by_userid_demo()
if 6 in DEMOS:
followtoscreen_demo()
if 7 in DEMOS:
streamtofile_demo()
if 8 in DEMOS:
limit_by_time_demo()
if 9 in DEMOS:
extract_tweetids_demo(TWEETS, IDS)
if 10 in DEMOS:
expand_tweetids_demo(IDS, HYDRATED)
if 11 in DEMOS:
corpusreader_demo()
@@ -6,11 +6,14 @@
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

-from twython.exceptions import TwythonRateLimitError
"""
-NLTK Twitter client.
+NLTK Twitter client
+
+This module offers methods for collecting and processing tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third party Twython library.

If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
@@ -31,6 +34,7 @@ from nltk.compat import UTC
try:
    from twython import Twython, TwythonStreamer
+    from twython.exceptions import TwythonRateLimitError
except ImportError as err:
    import textwrap
    MSG = """The NLTK twitterclient module requires the Twython package. See\
@@ -117,7 +121,7 @@ class Streamer(TwythonStreamer):
        try:
            if track == '' and follow == '':
                raise ValueError("Please supply a value for 'track' or 'follow'.")
-            self.statuses.filter(track=track, follow=follow)
+            self.statuses.filter(track=track, follow=follow, lang=lang)
        except requests.exceptions.ChunkedEncodingError as e:
            if e is not None:
                print("Error (stream will continue): {0}".format(e))
@@ -203,11 +207,12 @@ class Query(Twython):
        """
        results = self.search(q=keywords, count=min(100, count), lang=lang)
        count_from_query = results['search_metadata']['count']
-        self.handler.handle_chunk(results['statuses'])
+        if self.handler.handle_chunk(results['statuses']) == False:
+            return
        '''
        pagination loop: keep fetching tweets until the count requested is reached,
        dealing with twitter rate limits
        '''
        while count_from_query < count:
            max_id = results['search_metadata']['max_id']
@@ -217,9 +222,10 @@ class Query(Twython):
            except TwythonRateLimitError as e:
                print("Waiting for 15 minutes -{0}".format(e))
                time.sleep(15*60) # wait 15 minutes
                continue
            count_from_query += results['search_metadata']['count']
-            self.handler.handle_chunk(results['statuses'])
+            if self.handler.handle_chunk(results['statuses']) == False:
+                return

    def user_info_from_id(self, userids):
        """
@@ -269,11 +275,19 @@ class Twitter(object):
        :param bool stream: If ``True``, use the live public stream,\
            otherwise search past public tweets
        :param int limit: Number of tweets to process
+        :param tuple date_limit: The date at which to stop collecting new\
+            data. This should be entered as a tuple which can serve as the\
+            argument to `datetime.datetime`. E.g. `date_limit=(2015, 4, 1, 12,\
+            40)` for 12:40 pm on 1 April 2015.\
+            Note that, in the case of streaming, it is the maximum date, i.e.\
+            a date in the future; if not, it is the minimum date, i.e. a date\
+            in the past
+        :param str lang: language
        """
        if to_screen:
-            handler = TweetViewer(limit=limit, date_limit=date_limit)
+            handler = TweetViewer(limit=limit, date_limit=date_limit, stream=stream)
        else:
-            handler = TweetWriter(limit=limit, date_limit=date_limit, repeat=False)
+            handler = TweetWriter(limit=limit, date_limit=date_limit, stream=stream, repeat=False)

        if stream:
            self.streamer.register(handler)
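A usage sketch of the expanded interface (editorial illustration; the user IDs come from the demo module above and the cut-off date is the DATE constant used there):

    from nltk.twitter import Twitter

    tw = Twitter()
    # Stream live tweets from the two accounts, stopping after 100 tweets or
    # once tweets dated later than 16:40 on 20 April 2015 start arriving.
    tw.tweets(follow=['759251', '6017542'], stream=True, limit=100,
              date_limit=(2015, 4, 20, 16, 40))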
@@ -315,7 +329,7 @@ class TweetWriter(TweetHandlerI):
    """
    Handle data by writing it to a file.
    """
-    def __init__(self, limit=2000, date_limit=None, repeat=True, fprefix='tweets',
+    def __init__(self, limit=2000, date_limit=None, repeat=True, stream=True, fprefix='tweets',
                 subdir='twitter-files'):
        """
        :param limit: number of data items to process in the current round of processing
@@ -323,6 +337,8 @@ class TweetWriter(TweetHandlerI):
        :param repeat: flag to determine whether multiple files should be\
            written. If ``True``, the length of each file will be set by the value\
            of ``limit``. See also :py:func:`handle`.
+        :param stream: If ``True``, ``date_limit`` is interpreted as a maximum\
+            date (live streaming moves forward in time); otherwise as a minimum date.
        """
        self.repeat = repeat
@@ -330,6 +346,7 @@ class TweetWriter(TweetHandlerI):
        self.subdir = guess_path(subdir)
        self.fname = self.timestamped_file()
        self.startingup = True
+        self.stream = stream
        TweetHandlerI.__init__(self, limit, date_limit)
@@ -366,9 +383,14 @@ class TweetWriter(TweetHandlerI):
        if self.date_limit:
            tweet_date = datetime.datetime.strptime(data['created_at'], '%a %b %d\
%H:%M:%S +0000 %Y').replace(tzinfo=UTC)
-            if tweet_date > self.date_limit:
-                print("Date limit {0} is earlier than date of current tweet {1}".\
-                      format(self.date_limit, tweet_date))
+            if (tweet_date > self.date_limit and self.stream == True) or \
+               (tweet_date < self.date_limit and self.stream == False):
+                if self.stream:
+                    message = "earlier"
+                else:
+                    message = "later"
+                print("Date limit {0} is {1} than date of current tweet {2}".\
+                      format(self.date_limit, message, tweet_date))
                return False

        self.startingup = False
...
@@ -30,9 +30,13 @@ def extract_fields(tweet, fields):
    """
    out = []
    for field in fields:
-        _add_field_to_out(tweet, field, out)
+        try:
+            _add_field_to_out(tweet, field, out)
+        except TypeError:
+            raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
    return out

def _add_field_to_out(json, field, out):
    if isinstance(field, dict):
        for key, value in field.iteritems():
@@ -40,9 +44,28 @@ def _add_field_to_out(json, field, out):
    else:
        if isinstance(field, basestring):
            out += [json[field]]
-        else :
+        else:
            out += [json[value] for value in field]
+def _get_entity_recursive(json, entity):
+    if json == None:
+        return None
+    if isinstance(json, dict):
+        for key, value in json.iteritems():
+            if key == entity:
+                return value
+            candidate = _get_entity_recursive(value, entity)
+            if candidate != None:
+                return candidate
+        return None
+    elif isinstance(json, list):
+        for item in json:
+            candidate = _get_entity_recursive(item, entity)
+            if candidate != None:
+                return candidate
+        return None
+    else:
+        return None
def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
    """
@@ -53,8 +76,10 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
    to a CSV file for easier processing. For example, just tweetIDs or
    just the text content of the tweets can be extracted.

-    Additionally, the function allows combinations of fields of Twitter. See
-    below.
+    Additionally, the function allows combinations of fields of other Twitter
+    objects (mainly the users, see below).
+    For Twitter entities (e.g. hashtags of a tweet) see json2csv_entities.

    :param str infile: The name of the file containing full tweets
@@ -65,25 +90,107 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
    are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
-    Addionally, it allows fileds from other Twitter entities.
+    Additionally, it allows fields from other Twitter objects.
    e. g.: ['id', 'text', {'user' : ['id', 'followers_count', 'friends_count']}]
+    It is not suitable for entities such as hashtags, nor for the place of a
+    tweet; use json2csv_entities for those.

    :param error: Behaviour for encoding errors, see\
    https://docs.python.org/3/library/codecs.html#codec-base-classes
    """
    with open(infile) as inf:
-        if compat.PY3 == True:
-            outf = open(outfile, 'w', encoding=encoding)
-            writer = csv.writer(outf)
-        else:
-            outf = open(outfile, 'wb')
-            writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
+        writer = get_outf_writer_compat(outfile, encoding, errors)
        for line in inf:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            writer.writerow(row)

+def get_outf_writer_compat(outfile, encoding, errors):
+    if compat.PY3 == True:
+        outf = open(outfile, 'w', encoding=encoding, errors=errors)
+        writer = csv.writer(outf)
+    else:
+        outf = open(outfile, 'wb')
+        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
+    return writer
+def json2csv_entities(infile, outfile, main_fields, entity_name, entity_fields,
+                      encoding='utf8', errors='replace'):
+    """
+    Extract selected fields from a file of line-separated JSON tweets and
+    write to a file in CSV format.
+
+    This utility function allows a file of full tweets to be easily converted
+    to a CSV file for easier processing of Twitter entities. For example, the
+    hashtags or media elements of a tweet can be extracted.
+
+    :param str infile: The name of the file containing full tweets
+
+    :param str outfile: The name of the text file where results should be\
+    written
+
+    :param list main_fields: The list of fields to be extracted from the main\
+    object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
+    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
+    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
+    If entity_name is expressed as a dictionary, then it is a list of fields\
+    of the object that corresponds to the key of the dictionary (could be\
+    the user object, or the place of a tweet object).
+
+    :param list entity_name: The name of the entity: 'hashtags', 'media',\
+    'urls' and 'user_mentions' for the tweet object. For the user object, it\
+    needs to be expressed as a dictionary: {'user' : 'urls'}. For the\
+    bounding box of the place from which a tweet was sent, use a dict\
+    as well: {'place' : 'bounding_box'}
+
+    :param list entity_fields: The list of fields to be extracted from the\
+    entity. E.g. ['text'] (of the hashtag)
+
+    :param error: Behaviour for encoding errors, see\
+    https://docs.python.org/3/library/codecs.html#codec-base-classes
+    """
+    with open(infile) as inf:
+        writer = get_outf_writer_compat(outfile, encoding, errors)
+        for line in inf:
+            tweet = json.loads(line)
+            if isinstance(entity_name, dict):
+                for key, value in entity_name.iteritems():
+                    object_json = _get_entity_recursive(tweet, key)
+                    if object_json == None:
+                        # can happen in the case of "place"
+                        continue
+                    object_fields = extract_fields(object_json, main_fields)
+                    items = _get_entity_recursive(object_json, value)
+                    _write_to_file(object_fields, items, entity_fields, writer)
+            else:
+                tweet_fields = extract_fields(tweet, main_fields)
+                items = _get_entity_recursive(tweet, entity_name)
+                _write_to_file(tweet_fields, items, entity_fields, writer)
+def _write_to_file(object_fields, items, entity_fields, writer):
+    if items == None:
+        # it could be that the entity is just not present for the tweet
+        # e.g. tweet hashtags are always present, even as [], however
+        # tweet media may not be present
+        return
+    if isinstance(items, dict):
+        # this happens for "place" of a tweet
+        row = object_fields
+        for key, value in items.iteritems():
+            if key in entity_fields:
+                if isinstance(value, list):
+                    row += value
+                else:
+                    row += [value]
+        writer.writerow(row)
+        return
+    # in general it is a list
+    for item in items:
+        row = object_fields + extract_fields(item, entity_fields)
+        writer.writerow(row)
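A brief usage sketch for the new entity extractor (editorial illustration; the input file name is a placeholder taken from the demo module, and the 'url' entity field for user URLs is an assumption about the Twitter payload):

    from nltk.twitter.util import json2csv_entities

    infile = 'tweets.20150417.json'   # placeholder: line-delimited JSON tweets

    # Tweet ID plus the text of each hashtag, one CSV row per hashtag.
    json2csv_entities(infile, 'hashtags.csv', ['id_str'], 'hashtags', ['text'])

    # User-level entities are addressed with a dict, as described in the docstring.
    json2csv_entities(infile, 'user_urls.csv', ['id_str'], {'user': 'urls'}, ['url'])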
def credsfromfile(creds_file=None, subdir=None, verbose=False):
    """
    Read OAuth credentials from a text file.
...
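For orientation, the credential-plus-query pattern used throughout the demo module (it assumes a valid OAuth credentials file in the location expected by credsfromfile()):

    from nltk.twitter import Query, credsfromfile

    # credsfromfile() reads the OAuth keys from a credentials file and returns
    # them as a dict suitable for passing to the Twython-based clients.
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords='nltk', count=10):
        print(tweet['text'])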
@@ -7,5 +7,6 @@ scipy>=0.13.2
matplotlib>=1.3.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
+pyparsing
twython>=3.2.0