Commit ffc789a2 by Ewan Klein

Merge branch 'twitter' into sentiment

Parents: 2b9c9449 d4717752
......@@ -94,6 +94,17 @@ try:
except ImportError:
pass
# Provide fallbacks for missing subprocess attributes in environments where it cannot be used, such as GAE.
import subprocess
if not hasattr(subprocess, 'PIPE'):
def _fake_PIPE(*args, **kwargs):
raise NotImplementedError('subprocess.PIPE is not supported.')
subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, 'Popen'):
def _fake_Popen(*args, **kwargs):
raise NotImplementedError('subprocess.Popen is not supported.')
subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
......
......@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
tweets = LazyCorpusLoader(
'tweets', TwitterCorpusReader)
'twitter', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader(
'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader(
......
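With the loader above in place, the new corpus can be read lazily. A minimal sketch of what that looks like, assuming the 'twitter' corpus data has been installed into an nltk_data directory:

    from nltk.corpus import tweets

    # raw tweet texts from the sample collection
    for text in tweets.strings()[:5]:
        print(text)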
......@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
instantiate the reder as::
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
......
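For orientation, a hedged sketch of typical use once the reader is instantiated as above (the BNC data itself must be obtained separately; the slices are illustrative):

    from nltk.corpus.reader import BNCCorpusReader

    bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
    print(bnc.words()[:20])          # plain word tokens
    print(bnc.tagged_words()[:10])   # (token, POS tag) pairs from the XML annotation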
......@@ -26,6 +26,22 @@ class TwitterCorpusReader(CorpusReader):
Individual Tweets can be tokenized using the default tokenizer, or by a
custom tokenizer specified as a parameter to the constructor.
Construct a new Tweet corpus reader for a set of documents
located at the given root directory.
If you made your own tweet collection in a directory called
`twitter-files`, then you can initialise the reader as::
from nltk.corpus import TwitterCorpusReader
reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids='.*\.json')
However, the recommended approach is to use this directory as the value of the
environment variable `TWITTER`, and then invoke the reader as::
root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '.*\.json')
"""
CorpusView = StreamBackedCorpusView
......@@ -33,15 +49,10 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader.
"""
def __init__(self, root, fileids,
def __init__(self, root, fileids = None,
word_tokenizer=TweetTokenizer(),
encoding='utf8'):
"""
Construct a new Tweet corpus reader for a set of documents
located at the given root directory. Example usage:
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
:param root: The root directory for this corpus.
......
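A short sketch of reading a local collection with the updated reader, following the docstring above (the TWITTER variable and the contents of the directory are assumptions):

    import os
    from nltk.corpus import TwitterCorpusReader

    root = os.environ['TWITTER']                    # directory of line-delimited JSON files
    reader = TwitterCorpusReader(root, '.*\.json')

    print(reader.docs()[:1])        # full tweets as JSON dictionaries
    print(reader.strings()[:2])     # just the text of each tweet
    print(reader.tokenized()[:2])   # text run through the default TweetTokenizer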
......@@ -73,7 +73,7 @@ path = []
# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
if os.path.expanduser('~/') != '~/':
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
path.append(os.path.expanduser(str('~/nltk_data')))
if sys.platform.startswith('win'):
......
......@@ -924,6 +924,10 @@ class Downloader(object):
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
# Check if we are on GAE where we cannot write into filesystem.
if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a
# variety of system-wide locations.
for nltkdir in nltk.data.path:
......@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
downloader.download(download_dir=options.dir,
quiet=options.quiet, force=options.force,
halt_on_error=options.halt_on_error)
......@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if searchpath:
msg += '\n\n Searched in:'
msg += ''.join('\n - %s' % d for d in searchpath)
if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
(filename, url))
div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
......
......@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
from nltk.tag.stanford import StanfordTagger
from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
......
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford NER-tagger
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
......@@ -9,6 +9,12 @@
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
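A hedged setup sketch for the workflow described in the docstring; the paths below are assumptions and depend on where the Stanford tools were unpacked:

    import os

    # directory (or colon-separated directories) containing the downloaded model files
    os.environ['STANFORD_MODELS'] = '/usr/share/stanford-postagger/models'

    from nltk.tag import StanfordPOSTagger
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           path_to_jar='/usr/share/stanford-postagger/stanford-postagger.jar')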
......@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
_SEPARATOR = ''
_JAR = ''
def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
'instantiated directly. Did you mean POS- or NERTagger?')
'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
searchpath=(), url=_stanford_url,
verbose=verbose)
self._stanford_model = find_file(path_to_model,
self._stanford_model = find_file(model_filename,
env_vars=('STANFORD_MODELS',), verbose=verbose)
self._encoding = encoding
self.java_options = java_options
......@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence)
return tagged_sentences
class POSTagger(StanfordTagger):
class StanfordPOSTagger(StanfordTagger):
"""
A class for pos tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
......@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import POSTagger
>>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
... '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordPOSTagger
>>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
......@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
_JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
super(POSTagger, self).__init__(*args, **kwargs)
super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
'-model', self._stanford_model, '-textFile',
self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
class NERTagger(StanfordTagger):
class StanfordNERTagger(StanfordTagger):
"""
A class for ner tagging with Stanford Tagger. The input is the paths to:
A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
......@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import NERTagger
>>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
... '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordNERTagger
>>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
>>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
......@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
_FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
super(NERTagger, self).__init__(*args, **kwargs)
super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
def parse_output(self, text):
if self._FORMAT == 'slashTags':
return super(NERTagger, self).parse_output(text)
return super(StanfordNERTagger, self).parse_output(text)
raise NotImplementedError
......
......@@ -13,7 +13,6 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
TweetWriter
......@@ -72,4 +72,6 @@ class TweetHandlerI(object):
(default implementation should be enough in most cases)
"""
for item in data_chunk:
self.handle(item)
if self.handle(item) == False:
return False
return True
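The change above lets a handler cut a chunk short. A minimal, hypothetical subclass (not part of this commit) illustrating the convention:

    class PrintFirstN(TweetHandlerI):
        """Hypothetical handler: print at most n tweets, then stop the chunk."""
        def __init__(self, n=5):
            TweetHandlerI.__init__(self, limit=n, date_limit=None)
            self.n = n
            self.seen = 0

        def handle(self, data):
            print(data.get('text', ''))
            self.seen += 1
            return self.seen < self.n   # returning False makes handle_chunk() stop early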
......@@ -47,7 +47,7 @@ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800'] # UserIDs co
HYDRATED = os.path.join(TWITTER, 'rehydrated.json')
DATE = (2015, 4, 20, 16, 40)
# demo 0
@verbose
def twitterclass_demo():
"""
......@@ -62,7 +62,7 @@ def twitterclass_demo():
tw = Twitter()
tw.tweets(follow=['759251', '6017542'], stream=True, limit=10) #public stream
# demo 1
@verbose
def sampletoscreen_demo(limit=20):
"""
......@@ -73,7 +73,7 @@ def sampletoscreen_demo(limit=20):
client.register(TweetViewer(limit=limit))
client.sample()
# demo 2
@verbose
def tracktoscreen_demo(track="taylor swift", limit=10):
"""
......@@ -84,7 +84,7 @@ def tracktoscreen_demo(track="taylor swift", limit=10):
client.register(TweetViewer(limit=limit))
client.filter(track=track)
# demo 3
@verbose
def search_demo(keywords='nltk'):
"""
......@@ -95,7 +95,7 @@ def search_demo(keywords='nltk'):
for tweet in client.search_tweets(keywords=keywords, count=10):
print(tweet['text'])
# demo 4
@verbose
def tweets_by_user_demo(user='NLTK_org', count=200):
oauth = credsfromfile()
......@@ -103,7 +103,7 @@ def tweets_by_user_demo(user='NLTK_org', count=200):
client.register(TweetWriter())
client.user_tweets(user, count)
# demo 5
@verbose
def lookup_by_userid_demo():
"""
......@@ -118,7 +118,7 @@ def lookup_by_userid_demo():
following = info['friends_count']
print("{0}, followers: {1}, following: {2}".format(name, followers, following))
# demo 6
@verbose
def followtoscreen_demo(limit=10):
"""
......@@ -133,7 +133,7 @@ def followtoscreen_demo(limit=10):
client.register(TweetViewer(limit=limit))
client.statuses.filter(follow=USERIDS)
# demo 7
@verbose
def streamtofile_demo(limit=20):
"""
......@@ -144,7 +144,7 @@ def streamtofile_demo(limit=20):
client.register(TweetWriter(limit=limit, repeat=False))
client.statuses.sample()
# demo 8
@verbose
def limit_by_time_demo(limit=20, date_limit=DATE):
"""
......@@ -155,9 +155,9 @@ def limit_by_time_demo(limit=20, date_limit=DATE):
client.register(TweetWriter(limit=limit, date_limit=date_limit))
client.sample()
# demo 9
@verbose
def extract_tweetids_demo(infile, outfile):
def extract_tweetids_demo(infile = TWEETS, outfile = IDS):
"""
Given a list of full tweets in a file (``infile``), write just the
tweetIDs to a new file (`outfile`)
......@@ -166,9 +166,9 @@ def extract_tweetids_demo(infile, outfile):
json2csv(infile, outfile, FIELDS)
print("Writing ids to {0}".format(outfile))
# demo 10
@verbose
def expand_tweetids_demo(infile, outfile):
def expand_tweetids_demo(infile = IDS, outfile = HYDRATED):
"""
Given a list of tweetIDs in a file (``infile``), try to recover the full
('hydrated') tweets from the REST API and write the results to a new file (`outfile`).
......@@ -180,7 +180,7 @@ def expand_tweetids_demo(infile, outfile):
client = Query(**oauth)
client.lookup(infile, outfile)
# demo 11
@verbose
def corpusreader_demo():
"""
......@@ -191,58 +191,41 @@ def corpusreader_demo():
* the result of tokenising the raw strings.
"""
from nltk.corpus import TwitterCorpusReader
root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '1k_sample.json')
from nltk.corpus import tweets
#reader = TwitterCorpusReader(root, '1k_sample.json')
#reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print()
print("Complete tweet documents")
print(SPACER)
for tweet in reader.docs()[:2]:
for tweet in tweets.docs()[:1]:
print(json.dumps(tweet, indent=1, sort_keys=True))
print()
print("Raw tweet strings:")
print(SPACER)
for text in reader.strings()[:15]:
for text in tweets.strings()[:15]:
print(text)
print()
print("Tokenized tweet strings:")
print(SPACER)
for text in reader.tokenized()[:15]:
print(text)
for toks in tweets.tokenized()[:15]:
print(toks)
ALL = range(12)
DEMOS = ALL[9:10]
ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
streamtofile_demo, limit_by_time_demo,
extract_tweetids_demo, expand_tweetids_demo, corpusreader_demo]
DEMOS = ALL[11:]
if __name__ == "__main__":
"""Run selected demo functions."""
if 0 in DEMOS:
twitterclass_demo()
if 1 in DEMOS:
sampletoscreen_demo()
if 2 in DEMOS:
tracktoscreen_demo()
if 3 in DEMOS:
search_demo()
if 4 in DEMOS:
tweets_by_user_demo()
if 5 in DEMOS:
lookup_by_userid_demo()
if 6 in DEMOS:
followtoscreen_demo()
if 7 in DEMOS:
streamtofile_demo()
if 8 in DEMOS:
limit_by_time_demo()
if 9 in DEMOS:
extract_tweetids_demo(TWEETS, IDS)
if 10 in DEMOS:
expand_tweetids_demo(IDS, HYDRATED)
if 11 in DEMOS:
corpusreader_demo()
for demo in DEMOS:
demo()
......@@ -6,11 +6,14 @@
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from twython.exceptions import TwythonRateLimitError
"""
NLTK Twitter client.
NLTK Twitter client
This module offers methods for collecting and processing tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.
If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
......@@ -31,6 +34,7 @@ from nltk.compat import UTC
try:
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonRateLimitError
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
......@@ -117,7 +121,7 @@ class Streamer(TwythonStreamer):
try:
if track == '' and follow == '':
raise ValueError("Please supply a value for 'track' or 'follow'.")
self.statuses.filter(track=track, follow=follow)
self.statuses.filter(track=track, follow=follow, lang=lang)
except requests.exceptions.ChunkedEncodingError as e:
if e is not None:
print("Error (stream will continue): {0}".format(e))
......@@ -203,11 +207,12 @@ class Query(Twython):
"""
results = self.search(q=keywords, count=min(100, count), lang=lang)
count_from_query = results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
if self.handler.handle_chunk(results['statuses']) == False:
return
'''
pagination loop: keep fetching tweets until the count requested is reached,
dealing with twitter rate limits
dealing with twitter rate limits
'''
while count_from_query < count:
max_id = results['search_metadata']['max_id']
......@@ -217,9 +222,10 @@ class Query(Twython):
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
time.sleep(15*60) # wait 15 minutes
continue
continue
count_from_query += results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
if self.handler.handle_chunk(results['statuses']) == False:
return
def user_info_from_id(self, userids):
"""
......@@ -269,11 +275,19 @@ class Twitter(object):
:param bool stream: If ``True``, use the live public stream,\
otherwise search past public tweets
:param int limit: Number of tweets to process
:param tuple date_limit: The date at which to stop collecting new\
data. This should be entered as a tuple which can serve as the\
argument to `datetime.datetime`. E.g. `date_limit=(2015, 4, 1, 12,\
40)` for 12:40 pm on 1 April 2015.\
Note that in the case of streaming this is a maximum date, i.e. a\
date in the future; otherwise it is a minimum date, i.e. a date in\
the past
:param str lang: language (an ISO 639-1 code such as 'en')
"""
if to_screen:
handler = TweetViewer(limit=limit, date_limit=date_limit)
handler = TweetViewer(limit=limit, date_limit=date_limit, stream=stream)
else:
handler = TweetWriter(limit=limit, date_limit=date_limit, repeat=False)
handler = TweetWriter(limit=limit, date_limit=date_limit, stream=stream, repeat=False)
if stream:
self.streamer.register(handler)
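A short usage sketch of the date_limit behaviour documented above (API credentials must already be configured; the user IDs follow the demo module and the date is illustrative):

    from nltk.twitter import Twitter

    tw = Twitter()
    # live stream: stop once incoming tweets are dated later than 12:40 pm, 1 April 2015
    tw.tweets(follow=['759251', '6017542'], stream=True, limit=50,
              date_limit=(2015, 4, 1, 12, 40))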
......@@ -315,7 +329,7 @@ class TweetWriter(TweetHandlerI):
"""
Handle data by writing it to a file.
"""
def __init__(self, limit=2000, date_limit=None, repeat=True, fprefix='tweets',
def __init__(self, limit=2000, date_limit=None, repeat=True, stream=True, fprefix='tweets',
subdir='twitter-files'):
"""
:param limit: number of data items to process in the current round of processing
......@@ -323,6 +337,8 @@ class TweetWriter(TweetHandlerI):
:param repeat: flag to determine whether multiple files should be\
written. If ``True``, the length of each file will be set by the value\
of ``limit``. See also :py:func:`handle`.
:param stream: If ``True``, tweets are assumed to arrive from the live\
public stream and ``date_limit`` is an upper bound on tweet dates;\
otherwise it is a lower bound
"""
self.repeat = repeat
......@@ -330,6 +346,7 @@ class TweetWriter(TweetHandlerI):
self.subdir = guess_path(subdir)
self.fname = self.timestamped_file()
self.startingup = True
self.stream = stream
TweetHandlerI.__init__(self, limit, date_limit)
......@@ -366,9 +383,14 @@ class TweetWriter(TweetHandlerI):
if self.date_limit:
tweet_date = datetime.datetime.strptime(data['created_at'], '%a %b %d\
%H:%M:%S +0000 %Y').replace(tzinfo=UTC)
if tweet_date > self.date_limit:
print("Date limit {0} is earlier than date of current tweet {1}".\
format(self.date_limit, tweet_date))
if (tweet_date > self.date_limit and self.stream == True) or \
(tweet_date < self.date_limit and self.stream == False):
if self.stream:
message = "earlier"
else:
message = "later"
print("Date limit {0} is {1} than date of current tweet {2}".\
format(self.date_limit, message, tweet_date))
return False
self.startingup = False
......
......@@ -30,9 +30,13 @@ def extract_fields(tweet, fields):
"""
out = []
for field in fields:
_add_field_to_out(tweet, field, out)
try:
_add_field_to_out(tweet, field, out)
except TypeError:
raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
return out
def _add_field_to_out(json, field, out):
if isinstance(field, dict):
for key, value in field.iteritems():
......@@ -40,9 +44,28 @@ def _add_field_to_out(json, field, out):
else:
if isinstance(field, basestring):
out += [json[field]]
else :
else:
out += [json[value] for value in field]
def _get_entity_recursive(json, entity):
if json == None:
return None
if isinstance(json, dict):
for key, value in json.iteritems():
if key == entity:
return value
candidate = _get_entity_recursive(value, entity)
if candidate != None:
return candidate
return None
elif isinstance(json, list):
for item in json:
candidate = _get_entity_recursive(item, entity)
if candidate != None:
return candidate
return None
else:
return None
def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
"""
......@@ -53,8 +76,10 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
to a CSV file for easier processing. For example, just tweetIDs or
just the text content of the tweets can be extracted.
Additionally, the function allows combinations of fields of Twitter. See
below.
Additionally, the function allows combinations of fields of other Twitter
objects (mainly the users, see below).
For Twitter entities (e.g. the hashtags of a tweet), see json2csv_entities.
:param str infile: The name of the file containing full tweets
......@@ -65,25 +90,107 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
Addionally, it allows fileds from other Twitter entities.
Additionally, it allows fields from other Twitter objects.
e. g.: ['id', 'text', {'user' : ['id', 'followers_count', 'friends_count']}]
Not suitable for entities like hashtags; use json2csv_entities instead.
The same holds for the place of a tweet; use json2csv_entities as well.
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
"""
with open(infile) as inf:
if compat.PY3 == True:
outf = open(outfile, 'w', encoding=encoding)
writer = csv.writer(outf)
else:
outf = open(outfile, 'wb')
writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
writer = get_outf_writer_compat(outfile, encoding, errors)
for line in inf:
tweet = json.loads(line)
row = extract_fields(tweet, fields)
writer.writerow(row)
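A minimal usage sketch of json2csv with the field combinations described in the docstring (file names are illustrative):

    # one row per tweet: id and text only
    json2csv('tweets.20150417.json', 'tweets_text.csv', ['id_str', 'text'])

    # mixing in fields from the embedded user object
    json2csv('tweets.20150417.json', 'tweets_users.csv',
             ['id_str', 'text', {'user': ['id', 'followers_count', 'friends_count']}])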
def get_outf_writer_compat(outfile, encoding, errors):
if compat.PY3 == True:
outf = open(outfile, 'w', encoding=encoding, errors=errors)
writer = csv.writer(outf)
else:
outf = open(outfile, 'wb')
writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
return writer
def json2csv_entities(infile, outfile, main_fields, entity_name, entity_fields,
encoding='utf8', errors='replace'):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
This utility function allows a file of full tweets to be easily converted
to a CSV file for easier processing of Twitter entities. For example, the
hashtags or media elements of a tweet can be extracted.
:param str infile: The name of the file containing full tweets
:param str outfile: The name of the text file where results should be\
written
:param list main_fields: The list of fields to be extracted from the main\
object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
If entity_name is expressed as a dictionary, then it is the list of fields\
of the object that corresponds to the key of the dictionary (could be\
the user object, or the place of a tweet object).
:param list entity_name: The name of the entity: 'hashtags', 'media',\
'urls' and 'user_mentions' for the tweet object. For the user object,\
needs to be expressed as a dictionary: {'user' : 'urls'}. For the\
bounding box of the place from which a tweet was posted, express it as a dict\
as well: {'place' : 'bounding_box'}
:param list entity_fields: The list of fields to be extracted from the\
entity. E.g. ['text'] (of the hashtag)
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
"""
with open(infile) as inf:
writer = get_outf_writer_compat(outfile, encoding, errors)
for line in inf:
tweet = json.loads(line)
if isinstance(entity_name, dict):
for key, value in entity_name.iteritems():
object_json = _get_entity_recursive(tweet, key)
if object_json == None:
# can happen in the case of "place"
continue
object_fields = extract_fields(object_json, main_fields)
items = _get_entity_recursive(object_json, value)
_write_to_file(object_fields, items, entity_fields, writer)
else:
tweet_fields = extract_fields(tweet, main_fields)
items = _get_entity_recursive(tweet, entity_name)
_write_to_file(tweet_fields, items, entity_fields, writer)
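A minimal usage sketch of json2csv_entities, mirroring the docstring examples (file names are illustrative):

    # one row per hashtag: the tweet id plus the hashtag text
    json2csv_entities('tweets.20150417.json', 'hashtags.csv',
                      ['id_str'], 'hashtags', ['text'])

    # entities of the user object: main_fields then refer to the user object
    json2csv_entities('tweets.20150417.json', 'user_urls.csv',
                      ['id_str', 'screen_name'], {'user': 'urls'}, ['url'])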
def _write_to_file(object_fields, items, entity_fields, writer):
if items == None:
# the entity may simply not be present for this tweet:
# e.g. 'hashtags' is always present (possibly as []), but
# 'media' may be missing entirely
return
if isinstance(items, dict):
# this happens for "place" of a tweet
row = object_fields
for key, value in items.iteritems():
if key in entity_fields:
if isinstance(value, list):
row += value
else:
row += [value]
writer.writerow(row)
return
# in general it is a list
for item in items:
row = object_fields + extract_fields(item, entity_fields)
writer.writerow(row)
def credsfromfile(creds_file=None, subdir=None, verbose=False):
"""
Read OAuth credentials from a text file.
......
......@@ -7,5 +7,6 @@ scipy>=0.13.2
matplotlib>=1.3.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
pyparsing
twython>=3.2.0