Commit ffc789a2 by Ewan Klein

Merge branch 'twitter' into sentiment

Parents: 2b9c9449 d4717752
......@@ -94,6 +94,17 @@ try:
except ImportError:
pass
# Provide fallbacks for missing subprocess attributes in environments where it cannot be used, such as GAE.
import subprocess
if not hasattr(subprocess, 'PIPE'):
def _fake_PIPE(*args, **kwargs):
raise NotImplementedError('subprocess.PIPE is not supported.')
subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, 'Popen'):
def _fake_Popen(*args, **kwargs):
raise NotImplementedError('subprocess.Popen is not supported.')
subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
......
......@@ -201,7 +201,7 @@ treebank_chunk = LazyCorpusLoader(
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
tweets = LazyCorpusLoader(
'tweets', TwitterCorpusReader)
'twitter', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader(
'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader(
......
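With the loader above in place, the new corpus can be read lazily. A minimal sketch of what that looks like, assuming the 'twitter' corpus data has been installed into an nltk_data directory:

    from nltk.corpus import tweets

    # raw tweet texts from the sample collection
    for text in tweets.strings()[:5]:
        print(text)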
......@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
instantiate the reder as::
instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
......
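For orientation, a hedged sketch of typical use once the reader is instantiated as above (the BNC data itself must be obtained separately; the slices are illustrative):

    from nltk.corpus.reader import BNCCorpusReader

    bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
    print(bnc.words()[:20])          # plain word tokens
    print(bnc.tagged_words()[:10])   # (token, POS tag) pairs from the XML annotation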
......@@ -26,6 +26,22 @@ class TwitterCorpusReader(CorpusReader):
Individual Tweets can be tokenized using the default tokenizer, or by a
custom tokenizer specified as a parameter to the constructor.
Construct a new Tweet corpus reader for a set of documents
located at the given root directory.
If you made your own tweet collection in a directory called
`twitter-files`, then you can initialise the reader as::
from nltk.corpus import TwitterCorpusReader
reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids='.*\.json')
However, the recommended approach is to use this directory as the value of the
environment variable `TWITTER`, and then invoke the reader as::
root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '.*\.json')
"""
CorpusView = StreamBackedCorpusView
......@@ -33,15 +49,10 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader.
"""
def __init__(self, root, fileids,
def __init__(self, root, fileids = None,
word_tokenizer=TweetTokenizer(),
encoding='utf8'):
"""
Construct a new Tweet corpus reader for a set of documents
located at the given root directory. Example usage:
>>> root = os.environ['TWITTER']
>>> reader = TwitterCorpusReader(root, '.*\.json') # doctest: +SKIP
:param root: The root directory for this corpus.
......
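A short sketch of reading a local collection with the updated reader, following the docstring above (the TWITTER variable and the contents of the directory are assumptions):

    import os
    from nltk.corpus import TwitterCorpusReader

    root = os.environ['TWITTER']                    # directory of line-delimited JSON files
    reader = TwitterCorpusReader(root, '.*\.json')

    print(reader.docs()[:1])        # full tweets as JSON dictionaries
    print(reader.strings()[:2])     # just the text of each tweet
    print(reader.tokenized()[:2])   # text run through the default TweetTokenizer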
......@@ -73,7 +73,7 @@ path = []
# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
if os.path.expanduser('~/') != '~/':
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
path.append(os.path.expanduser(str('~/nltk_data')))
if sys.platform.startswith('win'):
......
......@@ -924,6 +924,10 @@ class Downloader(object):
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
# Check if we are on GAE where we cannot write into filesystem.
if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a
# variety of system-wide locations.
for nltkdir in nltk.data.path:
......@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
downloader.download(download_dir=options.dir,
quiet=options.quiet, force=options.force,
halt_on_error=options.halt_on_error)
......@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if searchpath:
msg += '\n\n Searched in:'
msg += ''.join('\n - %s' % d for d in searchpath)
if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
(filename, url))
div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
......
......@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
from nltk.tag.stanford import StanfordTagger
from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
......
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford NER-tagger
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
......@@ -9,6 +9,12 @@
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
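A hedged setup sketch for the workflow described in the docstring; the paths below are assumptions and depend on where the Stanford tools were unpacked:

    import os

    # directory (or colon-separated directories) containing the downloaded model files
    os.environ['STANFORD_MODELS'] = '/usr/share/stanford-postagger/models'

    from nltk.tag import StanfordPOSTagger
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           path_to_jar='/usr/share/stanford-postagger/stanford-postagger.jar')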
......@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
_SEPARATOR = ''
_JAR = ''
def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
'instantiated directly. Did you mean POS- or NERTagger?')
'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
searchpath=(), url=_stanford_url,
verbose=verbose)
self._stanford_model = find_file(path_to_model,
self._stanford_model = find_file(model_filename,
env_vars=('STANFORD_MODELS',), verbose=verbose)
self._encoding = encoding
self.java_options = java_options
......@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence)
return tagged_sentences
class POSTagger(StanfordTagger):
class StanfordPOSTagger(StanfordTagger):
"""
A class for pos tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
......@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import POSTagger
>>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
... '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordPOSTagger
>>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
......@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
_JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
super(POSTagger, self).__init__(*args, **kwargs)
super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
'-model', self._stanford_model, '-textFile',
self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
class NERTagger(StanfordTagger):
class StanfordNERTagger(StanfordTagger):
"""
A class for ner tagging with Stanford Tagger. The input is the paths to:
A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
......@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import NERTagger
>>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
... '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordNERTagger
>>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
>>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
......@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
_FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
super(NERTagger, self).__init__(*args, **kwargs)
super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
def parse_output(self, text):
if self._FORMAT == 'slashTags':
return super(NERTagger, self).parse_output(text)
return super(StanfordNERTagger, self).parse_output(text)
raise NotImplementedError
......
......@@ -13,7 +13,6 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
TweetWriter
......@@ -72,4 +72,6 @@ class TweetHandlerI(object):
(default implementation should be enough in most cases)
"""
for item in data_chunk:
self.handle(item)
if self.handle(item) == False:
return False
return True
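The change above lets a handler cut a chunk short. A minimal, hypothetical subclass (not part of this commit) illustrating the convention:

    class PrintFirstN(TweetHandlerI):
        """Hypothetical handler: print at most n tweets, then stop the chunk."""
        def __init__(self, n=5):
            TweetHandlerI.__init__(self, limit=n, date_limit=None)
            self.n = n
            self.seen = 0

        def handle(self, data):
            print(data.get('text', ''))
            self.seen += 1
            return self.seen < self.n   # returning False makes handle_chunk() stop early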
......@@ -47,7 +47,7 @@ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800'] # UserIDs co
HYDRATED = os.path.join(TWITTER, 'rehydrated.json')
DATE = (2015, 4, 20, 16, 40)
# demo 0
@verbose
def twitterclass_demo():
"""
......@@ -62,7 +62,7 @@ def twitterclass_demo():
tw = Twitter()
tw.tweets(follow=['759251', '6017542'], stream=True, limit=10) #public stream
# demo 1
@verbose
def sampletoscreen_demo(limit=20):
"""
......@@ -73,7 +73,7 @@ def sampletoscreen_demo(limit=20):
client.register(TweetViewer(limit=limit))
client.sample()
# demo 2
@verbose
def tracktoscreen_demo(track="taylor swift", limit=10):
"""
......@@ -84,7 +84,7 @@ def tracktoscreen_demo(track="taylor swift", limit=10):
client.register(TweetViewer(limit=limit))
client.filter(track=track)
# demo 3
@verbose
def search_demo(keywords='nltk'):
"""
......@@ -95,7 +95,7 @@ def search_demo(keywords='nltk'):
for tweet in client.search_tweets(keywords=keywords, count=10):
print(tweet['text'])
# demo 4
@verbose
def tweets_by_user_demo(user='NLTK_org', count=200):
oauth = credsfromfile()
......@@ -103,7 +103,7 @@ def tweets_by_user_demo(user='NLTK_org', count=200):
client.register(TweetWriter())
client.user_tweets(user, count)
# demo 5
@verbose
def lookup_by_userid_demo():
"""
......@@ -118,7 +118,7 @@ def lookup_by_userid_demo():
following = info['friends_count']
print("{0}, followers: {1}, following: {2}".format(name, followers, following))
# demo 6
@verbose
def followtoscreen_demo(limit=10):
"""
......@@ -133,7 +133,7 @@ def followtoscreen_demo(limit=10):
client.register(TweetViewer(limit=limit))
client.statuses.filter(follow=USERIDS)
# demo 7
@verbose
def streamtofile_demo(limit=20):
"""
......@@ -144,7 +144,7 @@ def streamtofile_demo(limit=20):
client.register(TweetWriter(limit=limit, repeat=False))
client.statuses.sample()
# demo 8
@verbose
def limit_by_time_demo(limit=20, date_limit=DATE):
"""
......@@ -155,9 +155,9 @@ def limit_by_time_demo(limit=20, date_limit=DATE):
client.register(TweetWriter(limit=limit, date_limit=date_limit))
client.sample()
# demo 9
@verbose
def extract_tweetids_demo(infile, outfile):
def extract_tweetids_demo(infile = TWEETS, outfile = IDS):
"""
Given a list of full tweets in a file (``infile``), write just the
tweetIDs to a new file (`outfile`)
......@@ -166,9 +166,9 @@ def extract_tweetids_demo(infile, outfile):
json2csv(infile, outfile, FIELDS)
print("Writing ids to {0}".format(outfile))
# demo 10
@verbose
def expand_tweetids_demo(infile, outfile):
def expand_tweetids_demo(infile = IDS, outfile = HYDRATED):
"""
Given a list of tweetIDs in a file (``infile``), try to recover the full
('hydrated') tweets from the REST API and write the results to a new file (`outfile`).
......@@ -180,7 +180,7 @@ def expand_tweetids_demo(infile, outfile):
client = Query(**oauth)
client.lookup(infile, outfile)
# demo 11
@verbose
def corpusreader_demo():
"""
......@@ -191,58 +191,41 @@ def corpusreader_demo():
* the result of tokenising the raw strings.
"""
from nltk.corpus import TwitterCorpusReader
root = os.environ['TWITTER']
reader = TwitterCorpusReader(root, '1k_sample.json')
from nltk.corpus import tweets
#reader = TwitterCorpusReader(root, '1k_sample.json')
#reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print()
print("Complete tweet documents")
print(SPACER)
for tweet in reader.docs()[:2]:
for tweet in tweets.docs()[:1]:
print(json.dumps(tweet, indent=1, sort_keys=True))
print()
print("Raw tweet strings:")
print(SPACER)
for text in reader.strings()[:15]:
for text in tweets.strings()[:15]:
print(text)
print()
print("Tokenized tweet strings:")
print(SPACER)
for text in reader.tokenized()[:15]:
print(text)
for toks in tweets.tokenized()[:15]:
print(toks)
ALL = range(12)
DEMOS = ALL[9:10]
ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
streamtofile_demo, limit_by_time_demo,
extract_tweetids_demo, expand_tweetids_demo, corpusreader_demo]
DEMOS = ALL[11:]
if __name__ == "__main__":
"""Run selected demo functions."""
if 0 in DEMOS:
twitterclass_demo()
if 1 in DEMOS:
sampletoscreen_demo()
if 2 in DEMOS:
tracktoscreen_demo()
if 3 in DEMOS:
search_demo()
if 4 in DEMOS:
tweets_by_user_demo()
if 5 in DEMOS:
lookup_by_userid_demo()
if 6 in DEMOS:
followtoscreen_demo()
if 7 in DEMOS:
streamtofile_demo()
if 8 in DEMOS:
limit_by_time_demo()
if 9 in DEMOS:
extract_tweetids_demo(TWEETS, IDS)
if 10 in DEMOS:
expand_tweetids_demo(IDS, HYDRATED)
if 11 in DEMOS:
corpusreader_demo()
for demo in DEMOS:
demo()
......@@ -6,11 +6,14 @@
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from twython.exceptions import TwythonRateLimitError
"""
NLTK Twitter client.
NLTK Twitter client
This module offers methods for collecting and processing tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.
If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
......@@ -31,6 +34,7 @@ from nltk.compat import UTC
try:
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonRateLimitError
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
......@@ -117,7 +121,7 @@ class Streamer(TwythonStreamer):
try:
if track == '' and follow == '':
raise ValueError("Please supply a value for 'track' or 'follow'.")
self.statuses.filter(track=track, follow=follow)
self.statuses.filter(track=track, follow=follow, lang=lang)
except requests.exceptions.ChunkedEncodingError as e:
if e is not None:
print("Error (stream will continue): {0}".format(e))
......@@ -203,11 +207,12 @@ class Query(Twython):
"""
results = self.search(q=keywords, count=min(100, count), lang=lang)
count_from_query = results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
if self.handler.handle_chunk(results['statuses']) == False:
return
'''
pagination loop: keep fetching tweets until the count requested is reached,
dealing with twitter rate limits
dealing with twitter rate limits
'''
while count_from_query < count:
max_id = results['search_metadata']['max_id']
......@@ -217,9 +222,10 @@ class Query(Twython):
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
time.sleep(15*60) # wait 15 minutes
continue
continue
count_from_query += results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
if self.handler.handle_chunk(results['statuses']) == False:
return
def user_info_from_id(self, userids):
"""
......@@ -269,11 +275,19 @@ class Twitter(object):
:param bool stream: If ``True``, use the live public stream,\
otherwise search past public tweets
:param int limit: Number of tweets to process
:param tuple date_limit: The date at which to stop collecting new\
data. This should be entered as a tuple which can serve as the\
argument to `datetime.datetime`. E.g. `date_limit=(2015, 4, 1, 12,\
40)` for 12:40 pm on 1 April 2015.\
Note that in the case of streaming this is a maximum date, i.e. a\
date in the future; otherwise it is a minimum date, i.e. a date in\
the past
:param str lang: language (an ISO 639-1 code such as 'en')
"""
if to_screen:
handler = TweetViewer(limit=limit, date_limit=date_limit)
handler = TweetViewer(limit=limit, date_limit=date_limit, stream=stream)
else:
handler = TweetWriter(limit=limit, date_limit=date_limit, repeat=False)
handler = TweetWriter(limit=limit, date_limit=date_limit, stream=stream, repeat=False)
if stream:
self.streamer.register(handler)
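A short usage sketch of the date_limit behaviour documented above (API credentials must already be configured; the user IDs follow the demo module and the date is illustrative):

    from nltk.twitter import Twitter

    tw = Twitter()
    # live stream: stop once incoming tweets are dated later than 12:40 pm, 1 April 2015
    tw.tweets(follow=['759251', '6017542'], stream=True, limit=50,
              date_limit=(2015, 4, 1, 12, 40))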
......@@ -315,7 +329,7 @@ class TweetWriter(TweetHandlerI):
"""
Handle data by writing it to a file.
"""
def __init__(self, limit=2000, date_limit=None, repeat=True, fprefix='tweets',
def __init__(self, limit=2000, date_limit=None, repeat=True, stream=True, fprefix='tweets',
subdir='twitter-files'):
"""
:param limit: number of data items to process in the current round of processing
......@@ -323,6 +337,8 @@ class TweetWriter(TweetHandlerI):
:param repeat: flag to determine whether multiple files should be\
written. If ``True``, the length of each file will be set by the value\
of ``limit``. See also :py:func:`handle`.
:param stream: If ``True``, tweets are assumed to arrive from the live\
public stream and ``date_limit`` is an upper bound on tweet dates;\
otherwise it is a lower bound
"""
self.repeat = repeat
......@@ -330,6 +346,7 @@ class TweetWriter(TweetHandlerI):
self.subdir = guess_path(subdir)
self.fname = self.timestamped_file()
self.startingup = True
self.stream = stream
TweetHandlerI.__init__(self, limit, date_limit)
......@@ -366,9 +383,14 @@ class TweetWriter(TweetHandlerI):
if self.date_limit:
tweet_date = datetime.datetime.strptime(data['created_at'], '%a %b %d\
%H:%M:%S +0000 %Y').replace(tzinfo=UTC)
if tweet_date > self.date_limit:
print("Date limit {0} is earlier than date of current tweet {1}".\
format(self.date_limit, tweet_date))
if (tweet_date > self.date_limit and self.stream == True) or \
(tweet_date < self.date_limit and self.stream == False):
if self.stream:
message = "earlier"
else:
message = "later"
print("Date limit {0} is {1} than date of current tweet {2}".\
format(self.date_limit, message, tweet_date))
return False
self.startingup = False
......
......@@ -30,9 +30,13 @@ def extract_fields(tweet, fields):
"""
out = []
for field in fields:
_add_field_to_out(tweet, field, out)
try:
_add_field_to_out(tweet, field, out)
except TypeError:
raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
return out
def _add_field_to_out(json, field, out):
if isinstance(field, dict):
for key, value in field.iteritems():
......@@ -40,9 +44,28 @@ def _add_field_to_out(json, field, out):
else:
if isinstance(field, basestring):
out += [json[field]]
else :
else:
out += [json[value] for value in field]
def _get_entity_recursive(json, entity):
if json == None:
return None
if isinstance(json, dict):
for key, value in json.iteritems():
if key == entity:
return value
candidate = _get_entity_recursive(value, entity)
if candidate != None:
return candidate
return None
elif isinstance(json, list):
for item in json:
candidate = _get_entity_recursive(item, entity)
if candidate != None:
return candidate
return None
else:
return None
def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
"""
......@@ -53,8 +76,10 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
to a CSV file for easier processing. For example, just tweetIDs or
just the text content of the tweets can be extracted.
Additionally, the function allows combinations of fields of Twitter. See
below.
Additionally, the function allows combinations of fields of other Twitter
objects (mainly the users, see below).
For Twitter entities (e.g. the hashtags of a tweet), see json2csv_entities.
:param str infile: The name of the file containing full tweets
......@@ -65,25 +90,107 @@ def json2csv(infile, outfile, fields, encoding='utf8', errors='replace'):
are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
Addionally, it allows fileds from other Twitter entities.
Additionally, it allows fields from other Twitter objects.
e. g.: ['id', 'text', {'user' : ['id', 'followers_count', 'friends_count']}]
Not suitable for entities like hashtags; use json2csv_entities instead.
The same holds for the place of a tweet; use json2csv_entities as well.
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
"""
with open(infile) as inf:
if compat.PY3 == True:
outf = open(outfile, 'w', encoding=encoding)
writer = csv.writer(outf)
else:
outf = open(outfile, 'wb')
writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
writer = get_outf_writer_compat(outfile, encoding, errors)
for line in inf:
tweet = json.loads(line)
row = extract_fields(tweet, fields)
writer.writerow(row)
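A minimal usage sketch of json2csv with the field combinations described in the docstring (file names are illustrative):

    # one row per tweet: id and text only
    json2csv('tweets.20150417.json', 'tweets_text.csv', ['id_str', 'text'])

    # mixing in fields from the embedded user object
    json2csv('tweets.20150417.json', 'tweets_users.csv',
             ['id_str', 'text', {'user': ['id', 'followers_count', 'friends_count']}])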
def get_outf_writer_compat(outfile, encoding, errors):
if compat.PY3 == True:
outf = open(outfile, 'w', encoding=encoding, errors=errors)
writer = csv.writer(outf)
else:
outf = open(outfile, 'wb')
writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
return writer
def json2csv_entities(infile, outfile, main_fields, entity_name, entity_fields,
encoding='utf8', errors='replace'):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
This utility function allows a file of full tweets to be easily converted
to a CSV file for easier processing of Twitter entities. For example, the
hashtags or media elements of a tweet can be extracted.
:param str infile: The name of the file containing full tweets
:param str outfile: The name of the text file where results should be\
written
:param list main_fields: The list of fields to be extracted from the main\
object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
If entity_name is expressed as a dictionary, then it is the list of fields\
of the object that corresponds to the key of the dictionary (could be\
the user object, or the place of a tweet object).
:param list entity_name: The name of the entity: 'hashtags', 'media',\
'urls' and 'user_mentions' for the tweet object. For the user object,\
needs to be expressed as a dictionary: {'user' : 'urls'}. For the\
bounding box of the place from which a tweet was posted, express it as a dict\
as well: {'place' : 'bounding_box'}
:param list entity_fields: The list of fields to be extracted from the\
entity. E.g. ['text'] (of the hashtag)
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
"""
with open(infile) as inf:
writer = get_outf_writer_compat(outfile, encoding, errors)
for line in inf:
tweet = json.loads(line)
if isinstance(entity_name, dict):
for key, value in entity_name.iteritems():
object_json = _get_entity_recursive(tweet, key)
if object_json == None:
# can happen in the case of "place"
continue
object_fields = extract_fields(object_json, main_fields)
items = _get_entity_recursive(object_json, value)
_write_to_file(object_fields, items, entity_fields, writer)
else:
tweet_fields = extract_fields(tweet, main_fields)
items = _get_entity_recursive(tweet, entity_name)
_write_to_file(tweet_fields, items, entity_fields, writer)
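A minimal usage sketch of json2csv_entities, mirroring the docstring examples (file names are illustrative):

    # one row per hashtag: the tweet id plus the hashtag text
    json2csv_entities('tweets.20150417.json', 'hashtags.csv',
                      ['id_str'], 'hashtags', ['text'])

    # entities of the user object: main_fields then refer to the user object
    json2csv_entities('tweets.20150417.json', 'user_urls.csv',
                      ['id_str', 'screen_name'], {'user': 'urls'}, ['url'])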
def _write_to_file(object_fields, items, entity_fields, writer):
if items == None:
# the entity may simply not be present for this tweet:
# e.g. 'hashtags' is always present (possibly as []), but
# 'media' may be missing entirely
return
if isinstance(items, dict):
# this happens for "place" of a tweet
row = object_fields
for key, value in items.iteritems():
if key in entity_fields:
if isinstance(value, list):
row += value
else:
row += [value]
writer.writerow(row)
return
# in general it is a list
for item in items:
row = object_fields + extract_fields(item, entity_fields)
writer.writerow(row)
def credsfromfile(creds_file=None, subdir=None, verbose=False):
"""
Read OAuth credentials from a text file.
......
......@@ -7,5 +7,6 @@ scipy>=0.13.2
matplotlib>=1.3.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
pyparsing
twython>=3.2.0