Commit e08cc758 by lrnzcig

Merge remote-tracking branch 'nltk/twitter' into twitter

parents e3d7ff88 10a99af5
@@ -94,6 +94,17 @@ try:
 except ImportError:
     pass
 
+# Override missing methods in environments where subprocess cannot be used, like GAE.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+    def _fake_PIPE(*args, **kwargs):
+        raise NotImplementedError('subprocess.PIPE is not supported.')
+    subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+    def _fake_Popen(*args, **kwargs):
+        raise NotImplementedError('subprocess.Popen is not supported.')
+    subprocess.Popen = _fake_Popen
+
 ###########################################################
 # TOP-LEVEL MODULES
 ###########################################################
...
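The point of the stubs in the hunk above is that downstream NLTK code which shells out fails fast with a descriptive `NotImplementedError` rather than an `AttributeError`. A minimal sketch of that behaviour (illustrative only; it simulates a GAE-like environment rather than running on one):

```python
import subprocess

# Pretend we are in a sandbox where these attributes were stripped.
for name in ('PIPE', 'Popen'):
    if hasattr(subprocess, name):
        delattr(subprocess, name)

# Apply the same stubbing strategy as the patch above.
if not hasattr(subprocess, 'Popen'):
    def _fake_Popen(*args, **kwargs):
        raise NotImplementedError('subprocess.Popen is not supported.')
    subprocess.Popen = _fake_Popen

try:
    subprocess.Popen(['ls'])          # any caller now fails fast...
except NotImplementedError as e:
    print(e)                          # ...with a clear message
```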
@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
     The corpus view class used by this reader.
     """
-    def __init__(self, root,
+    def __init__(self, root, fileids=None,
                  word_tokenizer=TweetTokenizer(),
                  encoding='utf8'):
         """
...
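The new `fileids` argument lets a caller restrict the reader to particular files at construction time, as other corpus readers do. A hypothetical call, reusing the root and file name that the demo further down works with:

```python
from nltk.corpus import TwitterCorpusReader

# 'twitter' and the JSON file name are placeholders taken from the demo below.
reader = TwitterCorpusReader('twitter', fileids='tweets.20150417.json')
print(reader.fileids())
```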
@@ -73,7 +73,7 @@ path = []
 # User-specified locations:
 path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
-if os.path.expanduser('~/') != '~/':
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
     path.append(os.path.expanduser(str('~/nltk_data')))
 if sys.platform.startswith('win'):
...
@@ -924,6 +924,10 @@ class Downloader(object):
         permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
         ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
         """
+        # Check if we are on GAE, where we cannot write to the filesystem.
+        if 'APPENGINE_RUNTIME' in os.environ:
+            return
+
         # Check if we have sufficient permissions to install in a
         # variety of system-wide locations.
         for nltkdir in nltk.data.path:
@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
     downloader.download(download_dir=options.dir,
                         quiet=options.quiet, force=options.force,
                         halt_on_error=options.halt_on_error)
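Both GAE-related hunks above rely on the same signal: the App Engine runtime sets the `APPENGINE_RUNTIME` environment variable, so its presence is a cheap test for the sandbox. A sketch of the pattern (the helper name is ours, not NLTK's):

```python
import os

def running_on_gae():
    """True when the App Engine runtime has set its marker variable."""
    return 'APPENGINE_RUNTIME' in os.environ

if running_on_gae():
    # Skip anything that assumes a writable home directory or filesystem.
    pass
```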
@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
     if searchpath:
         msg += '\n\n Searched in:'
         msg += ''.join('\n - %s' % d for d in searchpath)
-    if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
+    if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
                     (filename, url))
     div = '='*75
     raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
...
@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
 from nltk.tag.brill_trainer import BrillTaggerTrainer
 from nltk.tag.tnt import TnT
 from nltk.tag.hunpos import HunposTagger
-from nltk.tag.stanford import StanfordTagger
+from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
 from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
 from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
 from nltk.tag.mapping import tagset_mapping, map_tag
...
 # -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford NER-tagger
+# Natural Language Toolkit: Interface to the Stanford Part-of-Speech and Named-Entity Taggers
 #
 # Copyright (C) 2001-2015 NLTK Project
 # Author: Nitin Madnani <nmadnani@ets.org>
@@ -9,6 +9,12 @@
 """
 A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from http://nlp.stanford.edu/software
+and the STANFORD_MODELS environment variable set (a colon-separated
+list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
 """
 
 import os
@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
     _SEPARATOR = ''
     _JAR = ''
 
-    def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
 
         if not self._JAR:
             warnings.warn('The StanfordTagger class is not meant to be '
-                          'instantiated directly. Did you mean POS- or NERTagger?')
+                          'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
         self._stanford_jar = find_jar(
                 self._JAR, path_to_jar,
                 searchpath=(), url=_stanford_url,
                 verbose=verbose)
 
-        self._stanford_model = find_file(path_to_model,
+        self._stanford_model = find_file(model_filename,
                 env_vars=('STANFORD_MODELS',), verbose=verbose)
         self._encoding = encoding
         self.java_options = java_options
@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
             tagged_sentences.append(sentence)
         return tagged_sentences
 
-class POSTagger(StanfordTagger):
+class StanfordPOSTagger(StanfordTagger):
     """
     A class for pos tagging with Stanford Tagger. The input is the paths to:
      - a model trained on training data
@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
     Example:
 
-        >>> from nltk.tag.stanford import POSTagger
-        >>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
-        ...                '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordPOSTagger
+        >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
         >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
     """
@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
     _JAR = 'stanford-postagger.jar'
 
     def __init__(self, *args, **kwargs):
-        super(POSTagger, self).__init__(*args, **kwargs)
+        super(StanfordPOSTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
                 '-model', self._stanford_model, '-textFile',
                 self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
 
-class NERTagger(StanfordTagger):
+class StanfordNERTagger(StanfordTagger):
     """
-    A class for ner tagging with Stanford Tagger. The input is the paths to:
+    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
      - a model trained on training data
      - (optionally) the path to the stanford tagger jar file. If not specified here,
@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
     Example:
 
-        >>> from nltk.tag.stanford import NERTagger
-        >>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
-        ...                '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordNERTagger
+        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
         >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
         [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
         ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
     _FORMAT = 'slashTags'
 
     def __init__(self, *args, **kwargs):
-        super(NERTagger, self).__init__(*args, **kwargs)
+        super(StanfordNERTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
     def parse_output(self, text):
         if self._FORMAT == 'slashTags':
-            return super(NERTagger, self).parse_output(text)
+            return super(StanfordNERTagger, self).parse_output(text)
         raise NotImplementedError
...
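Putting the renamed classes and the STANFORD_MODELS convention together, a typical setup looks roughly like this. The install paths are placeholders, and the jar can still be passed explicitly via `path_to_jar` when it is not on the default search path:

```python
import os
from nltk.tag import StanfordPOSTagger

# Placeholder locations; point these at your Stanford POS tagger install.
os.environ['STANFORD_MODELS'] = '/usr/share/stanford-postagger/models'

st = StanfordPOSTagger(
    'english-bidirectional-distsim.tagger',  # resolved via STANFORD_MODELS
    path_to_jar='/usr/share/stanford-postagger/stanford-postagger.jar')
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))
```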
@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
 Twitter API.
 """
 
-try:
-    from twython import Twython, TwythonStreamer
-except ImportError as err:
-    import textwrap
-    MSG = """The NLTK twitterclient module requires the Twython package. See\
- https://twython.readthedocs.org/ for installation instructions."""
-    err.msg = textwrap.fill(MSG)
-    raise
-
 from nltk.twitter.util import credsfromfile
 from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
     TweetWriter
...
@@ -191,29 +191,37 @@ def corpusreader_demo():
     * the result of tokenising the raw strings.
     """
-    from nltk.corpus import TwitterCorpusReader
+    #from nltk.corpus import TwitterCorpusReader
+    from nltk.corpus import tweets
+    tweets.fileids()
+
     #root = os.environ['TWITTER']
     #reader = TwitterCorpusReader(root, '1k_sample.json')
-    reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
+    #reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
 
     print()
     print("Complete tweet documents")
     print(SPACER)
-    for tweet in reader.docs()[:2]:
+    for tweet in tweets.docs()[:2]:
         print(json.dumps(tweet, indent=1, sort_keys=True))
 
     print()
     print("Raw tweet strings:")
     print(SPACER)
-    for text in reader.strings()[:15]:
+    for text in tweets.strings()[:15]:
         print(text)
 
     print()
     print("Tokenized tweet strings:")
     print(SPACER)
-    for text in reader.tokenized()[:15]:
+    for text in tweets.tokenized()[:15]:
         print(text)
 
+#def corpusreader_demo():
+    #from nltk.corpus import brown
+    #brown.words()
+
 ALL = range(12)
 DEMOS = ALL[11:]
...
@@ -6,11 +6,14 @@
 # Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-from twython.exceptions import TwythonRateLimitError
 
 """
-NLTK Twitter client.
+NLTK Twitter client
+
+This module offers methods for collecting and processing tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third party Twython library.
 
 If one of the methods below returns an integer, it is probably a `Twitter
 error code <https://dev.twitter.com/overview/api/response-codes>`_. For
@@ -31,6 +34,7 @@ from nltk.compat import UTC
 
 try:
     from twython import Twython, TwythonStreamer
+    from twython.exceptions import TwythonRateLimitError
 except ImportError as err:
     import textwrap
     MSG = """The NLTK twitterclient module requires the Twython package. See\
@@ -204,10 +208,10 @@ class Query(Twython):
         results = self.search(q=keywords, count=min(100, count), lang=lang)
         count_from_query = results['search_metadata']['count']
         self.handler.handle_chunk(results['statuses'])
 
         '''
         pagination loop: keep fetching tweets until the count requested is reached,
         dealing with twitter rate limits
         '''
         while count_from_query < count:
             max_id = results['search_metadata']['max_id']
@@ -217,7 +221,7 @@ class Query(Twython):
             except TwythonRateLimitError as e:
                 print("Waiting for 15 minutes -{0}".format(e))
                 time.sleep(15*60) # wait 15 minutes
                 continue
 
             count_from_query += results['search_metadata']['count']
             self.handler.handle_chunk(results['statuses'])
...
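The 15-minute sleep in the pagination loop matches Twitter's rate-limit windows, which reset every 15 minutes. The same retry shape, pulled out as a standalone sketch (the client is a plain Twython instance; the function name is ours, not part of the commit):

```python
import time
from twython import Twython
from twython.exceptions import TwythonRateLimitError

def search_page_with_backoff(client, keywords, max_id=None, lang='en'):
    """Fetch one page of search results, sleeping out any rate-limit window."""
    while True:
        try:
            return client.search(q=keywords, count=100, lang=lang, max_id=max_id)
        except TwythonRateLimitError as e:
            print("Waiting for 15 minutes -{0}".format(e))
            time.sleep(15 * 60)  # rate-limit windows reset every 15 minutes
```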
@@ -7,5 +7,6 @@ scipy>=0.13.2
 matplotlib>=1.3.1
 scikit-learn>=0.14.1
 python-crfsuite>=0.8.2
+pyparsing
 twython>=3.2.0