Commit e08cc758 by lrnzcig

Merge remote-tracking branch 'nltk/twitter' into twitter

parents e3d7ff88 10a99af5
@@ -94,6 +94,17 @@ try:
 except ImportError:
     pass
 
+# Override missing methods in environments where subprocess cannot be used, like GAE.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+    def _fake_PIPE(*args, **kwargs):
+        raise NotImplementedError('subprocess.PIPE is not supported.')
+    subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+    def _fake_Popen(*args, **kwargs):
+        raise NotImplementedError('subprocess.Popen is not supported.')
+    subprocess.Popen = _fake_Popen
+
 ###########################################################
 # TOP-LEVEL MODULES
 ###########################################################
...
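The point of the stubs in the hunk above is that downstream NLTK code which shells out fails fast with a descriptive `NotImplementedError` rather than an `AttributeError`. A minimal sketch of that behaviour (illustrative only; it simulates a GAE-like environment rather than running on one):

```python
import subprocess

# Pretend we are in a sandbox where these attributes were stripped.
for name in ('PIPE', 'Popen'):
    if hasattr(subprocess, name):
        delattr(subprocess, name)

# Apply the same stubbing strategy as the patch above.
if not hasattr(subprocess, 'Popen'):
    def _fake_Popen(*args, **kwargs):
        raise NotImplementedError('subprocess.Popen is not supported.')
    subprocess.Popen = _fake_Popen

try:
    subprocess.Popen(['ls'])          # any caller now fails fast...
except NotImplementedError as e:
    print(e)                          # ...with a clear message
```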
@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
     The corpus view class used by this reader.
     """
-    def __init__(self, root,
+    def __init__(self, root, fileids=None,
                  word_tokenizer=TweetTokenizer(),
                  encoding='utf8'):
         """
...
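The new `fileids` argument lets a caller restrict the reader to particular files at construction time, as other corpus readers do. A hypothetical call, reusing the root and file name that the demo further down works with:

```python
from nltk.corpus import TwitterCorpusReader

# 'twitter' and the JSON file name are placeholders taken from the demo below.
reader = TwitterCorpusReader('twitter', fileids='tweets.20150417.json')
print(reader.fileids())
```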
@@ -73,7 +73,7 @@ path = []
 # User-specified locations:
 path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
-if os.path.expanduser('~/') != '~/':
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
     path.append(os.path.expanduser(str('~/nltk_data')))
 if sys.platform.startswith('win'):
...
@@ -924,6 +924,10 @@ class Downloader(object):
         permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
         ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
         """
+        # Check if we are on GAE, where we cannot write to the filesystem.
+        if 'APPENGINE_RUNTIME' in os.environ:
+            return
+
         # Check if we have sufficient permissions to install in a
         # variety of system-wide locations.
         for nltkdir in nltk.data.path:
@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
     downloader.download(download_dir=options.dir,
                         quiet=options.quiet, force=options.force,
                         halt_on_error=options.halt_on_error)
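Both GAE-related hunks above rely on the same signal: the App Engine runtime sets the `APPENGINE_RUNTIME` environment variable, so its presence is a cheap test for the sandbox. A sketch of the pattern (the helper name is ours, not NLTK's):

```python
import os

def running_on_gae():
    """True when the App Engine runtime has set its marker variable."""
    return 'APPENGINE_RUNTIME' in os.environ

if running_on_gae():
    # Skip anything that assumes a writable home directory or filesystem.
    pass
```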
@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
     if searchpath:
         msg += '\n\n Searched in:'
         msg += ''.join('\n - %s' % d for d in searchpath)
-    if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
+    if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
                     (filename, url))
     div = '='*75
     raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
...
@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
 from nltk.tag.brill_trainer import BrillTaggerTrainer
 from nltk.tag.tnt import TnT
 from nltk.tag.hunpos import HunposTagger
-from nltk.tag.stanford import StanfordTagger
+from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
 from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
 from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
 from nltk.tag.mapping import tagset_mapping, map_tag
...
 # -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford NER-tagger
+# Natural Language Toolkit: Interface to the Stanford Part-of-Speech and Named-Entity Taggers
 #
 # Copyright (C) 2001-2015 NLTK Project
 # Author: Nitin Madnani <nmadnani@ets.org>
@@ -9,6 +9,12 @@
 """
 A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from http://nlp.stanford.edu/software
+and the STANFORD_MODELS environment variable set (a colon-separated
+list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
 """
 
 import os
@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
     _SEPARATOR = ''
     _JAR = ''
 
-    def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
 
         if not self._JAR:
             warnings.warn('The StanfordTagger class is not meant to be '
-                          'instantiated directly. Did you mean POS- or NERTagger?')
+                          'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
         self._stanford_jar = find_jar(
                 self._JAR, path_to_jar,
                 searchpath=(), url=_stanford_url,
                 verbose=verbose)
 
-        self._stanford_model = find_file(path_to_model,
+        self._stanford_model = find_file(model_filename,
                 env_vars=('STANFORD_MODELS',), verbose=verbose)
         self._encoding = encoding
         self.java_options = java_options
@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
             tagged_sentences.append(sentence)
         return tagged_sentences
 
-class POSTagger(StanfordTagger):
+class StanfordPOSTagger(StanfordTagger):
     """
     A class for pos tagging with Stanford Tagger. The input is the paths to:
      - a model trained on training data
@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
     Example:
 
-        >>> from nltk.tag.stanford import POSTagger
-        >>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
-        ...                '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordPOSTagger
+        >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
         >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
     """
@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
     _JAR = 'stanford-postagger.jar'
 
     def __init__(self, *args, **kwargs):
-        super(POSTagger, self).__init__(*args, **kwargs)
+        super(StanfordPOSTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
                 '-model', self._stanford_model, '-textFile',
                 self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
 
-class NERTagger(StanfordTagger):
+class StanfordNERTagger(StanfordTagger):
     """
-    A class for ner tagging with Stanford Tagger. The input is the paths to:
+    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
      - a model trained on training data
      - (optionally) the path to the stanford tagger jar file. If not specified here,
@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
     Example:
 
-        >>> from nltk.tag.stanford import NERTagger
-        >>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
-        ...                '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordNERTagger
+        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
         >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
         [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
         ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
     _FORMAT = 'slashTags'
 
     def __init__(self, *args, **kwargs):
-        super(NERTagger, self).__init__(*args, **kwargs)
+        super(StanfordNERTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
     def parse_output(self, text):
         if self._FORMAT == 'slashTags':
-            return super(NERTagger, self).parse_output(text)
+            return super(StanfordNERTagger, self).parse_output(text)
         raise NotImplementedError
...
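Putting the renamed classes and the STANFORD_MODELS convention together, a typical setup looks roughly like this. The install paths are placeholders, and the jar can still be passed explicitly via `path_to_jar` when it is not on the default search path:

```python
import os
from nltk.tag import StanfordPOSTagger

# Placeholder locations; point these at your Stanford POS tagger install.
os.environ['STANFORD_MODELS'] = '/usr/share/stanford-postagger/models'

st = StanfordPOSTagger(
    'english-bidirectional-distsim.tagger',  # resolved via STANFORD_MODELS
    path_to_jar='/usr/share/stanford-postagger/stanford-postagger.jar')
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))
```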
@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
 Twitter API.
 """
 
-try:
-    from twython import Twython, TwythonStreamer
-except ImportError as err:
-    import textwrap
-    MSG = """The NLTK twitterclient module requires the Twython package. See\
- https://twython.readthedocs.org/ for installation instructions."""
-    err.msg = textwrap.fill(MSG)
-    raise
-
 from nltk.twitter.util import credsfromfile
 from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
     TweetWriter
...
@@ -191,29 +191,37 @@ def corpusreader_demo():
     * the result of tokenising the raw strings.
     """
-    from nltk.corpus import TwitterCorpusReader
+    #from nltk.corpus import TwitterCorpusReader
+    from nltk.corpus import tweets
+    tweets.fileids()
+
     #root = os.environ['TWITTER']
     #reader = TwitterCorpusReader(root, '1k_sample.json')
-    reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
+    #reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
 
     print()
     print("Complete tweet documents")
     print(SPACER)
-    for tweet in reader.docs()[:2]:
+    for tweet in tweets.docs()[:2]:
         print(json.dumps(tweet, indent=1, sort_keys=True))
 
     print()
     print("Raw tweet strings:")
     print(SPACER)
-    for text in reader.strings()[:15]:
+    for text in tweets.strings()[:15]:
         print(text)
 
     print()
     print("Tokenized tweet strings:")
     print(SPACER)
-    for text in reader.tokenized()[:15]:
+    for text in tweets.tokenized()[:15]:
         print(text)
 
+#def corpusreader_demo():
+    #from nltk.corpus import brown
+    #brown.words()
+
 ALL = range(12)
 DEMOS = ALL[11:]
...
@@ -6,11 +6,14 @@
 # Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-from twython.exceptions import TwythonRateLimitError
 
 """
-NLTK Twitter client.
+NLTK Twitter client
+
+This module offers methods for collecting and processing tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third party Twython library.
 
 If one of the methods below returns an integer, it is probably a `Twitter
 error code <https://dev.twitter.com/overview/api/response-codes>`_. For
@@ -31,6 +34,7 @@ from nltk.compat import UTC
 
 try:
     from twython import Twython, TwythonStreamer
+    from twython.exceptions import TwythonRateLimitError
 except ImportError as err:
     import textwrap
     MSG = """The NLTK twitterclient module requires the Twython package. See\
@@ -204,10 +208,10 @@ class Query(Twython):
         results = self.search(q=keywords, count=min(100, count), lang=lang)
         count_from_query = results['search_metadata']['count']
         self.handler.handle_chunk(results['statuses'])
 
         '''
         pagination loop: keep fetching tweets until the count requested is reached,
         dealing with twitter rate limits
         '''
         while count_from_query < count:
             max_id = results['search_metadata']['max_id']
@@ -217,7 +221,7 @@ class Query(Twython):
             except TwythonRateLimitError as e:
                 print("Waiting for 15 minutes -{0}".format(e))
                 time.sleep(15*60) # wait 15 minutes
                 continue
 
             count_from_query += results['search_metadata']['count']
             self.handler.handle_chunk(results['statuses'])
...
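The 15-minute sleep in the pagination loop matches Twitter's rate-limit windows, which reset every 15 minutes. The same retry shape, pulled out as a standalone sketch (the client is a plain Twython instance; the function name is ours, not part of the commit):

```python
import time
from twython import Twython
from twython.exceptions import TwythonRateLimitError

def search_page_with_backoff(client, keywords, max_id=None, lang='en'):
    """Fetch one page of search results, sleeping out any rate-limit window."""
    while True:
        try:
            return client.search(q=keywords, count=100, lang=lang, max_id=max_id)
        except TwythonRateLimitError as e:
            print("Waiting for 15 minutes -{0}".format(e))
            time.sleep(15 * 60)  # rate-limit windows reset every 15 minutes
```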
@@ -7,5 +7,6 @@ scipy>=0.13.2
 matplotlib>=1.3.1
 scikit-learn>=0.14.1
 python-crfsuite>=0.8.2
+pyparsing
 twython>=3.2.0