Commit e08cc758 by lrnzcig

Merge remote-tracking branch 'nltk/twitter' into twitter

parents e3d7ff88 10a99af5
......@@ -94,6 +94,17 @@ try:
except ImportError:
pass
# Sandboxed platforms such as Google App Engine ship a ``subprocess`` module
# with pieces stripped out.  Install stand-ins for the missing attributes so
# that mere attribute access keeps working, while any actual use fails
# loudly with NotImplementedError.
import subprocess

def _fake_PIPE(*args, **kwargs):
    # Stand-in for the missing ``subprocess.PIPE`` on restricted platforms.
    raise NotImplementedError('subprocess.PIPE is not supported.')

def _fake_Popen(*args, **kwargs):
    # Stand-in for the missing ``subprocess.Popen`` on restricted platforms.
    raise NotImplementedError('subprocess.Popen is not supported.')

if not hasattr(subprocess, 'PIPE'):
    subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, 'Popen'):
    subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
......
......@@ -48,7 +48,7 @@ class TwitterCorpusReader(CorpusReader):
The corpus view class used by this reader.
"""
def __init__(self, root,
def __init__(self, root, fileids = None,
word_tokenizer=TweetTokenizer(),
encoding='utf8'):
"""
......
......@@ -73,7 +73,7 @@ path = []
# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
if os.path.expanduser('~/') != '~/':
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
path.append(os.path.expanduser(str('~/nltk_data')))
if sys.platform.startswith('win'):
......
......@@ -924,6 +924,10 @@ class Downloader(object):
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
# Check if we are on GAE where we cannot write into filesystem.
if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a
# variety of system-wide locations.
for nltkdir in nltk.data.path:
......@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
downloader.download(download_dir=options.dir,
quiet=options.quiet, force=options.force,
halt_on_error=options.halt_on_error)
......@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if searchpath:
msg += '\n\n Searched in:'
msg += ''.join('\n - %s' % d for d in searchpath)
if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
(filename, url))
div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
......
......@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
from nltk.tag.stanford import StanfordTagger
from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
......
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford NER-tagger
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
......@@ -9,6 +9,12 @@
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
......@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
_SEPARATOR = ''
_JAR = ''
def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
'instantiated directly. Did you mean POS- or NERTagger?')
'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
searchpath=(), url=_stanford_url,
verbose=verbose)
self._stanford_model = find_file(path_to_model,
self._stanford_model = find_file(model_filename,
env_vars=('STANFORD_MODELS',), verbose=verbose)
self._encoding = encoding
self.java_options = java_options
......@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence)
return tagged_sentences
class POSTagger(StanfordTagger):
class StanfordPOSTagger(StanfordTagger):
"""
A class for pos tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
......@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import POSTagger
>>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
... '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordPOSTagger
>>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
......@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
_JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
super(POSTagger, self).__init__(*args, **kwargs)
super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
'-model', self._stanford_model, '-textFile',
self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
class NERTagger(StanfordTagger):
class StanfordNERTagger(StanfordTagger):
"""
A class for ner tagging with Stanford Tagger. The input is the paths to:
A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
......@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
Example:
>>> from nltk.tag.stanford import NERTagger
>>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
... '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
>>> from nltk.tag import StanfordNERTagger
>>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
>>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
......@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
_FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
super(NERTagger, self).__init__(*args, **kwargs)
super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
......@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
def parse_output(self, text):
if self._FORMAT == 'slashTags':
return super(NERTagger, self).parse_output(text)
return super(StanfordNERTagger, self).parse_output(text)
raise NotImplementedError
......
This diff is collapsed. Click to expand it.
......@@ -13,15 +13,6 @@ This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
try:
from twython import Twython, TwythonStreamer
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
https://twython.readthedocs.org/ for installation instructions."""
err.msg = textwrap.fill(MSG)
raise
from nltk.twitter.util import credsfromfile
from nltk.twitter.twitterclient import Streamer, Query, Twitter, TweetViewer,\
TweetWriter
......@@ -191,29 +191,37 @@ def corpusreader_demo():
* the result of tokenising the raw strings.
"""
from nltk.corpus import TwitterCorpusReader
#from nltk.corpus import TwitterCorpusReader
from nltk.corpus import tweets
tweets.fileids()
#root = os.environ['TWITTER']
#reader = TwitterCorpusReader(root, '1k_sample.json')
reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
#reader = TwitterCorpusReader('twitter', 'tweets.20150417.json')
print()
print("Complete tweet documents")
print(SPACER)
for tweet in reader.docs()[:2]:
for tweet in tweets.docs()[:2]:
print(json.dumps(tweet, indent=1, sort_keys=True))
print()
print("Raw tweet strings:")
print(SPACER)
for text in reader.strings()[:15]:
for text in tweets.strings()[:15]:
print(text)
print()
print("Tokenized tweet strings:")
print(SPACER)
for text in reader.tokenized()[:15]:
for text in tweets.tokenized()[:15]:
print(text)
#def corpusreader_demo():
#from nltk.corpus import brown
#brown.words()
ALL = range(12)
DEMOS = ALL[11:]
......
......@@ -6,11 +6,14 @@
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from twython.exceptions import TwythonRateLimitError
"""
NLTK Twitter client.
NLTK Twitter client
This module offers methods for collecting and processing tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.
If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
......@@ -31,6 +34,7 @@ from nltk.compat import UTC
try:
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonRateLimitError
except ImportError as err:
import textwrap
MSG = """The NLTK twitterclient module requires the Twython package. See\
......@@ -204,10 +208,10 @@ class Query(Twython):
results = self.search(q=keywords, count=min(100, count), lang=lang)
count_from_query = results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
'''
pagination loop: keep fetching tweets until the count requested is reached,
dealing with twitter rate limits
dealing with twitter rate limits
'''
while count_from_query < count:
max_id = results['search_metadata']['max_id']
......@@ -217,7 +221,7 @@ class Query(Twython):
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
time.sleep(15*60) # wait 15 minutes
continue
continue
count_from_query += results['search_metadata']['count']
self.handler.handle_chunk(results['statuses'])
......
......@@ -7,5 +7,6 @@ scipy>=0.13.2
matplotlib>=1.3.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
pyparsing
twython>=3.2.0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment