Commit fbbcf3a5 by Ewan Klein

merge from develop branch

parents a1d30e00 2c4d7627
...@@ -94,6 +94,17 @@ try: ...@@ -94,6 +94,17 @@ try:
except ImportError: except ImportError:
pass pass
# Override missing subprocess attributes in environments, such as GAE, where subprocess cannot be used.
import subprocess
if not hasattr(subprocess, 'PIPE'):
def _fake_PIPE(*args, **kwargs):
raise NotImplementedError('subprocess.PIPE is not supported.')
subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, 'Popen'):
def _fake_Popen(*args, **kwargs):
raise NotImplementedError('subprocess.Popen is not supported.')
subprocess.Popen = _fake_Popen
########################################################### ###########################################################
# TOP-LEVEL MODULES # TOP-LEVEL MODULES
########################################################### ###########################################################
......
...@@ -73,7 +73,7 @@ path = [] ...@@ -73,7 +73,7 @@ path = []
# User-specified locations: # User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d] path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
if os.path.expanduser('~/') != '~/': if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
path.append(os.path.expanduser(str('~/nltk_data'))) path.append(os.path.expanduser(str('~/nltk_data')))
if sys.platform.startswith('win'): if sys.platform.startswith('win'):
......
...@@ -924,6 +924,10 @@ class Downloader(object): ...@@ -924,6 +924,10 @@ class Downloader(object):
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``, permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``. ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
""" """
# Check if we are on GAE, where we cannot write to the filesystem.
if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a # Check if we have sufficient permissions to install in a
# variety of system-wide locations. # variety of system-wide locations.
for nltkdir in nltk.data.path: for nltkdir in nltk.data.path:
...@@ -2267,4 +2271,3 @@ if __name__ == '__main__': ...@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
downloader.download(download_dir=options.dir, downloader.download(download_dir=options.dir,
quiet=options.quiet, force=options.force, quiet=options.quiet, force=options.force,
halt_on_error=options.halt_on_error) halt_on_error=options.halt_on_error)
...@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(), ...@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if searchpath: if searchpath:
msg += '\n\n Searched in:' msg += '\n\n Searched in:'
msg += ''.join('\n - %s' % d for d in searchpath) msg += ''.join('\n - %s' % d for d in searchpath)
if url: msg += ('\n\n For more information, on %s, see:\n <%s>' % if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
(filename, url)) (filename, url))
div = '='*75 div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div)) raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
......
...@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger ...@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger from nltk.tag.hunpos import HunposTagger
from nltk.tag.stanford import StanfordTagger from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag from nltk.tag.mapping import tagset_mapping, map_tag
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford NER-tagger # Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
# #
# Copyright (C) 2001-2015 NLTK Project # Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org> # Author: Nitin Madnani <nmadnani@ets.org>
...@@ -9,6 +9,12 @@ ...@@ -9,6 +9,12 @@
""" """
A module for interfacing with the Stanford taggers. A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable must be set (to a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
""" """
import os import os
...@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI): ...@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
_SEPARATOR = '' _SEPARATOR = ''
_JAR = '' _JAR = ''
def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'): def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
if not self._JAR: if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be ' warnings.warn('The StanfordTagger class is not meant to be '
'instantiated directly. Did you mean POS- or NERTagger?') 'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar( self._stanford_jar = find_jar(
self._JAR, path_to_jar, self._JAR, path_to_jar,
searchpath=(), url=_stanford_url, searchpath=(), url=_stanford_url,
verbose=verbose) verbose=verbose)
self._stanford_model = find_file(path_to_model, self._stanford_model = find_file(model_filename,
env_vars=('STANFORD_MODELS',), verbose=verbose) env_vars=('STANFORD_MODELS',), verbose=verbose)
self._encoding = encoding self._encoding = encoding
self.java_options = java_options self.java_options = java_options
...@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI): ...@@ -100,7 +106,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence) tagged_sentences.append(sentence)
return tagged_sentences return tagged_sentences
class POSTagger(StanfordTagger): class StanfordPOSTagger(StanfordTagger):
""" """
A class for pos tagging with Stanford Tagger. The input is the paths to: A class for pos tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data - a model trained on training data
...@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger): ...@@ -110,9 +116,8 @@ class POSTagger(StanfordTagger):
Example: Example:
>>> from nltk.tag.stanford import POSTagger >>> from nltk.tag import StanfordPOSTagger
>>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger', >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
... '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
""" """
...@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger): ...@@ -121,7 +126,7 @@ class POSTagger(StanfordTagger):
_JAR = 'stanford-postagger.jar' _JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(POSTagger, self).__init__(*args, **kwargs) super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property @property
def _cmd(self): def _cmd(self):
...@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger): ...@@ -129,9 +134,9 @@ class POSTagger(StanfordTagger):
'-model', self._stanford_model, '-textFile', '-model', self._stanford_model, '-textFile',
self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences'] self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
class NERTagger(StanfordTagger): class StanfordNERTagger(StanfordTagger):
""" """
A class for ner tagging with Stanford Tagger. The input is the paths to: A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data - a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here, - (optionally) the path to the stanford tagger jar file. If not specified here,
...@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger): ...@@ -140,9 +145,8 @@ class NERTagger(StanfordTagger):
Example: Example:
>>> from nltk.tag.stanford import NERTagger >>> from nltk.tag import StanfordNERTagger
>>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz', >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
... '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
>>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
...@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger): ...@@ -154,7 +158,7 @@ class NERTagger(StanfordTagger):
_FORMAT = 'slashTags' _FORMAT = 'slashTags'
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(NERTagger, self).__init__(*args, **kwargs) super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property @property
def _cmd(self): def _cmd(self):
...@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger): ...@@ -165,7 +169,7 @@ class NERTagger(StanfordTagger):
def parse_output(self, text): def parse_output(self, text):
if self._FORMAT == 'slashTags': if self._FORMAT == 'slashTags':
return super(NERTagger, self).parse_output(text) return super(StanfordNERTagger, self).parse_output(text)
raise NotImplementedError raise NotImplementedError
......
This diff is collapsed. Click to expand it.
...@@ -7,5 +7,6 @@ scipy>=0.13.2 ...@@ -7,5 +7,6 @@ scipy>=0.13.2
matplotlib>=1.3.1 matplotlib>=1.3.1
scikit-learn>=0.14.1 scikit-learn>=0.14.1
python-crfsuite>=0.8.2 python-crfsuite>=0.8.2
pyparsing
twython>=3.2.0 twython>=3.2.0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment