Commit 515d01c7 by Dmitrijs Milajevs

Merge branch 'develop' into wsd-tests

parents 9c6d1a8f a410b109
;;; Complete symbols at point using Pymacs.

;;; See pycomplete.py for the Python side of things and a short description
;;; of what to expect.

(require 'pymacs)
(require 'python-mode)

(pymacs-load "pycomplete")

(defun py-complete ()
  (interactive)
  (let ((pymacs-forget-mutability t))
    (insert (pycomplete-pycomplete (py-symbol-near-point)
                                   (py-find-global-imports)))))

(defun py-find-global-imports ()
  (save-excursion
    (let (first-class-or-def imports)
      (goto-char (point-min))
      (setq first-class-or-def
            (re-search-forward "^ *\\(def\\|class\\) " nil t))
      (goto-char (point-min))
      (setq imports nil)
      (while (re-search-forward
              "^\\(import \\|from \\([A-Za-z_][A-Za-z_0-9]*\\) import \\).*"
              nil t)
        (setq imports (append imports
                              (list (buffer-substring
                                     (match-beginning 0)
                                     (match-end 0))))))
      imports)))

(define-key py-mode-map "\M-\C-i" 'py-complete)

(provide 'pycomplete)
from __future__ import print_function
"""
Python dot expression completion using Pymacs.
This almost certainly needs work, but if you add
(require 'pycomplete)
to your .xemacs/init.el file (untried w/ GNU Emacs so far) and have Pymacs
installed, when you hit M-TAB it will try to complete the dot expression
before point. For example, given this import at the top of the file:
import time
typing "time.cl" then hitting M-TAB should complete "time.clock".
This is unlikely to be done the way Emacs completion ought to be done, but
it's a start. Perhaps someone with more Emacs mojo can take this stuff and
do it right.
See pycomplete.el for the Emacs Lisp side of things.
"""
import sys
import os.path

try:
    x = set
except NameError:
    from sets import Set as set
else:
    del x


def get_all_completions(s, imports=None):
    """Return contextual completion of s (string of >= zero chars).

    If given, imports is a list of import statements to be executed first.
    """
    locald = {}
    if imports is not None:
        for stmt in imports:
            try:
                exec stmt in globals(), locald
            except TypeError:
                raise TypeError, "invalid type: %s" % stmt

    dots = s.split(".")
    if not s or len(dots) == 1:
        keys = set()
        keys.update(locald.keys())
        keys.update(globals().keys())
        import __builtin__
        keys.update(dir(__builtin__))
        keys = list(keys)
        keys.sort()
        if s:
            return [k for k in keys if k.startswith(s)]
        else:
            return keys

    sym = None
    for i in range(1, len(dots)):
        s = ".".join(dots[:i])
        try:
            sym = eval(s, globals(), locald)
        except NameError:
            try:
                sym = __import__(s, globals(), locald, [])
            except ImportError:
                return []
    if sym is not None:
        s = dots[-1]
        return [k for k in dir(sym) if k.startswith(s)]


def pycomplete(s, imports=None):
    completions = get_all_completions(s, imports)
    dots = s.split(".")
    return os.path.commonprefix([k[len(dots[-1]):] for k in completions])


if __name__ == "__main__":
    print("<empty> ->", pycomplete(""))
    print("sys.get ->", pycomplete("sys.get"))
    print("sy ->", pycomplete("sy"))
    print("sy (sys in context) ->", pycomplete("sy", imports=["import sys"]))
    print("foo. ->", pycomplete("foo."))
    print("Enc (email * imported) ->",)
    print(pycomplete("Enc", imports=["from email import *"]))
    print("E (email * imported) ->",)
    print(pycomplete("E", imports=["from email import *"]))
    print("Enc ->", pycomplete("Enc"))
    print("E ->", pycomplete("E"))
# Local Variables :
# pymacs-auto-reload : t
# End :
@@ -358,10 +358,19 @@ except ImportError: # python 2.6
 # The following datasets have a /PY3 subdirectory containing
 # a full copy of the data which has been re-encoded or repickled.
-_PY3_DATA_UPDATES = ["chunkers/maxent_ne_chunker",
-                     "help/tagsets",
-                     "taggers/maxent_treebank_pos_tagger",
-                     "tokenizers/punkt"]
+_PY3_DATA_UPDATES = []
+
+if sys.platform.startswith('win'):
+    _PY3_DATA_UPDATES = ["chunkers\maxent_ne_chunker",
+                         "help\tagsets",
+                         "taggers\maxent_treebank_pos_tagger",
+                         "tokenizers\punkt"]
+else:
+    _PY3_DATA_UPDATES = ["chunkers/maxent_ne_chunker",
+                         "help/tagsets",
+                         "taggers/maxent_treebank_pos_tagger",
+                         "tokenizers/punkt"]
 # for use in adding /PY3 to the second (filename) argument
 # of the file pointers in data.py
...
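One detail worth flagging in the hunk above: the Windows branch writes the paths with single backslashes inside ordinary string literals, and Python treats the "\t" in "help\tagsets" as a tab character rather than a backslash followed by "t". A hedged sketch, separate from this commit and using a made-up helper list, of building the same keys once with os.path.join so both the platform branch and the escaping question go away:

import os

# Hypothetical rewrite: keep the data keys as path components and let
# os.path.join pick the platform separator.
_PY3_DATA_PARTS = [
    ("chunkers", "maxent_ne_chunker"),
    ("help", "tagsets"),
    ("taggers", "maxent_treebank_pos_tagger"),
    ("tokenizers", "punkt"),
]
_PY3_DATA_UPDATES = [os.path.join(*parts) for parts in _PY3_DATA_PARTS]

print(_PY3_DATA_UPDATES)  # backslash-joined on Windows, slash-joined elsewhere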
@@ -17,15 +17,15 @@ As metrics, they must satisfy the following three requirements:
 1. d(a, a) = 0
 2. d(a, b) >= 0
 3. d(a, c) <= d(a, b) + d(b, c)
 """

 from __future__ import print_function


 def _edit_dist_init(len1, len2):
     lev = []
     for i in range(len1):
-        lev.append([0] * len2)  # initialize 2-D array to zero
+        lev.append([0] * len2)  # initialize 2D array to zero
     for i in range(len1):
         lev[i][0] = i           # column 0: 0,1,2,3,4,...
     for j in range(len2):
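For readers skimming the hunk: _edit_dist_init only builds the Levenshtein matrix with row 0 and column 0 pre-filled. A self-contained sketch of the dynamic program that matrix feeds, assuming unit insertion, deletion and substitution costs (an illustration, not NLTK's own edit_distance):

def _edit_distance_sketch(s1, s2):
    # lev[i][j] = edit distance between s1[:i] and s2[:j]
    len1, len2 = len(s1) + 1, len(s2) + 1
    lev = [[0] * len2 for _ in range(len1)]
    for i in range(len1):
        lev[i][0] = i  # delete all i characters of s1
    for j in range(len2):
        lev[0][j] = j  # insert all j characters of s2
    for i in range(1, len1):
        for j in range(1, len2):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            lev[i][j] = min(lev[i - 1][j] + 1,          # deletion
                            lev[i][j - 1] + 1,          # insertion
                            lev[i - 1][j - 1] + cost)   # substitution
    return lev[-1][-1]

print(_edit_distance_sketch("rain", "shine"))  # 3, as in the demo() examples below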
@@ -114,11 +114,13 @@ def masi_distance(label1, label2):
     labels are assigned.

     >>> from nltk.metrics import masi_distance
-    >>> masi_distance(set([1,2]), set([1,2,3,4]))
+    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
     0.665...

-    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation.
+    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
+    for Semantic and Pragmatic Annotation.
     """
     len_intersection = len(label1.intersection(label2))
     len_union = len(label1.union(label2))
     len_label1 = len(label1)
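The masi_distance hunk above only touches the doctest, but for orientation: MASI is one minus the Jaccard overlap scaled by a monotonicity weight. A rough sketch consistent with the 0.665 doctest value, assuming the usual 1 / 0.67 / 0.33 / 0 weights from Passonneau (2006); the authoritative version is the one in this file:

def _masi_sketch(label1, label2):
    intersection = len(label1 & label2)
    union = len(label1 | label2)
    if label1 == label2:
        m = 1.0        # identical sets
    elif label1 < label2 or label2 < label1:
        m = 0.67       # one set is a proper subset of the other
    elif intersection > 0:
        m = 0.33       # sets overlap without containment
    else:
        m = 0.0        # disjoint sets
    return 1.0 - (intersection / float(union)) * m

print(_masi_sketch({1, 2}, {1, 2, 3, 4}))  # 0.665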
@@ -136,7 +138,7 @@ def masi_distance(label1, label2):

 def interval_distance(label1,label2):
-    """Krippendorff'1 interval distance metric
+    """Krippendorff's interval distance metric

     >>> from nltk.metrics import interval_distance
     >>> interval_distance(1,10)
@@ -144,8 +146,9 @@ def interval_distance(label1,label2):
     Krippendorff 1980, Content Analysis: An Introduction to its Methodology
     """
     try:
-        return pow(label1-label2,2)
+        return pow(label1 - label2, 2)
         # return pow(list(label1)[0]-list(label2)[0],2)
     except:
         print("non-numeric labels not supported with interval distance")
@@ -153,13 +156,17 @@ def interval_distance(label1,label2):

 def presence(label):
     """Higher-order function to test presence of a given label
     """
-    return lambda x,y: 1.0*((label in x) == (label in y))
+    return lambda x, y: 1.0 * ((label in x) == (label in y))


 def fractional_presence(label):
-    return lambda x,y:abs((float(1.0/len(x)) - float(1.0/len(y))))*(label in x and label in y) or 0.0*(label not in x and label not in y) or abs((float(1.0/len(x))))*(label in x and label not in y) or ((float(1.0/len(y))))*(label not in x and label in y)
+    return lambda x, y:\
+        abs((float(1.0 / len(x)) - float(1.0 / len(y)))) * (label in x and label in y) \
+        or 0.0 * (label not in x and label not in y) \
+        or abs(float(1.0 / len(x))) * (label in x and label not in y) \
+        or (float(1.0 / len(y))) * (label not in x and label in y)


 def custom_distance(file):
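presence(label) and fractional_presence(label) return distance functions over pairs of label sets rather than scores, so they are usually handed to an agreement task instead of being called directly. A small usage sketch, assuming both names are importable from nltk.metrics in the same way the doctests above import masi_distance:

from nltk.metrics import presence, fractional_presence

d = presence('cat')
print(d({'cat', 'dog'}, {'cat'}))   # 1.0: both sets agree that 'cat' is present
print(d({'cat', 'dog'}, {'dog'}))   # 0.0: only one of the two sets contains 'cat'

f = fractional_presence('cat')
print(f({'cat'}, {'cat', 'dog'}))   # abs(1/1 - 1/2) = 0.5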
@@ -174,7 +181,9 @@ def custom_distance(file):

 def demo():
-    edit_distance_examples = [("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"), ("language", "lnaugage"), ("language", "lngauage")]
+    edit_distance_examples = [
+        ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"),
+        ("language", "lnaugage"), ("language", "lngauage")]
     for s1, s2 in edit_distance_examples:
         print("Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
     for s1, s2 in edit_distance_examples:
...
@@ -81,7 +81,7 @@ class Boxer(object):
         :param input: str Input sentence to parse
         :param occur_index: bool Should predicates be occurrence indexed?
         :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
-        :return: ``drt.AbstractDrs``
+        :return: ``drt.DrtExpression``
         """
         discourse_ids = ([discourse_id] if discourse_id is not None else None)
         d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
@@ -96,7 +96,7 @@ class Boxer(object):
         :param input: list of str Input sentences to parse as a single discourse
         :param occur_index: bool Should predicates be occurrence indexed?
         :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
-        :return: ``drt.AbstractDrs``
+        :return: ``drt.DrtExpression``
         """
         discourse_ids = ([discourse_id] if discourse_id is not None else None)
         d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
@@ -111,7 +111,7 @@ class Boxer(object):
         :param inputs: list of str Input sentences to parse as individual discourses
         :param occur_index: bool Should predicates be occurrence indexed?
         :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
-        :return: list of ``drt.AbstractDrs``
+        :return: list of ``drt.DrtExpression``
         """
         return self.interpret_multi_sents([[input] for input in inputs], discourse_ids, question, verbose)
@@ -122,7 +122,7 @@ class Boxer(object):
         :param inputs: list of list of str Input discourses to parse
         :param occur_index: bool Should predicates be occurrence indexed?
         :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
-        :return: ``drt.AbstractDrs``
+        :return: ``drt.DrtExpression``
         """
         if discourse_ids is not None:
             assert len(inputs) == len(discourse_ids)
@@ -291,7 +291,7 @@ class BoxerOutputDrsParser(DrtParser):
         """
         Parse a DRS condition
-        :return: list of ``AbstractDrs``
+        :return: list of ``DrtExpression``
         """
         tok = self.token()
         accum = self.handle_condition(tok, indices)
@@ -312,7 +312,7 @@ class BoxerOutputDrsParser(DrtParser):
         Handle a DRS condition
         :param indices: list of int
-        :return: list of ``AbstractDrs``
+        :return: list of ``DrtExpression``
         """
         if tok == 'not':
             return [self._handle_not()]
@@ -1148,7 +1148,7 @@ class NltkDrtBoxerDrsInterpreter(object):
     def interpret(self, ex):
         """
         :param ex: ``AbstractBoxerDrs``
-        :return: ``AbstractDrs``
+        :return: ``DrtExpression``
         """
         if isinstance(ex, BoxerDrs):
             drs = DRS([Variable('x%d' % r) for r in ex.refs], list(map(self.interpret, ex.conds)))
...
@@ -1229,6 +1229,12 @@ def demo():

 def test_draw():
+    try:
+        from tkinter import Tk
+    except ImportError:
+        from nose import SkipTest
+        raise SkipTest("tkinter is required, but it's not available.")
+
     expressions = [
         r'x',
         r'([],[])',
...
@@ -7,14 +7,19 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT

 from nltk import compat  # this fixes tkinter imports for Python 2.x
-from tkinter.font import Font
-from tkinter import (Button, Frame, IntVar, Label,
-                     Listbox, Menu, Scrollbar, Tk)
-from nltk.draw.util import CanvasFrame, ShowText
+try:
+    from tkinter.font import Font
+    from tkinter import (Button, Frame, IntVar, Label,
+                         Listbox, Menu, Scrollbar, Tk)
+    from nltk.draw.util import CanvasFrame, ShowText
+except ImportError:
+    """Ignore ImportError because tkinter might not be available."""
+
 from nltk.util import in_idle
 from nltk.tag import RegexpTagger
 from nltk.parse import MaltParser
@@ -22,6 +27,7 @@ from nltk.sem.logic import Variable
 from nltk.sem.drt import DrsDrawer, DrtVariableExpression
 from nltk.sem.glue import DrtGlue

 class DrtGlueDemo(object):
     def __init__(self, examples):
         # Set up the main window.
...
@@ -580,7 +580,7 @@ class DrtGlueFormula(GlueFormula):
         if isinstance(meaning, string_types):
             self.meaning = drt.DrtExpression.fromstring(meaning)
-        elif isinstance(meaning, drt.AbstractDrs):
+        elif isinstance(meaning, drt.DrtExpression):
             self.meaning = meaning
         else:
             raise RuntimeError('Meaning term neither string or expression: %s, %s' % (meaning, meaning.__class__))
...
@@ -25,7 +25,7 @@ from functools import reduce
 from nltk import compat
 from nltk.parse import load_parser
-from nltk.draw.tree import draw_trees
 from nltk.sem.skolemize import skolemize
 from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression,
                             ExistsExpression, IffExpression, ImpExpression,
@@ -53,8 +53,8 @@ class Constants(object):
     HOLE = 'HOLE'
     LABEL = 'LABEL'

-    MAP = {ALL: lambda v,e: AllExpression(v.variable, e),
-           EXISTS: lambda v,e: ExistsExpression(v.variable, e),
+    MAP = {ALL: lambda v, e: AllExpression(v.variable, e),
+           EXISTS: lambda v, e: ExistsExpression(v.variable, e),
            NOT: NegatedExpression,
            AND: AndExpression,
            OR: OrExpression,
@@ -62,6 +62,7 @@ class Constants(object):
            IFF: IffExpression,
            PRED: ApplicationExpression}

 class HoleSemantics(object):
     """
     This class holds the broken-down components of a hole semantics, i.e. it
@@ -90,8 +91,8 @@ class HoleSemantics(object):
         """
         self.holes = set()
         self.labels = set()
-        self.fragments = {} # mapping of label -> formula fragment
-        self.constraints = set() # set of Constraints
+        self.fragments = {}  # mapping of label -> formula fragment
+        self.constraints = set()  # set of Constraints
         self._break_down(usr)
         self.top_most_labels = self._find_top_most_labels()
         self.top_hole = self._find_top_hole()
@@ -129,7 +130,7 @@ class HoleSemantics(object):
     def _find_top_nodes(self, node_list):
         top_nodes = node_list.copy()
         for f in compat.itervalues(self.fragments):
-            #the label is the first argument of the predicate
+            # the label is the first argument of the predicate
             args = f[1]
             for arg in args:
                 if arg in node_list:
@@ -149,7 +150,7 @@ class HoleSemantics(object):
         Return the hole that will be the top of the formula tree.
         """
         top_holes = self._find_top_nodes(self.holes)
-        assert len(top_holes) == 1 # it must be unique
+        assert len(top_holes) == 1  # it must be unique
         return top_holes.pop()

     def pluggings(self):
@@ -277,7 +278,7 @@ class HoleSemantics(object):
         if node in plugging:
             return self._formula_tree(plugging, plugging[node])
         elif node in self.fragments:
-            pred,args = self.fragments[node]
+            pred, args = self.fragments[node]
             children = [self._formula_tree(plugging, arg) for arg in args]
             return reduce(Constants.MAP[pred.variable.name], children)
         else:
@@ -293,15 +294,19 @@ class Constraint(object):
     def __init__(self, lhs, rhs):
         self.lhs = lhs
         self.rhs = rhs

     def __eq__(self, other):
         if self.__class__ == other.__class__:
             return self.lhs == other.lhs and self.rhs == other.rhs
         else:
             return False

     def __ne__(self, other):
         return not (self == other)

     def __hash__(self):
         return hash(repr(self))

     def __repr__(self):
         return '(%s < %s)' % (self.lhs, self.rhs)
@@ -310,14 +315,16 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
     if not grammar_filename:
         grammar_filename = 'grammars/sample_grammars/hole.fcfg'

-    if verbose: print('Reading grammar file', grammar_filename)
+    if verbose:
+        print('Reading grammar file', grammar_filename)

     parser = load_parser(grammar_filename)

     # Parse the sentence.
     tokens = sentence.split()
     trees = list(parser.parse(tokens))
-    if verbose: print('Got %d different parses' % len(trees))
+    if verbose:
+        print('Got %d different parses' % len(trees))

     all_readings = []
     for tree in trees:
@@ -325,14 +332,16 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
         sem = tree.label()['SEM'].simplify()

         # Print the raw semantic representation.
-        if verbose: print('Raw: ', sem)
+        if verbose:
+            print('Raw: ', sem)

         # Skolemize away all quantifiers. All variables become unique.
         while isinstance(sem, LambdaExpression):
             sem = sem.term
         skolemized = skolemize(sem)

-        if verbose: print('Skolemized:', skolemized)
+        if verbose:
+            print('Skolemized:', skolemized)

         # Break the hole semantics representation down into its components
         # i.e. holes, labels, formula fragments and constraints.
@@ -346,7 +355,7 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
             print('Top hole: ', hole_sem.top_hole)
             print('Top labels: ', hole_sem.top_most_labels)
             print('Fragments:')
-            for (l,f) in hole_sem.fragments.items():
+            for l, f in hole_sem.fragments.items():
                 print('\t%s: %s' % (l, f))

         # Find all the possible ways to plug the formulas together.
@@ -357,7 +366,7 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
         # Print out the formulas in a textual format.
         if verbose:
-            for i,r in enumerate(readings):
+            for i, r in enumerate(readings):
                 print()
                 print('%d. %s' % (i, r))
             print()
@@ -368,7 +377,8 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
 if __name__ == '__main__':
-    for r in hole_readings('a dog barks'): print(r)
+    for r in hole_readings('a dog barks'):
+        print(r)
     print()
-    for r in hole_readings('every girl chases a dog'): print(r)
+    for r in hole_readings('every girl chases a dog'):
+        print(r)
@@ -81,10 +81,9 @@ Later additions:
 Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.

 Additional modifications were made to incorperate this module into
-nltk. All such modifications are marked with \"--NLTK--\". The nltk
-version of this module is maintained by the NLTK developers, and is
-available from <http://nltk.sourceforge.net>
+nltk. All such modifications are marked with \"--NLTK--\".
 """

 from __future__ import print_function, unicode_literals

 ## --NLTK--
...
@@ -26,6 +26,7 @@ from nltk.stem import porter
 from nltk.stem.api import StemmerI

 class SnowballStemmer(StemmerI):

     """
@@ -189,7 +190,6 @@ class _ScandinavianStemmer(_LanguageSpecificStemmer):
         return r1

 class _StandardStemmer(_LanguageSpecificStemmer):

     """
@@ -2430,12 +2430,12 @@ class PortugueseStemmer(_StandardStemmer):
     """

     __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
-    __step1_suffixes = ('amentos', 'imentos', 'uciones', 'amento',
+    __step1_suffixes = ('amentos', 'imentos', 'uções', 'amento',
                         'imento', 'adoras', 'adores', 'a\xE7o~es',
-                        'log\xEDas', '\xEAncias', 'amente',
+                        'logias', '\xEAncias', 'amente',
                         'idades', 'ismos', 'istas', 'adora',
                         'a\xE7a~o', 'antes', '\xE2ncia',
-                        'log\xEDa', 'uci\xF3n', '\xEAncia',
+                        'logia', 'ução', '\xEAncia',
                         'mente', 'idade', 'ezas', 'icos', 'icas',
                         'ismo', '\xE1vel', '\xEDvel', 'ista',
                         'osos', 'osas', 'ador', 'ante', 'ivas',
@@ -2528,11 +2528,11 @@ class PortugueseStemmer(_StandardStemmer):
             elif r2.endswith(suffix):
                 step1_success = True

-                if suffix in ("log\xEDa", "log\xEDas"):
+                if suffix in ("logia", "logias"):
                     word = word[:-2]
                     rv = rv[:-2]

-                elif suffix in ("uci\xF3n", "uciones"):
+                elif suffix in ("ução", "uções"):
                     word = "".join((word[:-len(suffix)], "u"))
                     rv = "".join((rv[:-len(suffix)], "u"))
@@ -3551,7 +3551,7 @@ class SpanishStemmer(_StandardStemmer):
                     word = word[:-len(suffix)]
                     rv = rv[:-len(suffix)]

-                    if word[-2:] == "gu" and rv[-1] == "u":
+                    if word[-2:] == "gu" and rv[-1:] == "u":
                         word = word[:-1]

                 else:
                     word = word[:-len(suffix)]
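The one-character fix above (rv[-1] becomes rv[-1:]) matters because indexing into an empty string raises IndexError while slicing just returns another empty string. A tiny illustration, separate from the commit:

rv = ""

try:
    rv[-1] == "u"
except IndexError:
    print("rv[-1] raises IndexError when rv is empty")

print(rv[-1:] == "u")  # False, and no exception: slicing an empty string gives ""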
@@ -3561,11 +3561,9 @@ class SpanishStemmer(_StandardStemmer):
                    .replace("\xED", "i").replace("\xF3", "o")
                    .replace("\xFA", "u"))

         return word

 class SwedishStemmer(_ScandinavianStemmer):

     """
@@ -3644,11 +3642,9 @@ class SwedishStemmer(_ScandinavianStemmer):
                     word = word[:-1]
                     break

         return word

 def demo():
     """
     This function provides a demonstration of the Snowball stemmers.
@@ -3720,8 +3716,6 @@ def demo():
     print("\n")

 if __name__ == "__main__":
     import doctest
     doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
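For completeness, the stemmers touched in the hunks above are all reached through the same SnowballStemmer entry point. A short usage sketch; the two expected stems are the ones asserted by the new test_spanish unit test further down, and the languages attribute is assumed to be the class-level tuple of supported language names:

from nltk.stem.snowball import SnowballStemmer

print(SnowballStemmer.languages)         # supported language names (assumed attribute)

stemmer = SnowballStemmer('spanish')
print(stemmer.stem("Visionado"))         # 'vision', per the new unit test below
print(stemmer.stem("algue"))             # 'algu'; used to raise IndexError before this fix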
@@ -64,7 +64,7 @@ Construct a lexicon:
     ... ''')

     >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
-    >>> for parse in parser.parse("you prefer that cake".split()):
+    >>> for parse in parser.parse("you prefer that cake".split()):  # doctest: +SKIP
     ...     chart.printCCGDerivation(parse)
     ...     break
     ...
@@ -77,7 +77,7 @@ Construct a lexicon:
     --------------------------------<
                    S

-    >>> for parse in parser.parse("that is the cake which you prefer".split()):
+    >>> for parse in parser.parse("that is the cake which you prefer".split()):  # doctest: +SKIP
     ...     chart.printCCGDerivation(parse)
     ...     break
     ...
@@ -114,7 +114,7 @@ Without Substitution (no output)

 With Substitution:

-    >>> for parse in parser.parse(sent):
+    >>> for parse in parser.parse(sent):  # doctest: +SKIP
     ...     chart.printCCGDerivation(parse)
     ...     break
     ...
@@ -185,7 +185,7 @@ Note that while the two derivations are different, they are semantically equival
     >>> lex = lexicon.parseLexicon(test1_lex)
     >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
     >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()):
-    ...     printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE
+    ...     printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE +SKIP
     I will cook and might eat the mushrooms and parsnips
     NP ((S\NP)/VP) (VP/NP) ((_var2\.,_var2)/.,_var2) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var2\.,_var2)/.,_var2) N
     ---------------------->B
@@ -234,7 +234,7 @@ Interesting to point that the two parses are clearly semantically different.
     >>> lex = lexicon.parseLexicon(test2_lex)
     >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
     >>> for parse in parser.parse("articles which I will file and forget without reading".split()):
-    ...     printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE
+    ...     printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE +SKIP
     articles which I will file and forget without reading
     N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var3\.,_var3)/.,_var3) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
     -----------------<
@@ -271,4 +271,3 @@ Interesting to point that the two parses are clearly semantically different.
     (N\N)
     -----------------------------------------------------------------------------------------------------------------------------<
     N
@@ -8,7 +8,7 @@ counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) a
     >>> from nltk.metrics import Paice

 -------------------------------------
 Understemming and Overstemming values
 -------------------------------------
@@ -26,11 +26,10 @@ Understemming and Overstemming values
     (4.0, 5.0, 2.0, 16.0)
     >>> p.ui, p.oi, p.sw
-    (0.8, 0.125, 0.15625)
+    (0.8..., 0.125..., 0.15625...)
     >>> p.errt
     1.0
-    >>> p.coords
-    [(0.0, 1.0), (0.0, 0.375), (0.6, 0.125), (0.8, 0.125)]
+    >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords]
+    [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')]
\ No newline at end of file
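As a quick sanity check on the rounded doctest values above: in Paice's evaluation method the stemming weight is the ratio of the overstemming index to the understemming index (assuming SW = OI / UI, per Paice 1994), which reproduces the third number in the tuple:

ui, oi = 0.8, 0.125
print(oi / ui)  # 0.15625, matching p.sw in the doctest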
@@ -4,8 +4,13 @@ from nltk.compat import PY3
 from nltk.corpus import teardown_module

 def setup_module(module):
     from nose import SkipTest

+    raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
+
     if not PY3:
-        raise SkipTest("portuguese_en.doctest was skipped because "
-                       "non-ascii doctests are not supported under Python 2.x")
\ No newline at end of file
+        raise SkipTest(
+            "portuguese_en.doctest was skipped because non-ascii doctests are not supported under Python 2.x"
+        )
@@ -3,6 +3,7 @@ from __future__ import print_function, unicode_literals
 import unittest

 from nltk.stem.snowball import SnowballStemmer

 class SnowballTest(unittest.TestCase):

     def test_russian(self):
@@ -22,6 +23,14 @@ class SnowballTest(unittest.TestCase):
         assert stemmer_german.stem("keinen") == 'kein'
         assert stemmer_german2.stem("keinen") == 'keinen'

+    def test_spanish(self):
+        stemmer = SnowballStemmer('spanish')
+        assert stemmer.stem("Visionado") == 'vision'
+        # The word 'algue' was raising an IndexError
+        assert stemmer.stem("algue") == 'algu'
+
     def test_short_strings_bug(self):
         stemmer = SnowballStemmer('english')
         assert stemmer.stem("y's") == 'y'