Commit 31d4363d by gradyward

Stylistic cleanup

parent 9c16fbbe
@@ -3,10 +3,13 @@ Functions that create a machine learning model from training data
 """
 import os
-import sys
+import logging
+import numpy
+import sys
+
+# Constructs a log
+log = logging.getLogger(__name__)
 # Setup base path so that we can import modules who are dependent on it
@@ -15,7 +18,7 @@ sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)
-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import util_functions
 from errors import *
 from datetime import datetime
@@ -179,11 +182,11 @@ def _extract_features_and_generate_model(essay_set):
     # We cannot be sure what kind of errors .fit could throw at us. Memory, Type, Interrupt, etc.
     except Exception as ex:
-        str = (
+        msg = (
             "predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}"
         ).format(ex)
-        log.exception(str)
-        raise ClassifierTrainingInternalError(str)
+        log.exception(msg)
+        raise ClassifierTrainingInternalError(msg)
     return feat_extractor, predict_classifier, cv_error_results
......
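
Review note: the str -> msg rename above is more than style. Binding the name str shadows
the Python builtin for the rest of the scope, so a later call such as str(essay_text) would
raise "TypeError: 'str' object is not callable". A minimal sketch of the failure mode
(hypothetical function, Python 2 to match the codebase):

    def handle_error(ex):
        str = "something failed: {}".format(ex)  # shadows the builtin str
        return str(ex)                           # TypeError: 'str' object is not callable

    def handle_error_fixed(ex):
        msg = "something failed: {}".format(ex)  # the builtin str stays usable
        return msg + " / " + str(ex)
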
@@ -2,6 +2,7 @@
 """
 Errors for the EASE repository
 """
+
 class EaseError(Exception):
     pass
@@ -45,6 +46,7 @@ class InputError(EaseError):
     """
     The user supplied an argument which was incorrect.
     """
+
     def __init__(self, expr, msg):
         self.expr = expr
         self.msg = msg
......
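
Review note: the added blank lines are PEP 8 spacing. For context, a hedged sketch of how
the InputError above might be raised and inspected (hypothetical validation function,
assuming the flat "errors" import path used elsewhere in this commit):

    from errors import InputError

    def validate_score(score):
        if not isinstance(score, int):
            # expr carries the offending input, msg the human-readable explanation
            raise InputError(score, "essay score must be an integer")

    try:
        validate_score("ten")
    except InputError as err:
        print err.expr, err.msg
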
@@ -3,20 +3,21 @@ Defines an essay set object, which encapsulates essays from training and test se
 Performs spell and grammar checking, tokenization, and stemming.
 """
-import nltk
-import sys
 import random
 import os
 import logging
-from ease.errors import InputError
+import nltk
+import sys
+
+from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -97,9 +98,9 @@ class EssaySet(object):
         try:
             essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
         except UnicodeError as ex:
-            str = "Could not parse essay text into ascii: {}".format(ex)
-            log.exception(str)
-            raise EssaySetRequestError(ex)
+            msg = "Could not parse essay text into ascii: {}".format(ex)
+            log.exception(msg)
+            raise EssaySetRequestError(msg)
         # Validates that score is an integer and essay_text is a string.
         try:
@@ -107,9 +108,9 @@
             essay_text = str(essay_text)
             essay_generated = int(essay_generated)
         except TypeError:
-            str = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
-            log.exception(str)
-            raise EssaySetRequestError(str)
+            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
+            log.exception(ex)
+            raise EssaySetRequestError(ex)
         # Validates that essay generated is 0 or 1
         if essay_generated != 0 and essay_generated != 1:
......
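
Review note: the decode('utf-8', 'replace').encode('ascii', 'ignore') round-trip above does
not raise on stray bytes; it silently drops non-ASCII characters. The except UnicodeError
branch fires mainly when essay_text is already a unicode object, because Python 2 then
implicitly re-encodes it as ASCII before decoding. A small Python 2 illustration
(hypothetical input):

    raw = 'caf\xc3\xa9 na\xc3\xafve'  # UTF-8 bytes for "cafe naive" with accents
    cleaned = raw.decode('utf-8', 'replace').encode('ascii', 'ignore')
    print cleaned                     # 'caf nave' -- the accented characters are dropped
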
@@ -2,24 +2,26 @@
 Extracts features from training set and test set essays
 """
-import numpy
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 import os
 from itertools import chain
 import operator
 import logging
+import numpy
+import nltk
+import sys
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -79,9 +81,8 @@ class FeatureExtractor(object):
             sum([len(essay) for essay in essay_set._cleaned_essays]))
         # Gets the number and positions of grammar errors
-        good_pos_tags, bad_pos_positions = self._get_grammar_errors(
-            essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens
-        )
+        good_pos_tags, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
+                                                                    essay_set._cleaned_essays)
         # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
         # grammatical errors. Though this was not what the original author used, it is clearly what his code
         # implies, as if this is intended to be a true "grammar errors per character", we should have that
@@ -154,7 +155,7 @@ class FeatureExtractor(object):
         # SEE COMMENT AROUND LINE 85
         good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
-                                                                          essay_set._cleaned_essays, essay_set._tokens)
+                                                                          essay_set._cleaned_essays)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]
         length_array = numpy.array((
@@ -204,7 +205,7 @@ class FeatureExtractor(object):
         prompt_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             prompt_overlap.append(len([i for i in j if i in prompt_toks]))
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
@@ -212,7 +213,7 @@ class FeatureExtractor(object):
         expand_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             expand_overlap.append(len([i for i in j if i in expand_syns]))
             expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -221,7 +222,7 @@ class FeatureExtractor(object):
         return prompt_arr.copy()

-    def _get_grammar_errors(self, pos, essays, tokens):
+    def _get_grammar_errors(self, pos, essays):
         """
         Internal function to get the number of grammar errors in given text
@@ -251,7 +252,7 @@ class FeatureExtractor(object):
             start, end = bad_pos_tuples[m]
             for j in xrange(m + 1, len(bad_pos_tuples)):
                 lstart, lend = bad_pos_tuples[j]
-                if lstart >= start and lstart <= end:
+                if start <= lstart <= end:
                     bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                     to_delete.append(j)
@@ -268,7 +269,8 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions

-    def _get_good_pos_ngrams(self):
+    @staticmethod
+    def _get_good_pos_ngrams():
         """
         Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
......
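
Review note: on the NOTE about redefining the grammar feature: with the count-based
definition, "grammar errors per character" is simply the number of flagged POS spans divided
by essay length. A hedged sketch of that computation (hypothetical helper; the real
_get_grammar_errors returns its counts alongside bad_pos_positions):

    def grammar_errors_per_character(bad_pos_positions, cleaned_essays):
        # bad_pos_positions: per-essay lists of (start, end) spans flagged as bad POS n-grams
        # cleaned_essays: the corresponding cleaned essay strings
        return [len(spans) / max(float(len(text)), 1.0)
                for spans, text in zip(bad_pos_positions, cleaned_essays)]

    print grammar_errors_per_character([[(0, 2)], []], ['short essay', 'another essay'])
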
@@ -8,11 +8,12 @@ import logging
 import sys
+
 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 from errors import *
@@ -45,7 +46,6 @@ def grade(grader_data, submission):
     # Instantiates the Essay set which will carry our essay while it is being classified and graded.
     grader_set = EssaySet(essay_type="test")
-    feedback = {}

     # Retrieves the model and extractor we will be using
     model, extractor = _get_classifier_and_extractor(grader_data)
......
 import unittest
 import os
-from ease import create, grade
 import random
 import logging
 import json
+
+from ease import create, grade

 log = logging.getLogger(__name__)
 ROOT_PATH = os.path.abspath(__file__)
@@ -14,8 +16,10 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 50
 QUICK_TEST_LIMIT = 5

+# noinspection PyClassHasNoInit
 class DataLoader():
-    def load_text_files(self, pathname):
+    @staticmethod
+    def load_text_files(pathname):
         filenames = os.listdir(pathname)
         text = []
         for filename in filenames:
@@ -23,7 +27,8 @@ class DataLoader():
             text.append(data[:CHARACTER_LIMIT])
         return text

-    def load_json_file(self, filename):
+    @staticmethod
+    def load_json_file(filename):
         datafile = open(os.path.join(filename))
         data = json.load(datafile)
         return data
@@ -34,38 +39,42 @@ class DataLoader():
     """
     pass

 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                       not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        # Sort so neg is first
         directories.sort()
-        #We need to have both a postive and a negative folder to classify
-        if len(directories)!=2:
+        # We need to have both a postive and a negative folder to classify
+        if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos
         return scores, text
 class JSONLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
-        files = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if os.path.isfile(os.path.join(self.pathname,f)) if f.endswith(".json")]
+        files = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                 os.path.isfile(os.path.join(self.pathname, f)) if f.endswith(".json")]
         files.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(files) == 0:
             return [], []
@@ -76,19 +85,19 @@ class JSONLoader(DataLoader):
         all_scores = []
         all_text = []
-        for i in xrange(0,len(data)):
+        for i in xrange(0, len(data)):
             scores = [d['score'] for d in data[i]]
             text = [d['text'] for d in data[i]]
             if isinstance(scores[0], list):
                 new_text = []
                 new_scores = []
-                for i in xrange(0,len(scores)):
-                    text = scores[i]
-                    s = scores[i]
-                    for j in s:
+                for j in xrange(0, len(scores)):
+                    text = scores[j]
+                    s = scores[j]
+                    for k in s:
                         new_text.append(text)
-                        new_scores.append(j)
+                        new_scores.append(k)
                 text = new_text
                 scores = new_scores
@@ -97,12 +106,13 @@ class JSONLoader(DataLoader):
         return all_scores, all_text

 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text, list):
             self.create_model_generic = False
         else:
@@ -112,7 +122,9 @@ class ModelCreator():
         if not self.create_model_generic:
             return create.create(self.text, self.scores, "")
         else:
-            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
+            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []),
+                                         self.scores)

 class Grader():
     def __init__(self, model_data):
@@ -122,7 +134,9 @@ class Grader():
         if isinstance(submission, basestring):
             return grade.grade(self.model_data, submission)
         else:
-            return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
+            return grade.grade_generic(self.model_data, submission.get('numeric_values', []),
+                                       submission.get('textual_values', []))

 class GenericTest(object):
     loader = DataLoader
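
Review note: the list-vs-dict dispatch in ModelCreator (and the matching basestring check in
Grader.grade) means both helpers accept two call shapes. A hedged usage sketch (example data
invented; the 'numeric_values'/'textual_values' keys come from the code above):

    # Plain essays: text is a list of strings, routed to create.create / grade.grade
    model = ModelCreator([0, 1], ['weak essay', 'strong essay']).create_model()
    print Grader(model).grade('a new essay')

    # Generic features: text is a dict, routed to create_generic / grade_generic
    generic_model = ModelCreator([0, 1], {
        'numeric_values': [[1], [2]],
        'textual_values': [['weak'], ['strong']],
    }).create_model()
    print Grader(generic_model).grade({'numeric_values': [2], 'textual_values': ['strong']})
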
@@ -137,11 +151,11 @@ class GenericTest(object):
         return scores, text

     def generic_setup(self, scores, text):
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
         shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -159,45 +173,46 @@ class GenericTest(object):
         grader = Grader(results)
         results = grader.grade(self.text[0])
-        assert results['success']==True
+        assert results['success'] == True

     def scoring_accuracy(self):
         random.seed(1)
         model_creator = ModelCreator(self.scores, self.text)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         cv_kappa = results['cv_kappa']
         cv_mae = results['cv_mean_absolute_error']
-        assert cv_kappa>=self.expected_kappa_min
-        assert cv_mae <=self.expected_mae_max
+        assert cv_kappa >= self.expected_kappa_min
+        assert cv_mae <= self.expected_mae_max

     def generic_model_creation_and_grading(self):
         log.info(self.scores)
         log.info(self.text)
-        score_subset = [random.randint(0,100) for i in xrange(0,min([QUICK_TEST_LIMIT, len(self.scores)]))]
+        score_subset = [random.randint(0, 100) for i in xrange(0, min([QUICK_TEST_LIMIT, len(self.scores)]))]
         text_subset = self.text[:QUICK_TEST_LIMIT]
         text_subset = {
-            'textual_values' : [[t] for t in text_subset],
-            'numeric_values' : [[1] for i in xrange(0,len(text_subset))]
+            'textual_values': [[t] for t in text_subset],
+            'numeric_values': [[1] for i in xrange(0, len(text_subset))]
         }
         model_creator = ModelCreator(score_subset, text_subset)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         grader = Grader(results)
         test_text = {
-            'textual_values' : [self.text[0]],
-            'numeric_values' : [1]
+            'textual_values': [self.text[0]],
+            'numeric_values': [1]
         }
         results = grader.grade(test_text)
-        assert results['success']==True
+        assert results['success'] == True

-class PolarityTest(unittest.TestCase,GenericTest):
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -214,12 +229,13 @@ class PolarityTest(unittest.TestCase,GenericTest):
     def test_generic_model_creation_and_grading(self):
         self.generic_model_creation_and_grading()

 class JSONTest(GenericTest):
     loader = JSONLoader
     data_path = "data/json_data"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -227,10 +243,11 @@ class JSONTest(GenericTest):
         self.scores, self.text = self.load_data()
         return self.scores, self.text

 def test_loop():
     json_test = JSONTest()
     scores, text = json_test.setUp()
-    for i in xrange(0,len(scores)):
+    for i in xrange(0, len(scores)):
         json_test.generic_setup(scores[i], text[i])
         yield json_test.model_creation_and_grading
         yield json_test.scoring_accuracy
......
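
Review note: test_loop above uses nose's generator-test protocol: a test function that
yields callables (optionally with arguments) produces one collected test per yield. A
minimal sketch of the same pattern:

    def test_even_numbers():
        for value in [2, 4, 6]:
            # nose runs check_even(value) as a separate test case for each yield
            yield check_even, value

    def check_even(value):
        assert value % 2 == 0
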
 from unittest import TestCase
 from nose.tools import assert_equal
 from mock import patch
 from ease.util_functions import spell_correct
@@ -35,7 +36,6 @@ class SpellCheckUnitTest(TestCase):
     @patch("util_functions.os.popen")
     def test_aspell_not_found(self, popen_mock):
         # Expected behavior when aspell is not installed is to return the original
         # string with no corrections.
-
         popen_mock.side_effect = OSError
......
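
Review note: @patch("util_functions.os.popen") swaps out os.popen as seen from
util_functions, and side_effect = OSError makes every call raise, simulating a machine with
no aspell binary. A self-contained sketch of the same mock pattern (hypothetical function
under test):

    import os
    from mock import patch

    def read_uptime():
        return os.popen('uptime').read()

    @patch('os.popen')
    def check_popen_failure(popen_mock):
        popen_mock.side_effect = OSError  # every os.popen call now raises
        try:
            read_uptime()
        except OSError:
            print 'reached the no-aspell fallback path'

    check_popen_failure()
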
@@ -23,9 +23,9 @@ log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"

-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
@@ -100,7 +100,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if (len(incorrect[i]) > 10):
+        if len(incorrect[i]) > 10:
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -167,12 +167,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
     """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
-    dict_matrix = dict.fit_transform(essays)
+    dictionary = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dictionary.fit_transform(essays)
     set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if (med_score == 0):
+    if med_score == 0:
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -190,12 +190,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
         fish_vals.append(fish_val)

     cutoff = 1
-    if (len(fish_vals) > max_features_pass_2):
+    if len(fish_vals) > max_features_pass_2:
         cutoff = sorted(fish_vals)[max_features_pass_2]
     good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
+    get_var = lambda search_list, ind: [search_list[i] for i in ind]
+    vocab = get_var(dictionary.get_feature_names(), good_cols)
     return vocab
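
Review note: for context on the dict -> dictionary rename (another shadowed builtin),
get_vocab is a two-pass vocabulary selector: pass one keeps the max_features_pass_1 most
frequent 1-2-grams, pass two scores each column against a median split of the scores and
keeps the best max_features_pass_2. A simplified, hedged sketch of that shape (the real code
ranks columns by Fisher exact test values; this placeholder uses a plain mean difference):

    import numpy
    from sklearn.feature_extraction.text import CountVectorizer

    def select_vocab(essays, scores, pass_1=750, pass_2=200):
        vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=pass_1)
        counts = vectorizer.fit_transform(essays).toarray()
        labels = numpy.asarray(scores) >= numpy.median(scores)  # median split, as above
        # Placeholder column score; the real code computes a Fisher exact test per column
        col_scores = numpy.abs(counts[labels].mean(axis=0) - counts[~labels].mean(axis=0))
        keep = numpy.argsort(col_scores)[::-1][:pass_2]
        names = vectorizer.get_feature_names()  # older scikit-learn API, as in the code above
        return [names[i] for i in sorted(keep)]
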
@@ -219,14 +219,13 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         chunks.append(range(range_min, range_max))
     preds = []
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    chunk_vec = numpy.asarray(range(0, len(chunks)))
     for i in xrange(0, len(chunks)):
         loop_inds = list(
             chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return (all_preds)
+    return all_preds

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
@@ -260,7 +259,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0

-    if (num_ratings > 1):
+    if num_ratings > 1:
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
......
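
Review note: quadratic_weighted_kappa implements Cohen's kappa with quadratic weights
w_ij = (i - j)^2 / (k - 1)^2 over observed and expected rating matrices. A compact, hedged
re-derivation (simplified relative to the function above, which also accepts explicit
min/max ratings):

    import numpy

    def qwk(rater_a, rater_b):
        min_r = min(min(rater_a), min(rater_b))
        max_r = max(max(rater_a), max(rater_b))
        k = max_r - min_r + 1
        if k == 1:
            return 1.0  # a single rating category: trivially perfect agreement
        observed = numpy.zeros((k, k))
        for a, b in zip(rater_a, rater_b):
            observed[a - min_r][b - min_r] += 1
        hist_a = observed.sum(axis=1)
        hist_b = observed.sum(axis=0)
        expected = numpy.outer(hist_a, hist_b) / float(len(rater_a))
        weights = numpy.array([[(i - j) ** 2 for j in range(k)]
                               for i in range(k)], dtype=float) / (k - 1) ** 2
        return 1.0 - (weights * observed).sum() / (weights * expected).sum()

    print qwk([1, 2, 3, 3], [1, 2, 3, 2])  # 0.8 on this toy example
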