Commit 2e6cb8e5 authored by Hugh Brown, committed by Vik Paruchuri

./grade.py: W391 blank line at end of file

parent 0d7ac804
[pep8]
ignore=E501,E712,E711
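For context: W391 (named in the commit message) flags a blank line at the end of a file, while the codes placed on the pep8 ignore list are E501 (line too long), E712 (comparison to True/False with ==), and E711 (comparison to None with == or !=). A hypothetical snippet, not from this repository, showing what the two ignored comparison checks would flag:

result, flag = None, True
if result == None:   # E711: the idiomatic form is `result is None`
    pass
if flag == True:     # E712: the idiomatic form is `if flag:`
    pass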
@@ -7,22 +7,23 @@ import sys
import logging
import numpy
#Define base path and add to sys path
# Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
sys.path.append(one_up_path)
#Import modules that are dependent on the base path
# Import modules that are dependent on the base path
import model_creator
import util_functions
import predictor_set
import predictor_extractor
#Make a log
# Make a log
log = logging.getLogger(__name__)
def create(text,score,prompt_string):
def create(text, score, prompt_string):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
prompt_string - the common prompt for the set of essays
"""
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
'score' : score, 'text' : text, 'prompt' : prompt_string}
# Initialize a results dictionary to return
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': util_functions.AlgorithmTypes.classification,
'score': score, 'text': text, 'prompt': prompt_string}
if len(text)!=len(score):
if len(text) != len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
# Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score)))>5:
# Count the number of unique score points in the score list
if len(util_functions.f7(list(score))) > 5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
# Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
except:
msg = "essay set creation failed."
results['errors'].append(msg)
log.exception(msg)
try:
#Gets features from the essay set and computes error
# Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['algorithm'] = type
results['success']=True
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
return results
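For orientation, a minimal usage sketch of create() with made-up inputs; it assumes this module is importable as `create` and that its dependencies are installed:

import create

texts = ["First sample essay text for training.", "Second sample essay text for training."]
scores = [1, 3]
prompt = "Describe the water cycle."

results = create.create(texts, scores, prompt)
if results['success']:
    print results['cv_kappa'], results['cv_mean_absolute_error']
else:
    print results['errors']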
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list numeric values and text values
numeric_values - A list of lists that are the predictors
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm - the type of algorithm that will be used
"""
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
# Initialize a result dictionary to return.
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
return results
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
# Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -107,16 +108,16 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
log.exception(msg)
try:
#Extract all features and then train a classifier with the features
# Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
return results
\ No newline at end of file
return results
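A similar hedged sketch for create_generic(): each inner list of numeric_values is one row of numeric predictors and textual_values holds the matching text fields (tiny toy inputs for illustration; real training needs many more rows):

import create
import util_functions

numeric_values = [[1.0, 2.0], [0.5, 3.0], [2.0, 1.0]]
textual_values = [["some text"], ["other text"], ["more text"]]
target = [0, 1, 1]

results = create.create_generic(numeric_values, textual_values, target,
                                algorithm=util_functions.AlgorithmTypes.regression)
print results['success'], results['errors']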
@@ -15,11 +15,12 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log=logging.getLogger(__name__)
log = logging.getLogger(__name__)
MAXIMUM_ESSAY_LENGTH = 20000
MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object):
def __init__(self, type="train"):
@@ -30,17 +31,17 @@ class EssaySet(object):
type = "train"
self._type = type
self._score=[]
self._text=[]
self._id=[]
self._clean_text=[]
self._tokens=[]
self._pos=[]
self._clean_stem_text=[]
self._score = []
self._text = []
self._id = []
self._clean_text = []
self._tokens = []
self._pos = []
self._clean_stem_text = []
self._generated = []
self._prompt = ""
self._spelling_errors=[]
self._markup_text=[]
self._spelling_errors = []
self._markup_text = []
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
@@ -58,35 +59,35 @@ class EssaySet(object):
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
essay_text=essay_text.encode('ascii', 'ignore')
if len(essay_text)<5:
essay_text="Invalid essay."
essay_text = essay_text.encode('ascii', 'ignore')
if len(essay_text) < 5:
essay_text = "Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
#Try conversion of types
essay_score=int(essay_score)
essay_text=str(essay_text)
# Try conversion of types
essay_score = int(essay_score)
essay_text = str(essay_text)
except:
#Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
# Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
# Clean text by removing non digit/work/punctuation characters
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
essay_text = str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
cleaned_essay = util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
@@ -112,7 +113,7 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(type(prompt_text) == type("text")):
if(isinstance(prompt_text, type("text"))):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
@@ -145,4 +146,4 @@ class EssaySet(object):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
self.add_essay(new_essays[z], e_score, 1)
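To make the EssaySet workflow concrete, a small hypothetical sketch (assuming the import path `essay_set` used elsewhere in this commit):

from essay_set import EssaySet

essays = EssaySet(type="train")
essays.update_prompt("Describe the water cycle.")
essays.add_essay("Water evaporates, condenses into clouds, and falls as rain.", 3)
essays.add_essay("ok", 0)  # texts shorter than five characters are replaced with "Invalid essay."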
@@ -21,6 +21,8 @@
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
b = math.log(binomial_coefficient(n, k - x))
c = math.log(binomial_coefficient(m + n, k))
return math.exp(a + b - c)
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
for i in xrange(s + 1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
return numerator / denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
return hypergeometric_pmf(a, a + b, c + d, a + c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
return [[table[1][0], table[0][0]],
[table[1][1], table[0][1]]]
def _min_rotation(self):
"""
@@ -241,8 +247,9 @@ extreme.
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
if abs(v1 - v2) <= 10 ** (-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
@@ -252,4 +259,4 @@ if __name__ == "__main__":
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
\ No newline at end of file
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
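The log-space arithmetic in hypergeometric_pmf() exists only to avoid overflow with large inputs; for small ones it should agree with the direct ratio of binomial coefficients. A hedged sanity check, assuming the import path `external_code.fisher` used elsewhere in this commit:

from external_code.fisher import fisher

direct = float(fisher.binomial_coefficient(5, 2) * fisher.binomial_coefficient(7, 1)) / fisher.binomial_coefficient(12, 3)
assert abs(fisher.hypergeometric_pmf(2, 5, 7, 3) - direct) < 1e-9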
@@ -8,24 +8,25 @@ import os
import numpy
import logging
#Append sys to base path to import the following modules
# Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
#Depend on base path to be imported
# Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
# Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
def grade(grader_data,submission):
def grade(grader_data, submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
submission - The student submission (string)
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
# Initialize result dictionary
results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
has_error = False
grader_set=EssaySet(type="test")
grader_set = EssaySet(type="test")
#This is to preserve legacy functionality
# This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
# Try to add essay to essay set object
grader_set.add_essay(str(submission), 0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
# Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
results['score']=int(grader_data['model'].predict(grader_feats)[0])
except :
grader_feats = grader_data['extractor'].gen_feats(grader_set)
feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
results['score'] = int(grader_data['model'].predict(grader_feats)[0])
except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
has_error = True
#Try to determine confidence level
# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
# If the essay is just a copy of the prompt, return a 0 as the score
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
results['score'] = 0
results['correct'] = False
results['success']=True
results['success'] = True
#Generate short form output--number of problem areas identified in feedback
# Generate short form output--number of problem areas identified in feedback
#Add feedback to results if available
# Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
'topicality': feedback['topicality'],
'prompt-overlap': feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
'spelling': feedback['spelling'],
'grammar': feedback['grammar'],
'markup-text': feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
# If error, success is False.
results['success'] = False
return results
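Putting create() and grade() together, a hypothetical end-to-end sketch; grader_data mirrors the keys this function reads ('model', 'extractor', 'prompt', 'algorithm', 'score'):

import create
import grade

texts = ["First sample essay.", "Second sample essay.", "Third sample essay."]
scores = [0, 1, 2]
prompt = "Describe the water cycle."

model_info = create.create(texts, scores, prompt)
grader_data = {
    'model': model_info['classifier'],
    'extractor': model_info['feature_ext'],
    'prompt': prompt,
    'algorithm': model_info['algorithm'],
    'score': scores,
}
result = grade.grade(grader_data, "A new student submission to score.")
print result['score'], result['confidence']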
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
has_error=False
has_error = False
#Try to find and load the model file
# Try to find and load the model file
grader_set=predictor_set.PredictorSet(type="test")
grader_set = predictor_set.PredictorSet(type="test")
#Try to add essays to essay set object
# Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
grader_set.add_row(numeric_features, textual_features, 0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
# Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
results['score']=grader_data['model'].predict(grader_feats)[0]
except :
grader_feats = grader_data['extractor'].gen_feats(grader_set)
results['score'] = grader_data['model'].predict(grader_feats)[0]
except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
has_error = True
#Try to determine confidence level
# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return results
def get_confidence_value(algorithm,model,grader_feats,score, scores):
def get_confidence_value(algorithm, model, grader_feats, score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
max_score=max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
raw_confidence = model.predict_proba(grader_feats)[0, (float(score) -float(min_score))]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
elif hasattr(model, "predict"):
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
confidence = 0
return confidence
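The classification branch above indexes scikit-learn's predict_proba output, which has one row per sample and one column per class; the `score - min_score` offset assumes the score points are consecutive integers. A toy illustration with made-up data (not from this repository):

import numpy
import sklearn.ensemble

feats = numpy.array([[0.0], [1.0], [2.0], [0.1], [1.1], [2.1]])
labels = numpy.array([0, 1, 2, 0, 1, 2])
model = sklearn.ensemble.GradientBoostingClassifier(n_estimators=10).fit(feats, labels)

probs = model.predict_proba(feats[:1])          # shape: (1, number of classes)
score = int(model.predict(feats[:1])[0])
confidence = probs[0, score - 0]                # same indexing as above, with min_score == 0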
#Provides interface functions to create and save models
# Provides interface functions to create and save models
import numpy
import re
@@ -19,7 +19,8 @@ import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
log = logging.getLogger()
def read_in_test_data(filename):
"""
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
def read_in_test_data_twocolumn(filename, sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
def get_cv_error(clf, feats, scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
results = {'success': False, 'kappa': 0, 'mae': 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
results['mae'] = err
results['kappa'] = kappa
results['success'] = True
except ValueError:
#If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
except:
log.exception("Error getting cv error estimates.")
return results
def get_algorithms(type):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
@@ -116,14 +120,14 @@ def get_algorithms(type):
"""
if type == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2
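One caveat for anyone reading this diff against a newer scikit-learn: `learn_rate` is the parameter name in the version this code targets; later releases renamed it to `learning_rate`, so a modern equivalent of the classification branch would be (sketch, not part of this commit):

import sklearn.ensemble

clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                                  max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                                   max_depth=4, random_state=1, min_samples_leaf=3)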
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
clf, clf2 = get_algorithms(type)
cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
train_feats = f.gen_feats(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
if len(util_functions.f7(list(set_score))) > 5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
clf,clf2 = get_algorithms(type)
clf, clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
cv_error_results = get_cv_error(clf2, train_feats, essays._score)
try:
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
classifier is a trained classifier
model_path is the path of write out the model file to
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
pickle.dump(model_file, file=open(model_path, "w"))
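The pickled dictionary written here has the same shape grade.grade() expects as grader_data (the 'algorithm' key is filled in with a default when missing), so a stored model can be reloaded with a plain pickle.load. A hypothetical round trip:

import pickle
import grade

model_file = pickle.load(open("models/example_model.p", "r"))   # hypothetical path
result = grade.grade(model_file, "A new student submission to score.")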
def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
essay_set=create_essay_set(text_score,prompt)
feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
dump_model_to_file(prompt,feature_ext,clf,model_path)
essay_set = create_essay_set(text_score, prompt)
feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
dump_model_to_file(prompt, feature_ext, clf, model_path)
@@ -16,17 +16,18 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
div_length=len(p_set._essay_sets)
if div_length==0:
div_length=1
div_length = len(p_set._essay_sets)
if div_length == 0:
div_length = 1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
# Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
Generates features based on an iput p_set
p_set - PredictorSet
"""
if self._initialized!=True:
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
textual_features = []
for i in xrange(0,len(p_set._essay_sets)):
for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
@@ -11,12 +11,13 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, type = "train"):
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
"""
@@ -24,13 +25,13 @@ class PredictorSet(object):
type = "train"
self._type = type
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self._essay_sets=[]
self._target = []
self._textual_features = []
self._numeric_features = []
self._essay_sets = []
def add_row(self, numeric_features, textual_features, target):
#Basic input checking
# Basic input checking
if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value."
log.exception(error_message)
@@ -46,26 +47,26 @@ class PredictorSet(object):
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Do some length checking for parameters
if len(self._numeric_features)>0:
numeric_length = len(self._numeric_features[-1])
# Do some length checking for parameters
if len(self._numeric_features) > 0:
numeric_length = len(self._numeric_features[-1])
current_numeric_length = len(numeric_features)
if numeric_length != current_numeric_length:
error_message = "Numeric features are an improper length."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if len(self._textual_features)>0:
textual_length = len(self._textual_features[-1])
if len(self._textual_features) > 0:
textual_length = len(self._textual_features[-1])
current_textual_length = len(textual_features)
if textual_length != current_textual_length:
error_message = "Textual features are an improper length."
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Now check to see if text features and numeric features are individually correct
# Now check to see if text features and numeric features are individually correct
for i in xrange(0,len(numeric_features)):
for i in xrange(0, len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except:
@@ -73,8 +74,7 @@ class PredictorSet(object):
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
for i in xrange(0,len(textual_features)):
for i in xrange(0, len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except:
@@ -82,19 +82,18 @@ class PredictorSet(object):
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
# Create essay sets for textual features if needed
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(type=self._type))
#Add numeric and textual features
# Add numeric and textual features
self._numeric_features.append(numeric_features)
self._textual_features.append(textual_features)
#Add targets
# Add targets
self._target.append(target)
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
# Add textual features to essay sets
for i in xrange(0, len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
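A small hypothetical sketch of add_row(), matching the validation above; every row must carry the same number of numeric and textual features:

import predictor_set

pset = predictor_set.PredictorSet(type="train")
pset.add_row([1.0, 2.5], ["first text field", "second text field"], 0)
pset.add_row([0.5, 1.5], ["another text field", "yet another one"], 1)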
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
TRAINING_LIMIT = 100
QUICK_TEST_LIMIT = 5
class DataLoader():
def load_text_files(self, pathname):
filenames = os.listdir(pathname)
@@ -28,34 +29,36 @@ class DataLoader():
"""
pass
class PolarityLoader(DataLoader):
def __init__(self, pathname):
self.pathname = pathname
def load_data(self):
filenames = os.listdir(self.pathname)
directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
#Sort so neg is first
# Sort so neg is first
directories.sort()
#We need to have both a postive and a negative folder to classify
if len(directories)!=2:
# We need to have both a postive and a negative folder to classify
if len(directories) != 2:
raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
neg = self.load_text_files(directories[0])
pos = self.load_text_files(directories[1])
scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
text = neg + pos
return scores, text
class ModelCreator():
def __init__(self, scores, text):
self.scores = scores
self.text = text
#Governs which creation function in the ease.create module to use. See module for info.
# Governs which creation function in the ease.create module to use. See module for info.
if isinstance(text[0], basestring):
self.create_model_generic = False
else:
@@ -67,6 +70,7 @@ class ModelCreator():
else:
return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
class Grader():
def __init__(self, model_data):
self.model_data = model_data
@@ -77,6 +81,7 @@ class Grader():
else:
return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))
class GenericTest(object):
loader = DataLoader
data_path = ""
@@ -87,11 +92,11 @@ class GenericTest(object):
data_loader = self.loader(os.path.join(TEST_PATH, self.data_path))
scores, text = data_loader.load_data()
#Shuffle to mix up the classes, set seed to make it repeatable
# Shuffle to mix up the classes, set seed to make it repeatable
random.seed(1)
shuffled_scores = []
shuffled_text = []
indices = [i for i in xrange(0,len(scores))]
indices = [i for i in xrange(0, len(scores))]
random.shuffle(indices)
for i in indices:
shuffled_scores.append(scores[i])
@@ -121,12 +126,13 @@ class GenericTest(object):
self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
self.assertLessEqual(cv_mae, self.expected_mae_max)
class PolarityTest(unittest.TestCase,GenericTest):
class PolarityTest(unittest.TestCase, GenericTest):
loader = PolarityLoader
data_path = "data/polarity"
#These will increase if we allow more data in.
#I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
# These will increase if we allow more data in.
# I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
expected_kappa_min = -.2
expected_mae_max = 1
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
# Requires aspell to be installed and added to the path
from external_code.fisher import fisher
aspell_path = "aspell"
@@ -14,17 +14,18 @@ import pickle
import logging
import sys
log=logging.getLogger(__name__)
log = logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
#Paths to needed data files
# Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
"""
Defines what types of algorithm can be used
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
regression = "regression"
classification = "classifiction"
def create_model_path(model_path):
"""
Creates a path to model files
model_path - string
"""
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
model_path = "/" + model_path
if not model_path.startswith("models"):
model_path = "models" + model_path
if not model_path.endswith(".p"):
model_path+=".p"
model_path += ".p"
return model_path
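Based on the checks above, the normalization behaves roughly as follows (hypothetical inputs):

assert create_model_path("essay_model") == "models/essay_model.p"
assert create_model_path("models/essay_model.p") == "models/essay_model.p"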
def sub_chars(string):
"""
Strips illegal characters from a string. Used to sanitize input essays.
@@ -53,7 +56,7 @@ def sub_chars(string):
Returns sanitized string.
string - string
"""
#Define replacement patterns
# Define replacement patterns
sub_pat = r"[^A-Za-z\.\?!,';:]"
char_pat = r"\."
com_pat = r","
@@ -63,9 +66,9 @@ def sub_chars(string):
col_pat = r":"
whitespace_pat = r"\s{1,}"
#Replace text. Ordering is very important!
# Replace text. Ordering is very important!
nstring = re.sub(sub_pat, " ", string)
nstring = re.sub(char_pat," .", nstring)
nstring = re.sub(char_pat, " .", nstring)
nstring = re.sub(com_pat, " ,", nstring)
nstring = re.sub(ques_pat, " ?", nstring)
nstring = re.sub(excl_pat, " !", nstring)
@@ -84,7 +87,7 @@ def spell_correct(string):
string - string
"""
#Create a temp file so that aspell could be used
# Create a temp file so that aspell could be used
f = open('tmpfile', 'w')
f.write(string)
f_path = os.path.abspath(f.name)
@@ -93,16 +96,16 @@ def spell_correct(string):
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except:
log.exception("Could not find aspell, so could not spell correct!")
#Return original string if aspell fails
return string,0, string
#Aspell returns a list of incorrect words with the above flags
# Return original string if aspell fails
return string, 0, string
# Aspell returns a list of incorrect words with the above flags
incorrect = p.readlines()
p.close()
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
#Reformat aspell output to make sense
# Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()]
@@ -117,19 +120,19 @@ def spell_correct(string):
incorrect_words.append(begword)
correct_spelling.append(sug)
#Create markup based on spelling errors
# Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
already_subbed = []
for i in range(0, len(incorrect_words)):
sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring)
if incorrect_words[i] not in already_subbed:
markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
already_subbed.append(incorrect_words[i])
return newstring,len(incorrect_words),markup_string
return newstring, len(incorrect_words), markup_string
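As the header of this file notes, aspell must be installed and on the path; when it is, a hedged usage sketch of spell_correct() looks like:

import util_functions

corrected, num_errors, markup = util_functions.spell_correct("Ths is a smple essay.")
# corrected has the suggested replacements applied, num_errors counts the misspelled
# words that were found, and markup wraps each one in <bs>...</bs> tags.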
def ngrams(tokens, min_n, max_n):
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int)
med_score = numpy.median(set_score)
@@ -246,12 +249,12 @@ def edit_distance(s1, s2):
else:
cost = 1
d[(i, j)] = min(
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
)
if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
return d[lenstr1 - 1, lenstr2 - 1]
@@ -335,6 +338,7 @@ def calc_list_average(l):
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Calculates kappa correlation between rater_a and rater_b.
@@ -351,7 +355,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
if max_rating is None:
max_rating = max(rater_a + rater_b)
conf_mat = confusion_matrix(rater_a, rater_b,
min_rating, max_rating)
min_rating, max_rating)
num_ratings = len(conf_mat)
num_scored_items = float(len(rater_a))
@@ -482,4 +486,4 @@ def getMedian(numericValues):
lower = theValues[len(theValues) / 2 - 1]
upper = theValues[len(theValues) / 2]
return (float(lower + upper)) / 2
\ No newline at end of file
return (float(lower + upper)) / 2