Commit da78277e by gradyward

Cleaned up all of the files stylistically

parent a990b25e
......@@ -7,7 +7,7 @@ import sys
import logging
import numpy
#Define base path and add to sys path
# Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
......@@ -24,6 +24,7 @@ import json
#Make a log
log = logging.getLogger(__name__)
def dump_input_data(text, score):
try:
file_path = base_path + "/tests/data/json_data/"
......@@ -32,14 +33,15 @@ def dump_input_data(text, score):
filename = prefix + time_suffix + ".json"
json_data = []
for i in xrange(0, len(text)):
json_data.append({'text' : text[i], 'score' : score[i]})
json_data.append({'text': text[i], 'score': score[i]})
with open(file_path + filename, 'w+') as outfile:
json.dump(json_data, outfile)
except:
error = "Could not dump data to file."
log.exception(error)
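
For context, dump_input_data pairs each essay with its score and serializes the result to JSON. A minimal self-contained sketch of the equivalent logic (the directory and filename scheme here are illustrative; the real prefix and time suffix are defined in the elided lines above):

import json
import os
import time

def dump_input_data_sketch(text, score, file_path="tests/data/json_data/"):
    # Pair each essay with its score, mirroring the loop above
    json_data = [{'text': t, 'score': s} for t, s in zip(text, score)]
    # Timestamped filename; the real prefix/suffix scheme is elided above
    filename = "input_" + time.strftime("%Y%m%d%H%M%S") + ".json"
    with open(os.path.join(file_path, filename), 'w+') as outfile:
        json.dump(json_data, outfile)
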
def create(text,score,prompt_string, dump_data=False):
def create(text, score, prompt_string, dump_data=False):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
......@@ -53,11 +55,11 @@ def create(text,score,prompt_string, dump_data=False):
algorithm = select_algorithm(score)
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm,
'score' : score, 'text' : text, 'prompt' : prompt_string}
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
'score': score, 'text': text, 'prompt': prompt_string}
if len(text)!=len(score):
if len(text) != len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
......@@ -72,13 +74,14 @@ def create(text,score,prompt_string, dump_data=False):
log.exception(msg)
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm = algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
algorithm=algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['algorithm'] = algorithm
results['success']=True
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
......@@ -87,7 +90,7 @@ def create(text,score,prompt_string, dump_data=False):
return results
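
A hypothetical call, to make the input shapes and the result dictionary concrete (all data here is made up):

training_texts = ["The first training essay ...", "The second training essay ..."]
training_scores = [1, 2]
results = create(training_texts, training_scores, "Describe a book that changed your thinking.")
if results['success']:
    model = results['classifier']        # fitted sklearn estimator
    extractor = results['feature_ext']   # fitted feature extractor
    kappa = results['cv_kappa']          # cross-validated agreement estimate
else:
    failure_messages = results['errors']
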
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors
......@@ -99,10 +102,10 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
......@@ -120,12 +123,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
......@@ -133,11 +137,12 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
return results
def select_algorithm(score_list):
#Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score_list)))>5:
if len(util_functions.f7(list(score_list))) > 5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
......
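
The rule above keys off the number of distinct score points (f7, defined later in util_functions, is an order-preserving dedupe). Illustrative behavior, assuming the truncated body returns algorithm:

select_algorithm([0, 1, 0, 2, 1, 2])   # 3 distinct points  -> classification
select_algorithm(range(1, 12))         # 11 distinct points -> regression
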
......@@ -27,7 +27,7 @@ class EssaySet(object):
"""
Initialize variables and check essay set type
"""
if(essaytype != "train" and essaytype != "test"):
if (essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = essaytype
......@@ -52,7 +52,7 @@ class EssaySet(object):
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if(len(self._id) > 0):
if (len(self._id) > 0):
max_id = max(self._id)
else:
max_id = 0
......@@ -71,9 +71,10 @@ class EssaySet(object):
essay_text = str(essay_text)
except:
# Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
log.exception(
"Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
if isinstance(essay_score, int) and isinstance(essay_text, basestring) \
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
......@@ -83,7 +84,7 @@ class EssaySet(object):
except:
essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
cleaned_essay = util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
......@@ -113,7 +114,7 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(isinstance(prompt_text, basestring)):
if (isinstance(prompt_text, basestring)):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
......@@ -134,7 +135,7 @@ class EssaySet(object):
all_syns = []
for word in e_toks:
synonyms = util_functions.get_wordnet_syns(word)
if(len(synonyms) > max_syns):
if (len(synonyms) > max_syns):
synonyms = random.sample(synonyms, max_syns)
all_syns.append(synonyms)
new_essays = []
......
......@@ -8,7 +8,7 @@ import os
import numpy
import logging
#Append sys to base path to import the following modules
# Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
......@@ -25,7 +25,8 @@ import math
log = logging.getLogger(__name__)
def grade(grader_data,submission):
def grade(grader_data, submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
......@@ -39,10 +40,10 @@ def grade(grader_data,submission):
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
has_error = False
grader_set=EssaySet(essaytype="test")
grader_set = EssaySet(essaytype="test")
feedback = {}
model, extractor = get_classifier_and_ext(grader_data)
......@@ -53,28 +54,29 @@ def grade(grader_data,submission):
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
grader_set.add_essay(str(submission), 0)
grader_set.update_prompt(str(grader_data['prompt']))
except Exception:
error_message = "Essay could not be added to essay set:{0}".format(submission)
log.exception(error_message)
results['errors'].append(error_message)
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
try:
grader_feats=extractor.gen_feats(grader_set)
feedback=extractor.gen_feedback(grader_set,grader_feats)[0]
results['score']=int(model.predict(grader_feats)[0])
grader_feats = extractor.gen_feats(grader_set)
feedback = extractor.gen_feedback(grader_set, grader_feats)[0]
results['score'] = int(model.predict(grader_feats)[0])
except Exception:
error_message = "Could not extract features and score essay."
log.exception(error_message)
results['errors'].append(error_message)
has_error=True
has_error = True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'], grader_data['score'])
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'],
grader_data['score'])
except Exception:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -82,11 +84,11 @@ def grade(grader_data,submission):
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if( 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
if 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']:
results['score'] = 0
results['correct'] = False
results['success']=True
results['success'] = True
#Generate short form output--number of problem areas identified in feedback
......@@ -94,24 +96,25 @@ def grade(grader_data,submission):
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
'topicality': feedback['topicality'],
'prompt-overlap': feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
}
'spelling': feedback['spelling'],
'grammar': feedback['grammar'],
'markup-text': feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
results['success'] = False
return results
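
Pulling the branches above together, a successful grade() call returns a dictionary shaped roughly as follows (values are made up; the topicality and prompt-overlap keys appear only when the extractor produced them):

{
    'errors': [],
    'tests': [],
    'score': 2,
    'confidence': 0.85,
    'success': True,
    'feedback': {
        'spelling': '...',
        'grammar': '...',
        'markup-text': '...',
    },
}
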
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
......@@ -123,38 +126,38 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
has_error=False
has_error = False
#Try to find and load the model file
grader_set=predictor_set.PredictorSet(essaytype="test")
grader_set = predictor_set.PredictorSet(essaytype="test")
model, extractor = get_classifier_and_ext(grader_data)
#Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
grader_set.add_row(numeric_features, textual_features, 0)
except Exception:
error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
log.exception(error_msg)
results['errors'].append(error_msg)
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
try:
grader_feats=extractor.gen_feats(grader_set)
results['score']=model.predict(grader_feats)[0]
grader_feats = extractor.gen_feats(grader_set)
results['score'] = model.predict(grader_feats)[0]
except Exception:
error_msg = "Could not extract features and score essay."
log.exception(error_msg)
results['errors'].append(error_msg)
has_error=True
has_error = True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'],model, grader_feats, results['score'])
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
except Exception:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -164,7 +167,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return results
def get_confidence_value(algorithm,model,grader_feats,score, scores):
def get_confidence_value(algorithm, model, grader_feats, score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
......@@ -172,21 +176,23 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
grader_feats - a row of features used by the model for classification/regression
score - The score assigned to the submission by a prior model
"""
min_score=min(numpy.asarray(scores))
max_score=max(numpy.asarray(scores))
min_score = min(numpy.asarray(scores))
max_score = max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
confidence = raw_confidence
elif hasattr(model, "predict"):
raw_confidence = model.predict(grader_feats)[0]
confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)), math.ceil(float(raw_confidence)) - float(raw_confidence))
confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)),
math.ceil(float(raw_confidence)) - float(raw_confidence))
else:
confidence = 0
return confidence
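
Worked example of the regression branch: confidence is the distance from the raw prediction to the farther of its two neighboring integers, so predictions near a score point are high-confidence. A minimal sketch of just that branch:

import math

def regression_confidence(raw_prediction):
    # 0.5 when exactly between two score points, approaching 1.0 near a
    # score point (and 0.0 at an exact integer, where floor == ceil)
    raw = float(raw_prediction)
    return max(raw - math.floor(raw), math.ceil(raw) - raw)

regression_confidence(2.9)   # 0.9
regression_confidence(2.5)   # 0.5
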
def get_classifier_and_ext(grader_data):
if 'classifier' in grader_data:
model = grader_data['classifier']
......
#Provides interface functions to create and save models
# Provides interface functions to create and save models
import numpy
import re
......@@ -19,7 +19,8 @@ import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
log = logging.getLogger()
def read_in_test_data(filename):
"""
......@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
def read_in_test_data_twocolumn(filename, sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
......@@ -86,21 +88,22 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
def get_cv_error(clf, feats, scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
results = {'success': False, 'kappa': 0, 'mae': 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
results['mae'] = err
results['kappa'] = kappa
results['success'] = True
except ValueError as ex:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
......@@ -110,6 +113,7 @@ def get_cv_error(clf,feats,scores):
return results
def get_algorithms(algorithm):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
......@@ -117,14 +121,14 @@ def get_algorithms(algorithm):
"""
if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2
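
These constructors target the old scikit-learn release this code was written against; later releases renamed the keyword from learn_rate to learning_rate. For reference, an equivalent construction on a current scikit-learn would look like this (a sketch, not part of the commit):

clf = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=100, learning_rate=.05,
    max_depth=4, random_state=1, min_samples_leaf=3)
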
......@@ -134,7 +138,7 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmType
"""
if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
......@@ -142,8 +146,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
clf, clf2 = get_algorithms(algorithm)
cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
......@@ -151,8 +155,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score = predictor_set._target
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
......@@ -172,25 +176,26 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
train_feats = f.gen_feats(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
if len(util_functions.f7(list(set_score))) > 5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
clf,clf2 = get_algorithms(algorithm)
clf, clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
cv_error_results = get_cv_error(clf2, train_feats, essays._score)
try:
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
......@@ -199,16 +204,17 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
classifier is a trained classifier
model_path is the path to write the model file out to
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
pickle.dump(model_file, file=open(model_path, "w"))
def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
essay_set=create_essay_set(text,score,prompt)
feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
dump_model_to_file(prompt,feature_ext,clf,model_path)
essay_set = create_essay_set(text, score, prompt)
feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
dump_model_to_file(prompt, feature_ext, clf, model_path)
......@@ -16,17 +16,18 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
......@@ -48,13 +49,13 @@ class PredictorExtractor(object):
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
div_length=len(p_set._essay_sets)
if div_length==0:
div_length=1
div_length = len(p_set._essay_sets)
if div_length == 0:
div_length = 1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
......@@ -66,13 +67,13 @@ class PredictorExtractor(object):
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True:
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
textual_features = []
for i in xrange(0,len(p_set._essay_sets)):
for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
......
......@@ -11,26 +11,27 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, essaytype = "train"):
def __init__(self, essaytype="train"):
"""
Initialize variables and check essay set type
"""
if(essaytype != "train" and essaytype != "test"):
if (essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = essaytype
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self._essay_sets=[]
self._target = []
self._textual_features = []
self._numeric_features = []
self._essay_sets = []
def add_row(self, numeric_features, textual_features, target):
#Basic input checking
# Basic input checking
if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value."
log.exception(error_message)
......@@ -47,16 +48,16 @@ class PredictorSet(object):
raise util_functions.InputError(textual_features, error_message)
#Do some length checking for parameters
if len(self._numeric_features)>0:
numeric_length = len(self._numeric_features[-1])
if len(self._numeric_features) > 0:
numeric_length = len(self._numeric_features[-1])
current_numeric_length = len(numeric_features)
if numeric_length != current_numeric_length:
error_message = "Numeric features are an improper length."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if len(self._textual_features)>0:
textual_length = len(self._textual_features[-1])
if len(self._textual_features) > 0:
textual_length = len(self._textual_features[-1])
current_textual_length = len(textual_features)
if textual_length != current_textual_length:
error_message = "Textual features are an improper length."
......@@ -65,7 +66,7 @@ class PredictorSet(object):
#Now check to see if text features and numeric features are individually correct
for i in xrange(0,len(numeric_features)):
for i in xrange(0, len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except:
......@@ -73,8 +74,7 @@ class PredictorSet(object):
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
for i in xrange(0,len(textual_features)):
for i in xrange(0, len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except:
......@@ -83,8 +83,8 @@ class PredictorSet(object):
raise util_functions.InputError(textual_features, error_message)
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))
#Add numeric and textual features
......@@ -95,6 +95,6 @@ class PredictorSet(object):
self._target.append(target)
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
for i in xrange(0, len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
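
A hypothetical add_row usage, to make the expected argument shapes concrete (feature values are made up): each row supplies one list of numeric predictors, one list of textual predictors, and a numeric target, with lengths consistent across rows.

pset = PredictorSet(essaytype="train")
pset.add_row([12.5, 3.0], ["first text field", "second text field"], 1)
pset.add_row([10.0, 4.5], ["another text field", "and another"], 0)
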
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from fisher import pvalue
......@@ -15,17 +15,18 @@ import logging
import sys
import tempfile
log=logging.getLogger(__name__)
log = logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
#Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
"""
Defines what types of algorithm can be used
......@@ -33,20 +34,22 @@ class AlgorithmTypes(object):
regression = "regression"
classification = "classifiction"
def create_model_path(model_path):
"""
Creates a path to model files
model_path - string
"""
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
model_path = "/" + model_path
if not model_path.startswith("models"):
model_path = "models" + model_path
if not model_path.endswith(".p"):
model_path+=".p"
model_path += ".p"
return model_path
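
Tracing the three normalization steps gives, for example:

create_model_path("my_model")      # -> "models/my_model.p"
create_model_path("models/m1.p")   # -> "models/m1.p"  (already normalized)
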
def sub_chars(string):
"""
Strips illegal characters from a string. Used to sanitize input essays.
......@@ -66,7 +69,7 @@ def sub_chars(string):
#Replace text. Ordering is very important!
nstring = re.sub(sub_pat, " ", string)
nstring = re.sub(char_pat," .", nstring)
nstring = re.sub(char_pat, " .", nstring)
nstring = re.sub(com_pat, " ,", nstring)
nstring = re.sub(ques_pat, " ?", nstring)
nstring = re.sub(excl_pat, " !", nstring)
......@@ -101,7 +104,7 @@ def spell_correct(string):
except Exception:
log.exception("aspell process failed; could not spell check")
# Return original string if aspell fails
return string,0, string
return string, 0, string
finally:
f.close()
......@@ -109,7 +112,7 @@ def spell_correct(string):
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
if (len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
......@@ -128,16 +131,16 @@ def spell_correct(string):
#Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
already_subbed = []
for i in range(0, len(incorrect_words)):
sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring)
if incorrect_words[i] not in already_subbed:
markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
already_subbed.append(incorrect_words[i])
return newstring,len(incorrect_words),markup_string
return newstring, len(incorrect_words), markup_string
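
So spell_correct returns the corrected text, the number of misspellings, and a copy with each misspelled word wrapped in <bs> tags. Hypothetical behavior, assuming aspell flags "recieve" and suggests "receive":

spell_correct("I recieve mail")
# -> ("I receive mail", 1, "I <bs>recieve</bs> mail")
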
def ngrams(tokens, min_n, max_n):
......@@ -162,6 +165,7 @@ def f7(seq):
"""
seen = set()
seen_add = seen.add
#TODO Potential Improvement Here
return [x for x in seq if x not in seen and not seen_add(x)]
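
f7 deduplicates while preserving first-seen order: seen_add(x) always returns None (falsy), so it records x as a side effect without filtering it out, and binding seen.add to a local name avoids an attribute lookup per element. For example:

f7([3, 1, 3, 2, 1])   # -> [3, 1, 2]
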
......@@ -200,12 +204,12 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int)
med_score = numpy.median(set_score)
new_score = set_score
if(med_score == 0):
if (med_score == 0):
med_score = 1
new_score[set_score < med_score] = 0
new_score[set_score >= med_score] = 1
......@@ -223,7 +227,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals.append(fish_val)
cutoff = 1
if(len(fish_vals) > max_feats2):
if (len(fish_vals) > max_feats2):
cutoff = sorted(fish_vals)[max_feats2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
......@@ -253,12 +257,12 @@ def edit_distance(s1, s2):
else:
cost = 1
d[(i, j)] = min(
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
)
if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
return d[lenstr1 - 1, lenstr2 - 1]
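
The hunk above shows only the inner recurrence; a self-contained sketch of the optimal-string-alignment (Damerau-Levenshtein) distance it implements, with the standard border initialization filled in as an assumption:

def damerau_levenshtein_sketch(s1, s2):
    d = {}
    # Border cells: transforming a prefix to or from the empty string
    for i in range(-1, len(s1)):
        d[(i, -1)] = i + 1
    for j in range(-1, len(s2)):
        d[(-1, j)] = j + 1
    for i in range(len(s1)):
        for j in range(len(s2)):
            cost = 0 if s1[i] == s2[j] else 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,            # deletion
                d[(i, j - 1)] + 1,            # insertion
                d[(i - 1, j - 1)] + cost,     # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[(i - 2, j - 2)] + cost)  # transposition
    return d[(len(s1) - 1, len(s2) - 1)]

damerau_levenshtein_sketch("ab", "ba")   # 1 -- a single transposition
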
......@@ -299,7 +303,7 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
preds.append(list(sim_fit.predict(arr[chunks[i]])))
all_preds = list(chain(*preds))
return(all_preds)
return (all_preds)
def gen_model(clf, arr, sel_score):
......@@ -312,7 +316,7 @@ def gen_model(clf, arr, sel_score):
"""
set_score = numpy.asarray(sel_score, dtype=numpy.int)
sim_fit = clf.fit(arr, set_score)
return(sim_fit)
return (sim_fit)
def gen_preds(clf, arr):
......@@ -322,7 +326,7 @@ def gen_preds(clf, arr):
arr is a data array identical in dimension to the array clf was trained on
Returns the array of predictions.
"""
if(hasattr(clf, "predict_proba")):
if (hasattr(clf, "predict_proba")):
ret = clf.predict(arr)
# pred_score=preds.argmax(1)+min(x._score)
else:
......@@ -340,8 +344,10 @@ def calc_list_average(l):
total += value
return total / len(l)
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
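
The stdev lambda computes a Bessel-corrected sample standard deviation; an equivalent spelled-out version, for readability:

import math

def sample_stdev(d):
    mean = sum(d) / float(len(d))
    # n - 1 in the denominator: sample (not population) variance
    return math.sqrt(sum((x - mean) ** 2 for x in d) / (len(d) - 1))
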
def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Calculates kappa correlation between rater_a and rater_b.
......@@ -352,7 +358,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert(len(rater_a) == len(rater_b))
assert (len(rater_a) == len(rater_b))
rater_a = [int(a) for a in rater_a]
rater_b = [int(b) for b in rater_b]
if min_rating is None:
......@@ -360,7 +366,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
if max_rating is None:
max_rating = max(rater_a + rater_b)
conf_mat = confusion_matrix(rater_a, rater_b,
min_rating, max_rating)
min_rating, max_rating)
num_ratings = len(conf_mat)
num_scored_items = float(len(rater_a))
......@@ -370,7 +376,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
numerator = 0.0
denominator = 0.0
if(num_ratings > 1):
if (num_ratings > 1):
for i in range(num_ratings):
for j in range(num_ratings):
expected_count = (hist_rater_a[i] * hist_rater_b[j]
......@@ -390,7 +396,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
A confusion matrix shows how often 2 values agree and disagree
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
assert (len(rater_a) == len(rater_b))
rater_a = [int(a) for a in rater_a]
rater_b = [int(b) for b in rater_b]
min_rating = int(min_rating)
......@@ -450,7 +456,7 @@ def get_separator_words(toks1):
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
if (os.path.isfile(ESSAY_COR_TOKENS_PATH)):
toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
else:
essay_corpus = open(ESSAY_CORPUS_PATH).read()
......@@ -460,12 +466,12 @@ def get_separator_words(toks1):
sep_words = []
for word in tab_toks1.keys():
tok1_present = tab_toks1[word]
if(tok1_present > 2):
if (tok1_present > 2):
tok1_total = tab_toks1._N
tok2_present = toks2[word]
tok2_total = toks2._N
fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
if (fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
sep_words.append(word)
sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
return sep_words
......