Commit d62d57a7 by Vik Paruchuri

Incorporate confidence value into generic and regular grading

parent 5378393b
......@@ -6,6 +6,7 @@ import os
import sys
import logging
from statsd import statsd
import numpy
#Define base path and add to sys path
base_path = os.path.dirname(__file__)
......@@ -35,7 +36,7 @@ def create(text,score,prompt_string,model_path = None):
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""}
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
......@@ -43,6 +44,15 @@ def create(text,score,prompt_string,model_path = None):
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
try:
if len(util_functions.f7(list(score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
except:
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
......@@ -52,11 +62,12 @@ def create(text,score,prompt_string,model_path = None):
log.exception(msg)
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['algorithm'] = type
results['success']=True
except:
msg = "feature extraction and model creation failed."
......@@ -78,6 +89,7 @@ def create_generic(numeric_values, textual_values, target, model_path = None, al
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
model_path - deprecated, kept for legacy code. Do not use.
algorithm - the type of algorithm that will be used
"""
#Initialize a result dictionary to return.
......
......@@ -35,6 +35,7 @@ def grade(grader_data,grader_config,submission):
'model' : trained model,
'extractor' : trained feature extractor,
'prompt' : prompt for the question,
'algorithm' : algorithm for the question,
}
grader_config - Legacy, kept for compatibility with old code. Need to remove.
submission - The student submission (string)
......@@ -46,6 +47,10 @@ def grade(grader_data,grader_config,submission):
grader_set=EssaySet(type="test")
#This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
......@@ -65,11 +70,7 @@ def grade(grader_data,grader_config,submission):
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -112,6 +113,17 @@ def grade(grader_data,grader_config,submission):
return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
grader_data -- dictionary containing:
{
'algorithm' - Type of algorithm to use to score
}
grader_config - legacy, kept for compatibility with old code. Need to remove.
numeric_features - list of numeric features to predict on
textual_features - list of textual feature to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
......@@ -137,16 +149,7 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
else:
raw_confidence = grader_data['model'].predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
results['confidence'] = confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -159,3 +162,17 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
results['success'] = True
return results
def get_confidence_value(algorithm, model, grader_feats, score, scores=None):
    """
    Determine a confidence value for the score produced by a trained model.

    algorithm - one of util_functions.AlgorithmTypes (classification or regression).
    model - trained sklearn-style estimator (predict / predict_proba).
    grader_feats - feature matrix for the submission being scored.
    score - the predicted score for the submission (callers pass a single
            integer, e.g. results['score']).
    scores - optional list/array of all possible (training) scores; used to
             locate the predict_proba column for the predicted class. Defaults
             to `score` for backward compatibility with existing callers.
    returns - a raw (un-normalized) float confidence value.
    """
    if scores is None:
        scores = score
    # numpy.amin works for both scalars and sequences; the builtin min()
    # raises TypeError on the 0-d array produced when callers pass a single
    # predicted score, which made this function fail silently via the
    # callers' try/except.
    min_score = numpy.amin(numpy.asarray(scores))
    if algorithm == util_functions.AlgorithmTypes.classification:
        # Probability the model assigned to the predicted class; scores are
        # offset by min_score so the lowest score maps to column 0.
        # NOTE(review): this offset is only meaningful when `scores` is the
        # full training-score list — confirm callers pass it.
        raw_confidence = model.predict_proba(grader_feats)[0, (score - min_score)]
        # TODO: Normalize confidence somehow here
        confidence = raw_confidence
    else:
        # For regression, use the distance of the raw prediction from the
        # nearest integer as a rough confidence proxy (closer to an integer
        # means higher confidence, up to 0.5).
        raw_confidence = model.predict(grader_feats)[0]
        confidence = max(raw_confidence - math.floor(raw_confidence),
                         math.ceil(raw_confidence) - raw_confidence)
    return confidence
......@@ -141,7 +141,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None):
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
......@@ -153,9 +153,6 @@ def extract_features_and_generate_model(essays,additional_array=None):
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
if(additional_array!=None and type(additional_array)==type(numpy.array([1]))):
if(additional_array.shape[0]==train_feats.shape[0]):
train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment