Commit d62d57a7 by Vik Paruchuri

Incorporate confidence value into generic and regular

parent 5378393b
@@ -6,6 +6,7 @@
 import os
 import sys
 import logging
 from statsd import statsd
+import numpy

 #Define base path and add to sys path
 base_path = os.path.dirname(__file__)
@@ -35,7 +36,7 @@ def create(text,score,prompt_string,model_path = None):
     #Initialize a results dictionary to return
     results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-    'feature_ext' : "", 'classifier' : ""}
+    'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification}

     if len(text)!=len(score):
         msg = "Target and text lists must be same length."
@@ -43,6 +44,15 @@ def create(text,score,prompt_string,model_path = None):
         log.exception(msg)
         return results

+    #Decide what algorithm to use (regression or classification)
+    try:
+        if len(util_functions.f7(list(score)))>5:
+            type = util_functions.AlgorithmTypes.regression
+        else:
+            type = util_functions.AlgorithmTypes.classification
+    except:
+        type = util_functions.AlgorithmTypes.regression
+
     try:
         #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
         e_set = model_creator.create_essay_set(text, score, prompt_string)
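The new block above picks the training algorithm from the shape of the target: more than five distinct score points is taken to mean a continuous scale (regression), otherwise a small rubric (classification). util_functions.f7 is not shown in this commit; assuming it is the common order-preserving de-duplication recipe, a minimal standalone sketch of the heuristic (illustrative names, not the project's API):

def unique_preserving_order(seq):
    # De-duplicate while keeping first occurrences (the classic "f7" recipe).
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]

def pick_algorithm(scores, threshold=5):
    # More than `threshold` distinct score points -> treat the target as
    # continuous and use regression; otherwise classify into score classes.
    try:
        distinct = unique_preserving_order(list(scores))
        return "regression" if len(distinct) > threshold else "classification"
    except TypeError:
        # Mirror the commit's broad fallback: default to regression on error.
        return "regression"

assert pick_algorithm([0, 1, 2, 3, 2, 1]) == "classification"  # 4 distinct points
assert pick_algorithm(range(20)) == "regression"

Note that the commit binds its result to the name type, which shadows the Python builtin of the same name inside create().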
@@ -52,11 +62,12 @@ def create(text,score,prompt_string,model_path = None):
         log.exception(msg)

     try:
         #Gets features from the essay set and computes error
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
         results['cv_kappa']=cv_error_results['kappa']
         results['cv_mean_absolute_error']=cv_error_results['mae']
         results['feature_ext']=feature_ext
         results['classifier']=classifier
+        results['algorithm'] = type
         results['success']=True
     except:
         msg = "feature extraction and model creation failed."
@@ -78,6 +89,7 @@ def create_generic(numeric_values, textual_values, target, model_path = None, al
     (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
     target - The variable that we are trying to predict. A list of integers.
     model_path - deprecated, kept for legacy code. Do not use.
+    algorithm - the type of algorithm that will be used
     """

     #Initialize a result dictionary to return.
......
@@ -35,6 +35,7 @@ def grade(grader_data,grader_config,submission):
         'model' : trained model,
         'extractor' : trained feature extractor,
         'prompt' : prompt for the question,
+        'algorithm' : algorithm for the question,
         }
     grader_config - Legacy, kept for compatibility with old code. Need to remove.
     submission - The student submission (string)
@@ -46,6 +47,10 @@ def grade(grader_data,grader_config,submission):
     grader_set=EssaySet(type="test")

+    #This is to preserve legacy functionality
+    if 'algorithm' not in grader_data:
+        grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
+
     try:
         #Try to add essay to essay set object
         grader_set.add_essay(str(submission),0)
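Serialized models created before this commit carry no 'algorithm' key, so grading data without one is assumed to come from a classifier. An equivalent, slightly more idiomatic way to express the same fallback:

grader_data.setdefault('algorithm', util_functions.AlgorithmTypes.classification)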
@@ -65,11 +70,7 @@ def grade(grader_data,grader_config,submission):
     #Try to determine confidence level
     try:
-        min_score=min(numpy.asarray(grader_data['score']))
-        max_score=max(numpy.asarray(grader_data['score']))
-        raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
-        #TODO: Normalize confidence somehow here
-        results['confidence']=raw_confidence
+        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
     except:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")
@@ -112,6 +113,17 @@ def grade(grader_data,grader_config,submission):
     return results

 def grade_generic(grader_data, grader_config, numeric_features, textual_features):
+    """
+    Grades a set of numeric and textual features using a generic model
+    grader_data -- dictionary containing:
+    {
+        'algorithm' - Type of algorithm to use to score
+    }
+    grader_config - legacy, kept for compatibility with old code. Need to remove.
+    numeric_features - list of numeric features to predict on
+    textual_features - list of textual features to predict on
+    """
     results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}

     has_error=False
@@ -137,16 +149,7 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
     #Try to determine confidence level
     try:
-        min_score=min(numpy.asarray(grader_data['score']))
-        max_score=max(numpy.asarray(grader_data['score']))
-        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
-            raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
-            #TODO: Normalize confidence somehow here
-            results['confidence']=raw_confidence
-        else:
-            raw_confidence = grader_data['model'].predict(grader_feats)[0]
-            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
-            results['confidence'] = confidence
+        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
     except:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")
@@ -159,3 +162,17 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
     results['success'] = True

     return results
+
+def get_confidence_value(algorithm,model,grader_feats,score):
+    min_score=min(numpy.asarray(score))
+    max_score=max(numpy.asarray(score))
+    if algorithm == util_functions.AlgorithmTypes.classification:
+        raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
+        #TODO: Normalize confidence somehow here
+        confidence=raw_confidence
+    else:
+        raw_confidence = model.predict(grader_feats)[0]
+        confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
+    return confidence
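get_confidence_value unifies the two confidence paths: for classification it reads the probability the model assigned to the predicted class out of predict_proba (with class columns offset by the lowest score), and for regression it measures how close the raw prediction sits to a whole score, e.g. a raw prediction of 2.8 gives max(2.8 - 2, 3 - 2.8) = 0.8. One caveat: the helper takes min/max over its score argument, but both call sites pass the single predicted score (results['score']), whereas the pre-refactor inline code took the minimum over the full training score list (grader_data['score']), so this looks like a refactoring slip. A corrected sketch of the apparent intent (illustrative names; model is assumed to be a scikit-learn style estimator whose classes are consecutive integers, since predict_proba columns follow model.classes_):

import math
import numpy as np

def confidence_for(algorithm, model, feats, predicted_score, training_scores):
    if algorithm == "classification":
        # Probability assigned to the predicted class; class columns are
        # offset by the lowest score seen in training.
        min_score = int(np.min(training_scores))
        return model.predict_proba(feats)[0, predicted_score - min_score]
    # Regression: distance from the farther adjacent integer, so raw
    # predictions near a whole score count as confident (2.8 -> 0.8).
    raw = float(model.predict(feats)[0])
    return max(raw - math.floor(raw), math.ceil(raw) - raw)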
@@ -141,7 +141,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
     return f, clf, cv_error_results

-def extract_features_and_generate_model(essays,additional_array=None):
+def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
     """
     Feed in an essay set to get feature vector and classifier
     essays must be an essay set object
@@ -153,9 +153,6 @@ def extract_features_and_generate_model(essays,additional_array=None):
     f.initialize_dictionaries(essays)

     train_feats = f.gen_feats(essays)
-    if(additional_array!=None and type(additional_array)==type(numpy.array([1]))):
-        if(additional_array.shape[0]==train_feats.shape[0]):
-            train_feats=numpy.concatenate((train_feats,additional_array),axis=1)

     set_score = numpy.asarray(essays._score, dtype=numpy.int)
     if len(util_functions.f7(list(set_score)))>5:
......
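In model_creator, the unused additional_array concatenation is dropped in favor of an explicit type parameter; notably, the removed lines called the builtin type(), which the new parameter would have shadowed. The diff truncates before showing how type is consumed, but the surviving context repeats the same more-than-five-distinct-scores check. A plausible sketch of the branch, assuming the scikit-learn gradient boosting estimators used elsewhere in this project (hypothetical helper, invented hyperparameters; the real code compares against util_functions.AlgorithmTypes):

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

def estimator_for(algorithm):
    # Map the algorithm flag to an estimator; the real construction lives in
    # model_creator and is outside this diff.
    if algorithm == "classification":
        return GradientBoostingClassifier(n_estimators=100)
    return GradientBoostingRegressor(n_estimators=100)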