Commit ae7ab0f0 by Vik Paruchuri

Make generic ML grading

parent eb757829
......@@ -53,9 +53,9 @@ def create(text,score,prompt_string,model_path):
    return results

def create_generic(numeric_values, textual_values, target, model_path, algorithm = model_creator.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
               'feature_ext' : "", 'classifier' : ""}
               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
        msg = "Target, numeric features, and text features must all be the same length."
......
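For context, a minimal caller-side sketch of how create_generic might be invoked after this change. The import paths, feature values, and model_path below are illustrative assumptions; only the signature and the returned keys come from the hunk above.

    import create
    import util_functions   # assumed importable from the repository root, as in create.py

    # Hypothetical training data: parallel lists of numeric features, textual features, and scores
    numeric_values = [[1, 52.0], [3, 12.5], [0, 78.2]]
    textual_values = [["first response"], ["second response"], ["third response"]]
    target = [1, 0, 2]

    results = create.create_generic(numeric_values, textual_values, target,
                                    model_path="models/generic_example.p",   # hypothetical path
                                    algorithm=util_functions.AlgorithmTypes.classification)
    if results['success']:
        print(results['cv_kappa'], results['cv_mean_absolute_error'])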
......@@ -16,11 +16,14 @@ base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
......@@ -100,4 +103,51 @@ def grade(grader_data,grader_config,submission):
    return results

def grade_generic(grader_data, grader_config, numeric_features, textual_features):
    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
    has_error=False

    #Set up a predictor set to hold the submission's features
    grader_set=predictor_set.PredictorSet(type="test")

    #Try to add the numeric and textual features to the predictor set object
    try:
        grader_set.add_row(numeric_features, textual_features,0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
        has_error=True

    #Try to extract features from submission and assign score via the model
    try:
        grader_feats=grader_data['extractor'].gen_feats(grader_set)
        results['score']=grader_data['model'].predict(grader_feats)[0]
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error=True

    #Try to determine confidence level
    try:
        min_score=min(numpy.asarray(grader_data['score']))
        max_score=max(numpy.asarray(grader_data['score']))
        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
            raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
            #TODO: Normalize confidence somehow here
            results['confidence']=raw_confidence
        else:
            raw_confidence = grader_data['model'].predict(grader_feats)[0]
            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
            results['confidence'] = confidence
    except:
        #If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    #Count number of successful/unsuccessful gradings
    statsd.increment("open_ended_assessment.machine_learning.grader_count",
                     tags=["success:{0}".format(results['success'])])

    if not has_error:
        results['success'] = True

    return results
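A hedged usage sketch for the new grade_generic. It assumes grader_data has already been pickled with the keys the body above reads ('model', 'extractor', 'score', 'algorithm'); the file name and feature values are invented, and grader_config is accepted but unused in the code shown.

    import pickle
    import grade   # assumed importable from the repository root

    with open("models/generic_example.p", "rb") as f:    # hypothetical model file
        grader_data = pickle.load(f)

    numeric_features = [2, 41.7]            # one row, shaped like a training row
    textual_features = ["a new response"]

    results = grade.grade_generic(grader_data, grader_config={},
                                  numeric_features=numeric_features,
                                  textual_features=textual_features)

    # For regression, confidence reflects how close the raw prediction sits to a whole
    # score (a raw prediction of 2.8 gives confidence 0.8); for classification it is the
    # predicted class probability from predict_proba.
    print(results['score'], results['confidence'], results['errors'])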
......@@ -21,10 +21,6 @@ import predictor_extractor
log=logging.getLogger()
class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"

def read_in_test_data(filename):
    """
    Reads in test data file found at filename.
......@@ -107,16 +103,16 @@ def get_cv_error(clf,feats,scores):
    return results

def extract_features_and_generate_model_predictors(predictor_set, type=AlgorithmTypes.regression):
    if(algorithm not in [AlgorithmTypes.regression, AlgorithmTypes.classification]):
        algorithm = AlgorithmTypes.regression
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression

    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)

    train_feats = f.gen_feats(predictor_set)

    if type = AlgorithmTypes.classification:
    if type == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1,min_samples_leaf=3)
        clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
......
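The hunk above is cut off before the regression branch. Below is a minimal sketch of the branching it implies, assuming the regression path mirrors the classification path with GradientBoostingRegressor; the hyperparameters are copied from the snippet (current scikit-learn spells learn_rate as learning_rate, which is what the sketch uses).

    import sklearn.ensemble
    import util_functions   # as referenced in model_creator.py after this change

    def pick_estimator(algorithm):
        # Classification path, mirroring the parameters in the hunk above
        if algorithm == util_functions.AlgorithmTypes.classification:
            return sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                                               max_depth=4, random_state=1, min_samples_leaf=3)
        # Regression path (assumed, not shown above): same hyperparameters, regressor variant
        return sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learning_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)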
......@@ -17,6 +17,10 @@ import logging
log=logging.getLogger(__name__)
class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"

def create_model_path(model_path):
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path="/" + model_path
......
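The create_model_path hunk is also truncated; only its first normalization step is visible. A standalone sketch of just that step (what the full function does afterwards is not shown and is not assumed here):

    def normalize_model_path(model_path):
        # Relative paths that do not already start with "/" or "models/" get a leading slash
        if not model_path.startswith("/") and not model_path.startswith("models/"):
            model_path = "/" + model_path
        return model_path

    print(normalize_model_path("my_model.p"))         # -> "/my_model.p"
    print(normalize_model_path("models/my_model.p"))  # unchanged
    print(normalize_model_path("/tmp/my_model.p"))    # unchanged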