Commit 2e81fda2 by VikParuchuri

Merge pull request #7 from MITx/vik/deployment_work

Vik/deployment work
Parents: 434571f6, ae7ab0f0
@@ -11,6 +11,8 @@ sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
@@ -19,6 +21,13 @@ def create(text,score,prompt_string,model_path):
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': ""}

    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
@@ -43,3 +52,40 @@ def create(text,score,prompt_string,model_path):
    return results


def create_generic(numeric_values, textual_values, target, model_path, algorithm=util_functions.AlgorithmTypes.regression):
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        #Add each row of numeric and textual features, with its target, to a predictor set
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        #Without a predictor set there is nothing to train on, so return early
        return results

    try:
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

    #Count number of successful/unsuccessful creations
    statsd.increment("open_ended_assessment.machine_learning.creator_count",
                     tags=["success:{0}".format(results['success'])])

    return results
\ No newline at end of file
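For reference, a minimal usage sketch of create_generic. The feature values and targets below are invented for illustration, this module is assumed to be importable as create, and model_path is passed through unused in the code shown above:

#Hypothetical call to create_generic; all values are invented
import create
import util_functions

numeric_values = [[1.0, 20.0], [3.0, 5.0], [2.0, 12.0]]   #one list of numeric features per submission
textual_values = [["first answer"], ["second answer"], ["third answer"]]   #one list of strings per submission
target = [0, 1, 1]   #one integer score per submission

results = create.create_generic(numeric_values, textual_values, target, model_path="",
                                algorithm=util_functions.AlgorithmTypes.classification)
print results['success'], results['cv_kappa'], results['errors']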
@@ -32,7 +32,7 @@ class FeatureExtractor(object):
        self._spell_errors_per_character = 0
        self._grammar_errors_per_character = 0

-   def initialize_dictionaries(self, e_set):
+   def initialize_dictionaries(self, e_set, max_feats2=200):
        """
        Initializes dictionaries from an essay set object
        Dictionaries must be initialized prior to using this to extract features
@@ -41,8 +41,8 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
-               nvocab = util_functions.get_vocab(e_set._text, e_set._score)
-               svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
+               nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2=max_feats2)
+               svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                self.dict_initialized = True
......
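The new max_feats2 argument caps how many vocabulary features get_vocab returns per essay set. PredictorExtractor (further down in this diff) splits a fixed budget of 200 features evenly across the essay sets; a small sketch of that arithmetic, with the helper name invented here:

#Illustrative helper (not in the repo): how the 200-feature budget is divided
import math

def vocab_budget_per_set(num_essay_sets, total_feats=200):
    div_length = num_essay_sets if num_essay_sets > 0 else 1
    return int(math.floor(total_feats / div_length))

print vocab_budget_per_set(1)   #200, matching the old fixed vocabulary size
print vocab_budget_per_set(4)   #50 features per essay set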
@@ -16,11 +16,14 @@ base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
@@ -100,6 +103,51 @@ def grade(grader_data,grader_config,submission):
    return results


def grade_generic(grader_data, grader_config, numeric_features, textual_features):
    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}

    has_error = False

    #Create a predictor set to hold this submission's features
    grader_set = predictor_set.PredictorSet(type="test")

    #Try to add the submission's features to the predictor set object
    try:
        grader_set.add_row(numeric_features, textual_features, 0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
        has_error = True

    #Try to extract features from submission and assign score via the model
    try:
        grader_feats = grader_data['extractor'].gen_feats(grader_set)
        results['score'] = grader_data['model'].predict(grader_feats)[0]
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error = True

    #Try to determine confidence level
    try:
        min_score = min(numpy.asarray(grader_data['score']))
        max_score = max(numpy.asarray(grader_data['score']))
        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
            raw_confidence = grader_data['model'].predict_proba(grader_feats)[0, (results['score'] - min_score)]
            #TODO: Normalize confidence somehow here
            results['confidence'] = raw_confidence
        else:
            #For regression, use distance from the nearest integer as a rough confidence proxy
            raw_confidence = grader_data['model'].predict(grader_feats)[0]
            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
            results['confidence'] = confidence
    except:
        #If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    if not has_error:
        results['success'] = True

    #Count number of successful/unsuccessful gradings; success must be set before the metric is emitted
    statsd.increment("open_ended_assessment.machine_learning.grader_count",
                     tags=["success:{0}".format(results['success'])])

    return results
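A sketch of calling grade_generic, assuming a grader_data dict carrying the keys the function reads ('extractor', 'model', 'score', 'algorithm'); trained_extractor and trained_model are placeholders for the objects produced during training:

#Hypothetical grading call; grader_data would normally be unpickled from a model file
grader_data = {
    'extractor': trained_extractor,   #PredictorExtractor fitted during training
    'model': trained_model,           #fitted sklearn estimator
    'score': [0, 1, 2],               #training targets, used here for the confidence range
    'algorithm': util_functions.AlgorithmTypes.classification,
}
results = grade_generic(grader_data, {}, [1.0, 20.0], ["student response text"])
print results['score'], results['confidence'], results['errors']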
@@ -17,6 +17,7 @@ from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
@@ -102,6 +103,39 @@ def get_cv_error(clf,feats,scores):
    return results


def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    #Fall back to regression if an unknown algorithm is requested
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression

    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)

    train_feats = f.gen_feats(predictor_set)

    #clf is fit on the full data; clf2 is an identical estimator used only for cross-validation error
    if algorithm == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                           max_depth=4, random_state=1, min_samples_leaf=3)
    else:
        clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                         max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)

    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

    try:
        set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
        clf.fit(train_feats, set_score)
    except ValueError:
        #Fitting fails when every target is the same class, so force two classes into the sample
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
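A toy end-to-end training call for the function above, assuming predictor_set and util_functions are importable as in the other modules in this diff; the rows are invented, and a real run would need enough examples per class for cross-validation to be meaningful:

#Illustrative training flow with invented rows
pset = predictor_set.PredictorSet(type="train")
rows = [([1.0], ["short answer one"], 0),
        ([2.0], ["short answer two"], 1),
        ([1.5], ["short answer three"], 1)]
for numeric, textual, target in rows:
    pset.add_row(numeric, textual, target)

extractor, model, cv_error_results = extract_features_and_generate_model_predictors(
    pset, algorithm=util_functions.AlgorithmTypes.classification)
print cv_error_results['kappa'], cv_error_results['mae']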
def extract_features_and_generate_model(essays,additional_array=None):
"""
......
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []
        self._initialized = False

    def initialize_dictionaries(self, p_set):
        success = False
        if not (hasattr(p_set, '_type')):
            error_message = "needs to be a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        if not (p_set._type == "train"):
            error_message = "needs to be a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #Divide the fixed 200-feature vocabulary budget evenly across the essay sets
        div_length = len(p_set._essay_sets)
        if div_length == 0:
            div_length = 1

        max_feats2 = int(math.floor(200 / div_length))
        for i in xrange(0, len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
            self._initialized = True
            success = True
        return success

    def gen_feats(self, p_set):
        if self._initialized != True:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #One block of textual features per essay set, concatenated column-wise
        textual_features = []
        for i in xrange(0, len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

        textual_matrix = numpy.concatenate(textual_features, axis=1)
        predictor_matrix = numpy.array(p_set._numeric_features)

        #Log matrix shapes for debugging
        log.debug(textual_matrix.shape)
        log.debug(predictor_matrix.shape)

        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)

        return overall_matrix.copy()
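The concatenation in gen_feats is easiest to see with concrete shapes; the sizes below are invented:

#Shape bookkeeping for gen_feats, with invented sizes
import numpy

textual_features = [numpy.zeros((10, 55)), numpy.zeros((10, 55))]   #two essay sets, 55 columns each
textual_matrix = numpy.concatenate(textual_features, axis=1)        #(10, 110)
predictor_matrix = numpy.zeros((10, 3))                             #three numeric features per row
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
print overall_matrix.shape                                          #(10, 113)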
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
class PredictorSet(object):
    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type
        """
        if(type != "train" and type != "test"):
            type = "train"

        self._type = type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
        #Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)
            raise util_functions.InputError(target, error_message)

        if not isinstance(numeric_features, list):
            error_message = "Numeric features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)

        if not isinstance(textual_features, list):
            error_message = "Textual features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

        #Do some length checking for parameters
        if len(self._numeric_features) > 0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
                error_message = "Numeric features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        if len(self._textual_features) > 0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
                error_message = "Textual features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Now check to see if text features and numeric features are individually correct
        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except:
                error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        for i in xrange(0, len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except:
                error_message = "Textual feature {0} not string.".format(textual_features[i])
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Create essay sets for textual features if needed
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))

        #Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

        #Add targets
        self._target.append(target)

        #Add textual features to essay sets
        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
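A minimal sketch of PredictorSet usage with invented rows; every row must keep the same number of numeric and textual features as the first, or add_row raises util_functions.InputError:

#Hypothetical usage; feature values are invented
pset = predictor_set.PredictorSet(type="train")
pset.add_row([1.0, 20.0], ["first response"], 0)
pset.add_row([2.0, 15.0], ["second response"], 1)
#The next call would raise util_functions.InputError (one numeric feature instead of two):
#pset.add_row([3.0], ["third response"], 1)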
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)
import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME = "sa_data.tsv"
#Assumed minimum text length for a row to be included (0 keeps all rows); t_len was undefined here
t_len = 0

sa_val = file(FILENAME)
scores = []
texts = []
lines = sa_val.readlines()

pset = predictor_set.PredictorSet(type="train")
for i in xrange(1, len(lines)):
    score, text = lines[i].split("\t\"")
    if len(text) > t_len:
        scores.append(int(score))
        texts.append(text)
        pset.add_row([1], [text], int(score))

extractor = predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats = extractor.gen_feats(pset)

clf = GradientBoostingClassifier(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)
cv_preds = util_functions.gen_cv_preds(clf, train_feats, scores)
err = numpy.mean(numpy.abs(numpy.array(cv_preds) - numpy.array(scores)))
print err

kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
print kappa
\ No newline at end of file
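For context on the two numbers the script prints, a toy sanity check with invented predictions; quadratic weighted kappa rewards agreement between predicted and actual scores and penalizes larger disagreements more heavily:

#Toy check of the metrics printed above (invented values)
import numpy
import util_functions

preds = [0, 1, 1, 2]
actual = [0, 1, 2, 2]
print numpy.mean(numpy.abs(numpy.array(preds) - numpy.array(actual)))   #mean absolute error: 0.25
print util_functions.quadratic_weighted_kappa(preds, actual)            #below 1.0 due to the one disagreement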
@@ -17,6 +17,10 @@ import logging

log = logging.getLogger(__name__)


class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"


def create_model_path(model_path):
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path = "/" + model_path
......