Commit 2e81fda2 by VikParuchuri

Merge pull request #7 from MITx/vik/deployment_work

Vik/deployment work
Parents: 434571f6, ae7ab0f0
@@ -11,6 +11,8 @@ sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
@@ -19,6 +21,13 @@ def create(text,score,prompt_string,model_path):
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': ""}

    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
@@ -43,3 +52,40 @@ def create(text,score,prompt_string,model_path):
    return results


def create_generic(numeric_values, textual_values, target, model_path, algorithm=util_functions.AlgorithmTypes.regression):
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        #Add each row of numeric and textual features, with its target, to a predictor set
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        #Without a predictor set there is nothing to train on, so return early
        return results

    try:
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

    #Count number of successful/unsuccessful creations
    statsd.increment("open_ended_assessment.machine_learning.creator_count",
                     tags=["success:{0}".format(results['success'])])

    return results
\ No newline at end of file
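For reference, a minimal usage sketch of create_generic. The feature values and targets below are invented for illustration, this module is assumed to be importable as create, and model_path is passed through unused in the code shown above:

#Hypothetical call to create_generic; all values are invented
import create
import util_functions

numeric_values = [[1.0, 20.0], [3.0, 5.0], [2.0, 12.0]]   #one list of numeric features per submission
textual_values = [["first answer"], ["second answer"], ["third answer"]]   #one list of strings per submission
target = [0, 1, 1]   #one integer score per submission

results = create.create_generic(numeric_values, textual_values, target, model_path="",
                                algorithm=util_functions.AlgorithmTypes.classification)
print results['success'], results['cv_kappa'], results['errors']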
@@ -32,7 +32,7 @@ class FeatureExtractor(object):
        self._spell_errors_per_character = 0
        self._grammar_errors_per_character = 0

-   def initialize_dictionaries(self, e_set):
+   def initialize_dictionaries(self, e_set, max_feats2=200):
        """
        Initializes dictionaries from an essay set object
        Dictionaries must be initialized prior to using this to extract features
@@ -41,8 +41,8 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
-               nvocab = util_functions.get_vocab(e_set._text, e_set._score)
-               svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
+               nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2=max_feats2)
+               svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                self.dict_initialized = True
......
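The new max_feats2 argument caps how many vocabulary features get_vocab returns per essay set. PredictorExtractor (further down in this diff) splits a fixed budget of 200 features evenly across the essay sets; a small sketch of that arithmetic, with the helper name invented here:

#Illustrative helper (not in the repo): how the 200-feature budget is divided
import math

def vocab_budget_per_set(num_essay_sets, total_feats=200):
    div_length = num_essay_sets if num_essay_sets > 0 else 1
    return int(math.floor(total_feats / div_length))

print vocab_budget_per_set(1)   #200, matching the old fixed vocabulary size
print vocab_budget_per_set(4)   #50 features per essay set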
@@ -16,11 +16,14 @@ base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
@@ -100,6 +103,51 @@ def grade(grader_data,grader_config,submission):
    return results


def grade_generic(grader_data, grader_config, numeric_features, textual_features):
    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}

    has_error = False

    #Create a predictor set to hold this submission's features
    grader_set = predictor_set.PredictorSet(type="test")

    #Try to add the submission's features to the predictor set object
    try:
        grader_set.add_row(numeric_features, textual_features, 0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
        has_error = True

    #Try to extract features from submission and assign score via the model
    try:
        grader_feats = grader_data['extractor'].gen_feats(grader_set)
        results['score'] = grader_data['model'].predict(grader_feats)[0]
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error = True

    #Try to determine confidence level
    try:
        min_score = min(numpy.asarray(grader_data['score']))
        max_score = max(numpy.asarray(grader_data['score']))
        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
            raw_confidence = grader_data['model'].predict_proba(grader_feats)[0, (results['score'] - min_score)]
            #TODO: Normalize confidence somehow here
            results['confidence'] = raw_confidence
        else:
            #For regression, use distance from the nearest integer as a rough confidence proxy
            raw_confidence = grader_data['model'].predict(grader_feats)[0]
            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
            results['confidence'] = confidence
    except:
        #If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    if not has_error:
        results['success'] = True

    #Count number of successful/unsuccessful gradings; success must be set before the metric is emitted
    statsd.increment("open_ended_assessment.machine_learning.grader_count",
                     tags=["success:{0}".format(results['success'])])

    return results
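A sketch of calling grade_generic, assuming a grader_data dict carrying the keys the function reads ('extractor', 'model', 'score', 'algorithm'); trained_extractor and trained_model are placeholders for the objects produced during training:

#Hypothetical grading call; grader_data would normally be unpickled from a model file
grader_data = {
    'extractor': trained_extractor,   #PredictorExtractor fitted during training
    'model': trained_model,           #fitted sklearn estimator
    'score': [0, 1, 2],               #training targets, used here for the confidence range
    'algorithm': util_functions.AlgorithmTypes.classification,
}
results = grade_generic(grader_data, {}, [1.0, 20.0], ["student response text"])
print results['score'], results['confidence'], results['errors']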
@@ -17,6 +17,7 @@ from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
@@ -102,6 +103,39 @@ def get_cv_error(clf,feats,scores):
    return results


def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    #Fall back to regression if an unknown algorithm is requested
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression

    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)

    train_feats = f.gen_feats(predictor_set)

    #clf is fit on the full data; clf2 is an identical estimator used only for cross-validation error
    if algorithm == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                           max_depth=4, random_state=1, min_samples_leaf=3)
    else:
        clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                         max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)

    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

    try:
        set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
        clf.fit(train_feats, set_score)
    except ValueError:
        #Fitting fails when every target is the same class, so force two classes into the sample
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
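A toy end-to-end training call for the function above, assuming predictor_set and util_functions are importable as in the other modules in this diff; the rows are invented, and a real run would need enough examples per class for cross-validation to be meaningful:

#Illustrative training flow with invented rows
pset = predictor_set.PredictorSet(type="train")
rows = [([1.0], ["short answer one"], 0),
        ([2.0], ["short answer two"], 1),
        ([1.5], ["short answer three"], 1)]
for numeric, textual, target in rows:
    pset.add_row(numeric, textual, target)

extractor, model, cv_error_results = extract_features_and_generate_model_predictors(
    pset, algorithm=util_functions.AlgorithmTypes.classification)
print cv_error_results['kappa'], cv_error_results['mae']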
def extract_features_and_generate_model(essays,additional_array=None):
"""
......
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []
        self._initialized = False

    def initialize_dictionaries(self, p_set):
        success = False
        if not (hasattr(p_set, '_type')):
            error_message = "needs to be a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        if not (p_set._type == "train"):
            error_message = "needs to be a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #Divide the fixed 200-feature vocabulary budget evenly across the essay sets
        div_length = len(p_set._essay_sets)
        if div_length == 0:
            div_length = 1

        max_feats2 = int(math.floor(200 / div_length))
        for i in xrange(0, len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
            self._initialized = True
            success = True
        return success

    def gen_feats(self, p_set):
        if self._initialized != True:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #One block of textual features per essay set, concatenated column-wise
        textual_features = []
        for i in xrange(0, len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

        textual_matrix = numpy.concatenate(textual_features, axis=1)
        predictor_matrix = numpy.array(p_set._numeric_features)

        #Log matrix shapes for debugging
        log.debug(textual_matrix.shape)
        log.debug(predictor_matrix.shape)

        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)

        return overall_matrix.copy()
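The concatenation in gen_feats is easiest to see with concrete shapes; the sizes below are invented:

#Shape bookkeeping for gen_feats, with invented sizes
import numpy

textual_features = [numpy.zeros((10, 55)), numpy.zeros((10, 55))]   #two essay sets, 55 columns each
textual_matrix = numpy.concatenate(textual_features, axis=1)        #(10, 110)
predictor_matrix = numpy.zeros((10, 3))                             #three numeric features per row
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
print overall_matrix.shape                                          #(10, 113)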
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
class PredictorSet(object):
    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type
        """
        if(type != "train" and type != "test"):
            type = "train"

        self._type = type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
        #Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)
            raise util_functions.InputError(target, error_message)

        if not isinstance(numeric_features, list):
            error_message = "Numeric features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)

        if not isinstance(textual_features, list):
            error_message = "Textual features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

        #Do some length checking for parameters
        if len(self._numeric_features) > 0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
                error_message = "Numeric features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        if len(self._textual_features) > 0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
                error_message = "Textual features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Now check to see if text features and numeric features are individually correct
        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except:
                error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        for i in xrange(0, len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except:
                error_message = "Textual feature {0} not string.".format(textual_features[i])
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Create essay sets for textual features if needed
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))

        #Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

        #Add targets
        self._target.append(target)

        #Add textual features to essay sets
        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
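A minimal sketch of PredictorSet usage with invented rows; every row must keep the same number of numeric and textual features as the first, or add_row raises util_functions.InputError:

#Hypothetical usage; feature values are invented
pset = predictor_set.PredictorSet(type="train")
pset.add_row([1.0, 20.0], ["first response"], 0)
pset.add_row([2.0, 15.0], ["second response"], 1)
#The next call would raise util_functions.InputError (one numeric feature instead of two):
#pset.add_row([3.0], ["third response"], 1)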
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)
import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME = "sa_data.tsv"
#Assumed minimum text length for a row to be included (0 keeps all rows); t_len was undefined here
t_len = 0

sa_val = file(FILENAME)
scores = []
texts = []
lines = sa_val.readlines()

pset = predictor_set.PredictorSet(type="train")
for i in xrange(1, len(lines)):
    score, text = lines[i].split("\t\"")
    if len(text) > t_len:
        scores.append(int(score))
        texts.append(text)
        pset.add_row([1], [text], int(score))

extractor = predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats = extractor.gen_feats(pset)

clf = GradientBoostingClassifier(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)
cv_preds = util_functions.gen_cv_preds(clf, train_feats, scores)
err = numpy.mean(numpy.abs(numpy.array(cv_preds) - numpy.array(scores)))
print err

kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
print kappa
\ No newline at end of file
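For context on the two numbers the script prints, a toy sanity check with invented predictions; quadratic weighted kappa rewards agreement between predicted and actual scores and penalizes larger disagreements more heavily:

#Toy check of the metrics printed above (invented values)
import numpy
import util_functions

preds = [0, 1, 1, 2]
actual = [0, 1, 2, 2]
print numpy.mean(numpy.abs(numpy.array(preds) - numpy.array(actual)))   #mean absolute error: 0.25
print util_functions.quadratic_weighted_kappa(preds, actual)            #below 1.0 due to the one disagreement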
@@ -17,6 +17,10 @@ import logging

log = logging.getLogger(__name__)


class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"


def create_model_path(model_path):
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path = "/" + model_path
......