Commit 2e81fda2 by VikParuchuri

Merge pull request #7 from MITx/vik/deployment_work

Vik/deployment work
parents 434571f6 ae7ab0f0
...@@ -11,6 +11,8 @@ sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
...@@ -19,6 +21,13 @@ def create(text,score,prompt_string,model_path):
    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
               'feature_ext' : "", 'classifier' : ""}

    if len(text)!=len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
...@@ -43,3 +52,40 @@ def create(text,score,prompt_string,model_path):
    return results
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}

    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        #Without a predictor set there is nothing left to do, so bail out here
        return results

    try:
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

    #Count number of successful/unsuccessful creations
    statsd.increment("open_ended_assessment.machine_learning.creator_count",
                     tags=["success:{0}".format(results['success'])])

    return results
\ No newline at end of file
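For orientation, here is a minimal usage sketch of the new create_generic entry point. The module name (create), the toy rows, and the model path are assumptions for illustration; the real caller lives elsewhere in the grading service.

import create
import util_functions

#Hypothetical toy data: four samples, two numeric features and one textual feature each
numeric_values = [[1.0, 5.0], [2.0, 3.0], [0.0, 4.0], [3.0, 1.0]]
textual_values = [["short answer one"], ["short answer two"], ["short answer three"], ["short answer four"]]
target = [0, 1, 0, 1]

results = create.create_generic(numeric_values, textual_values, target, "models/generic.p",
                                algorithm=util_functions.AlgorithmTypes.classification)
if results['success']:
    print results['cv_kappa'], results['cv_mean_absolute_error']
else:
    print results['errors']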
...@@ -32,7 +32,7 @@ class FeatureExtractor(object):
        self._spell_errors_per_character=0
        self._grammar_errors_per_character=0

    def initialize_dictionaries(self, e_set, max_feats2 = 200):
        """
        Initializes dictionaries from an essay set object
        Dictionaries must be initialized prior to using this to extract features
...@@ -41,8 +41,8 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                self.dict_initialized = True
......
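The new max_feats2 parameter threads a vocabulary cap through to get_vocab. A small sketch restating the arithmetic used by PredictorExtractor further down (vocab_cap is a hypothetical helper, not part of this commit): when several textual columns share one predictor set, the 200-feature budget is split evenly among them.

import math

def vocab_cap(num_textual_columns, total_budget=200):
    #Guard against an empty predictor set, as initialize_dictionaries does
    columns = max(num_textual_columns, 1)
    return int(math.floor(total_budget / columns))

print vocab_cap(1)   #200: a single essay column keeps the full budget
print vocab_cap(4)   #50: four textual columns split it evenly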
...@@ -16,11 +16,14 @@ base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions

#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)

...@@ -100,6 +103,51 @@ def grade(grader_data,grader_config,submission):
    return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}

    has_error=False

    #Create a predictor set the feature extractor can consume
    grader_set=predictor_set.PredictorSet(type="test")

    #Try to add the single submission row to the predictor set
    try:
        grader_set.add_row(numeric_features, textual_features, 0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
        has_error=True

    #Try to extract features from submission and assign score via the model
    try:
        grader_feats=grader_data['extractor'].gen_feats(grader_set)
        results['score']=grader_data['model'].predict(grader_feats)[0]
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error=True

    #Try to determine confidence level
    try:
        min_score=min(numpy.asarray(grader_data['score']))
        max_score=max(numpy.asarray(grader_data['score']))
        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
            raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
            #TODO: Normalize confidence somehow here
            results['confidence']=raw_confidence
        else:
            #For regression, use the distance from the nearest integer as a rough confidence proxy
            raw_confidence = grader_data['model'].predict(grader_feats)[0]
            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
            results['confidence'] = confidence
    except:
        #If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    if not has_error:
        results['success'] = True

    #Count number of successful/unsuccessful gradings (after success is set, so the tag is accurate)
    statsd.increment("open_ended_assessment.machine_learning.grader_count",
                     tags=["success:{0}".format(results['success'])])

    return results
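A hypothetical call into grade_generic. The grader_data layout is inferred from the keys this function reads ('extractor', 'model', 'score', 'algorithm'); the module names create and grade are assumptions, and the trained objects come from create_generic's results as in the earlier sketch.

import create
import grade
import util_functions

#Train first, reusing the toy rows from the create_generic sketch above
numeric_values = [[1.0, 5.0], [2.0, 3.0], [0.0, 4.0], [3.0, 1.0]]
textual_values = [["short answer one"], ["short answer two"], ["short answer three"], ["short answer four"]]
target = [0, 1, 0, 1]
created = create.create_generic(numeric_values, textual_values, target, "models/generic.p",
                                algorithm=util_functions.AlgorithmTypes.classification)

grader_data = {
    'extractor': created['feature_ext'],   #keys inferred from what grade_generic reads
    'model': created['classifier'],
    'score': target,                       #score range seen at training time
    'algorithm': util_functions.AlgorithmTypes.classification,
}
results = grade.grade_generic(grader_data, {}, [1.0, 5.0], ["a new student answer"])
print results['score'], results['confidence']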
...@@ -17,6 +17,7 @@ from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()

...@@ -102,6 +103,39 @@ def get_cv_error(clf,feats,scores):
    return results
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression

    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)

    train_feats = f.gen_feats(predictor_set)

    if algorithm == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
            max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
            max_depth=4, random_state=1, min_samples_leaf=3)
    else:
        clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
            max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
            max_depth=4, random_state=1, min_samples_leaf=3)

    #Measure cross-validation error on a separate estimator so the returned model is fit on all data
    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

    try:
        set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0]=1
        set_score[1]=0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
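A minimal illustration of the failure mode the except ValueError clause guards against (sklearn version and toy data assumed): fitting a classifier on a target vector with a single distinct class raises ValueError, so the code above flips two labels to force a second class into existence.

import numpy
import sklearn.ensemble

feats = numpy.random.rand(10, 3)
target = numpy.zeros(10, dtype=int)   #Every sample has the same score

clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=10)
try:
    clf.fit(feats, target)
except ValueError:
    target[0] = 1                     #Same fallback as above: fabricate a second class
    clf.fit(feats, target)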
def extract_features_and_generate_model(essays,additional_array=None):
    """
......
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math

from feature_extractor import FeatureExtractor

base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions

if not base_path.endswith("/"):
    base_path=base_path+"/"

log = logging.getLogger(__name__)

class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []
        self._initialized = False

    def initialize_dictionaries(self, p_set):
        success = False
        if not (hasattr(p_set, '_type') and p_set._type == "train"):
            error_message = "PredictorExtractor needs a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #Split the 200-feature vocabulary budget evenly across the textual columns
        div_length=len(p_set._essay_sets)
        if div_length==0:
            div_length=1
        max_feats2 = int(math.floor(200/div_length))

        for i in xrange(0,len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)

        self._initialized = True
        success = True
        return success

    def gen_feats(self, p_set):
        if not self._initialized:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        #One feature block per textual column
        textual_features = []
        for i in xrange(0,len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

        textual_matrix = numpy.concatenate(textual_features, axis=1)
        predictor_matrix = numpy.array(p_set._numeric_features)

        log.debug("Textual matrix shape: {0}".format(textual_matrix.shape))
        log.debug("Predictor matrix shape: {0}".format(predictor_matrix.shape))

        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
        return overall_matrix.copy()
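A toy illustration of gen_feats' matrix assembly (shapes invented for the example): each textual column yields a feature block, the blocks are concatenated side by side, and the raw numeric features are appended as extra columns.

import numpy

textual_features = [numpy.ones((3, 4)), numpy.zeros((3, 2))]   #Two textual columns, 3 samples
textual_matrix = numpy.concatenate(textual_features, axis=1)   #Shape (3, 6)

numeric_features = [[1.0, 5.0], [2.0, 3.0], [0.0, 4.0]]        #One row per sample
predictor_matrix = numpy.array(numeric_features)               #Shape (3, 2)

overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
print overall_matrix.shape                                     #(3, 8)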
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set

base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions

if not base_path.endswith("/"):
    base_path=base_path+"/"

log=logging.getLogger(__name__)

class PredictorSet(object):
    def __init__(self, type = "train"):
        """
        Initialize variables and check predictor set type
        """
        if(type != "train" and type != "test"):
            type = "train"

        self._type = type
        self._target=[]
        self._textual_features=[]
        self._numeric_features=[]
        self._essay_sets=[]

    def add_row(self, numeric_features, textual_features, target):
        #Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)
            raise util_functions.InputError(target, error_message)

        if not isinstance(numeric_features, list):
            error_message = "Numeric features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)

        if not isinstance(textual_features, list):
            error_message = "Textual features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

        #Do some length checking for parameters
        if len(self._numeric_features)>0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
                error_message = "Numeric features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        if len(self._textual_features)>0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
                error_message = "Textual features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Now check to see if text features and numeric features are individually correct
        for i in xrange(0,len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except:
                error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        for i in xrange(0,len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except:
                error_message = "Textual feature {0} not string.".format(textual_features[i])
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

        #Create essay sets for textual features if needed
        if len(self._textual_features)==0:
            for i in xrange(0,len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))

        #Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

        #Add targets
        self._target.append(target)

        #Add textual features to essay sets
        for i in xrange(0,len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
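A quick standalone sketch of PredictorSet (toy rows assumed): each row carries a numeric feature list, a textual feature list, and a target score, and the class maintains one EssaySet per textual column.

import predictor_set

pset = predictor_set.PredictorSet(type="train")
pset.add_row([1.0, 5.0], ["first essay text"], 0)
pset.add_row([2.0, 3.0], ["second essay text"], 1)

print len(pset._target)       #2 rows added
print len(pset._essay_sets)   #1 essay set, one per textual column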
import os
import sys

base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)

import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier

if not base_path.endswith("/"):
    base_path=base_path+"/"

FILENAME="sa_data.tsv"
t_len = 20  #Minimum answer length filter; the original script never defined this, so 20 is an assumed threshold

sa_val = file(FILENAME)
scores=[]
texts=[]
lines=sa_val.readlines()

pset = predictor_set.PredictorSet(type="train")
for i in xrange(1,len(lines)):
    score,text=lines[i].split("\t\"")
    if len(text)>t_len:
        scores.append(int(score))
        texts.append(text)
        pset.add_row([1],[text],int(score))

extractor=predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats=extractor.gen_feats(pset)

clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-numpy.array(scores)))
print err

kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
\ No newline at end of file
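The split on "\t\"" implies each data line is a numeric score, a tab, then a double-quoted answer, with the first line skipped as a header. A hypothetical sa_data.tsv (contents invented; only the shape is inferred from the parsing code):

score	text
2	"Photosynthesis converts light energy into chemical energy."
1	"The plant absorbs water through its roots and uses it in photosynthesis."
0	"idk"

Running the script with sa_data.tsv alongside it prints the cross-validated mean absolute error and the quadratic weighted kappa.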
...@@ -17,6 +17,10 @@ import logging
log=logging.getLogger(__name__)
class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"
def create_model_path(model_path):
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path="/" + model_path
......