Commit 79bf43f5 by Vik Paruchuri

Generic ML model creation

parent b8f9cdfc
...@@ -11,6 +11,8 @@ sys.path.append(one_up_path) ...@@ -11,6 +11,8 @@ sys.path.append(one_up_path)
import model_creator import model_creator
import util_functions import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd from statsd import statsd
...@@ -19,6 +21,13 @@ def create(text,score,prompt_string,model_path): ...@@ -19,6 +21,13 @@ def create(text,score,prompt_string,model_path):
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0, results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""} 'feature_ext' : "", 'classifier' : ""}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
return results
try: try:
e_set = model_creator.create_essay_set(text, score, prompt_string) e_set = model_creator.create_essay_set(text, score, prompt_string)
except: except:
...@@ -44,6 +53,39 @@ def create(text,score,prompt_string,model_path): ...@@ -44,6 +53,39 @@ def create(text,score,prompt_string,model_path):
return results return results
def create_generic(numeric_values, textual_values, target, model_path, algorithm = model_creator.AlgorithmTypes.regression):
    """
    Creates a generic model from parallel lists of numeric features, textual features, and targets.
    numeric_values - list with one entry of numeric features per row
    textual_values - list with one entry of textual features per row
    target - list with one target value per row
    model_path - accepted for interface compatibility; it is not read anywhere in this function
    algorithm - algorithm type passed through to the model creator (defaults to regression)
    Returns a results dictionary with the keys errors, success, cv_kappa, cv_mean_absolute_error,
    feature_ext, and classifier.  success is only True when the model was created.
    """
    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
               'feature_ext' : "", 'classifier' : ""}

    #All three inputs are consumed row by row, so they have to line up
    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        #No exception is active here, so log a plain error instead of an empty traceback
        log.error(msg)
        return results

    pset = None
    try:
        pset = predictor_set.PredictorSet(type="train")
        for numeric_row, textual_row, row_target in zip(numeric_values, textual_values, target):
            pset.add_row(numeric_row, textual_row, row_target)
    except Exception:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        #Do not train on a missing or partially filled predictor set
        pset = None

    if pset is not None:
        try:
            feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
            results['cv_kappa']=cv_error_results['kappa']
            results['cv_mean_absolute_error']=cv_error_results['mae']
            results['feature_ext']=feature_ext
            results['classifier']=classifier
            results['success']=True
        except Exception:
            msg = "feature extraction and model creation failed."
            results['errors'].append(msg)
            log.exception(msg)

    #Count number of successful/unsuccessful creations
    statsd.increment("open_ended_assessment.machine_learning.creator_count",
        tags=["success:{0}".format(results['success'])])

    return results
\ No newline at end of file
...@@ -17,9 +17,14 @@ from essay_set import EssaySet ...@@ -17,9 +17,14 @@ from essay_set import EssaySet
import util_functions import util_functions
import feature_extractor import feature_extractor
import logging import logging
import predictor_extractor
log=logging.getLogger() log=logging.getLogger()
class AlgorithmTypes(object):
    """
    Names of the algorithm families that model creation can be asked to use.
    """
    regression = "regression"
    #Spelling corrected from "classifiction"
    classification = "classification"
def read_in_test_data(filename): def read_in_test_data(filename):
""" """
Reads in test data file found at filename. Reads in test data file found at filename.
...@@ -102,6 +107,39 @@ def get_cv_error(clf,feats,scores): ...@@ -102,6 +107,39 @@ def get_cv_error(clf,feats,scores):
return results return results
def extract_features_and_generate_model_predictors(predictor_set, type=AlgorithmTypes.regression):
    """
    Extracts features from a predictor set and fits a gradient boosting model to them.
    predictor_set - the predictor set to train on; its _target attribute supplies the targets
    type - one of the AlgorithmTypes values; anything else falls back to AlgorithmTypes.regression
    Returns a tuple of (feature extractor, fitted model, cross validation error results)
    """
    #The parameter name is kept for callers; work with a local that does not shadow the builtin
    algorithm = type
    if algorithm not in (AlgorithmTypes.regression, AlgorithmTypes.classification):
        algorithm = AlgorithmTypes.regression

    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)

    train_feats = f.gen_feats(predictor_set)

    if algorithm == AlgorithmTypes.classification:
        model_class = sklearn.ensemble.GradientBoostingClassifier
    else:
        model_class = sklearn.ensemble.GradientBoostingRegressor

    #Two identically configured models: clf is returned fitted, clf2 is only used for cross validation
    model_settings = dict(n_estimators=100, learn_rate=.05,
                          max_depth=4, random_state=1, min_samples_leaf=3)
    clf = model_class(**model_settings)
    clf2 = model_class(**model_settings)

    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)

    #Converted outside the try block so set_score is always bound in the handler below
    #NOTE(review): targets are cast to integers even for regression, dropping any fractional part - confirm intended
    set_score = numpy.asarray(predictor_set._target, dtype=int)
    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        if len(set_score) < 2:
            #Cannot force two distinct targets into fewer than two rows
            raise
        #Force two distinct target values into the sample and retry the fit
        set_score[0]=1
        set_score[1]=0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None): def extract_features_and_generate_model(essays,additional_array=None):
""" """
......
...@@ -15,23 +15,14 @@ if not base_path.endswith("/"): ...@@ -15,23 +15,14 @@ if not base_path.endswith("/"):
log=logging.getLogger(__name__) log=logging.getLogger(__name__)
class AlgorithmTypes(object):
regression = "regression"
classification = "classifiction"
class PredictorSet(object): class PredictorSet(object):
def __init__(self, type = "train", algorithm = AlgorithmTypes.regression): def __init__(self, type = "train"):
""" """
Initialize variables and check essay set type Initialize variables and check essay set type
""" """
if(type != "train" and type != "test"): if(type != "train" and type != "test"):
type = "train" type = "train"
if(algorithm not in [AlgorithmTypes.regression, AlgorithmTypes.classification]):
algorithm = AlgorithmTypes.regression
self._type = type self._type = type
self._target=[] self._target=[]
self._textual_features=[] self._textual_features=[]
...@@ -39,7 +30,6 @@ class PredictorSet(object): ...@@ -39,7 +30,6 @@ class PredictorSet(object):
self._essay_sets=[] self._essay_sets=[]
def add_row(self, numeric_features, textual_features, target): def add_row(self, numeric_features, textual_features, target):
#Basic input checking #Basic input checking
if not isinstance(target, (int, long, float)): if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value." error_message = "Target is not a numeric value."
......
...@@ -39,13 +39,4 @@ cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores) ...@@ -39,13 +39,4 @@ cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(cv_preds-scores)) err=numpy.mean(numpy.abs(cv_preds-scores))
print err print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores) kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa print kappa
all_err.append(err) \ No newline at end of file
all_kappa.append(kappa)
"""
outfile=open("full_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}".format(cv_preds[i],scores[i]))
"""
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment