Commit b70221db by Vik Paruchuri

Add some docs to create

parent 006f9342
...@@ -5,24 +5,35 @@ Functions that create a machine learning model from training data ...@@ -5,24 +5,35 @@ Functions that create a machine learning model from training data
import os import os
import sys import sys
import logging import logging
log = logging.getLogger(__name__) from statsd import statsd
#Define base path and add to sys path
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..')) one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path) sys.path.append(one_up_path)
#Import modules that are dependent on the base path
import model_creator import model_creator
import util_functions import util_functions
import predictor_set import predictor_set
import predictor_extractor import predictor_extractor
from statsd import statsd #Make a log
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time') @statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path): def create(text,score,prompt_string,model_path = None):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
model_path - Deprecated, not needed
"""
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0, results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""} 'feature_ext' : "", 'classifier' : ""}
...@@ -33,12 +44,14 @@ def create(text,score,prompt_string,model_path): ...@@ -33,12 +44,14 @@ def create(text,score,prompt_string,model_path):
return results return results
try: try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string) e_set = model_creator.create_essay_set(text, score, prompt_string)
except: except:
msg = "essay set creation failed." msg = "essay set creation failed."
results['errors'].append(msg) results['errors'].append(msg)
log.exception(msg) log.exception(msg)
try: try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set) feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
results['cv_kappa']=cv_error_results['kappa'] results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae'] results['cv_mean_absolute_error']=cv_error_results['mae']
...@@ -57,7 +70,17 @@ def create(text,score,prompt_string,model_path): ...@@ -57,7 +70,17 @@ def create(text,score,prompt_string,model_path):
return results return results
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression): def create_generic(numeric_values, textual_values, target, model_path = None, algorithm = util_functions.AlgorithmTypes.regression):
"""
Creates a model from generic lists of numeric values and text values
numeric_values - A list of lists; each inner list holds the numeric predictors for one row
textual_values - A list of lists; each inner list holds the textual predictors for one row
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
model_path - deprecated, kept for legacy code. Do not use.
"""
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0, results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm} 'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
...@@ -68,6 +91,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm ...@@ -68,6 +91,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
return results return results
try: try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train") pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)): for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i]) pset.add_row(numeric_values[i], textual_values[i], target[i])
...@@ -77,6 +101,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm ...@@ -77,6 +101,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
log.exception(msg) log.exception(msg)
try: try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm) feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa'] results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae'] results['cv_mean_absolute_error']=cv_error_results['mae']
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment