Commit 3ea456df by gradyward

Cleaned up the create file (more work to be done)

Additionally changed the F7 function into a make_unique function,
and replaced much of its usage by the already created select_algorithm function
Finally, created an errors file so that SOME DAY I can do error propagation
correctly.
parent aedf0287
......@@ -17,6 +17,7 @@ sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
from errors import *
import predictor_extractor
from datetime import datetime
import json
......@@ -41,41 +42,61 @@ def dump_input_data(text, score):
log.exception(error)
def create(text, score, prompt_string, dump_data=False):
def create(examples, scores, prompt_string, dump_data=False):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
Creates a machine learning model from basic inputs (essays, associated scores and a prompt)
The previous version of this function took an additional argument which specified the path to the model.
Args:
examples (list of str): the example essays that have been assigned to train the AI.
scores (list of int): the associated scores that correspond to the essays.
prompt_string (str): the common prompt for all of the example essays.
Kwargs:
dump_data (bool): whether or not examples and scores should be set via a data input dump
Returns:
(dict): Has the following keys:
'errors' (list of Exception): List of all errors that occurred during training
'cv_kappa' (float): cv_error, measured in terms of kappa.
'cv_mean_absolute_error' (float): cv_error, measured as the mean absolute value
'feature_ext': feature_extractor to be used for grading
'classifier': the classifier object which can be used to score future essays
'success' (bool): Whether or not the training of the classifier was successful.
"""
# If dump_data is true, then the examples and scores are loaded from json data.
if dump_data:
dump_input_data(text, score)
dump_input_data(examples, scores)
# Selects the appropriate ML algorithm to use to train the classifier
algorithm = select_algorithm(scores)
algorithm = select_algorithm(score)
#Initialize a results dictionary to return
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
'score': score, 'text': text, 'prompt': prompt_string}
'score': scores, 'text': examples, 'prompt': prompt_string}
if len(text) != len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
if len(examples) != len(scores):
results['errors'].append("Target and text lists must be same length.")
log.exception("Target and text lists must be same length.")
return results
# Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
except:
msg = "essay set creation failed."
essay_set = model_creator.create_essay_set(examples, scores, prompt_string)
except (ExampleCreationRequestError, ExampleCreationInternalError) as ex:
msg = "essay set creation failed due to an error in the create_essay_set method. {}".format(ex)
results['errors'].append(msg)
log.exception(msg)
return results
# Gets the features and classifiers from the essay set and computes the error
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
algorithm=algorithm)
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
essay_set, algorithm=algorithm
)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
......@@ -92,16 +113,23 @@ def create(text, score, prompt_string, dump_data=False):
def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list numeric values and text values
numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
algorithm - the type of algorithm that will be used
Constructs a model from a generic list of numeric values and text values.
Generates this through a predictor set, rather than an essay set.
Args:
numeric_values:
textual_values:
target:
Kwargs:
GBW DELETED KWARG ALGORITHM (it was never used)
"""
# Selects the appropriate ML algorithm to use to train the classifier
algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
# Initialize a result dictionary to return.
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
......@@ -111,20 +139,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
log.exception(msg)
return results
# Initialize a predictor set object that encapsulates all of the text and numeric predictors
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(essaytype="train")
predictor = predictor_set.PredictorSet(essaytype="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
predictor.add_row(numeric_values[i], textual_values[i], target[i])
except:
msg = "predictor set creation failed."
results['errors'].append(msg)
log.exception(msg)
return results
# Gets the features and classifiers from the essay set and computes the error
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
algorithm)
feature_ext, classifier, cv_error_results = \
model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
......@@ -139,14 +168,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
def select_algorithm(score_list):
    """
    Decides whether to use regression or classification as the ML algorithm, based on the
    number of unique scores.

    Regression is used when there are more than 5 unique scores; classification is used when
    there are 5 or fewer.  If the unique scores cannot be counted (score_list is not iterable,
    or it holds unhashable items), regression is used as the fallback.

    Args:
        score_list (iterable of int): The scores awarded to the example essays for a given question

    Return:
        util_functions.AlgorithmTypes.regression or util_functions.AlgorithmTypes.classification
    """
    # Count the number of unique score points in the score list
    try:
        unique_score_count = len(set(score_list))
    except TypeError:
        # Uncountable input falls back to regression rather than aborting model creation.
        return util_functions.AlgorithmTypes.regression

    if unique_score_count > 5:
        return util_functions.AlgorithmTypes.regression
    return util_functions.AlgorithmTypes.classification
\ No newline at end of file
"""
Errors for the EASE repository
"""
class EaseError(Exception):
    """Base class for the errors defined for the EASE repository."""


class ExampleCreationRequestError(EaseError):
    """
    Error raised when example (essay set) creation fails.

    NOTE(review): the name suggests the failure is attributable to the request/input data;
    confirm against the code that raises it.
    """


class ExampleCreationInternalError(EaseError):
    """
    Error raised when example (essay set) creation fails.

    NOTE(review): the name suggests the failure is internal rather than caused by the input;
    confirm against the code that raises it.
    """
......@@ -18,6 +18,7 @@ import util_functions
import feature_extractor
import logging
import predictor_extractor
import create
log = logging.getLogger()
......@@ -131,7 +132,7 @@ def get_algorithms(algorithm):
max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2
#TODO RENAME train_from_predictors
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
......@@ -176,10 +177,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
train_feats = f.gen_feats(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score))) > 5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
algorithm = create.select_algorithm(set_score)
clf, clf2 = get_algorithms(algorithm)
......
......@@ -159,14 +159,17 @@ def ngrams(tokens, min_n, max_n):
return all_ngrams
def make_unique(sequence):
    """
    Removes duplicate elements from a sequence, keeping the first occurrence of each element
    in its original position.

    Args:
        sequence (iterable of hashable): The elements to de-duplicate

    Return:
        (list): the distinct elements of sequence, ordered by first appearance.
    """
    seen = set()
    unique_elements = []
    for element in sequence:
        # Set membership keeps the whole pass linear while the list preserves input order.
        if element not in seen:
            seen.add(element)
            unique_elements.append(element)
    return unique_elements


# Former name of make_unique, kept as an alias so any remaining f7 callers keep working.
f7 = make_unique
def count_list(the_list):
......@@ -190,7 +193,8 @@ def regenerate_good_tokens(string):
pos_string = nltk.pos_tag(toks)
pos_seq = [tag[1] for tag in pos_string]
pos_ngrams = ngrams(pos_seq, 2, 4)
sel_pos_ngrams = f7(pos_ngrams)
# TODO POTENTIAL ISSUE WITH NON STABLE ALGORITHM F7!?!
sel_pos_ngrams = make_unique(pos_ngrams)
return sel_pos_ngrams
......@@ -444,7 +448,7 @@ def get_wordnet_syns(word):
for ss in synset:
for swords in ss.lemma_names:
synonyms.append(pat.sub(" ", swords.lower()))
synonyms = f7(synonyms)
synonyms = make_unique(synonyms)
return synonyms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment