Commit 3ea456df by gradyward

Cleaned up the create file (more work to be done)

Additionally changed the F7 function into a make_unique function,
and replaced much of its usage by the already created select_algorithm function
Finally, created an errors file so that SOME DAY I can do error propagation
correctly.
parent aedf0287
...@@ -17,6 +17,7 @@ sys.path.append(one_up_path) ...@@ -17,6 +17,7 @@ sys.path.append(one_up_path)
import model_creator import model_creator
import util_functions import util_functions
import predictor_set import predictor_set
from errors import *
import predictor_extractor import predictor_extractor
from datetime import datetime from datetime import datetime
import json import json
...@@ -41,41 +42,61 @@ def dump_input_data(text, score): ...@@ -41,41 +42,61 @@ def dump_input_data(text, score):
log.exception(error) log.exception(error)
def create(text, score, prompt_string, dump_data=False): def create(examples, scores, prompt_string, dump_data=False):
""" """
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model Creates a machine learning model from basic inputs (essays, associated scores and a prompt)
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays The previous version of this function took an additional argument which specified the path to the model.
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays Args:
examples (list of str): the example essays that have been assigned to train the AI.
scores (list of int): the associated scores that correspond to the essays.
prompt_string (str): the common prompt for all of the example essays.
Kwargs:
dump_data (bool): whether or not the examples and scores should be passed to dump_input_data before training
Returns:
(dict): Has the following keys:
'errors' (list of Exception): List of all errors that occurred during training
'cv_kappa' (float): cv_error, measured in terms of kappa.
'cv_mean_absolute_error' (float): cv_error, measured as the mean absolute value
'feature_ext': feature_extractor to be used for grading
'classifier': the classifier object which can be used to score future essays
'success' (bool): Whether or not the training of the classifier was successful.
""" """
# If dump_data is true, then the examples and scores are loaded from json data.
if dump_data: if dump_data:
dump_input_data(text, score) dump_input_data(examples, scores)
# Selects the appropriate ML algorithm to use to train the classifier
algorithm = select_algorithm(scores)
algorithm = select_algorithm(score)
#Initialize a results dictionary to return #Initialize a results dictionary to return
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0, results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm, 'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
'score': score, 'text': text, 'prompt': prompt_string} 'score': scores, 'text': examples, 'prompt': prompt_string}
if len(text) != len(score): if len(examples) != len(scores):
msg = "Target and text lists must be same length." results['errors'].append("Target and text lists must be same length.")
results['errors'].append(msg) log.exception("Target and text lists must be same length.")
log.exception(msg)
return results return results
# Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
try: try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc) essay_set = model_creator.create_essay_set(examples, scores, prompt_string)
e_set = model_creator.create_essay_set(text, score, prompt_string) except (ExampleCreationRequestError, ExampleCreationInternalError) as ex:
except: msg = "essay set creation failed due to an error in the create_essay_set method. {}".format(ex)
msg = "essay set creation failed."
results['errors'].append(msg) results['errors'].append(msg)
log.exception(msg) log.exception(msg)
return results
# Gets the features and classifiers from the essay set and computes the error
try: try:
#Gets features from the essay set and computes error feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, essay_set, algorithm=algorithm
algorithm=algorithm) )
results['cv_kappa'] = cv_error_results['kappa'] results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae'] results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext results['feature_ext'] = feature_ext
...@@ -92,16 +113,23 @@ def create(text, score, prompt_string, dump_data=False): ...@@ -92,16 +113,23 @@ def create(text, score, prompt_string, dump_data=False):
def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression): def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
""" """
Creates a model from a generic list numeric values and text values Constructs a model from a generic list of numeric values and text values.
numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors Generates this through a predictor set, rather than an essay set.
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers. Args:
algorithm - the type of algorithm that will be used numeric_values:
textual_values:
target:
Kwargs:
GBW DELETED KWARG ALGORITHM (it was never used)
""" """
# Selects the appropriate ML algorithm to use to train the classifier
algorithm = select_algorithm(target) algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
# Initialize a result dictionary to return.
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0, results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm} 'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
...@@ -111,20 +139,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi ...@@ -111,20 +139,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
log.exception(msg) log.exception(msg)
return results return results
# Initialize a predictor set object that encapsulates all of the text and numeric predictors
try: try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors predictor = predictor_set.PredictorSet(essaytype="train")
pset = predictor_set.PredictorSet(essaytype="train")
for i in xrange(0, len(numeric_values)): for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i]) predictor.add_row(numeric_values[i], textual_values[i], target[i])
except: except:
msg = "predictor set creation failed." msg = "predictor set creation failed."
results['errors'].append(msg) results['errors'].append(msg)
log.exception(msg) log.exception(msg)
return results
# Gets the features and classifiers from the essay set and computes the error
try: try:
#Extract all features and then train a classifier with the features feature_ext, classifier, cv_error_results = \
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
algorithm)
results['cv_kappa'] = cv_error_results['kappa'] results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae'] results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext results['feature_ext'] = feature_ext
...@@ -139,14 +168,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi ...@@ -139,14 +168,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
def select_algorithm(score_list): def select_algorithm(score_list):
#Decide what algorithm to use (regression or classification) """
try: Decides whether to use regression or classification as the ML algorithm based on the number of unique scores
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score_list))) > 5: If there are more than 5 unique scores given, regression is used; if 5 or fewer unique scores are given
algorithm = util_functions.AlgorithmTypes.regression then classification is used.
else:
algorithm = util_functions.AlgorithmTypes.classification Args:
except: score_list (list of int): The scores awarded to example essays for a given question
algorithm = util_functions.AlgorithmTypes.regression
Return:
The ML algorithm used to train the classifier set and feature extractor
"""
return algorithm #Count the number of unique score points in the score list
\ No newline at end of file if len(set(score_list)) > 5:
return util_functions.AlgorithmTypes.regression
else:
return util_functions.AlgorithmTypes.classification
\ No newline at end of file
"""
Errors for the EASE repository
"""
class EaseError(Exception):
    """
    Base class for the errors defined by the EASE repository.

    The more specific errors below derive from this class, so a caller can
    handle any of them with a single ``except EaseError`` clause.
    """


class ExampleCreationRequestError(EaseError):
    """
    Error tied to the request for creating an example (essay) set.

    NOTE(review): presumably raised when the caller-supplied input is at
    fault -- confirm against the code that raises it.
    """


class ExampleCreationInternalError(EaseError):
    """
    Error tied to an internal failure while creating an example (essay) set.

    NOTE(review): presumably raised when the failure is not caused by the
    caller's input -- confirm against the code that raises it.
    """
...@@ -18,6 +18,7 @@ import util_functions ...@@ -18,6 +18,7 @@ import util_functions
import feature_extractor import feature_extractor
import logging import logging
import predictor_extractor import predictor_extractor
import create
log = logging.getLogger() log = logging.getLogger()
...@@ -131,7 +132,7 @@ def get_algorithms(algorithm): ...@@ -131,7 +132,7 @@ def get_algorithms(algorithm):
max_depth=4, random_state=1, min_samples_leaf=3) max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2 return clf, clf2
#TODO RENAME train_from_predictors
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression): def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
""" """
Extracts features and generates predictors based on a given predictor set Extracts features and generates predictors based on a given predictor set
...@@ -176,10 +177,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit ...@@ -176,10 +177,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
train_feats = f.gen_feats(essays) train_feats = f.gen_feats(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int) set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score))) > 5: algorithm = create.select_algorithm(set_score)
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
clf, clf2 = get_algorithms(algorithm) clf, clf2 = get_algorithms(algorithm)
......
...@@ -159,14 +159,17 @@ def ngrams(tokens, min_n, max_n): ...@@ -159,14 +159,17 @@ def ngrams(tokens, min_n, max_n):
return all_ngrams return all_ngrams
def f7(seq): def make_unique(sequence):
""" """
Makes a list unique Makes a list of elements unique
Args:
sequence (list of any comparable): A sequence to make unique
Return:
the list without any duplicates. May be out of order.
""" """
seen = set() return list(set(sequence))
seen_add = seen.add
#TODO Potential Improvment Here
return [x for x in seq if x not in seen and not seen_add(x)]
def count_list(the_list): def count_list(the_list):
...@@ -190,7 +193,8 @@ def regenerate_good_tokens(string): ...@@ -190,7 +193,8 @@ def regenerate_good_tokens(string):
pos_string = nltk.pos_tag(toks) pos_string = nltk.pos_tag(toks)
pos_seq = [tag[1] for tag in pos_string] pos_seq = [tag[1] for tag in pos_string]
pos_ngrams = ngrams(pos_seq, 2, 4) pos_ngrams = ngrams(pos_seq, 2, 4)
sel_pos_ngrams = f7(pos_ngrams) # TODO POTENTIAL ISSUE WITH NON STABLE ALGORITHM F7!?!
sel_pos_ngrams = make_unique(pos_ngrams)
return sel_pos_ngrams return sel_pos_ngrams
...@@ -444,7 +448,7 @@ def get_wordnet_syns(word): ...@@ -444,7 +448,7 @@ def get_wordnet_syns(word):
for ss in synset: for ss in synset:
for swords in ss.lemma_names: for swords in ss.lemma_names:
synonyms.append(pat.sub(" ", swords.lower())) synonyms.append(pat.sub(" ", swords.lower()))
synonyms = f7(synonyms) synonyms = make_unique(synonyms)
return synonyms return synonyms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment