Cleaned up the create file (more work to be done)

Additionally changed the F7 function into a make_unique function, and replaced much of its usage with the already created select_algorithm function. Finally, created an errors file so that SOME DAY I can do error propagation correctly.

Cleaned up the create file (more work to be done)
Additionally changed the F7 function into a make_unique function, and replaced much of its usage with the already created select_algorithm function. Finally, created an errors file so that SOME DAY I can do error propagation correctly.
3ea456df · gradyward · aedf0287 · 3ea456df · 3ea456df · 3ea456df
Commit 3ea456df authored Jun 12, 2014 by gradyward
Hide whitespace changes
Inline Side-by-side

Showing with 107 additions and 57 deletions

ease/create.py
+80 -44

ease/errors.py
+12 -0

ease/model_creator.py
+3 -5

ease/util_functions.py
+12 -8

No files found.
--- a/ease/create.py
+++ b/ease/create.py
@@ -17,6 +17,7 @@ sys.path.append(one_up_path)
 import model_creator
 import util_functions
 import predictor_set
+from errors import *
 import predictor_extractor
 from datetime import datetime
 import json
@@ -41,41 +42,61 @@ def dump_input_data(text, score):
        log.exception(error)
-def create(text, score, prompt_string, dump_data=False):
+def create(examples, scores, prompt_string, dump_data=False):
    """
-    Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
+    Creates a machine learning model from basic inputs (essays, associated scores and a prompt)
-    TODO: Remove model path argument, it is needed for now to support legacy code
-    text - A list of strings containing the text of the essays
+    The previous version of this function took an additional argument which specified the path to the model.
-    score - a list of integers containing score values
-    prompt_string - the common prompt for the set of essays
+    Args:
+        examples (list of str): the example essays that have been assigned to train the AI.
+        scores (list of int): the associated scores that correspond to the essays.
+        prompt_string (str): the common prompt for all of the example essays.
+    Kwargs:
+        dump_data (bool): whether or not a examples and scores should be set via a data input dump
+    Returns:
+        (dict): Has the following keys:
+            'errors' (list of Exception): List of all errors that occurred during training
+            'cv_kappa' (float): cv_error, measured in terms of kappa.
+            'cv_mean_absolute_error' (float): cv_error, measured as the mean absolute value
+            'feature_ext': feature_extractor to be used for grading
+            'classifier': the classifier object which can be used to score future essays
+            'success' (bool): Whether or not the training of the classifier was successful.
    """
+    # If dump_data is true, then the examples and scores are loaded from json data.
    if dump_data:
-        dump_input_data(text, score)
+        dump_input_data(examples, scores)
+    # Selects the appropriate ML algorithm to use to train the classifier
+    algorithm = select_algorithm(scores)
-    algorithm = select_algorithm(score)
    #Initialize a results dictionary to return
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
-               'score': score, 'text': text, 'prompt': prompt_string}
+               'score': scores, 'text': examples, 'prompt': prompt_string}
-    if len(text) != len(score):
+    if len(examples) != len(scores):
-        msg = "Target and text lists must be same length."
+        results['errors'].append("Target and text lists must be same length.")
-        results['errors'].append(msg)
+        log.exception("Target and text lists must be same length.")
-        log.exception(msg)
        return results
+    # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
    try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
+        essay_set = model_creator.create_essay_set(examples, scores, prompt_string)
-        e_set = model_creator.create_essay_set(text, score, prompt_string)
+    except (ExampleCreationRequestError, ExampleCreationInternalError) as ex:
-    except:
+        msg = "essay set creation failed due to an error in the create_essay_set method. {}".format(ex)
-        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
+        return results
+    # Gets the features and classifiers from the essay set and computes the error
    try:
-        #Gets features from the essay set and computes error
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
+            essay_set, algorithm=algorithm
-                                                                                                      algorithm=algorithm)
+        )
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
@@ -92,16 +113,23 @@ def create(text, score, prompt_string, dump_data=False):
 def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
    """
-    Creates a model from a generic list numeric values and text values
+    Constructs a model from a generic list of numeric values and text values.
-    numeric_values - A list of lists that are the predictors
-    textual_values - A list of lists that are the predictors
+    Generates this through a predictor set, rather than an essay set.
-    (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
-    target - The variable that we are trying to predict.  A list of integers.
+    Args:
-    algorithm - the type of algorithm that will be used
+        numeric_values:
+        textual_values:
+        target:
+    Kwargs:
+        GBW DELETED KWARG ALGORITHM (it was never used)
    """
+    # Selects the appropriate ML algorithm to use to train the classifier
    algorithm = select_algorithm(target)
-    #Initialize a result dictionary to return.
+    # Initialize a result dictionary to return.
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
@@ -111,20 +139,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
        log.exception(msg)
        return results
+    # Initialize a predictor set object that encapsulates all of the text and numeric predictors
    try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
+        predictor = predictor_set.PredictorSet(essaytype="train")
-        pset = predictor_set.PredictorSet(essaytype="train")
        for i in xrange(0, len(numeric_values)):
-            pset.add_row(numeric_values[i], textual_values[i], target[i])
+            predictor.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
+        return results
+    # Gets the features and classifiers from the essay set and computes the error
    try:
-        #Extract all features and then train a classifier with the features
+        feature_ext, classifier, cv_error_results = \
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
+            model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
-                                                                                                                 algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
@@ -139,14 +168,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
 def select_algorithm(score_list):
-    #Decide what algorithm to use (regression or classification)
+    """
-    try:
+    Decides whether to use regression or classification as the ML algorithm based on the number of unique scores
-        #Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score_list))) > 5:
+    If there are more than 5 unique scores give, regression is used, if fewer than 5 unique scores are produced
-            algorithm = util_functions.AlgorithmTypes.regression
+    then classification is used.
-        else:
-            algorithm = util_functions.AlgorithmTypes.classification
+    Args:
-    except:
+        score_list (list of int): The number of scores awarded to example essays for a given question
-        algorithm = util_functions.AlgorithmTypes.regression
+    Return:
+        The ML algorithm used to train the classifier set and feature extractor
+    """
-    return algorithm
+    #Count the number of unique score points in the score list
\ No newline at end of file
+    if len(set(score_list)) > 5:
+        return util_functions.AlgorithmTypes.regression
+    else:
+        return util_functions.AlgorithmTypes.classification
\ No newline at end of file
--- a/ease/errors.py
+++ b/ease/errors.py
+"""
+Errors for the EASE repository
+"""
+class ExampleCreationRequestError(Exception):
+    pass
+class ExampleCreationInternalError(Exception):
+    pass
+class EaseError(Exception):
+    pass
--- a/ease/model_creator.py
+++ b/ease/model_creator.py
@@ -18,6 +18,7 @@ import util_functions
 import feature_extractor
 import logging
 import predictor_extractor
+import create
 log = logging.getLogger()
@@ -131,7 +132,7 @@ def get_algorithms(algorithm):
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
    return clf, clf2
+#TODO RENAME train_from_predictors
 def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Extracts features and generates predictors based on a given predictor set
@@ -176,10 +177,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
    train_feats = f.gen_feats(essays)
    set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score))) > 5:
+    algorithm = create.select_algorithm(set_score)
-        algorithm = util_functions.AlgorithmTypes.regression
-    else:
-        algorithm = util_functions.AlgorithmTypes.classification
    clf, clf2 = get_algorithms(algorithm)

--- a/ease/util_functions.py
+++ b/ease/util_functions.py
@@ -159,14 +159,17 @@ def ngrams(tokens, min_n, max_n):
    return all_ngrams
-def f7(seq):
+def make_unique(sequence):
    """
-    Makes a list unique
+    Makes a list of elements unique
+    Args:
+        sequence (list of any comparable): A sequence to make unique
+    Return:
+        the list without any duplicates.  May be out of order.
    """
-    seen = set()
+    return list(set(sequence))
-    seen_add = seen.add
-    #TODO Potential Improvment Here
-    return [x for x in seq if x not in seen and not seen_add(x)]
 def count_list(the_list):
@@ -190,7 +193,8 @@ def regenerate_good_tokens(string):
    pos_string = nltk.pos_tag(toks)
    pos_seq = [tag[1] for tag in pos_string]
    pos_ngrams = ngrams(pos_seq, 2, 4)
-    sel_pos_ngrams = f7(pos_ngrams)
+    # TODO POTENTIAL ISSUE WITH NON STABLE ALGORITHM F7!?!
+    sel_pos_ngrams = make_unique(pos_ngrams)
    return sel_pos_ngrams
@@ -444,7 +448,7 @@ def get_wordnet_syns(word):
    for ss in synset:
        for swords in ss.lemma_names:
            synonyms.append(pat.sub(" ", swords.lower()))
-    synonyms = f7(synonyms)
+    synonyms = make_unique(synonyms)
    return synonyms