Cleaned up the create file (more work to be done)

Additionally changed the F7 function into a make_unique function, and replaced much of its usage by the already created select_algorithm function Finally, created an errors file so that SOME DAY I can do error propogation correctly.

Cleaned up the create file (more work to be done)
Additionally changed the F7 function into a make_unique function, and replaced much of its usage by the already created select_algorithm function Finally, created an errors file so that SOME DAY I can do error propogation correctly.
3ea456df · gradyward · aedf0287 · 3ea456df · 3ea456df · 3ea456df
Commit 3ea456df authored Jun 12, 2014 by gradyward
Hide whitespace changes
Inline Side-by-side

Showing with 107 additions and 57 deletions

ease/create.py
+80 -44

ease/errors.py
+12 -0

ease/model_creator.py
+3 -5

ease/util_functions.py
+12 -8

No files found.
--- a/ease/create.py
+++ b/ease/create.py
@@ -17,6 +17,7 @@ sys.path.append(one_up_path)
 import model_creator
 import util_functions
 import predictor_set
+from errors import *
 import predictor_extractor
 from datetime import datetime
 import json
@@ -41,41 +42,61 @@ def dump_input_data(text, score):
        log.exception(error)


-def create(text, score, prompt_string, dump_data=False):
+def create(examples, scores, prompt_string, dump_data=False):
    """
-    Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
-    TODO: Remove model path argument, it is needed for now to support legacy code
-    text - A list of strings containing the text of the essays
-    score - a list of integers containing score values
-    prompt_string - the common prompt for the set of essays
+    Creates a machine learning model from basic inputs (essays, associated scores and a prompt)
+
+    The previous version of this function took an additional argument which specified the path to the model.
+
+    Args:
+        examples (list of str): the example essays that have been assigned to train the AI.
+        scores (list of int): the associated scores that correspond to the essays.
+        prompt_string (str): the common prompt for all of the example essays.
+
+    Kwargs:
+        dump_data (bool): whether or not a examples and scores should be set via a data input dump
+
+    Returns:
+        (dict): Has the following keys:
+            'errors' (list of Exception): List of all errors that occurred during training
+            'cv_kappa' (float): cv_error, measured in terms of kappa.
+            'cv_mean_absolute_error' (float): cv_error, measured as the mean absolute value
+            'feature_ext': feature_extractor to be used for grading
+            'classifier': the classifier object which can be used to score future essays
+            'success' (bool): Whether or not the training of the classifier was successful.
    """

+    # If dump_data is true, then the examples and scores are loaded from json data.
    if dump_data:
-        dump_input_data(text, score)
+        dump_input_data(examples, scores)
+
+    # Selects the appropriate ML algorithm to use to train the classifier
+    algorithm = select_algorithm(scores)

-    algorithm = select_algorithm(score)
    #Initialize a results dictionary to return
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
-               'score': score, 'text': text, 'prompt': prompt_string}
+               'score': scores, 'text': examples, 'prompt': prompt_string}

-    if len(text) != len(score):
-        msg = "Target and text lists must be same length."
-        results['errors'].append(msg)
-        log.exception(msg)
+    if len(examples) != len(scores):
+        results['errors'].append("Target and text lists must be same length.")
+        log.exception("Target and text lists must be same length.")
        return results

+    # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
    try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
-        e_set = model_creator.create_essay_set(text, score, prompt_string)
-    except:
-        msg = "essay set creation failed."
+        essay_set = model_creator.create_essay_set(examples, scores, prompt_string)
+    except (ExampleCreationRequestError, ExampleCreationInternalError) as ex:
+        msg = "essay set creation failed due to an error in the create_essay_set method. {}".format(ex)
        results['errors'].append(msg)
        log.exception(msg)
+        return results
+
+    # Gets the features and classifiers from the essay set and computes the error
    try:
-        #Gets features from the essay set and computes error
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
-                                                                                                      algorithm=algorithm)
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
+            essay_set, algorithm=algorithm
+        )
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
@@ -92,16 +113,23 @@ def create(text, score, prompt_string, dump_data=False):

 def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
    """
-    Creates a model from a generic list numeric values and text values
-    numeric_values - A list of lists that are the predictors
-    textual_values - A list of lists that are the predictors
-    (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
-    target - The variable that we are trying to predict.  A list of integers.
-    algorithm - the type of algorithm that will be used
+    Constructs a model from a generic list of numeric values and text values.
+
+    Generates this through a predictor set, rather than an essay set.
+
+    Args:
+        numeric_values:
+        textual_values:
+        target:
+
+    Kwargs:
+        GBW DELETED KWARG ALGORITHM (it was never used)
    """

+    # Selects the appropriate ML algorithm to use to train the classifier
    algorithm = select_algorithm(target)
-    #Initialize a result dictionary to return.
+
+    # Initialize a result dictionary to return.
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

@@ -111,20 +139,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
        log.exception(msg)
        return results

+    # Initialize a predictor set object that encapsulates all of the text and numeric predictors
    try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
-        pset = predictor_set.PredictorSet(essaytype="train")
+        predictor = predictor_set.PredictorSet(essaytype="train")
        for i in xrange(0, len(numeric_values)):
-            pset.add_row(numeric_values[i], textual_values[i], target[i])
+            predictor.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
+        return results

+    # Gets the features and classifiers from the essay set and computes the error
    try:
-        #Extract all features and then train a classifier with the features
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
-                                                                                                                 algorithm)
+        feature_ext, classifier, cv_error_results = \
+            model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
@@ -139,14 +168,21 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi


 def select_algorithm(score_list):
-    #Decide what algorithm to use (regression or classification)
-    try:
-        #Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score_list))) > 5:
-            algorithm = util_functions.AlgorithmTypes.regression
-        else:
-            algorithm = util_functions.AlgorithmTypes.classification
-    except:
-        algorithm = util_functions.AlgorithmTypes.regression
+    """
+    Decides whether to use regression or classification as the ML algorithm based on the number of unique scores
+
+    If there are more than 5 unique scores give, regression is used, if fewer than 5 unique scores are produced
+    then classification is used.
+
+    Args:
+        score_list (list of int): The number of scores awarded to example essays for a given question
+
+    Return:
+        The ML algorithm used to train the classifier set and feature extractor
+    """

-    return algorithm
\ No newline at end of file
+    #Count the number of unique score points in the score list
+    if len(set(score_list)) > 5:
+        return util_functions.AlgorithmTypes.regression
+    else:
+        return util_functions.AlgorithmTypes.classification
\ No newline at end of file
--- a/ease/errors.py
+++ b/ease/errors.py
+"""
+Errors for the EASE repository
+"""
+
+class ExampleCreationRequestError(Exception):
+    pass
+
+class ExampleCreationInternalError(Exception):
+    pass
+
+class EaseError(Exception):
+    pass
--- a/ease/model_creator.py
+++ b/ease/model_creator.py
@@ -18,6 +18,7 @@ import util_functions
 import feature_extractor
 import logging
 import predictor_extractor
+import create

 log = logging.getLogger()

@@ -131,7 +132,7 @@ def get_algorithms(algorithm):
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
    return clf, clf2

-
+#TODO RENAME train_from_predictors
 def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Extracts features and generates predictors based on a given predictor set
@@ -176,10 +177,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
    train_feats = f.gen_feats(essays)

    set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score))) > 5:
-        algorithm = util_functions.AlgorithmTypes.regression
-    else:
-        algorithm = util_functions.AlgorithmTypes.classification
+    algorithm = create.select_algorithm(set_score)

    clf, clf2 = get_algorithms(algorithm)


--- a/ease/util_functions.py
+++ b/ease/util_functions.py
@@ -159,14 +159,17 @@ def ngrams(tokens, min_n, max_n):
    return all_ngrams


-def f7(seq):
+def make_unique(sequence):
    """
-    Makes a list unique
+    Makes a list of elements unique
+
+    Args:
+        sequence (list of any comparable): A sequence to make unique
+
+    Return:
+        the list without any duplicates.  May be out of order.
    """
-    seen = set()
-    seen_add = seen.add
-    #TODO Potential Improvment Here
-    return [x for x in seq if x not in seen and not seen_add(x)]
+    return list(set(sequence))


 def count_list(the_list):
@@ -190,7 +193,8 @@ def regenerate_good_tokens(string):
    pos_string = nltk.pos_tag(toks)
    pos_seq = [tag[1] for tag in pos_string]
    pos_ngrams = ngrams(pos_seq, 2, 4)
-    sel_pos_ngrams = f7(pos_ngrams)
+    # TODO POTENTIAL ISSUE WITH NON STABLE ALGORITHM F7!?!
+    sel_pos_ngrams = make_unique(pos_ngrams)
    return sel_pos_ngrams


@@ -444,7 +448,7 @@ def get_wordnet_syns(word):
    for ss in synset:
        for swords in ss.lemma_names:
            synonyms.append(pat.sub(" ", swords.lower()))
-    synonyms = f7(synonyms)
+    synonyms = make_unique(synonyms)
    return synonyms