A handful of small changes.

A final commit before I attempt to deprecate all of the unused predictor stuff. Stand with me now.

A handful of small changes.
A final commit before I attempt to deprecate all of the unused predictor stuff. Stand with me now.
279d3a0c · gradyward · b32d5674 · 279d3a0c · 279d3a0c · 279d3a0c
Commit 279d3a0c authored Jun 12, 2014 by gradyward
Show whitespace changes
Inline Side-by-side

Showing with 60 additions and 27 deletions

ease/create.py
+4 -3

ease/model_creator.py
+49 -21

ease/predictor_set.py
+1 -1

ease/util_functions.py
+6 -2

No files found.
--- a/ease/create.py
+++ b/ease/create.py
@@ -95,7 +95,7 @@ def create(examples, scores, prompt_string, dump_data=False):
    # Gets the features and classifiers from the essay set and computes the error
    try:
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
-            essay_set, algorithm=algorithm
+            essay_set
        )
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
@@ -103,10 +103,11 @@ def create(examples, scores, prompt_string, dump_data=False):
        results['classifier'] = classifier
        results['algorithm'] = algorithm
        results['success'] = True
-    except:
+    except Exception as ex:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)
+        log.exception(ex)

    return results

@@ -153,7 +154,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
    # Gets the features and classifiers from the essay set and computes the error
    try:
        feature_ext, classifier, cv_error_results = \
-            model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
+            model_creator.extract_features_and_generate_model_from_predictors(predictor, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext

--- a/ease/model_creator.py
+++ b/ease/model_creator.py
@@ -25,9 +25,18 @@ log = logging.getLogger()

 def read_in_test_data(filename):
    """
-    Reads in test data file found at filename.
+    Reads in tab delimited test data file found at filename for training purposes.
+
    filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
-    returns the score and the text
+
+    Args:
+        filename (str): The path to the data
+
+    Return:
+        Tuple of the form (score, text), where:
+            The former is the list of scores assigned to the essays in the file (int)
+            The latter is the list of essays in the file
+
    """
    tid, e_set, score, score2, text = [], [], [], [], []
    combined_raw = open(filename).read()
@@ -45,8 +54,13 @@ def read_in_test_data(filename):

 def read_in_test_prompt(filename):
    """
-    Reads in the prompt from a text file
-    Returns string
+    Reads in the prompt from a file.
+
+    Args:
+        filename (str): the name of the file
+
+    Returns:
+        (str): the prompt as a string.
    """
    prompt_string = open(filename).read()
    return prompt_string
@@ -55,10 +69,16 @@ def read_in_test_prompt(filename):
 def read_in_test_data_twocolumn(filename, sep=","):
    """
    Reads in a two column version of the test data.
-    Filename must point to a delimited file.
+
    In filename, the first column should be integer score data.
    The second column should be string text data.
    Sep specifies the type of separator between fields.
+
+    Return:
+        Tuple of the form (score, text), where:
+            The former is the list of scores assigned to the essays in the file (int)
+            The latter is the list of essays in the file
+
    """
    score, text = [], []
    combined_raw = open(filename).read()
@@ -132,8 +152,7 @@ def get_algorithms(algorithm):
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
    return clf, clf2

-#TODO RENAME train_from_predictors
-def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
+def extract_features_and_generate_model_from_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Extracts features and generates predictors based on a given predictor set
    predictor_set - a PredictorSet object that has been initialized with data
@@ -162,40 +181,49 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
    return f, clf, cv_error_results


-def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression):
+def extract_features_and_generate_model(essays):
    """
    Feed in an essay set to get feature vector and classifier
-    essays must be an essay set object
-    additional array is an optional argument that can specify
-    a numpy array of values to add in
-    returns a trained FeatureExtractor object and a trained classifier
+
+    Args:
+        essays (EssaySet): The essay set to construct the feature extractor and model off of
+
+    Returns:
+        A tuple with the following elements in the following order:
+            - The Trained Feature extractor
+            - The Trained Classifier
+            - Any Cross Validation results
    """
-    f = feature_extractor.FeatureExtractor()
-    f.initialize_dictionaries(essays)
+    feat_extractor = feature_extractor.FeatureExtractor(essays)

-    train_feats = f.generate_features(essays)
+    features = feat_extractor.generate_features(essays)

    set_score = numpy.asarray(essays._score, dtype=numpy.int)
    algorithm = create.select_algorithm(set_score)

-    clf, clf2 = get_algorithms(algorithm)
+    predict_classifier, cv_error_classifier = get_algorithms(algorithm)

-    cv_error_results = get_cv_error(clf2, train_feats, essays._score)
+    cv_error_results = get_cv_error(cv_error_classifier, features, essays._score)

    try:
-        clf.fit(train_feats, set_score)
-    except ValueError:
+        predict_classifier.fit(features, set_score)
+    except:
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0] = 1
        set_score[1] = 0
-        clf.fit(train_feats, set_score)
+        predict_classifier.fit(features, set_score)

-    return f, clf, cv_error_results
+    return feat_extractor, predict_classifier, cv_error_results


 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
    """
    Writes out a model to a file.
+
+    Args:
+        prompt_string (str): The prompt for the set of essays
+        feature_ext (FeatureExtractor): a trained FeatureExtractor Object
+        classifier : a trained Classifier Object
    prompt string is a string containing the prompt
    feature_ext is a trained FeatureExtractor object
    classifier is a trained classifier

--- a/ease/predictor_set.py
+++ b/ease/predictor_set.py
@@ -83,7 +83,7 @@ class PredictorSet(object):
            except UnicodeError:
                raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))

-        # Create essay sets for textual features if needed
+        # Create essay sets for textual features
        # TODO Understand this logic and change it, I don't think it is right.
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):

--- a/ease/util_functions.py
+++ b/ease/util_functions.py
-# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
+"""
+Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+Requires aspell to be installed and added to the path
+
+The cleanup of this file is far beyond the scope of this hackathon --GBW--
+"""
 from fisher import pvalue

 aspell_path = "aspell"