A first draft of changes to the predictor set.

Primarily stylistic changes... the real changes still need to be made to this document later.

A first draft of changes to the predictor set.
Primarily stylistic changes... the real changes still need to be made to this document later.
07080d1e · gradyward · 3ea456df · 07080d1e · 07080d1e · 07080d1e
Commit 07080d1e authored Jun 12, 2014 by gradyward
Show whitespace changes
Inline Side-by-side

Showing with 55 additions and 47 deletions

ease/create.py
+1 -1

ease/grade.py
+1 -1

ease/predictor_set.py
+53 -45

No files found.
--- a/ease/create.py
+++ b/ease/create.py
@@ -141,7 +141,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi

    # Initialize a predictor set object that encapsulates all of the text and numeric predictors
    try:
-        predictor = predictor_set.PredictorSet(essaytype="train")
+        predictor = predictor_set.PredictorSet(essay_type="train")
        for i in xrange(0, len(numeric_values)):
            predictor.add_row(numeric_values[i], textual_values[i], target[i])
    except:

--- a/ease/grade.py
+++ b/ease/grade.py
@@ -132,7 +132,7 @@ def grade_generic(grader_data, numeric_features, textual_features):

    #Try to find and load the model file

-    grader_set = predictor_set.PredictorSet(essaytype="test")
+    grader_set = predictor_set.PredictorSet(essay_type="test")

    model, extractor = get_classifier_and_ext(grader_data)


--- a/ease/predictor_set.py
+++ b/ease/predictor_set.py
-import numpy
-import nltk
+"""
+Defines a predictor set, which is a way of taking textual and numerical data and computing it into a format which
+can be used by a ML algorithm to generate objects necessary to grade future essays.
+"""
+
 import sys
-import random
 import os
 import logging
 import essay_set
@@ -17,84 +19,90 @@ log = logging.getLogger(__name__)


 class PredictorSet(object):
-    def __init__(self, essaytype="train"):
    """
-        Initialize variables and check essay set type
+    The Predictor Set Class
+    """
+    # TODO This class is wildly incomplete.
+
+    def __init__(self, essay_type="train"):
+        """
+        Instantiates a new predictor set, which will be used to place data into for classifier training.
+
+        Args:
+            essay_type (str): Either 'train' or 'test', indicating whether the essays are meant to be trained or ar
+                            in test mode.  If nothing (or anything we don't recognize) is specified, default to train.
        """
-        if (essaytype != "train" and essaytype != "test"):
-            essaytype = "train"
+        if essay_type != "train" and essay_type != "test":
+            essay_type = "train"

-        self._type = essaytype
+        self._type = essay_type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
-        # Basic input checking
+        """
+        Adds a row to the Predictor set from numeric_features, textual_features, and a target.
+        """
+        #TODO This docstring
+
+        # Type input checking
        if not isinstance(target, (int, long, float)):
-            error_message = "Target is not a numeric value."
-            log.exception(error_message)
-            raise util_functions.InputError(target, error_message)
+            raise log_error(target, "Argument target was not entered as a numeric value.")

        if not isinstance(numeric_features, list):
-            error_message = "Numeric features are not a list."
-            log.exception(error_message)
-            raise util_functions.InputError(numeric_features, error_message)
+            raise log_error(numeric_features, "Argument numeric_features must be a list of numeric data.")

        if not isinstance(textual_features, list):
-            error_message = "Textual features are not a list."
-            log.exception(error_message)
-            raise util_functions.InputError(textual_features, error_message)
+            raise log_error(textual_features, "Argument textual_features must be a list of textual data")

-        #Do some length checking for parameters
+        # Make sure the feature sets we are trying to add are of the same length as previous sets
        if len(self._numeric_features) > 0:
-            numeric_length = len(self._numeric_features[-1])
-            current_numeric_length = len(numeric_features)
-            if numeric_length != current_numeric_length:
-                error_message = "Numeric features are an improper length."
-                log.exception(error_message)
-                raise util_functions.InputError(numeric_features, error_message)
+            current_numeric_length = len(self._numeric_features[-1])
+            if len(numeric_features) != current_numeric_length:
+                raise log_error(numeric_features, "Numeric features are an improper length.")

        if len(self._textual_features) > 0:
-            textual_length = len(self._textual_features[-1])
-            current_textual_length = len(textual_features)
-            if textual_length != current_textual_length:
-                error_message = "Textual features are an improper length."
-                log.exception(error_message)
-                raise util_functions.InputError(textual_features, error_message)
-
-        #Now check to see if text features and numeric features are individually correct
+            current_textual_length = len(self._textual_features[-1])
+            if len(textual_features) != current_textual_length:
+                raise log_error(textual_features, "Textual features are an improper length.")

+        # Now check to see if text features and numeric features are individually of the right type
        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
-            except:
-                error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
-                log.exception(error_message)
-                raise util_functions.InputError(numeric_features, error_message)
+            except TypeError:
+                raise log_error(numeric_features, "Numeric feature {0} not numeric.".format(numeric_features[i]))

        for i in xrange(0, len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
-            except:
-                error_message = "Textual feature {0} not string.".format(textual_features[i])
-                log.exception(error_message)
-                raise util_functions.InputError(textual_features, error_message)
+            except TypeError:
+                raise log_error(textual_features, "Textual feature {0} not numeric.".format(textual_features[i]))
+            except UnicodeError:
+                raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))

-        #Create essay sets for textual features if needed
+        # Create essay sets for textual features if needed
+        # TODO Understand this logic and change it, I don't think it is right.
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(essay_type=self._type))

-        #Add numeric and textual features
+        # Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

-        #Add targets
+        # Add targets
        self._target.append(target)

-        #Add textual features to essay sets
+        # Add textual features to essay sets
        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)

+def log_error(self, error_name, error_message):
+    """
+    A helper method to avoid redundancy.  Logs an error and returns it to be raised.
+    """
+    log.exception(error_message)
+    return util_functions.InputError(error_name, error_message)