Commit 07080d1e by gradyward

A first draft of changes to the predictor set.

Primarily stylistic changes... the real changes to this document still need to be done later.
parent 3ea456df
......@@ -141,7 +141,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
# Initialize a predictor set object that encapsulates all of the text and numeric predictors
try:
predictor = predictor_set.PredictorSet(essaytype="train")
predictor = predictor_set.PredictorSet(essay_type="train")
for i in xrange(0, len(numeric_values)):
predictor.add_row(numeric_values[i], textual_values[i], target[i])
except:
......
......@@ -132,7 +132,7 @@ def grade_generic(grader_data, numeric_features, textual_features):
#Try to find and load the model file
grader_set = predictor_set.PredictorSet(essaytype="test")
grader_set = predictor_set.PredictorSet(essay_type="test")
model, extractor = get_classifier_and_ext(grader_data)
......
import numpy
import nltk
"""
Defines a predictor set, which is a way of taking textual and numerical data and computing it into a format which
can be used by a ML algorithm to generate objects necessary to grade future essays.
"""
import sys
import random
import os
import logging
import essay_set
......@@ -17,84 +19,90 @@ log = logging.getLogger(__name__)
class PredictorSet(object):
    """
    Accumulates rows of numeric features, textual features and a target value.

    Every row added through add_row is stored on the instance, and each textual
    feature column is additionally fed into its own essay_set.EssaySet (one
    EssaySet per textual column, created when the first row is added).
    """
    # TODO This class is wildly incomplete.

    def __init__(self, essay_type="train"):
        """
        Instantiates a new predictor set, which will be used to place data into for classifier training.

        Args:
            essay_type (str): Either 'train' or 'test', indicating whether the essays are meant to be trained
                or are in test mode. If nothing (or anything we don't recognize) is specified, default to train.
        """
        if essay_type not in ("train", "test"):
            essay_type = "train"

        self._type = essay_type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
        """
        Validates one row of data and adds it to the predictor set.

        Args:
            numeric_features (list): Values convertible with float(). The list is converted IN PLACE, so the
                caller's list holds floats afterwards.
            textual_features (list): Text values. Each entry is replaced IN PLACE by its ASCII-encoded str
                form (characters that cannot be encoded are dropped).
            target (int, long, float): The target value associated with this row.

        Raises:
            util_functions.InputError: If an argument has the wrong type, if a feature list has a different
                length than the previously added row, or if an individual feature cannot be converted.
        """
        # Type input checking
        if not isinstance(target, (int, long, float)):
            raise self.log_error(target, "Argument target was not entered as a numeric value.")

        if not isinstance(numeric_features, list):
            raise self.log_error(numeric_features, "Argument numeric_features must be a list of numeric data.")

        if not isinstance(textual_features, list):
            raise self.log_error(textual_features, "Argument textual_features must be a list of textual data")

        # Make sure the feature sets we are trying to add are of the same length as previous sets
        if self._numeric_features:
            if len(numeric_features) != len(self._numeric_features[-1]):
                raise self.log_error(numeric_features, "Numeric features are an improper length.")

        if self._textual_features:
            if len(textual_features) != len(self._textual_features[-1]):
                raise self.log_error(textual_features, "Textual features are an improper length.")

        # Now check to see if text features and numeric features are individually of the right type
        for i, value in enumerate(numeric_features):
            try:
                numeric_features[i] = float(value)
            except (TypeError, ValueError):
                # float() raises ValueError for malformed strings and TypeError for unsupported types.
                raise self.log_error(numeric_features, "Numeric feature {0} not numeric.".format(value))

        for i, text in enumerate(textual_features):
            try:
                textual_features[i] = str(text.encode('ascii', 'ignore'))
            except (AttributeError, TypeError):
                # Non-string objects have no usable .encode(), which surfaces as AttributeError.
                raise self.log_error(textual_features, "Textual feature {0} not string.".format(text))
            except UnicodeError:
                raise self.log_error(textual_features, "Textual feature {0} could not be decoded.".format(text))

        # Create essay sets for textual features if needed (one per textual column, on the first row only)
        # TODO Understand this logic and change it, I don't think it is right.
        if not self._textual_features:
            self._essay_sets.extend(
                essay_set.EssaySet(essay_type=self._type) for _ in textual_features
            )

        # Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

        # Add targets
        self._target.append(target)

        # Add textual features to essay sets; the length check above keeps columns and essay sets aligned
        for column_set, text in zip(self._essay_sets, textual_features):
            column_set.add_essay(text, target)

    def log_error(self, error_name, error_message):
        """
        A helper method to avoid redundancy. Logs an error and returns it to be raised.

        Args:
            error_name: The offending value, passed as the first argument of util_functions.InputError.
            error_message (str): The message that is logged and attached to the error.

        Returns:
            util_functions.InputError: The constructed error. It is NOT raised here; the caller raises it.
        """
        log.exception(error_message)
        return util_functions.InputError(error_name, error_message)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment