Commit 4b8ef34b by gradyward

THE FIRST ROUND OF CULLING OF UNUSED FILES/METHODS

Began the process of destroying all unneeded functionality.

Ease Lite.
parent 279d3a0c
@@ -16,9 +16,7 @@ sys.path.append(one_up_path)
 #Import modules that are dependent on the base path
 import model_creator
 import util_functions
-import predictor_set
 from errors import *
-import predictor_extractor
 from datetime import datetime
 import json
@@ -112,62 +110,6 @@ def create(examples, scores, prompt_string, dump_data=False):
     return results
-def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
-    """
-    Constructs a model from a generic list of numeric values and text values.
-    Generates this through a predictor set, rather than an essay set.
-    Args:
-        numeric_values:
-        textual_values:
-        target:
-    Kwargs:
-        GBW DELETED KWARG ALGORITHM (it was never used)
-    """
-    # Selects the appropriate ML algorithm to use to train the classifier
-    algorithm = select_algorithm(target)
-    # Initialize a result dictionary to return.
-    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
-               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
-    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
-        msg = "Target, numeric features, and text features must all be the same length."
-        results['errors'].append(msg)
-        log.exception(msg)
-        return results
-    # Initialize a predictor set object that encapsulates all of the text and numeric predictors
-    try:
-        predictor = predictor_set.PredictorSet(essay_type="train")
-        for i in xrange(0, len(numeric_values)):
-            predictor.add_row(numeric_values[i], textual_values[i], target[i])
-    except:
-        msg = "predictor set creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-        return results
-    # Gets the features and classifiers from the essay set and computes the error
-    try:
-        feature_ext, classifier, cv_error_results = \
-            model_creator.extract_features_and_generate_model_from_predictors(predictor, algorithm)
-        results['cv_kappa'] = cv_error_results['kappa']
-        results['cv_mean_absolute_error'] = cv_error_results['mae']
-        results['feature_ext'] = feature_ext
-        results['classifier'] = classifier
-        results['success'] = True
-    except:
-        msg = "feature extraction and model creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-    return results
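For reference, the culled entry point was driven with three parallel lists. A minimal sketch of a pre-cull caller; the data values and the `import create` module path are illustrative assumptions:

```python
# Hypothetical caller of the removed create_generic (pre-cull API).
import create

# Three training rows: two numeric predictors and one textual predictor each.
numeric_values = [[3.0, 120.0], [1.0, 45.0], [4.0, 180.0]]
textual_values = [["A clear, well-supported answer."],
                  ["Too short."],
                  ["A thorough answer with examples and citations."]]
target = [2, 0, 3]

results = create.create_generic(numeric_values, textual_values, target)
if results['success']:
    # The trained artifacts came back directly in the dict.
    print(results['cv_kappa'], results['cv_mean_absolute_error'])
else:
    print(results['errors'])
```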
 def select_algorithm(score_list):
     """
     Decides whether to use regression or classification as the ML algorithm based on the number of unique scores
...
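The surviving `select_algorithm` docstring describes a unique-score heuristic. A standalone sketch of that decision follows; the cutoff of five unique score points is an assumption, since the actual body is collapsed above:

```python
# Sketch of the unique-score heuristic described in the docstring.
# The cutoff (5) is an assumption; the real code returns the constants in
# util_functions.AlgorithmTypes rather than these strings.
def select_algorithm_sketch(score_list):
    if len(set(score_list)) > 5:
        # Many distinct score points: model the score as a continuous value.
        return "regression"
    # Few distinct score points: treat each score as a class label.
    return "classification"

print(select_algorithm_sketch([0, 1, 2, 2, 1]))  # classification
```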
@@ -13,7 +13,6 @@ sys.path.append(base_path)
 #Depend on base path to be imported
 from essay_set import EssaySet
-import predictor_set
 import util_functions
 from errors import *
@@ -42,7 +41,6 @@ def grade(grader_data, submission):
         'score': The score the input essay was assigned by the classifier set
         'feedback': The feedback given by the classifier set
         'success': Whether or not the grading operation was a success
-        'confidence': A metric of the classifier's confidence in its result
     """
     # Initialize result dictionary
@@ -75,14 +73,6 @@ def grade(grader_data, submission):
         log.exception(error_message)
         results['errors'].append(error_message)
-    #Try to determine confidence level
-    try:
-        results['confidence'] = get_confidence_value(
-            grader_data['algorithm'], model, grader_features, results['score'], grader_data['score'])
-    except:
-        # If there is an error getting confidence, it is not a show-stopper/big deal, so just log the error
-        log.exception("Problem generating confidence value")
     # If we have errors above, we do not continue here, but return the dictionary of failure
     if len(results['errors']) > 0:
@@ -113,103 +103,6 @@ def grade(grader_data, submission):
     return results
-def grade_generic(grader_data, numeric_features, textual_features):
-    """
-    Grades the generic case of numeric and textual features using a generic prediction model.
-    grader_data (dict): contains keys (among others)
-        'algorithm': Type of algorithm used to score
-    numeric_features (list of float or int or long): A list of numeric features of the essay we are grading
-    textual_features (list of string): A list of textual features of the essay we are grading
-    Returns:
-        (dict) with the following keys:
-            'errors': All of the errors that arose during the grading process.
-            'tests':
-            'score': The score the input essay was assigned by the classifier set
-            'success': Whether or not the grading operation was a success
-            'confidence': A metric of the classifier's confidence in its result
-    """
-    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
-    # Create a predictor set which will carry the information as we grade it.
-    grader_set = predictor_set.PredictorSet(essay_type="test")
-    # Finds the appropriate predictor and model to use
-    model, extractor = get_classifier_and_extractor(grader_data)
-    # Try to add data to predictor set that we are going to be grading
-    try:
-        grader_set.add_row(numeric_features, textual_features, 0)
-    except:
-        error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
-        log.exception(error_msg)
-        results['errors'].append(error_msg)
-    # Try to extract features from submission and assign score via the model
-    try:
-        grader_feats = extractor.generate_features(grader_set)
-        results['score'] = model.predict(grader_feats)[0]
-    except:
-        error_msg = "Could not extract features and score essay."
-        log.exception(error_msg)
-        results['errors'].append(error_msg)
-    # Try to determine confidence level
-    try:
-        results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
-    except:
-        # If there is an error getting confidence, it is not a show-stopper, so just log
-        log.exception("Problem generating confidence value")
-    # If we didn't run into an error, we were successful
-    if len(results['errors']) == 0:
-        results['success'] = True
-    return results
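A sketch of how the removed `grade_generic` was invoked; `grader_data` is assumed to be the dict produced by the matching create step, and the feature values are illustrative:

```python
# Hypothetical caller of the removed grade_generic (pre-cull API).
# grader_data is assumed to come from the matching training run.
import grade

numeric_features = [2.0, 96.0]                        # same arity as at training time
textual_features = ["A concise but complete answer."]

results = grade.grade_generic(grader_data, numeric_features, textual_features)
if results['success']:
    print("score:", results['score'], "confidence:", results['confidence'])
else:
    print("errors:", results['errors'])
```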
-def get_confidence_value(algorithm, model, grader_features, score, scores):
-    """
-    Determines the confidence level for a specific grade given to a specific essay.
-    Args:
-        algorithm: one of the two from util_functions.AlgorithmTypes
-        model: A trained model for classification
-        grader_features: A dictionary describing the grading task
-        score: The score assigned to this problem
-        scores: All scores assigned to this problem for all submissions (not just this one)
-    NOTE: For our current intents and purposes, this value is not useful, and will be removed later on.
-    Returns:
-        Ideally: A value between 0 and 1 reflecting the normalized probability confidence in the grade assigned.
-        Actually: A numerical value with no weight reflecting an arbitrary degree of confidence.
-    """
-    min_score = min(numpy.asarray(scores))
-    # If our algorithm is classification:
-    if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
-        # If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence = model.predict_proba(grader_features)[0, (float(score) - float(min_score))]
-        # The intent was to normalize confidence here, but it was never done, so it remains as such.
-        confidence = raw_confidence
-    # Otherwise, if our algorithm is prediction
-    elif hasattr(model, "predict"):
-        raw_confidence = model.predict(grader_features)[0]
-        confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)),
-                         math.ceil(float(raw_confidence)) - float(raw_confidence))
-    # Otherwise, we have no confidence, because we have no grading mechanism
-    else:
-        confidence = 0
-    return confidence
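The regression branch's confidence trick (distance from the raw prediction to its adjacent integer score points) is easiest to see with a concrete value; a self-contained sketch:

```python
import math

# Regression-branch confidence from the removed helper: take the larger of
# the two distances from the raw prediction to the adjacent integers.
raw = 2.3
confidence = max(raw - math.floor(raw),   # 0.3 above the floor (2)
                 math.ceil(raw) - raw)    # 0.7 below the ceiling (3)
print(confidence)  # 0.7 -- being a max(), the value never falls below 0.5
                   # for non-integer predictions, one reason the docstring
                   # calls it arbitrary.
```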
 def get_classifier_and_extractor(grader_data):
     """
     Finds the classifier and extractor from a completed training operation in order to perform the grading operation.
...
@@ -17,7 +17,6 @@ from essay_set import EssaySet
 import util_functions
 import feature_extractor
 import logging
-import predictor_extractor
 import create
 log = logging.getLogger()
@@ -99,15 +98,15 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     Prompt string is just a string containing the essay prompt.
     Generate_additional indicates whether to generate additional essays at the minimum score point or not.
     """
-    x = EssaySet()
+    essay_set = EssaySet()
     for i in xrange(0, len(text)):
-        x.add_essay(text[i], score[i])
+        essay_set.add_essay(text[i], score[i])
         if score[i] == min(score) and generate_additional == True:
-            x.generate_additional_essays(x._cleaned_spelled_essays[len(x._cleaned_spelled_essays) - 1], score[i])
-    x.update_prompt(prompt_string)
-    return x
+            essay_set.generate_additional_essays(essay_set._cleaned_spelled_essays[len(essay_set._cleaned_spelled_essays) - 1], score[i])
+    essay_set.update_prompt(prompt_string)
+    return essay_set
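A short usage sketch of the renamed helper, assuming parallel lists of essay texts and integer scores; all names and values here are illustrative:

```python
# Driving the renamed create_essay_set with toy data.
texts = ["First essay text...", "Second essay text...", "Third essay text..."]
scores = [1, 0, 2]

essay_set = create_essay_set(texts, scores, "Explain the water cycle.")
# Essays at the minimum score point (0 here) get synthetic variants appended
# via generate_additional_essays, padding out the sparsest score class.
```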
 def get_cv_error(clf, feats, scores):
@@ -181,7 +180,7 @@ def extract_features_and_generate_model_from_predictors(predictor_set, algorithm
     return f, clf, cv_error_results
-def extract_features_and_generate_model(essays):
+def extract_features_and_generate_model(essay_set):
     """
     Feed in an essay set to get feature vector and classifier
@@ -194,16 +193,16 @@ def extract_features_and_generate_model(essays):
         - The Trained Classifier
         - Any Cross Validation results
     """
-    feat_extractor = feature_extractor.FeatureExtractor(essays)
-    features = feat_extractor.generate_features(essays)
-    set_score = numpy.asarray(essays._score, dtype=numpy.int)
+    feat_extractor = feature_extractor.FeatureExtractor(essay_set)
+    features = feat_extractor.generate_features(essay_set)
+    set_score = numpy.asarray(essay_set._score, dtype=numpy.int)
     algorithm = create.select_algorithm(set_score)
     predict_classifier, cv_error_classifier = get_algorithms(algorithm)
-    cv_error_results = get_cv_error(cv_error_classifier, features, essays._score)
+    cv_error_results = get_cv_error(cv_error_classifier, features, essay_set._score)
     try:
         predict_classifier.fit(features, set_score)
...
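The body of `get_cv_error` is collapsed, but its role in the flow above is to score the second classifier by cross-validation. An illustrative stand-in using modern scikit-learn names; the real code predates this API and computes kappa via `util_functions`:

```python
import numpy
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from sklearn.model_selection import cross_val_predict

def cv_error_sketch(clf, feats, scores):
    # Held-out predictions for every row, then the two summary errors that
    # create() reports as 'cv_kappa' and 'cv_mean_absolute_error'.
    scores = numpy.asarray(scores)
    predicted = numpy.round(cross_val_predict(clf, feats, scores, cv=3))
    return {'kappa': cohen_kappa_score(scores, predicted.astype(int)),
            'mae': mean_absolute_error(scores, predicted)}
```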
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
"""
Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
"""
def __init__(self, predictor_set):
"""
Initializes dictionaries with the textual inputs in the PredictorSet object
Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
Args:
predictor_set (PredictorSet): PredictorSet object that has had data fed to it
"""
if not (hasattr(predictor_set, '_type')):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
if not (predictor_set._type == "train"):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
div_length = len(predictor_set._essay_sets)
if div_length == 0:
div_length = 1
self._extractors = []
# Ensures that even with a large amount of input textual features, training time will stay reasonable
max_features_pass_2 = int(math.floor(200 / div_length))
for i in xrange(0, len(predictor_set._essay_sets)):
self._extractors.append(FeatureExtractor(predictor_set._essay_sets[i]))
self._initialized = True
def generate_features(self, predictor_set):
"""
Generates features given a predictor set containing the essays/data we want to extract from
Args:
predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
Returns:
an array of features
"""
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
textual_features = []
# Generates features by using the generate_features method from the essay set class
for i in xrange(0, len(predictor_set._essay_sets)):
textual_features.append(
self._extractors[i].generate_features(predictor_set._essay_sets[i])
)
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(predictor_set._numeric_features)
# Originally there were two calls here to print the shape of the feature matricies. GBW didn't think this was
# appropriate, and deleted them.
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
return overall_matrix.copy()
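The matrix assembly at the end of the deleted `generate_features` is plain column-wise concatenation; a self-contained sketch with toy shapes:

```python
import numpy

# One feature block per textual predictor, then the raw numeric predictors
# appended on the right, as in the deleted generate_features.
textual_blocks = [numpy.ones((3, 4)), numpy.zeros((3, 2))]  # two text blocks, 3 rows each
numeric_rows = [[3.0, 120.0], [1.0, 45.0], [4.0, 180.0]]    # 3 rows, 2 numeric columns

textual_matrix = numpy.concatenate(textual_blocks, axis=1)  # shape (3, 6)
predictor_matrix = numpy.array(numeric_rows)                # shape (3, 2)
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
print(overall_matrix.shape)  # (3, 8): one full feature vector per submission
```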
"""
Defines a predictor set, which is a way of taking textual and numerical data and computing it into a format which
can be used by a ML algorithm to generate objects necessary to grade future essays.
"""
import sys
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorSet(object):
"""
The Predictor Set Class
"""
# TODO This class is wildly incomplete.
def __init__(self, essay_type="train"):
"""
Instantiates a new predictor set, which will be used to place data into for classifier training.
Args:
essay_type (str): Either 'train' or 'test', indicating whether the essays are meant to be trained or ar
in test mode. If nothing (or anything we don't recognize) is specified, default to train.
"""
if essay_type != "train" and essay_type != "test":
essay_type = "train"
self._type = essay_type
self._target = []
self._textual_features = []
self._numeric_features = []
self._essay_sets = []
def add_row(self, numeric_features, textual_features, target):
"""
Adds a row to the Predictor set from numeric_features, textual_features, and a target.
"""
#TODO This docstring
# Type input checking
if not isinstance(target, (int, long, float)):
raise log_error(target, "Argument target was not entered as a numeric value.")
if not isinstance(numeric_features, list):
raise log_error(numeric_features, "Argument numeric_features must be a list of numeric data.")
if not isinstance(textual_features, list):
raise log_error(textual_features, "Argument textual_features must be a list of textual data")
# Make sure the feature sets we are trying to add are of the same length as previous sets
if len(self._numeric_features) > 0:
current_numeric_length = len(self._numeric_features[-1])
if len(numeric_features) != current_numeric_length:
raise log_error(numeric_features, "Numeric features are an improper length.")
if len(self._textual_features) > 0:
current_textual_length = len(self._textual_features[-1])
if len(textual_features) != current_textual_length:
raise log_error(textual_features, "Textual features are an improper length.")
# Now check to see if text features and numeric features are individually of the right type
for i in xrange(0, len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except TypeError:
raise log_error(numeric_features, "Numeric feature {0} not numeric.".format(numeric_features[i]))
for i in xrange(0, len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except TypeError:
raise log_error(textual_features, "Textual feature {0} not numeric.".format(textual_features[i]))
except UnicodeError:
raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))
# Create essay sets for textual features
# TODO Understand this logic and change it, I don't think it is right.
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(essay_type=self._type))
# Add numeric and textual features
self._numeric_features.append(numeric_features)
self._textual_features.append(textual_features)
# Add targets
self._target.append(target)
# Add textual features to essay sets
for i in xrange(0, len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
def log_error(self, error_name, error_message):
"""
A helper method to avoid redundancy. Logs an error and returns it to be raised.
"""
log.exception(error_message)
return util_functions.InputError(error_name, error_message)
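Finally, a sketch of feeding the deleted PredictorSet, showing the arity checks in `add_row` doing their work; the module path and data are illustrative, pre-cull assumptions:

```python
# Exercising the deleted PredictorSet's validation (pre-cull API).
import predictor_set

ps = predictor_set.PredictorSet(essay_type="train")
ps.add_row([3.0, 120.0], ["A clear, well-supported answer."], 2)
ps.add_row([1.0, 45.0], ["Too short."], 0)

# A row with the wrong numeric arity raises the InputError built by log_error.
try:
    ps.add_row([1.0], ["Wrong numeric arity."], 1)
except Exception as exc:
    print(exc)
```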