Commit 4b8ef34b by gradyward

THE FIRST ROUND OF CULLING OF UNUSED FILES/METHODS

Began the process of destroying all unneeded functionality.

Ease Lite.
parent 279d3a0c
@@ -16,9 +16,7 @@ sys.path.append(one_up_path)
 #Import modules that are dependent on the base path
 import model_creator
 import util_functions
-import predictor_set
 from errors import *
-import predictor_extractor
 from datetime import datetime
 import json
@@ -112,62 +110,6 @@ def create(examples, scores, prompt_string, dump_data=False):
     return results
-def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
-    """
-    Constructs a model from a generic list of numeric values and text values.
-    Generates this through a predictor set, rather than an essay set.
-    Args:
-        numeric_values:
-        textual_values:
-        target:
-    Kwargs:
-        GBW DELETED KWARG ALGORITHM (it was never used)
-    """
-    # Selects the appropriate ML algorithm to use to train the classifier
-    algorithm = select_algorithm(target)
-    # Initialize a result dictionary to return.
-    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
-               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
-    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
-        msg = "Target, numeric features, and text features must all be the same length."
-        results['errors'].append(msg)
-        log.exception(msg)
-        return results
-    # Initialize a predictor set object that encapsulates all of the text and numeric predictors
-    try:
-        predictor = predictor_set.PredictorSet(essay_type="train")
-        for i in xrange(0, len(numeric_values)):
-            predictor.add_row(numeric_values[i], textual_values[i], target[i])
-    except:
-        msg = "predictor set creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-        return results
-    # Gets the features and classifiers from the essay set and computes the error
-    try:
-        feature_ext, classifier, cv_error_results = \
-            model_creator.extract_features_and_generate_model_from_predictors(predictor, algorithm)
-        results['cv_kappa'] = cv_error_results['kappa']
-        results['cv_mean_absolute_error'] = cv_error_results['mae']
-        results['feature_ext'] = feature_ext
-        results['classifier'] = classifier
-        results['success'] = True
-    except:
-        msg = "feature extraction and model creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-    return results
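For reference, the culled entry point was driven with three parallel lists. A minimal sketch of a pre-cull caller; the data values and the `import create` module path are illustrative assumptions:

```python
# Hypothetical caller of the removed create_generic (pre-cull API).
import create

# Three training rows: two numeric predictors and one textual predictor each.
numeric_values = [[3.0, 120.0], [1.0, 45.0], [4.0, 180.0]]
textual_values = [["A clear, well-supported answer."],
                  ["Too short."],
                  ["A thorough answer with examples and citations."]]
target = [2, 0, 3]

results = create.create_generic(numeric_values, textual_values, target)
if results['success']:
    # The trained artifacts came back directly in the dict.
    print(results['cv_kappa'], results['cv_mean_absolute_error'])
else:
    print(results['errors'])
```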
 def select_algorithm(score_list):
     """
     Decides whether to use regression or classification as the ML algorithm based on the number of unique scores
...
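The surviving `select_algorithm` docstring describes a unique-score heuristic. A standalone sketch of that decision follows; the cutoff of five unique score points is an assumption, since the actual body is collapsed above:

```python
# Sketch of the unique-score heuristic described in the docstring.
# The cutoff (5) is an assumption; the real code returns the constants in
# util_functions.AlgorithmTypes rather than these strings.
def select_algorithm_sketch(score_list):
    if len(set(score_list)) > 5:
        # Many distinct score points: model the score as a continuous value.
        return "regression"
    # Few distinct score points: treat each score as a class label.
    return "classification"

print(select_algorithm_sketch([0, 1, 2, 2, 1]))  # classification
```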
@@ -13,7 +13,6 @@ sys.path.append(base_path)
 #Depend on base path to be imported
 from essay_set import EssaySet
-import predictor_set
 import util_functions
 from errors import *
@@ -42,7 +41,6 @@ def grade(grader_data, submission):
         'score': The score the input essay was assigned by the classifier set
         'feedback': The feedback given by the classifier set
         'success': Whether or not the grading operation was a success
-        'confidence': A metric of the classifier's confidence in its result
     """
     # Initialize result dictionary
@@ -75,14 +73,6 @@ def grade(grader_data, submission):
         log.exception(error_message)
         results['errors'].append(error_message)
-    #Try to determine confidence level
-    try:
-        results['confidence'] = get_confidence_value(
-            grader_data['algorithm'], model, grader_features, results['score'], grader_data['score'])
-    except:
-        # If there is an error getting confidence, it is not a show-stopper/big deal, so just log the error
-        log.exception("Problem generating confidence value")
     # If we have errors above, we do not continue here, but return the dictionary of failure
     if len(results['errors']) > 0:
@@ -113,103 +103,6 @@ def grade(grader_data, submission):
     return results
-def grade_generic(grader_data, numeric_features, textual_features):
-    """
-    Grades the generic case of numeric and textual features using a generic prediction model.
-    grader_data (dict): contains keys (among others)
-        'algorithm': Type of algorithm used to score
-    numeric_features (list of float or int or long): A list of numeric features of the essay we are grading
-    textual_features (list of string): A list of textual features of the essay we are grading
-    Returns:
-        (dict) with the following keys:
-            'errors': All of the errors that arose during the grading process.
-            'tests':
-            'score': The score the input essay was assigned by the classifier set
-            'success': Whether or not the grading operation was a success
-            'confidence': A metric of the classifier's confidence in its result
-    """
-    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
-    # Create a predictor set which will carry the information as we grade it.
-    grader_set = predictor_set.PredictorSet(essay_type="test")
-    # Finds the appropriate predictor and model to use
-    model, extractor = get_classifier_and_extractor(grader_data)
-    # Try to add data to predictor set that we are going to be grading
-    try:
-        grader_set.add_row(numeric_features, textual_features, 0)
-    except:
-        error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
-        log.exception(error_msg)
-        results['errors'].append(error_msg)
-    # Try to extract features from submission and assign score via the model
-    try:
-        grader_feats = extractor.generate_features(grader_set)
-        results['score'] = model.predict(grader_feats)[0]
-    except:
-        error_msg = "Could not extract features and score essay."
-        log.exception(error_msg)
-        results['errors'].append(error_msg)
-    # Try to determine confidence level
-    try:
-        results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
-    except:
-        # If there is an error getting confidence, it is not a show-stopper, so just log
-        log.exception("Problem generating confidence value")
-    # If we didn't run into an error, we were successful
-    if len(results['errors']) == 0:
-        results['success'] = True
-    return results
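A sketch of how the removed `grade_generic` was invoked; `grader_data` is assumed to be the dict produced by the matching create step, and the feature values are illustrative:

```python
# Hypothetical caller of the removed grade_generic (pre-cull API).
# grader_data is assumed to come from the matching training run.
import grade

numeric_features = [2.0, 96.0]                        # same arity as at training time
textual_features = ["A concise but complete answer."]

results = grade.grade_generic(grader_data, numeric_features, textual_features)
if results['success']:
    print("score:", results['score'], "confidence:", results['confidence'])
else:
    print("errors:", results['errors'])
```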
-def get_confidence_value(algorithm, model, grader_features, score, scores):
-    """
-    Determines the confidence level for a specific grade given to a specific essay.
-    Args:
-        algorithm: one of the two from util_functions.AlgorithmTypes
-        model: A trained model for classification
-        grader_features: A dictionary describing the grading task
-        score: The score assigned to this problem
-        scores: All scores assigned to this problem for all submissions (not just this one)
-    NOTE: For our current intents and purposes, this value is not useful, and will be removed later on.
-    Returns:
-        Ideally: A value between 0 and 1 reflecting the normalized probability confidence in the grade assigned.
-        Actually: A numerical value with no weight reflecting an arbitrary degree of confidence.
-    """
-    min_score = min(numpy.asarray(scores))
-    # If our algorithm is classification:
-    if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
-        # If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence = model.predict_proba(grader_features)[0, (float(score) - float(min_score))]
-        # The intent was to normalize confidence here, but it was never done, so it remains as such.
-        confidence = raw_confidence
-    # Otherwise, if our algorithm is prediction
-    elif hasattr(model, "predict"):
-        raw_confidence = model.predict(grader_features)[0]
-        confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)),
-                         math.ceil(float(raw_confidence)) - float(raw_confidence))
-    # Otherwise, we have no confidence, because we have no grading mechanism
-    else:
-        confidence = 0
-    return confidence
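The regression branch's confidence trick (distance from the raw prediction to its adjacent integer score points) is easiest to see with a concrete value; a self-contained sketch:

```python
import math

# Regression-branch confidence from the removed helper: take the larger of
# the two distances from the raw prediction to the adjacent integers.
raw = 2.3
confidence = max(raw - math.floor(raw),   # 0.3 above the floor (2)
                 math.ceil(raw) - raw)    # 0.7 below the ceiling (3)
print(confidence)  # 0.7 -- being a max(), the value never falls below 0.5
                   # for non-integer predictions, one reason the docstring
                   # calls it arbitrary.
```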
 def get_classifier_and_extractor(grader_data):
     """
     Finds the classifier and extractor from a completed training operation in order to perform the grading operation.
...
@@ -17,7 +17,6 @@ from essay_set import EssaySet
 import util_functions
 import feature_extractor
 import logging
-import predictor_extractor
 import create
 log = logging.getLogger()
@@ -99,15 +98,15 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     Prompt string is just a string containing the essay prompt.
     Generate_additional indicates whether to generate additional essays at the minimum score point or not.
     """
-    x = EssaySet()
+    essay_set = EssaySet()
     for i in xrange(0, len(text)):
-        x.add_essay(text[i], score[i])
+        essay_set.add_essay(text[i], score[i])
         if score[i] == min(score) and generate_additional == True:
-            x.generate_additional_essays(x._cleaned_spelled_essays[len(x._cleaned_spelled_essays) - 1], score[i])
-    x.update_prompt(prompt_string)
-    return x
+            essay_set.generate_additional_essays(essay_set._cleaned_spelled_essays[len(essay_set._cleaned_spelled_essays) - 1], score[i])
+    essay_set.update_prompt(prompt_string)
+    return essay_set
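A short usage sketch of the renamed helper, assuming parallel lists of essay texts and integer scores; all names and values here are illustrative:

```python
# Driving the renamed create_essay_set with toy data.
texts = ["First essay text...", "Second essay text...", "Third essay text..."]
scores = [1, 0, 2]

essay_set = create_essay_set(texts, scores, "Explain the water cycle.")
# Essays at the minimum score point (0 here) get synthetic variants appended
# via generate_additional_essays, padding out the sparsest score class.
```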
 def get_cv_error(clf, feats, scores):
@@ -181,7 +180,7 @@ def extract_features_and_generate_model_from_predictors(predictor_set, algorithm
     return f, clf, cv_error_results
-def extract_features_and_generate_model(essays):
+def extract_features_and_generate_model(essay_set):
     """
     Feed in an essay set to get feature vector and classifier
@@ -194,16 +193,16 @@ def extract_features_and_generate_model(essays):
         - The Trained Classifier
         - Any Cross Validation results
     """
-    feat_extractor = feature_extractor.FeatureExtractor(essays)
-    features = feat_extractor.generate_features(essays)
-    set_score = numpy.asarray(essays._score, dtype=numpy.int)
+    feat_extractor = feature_extractor.FeatureExtractor(essay_set)
+    features = feat_extractor.generate_features(essay_set)
+    set_score = numpy.asarray(essay_set._score, dtype=numpy.int)
     algorithm = create.select_algorithm(set_score)
     predict_classifier, cv_error_classifier = get_algorithms(algorithm)
-    cv_error_results = get_cv_error(cv_error_classifier, features, essays._score)
+    cv_error_results = get_cv_error(cv_error_classifier, features, essay_set._score)
     try:
         predict_classifier.fit(features, set_score)
...
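The body of `get_cv_error` is collapsed, but its role in the flow above is to score the second classifier by cross-validation. An illustrative stand-in using modern scikit-learn names; the real code predates this API and computes kappa via `util_functions`:

```python
import numpy
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from sklearn.model_selection import cross_val_predict

def cv_error_sketch(clf, feats, scores):
    # Held-out predictions for every row, then the two summary errors that
    # create() reports as 'cv_kappa' and 'cv_mean_absolute_error'.
    scores = numpy.asarray(scores)
    predicted = numpy.round(cross_val_predict(clf, feats, scores, cv=3))
    return {'kappa': cohen_kappa_score(scores, predicted.astype(int)),
            'mae': mean_absolute_error(scores, predicted)}
```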
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
"""
Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
"""
def __init__(self, predictor_set):
"""
Initializes dictionaries with the textual inputs in the PredictorSet object
Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
Args:
predictor_set (PredictorSet): PredictorSet object that has had data fed to it
"""
if not (hasattr(predictor_set, '_type')):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
if not (predictor_set._type == "train"):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
div_length = len(predictor_set._essay_sets)
if div_length == 0:
div_length = 1
self._extractors = []
# Ensures that even with a large amount of input textual features, training time will stay reasonable
max_features_pass_2 = int(math.floor(200 / div_length))
for i in xrange(0, len(predictor_set._essay_sets)):
self._extractors.append(FeatureExtractor(predictor_set._essay_sets[i]))
self._initialized = True
def generate_features(self, predictor_set):
"""
Generates features given a predictor set containing the essays/data we want to extract from
Args:
predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
Returns:
an array of features
"""
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(predictor_set, error_message)
textual_features = []
# Generates features by using the generate_features method from the essay set class
for i in xrange(0, len(predictor_set._essay_sets)):
textual_features.append(
self._extractors[i].generate_features(predictor_set._essay_sets[i])
)
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(predictor_set._numeric_features)
# Originally there were two calls here to print the shape of the feature matricies. GBW didn't think this was
# appropriate, and deleted them.
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
return overall_matrix.copy()
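The matrix assembly at the end of the deleted `generate_features` is plain column-wise concatenation; a self-contained sketch with toy shapes:

```python
import numpy

# One feature block per textual predictor, then the raw numeric predictors
# appended on the right, as in the deleted generate_features.
textual_blocks = [numpy.ones((3, 4)), numpy.zeros((3, 2))]  # two text blocks, 3 rows each
numeric_rows = [[3.0, 120.0], [1.0, 45.0], [4.0, 180.0]]    # 3 rows, 2 numeric columns

textual_matrix = numpy.concatenate(textual_blocks, axis=1)  # shape (3, 6)
predictor_matrix = numpy.array(numeric_rows)                # shape (3, 2)
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
print(overall_matrix.shape)  # (3, 8): one full feature vector per submission
```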
"""
Defines a predictor set, which is a way of taking textual and numerical data and computing it into a format which
can be used by a ML algorithm to generate objects necessary to grade future essays.
"""
import sys
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorSet(object):
"""
The Predictor Set Class
"""
# TODO This class is wildly incomplete.
def __init__(self, essay_type="train"):
"""
Instantiates a new predictor set, which will be used to place data into for classifier training.
Args:
essay_type (str): Either 'train' or 'test', indicating whether the essays are meant to be trained or ar
in test mode. If nothing (or anything we don't recognize) is specified, default to train.
"""
if essay_type != "train" and essay_type != "test":
essay_type = "train"
self._type = essay_type
self._target = []
self._textual_features = []
self._numeric_features = []
self._essay_sets = []
def add_row(self, numeric_features, textual_features, target):
"""
Adds a row to the Predictor set from numeric_features, textual_features, and a target.
"""
#TODO This docstring
# Type input checking
if not isinstance(target, (int, long, float)):
raise log_error(target, "Argument target was not entered as a numeric value.")
if not isinstance(numeric_features, list):
raise log_error(numeric_features, "Argument numeric_features must be a list of numeric data.")
if not isinstance(textual_features, list):
raise log_error(textual_features, "Argument textual_features must be a list of textual data")
# Make sure the feature sets we are trying to add are of the same length as previous sets
if len(self._numeric_features) > 0:
current_numeric_length = len(self._numeric_features[-1])
if len(numeric_features) != current_numeric_length:
raise log_error(numeric_features, "Numeric features are an improper length.")
if len(self._textual_features) > 0:
current_textual_length = len(self._textual_features[-1])
if len(textual_features) != current_textual_length:
raise log_error(textual_features, "Textual features are an improper length.")
# Now check to see if text features and numeric features are individually of the right type
for i in xrange(0, len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except TypeError:
raise log_error(numeric_features, "Numeric feature {0} not numeric.".format(numeric_features[i]))
for i in xrange(0, len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except TypeError:
raise log_error(textual_features, "Textual feature {0} not numeric.".format(textual_features[i]))
except UnicodeError:
raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))
# Create essay sets for textual features
# TODO Understand this logic and change it, I don't think it is right.
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(essay_type=self._type))
# Add numeric and textual features
self._numeric_features.append(numeric_features)
self._textual_features.append(textual_features)
# Add targets
self._target.append(target)
# Add textual features to essay sets
for i in xrange(0, len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
def log_error(self, error_name, error_message):
"""
A helper method to avoid redundancy. Logs an error and returns it to be raised.
"""
log.exception(error_message)
return util_functions.InputError(error_name, error_message)
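Finally, a sketch of feeding the deleted PredictorSet, showing the arity checks in `add_row` doing their work; the module path and data are illustrative, pre-cull assumptions:

```python
# Exercising the deleted PredictorSet's validation (pre-cull API).
import predictor_set

ps = predictor_set.PredictorSet(essay_type="train")
ps.add_row([3.0, 120.0], ["A clear, well-supported answer."], 2)
ps.add_row([1.0, 45.0], ["Too short."], 0)

# A row with the wrong numeric arity raises the InputError built by log_error.
try:
    ps.add_row([1.0], ["Wrong numeric arity."], 1)
except Exception as exc:
    print(exc)
```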