Commit b32d5674 by gradyward

Changed the predictor_extractor.py file, adding clarity and integrating previous changes.

parent 8bc96cb8
@@ -299,10 +299,10 @@ class FeatureExtractor(object):
         """
         #TODO This is still bad.
-        #Set ratio to modify thresholds for grammar/spelling errors
+        # Set ratio to modify thresholds for grammar/spelling errors
         modifier_ratio = 1.05
-        #GBW TODO: This might be wrong.
-        #Calc number of grammar and spelling errors per character
+        # GBW TODO: This might be wrong.
+        # Calc number of grammar and spelling errors per character
         set_grammar, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
         set_grammar_per_character = [
             set_grammar[m] / float(
@@ -316,7 +316,7 @@ class FeatureExtractor(object):
             )
         ]
-        #Iterate through essays and create a feedback dictionary for each
+        # Iterate through essays and create a feedback dictionary for each
         all_feedback = []
         for m in xrange(0, len(essay_set._text)):
             #Be very careful about changing these messages!
@@ -329,8 +329,8 @@ class FeatureExtractor(object):
             }
             markup_tokens = essay_set._markup_text[m].split(" ")
-            #This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
-            #disjointed
+            # This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
+            # disjointed
             bad_pos_starts = [z[0] for z in bad_pos_positions[m]]
             bad_pos_ends = [z[1] - 1 for z in bad_pos_positions[m]]
             for z in xrange(0, len(markup_tokens)):
@@ -342,14 +342,14 @@ class FeatureExtractor(object):
             if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
                 markup_tokens[len(markup_tokens) - 1] += "</bg>"
-            #Display messages if grammar/spelling errors greater than average in training set
+            # Display messages if grammar/spelling errors greater than average in training set
             if set_grammar_per_character[m] > (self._grammar_errors_per_character * modifier_ratio):
                 individual_feedback['grammar'] = "Grammar: More grammar errors than average."
             if set_spell_errors_per_character[m] > (self._spell_errors_per_character * modifier_ratio):
                 individual_feedback['spelling'] = "Spelling: More spelling errors than average."
-            #Test topicality by calculating # of on topic words per character and comparing to the training set
-            #mean. Requires features to be passed in
+            # Test topicality by calculating # of on topic words per character and comparing to the training set
+            # mean. Requires features to be passed in
             if features is not None:
                 f_row_sum = numpy.sum(features[m, 12:])
                 f_row_prop = f_row_sum / len(essay_set._text[m])
@@ -361,7 +361,7 @@ class FeatureExtractor(object):
                 individual_feedback['too_similar_to_prompt'] = True
                 log.debug(features[m, 9])
-            #Create string representation of markup text
+            # Create string representation of markup text
             markup_string = " ".join(markup_tokens)
             individual_feedback['markup_text'] = markup_string
             all_feedback.append(individual_feedback)
...
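For context on the hunks above: a feedback message fires only when an essay's per-character error rate exceeds the training-set mean by more than the 5% tolerance that modifier_ratio encodes. A minimal sketch of that comparison, with hypothetical rates:

    modifier_ratio = 1.05                # 5% tolerance above the training-set mean
    train_grammar_per_char = 0.0040      # hypothetical mean grammar errors per character in training
    essay_grammar_per_char = 0.0045      # hypothetical rate for the essay being scored

    # Mirrors the threshold check in the diff: flag only above mean * ratio
    if essay_grammar_per_char > train_grammar_per_char * modifier_ratio:
        feedback = "Grammar: More grammar errors than average."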
@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
 """
 import sys
-import pickle
 import os
 import numpy
 import logging
@@ -14,14 +13,11 @@ sys.path.append(base_path)
 #Depend on base path to be imported
 from essay_set import EssaySet
-import predictor_extractor
 import predictor_set
 import util_functions
 from errors import *
 #Imports needed to unpickle grader data
-import feature_extractor
-import sklearn.ensemble
 import math
 log = logging.getLogger(__name__)
...
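One note on the import hunk above: the comment "#Imports needed to unpickle grader data" survives while two of the imports under it are removed. pickle stores only a module path and class name and re-imports that module at load time, so the module defining a pickled class (e.g. feature_extractor) must still be importable wherever the grader data is actually unpickled. A minimal sketch of the failure mode this guarded against, with a hypothetical file name:

    import pickle

    with open("grader_data.p", "rb") as f:  # hypothetical pickled grader model
        model = pickle.load(f)  # raises ImportError if the module that defined
                                # the pickled class is no longer importable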
@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
         algorithm = util_functions.AlgorithmTypes.regression
-    f = predictor_extractor.PredictorExtractor()
-    f.initialize_dictionaries(predictor_set)
+    f = predictor_extractor.PredictorExtractor(predictor_set)
-    train_feats = f.gen_feats(predictor_set)
+    train_feats = f.generate_features(predictor_set)
     clf, clf2 = get_algorithms(algorithm)
     cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
...
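The call-site change above follows from PredictorExtractor now training itself at construction time (see the class diff below). Side by side, the old and new usage from this hunk:

    # Old API: construct empty, initialize dictionaries, then gen_feats
    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)
    train_feats = f.gen_feats(predictor_set)

    # New API: the constructor trains the extractor; generate_features extracts
    f = predictor_extractor.PredictorExtractor(predictor_set)
    train_feats = f.generate_features(predictor_set)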
@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
 class PredictorExtractor(object):
-    def __init__(self):
-        self._extractors = []
-        self._initialized = False
-
-    def initialize_dictionaries(self, p_set):
-        """
-        Initialize dictionaries with the textual inputs in the PredictorSet object
-        p_set - PredictorSet object that has had data fed in
-        """
-        success = False
-        if not (hasattr(p_set, '_type')):
-            error_message = "needs to be an essay set of the train type."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        if not (p_set._type == "train"):
-            error_message = "needs to be an essay set of the train type."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        div_length = len(p_set._essay_sets)
-        if div_length == 0:
-            div_length = 1
-        #Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200 / div_length))
-        for i in xrange(0, len(p_set._essay_sets)):
-            self._extractors.append(FeatureExtractor())
-            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
-            self._initialized = True
-            success = True
-        return success
-
-    def gen_feats(self, p_set):
-        """
-        Generates features based on an iput p_set
-        p_set - PredictorSet
-        """
-        if self._initialized != True:
-            error_message = "Dictionaries have not been initialized."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        textual_features = []
-        for i in xrange(0, len(p_set._essay_sets)):
-            textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
-        textual_matrix = numpy.concatenate(textual_features, axis=1)
-        predictor_matrix = numpy.array(p_set._numeric_features)
-        print textual_matrix.shape
-        print predictor_matrix.shape
-        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
+    """
+    Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
+    methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
+    """
+
+    def __init__(self, predictor_set):
+        """
+        Initializes dictionaries with the textual inputs in the PredictorSet object.
+
+        Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
+
+        Args:
+            predictor_set (PredictorSet): PredictorSet object that has had data fed to it
+        """
+        if not (hasattr(predictor_set, '_type')):
+            error_message = "needs to be an essay set of the train type."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        if not (predictor_set._type == "train"):
+            error_message = "needs to be an essay set of the train type."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        div_length = len(predictor_set._essay_sets)
+        if div_length == 0:
+            div_length = 1
+        self._extractors = []
+        # Ensures that even with a large amount of input textual features, training time will stay reasonable
+        max_features_pass_2 = int(math.floor(200 / div_length))
+        for i in xrange(0, len(predictor_set._essay_sets)):
+            self._extractors.append(FeatureExtractor(predictor_set._essay_sets[i]))
+        self._initialized = True
+
+    def generate_features(self, predictor_set):
+        """
+        Generates features given a predictor set containing the essays/data we want to extract from.
+
+        Args:
+            predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
+
+        Returns:
+            an array of features
+        """
+        if self._initialized != True:
+            error_message = "Dictionaries have not been initialized."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        textual_features = []
+        # Generates features by using the generate_features method from the essay set class
+        for i in xrange(0, len(predictor_set._essay_sets)):
+            textual_features.append(
+                self._extractors[i].generate_features(predictor_set._essay_sets[i])
+            )
+        textual_matrix = numpy.concatenate(textual_features, axis=1)
+        predictor_matrix = numpy.array(predictor_set._numeric_features)
+        # Originally there were two calls here to print the shape of the feature matrices. GBW didn't think this was
+        # appropriate, and deleted them.
+        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
...
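As a note on the final assembly in generate_features: each per-essay-set feature block is concatenated column-wise, then the numeric predictors are appended, so every block must have one row per data point. A minimal sketch with hypothetical shapes:

    import numpy

    textual_a = numpy.zeros((10, 200))   # features from essay set 0 (hypothetical shape)
    textual_b = numpy.zeros((10, 200))   # features from essay set 1
    numeric = numpy.zeros((10, 3))       # predictor_set._numeric_features as an array

    textual_matrix = numpy.concatenate([textual_a, textual_b], axis=1)     # shape (10, 400)
    overall_matrix = numpy.concatenate((textual_matrix, numeric), axis=1)  # shape (10, 403)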