Commit b32d5674 by gradyward

Changed the predictor_extractor.py file, adding clarity and integrating previous changes.

parent 8bc96cb8
@@ -299,10 +299,10 @@ class FeatureExtractor(object):
         """
         #TODO This is still bad.
-        #Set ratio to modify thresholds for grammar/spelling errors
+        # Set ratio to modify thresholds for grammar/spelling errors
         modifier_ratio = 1.05
-        #GBW TODO: This might be wrong.
-        #Calc number of grammar and spelling errors per character
+        # GBW TODO: This might be wrong.
+        # Calc number of grammar and spelling errors per character
         set_grammar, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
         set_grammar_per_character = [
             set_grammar[m] / float(
@@ -316,7 +316,7 @@ class FeatureExtractor(object):
             )
         ]
-        #Iterate through essays and create a feedback dictionary for each
+        # Iterate through essays and create a feedback dictionary for each
         all_feedback = []
         for m in xrange(0, len(essay_set._text)):
             #Be very careful about changing these messages!
@@ -329,8 +329,8 @@ class FeatureExtractor(object):
             }
             markup_tokens = essay_set._markup_text[m].split(" ")
-            #This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
-            #disjointed
+            # This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
+            # disjointed
             bad_pos_starts = [z[0] for z in bad_pos_positions[m]]
             bad_pos_ends = [z[1] - 1 for z in bad_pos_positions[m]]
             for z in xrange(0, len(markup_tokens)):
@@ -342,14 +342,14 @@ class FeatureExtractor(object):
             if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
                 markup_tokens[len(markup_tokens) - 1] += "</bg>"
-            #Display messages if grammar/spelling errors greater than average in training set
+            # Display messages if grammar/spelling errors greater than average in training set
             if set_grammar_per_character[m] > (self._grammar_errors_per_character * modifier_ratio):
                 individual_feedback['grammar'] = "Grammar: More grammar errors than average."
             if set_spell_errors_per_character[m] > (self._spell_errors_per_character * modifier_ratio):
                 individual_feedback['spelling'] = "Spelling: More spelling errors than average."
-            #Test topicality by calculating # of on topic words per character and comparing to the training set
-            #mean. Requires features to be passed in
+            # Test topicality by calculating # of on topic words per character and comparing to the training set
+            # mean. Requires features to be passed in
             if features is not None:
                 f_row_sum = numpy.sum(features[m, 12:])
                 f_row_prop = f_row_sum / len(essay_set._text[m])
@@ -361,7 +361,7 @@ class FeatureExtractor(object):
                 individual_feedback['too_similar_to_prompt'] = True
                 log.debug(features[m, 9])
-            #Create string representation of markup text
+            # Create string representation of markup text
             markup_string = " ".join(markup_tokens)
             individual_feedback['markup_text'] = markup_string
             all_feedback.append(individual_feedback)
...
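For context on the hunks above: a feedback message fires only when an essay's per-character error rate exceeds the training-set mean by more than the 5% tolerance that modifier_ratio encodes. A minimal sketch of that comparison, with hypothetical rates:

    modifier_ratio = 1.05                # 5% tolerance above the training-set mean
    train_grammar_per_char = 0.0040      # hypothetical mean grammar errors per character in training
    essay_grammar_per_char = 0.0045      # hypothetical rate for the essay being scored

    # Mirrors the threshold check in the diff: flag only above mean * ratio
    if essay_grammar_per_char > train_grammar_per_char * modifier_ratio:
        feedback = "Grammar: More grammar errors than average."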
@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
 """
 import sys
-import pickle
 import os
 import numpy
 import logging
@@ -14,14 +13,11 @@ sys.path.append(base_path)
 #Depend on base path to be imported
 from essay_set import EssaySet
-import predictor_extractor
 import predictor_set
 import util_functions
 from errors import *
 #Imports needed to unpickle grader data
-import feature_extractor
-import sklearn.ensemble
 import math
 log = logging.getLogger(__name__)
...
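One note on the import hunk above: the comment "#Imports needed to unpickle grader data" survives while two of the imports under it are removed. pickle stores only a module path and class name and re-imports that module at load time, so the module defining a pickled class (e.g. feature_extractor) must still be importable wherever the grader data is actually unpickled. A minimal sketch of the failure mode this guarded against, with a hypothetical file name:

    import pickle

    with open("grader_data.p", "rb") as f:  # hypothetical pickled grader model
        model = pickle.load(f)  # raises ImportError if the module that defined
                                # the pickled class is no longer importable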
@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
         algorithm = util_functions.AlgorithmTypes.regression
-    f = predictor_extractor.PredictorExtractor()
-    f.initialize_dictionaries(predictor_set)
+    f = predictor_extractor.PredictorExtractor(predictor_set)
-    train_feats = f.gen_feats(predictor_set)
+    train_feats = f.generate_features(predictor_set)
     clf, clf2 = get_algorithms(algorithm)
     cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
...
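The call-site change above follows from PredictorExtractor now training itself at construction time (see the class diff below). Side by side, the old and new usage from this hunk:

    # Old API: construct empty, initialize dictionaries, then gen_feats
    f = predictor_extractor.PredictorExtractor()
    f.initialize_dictionaries(predictor_set)
    train_feats = f.gen_feats(predictor_set)

    # New API: the constructor trains the extractor; generate_features extracts
    f = predictor_extractor.PredictorExtractor(predictor_set)
    train_feats = f.generate_features(predictor_set)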
@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
 class PredictorExtractor(object):
-    def __init__(self):
-        self._extractors = []
-        self._initialized = False
-
-    def initialize_dictionaries(self, p_set):
-        """
-        Initialize dictionaries with the textual inputs in the PredictorSet object
-        p_set - PredictorSet object that has had data fed in
-        """
-        success = False
-        if not (hasattr(p_set, '_type')):
-            error_message = "needs to be an essay set of the train type."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        if not (p_set._type == "train"):
-            error_message = "needs to be an essay set of the train type."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        div_length = len(p_set._essay_sets)
-        if div_length == 0:
-            div_length = 1
-        #Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200 / div_length))
-        for i in xrange(0, len(p_set._essay_sets)):
-            self._extractors.append(FeatureExtractor())
-            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
-            self._initialized = True
-            success = True
-        return success
-
-    def gen_feats(self, p_set):
-        """
-        Generates features based on an iput p_set
-        p_set - PredictorSet
-        """
-        if self._initialized != True:
-            error_message = "Dictionaries have not been initialized."
-            log.exception(error_message)
-            raise util_functions.InputError(p_set, error_message)
-        textual_features = []
-        for i in xrange(0, len(p_set._essay_sets)):
-            textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
-        textual_matrix = numpy.concatenate(textual_features, axis=1)
-        predictor_matrix = numpy.array(p_set._numeric_features)
-        print textual_matrix.shape
-        print predictor_matrix.shape
-        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
+    """
+    Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
+    methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
+    """
+
+    def __init__(self, predictor_set):
+        """
+        Initializes dictionaries with the textual inputs in the PredictorSet object.
+
+        Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
+
+        Args:
+            predictor_set (PredictorSet): PredictorSet object that has had data fed to it
+        """
+        if not (hasattr(predictor_set, '_type')):
+            error_message = "needs to be an essay set of the train type."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        if not (predictor_set._type == "train"):
+            error_message = "needs to be an essay set of the train type."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        div_length = len(predictor_set._essay_sets)
+        if div_length == 0:
+            div_length = 1
+        self._extractors = []
+        # Ensures that even with a large amount of input textual features, training time will stay reasonable
+        max_features_pass_2 = int(math.floor(200 / div_length))
+        for i in xrange(0, len(predictor_set._essay_sets)):
+            self._extractors.append(FeatureExtractor(predictor_set._essay_sets[i]))
+        self._initialized = True
+
+    def generate_features(self, predictor_set):
+        """
+        Generates features given a predictor set containing the essays/data we want to extract from.
+
+        Args:
+            predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
+
+        Returns:
+            an array of features
+        """
+        if self._initialized != True:
+            error_message = "Dictionaries have not been initialized."
+            log.exception(error_message)
+            raise util_functions.InputError(predictor_set, error_message)
+        textual_features = []
+        # Generates features by using the generate_features method from the essay set class
+        for i in xrange(0, len(predictor_set._essay_sets)):
+            textual_features.append(
+                self._extractors[i].generate_features(predictor_set._essay_sets[i])
+            )
+        textual_matrix = numpy.concatenate(textual_features, axis=1)
+        predictor_matrix = numpy.array(predictor_set._numeric_features)
+        # Originally there were two calls here to print the shape of the feature matrices. GBW didn't think this was
+        # appropriate, and deleted them.
+        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
...
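As a note on the final assembly in generate_features: each per-essay-set feature block is concatenated column-wise, then the numeric predictors are appended, so every block must have one row per data point. A minimal sketch with hypothetical shapes:

    import numpy

    textual_a = numpy.zeros((10, 200))   # features from essay set 0 (hypothetical shape)
    textual_b = numpy.zeros((10, 200))   # features from essay set 1
    numeric = numpy.zeros((10, 3))       # predictor_set._numeric_features as an array

    textual_matrix = numpy.concatenate([textual_a, textual_b], axis=1)     # shape (10, 400)
    overall_matrix = numpy.concatenate((textual_matrix, numeric), axis=1)  # shape (10, 403)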