Commit b32d5674 by gradyward

Changed the predictor_extractor.py file, adding clarity and integrating previous changes.

parent 8bc96cb8
......@@ -299,10 +299,10 @@ class FeatureExtractor(object):
"""
#TODO This is still bad.
#Set ratio to modify thresholds for grammar/spelling errors
# Set ratio to modify thresholds for grammar/spelling errors
modifier_ratio = 1.05
#GBW TODO: This might be wrong.
#Calc number of grammar and spelling errors per character
# GBW TODO: This might be wrong.
# Calc number of grammar and spelling errors per character
set_grammar, bad_pos_positions = self._get_grammar_errors(essay_set._pos, essay_set._text, essay_set._tokens)
set_grammar_per_character = [
set_grammar[m] / float(
......@@ -316,7 +316,7 @@ class FeatureExtractor(object):
)
]
#Iterate through essays and create a feedback dictionary for each
# Iterate through essays and create a feedback dictionary for each
all_feedback = []
for m in xrange(0, len(essay_set._text)):
#Be very careful about changing these messages!
......@@ -329,8 +329,8 @@ class FeatureExtractor(object):
}
markup_tokens = essay_set._markup_text[m].split(" ")
#This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#disjointed
# This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
# disjointed
bad_pos_starts = [z[0] for z in bad_pos_positions[m]]
bad_pos_ends = [z[1] - 1 for z in bad_pos_positions[m]]
for z in xrange(0, len(markup_tokens)):
......@@ -342,14 +342,14 @@ class FeatureExtractor(object):
if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
markup_tokens[len(markup_tokens) - 1] += "</bg>"
#Display messages if grammar/spelling errors greater than average in training set
# Display messages if grammar/spelling errors greater than average in training set
if set_grammar_per_character[m] > (self._grammar_errors_per_character * modifier_ratio):
individual_feedback['grammar'] = "Grammar: More grammar errors than average."
if set_spell_errors_per_character[m] > (self._spell_errors_per_character * modifier_ratio):
individual_feedback['spelling'] = "Spelling: More spelling errors than average."
#Test topicality by calculating # of on topic words per character and comparing to the training set
#mean. Requires features to be passed in
# Test topicality by calculating # of on topic words per character and comparing to the training set
# mean. Requires features to be passed in
if features is not None:
f_row_sum = numpy.sum(features[m, 12:])
f_row_prop = f_row_sum / len(essay_set._text[m])
......@@ -361,7 +361,7 @@ class FeatureExtractor(object):
individual_feedback['too_similar_to_prompt'] = True
log.debug(features[m, 9])
#Create string representation of markup text
# Create string representation of markup text
markup_string = " ".join(markup_tokens)
individual_feedback['markup_text'] = markup_string
all_feedback.append(individual_feedback)
......
......@@ -3,7 +3,6 @@ Functions to score specified data using specified ML models
"""
import sys
import pickle
import os
import numpy
import logging
......@@ -14,14 +13,11 @@ sys.path.append(base_path)
#Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
from errors import *
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
......
......@@ -142,10 +142,9 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
f = predictor_extractor.PredictorExtractor(predictor_set)
train_feats = f.gen_feats(predictor_set)
train_feats = f.generate_features(predictor_set)
clf, clf2 = get_algorithms(algorithm)
cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
......
......@@ -29,58 +29,69 @@ log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
self._initialized = False
"""
Provides an interface for extracting features from a predictor set (as opposed to an essay set), and uses the
methods of the essay set feature extractor in order to maintain cohesion between the two different methods.
"""
def initialize_dictionaries(self, p_set):
def __init__(self, predictor_set):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
Initializes dictionaries with the textual inputs in the PredictorSet object
Uses a predictor_set in the definition of the PredictorExtractor to train the extractor.
Args:
predictor_set (PredictorSet): PredictorSet object that has had data fed to it
"""
success = False
if not (hasattr(p_set, '_type')):
if not (hasattr(predictor_set, '_type')):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
raise util_functions.InputError(predictor_set, error_message)
if not (p_set._type == "train"):
if not (predictor_set._type == "train"):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
raise util_functions.InputError(predictor_set, error_message)
div_length = len(p_set._essay_sets)
div_length = len(predictor_set._essay_sets)
if div_length == 0:
div_length = 1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
self._extractors = []
# Ensures that even with a large amount of input textual features, training time will stay reasonable
max_features_pass_2 = int(math.floor(200 / div_length))
for i in xrange(0, len(predictor_set._essay_sets)):
self._extractors.append(FeatureExtractor(predictor_set._essay_sets[i]))
self._initialized = True
success = True
return success
def gen_feats(self, p_set):
def generate_features(self, predictor_set):
"""
Generates features based on an input p_set
p_set - PredictorSet
Generates features given a predictor set containing the essays/data we want to extract from
Args:
predictor_set (PredictorSet): the wrapper which contains the prediction data we want to extract from
Returns:
an array of features
"""
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
raise util_functions.InputError(predictor_set, error_message)
textual_features = []
for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
# Generates features by using the generate_features method from the essay set class
for i in xrange(0, len(predictor_set._essay_sets)):
textual_features.append(
self._extractors[i].generate_features(predictor_set._essay_sets[i])
)
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features)
predictor_matrix = numpy.array(predictor_set._numeric_features)
print textual_matrix.shape
print predictor_matrix.shape
# Originally there were two calls here to print the shape of the feature matrices. GBW didn't think this was
# appropriate, and deleted them.
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment