Commit 2a2972f8 by gradyward

Working on feature_extractor.py, doing a lot of refactoring.

Committed not because of solid progress but because of how much has changed.
parent e1be348a
...@@ -32,6 +32,7 @@ class EssaySet(object): ...@@ -32,6 +32,7 @@ class EssaySet(object):
Essays in an essay set can be assumed to have these properties. Essays in an essay set can be assumed to have these properties.
""" """
def __init__(self, essay_type="train"): def __init__(self, essay_type="train"):
""" """
Initialize variables and check essay set type Initialize variables and check essay set type
......
...@@ -71,7 +71,7 @@ def grade(grader_data, submission): ...@@ -71,7 +71,7 @@ def grade(grader_data, submission):
# Tries to extract features from submission and assign score via the model # Tries to extract features from submission and assign score via the model
grader_features = None grader_features = None
try: try:
grader_features = extractor.gen_feats(grader_set) grader_features = extractor.generate_features(grader_set)
feedback = extractor.gen_feedback(grader_set, grader_features)[0] feedback = extractor.gen_feedback(grader_set, grader_features)[0]
results['score'] = int(model.predict(grader_features)[0]) results['score'] = int(model.predict(grader_features)[0])
except: except:
...@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features): ...@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features):
# Try to extract features from submission and assign score via the model # Try to extract features from submission and assign score via the model
try: try:
grader_feats = extractor.gen_feats(grader_set) grader_feats = extractor.generate_features(grader_set)
results['score'] = model.predict(grader_feats)[0] results['score'] = model.predict(grader_feats)[0]
except: except:
error_msg = "Could not extract features and score essay." error_msg = "Could not extract features and score essay."
......
...@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit ...@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
f = feature_extractor.FeatureExtractor() f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays) f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays) train_feats = f.generate_features(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int) set_score = numpy.asarray(essays._score, dtype=numpy.int)
algorithm = create.select_algorithm(set_score) algorithm = create.select_algorithm(set_score)
......
...@@ -57,7 +57,7 @@ class PredictorExtractor(object): ...@@ -57,7 +57,7 @@ class PredictorExtractor(object):
max_feats2 = int(math.floor(200 / div_length)) max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)): for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor()) self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2) self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
self._initialized = True self._initialized = True
success = True success = True
return success return success
...@@ -74,7 +74,7 @@ class PredictorExtractor(object): ...@@ -74,7 +74,7 @@ class PredictorExtractor(object):
textual_features = [] textual_features = []
for i in xrange(0, len(p_set._essay_sets)): for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i])) textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1) textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features) predictor_matrix = numpy.array(p_set._numeric_features)
......
...@@ -198,19 +198,29 @@ def regenerate_good_tokens(string): ...@@ -198,19 +198,29 @@ def regenerate_good_tokens(string):
return sel_pos_ngrams return sel_pos_ngrams
def get_vocab(text, score, max_feats=750, max_feats2=200): def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
""" """
Uses a fisher test to find words that are significant in that they separate Uses a fisher test to find words that are significant in that they separate
high scoring essays from low scoring essays. high scoring essays from low scoring essays.
text is a list of input essays.
score is a list of scores, with score[n] corresponding to text[n] Args:
max_feats is the maximum number of features to consider in the first pass essays (list of str): a list of input essays
max_feats2 is the maximum number of features to consider in the second (final) pass scores (list of int): a list of associated input scores
Returns a list of words that constitute the significant vocabulary
""" Kwargs:
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats) max_features_pass_1 (int): the maximum number of features to consider in the first pass of the essays
dict_mat = dict.fit_transform(text) max_features_pass_2 (int): the maximum number of features to consider in the second pass of the essays
set_score = numpy.asarray(score, dtype=numpy.int) (DEFAULTS of 750 and 200 respectively)
Returns:
(list of str): A list of strings which constitute the significant vocabulary which differentiates between
strong and weak essays.
NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
"""
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
dict_matrix = dict.fit_transform(essays)
set_score = numpy.asarray(scores, dtype=numpy.int)
med_score = numpy.median(set_score) med_score = numpy.median(set_score)
new_score = set_score new_score = set_score
if (med_score == 0): if (med_score == 0):
...@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200): ...@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
new_score[set_score >= med_score] = 1 new_score[set_score >= med_score] = 1
fish_vals = [] fish_vals = []
for col_num in range(0, dict_mat.shape[1]): for col_num in range(0, dict_matrix.shape[1]):
loop_vec = dict_mat.getcol(col_num).toarray() loop_vec = dict_matrix.getcol(col_num).toarray()
good_loop_vec = loop_vec[new_score == 1] good_loop_vec = loop_vec[new_score == 1]
bad_loop_vec = loop_vec[new_score == 0] bad_loop_vec = loop_vec[new_score == 0]
good_loop_present = len(good_loop_vec[good_loop_vec > 0]) good_loop_present = len(good_loop_vec[good_loop_vec > 0])
...@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200): ...@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals.append(fish_val) fish_vals.append(fish_val)
cutoff = 1 cutoff = 1
if (len(fish_vals) > max_feats2): if (len(fish_vals) > max_features_pass_2):
cutoff = sorted(fish_vals)[max_feats2] cutoff = sorted(fish_vals)[max_features_pass_2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff]) good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
getVar = lambda searchList, ind: [searchList[i] for i in ind] getVar = lambda searchList, ind: [searchList[i] for i in ind]
vocab = getVar(dict.get_feature_names(), good_cols) vocab = getVar(dict.get_feature_names(), good_cols)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment