Commit 2a2972f8 by gradyward

Working on feature_extractor.py, doing a lot of refactoring.

Committed not because of solid progress but because of how much has changed.
parent e1be348a
...
@@ -32,6 +32,7 @@ class EssaySet(object):
     Essays in an essay set can be assumed to have these properties.
     """

     def __init__(self, essay_type="train"):
         """
         Initialize variables and check essay set type
...
...
@@ -30,85 +30,123 @@ ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
 class FeatureExtractor(object):
-    def __init__(self):
-        self._good_pos_ngrams = self.get_good_pos_ngrams()
-        self.dict_initialized = False
-        self._spell_errors_per_character = 0
-        self._grammar_errors_per_character = 0
-
-    def initialize_dictionaries(self, e_set, max_feats2=200):
-        """
-        Initializes dictionaries from an essay set object
-        Dictionaries must be initialized prior to using this to extract features
-        e_set is an input essay set
-        returns a confirmation of initialization
-        """
-        if (hasattr(e_set, '_type')):
-            if (e_set._type == "train"):
-                #normal text (unstemmed) useful words/bigrams
-                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2=max_feats2)
-                #stemmed and spell corrected vocab useful words/ngrams
-                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
-                #dictionary trained on proper vocab
-                self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=nvocab)
-                #dictionary trained on proper vocab
-                self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=svocab)
-                self.dict_initialized = True
-                #Average spelling errors in set. needed later for spelling detection
-                self._mean_spelling_errors = sum(e_set._spelling_errors) / float(len(e_set._spelling_errors))
-                self._spell_errors_per_character = sum(e_set._spelling_errors) / float(
-                    sum([len(t) for t in e_set._text]))
-                #Gets the number and positions of grammar errors
-                good_pos_tags, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
-                self._grammar_errors_per_character = (sum(good_pos_tags) / float(sum([len(t) for t in e_set._text])))
-                #Generate bag of words features
-                bag_feats = self.gen_bag_feats(e_set)
-                #Sum of a row of bag of words features (topical words in an essay)
-                f_row_sum = numpy.sum(bag_feats[:, :])
-                #Average index of how "topical" essays are
-                self._mean_f_prop = f_row_sum / float(sum([len(t) for t in e_set._text]))
-                ret = "ok"
-            else:
-                raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
-        else:
-            raise util_functions.InputError(e_set, "wrong input. need an essay set object")
-        return ret
+    """
+    A feature extractor that uses NLTK and simple statistics to turn an essay set
+    into the numerical features needed for essay classification.
+    """
+
+    def __init__(self, essay_set, max_features_pass_2=200):
+        """
+        Initializes the dictionaries and statistics required before feature extraction can occur.
+
+        This was originally split between __init__ and an initialize_dictionaries method, but the two
+        were never called separately, so they have been combined.
+
+        Args:
+            essay_set: the input set of essays that the feature extractor is built from
+        Kwargs:
+            max_features_pass_2: the maximum number of features to consider on the second pass of vocabulary grooming
+        """
+        if hasattr(essay_set, '_type'):
+            if essay_set._type == "train":
+                # Finds vocabulary which differentiates good/high scoring essays from bad/low scoring essays.
+                normal_vocab = util_functions.get_vocab(
+                    essay_set._cleaned_spelled_essays, essay_set._scores, max_features_pass_2=max_features_pass_2
+                )
+                # Finds vocabulary by the same criteria, but from essays that have been Porter stemmed.
+                stemmed_vocab = util_functions.get_vocab(
+                    essay_set._clean_stem_text, essay_set._scores, max_features_pass_2=max_features_pass_2
+                )
+                # Constructs dictionaries trained on the important vocabularies.
+                self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=normal_vocab)
+                self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=stemmed_vocab)
+                # Sets the flag to show that this instance is now ready for training.
+                self.dict_initialized = True
+                # Average number of spelling errors in the set; needed later for spelling detection.
+                spelling_errors = essay_set._spelling_errors
+                self._mean_spelling_errors = sum(spelling_errors) / float(len(spelling_errors))
+                self._spell_errors_per_character = sum(spelling_errors) / float(
+                    sum([len(essay) for essay in essay_set._cleaned_essays]))
+                # Gets the number and positions of grammar errors.
+                good_pos_tags, bad_pos_positions = self._get_grammar_errors(
+                    essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens
+                )
+                # NOTE: this changes the definition from using good grammar ratios to using the count of
+                # grammatical errors. That is not what the original author used, but it is what the code
+                # implies: a true "grammar errors per character" should use the exact error count.
+                # The replaced call is included for posterity:
+                # self._grammar_errors_per_character =
+                #     (sum(good_pos_tags) / float(sum([len(t) for t in essay_set._text])))
+                total_grammar_errors = sum(len(l) for l in bad_pos_positions)
+                total_characters = float(sum([len(t) for t in essay_set._text]))
+                self._grammar_errors_per_character = total_grammar_errors / total_characters
+                # Generates the bag-of-vocabulary features.
+                vocabulary_features = self.gen_vocabulary_features(essay_set)
+                # Sum of a row of bag of words features (topical words in an essay).
+                feature_row_sum = numpy.sum(vocabulary_features[:, :])
+                # Average index of how "topical" the essays are.
+                self._mean_f_prop = feature_row_sum / float(sum([len(t) for t in essay_set._text]))
+            else:
+                raise util_functions.InputError(essay_set, "needs to be an essay set of the train type.")
+        else:
+            raise util_functions.InputError(essay_set, "wrong input. need an essay set object.")
+        self._good_pos_ngrams = self.get_good_pos_ngrams()
+        self.dict_initialized = False
+        self._spell_errors_per_character = 0
+        self._grammar_errors_per_character = 0
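With the constructor taking over dictionary initialization, a training extractor is built in a single step. A minimal usage sketch follows; the import paths and the way the EssaySet is populated are assumptions made for illustration, not part of this commit:

# Hypothetical usage sketch: assumes a "train"-type EssaySet populated elsewhere.
from ease.essay_set import EssaySet
from ease.feature_extractor import FeatureExtractor

training_set = EssaySet(essay_type="train")
# ... add scored essays to training_set here ...

# The refactored constructor derives the vocabularies plus spelling/grammar statistics itself.
extractor = FeatureExtractor(training_set, max_features_pass_2=200)
train_features = extractor.generate_features(training_set)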
     def get_good_pos_ngrams(self):
         """
-        Gets a list of gramatically correct part of speech sequences from an input file called essaycorpus.txt
+        Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
+        Returns:
+            A list of grammatically correct part of speech sequences.
         """
         if (os.path.isfile(NGRAM_PATH)):
             good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
         elif os.path.isfile(ESSAY_CORPUS_PATH):
             essay_corpus = open(ESSAY_CORPUS_PATH).read()
             essay_corpus = util_functions.sub_chars(essay_corpus)
             good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
             pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
         else:
-            #Hard coded list in case the needed files cannot be found
+            # Hard-coded (incomplete) list in case the needed files cannot be found
             good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
                                '. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
                                'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
                                'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP',
                                'TO NNP', 'TO NNP NNP']
         return good_pos_ngrams
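The method above is a cache-or-regenerate pattern: load the pickled n-gram list if it exists, otherwise rebuild it from the essay corpus and pickle it for next time. Stripped of the project specifics, the pattern looks roughly like this (a generic sketch, not the project's API; the build function stands in for the corpus regeneration step):

import os
import pickle

def load_or_build(cache_path, build_fn):
    # Return the cached object if present; otherwise build it and cache it for next time.
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as cached:
            return pickle.load(cached)
    result = build_fn()
    with open(cache_path, 'wb') as out:
        pickle.dump(result, out)
    return result

# e.g. good_pos_ngrams = load_or_build(NGRAM_PATH, lambda: rebuild_from_corpus())  # rebuild_from_corpus is hypothetical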
-    def _get_grammar_errors(self, pos, text, tokens):
+    def _get_grammar_errors(self, pos, essays, tokens):
         """
         Internal function to get the number of grammar errors in given text
-        pos - part of speech tagged text (list)
-        text - normal text (list)
-        tokens - list of lists of tokenized text
+        Args:
+            pos: list of POS-tagged essays, one per essay in the set
+            essays: list of essay texts
+            tokens: list of the token lists for each essay
+        Returns:
+            A tuple of the form (good_grammar_ratios, bad_pos_positions).
+            The former is a list of each essay's "good grammar ratio" (a loosely defined heuristic);
+            the latter is, per essay, a list of the token positions of likely grammatical mistakes.
         """
         word_counts = [max(len(t), 1) for t in tokens]
-        good_pos_tags = []
+        good_grammar_ratios = []
         min_pos_seq = 2
         max_pos_seq = 4
         bad_pos_positions = []
-        for i in xrange(0, len(text)):
+        for i in xrange(0, len(essays)):
             pos_seq = [tag[1] for tag in pos[i]]
             pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
             long_pos_ngrams = [z for z in pos_ngrams if z.count(' ') == (max_pos_seq - 1)]
...
@@ -134,8 +172,8 @@ class FeatureExtractor(object):
             if divisor == 0:
                 divisor = 1
             good_grammar_ratio = (len(pos_ngrams) - len(overlap_ngrams)) / divisor
-            good_pos_tags.append(good_grammar_ratio)
-        return good_pos_tags, bad_pos_positions
+            good_grammar_ratios.append(good_grammar_ratio)
+        return good_grammar_ratios, bad_pos_positions
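The grammar heuristic, in short: each essay's POS tag sequence is cut into n-grams of 2 to 4 tags, and any n-gram that never occurs in the known-good list is treated as a likely error; the share of remaining n-grams gives the "good grammar ratio". A simplified sketch of that overlap idea (illustrative only, not the project's exact implementation):

def count_suspect_pos_ngrams(pos_tags, good_pos_ngrams, min_len=2, max_len=4):
    # Build every POS n-gram between min_len and max_len tags long.
    sequences = []
    for n in range(min_len, max_len + 1):
        for start in range(len(pos_tags) - n + 1):
            sequences.append(" ".join(pos_tags[start:start + n]))
    # Anything absent from the known-good list counts as a suspected grammar error.
    suspects = [seq for seq in sequences if seq not in good_pos_ngrams]
    return len(suspects), len(sequences)

# Example: count_suspect_pos_ngrams(["NNP", "NNP", ".", "TO", "NNP"], set(["NNP NNP", ". TO NNP"]))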
     def gen_length_feats(self, e_set):
         """
...
@@ -161,34 +199,46 @@ class FeatureExtractor(object):
         return length_arr.copy()
-    def gen_bag_feats(self, e_set):
+    def gen_vocabulary_features(self, essay_set):
         """
-        Generates bag of words features from an input essay set and trained FeatureExtractor
-        Generally called by gen_feats
-        Returns an array of features
-        e_set - EssaySet object
+        Generates bag of words features from an essay set, using the dictionaries trained in __init__.
+        Args:
+            essay_set: the EssaySet object to generate the bag of words features from
+        Returns:
+            An array of vocabulary features, one row per essay
         """
-        if (hasattr(self, '_stem_dict')):
-            sfeats = self._stem_dict.transform(e_set._clean_stem_text)
-            nfeats = self._normal_dict.transform(e_set._text)
-            bag_feats = numpy.concatenate((sfeats.toarray(), nfeats.toarray()), axis=1)
-        else:
-            raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")
-        return bag_feats.copy()
+        # Calculates the stemmed and normal vocabulary counts.
+        stem_features = self._stem_dict.transform(essay_set._cleaned_stem_essays)
+        normal_features = self._normal_dict.transform(essay_set._cleaned_essays)
+        # Concatenates them and returns.
+        bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
+        return bag_features.copy()
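Because both dictionaries are CountVectorizer instances built with a fixed vocabulary, they can transform new essays without refitting, and each always emits one column per vocabulary entry, so the stemmed and unstemmed counts concatenate cleanly. A small self-contained sketch (the tiny vocabularies and essays below are made up for illustration):

import numpy
from sklearn.feature_extraction.text import CountVectorizer

# Fixed-vocabulary vectorizers need no fitting and always emit one column per term.
normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=["strong argument", "evidence"])
stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=["strong argu", "evid"])

essays = ["The evidence supports a strong argument.", "No citations at all."]
stemmed_essays = ["the evid support a strong argu", "no citat at all"]

vocabulary_features = numpy.concatenate(
    (stem_dict.transform(stemmed_essays).toarray(), normal_dict.transform(essays).toarray()),
    axis=1,
)
print(vocabulary_features.shape)  # (2, 4): two essays, two stemmed plus two unstemmed vocabulary counts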
-    def gen_feats(self, e_set):
+    def generate_features(self, essay_set):
         """
         Generates bag of words, length, and prompt features from an essay set object
-        returns an array of features
-        e_set - EssaySet object
-        """
-        bag_feats = self.gen_bag_feats(e_set)
-        length_feats = self.gen_length_feats(e_set)
-        prompt_feats = self.gen_prompt_feats(e_set)
-        overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
-        overall_feats = overall_feats.copy()
-        return overall_feats
+        Args:
+            essay_set (EssaySet): the essay set to extract features for
+        Returns:
+            An array of features that includes:
+                - length features
+                - vocabulary features (both normal and stemmed vocabulary)
+                - prompt features
+        """
+        vocabulary_features = self.gen_vocabulary_features(essay_set)
+        length_features = self.gen_length_feats(essay_set)
+        prompt_features = self.gen_prompt_feats(essay_set)
+        # Concatenates all of the feature blocks, copies the result, and returns it.
+        overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
+        overall_features = overall_features.copy()
+        return overall_features

     def gen_prompt_feats(self, e_set):
         """
...
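generate_features fixes the column order as (length, prompt, vocabulary), and the grading code shown below depends on that order staying identical between training and scoring. A toy illustration of the stacking, with made-up shapes standing in for the real feature blocks:

import numpy

length_features = numpy.zeros((3, 5))        # placeholder: 3 essays, 5 length-based columns
prompt_features = numpy.zeros((3, 2))        # placeholder: 2 prompt-overlap columns
vocabulary_features = numpy.zeros((3, 400))  # placeholder: 400 vocabulary count columns

overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
assert overall_features.shape == (3, 407)    # one row per essay, columns in a fixed order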
...
@@ -71,7 +71,7 @@ def grade(grader_data, submission):
     # Tries to extract features from submission and assign score via the model
     grader_features = None
     try:
-        grader_features = extractor.gen_feats(grader_set)
+        grader_features = extractor.generate_features(grader_set)
         feedback = extractor.gen_feedback(grader_set, grader_features)[0]
         results['score'] = int(model.predict(grader_features)[0])
     except:
...
@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features):
     # Try to extract features from submission and assign score via the model
     try:
-        grader_feats = extractor.gen_feats(grader_set)
+        grader_feats = extractor.generate_features(grader_set)
         results['score'] = model.predict(grader_feats)[0]
     except:
         error_msg = "Could not extract features and score essay."
...
...
@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
     f = feature_extractor.FeatureExtractor()
     f.initialize_dictionaries(essays)
-    train_feats = f.gen_feats(essays)
+    train_feats = f.generate_features(essays)
     set_score = numpy.asarray(essays._score, dtype=numpy.int)
     algorithm = create.select_algorithm(set_score)
...
...
@@ -57,7 +57,7 @@ class PredictorExtractor(object):
         max_feats2 = int(math.floor(200 / div_length))
         for i in xrange(0, len(p_set._essay_sets)):
             self._extractors.append(FeatureExtractor())
-            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
+            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
         self._initialized = True
         success = True
         return success
...
@@ -74,7 +74,7 @@ class PredictorExtractor(object):
         textual_features = []
         for i in xrange(0, len(p_set._essay_sets)):
-            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
+            textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
         textual_matrix = numpy.concatenate(textual_features, axis=1)
         predictor_matrix = numpy.array(p_set._numeric_features)
...
...
@@ -198,19 +198,29 @@ def regenerate_good_tokens(string):
     return sel_pos_ngrams

-def get_vocab(text, score, max_feats=750, max_feats2=200):
+def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     """
     Uses a fisher test to find words that are significant in that they separate
     high scoring essays from low scoring essays.
-    text is a list of input essays.
-    score is a list of scores, with score[n] corresponding to text[n]
-    max_feats is the maximum number of features to consider in the first pass
-    max_feats2 is the maximum number of features to consider in the second (final) pass
-    Returns a list of words that constitute the significant vocabulary
-    """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
-    dict_mat = dict.fit_transform(text)
-    set_score = numpy.asarray(score, dtype=numpy.int)
+    Args:
+        essays (list of str): a list of input essays
+        scores (list of int): a list of associated input scores
+    Kwargs:
+        max_features_pass_1 (int): the maximum number of features to consider in the first pass over the essays
+        max_features_pass_2 (int): the maximum number of features to consider in the second pass over the essays
+            (defaults of 750 and 200 respectively)
+    Returns:
+        (list of str): the significant vocabulary that differentiates between strong and weak essays
+    NOTE: GBW left the algorithm itself untouched because it is very easy to break.
+    """
+    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dict.fit_transform(essays)
+    set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
     if (med_score == 0):
...
@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
     new_score[set_score >= med_score] = 1
     fish_vals = []
-    for col_num in range(0, dict_mat.shape[1]):
-        loop_vec = dict_mat.getcol(col_num).toarray()
+    for col_num in range(0, dict_matrix.shape[1]):
+        loop_vec = dict_matrix.getcol(col_num).toarray()
         good_loop_vec = loop_vec[new_score == 1]
         bad_loop_vec = loop_vec[new_score == 0]
         good_loop_present = len(good_loop_vec[good_loop_vec > 0])
...
@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
         fish_vals.append(fish_val)
     cutoff = 1
-    if (len(fish_vals) > max_feats2):
-        cutoff = sorted(fish_vals)[max_feats2]
-    good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
+    if (len(fish_vals) > max_features_pass_2):
+        cutoff = sorted(fish_vals)[max_features_pass_2]
+    good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
     getVar = lambda searchList, ind: [searchList[i] for i in ind]
     vocab = getVar(dict.get_feature_names(), good_cols)
...
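For orientation, the selection step works from a 2x2 contingency per n-gram column: does the term appear in an essay, and is the essay above or below the median score; the columns with the smallest Fisher test values survive the second pass. The value computation itself sits in the lines elided above, so the sketch below substitutes scipy.stats.fisher_exact and current scikit-learn naming; treat it as an illustration of the idea, not the project's implementation:

import numpy
from scipy.stats import fisher_exact
from sklearn.feature_extraction.text import CountVectorizer

def significant_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
    # First pass: the most frequent unigrams/bigrams across all essays.
    vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
    counts = vectorizer.fit_transform(essays)
    high = numpy.asarray(scores) >= numpy.median(scores)  # split essays at the median score

    # Second pass: keep the terms whose presence differs most between the two groups.
    p_values = []
    for col in range(counts.shape[1]):
        present = counts[:, col].toarray().ravel() > 0
        table = [[int(numpy.sum(present & high)), int(numpy.sum(present & ~high))],
                 [int(numpy.sum(~present & high)), int(numpy.sum(~present & ~high))]]
        p_values.append(fisher_exact(table)[1])

    keep = sorted(numpy.argsort(p_values)[:max_features_pass_2])
    names = vectorizer.get_feature_names_out()
    return [names[i] for i in keep]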