Commit 2a2972f8 by gradyward

Working on feature_extractor.py, doing a lot of refactoring.

Committed not because of solid progress but because of how much has changed.
parent e1be348a
......@@ -32,6 +32,7 @@ class EssaySet(object):
Essays in an essay set can be assumed to have these properties.
"""
def __init__(self, essay_type="train"):
"""
Initialize variables and check essay set type
......
......@@ -30,85 +30,123 @@ ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
class FeatureExtractor(object):
def __init__(self):
self._good_pos_ngrams = self.get_good_pos_ngrams()
self.dict_initialized = False
self._spell_errors_per_character = 0
self._grammar_errors_per_character = 0
"""
A feature extractor which uses NLTK and some statistics to extract features from an essay set
so that the essays can be classified.
"""
def initialize_dictionaries(self, e_set, max_feats2=200):
def __init__(self, essay_set, max_features_pass_2=200):
"""
Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features
e_set is an input essay set
returns a confirmation of initialization
Initializes the requisite dictionaries/statistics before feature extraction can occur.
This was originally split between __init__ and an initialize_dictionaries method, but the two were never
called separately, so they have been combined.
Args:
essay_set: the input set of essays that the feature extractor is built from and will extract features from
Kwargs:
max_features_pass_2: the maximum number of features to consider on the second pass of vocabulary grooming
"""
if (hasattr(e_set, '_type')):
if (e_set._type == "train"):
#normal text (unstemmed) useful words/bigrams
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2=max_feats2)
#stemmed and spell corrected vocab useful words/ngrams
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
#dictionary trained on proper vocab
self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=nvocab)
#dictionary trained on proper vocab
self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=svocab)
if hasattr(essay_set, '_type'):
if essay_set._type == "train":
# Finds vocabulary which differentiates good/high scoring essays from bad/low scoring essays.
normal_vocab = util_functions.get_vocab(
essay_set._cleaned_spelled_essays, essay_set._scores, max_features_pass_2=max_features_pass_2
)
# Finds vocab (same criteria as above), but from essays that have been Porter-stemmed
stemmed_vocab = util_functions.get_vocab(
essay_set._cleaned_stem_essays, essay_set._scores, max_features_pass_2=max_features_pass_2
)
# Constructs dictionaries trained based on the important vocabularies
self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=normal_vocab)
self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=stemmed_vocab)
# Sets the flag to show that this instance is now ready for training
self.dict_initialized = True
#Average spelling errors in set. needed later for spelling detection
self._mean_spelling_errors = sum(e_set._spelling_errors) / float(len(e_set._spelling_errors))
self._spell_errors_per_character = sum(e_set._spelling_errors) / float(
sum([len(t) for t in e_set._text]))
#Gets the number and positions of grammar errors
good_pos_tags, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
self._grammar_errors_per_character = (sum(good_pos_tags) / float(sum([len(t) for t in e_set._text])))
#Generate bag of words features
bag_feats = self.gen_bag_feats(e_set)
#Sum of a row of bag of words features (topical words in an essay)
f_row_sum = numpy.sum(bag_feats[:, :])
#Average index of how "topical" essays are
self._mean_f_prop = f_row_sum / float(sum([len(t) for t in e_set._text]))
ret = "ok"
# Average the number of spelling errors in the set. This is needed later for spelling detection.
spelling_errors = essay_set._spelling_errors
self._mean_spelling_errors = sum(spelling_errors) / float(len(spelling_errors))
self._spell_errors_per_character = sum(spelling_errors) / float(
sum([len(essay) for essay in essay_set._cleaned_essays]))
# Gets the number and positions of grammar errors
good_pos_tags, bad_pos_positions = self._get_grammar_errors(
essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens
)
# NOTE: Here, I changed the definition from utilizing good grammar ratios to using the count of
# grammatical errors. Though this is not what the original author used, it is clearly what his code
# implies: if this is intended to be a true "grammar errors per character", we should count the
# errors themselves. The replaced call is included for posterity.
# self._grammar_errors_per_character =
# (sum(good_pos_tags) / float(sum([len(t) for t in essay_set._text])))
total_grammar_errors = sum(len(positions) for positions in bad_pos_positions)
total_characters = float(sum([len(essay) for essay in essay_set._cleaned_essays]))
self._grammar_errors_per_character = total_grammar_errors / total_characters
# Generates a bag of vocabulary features
vocabulary_features = self.gen_vocabulary_features(essay_set)
# Sum over all bag-of-words features (counts of topical words across the essays)
feature_row_sum = numpy.sum(vocabulary_features[:, :])
# Average index of how "topical" the essays are
self._mean_f_prop = feature_row_sum / float(sum([len(essay) for essay in essay_set._cleaned_essays]))
else:
raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
raise util_functions.InputError(essay_set, "needs to be an essay set of the train type.")
else:
raise util_functions.InputError(e_set, "wrong input. need an essay set object")
return ret
raise util_functions.InputError(essay_set, "wrong input. need an essay set object.")
self._good_pos_ngrams = self.get_good_pos_ngrams()
self.dict_initialized = False
self._spell_errors_per_character = 0
self._grammar_errors_per_character = 0
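# Illustrative sketch (not part of this commit): with the combined constructor, a trained extractor is
# built in one step instead of FeatureExtractor() followed by initialize_dictionaries(). Module paths and
# the way the training set is populated are assumptions for illustration only.
from essay_set import EssaySet
from feature_extractor import FeatureExtractor
training_set = EssaySet(essay_type="train")  # must be a "train" set, or __init__ raises InputError
# ... populate training_set with scored essays here (EssaySet's add method is not shown in this diff) ...
extractor = FeatureExtractor(training_set, max_features_pass_2=200)
training_features = extractor.generate_features(training_set)  # length + prompt + vocabulary features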
def get_good_pos_ngrams(self):
"""
Gets a list of gramatically correct part of speech sequences from an input file called essaycorpus.txt
Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file
Returns:
A list of all grammatically correct part-of-speech sequences (POS n-grams).
"""
if (os.path.isfile(NGRAM_PATH)):
good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
elif os.path.isfile(ESSAY_CORPUS_PATH):
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = util_functions.sub_chars(essay_corpus)
good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
else:
#Hard coded list in case the needed files cannot be found
#Hard-coded (incomplete) fallback list in case the needed files cannot be found
good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
'. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP',
'TO NNP', 'TO NNP NNP']
return good_pos_ngrams
def _get_grammar_errors(self, pos, text, tokens):
def _get_grammar_errors(self, pos, essays, tokens):
"""
Internal function to get the number of grammar errors in given text
pos - part of speech tagged text (list)
text - normal text (list)
tokens - list of lists of tokenized text
Args:
pos: list of the POS tag sequences for each essay in the set
essays: list of essay texts
tokens: list of the lists of the tokens in each essay
Returns:
Tuple of the form (good_grammar_ratios, bad_pos_positions)
The former is a list of each essay's "good grammar ratio", which is not very well defined.
The latter gives, for each essay, the positions in its token sequence where grammatical mistakes occur.
"""
word_counts = [max(len(t), 1) for t in tokens]
good_pos_tags = []
good_grammar_ratios = []
min_pos_seq = 2
max_pos_seq = 4
bad_pos_positions = []
for i in xrange(0, len(text)):
for i in xrange(0, len(essays)):
pos_seq = [tag[1] for tag in pos[i]]
pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
long_pos_ngrams = [z for z in pos_ngrams if z.count(' ') == (max_pos_seq - 1)]
......@@ -134,8 +172,8 @@ class FeatureExtractor(object):
if divisor == 0:
divisor = 1
good_grammar_ratio = (len(pos_ngrams) - len(overlap_ngrams)) / divisor
good_pos_tags.append(good_grammar_ratio)
return good_pos_tags, bad_pos_positions
good_grammar_ratios.append(good_grammar_ratio)
return good_grammar_ratios, bad_pos_positions
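# Illustrative sketch (not part of this commit) of the changed definition noted in __init__ above: the
# per-character rate now counts actual flagged positions rather than summing "good grammar ratios".
# The variable names and toy data below are hypothetical.
bad_pos_positions_example = [[2, 7], [], [4]]  # three essays: 2, 0, and 1 flagged token positions
essays_example = ["first essay text", "second essay", "third essay here"]
total_errors = sum(len(positions) for positions in bad_pos_positions_example)  # 3
total_characters = float(sum(len(essay) for essay in essays_example))
errors_per_character = total_errors / total_characters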
def gen_length_feats(self, e_set):
"""
......@@ -161,34 +199,46 @@ class FeatureExtractor(object):
return length_arr.copy()
def gen_bag_feats(self, e_set):
def gen_vocabulary_features(self, essay_set):
"""
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
e_set - EssaySet object
Generates bag-of-words features from an essay set and a trained FeatureExtractor (self)
Args:
self: the trained FeatureExtractor (its dictionaries are built in __init__)
essay_set: the EssaySet object to generate the bag-of-words features from
Returns:
An array of bag-of-words features for the essays in the set
"""
if (hasattr(self, '_stem_dict')):
sfeats = self._stem_dict.transform(e_set._clean_stem_text)
nfeats = self._normal_dict.transform(e_set._text)
bag_feats = numpy.concatenate((sfeats.toarray(), nfeats.toarray()), axis=1)
else:
raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")
return bag_feats.copy()
# Calculates Stem and Normal features
stem_features = self._stem_dict.transform(essay_set._cleaned_stem_essays)
normal_features = self._normal_dict.transform(essay_set._cleaned_essays)
# Concatenates the two feature blocks and returns
bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
return bag_features.copy()
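# Illustrative sketch (not part of this commit): the bag-of-words dictionaries above are CountVectorizers
# built with a fixed vocabulary, so transform() can be called without fitting. Toy data, hypothetical names.
from sklearn.feature_extraction.text import CountVectorizer
toy_vocab = ["cell", "organism", "cell membrane"]
toy_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=toy_vocab)
toy_counts = toy_dict.transform(["the cell membrane surrounds the cell"]).toarray()
# toy_counts is a 1 x 3 array of n-gram counts, one column per vocabulary entry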
def gen_feats(self, e_set):
def generate_features(self, essay_set):
"""
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
e_set - EssaySet object
"""
bag_feats = self.gen_bag_feats(e_set)
length_feats = self.gen_length_feats(e_set)
prompt_feats = self.gen_prompt_feats(e_set)
overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
overall_feats = overall_feats.copy()
return overall_feats
Args:
essay_set (EssaySet): the essay set to extract features for
Returns:
Array of features with the following included:
- Length Features
- Vocabulary Features (both Normal and Stemmed Vocabulary)
- Prompt Features
"""
vocabulary_features = self.gen_vocabulary_features(essay_set)
length_features = self.gen_length_feats(essay_set)
prompt_features = self.gen_prompt_feats(essay_set)
# Concatenates them all, copies the result, and returns
overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
overall_features = overall_features.copy()
return overall_features
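# Illustrative sketch (not part of this commit): the final matrix is a horizontal concatenation, so the
# column order is [length features | prompt features | vocabulary features]. Shapes below are made up.
import numpy
num_essays = 10
length_features = numpy.zeros((num_essays, 5))
prompt_features = numpy.zeros((num_essays, 1))
vocabulary_features = numpy.zeros((num_essays, 400))
overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
assert overall_features.shape == (num_essays, 5 + 1 + 400)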
def gen_prompt_feats(self, e_set):
"""
......
......@@ -71,7 +71,7 @@ def grade(grader_data, submission):
# Tries to extract features from submission and assign score via the model
grader_features = None
try:
grader_features = extractor.gen_feats(grader_set)
grader_features = extractor.generate_features(grader_set)
feedback = extractor.gen_feedback(grader_set, grader_features)[0]
results['score'] = int(model.predict(grader_features)[0])
except:
......@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features):
# Try to extract features from submission and assign score via the model
try:
grader_feats = extractor.gen_feats(grader_set)
grader_feats = extractor.generate_features(grader_set)
results['score'] = model.predict(grader_feats)[0]
except:
error_msg = "Could not extract features and score essay."
......
......@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
train_feats = f.generate_features(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
algorithm = create.select_algorithm(set_score)
......
......@@ -57,7 +57,7 @@ class PredictorExtractor(object):
max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
self._initialized = True
success = True
return success
......@@ -74,7 +74,7 @@ class PredictorExtractor(object):
textual_features = []
for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features)
......
......@@ -198,19 +198,29 @@ def regenerate_good_tokens(string):
return sel_pos_ngrams
def get_vocab(text, score, max_feats=750, max_feats2=200):
def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
"""
Uses a Fisher test to find words that are significant in that they separate
high scoring essays from low scoring essays.
text is a list of input essays.
score is a list of scores, with score[n] corresponding to text[n]
max_feats is the maximum number of features to consider in the first pass
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int)
Args:
essays (list of str): a list of input essays
scores (list of int): a list of associated input scores
Kwargs:
max_features_pass_1 (int): the maximum number of features to consider in the first pass of the essays
max_features_pass_2 (int): the maximum number of features to consider in the second pass of the essays
(DEFAULTS of 750 and 200 respectively)
Returns:
(list of str): A list of strings which constitute the significant vocabulary that differentiates between
strong and weak essays.
NOTE: GBW left the core logic of this function untouched because it is very easy to break.
"""
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
dict_matrix = dict.fit_transform(essays)
set_score = numpy.asarray(scores, dtype=numpy.int)
med_score = numpy.median(set_score)
new_score = set_score
if (med_score == 0):
......@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
new_score[set_score >= med_score] = 1
fish_vals = []
for col_num in range(0, dict_mat.shape[1]):
loop_vec = dict_mat.getcol(col_num).toarray()
for col_num in range(0, dict_matrix.shape[1]):
loop_vec = dict_matrix.getcol(col_num).toarray()
good_loop_vec = loop_vec[new_score == 1]
bad_loop_vec = loop_vec[new_score == 0]
good_loop_present = len(good_loop_vec[good_loop_vec > 0])
......@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals.append(fish_val)
cutoff = 1
if (len(fish_vals) > max_feats2):
cutoff = sorted(fish_vals)[max_feats2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
if (len(fish_vals) > max_features_pass_2):
cutoff = sorted(fish_vals)[max_features_pass_2]
good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
getVar = lambda searchList, ind: [searchList[i] for i in ind]
vocab = getVar(dict.get_feature_names(), good_cols)
......
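# Illustrative sketch (not part of this commit): for each vocabulary column, a 2x2 presence/absence table
# is scored with a Fisher exact test (shown here with scipy; the library the module actually uses may
# differ); small p-values mark terms that separate high-scoring from low-scoring essays. Counts are made up.
from scipy.stats import fisher_exact
good_loop_present, good_loop_absent = 40, 10  # term present/absent in high-scoring essays
bad_loop_present, bad_loop_absent = 5, 45     # term present/absent in low-scoring essays
odds_ratio, fish_val = fisher_exact([[good_loop_present, bad_loop_present],
                                     [good_loop_absent, bad_loop_absent]])
# Columns whose fish_val falls at or below the cutoff survive into the final vocabulary.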