Commit 2a2972f8 by gradyward

Working on feature_extractor.py, doing a lot of refactoring.

Committed not because of solid progress but because of how much has changed.
parent e1be348a
...@@ -32,6 +32,7 @@ class EssaySet(object): ...@@ -32,6 +32,7 @@ class EssaySet(object):
Essays in an essay set can be assumed to have these properties. Essays in an essay set can be assumed to have these properties.
""" """
def __init__(self, essay_type="train"): def __init__(self, essay_type="train"):
""" """
Initialize variables and check essay set type Initialize variables and check essay set type
......
...@@ -71,7 +71,7 @@ def grade(grader_data, submission): ...@@ -71,7 +71,7 @@ def grade(grader_data, submission):
# Tries to extract features from submission and assign score via the model # Tries to extract features from submission and assign score via the model
grader_features = None grader_features = None
try: try:
grader_features = extractor.gen_feats(grader_set) grader_features = extractor.generate_features(grader_set)
feedback = extractor.gen_feedback(grader_set, grader_features)[0] feedback = extractor.gen_feedback(grader_set, grader_features)[0]
results['score'] = int(model.predict(grader_features)[0]) results['score'] = int(model.predict(grader_features)[0])
except: except:
...@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features): ...@@ -154,7 +154,7 @@ def grade_generic(grader_data, numeric_features, textual_features):
# Try to extract features from submission and assign score via the model # Try to extract features from submission and assign score via the model
try: try:
grader_feats = extractor.gen_feats(grader_set) grader_feats = extractor.generate_features(grader_set)
results['score'] = model.predict(grader_feats)[0] results['score'] = model.predict(grader_feats)[0]
except: except:
error_msg = "Could not extract features and score essay." error_msg = "Could not extract features and score essay."
......
...@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit ...@@ -174,7 +174,7 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
f = feature_extractor.FeatureExtractor() f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays) f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays) train_feats = f.generate_features(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int) set_score = numpy.asarray(essays._score, dtype=numpy.int)
algorithm = create.select_algorithm(set_score) algorithm = create.select_algorithm(set_score)
......
...@@ -57,7 +57,7 @@ class PredictorExtractor(object): ...@@ -57,7 +57,7 @@ class PredictorExtractor(object):
max_feats2 = int(math.floor(200 / div_length)) max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)): for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor()) self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2) self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_features_pass_2=max_feats2)
self._initialized = True self._initialized = True
success = True success = True
return success return success
...@@ -74,7 +74,7 @@ class PredictorExtractor(object): ...@@ -74,7 +74,7 @@ class PredictorExtractor(object):
textual_features = [] textual_features = []
for i in xrange(0, len(p_set._essay_sets)): for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i])) textual_features.append(self._extractors[i].generate_features(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1) textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features) predictor_matrix = numpy.array(p_set._numeric_features)
......
...@@ -198,19 +198,29 @@ def regenerate_good_tokens(string): ...@@ -198,19 +198,29 @@ def regenerate_good_tokens(string):
return sel_pos_ngrams return sel_pos_ngrams
def get_vocab(text, score, max_feats=750, max_feats2=200): def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
""" """
Uses a fisher test to find words that are significant in that they separate Uses a fisher test to find words that are significant in that they separate
high scoring essays from low scoring essays. high scoring essays from low scoring essays.
text is a list of input essays.
score is a list of scores, with score[n] corresponding to text[n] Args:
max_feats is the maximum number of features to consider in the first pass essays (list of str): a list of input essays
max_feats2 is the maximum number of features to consider in the second (final) pass scores (list of int): a list of associated input scores
Returns a list of words that constitute the significant vocabulary
""" Kwargs:
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats) max_features_pass_1 (int): the maximum number of features to consider in the first pass of the essays
dict_mat = dict.fit_transform(text) max_features_pass_2 (int): the maximum number of features to consider in the second pass of the essays
set_score = numpy.asarray(score, dtype=numpy.int) (DEFAULTS of 750 and 200 respectively)
Returns:
(list of str): A list of strings which constitute the significant vocabulary which differentiates between
strong and weak essays.
NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
"""
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
dict_matrix = dict.fit_transform(essays)
set_score = numpy.asarray(scores, dtype=numpy.int)
med_score = numpy.median(set_score) med_score = numpy.median(set_score)
new_score = set_score new_score = set_score
if (med_score == 0): if (med_score == 0):
...@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200): ...@@ -219,8 +229,8 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
new_score[set_score >= med_score] = 1 new_score[set_score >= med_score] = 1
fish_vals = [] fish_vals = []
for col_num in range(0, dict_mat.shape[1]): for col_num in range(0, dict_matrix.shape[1]):
loop_vec = dict_mat.getcol(col_num).toarray() loop_vec = dict_matrix.getcol(col_num).toarray()
good_loop_vec = loop_vec[new_score == 1] good_loop_vec = loop_vec[new_score == 1]
bad_loop_vec = loop_vec[new_score == 0] bad_loop_vec = loop_vec[new_score == 0]
good_loop_present = len(good_loop_vec[good_loop_vec > 0]) good_loop_present = len(good_loop_vec[good_loop_vec > 0])
...@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200): ...@@ -231,9 +241,9 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals.append(fish_val) fish_vals.append(fish_val)
cutoff = 1 cutoff = 1
if (len(fish_vals) > max_feats2): if (len(fish_vals) > max_features_pass_2):
cutoff = sorted(fish_vals)[max_feats2] cutoff = sorted(fish_vals)[max_features_pass_2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff]) good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
getVar = lambda searchList, ind: [searchList[i] for i in ind] getVar = lambda searchList, ind: [searchList[i] for i in ind]
vocab = getVar(dict.get_feature_names(), good_cols) vocab = getVar(dict.get_feature_names(), good_cols)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment