Add flexible maximum for feature counts

3e7a4342 · Vik Paruchuri · 2e7f9e6a · 3e7a4342 · 3e7a4342 · 3e7a4342
Commit 3e7a4342 authored Jan 24, 2013 by Vik Paruchuri
Show whitespace changes
Inline Side-by-side

Showing with 29 additions and 7 deletions

feature_extractor.py
+3 -3

predictor_extractor.py
+23 -1

predictor_set.py
+3 -3

No files found.
--- a/feature_extractor.py
+++ b/feature_extractor.py
@@ -32,7 +32,7 @@ class FeatureExtractor(object):
        self._spell_errors_per_character=0
        self._grammar_errors_per_character=0
-    def initialize_dictionaries(self, e_set):
+    def initialize_dictionaries(self, e_set, max_feats2 = 200):
        """
        Initializes dictionaries from an essay set object
        Dictionaries must be initialized prior to using this to extract features
@@ -41,8 +41,8 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
-                nvocab = util_functions.get_vocab(e_set._text, e_set._score)
+                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
-                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
+                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                self.dict_initialized = True

--- a/predictor_extractor.py
+++ b/predictor_extractor.py
@@ -9,6 +9,8 @@ from itertools import chain
 import copy
 import operator
 import logging
+import math
+from feature_extractor import FeatureExtractor
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
@@ -23,6 +25,26 @@ log = logging.getLogger(__name__)
 class PredictorExtractor(object):
    def __init__(self):
-        pass
+        self._extractors = []
+    def initialize_dictionaries(self, p_set):
+        """
+        Initializes one FeatureExtractor per essay set held by a predictor set
+        p_set - object with _type equal to "train" and a list attribute _essay_sets
+        A budget of 200 is split evenly between the essay sets and each share is
+        passed to FeatureExtractor.initialize_dictionaries as max_feats2
+        Returns True once every extractor has been initialized
+        Raises util_functions.InputError if p_set is missing _type or is not of the train type
+        """
+        if getattr(p_set, '_type', None) != "train":
+            error_message = "needs to be an essay set of the train type."
+            #log.error rather than log.exception: no exception is being handled at this point
+            log.error(error_message)
+            raise util_functions.InputError(p_set, error_message)
+        essay_sets = p_set._essay_sets
+        #Integer share of the budget (matches the int default of max_feats2)
+        #max(..., 1) avoids a ZeroDivisionError when there are no essay sets
+        max_feats2 = 200 // max(len(essay_sets), 1)
+        #Build a fresh list so a repeated call does not leave stale extractors at the front
+        extractors = []
+        for e_set in essay_sets:
+            extractor = FeatureExtractor()
+            extractor.initialize_dictionaries(e_set, max_feats2=max_feats2)
+            extractors.append(extractor)
+        self._extractors = extractors
+        return True
+    def gen_feats(self, p_set):
--- a/predictor_set.py
+++ b/predictor_set.py
@@ -36,7 +36,7 @@ class PredictorSet(object):
        self._target=[]
        self._textual_features=[]
        self._numeric_features=[]
-        self.essay_sets=[]
+        self._essay_sets=[]
    def add_row(self, numeric_features, textual_features, target):
@@ -95,7 +95,7 @@ class PredictorSet(object):
        #Create essay sets for textual features if needed
        if len(self._textual_features)==0:
            for i in xrange(0,len(textual_features)):
-                self.essay_sets.append(EssaySet(type=self._type))
+                self._essay_sets.append(EssaySet(type=self._type))
        #Add numeric and textual features
        self._numeric_features.append(numeric_features)
@@ -106,5 +106,5 @@ class PredictorSet(object):
        #Add textual features to essay sets
        for i in xrange(0,len(textual_features)):
-            self.essay_sets[i].add_essay(textual_features[i], target[i])
+            self._essay_sets[i].add_essay(textual_features[i], target[i])