Commit 3e7a4342 by Vik Paruchuri

Add flexible maximum for feature counts

parent 2e7f9e6a
......@@ -32,7 +32,7 @@ class FeatureExtractor(object):
self._spell_errors_per_character=0
self._grammar_errors_per_character=0
def initialize_dictionaries(self, e_set):
def initialize_dictionaries(self, e_set, max_feats2 = 200):
"""
Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features
......@@ -41,8 +41,8 @@ class FeatureExtractor(object):
"""
if(hasattr(e_set, '_type')):
if(e_set._type == "train"):
nvocab = util_functions.get_vocab(e_set._text, e_set._score)
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True
......
......@@ -9,6 +9,8 @@ from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
......@@ -23,6 +25,26 @@ log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
    """
    Creates a predictor extractor that holds no feature extractors yet
    """
    # Starts empty; initialize_dictionaries appends FeatureExtractor objects to this list
    self._extractors = []
def initialize_dictionaries(self, p_set):
    """
    Initializes one FeatureExtractor per essay set held by a predictor set object
    Dictionaries must be initialized prior to using this to extract features
    p_set - a predictor set object whose _type is "train" and which has an _essay_sets list
    Returns True once every extractor has been initialized
    Raises util_functions.InputError if p_set has no _type attribute or is not of the train type
    """
    if not hasattr(p_set, '_type') or p_set._type != "train":
        error_message = "needs to be an essay set of the train type."
        # No exception is being handled at this point, so log.error is used;
        # log.exception is meant to be called from inside an except block.
        log.error(error_message)
        raise util_functions.InputError(p_set, error_message)

    essay_sets = p_set._essay_sets
    extractors = []
    if essay_sets:
        # Share the budget of 200 features evenly between the essay sets.
        # Floor division keeps the value an integer on both Python 2 and 3
        # (math.floor returns a float on Python 2).
        max_feats2 = 200 // len(essay_sets)
        for essay_set in essay_sets:
            extractor = FeatureExtractor()
            extractor.initialize_dictionaries(essay_set, max_feats2=max_feats2)
            extractors.append(extractor)

    # Replace rather than extend, so a repeated call cannot leave stale or
    # uninitialized extractors behind.
    self._extractors = extractors
    return True
def gen_feats(self, p_set):
......@@ -36,7 +36,7 @@ class PredictorSet(object):
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self.essay_sets=[]
self._essay_sets=[]
def add_row(self, numeric_features, textual_features, target):
......@@ -95,7 +95,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
self.essay_sets.append(EssaySet(type=self._type))
self._essay_sets.append(EssaySet(type=self._type))
#Add numeric and textual features
self._numeric_features.append(numeric_features)
......@@ -106,5 +106,5 @@ class PredictorSet(object):
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
self.essay_sets[i].add_essay(textual_features[i], target[i])
self._essay_sets[i].add_essay(textual_features[i], target[i])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment