Commit 3e7a4342 by Vik Paruchuri

Add flexible maximum for feature counts

parent 2e7f9e6a
...@@ -32,7 +32,7 @@ class FeatureExtractor(object): ...@@ -32,7 +32,7 @@ class FeatureExtractor(object):
self._spell_errors_per_character=0 self._spell_errors_per_character=0
self._grammar_errors_per_character=0 self._grammar_errors_per_character=0
def initialize_dictionaries(self, e_set): def initialize_dictionaries(self, e_set, max_feats2 = 200):
""" """
Initializes dictionaries from an essay set object Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features Dictionaries must be initialized prior to using this to extract features
...@@ -41,8 +41,8 @@ class FeatureExtractor(object): ...@@ -41,8 +41,8 @@ class FeatureExtractor(object):
""" """
if(hasattr(e_set, '_type')): if(hasattr(e_set, '_type')):
if(e_set._type == "train"): if(e_set._type == "train"):
nvocab = util_functions.get_vocab(e_set._text, e_set._score) nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score) svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab) self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab) self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True self.dict_initialized = True
......
...@@ -9,6 +9,8 @@ from itertools import chain ...@@ -9,6 +9,8 @@ from itertools import chain
import copy import copy
import operator import operator
import logging import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
...@@ -23,6 +25,26 @@ log = logging.getLogger(__name__) ...@@ -23,6 +25,26 @@ log = logging.getLogger(__name__)
class PredictorExtractor(object): class PredictorExtractor(object):
def __init__(self): def __init__(self):
pass self._extractors = []
def initialize_dictionaries(self, p_set):
    """
    Builds and initializes one FeatureExtractor per essay set in a predictor set
    p_set - object exposing _type (must equal "train") and _essay_sets (a sequence)
    A total budget of 200 is divided evenly between the essay sets and handed to
    each extractor as max_feats2.
    Returns True if at least one extractor was initialized, False if p_set has no essay sets
    Raises util_functions.InputError if p_set has no _type or is not of the train type
    """
    if getattr(p_set, '_type', None) != "train":
        error_message = "needs to be an essay set of the train type."
        # log.error rather than log.exception: no exception is being handled
        # at this point, so there is no traceback to attach to the record.
        log.error(error_message)
        raise util_functions.InputError(p_set, error_message)

    essay_sets = p_set._essay_sets

    # Integer division keeps max_feats2 an int on both Python 2 and 3, and the
    # max(..., 1) guard avoids a ZeroDivisionError when there are no essay sets.
    max_feats2 = 200 // max(len(essay_sets), 1)

    # Build into a fresh list so that extractor i always corresponds to essay
    # set i, even when this method is called more than once on the same object.
    extractors = []
    for essay_set in essay_sets:
        extractor = FeatureExtractor()
        extractor.initialize_dictionaries(essay_set, max_feats2=max_feats2)
        extractors.append(extractor)

    self._extractors = extractors
    return len(extractors) > 0
def gen_feats(self, p_set):
...@@ -36,7 +36,7 @@ class PredictorSet(object): ...@@ -36,7 +36,7 @@ class PredictorSet(object):
self._target=[] self._target=[]
self._textual_features=[] self._textual_features=[]
self._numeric_features=[] self._numeric_features=[]
self.essay_sets=[] self._essay_sets=[]
def add_row(self, numeric_features, textual_features, target): def add_row(self, numeric_features, textual_features, target):
...@@ -95,7 +95,7 @@ class PredictorSet(object): ...@@ -95,7 +95,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed #Create essay sets for textual features if needed
if len(self._textual_features)==0: if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)): for i in xrange(0,len(textual_features)):
self.essay_sets.append(EssaySet(type=self._type)) self._essay_sets.append(EssaySet(type=self._type))
#Add numeric and textual features #Add numeric and textual features
self._numeric_features.append(numeric_features) self._numeric_features.append(numeric_features)
...@@ -106,5 +106,5 @@ class PredictorSet(object): ...@@ -106,5 +106,5 @@ class PredictorSet(object):
#Add textual features to essay sets #Add textual features to essay sets
for i in xrange(0,len(textual_features)): for i in xrange(0,len(textual_features)):
self.essay_sets[i].add_essay(textual_features[i], target[i]) self._essay_sets[i].add_essay(textual_features[i], target[i])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment