Commit b8f9cdfc by Vik Paruchuri

Create generic ml algo and test for it

parent 3e7a4342
...@@ -22,10 +22,10 @@ if not base_path.endswith("/"): ...@@ -22,10 +22,10 @@ if not base_path.endswith("/"):
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
class PredictorExtractor(object): class PredictorExtractor(object):
def __init__(self): def __init__(self):
self._extractors = [] self._extractors = []
self._initialized = False
def initialize_dictionaries(self, p_set): def initialize_dictionaries(self, p_set):
success = False success = False
...@@ -39,12 +39,30 @@ class PredictorExtractor(object): ...@@ -39,12 +39,30 @@ class PredictorExtractor(object):
log.exception(error_message) log.exception(error_message)
raise util_functions.InputError(p_set, error_message) raise util_functions.InputError(p_set, error_message)
max_feats2 = math.floor(200/len(p_set._essay_sets)) max_feats2 = int(math.floor(200/len(p_set._essay_sets)))
for i in xrange(0,len(p_set._essay_sets)): for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor()) self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2) self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
success = True
return success return success
def gen_feats(self, p_set):
    """Build the combined feature matrix for a predictor set.

    Each essay set in ``p_set._essay_sets`` is passed to the extractor
    stored at the same index in ``self._extractors``; the resulting
    matrices are joined column-wise, and the numeric features from
    ``p_set._numeric_features`` are appended as the trailing columns.

    Args:
        p_set: object exposing ``_essay_sets`` (one entry per textual
            feature) and ``_numeric_features`` (one row per sample).

    Returns:
        A new numpy array: textual feature columns followed by the
        numeric feature columns.

    Raises:
        util_functions.InputError: if ``initialize_dictionaries`` has not
            completed successfully on this extractor.
    """
    if not self._initialized:
        error_message = "Dictionaries have not been initialized."
        # log.error rather than log.exception: no exception is being
        # handled at this point, so there is no traceback to attach.
        log.error(error_message)
        raise util_functions.InputError(p_set, error_message)

    # One feature matrix per essay set, produced by the extractor that sits
    # at the same position in self._extractors.
    textual_features = [
        self._extractors[index].gen_feats(essay_set)
        for index, essay_set in enumerate(p_set._essay_sets)
    ]
    textual_matrix = numpy.concatenate(textual_features, axis=1)
    predictor_matrix = numpy.array(p_set._numeric_features)

    # numpy.concatenate already allocates a fresh array, so no extra copy
    # is needed before returning it.
    return numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
...@@ -95,7 +95,7 @@ class PredictorSet(object): ...@@ -95,7 +95,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed #Create essay sets for textual features if needed
if len(self._textual_features)==0: if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)): for i in xrange(0,len(textual_features)):
self._essay_sets.append(EssaySet(type=self._type)) self._essay_sets.append(essay_set.EssaySet(type=self._type))
#Add numeric and textual features #Add numeric and textual features
self._numeric_features.append(numeric_features) self._numeric_features.append(numeric_features)
...@@ -106,5 +106,5 @@ class PredictorSet(object): ...@@ -106,5 +106,5 @@ class PredictorSet(object):
#Add textual features to essay sets #Add textual features to essay sets
for i in xrange(0,len(textual_features)): for i in xrange(0,len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target[i]) self._essay_sets[i].add_essay(textual_features[i], target)
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)
import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
# Make sure base_path ends with a separator.
if not base_path.endswith("/"):
    base_path = base_path + "/"

FILENAME = "sa_data.tsv"

# Texts must be strictly longer than this many characters to be used.
# The script referenced t_len without ever defining it (NameError); 0 keeps
# every non-empty text.
# NOTE(review): confirm the intended threshold.
t_len = 0

# Result accumulators; these were appended to below but never created.
all_err = []
all_kappa = []

# open() in a with-block replaces the Python-2-only file() builtin and
# guarantees the handle is closed once the lines have been read.
with open(FILENAME) as sa_val:
    lines = sa_val.readlines()

scores = []
texts = []
pset = predictor_set.PredictorSet(type="train")

# Iteration starts at the second line (index 1), presumably to skip a
# header row.
for line in lines[1:]:
    # Split on the first separator only, so text that itself contains a tab
    # followed by a quote does not break the two-value unpacking.
    score, text = line.split("\t\"", 1)
    if len(text) > t_len:
        scores.append(int(score))
        texts.append(text)
        pset.add_row([1], [text], int(score))

extractor = predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats = extractor.gen_feats(pset)

# NOTE(review): newer scikit-learn releases spell this parameter
# learning_rate; learn_rate is kept to match the version this was written for.
clf = GradientBoostingClassifier(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)
cv_preds = util_functions.gen_cv_preds(clf, train_feats, scores)

# Mean absolute difference between cross-validated predictions and scores.
err = numpy.mean(numpy.abs(cv_preds - scores))
print(err)
kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
print(kappa)
all_err.append(err)
all_kappa.append(kappa)
"""
outfile=open("full_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}".format(cv_preds[i],scores[i]))
"""
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment