Commit fc598c06 by Vik Paruchuri

Modify cv test to run with hewlett stuff

parent 0f3c2dac
 import os
 import sys
-base_path = os.path.dirname(__file__)
+#base_path = os.path.dirname(__file__)
+base_path = "/home/vik/mitx_all/machine-learning"
 sys.path.append(base_path)
-one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
+one_up_path=os.path.abspath(os.path.join(base_path,'..'))
 sys.path.append(one_up_path)
 import util_functions
@@ -17,44 +18,83 @@ from sklearn.ensemble import GradientBoostingClassifier
 if not base_path.endswith("/"):
     base_path=base_path+"/"
-filenames = ['LSQ_W09_60_MLT.tsv',
-             'LSQ_W10_22_a.tsv',
-             'LSQ_W11_21_MLT.tsv',
-             ]
+data_path = "/home/vik/mitx_all/vik_sandbox/hewlett_essay_data/split_data"
+if not data_path.endswith("/"):
+    data_path=data_path+"/"
+filenames = [str(i) +".tsv" for i in xrange(1,19)]
+kappas = []
+errs = []
+percent_errors=[]
+human_kappas=[]
+human_errs=[]
+human_percent_errors=[]
 for filename in filenames:
-    base_name = base_path + filename
+    base_name = data_path + filename
     print base_name
     sa_val = file(base_name)
-    scores=[]
+    id_vals=[]
+    essay_set_nums=[]
+    score1s=[]
+    score2s=[]
     texts=[]
     lines=sa_val.readlines()
     eset=essay_set.EssaySet(type="train")
-    for i in xrange(1,len(lines)):
-        score,text=lines[i].split("\t\"")
-        scores.append(int(score))
+    #len(lines)
+    for i in xrange(1,10):
+        id_val,essay_set_num,score1,score2,text=lines[i].split("\t")
+        score1s.append(int(score1))
+        score2s.append(int(score2))
         texts.append(text)
-        eset.add_essay(text,int(score))
+        essay_set_nums.append(essay_set_num)
+        id_vals.append(id_val)
+        eset.add_essay(text,int(score1))
         #if int(score)==0:
         #    eset.generate_additional_essays(text,int(score))
     extractor=feature_extractor.FeatureExtractor()
     extractor.initialize_dictionaries(eset)
     train_feats=extractor.gen_feats(eset)
     clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
-    cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores, num_chunks = int(math.floor(len(texts)/2)))
-    err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
+    try:
+        cv_preds=util_functions.gen_cv_preds(clf,train_feats,score1s, num_chunks = 3) # int(math.floor(len(texts)/2))
+    except:
+        cv_preds = score1s
+    err=numpy.mean(numpy.abs(numpy.array(cv_preds)-score1s))
+    errs.append(err)
     print err
-    kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
+    kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),score1s)
+    kappas.append(kappa)
     print kappa
-    percent_error = numpy.mean(numpy.abs(scores - numpy.array(cv_preds))/scores)
+    percent_error = numpy.mean(numpy.abs(score1s - numpy.array(cv_preds))/score1s)
+    percent_errors.append(percent_error)
     print percent_error
-    outfile=open(filename + "_cvout.tsv",'w+')
-    outfile.write("cv_pred" + "\t" + "actual\n")
+    human_err=numpy.mean(numpy.abs(numpy.array(score2s)-score1s))
+    human_errs.append(human_err)
+    print human_err
+    human_kappa=util_functions.quadratic_weighted_kappa(list(score2s),score1s)
+    human_kappas.append(human_kappa)
+    print human_kappa
+    human_percent_error = numpy.mean(numpy.abs(score1s - numpy.array(score2s))/score1s)
+    human_percent_errors.append(human_percent_error)
+    print human_percent_error
+    outfile=open(data_path + "outdata/" + filename + ".tsv",'w+')
+    outfile.write("cv_pred" + "\t" + "actual1\t" + "actual2\n")
     for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}\n".format(str(cv_preds[i]),str(scores[i]))) outfile.write("{0}\t{1}\t{2}\n".format(str(cv_preds[i]),str(score1s[i]), str(scores2s[i])))
outfile.close() outfile.close()
+outfile=open(data_path + "outdata/summary.tsv",'w+')
+outfile.write("err\tkappa\tpercent_error\thuman_err\thuman_kappa\thuman_percent_error\n")
+for i in xrange(0,len(errs)):
+    outfile.write("{err}\t{kappa}\t{percent_error}\t{human_err}\t{human_kappa}\t{human_percent_error}\n".format(
+        err=errs[i],kappa=kappas[i],percent_error=percent_errors[i],human_err=human_errs[i],
+        human_kappa=human_kappas[i],human_percent_error=human_percent_errors[i]))
+outfile.close()
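
Note: the cross-validated predictions above come from util_functions.gen_cv_preds, which is defined elsewhere in this repo and only called here. As a rough guide to what that call expects, here is a hypothetical stand-in (the body is an assumption, not the repo's actual code) that yields one out-of-sample prediction per essay via chunked K-fold fitting. It is written against the current scikit-learn API; the sklearn release this commit targets used older spellings such as learn_rate for learning_rate.

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

def gen_cv_preds(clf, feats, scores, num_chunks=3):
    # Hypothetical stand-in for util_functions.gen_cv_preds: split the rows
    # into num_chunks folds, fit a fresh copy of clf on the other folds, and
    # predict the held-out fold, so every essay gets an out-of-sample score.
    feats = np.asarray(feats)
    scores = np.asarray(scores)
    preds = np.zeros(len(scores), dtype=scores.dtype)
    for train_idx, test_idx in KFold(n_splits=num_chunks).split(feats):
        model = clone(clf)
        model.fit(feats[train_idx], scores[train_idx])
        preds[test_idx] = model.predict(feats[test_idx])
    return list(preds)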
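Note: util_functions.quadratic_weighted_kappa is the agreement metric used throughout the Hewlett/ASAP automated-essay-scoring work. It compares two lists of integer ratings, penalizing each disagreement by the square of its distance, with 1.0 meaning perfect agreement and 0.0 meaning chance-level agreement. A self-contained sketch of the standard formula follows (the repo's own implementation may differ in details):

import numpy as np

def quadratic_weighted_kappa(rater_a, rater_b):
    # Sketch of quadratically weighted kappa: 1 - (weighted observed
    # disagreement) / (weighted disagreement expected by chance).
    rater_a = np.asarray(rater_a, dtype=int)
    rater_b = np.asarray(rater_b, dtype=int)
    min_rating = min(rater_a.min(), rater_b.min())
    num_ratings = max(rater_a.max(), rater_b.max()) - min_rating + 1
    # Observed joint distribution of the two raters' scores
    observed = np.zeros((num_ratings, num_ratings))
    for a, b in zip(rater_a - min_rating, rater_b - min_rating):
        observed[a, b] += 1
    observed /= observed.sum()
    # Distribution expected if the raters scored independently
    hist_a = np.bincount(rater_a - min_rating, minlength=num_ratings)
    hist_b = np.bincount(rater_b - min_rating, minlength=num_ratings)
    expected = np.outer(hist_a, hist_b) / float(len(rater_a) ** 2)
    # Quadratic penalty: disagreeing by k score points costs k**2
    idx = np.arange(num_ratings)
    weights = (idx[:, None] - idx[None, :]) ** 2
    return 1.0 - (weights * observed).sum() / (weights * expected).sum()

print(quadratic_weighted_kappa([1, 2, 3, 3], [1, 2, 2, 3]))  # -> 0.8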