Commit 1fb02a6d by Vik Paruchuri

Parallel ml cv training

parent 119a6390
...@@ -12,6 +12,7 @@ import essay_set ...@@ -12,6 +12,7 @@ import essay_set
import feature_extractor import feature_extractor
import numpy import numpy
import math import math
from multiprocessing import Pool
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
...@@ -22,14 +23,9 @@ data_path = "/home/vik/mitx_all/vik_sandbox/hewlett_essay_data/split_data" ...@@ -22,14 +23,9 @@ data_path = "/home/vik/mitx_all/vik_sandbox/hewlett_essay_data/split_data"
if not data_path.endswith("/"): if not data_path.endswith("/"):
data_path=data_path+"/" data_path=data_path+"/"
filenames = [str(i) +".tsv" for i in xrange(1,19)] filenames = [str(i) +".tsv" for i in xrange(1,19)]
kappas = []
errs = [] def run_single_worker(args):
percent_errors=[] filename,data_path = args
human_kappas=[]
human_errs=[]
human_percent_errors=[]
for filename in filenames:
base_name = data_path + filename base_name = data_path + filename
print base_name print base_name
sa_val = file(base_name) sa_val = file(base_name)
...@@ -40,7 +36,8 @@ for filename in filenames: ...@@ -40,7 +36,8 @@ for filename in filenames:
texts=[] texts=[]
lines=sa_val.readlines() lines=sa_val.readlines()
eset=essay_set.EssaySet(type="train") eset=essay_set.EssaySet(type="train")
for i in xrange(1,len(lines)): #len(lines)
for i in xrange(1,10):
id_val,essay_set_num,score1,score2,text=lines[i].split("\t") id_val,essay_set_num,score1,score2,text=lines[i].split("\t")
score1s.append(int(score1)) score1s.append(int(score1))
score2s.append(int(score2)) score2s.append(int(score2))
...@@ -65,26 +62,13 @@ for filename in filenames: ...@@ -65,26 +62,13 @@ for filename in filenames:
cv_preds = score1s cv_preds = score1s
rounded_cv = [int(round(cv)) for cv in list(cv_preds)] rounded_cv = [int(round(cv)) for cv in list(cv_preds)]
added_score1 = [s1+1 for s1 in score1s]
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-score1s)) err=numpy.mean(numpy.abs(numpy.array(cv_preds)-score1s))
errs.append(err)
print err
kappa=util_functions.quadratic_weighted_kappa(rounded_cv, score1s) kappa=util_functions.quadratic_weighted_kappa(rounded_cv, score1s)
kappas.append(kappa) percent_error = numpy.mean(numpy.abs(score1s - numpy.array(cv_preds))/added_score1)
print kappa
percent_error = numpy.mean(numpy.abs(score1s - numpy.array(cv_preds))/score1s)
percent_errors.append(percent_error)
print percent_error
human_err=numpy.mean(numpy.abs(numpy.array(score2s)-score1s)) human_err=numpy.mean(numpy.abs(numpy.array(score2s)-score1s))
human_errs.append(human_err)
print human_err
human_kappa=util_functions.quadratic_weighted_kappa(list(score2s),score1s) human_kappa=util_functions.quadratic_weighted_kappa(list(score2s),score1s)
human_kappas.append(human_kappa) human_percent_error = numpy.mean(numpy.abs(score1s - numpy.array(score2s))/added_score1)
print human_kappa
human_percent_error = numpy.mean(numpy.abs(score1s - numpy.array(score2s))/score1s)
human_percent_errors.append(human_percent_error)
print human_percent_error
outfile=open(data_path + "outdata/" + filename,'w+') outfile=open(data_path + "outdata/" + filename,'w+')
outfile.write("cv_pred" + "\t" + "actual1\t" + "actual2\n") outfile.write("cv_pred" + "\t" + "actual1\t" + "actual2\n")
...@@ -92,9 +76,16 @@ for filename in filenames: ...@@ -92,9 +76,16 @@ for filename in filenames:
outfile.write("{0}\t{1}\t{2}\n".format(str(cv_preds[i]),str(score1s[i]), str(score2s[i]))) outfile.write("{0}\t{1}\t{2}\n".format(str(cv_preds[i]),str(score1s[i]), str(score2s[i])))
outfile.close() outfile.close()
return err, kappa,percent_error,human_err,human_kappa,human_percent_error
length = len(filenames)
np=12
p = Pool(processes=np)
errs, kappas,percent_errors,human_errs,human_kappas,human_percent_errors = zip(*p.map(run_single_worker,[(filenames[i],data_path) for i in xrange(0,length)]))
outfile=open(data_path + "outdata/summary.tsv",'w+') outfile=open(data_path + "outdata/summary.tsv",'w+')
outfile.write("set\terr\tkappa\tpercent_error\thuman_err\thuman_kappa\thuman_percent_error\n") outfile.write("set\terr\tkappa\tpercent_error\thuman_err\thuman_kappa\thuman_percent_error\n")
for i in xrange(0,len(cv_preds)): for i in xrange(0,len(errs)):
outfile.write("{set}\t{err}\t{kappa}\t{percent_error}\t{human_err}\t{human_kappa}\t{human_percent_error}\n".format( outfile.write("{set}\t{err}\t{kappa}\t{percent_error}\t{human_err}\t{human_kappa}\t{human_percent_error}\n".format(
set=i+1,err=errs[i],kappa=kappas[i],percent_error=percent_errors[i], human_err=human_errs[i], set=i+1,err=errs[i],kappa=kappas[i],percent_error=percent_errors[i], human_err=human_errs[i],
human_kappa=human_kappas[i], human_percent_error=human_percent_errors[i])) human_kappa=human_kappas[i], human_percent_error=human_percent_errors[i]))
...@@ -104,3 +95,4 @@ outfile.close() ...@@ -104,3 +95,4 @@ outfile.close()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment