Add in test cv single, fix up test cv full to work with arbitrary scales

119a6390 · Vik Paruchuri · 56fab8d8 · 119a6390 · 119a6390
Commit 119a6390 authored Feb 12, 2013 by Vik Paruchuri
Hide whitespace changes
Inline Side-by-side

Showing with 57 additions and 3 deletions

tests/test_cv_full.py
+2 -3

tests/test_cv_single.py
+55 -0

No files found.
--- a/tests/test_cv_full.py
+++ b/tests/test_cv_full.py
@@ -13,7 +13,7 @@ import feature_extractor
 import numpy
 import math

-from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

 if not base_path.endswith("/"):
    base_path=base_path+"/"
@@ -40,8 +40,7 @@ for filename in filenames:
    texts=[]
    lines=sa_val.readlines()
    eset=essay_set.EssaySet(type="train")
-    #len(lines)
-    for i in xrange(1,10):
+    for i in xrange(1,len(lines)):
        id_val,essay_set_num,score1,score2,text=lines[i].split("\t")
        score1s.append(int(score1))
        score2s.append(int(score2))

--- a/tests/test_cv_single.py
+++ b/tests/test_cv_single.py
+import os  # Cross-validation script: for each TSV listed below, build an essay set, extract features, fit a gradient-boosting classifier and report CV error and kappa.
+import sys
+base_path = os.path.dirname(__file__)  # directory containing this script
+sys.path.append(base_path)
+
+one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))  # parent directory of this script
+sys.path.append(one_up_path)  # presumably where util_functions, essay_set and feature_extractor live -- TODO confirm
+
+import util_functions
+import essay_set
+import feature_extractor
+import numpy
+import math
+
+from sklearn.ensemble import GradientBoostingClassifier
+
+if not base_path.endswith("/"):  # guarantee a trailing slash so filenames can be appended by plain concatenation below
+    base_path=base_path+"/"
+
+filenames = ['LSQ_W09_60_MLT.tsv',  # input data files, resolved relative to base_path
+             'LSQ_W10_22_a.tsv',
+             'LSQ_W11_21_MLT.tsv',
+             ]
+
+for filename in filenames:
+    base_name = base_path + filename
+    print base_name
+    sa_val = file(base_name)  # Python 2 `file` builtin; NOTE(review): sa_val is never closed in this script
+    scores=[]
+    texts=[]
+    lines=sa_val.readlines()
+    eset=essay_set.EssaySet(type="train")
+    for i in xrange(1,len(lines)):  # starts at 1, so lines[0] is skipped (presumably a header row -- TODO confirm)
+        score,text=lines[i].split("\t\"")  # separator is a tab followed by a double quote; unpacking fails unless the line splits into exactly two parts
+        scores.append(int(score))
+        texts.append(text)
+        eset.add_essay(text,int(score))
+        #if int(score)==0:
+        #    eset.generate_additional_essays(text,int(score))
+    extractor=feature_extractor.FeatureExtractor()
+    extractor.initialize_dictionaries(eset)
+    train_feats=extractor.gen_feats(eset)
+    clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)  # NOTE(review): newer scikit-learn releases name this keyword `learning_rate` -- confirm against the installed version
+    cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores, num_chunks = int(math.floor(len(texts)/2)))  # num_chunks is half the essay count; presumably the number of CV folds -- verify in util_functions.gen_cv_preds
+    err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))  # mean absolute difference between CV predictions and actual scores
+    print err
+    kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
+    print kappa
+
+    outfile=open(filename + "_cvout.tsv",'w+')  # path has no base_path prefix, so the output is written relative to the current working directory
+    outfile.write("cv_pred" + "\t" + "actual\n")  # header row
+    for i in xrange(0,len(cv_preds)):  # one "prediction<TAB>actual" row per essay
+        outfile.write("{0}\t{1}\n".format(str(cv_preds[i]),str(scores[i])))
+    outfile.close()
\ No newline at end of file