Commit b6d19aca by Vik Paruchuri

Add new test file and test data; alter essay set to sanitize input more.

parent b2d0ae38
@@ -58,6 +58,7 @@ class EssaySet(object):
         self._id.append(max_id + 1)
         self._score.append(essay_score)
         # Clean text by removing non digit/word/punctuation characters
+        essay_text=str(essay_text.encode('ascii', 'ignore'))
         cleaned_essay=util_functions.sub_chars(essay_text).lower()
         if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
             cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
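
A rough illustration of what the added sanitization line does, as a sketch only: it assumes Python 2 (as used elsewhere in this repository), an assumed value for MAXIMUM_ESSAY_LENGTH, and a hypothetical regex stand-in for util_functions.sub_chars.

import re

MAXIMUM_ESSAY_LENGTH = 20000  # assumed value; the real constant is defined in essay_set.py

def sub_chars_stub(text):
    # Hypothetical stand-in for util_functions.sub_chars: keep word characters,
    # whitespace, and basic punctuation, replacing everything else with a space.
    return re.sub(r"[^\w\s.,!?']", " ", text)

raw = u"Caf\u00e9 menus \u2603 are not essays!!!"
ascii_only = str(raw.encode('ascii', 'ignore'))  # drops the accented e and the snowman (Python 2)
cleaned = sub_chars_stub(ascii_only).lower()     # normalize the remaining characters
cleaned = cleaned[0:MAXIMUM_ESSAY_LENGTH]        # cap the essay length
print(cleaned)                                   # caf menus  are not essays!!!

The ascii/ignore encode silently discards any non-ASCII characters before the existing cleaning and truncation steps run.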
New test file added by this commit (loads sa_data.tsv, trains a gradient boosting model on the extracted features, and reports a cross-validated quadratic weighted kappa):

import os
import sys

# Make this directory and the package root importable.
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(one_up_path)

import util_functions
import essay_set
import feature_extractor
from sklearn.ensemble import GradientBoostingClassifier

if not base_path.endswith("/"):
    base_path = base_path + "/"
FILENAME = "sa_data.tsv"
sa_val = open(FILENAME)
lines = sa_val.readlines()
sa_val.close()

scores = []
texts = []

# Build a training essay set from the scored example answers.
eset = essay_set.EssaySet(type="train")
# Each data line is "<score>\t\"<answer text ...>" (format inferred from the split below);
# the first line is skipped, presumably a header.
for i in xrange(1, len(lines)):
    score, text = lines[i].split("\t\"")
    scores.append(int(score))
    texts.append(text)
    eset.add_essay(text, int(score))
    # Generate extra synthetic examples for zero-score answers.
    if int(score) == 0:
        eset.generate_additional_essays(text, int(score))
# Extract features from the essay set and train a gradient boosting model.
extractor = feature_extractor.FeatureExtractor()
extractor.initialize_dictionaries(eset)
train_feats = extractor.gen_feats(eset)

clf = GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                 max_depth=4, random_state=1,
                                 min_samples_leaf=3)

# Score cross-validated predictions against the human scores.
cv_preds = util_functions.gen_cv_preds(clf, train_feats, scores)
kappa = util_functions.quadratic_weighted_kappa(cv_preds, scores)
print("Cross-validated quadratic weighted kappa: %0.3f" % kappa)
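
The final step uses quadratic weighted kappa, the standard agreement metric between two sets of integer ratings. Below is a minimal standalone sketch of that metric for reference only; util_functions ships its own implementation, and numpy is assumed to be available.

import numpy as np

def quadratic_weighted_kappa_sketch(rater_a, rater_b):
    # Illustrative reimplementation of the standard metric, not the project's version.
    rater_a = np.asarray(rater_a, dtype=int)
    rater_b = np.asarray(rater_b, dtype=int)
    min_rating = min(rater_a.min(), rater_b.min())
    max_rating = max(rater_a.max(), rater_b.max())
    num_ratings = max_rating - min_rating + 1

    # Observed agreement matrix between the two raters.
    observed = np.zeros((num_ratings, num_ratings))
    for a, b in zip(rater_a - min_rating, rater_b - min_rating):
        observed[a, b] += 1

    # Expected matrix from the marginal score distributions.
    hist_a = observed.sum(axis=1)
    hist_b = observed.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / float(len(rater_a))

    # Quadratic disagreement weights: larger penalty for ratings further apart.
    weights = np.array([[(i - j) ** 2 for j in range(num_ratings)]
                        for i in range(num_ratings)], dtype=float)
    weights /= (num_ratings - 1) ** 2

    return 1.0 - (weights * observed).sum() / (weights * expected).sum()

print(quadratic_weighted_kappa_sketch([0, 1, 2, 2], [0, 2, 2, 1]))

Higher kappa means the cross-validated predictions agree more strongly with the human scores; 1.0 is perfect agreement and 0 is chance-level agreement.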