Commit 09b8c904 by Vik Paruchuri

adding documentation

parent ead03fa6
Project to integrate machine learning based essay scoring with xserver. Aspell must be installed and added to path to run. numpy, scipy, sklearn, and nltk also need to be installed.
Runnable files:
...
@@ -4,14 +4,13 @@ import os
import sys
import argparse

base_path = os.path.dirname(__file__)
sys.path.append(base_path)

import model_creator

def main(argv):
    parser = argparse.ArgumentParser(description="Generate model from test data files")
    parser.add_argument('train_file')
    parser.add_argument('prompt_file')
@@ -19,11 +18,11 @@ def main(argv):
    args = parser.parse_args(argv)

    score, text = model_creator.read_in_test_data(args.train_file)
    prompt_string = model_creator.read_in_test_prompt(args.prompt_file)
    e_set = model_creator.create_essay_set(text, score, prompt_string)
    feature_ext, classifier = model_creator.extract_features_and_generate_model(e_set)
    model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, args.model_path)

if __name__ == "__main__":
    main(sys.argv[1:])
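For orientation (not part of the commit itself), the pipeline this script wires together can be sketched directly against model_creator; the file paths below are placeholders, not files from this repository:

import model_creator

# Placeholder paths, for illustration only
score, text = model_creator.read_in_test_data("train_data.tsv")
prompt_string = model_creator.read_in_test_prompt("prompt.txt")
e_set = model_creator.create_essay_set(text, score, prompt_string)
feature_ext, classifier = model_creator.extract_features_and_generate_model(e_set)
model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, "essay_model.p")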
@@ -13,75 +13,94 @@ import util_functions

class essay_set:
    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type
        """
        if(type != "train" and type != "test"):
            type = "train"

        self._type = type
        self._score, self._text, self._id, self._clean_text, self._tokens, self._pos,\
        self._clean_stem_text, self._generated = [], [], [], [], [], [], [], []
        self._prompt = ""

    def add_essay(self, essay_text, essay_score, essay_generated=0):
        """
        Add new (essay_text,essay_score) pair to the essay set.
        essay_text must be a string.
        essay_score must be an int.
        essay_generated should not be changed by the user.
        Returns a confirmation that essay was added.
        """
        #Get maximum current essay id, or set to 0 if this is the first essay added
        if(len(self._id) > 0):
            max_id = max(self._id)
        else:
            max_id = 0
        #Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
        if type(essay_score) == type(0) and type(essay_text) == type("text")\
            and (essay_generated == 0 or essay_generated == 1):
            self._id.append(max_id + 1)
            self._score.append(essay_score)
            #Clean text by removing non digit/word/punctuation characters
            self._text.append(util_functions.sub_chars(essay_text).lower())
            #Spell correct text using aspell
            self._clean_text.append(util_functions.spell_correct(self._text[len(self._text) - 1]))
            #Tokenize text
            self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
            #Part of speech tag text
            self._pos.append(nltk.pos_tag(self._tokens[len(self._tokens) - 1]))
            self._generated.append(essay_generated)
            #Stem spell corrected text
            porter = nltk.PorterStemmer()
            por_toks = " ".join([porter.stem(w) for w in self._tokens[len(self._tokens) - 1]])
            self._clean_stem_text.append(por_toks)
            ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score)
        else:
            raise util_functions.InputError(essay_text, "arguments need to be in format "
                                                        "(text,score). text needs to be string,"
                                                        " score needs to be int.")
        return ret

    def update_prompt(self, prompt_text):
        """
        Update the default prompt string, which is "".
        prompt_text should be a string.
        Returns the prompt as a confirmation.
        """
        if(type(prompt_text) == type("text")):
            self._prompt = util_functions.sub_chars(prompt_text)
            ret = self._prompt
        else:
            raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
        return ret

    def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
        """
        Substitute synonyms to generate extra essays from existing ones.
        This is done to increase the amount of training data.
        Should only be used with lowest scoring essays.
        e_text is the text of the original essay.
        e_score is the score of the original essay.
        dict is a fixed dictionary (list) of words to replace.
        max_syns defines the maximum number of additional essays to generate. Do not set too high.
        """
        random.seed(1)
        e_toks = nltk.word_tokenize(e_text)
        all_syns = []
        for word in e_toks:
            synonyms = util_functions.get_wordnet_syns(word)
            if(len(synonyms) > max_syns):
                synonyms = random.sample(synonyms, max_syns)
            all_syns.append(synonyms)
        new_essays = []
        for i in range(0, max_syns):
            syn_toks = e_toks
            for z in range(0, len(e_toks)):
                if len(all_syns[z]) > i and (dict == None or e_toks[z] in dict):
                    syn_toks[z] = all_syns[z][i]
            new_essays.append(" ".join(syn_toks))
        for z in xrange(0, len(new_essays)):
            self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
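A minimal sketch of how the newly documented essay_set API is meant to be used (the prompt, essays, and scores below are invented for illustration):

e_set = essay_set(type="train")
e_set.update_prompt("Describe a book that changed your mind.")       # hypothetical prompt
e_set.add_essay("This book changed how I think about history.", 3)   # hypothetical essay and score
e_set.add_essay("i did not like the book", 0)                        # hypothetical low-scoring essay
# Augment the lowest-scoring essay with synonym-substituted copies (flagged essay_generated=1)
e_set.generate_additional_essays(e_set._clean_text[len(e_set._clean_text) - 1], 0)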
@@ -16,92 +16,93 @@ import util_functions

class feature_extractor:
    def __init__(self):
        self._good_pos_ngrams = self.get_good_pos_ngrams()
        self.dict_initialized = False

    def initialize_dictionaries(self, e_set):
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
                nvocab = util_functions.get_vocab(e_set._text, e_set._score)
                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score)
                self._normal_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=nvocab)
                self._stem_dict = CountVectorizer(min_n=1, max_n=2, vocabulary=svocab)
                self.dict_initialized = True
                ret = "ok"
            else:
                raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
        else:
            raise util_functions.InputError(e_set, "wrong input. need an essay set object")
        return ret

    def get_good_pos_ngrams(self):
        if(os.path.isfile("good_pos_ngrams.p")):
            good_pos_ngrams = pickle.load(open('good_pos_ngrams.p', 'rb'))
        else:
            essay_corpus = open("essaycorpus.txt").read()
            essay_corpus = util_functions.sub_chars(essay_corpus)
            good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
            pickle.dump(good_pos_ngrams, open('good_pos_ngrams.p', 'wb'))
        return good_pos_ngrams

    def gen_length_feats(self, e_set):
        text = e_set._text
        lengths = [len(e) for e in text]
        word_counts = [len(t) for t in e_set._tokens]
        comma_count = [e.count(",") for e in text]
        ap_count = [e.count("'") for e in text]
        punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
        chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
        good_pos_tags = []
        for i in xrange(0, len(text)):
            pos_seq = [tag[1] for tag in e_set._pos[i]]
            pos_ngrams = util_functions.ngrams(pos_seq, 2, 4)
            overlap_ngrams = [i for i in pos_ngrams if i in self._good_pos_ngrams]
            good_pos_tags.append(len(overlap_ngrams))
        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
        length_arr = numpy.array((
            lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
            good_pos_tag_prop)).transpose()
        return length_arr.copy()

    def gen_bag_feats(self, e_set):
        if(hasattr(self, '_stem_dict')):
            sfeats = self._stem_dict.transform(e_set._clean_stem_text)
            nfeats = self._normal_dict.transform(e_set._text)
            bag_feats = numpy.concatenate((sfeats.toarray(), nfeats.toarray()), axis=1)
        else:
            raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")
        return bag_feats.copy()

    def gen_feats(self, e_set):
        bag_feats = self.gen_bag_feats(e_set)
        length_feats = self.gen_length_feats(e_set)
        prompt_feats = self.gen_prompt_feats(e_set)
        overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
        overall_feats = overall_feats.copy()
        return overall_feats

    def gen_prompt_feats(self, e_set):
        prompt_toks = nltk.word_tokenize(e_set._prompt)
        expand_syns = []
        for word in prompt_toks:
            synonyms = util_functions.get_wordnet_syns(word)
            expand_syns.append(synonyms)
        expand_syns = list(chain.from_iterable(expand_syns))
        prompt_overlap = []
        prompt_overlap_prop = []
        for j in e_set._tokens:
            prompt_overlap.append(len([i for i in j if i in prompt_toks]))
            prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(len(j)))
        expand_overlap = []
        expand_overlap_prop = []
        for j in e_set._tokens:
            expand_overlap.append(len([i for i in j if i in expand_syns]))
            expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(len(j)))
        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
        return prompt_arr.copy()
\ No newline at end of file
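A short sketch of how the extractor is driven (mirroring extract_features_and_generate_model in model_creator below; train_set is assumed to be an essay_set built with type="train"):

extractor = feature_extractor()
extractor.initialize_dictionaries(train_set)   # requires a "train" essay set
train_feats = extractor.gen_feats(train_set)   # length, prompt-overlap, and bag-of-ngram features, concatenated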
@@ -10,7 +10,7 @@ import os
import sklearn.ensemble
from itertools import chain

base_path = os.path.dirname(__file__)
sys.path.append(base_path)

from essay_set import essay_set
@@ -18,30 +18,31 @@ import util_functions
import feature_extractor

def read_in_test_data(filename):
    id, e_set, score, score2, text = [], [], [], [], []
    combined_raw = open(filename).read()
    raw_lines = combined_raw.splitlines()
    for row in xrange(1, len(raw_lines)):
        id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
        id.append(int(id1))
        text.append(text1)
        e_set.append(int(set1))
        score.append(int(score1))
        score2.append(int(score12))
    return score, text

def read_in_test_prompt(filename):
    prompt_string = open(filename).read()
    return prompt_string

#Create an essay set. text and score should be lists of strings and ints, respectively.
def create_essay_set(text, score, prompt_string, generate_additional=True):
    x = essay_set()
    for i in xrange(0, len(text)):
        x.add_essay(text[i], score[i])
        if score[i] == min(score) and generate_additional == True:
            x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i])
    x.update_prompt(prompt_string)
@@ -49,22 +50,22 @@ def create_essay_set(text,score,prompt_string,generate_additional=True):

#Feed in an essay set to get feature vector and classifier
def extract_features_and_generate_model(essays):
    f = feature_extractor.feature_extractor()
    f.initialize_dictionaries(essays)
    train_feats = f.gen_feats(essays)
    clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                      max_depth=4, random_state=1,
                                                      min_samples_leaf=3)
    model = util_functions.gen_model(clf, train_feats, essays._score)
    return f, clf

#Writes out model to pickle file
def dump_model_to_file(prompt_string, feature_ext, classifier, model_path):
    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier}
    pickle.dump(model_file, file=open(model_path, "w"))
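This commit only writes the pickle; a matching load on the serving side might look like the sketch below (the path is a placeholder and the loading code is an assumption, not part of this commit):

import pickle

model_data = pickle.load(open("essay_model.p", "r"))   # placeholder path
prompt = model_data['prompt']
feature_ext = model_data['extractor']
classifier = model_data['model']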
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
aspell_path = "aspell"
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
@@ -14,79 +14,91 @@ import random
import pickle

def sub_chars(string):
    """
    Strips illegal characters from a string. Used to sanitize input essays.
    Removes all non-punctuation, digit, or letter characters.
    Returns sanitized string.
    """
    sub_pat = r"[^A-Za-z\.\?!,;:']"
    char_pat = r"\."
    com_pat = r","
    ques_pat = r"\?"
    excl_pat = r"!"
    sem_pat = r";"
    col_pat = r":"

    whitespace_pat = r"\s{1,}"
    whitespace_comp = re.compile(whitespace_pat)
    sub_comp = re.compile(sub_pat)
    char_comp = re.compile(char_pat)
    com_comp = re.compile(com_pat)
    ques_comp = re.compile(ques_pat)
    excl_comp = re.compile(excl_pat)
    sem_comp = re.compile(sem_pat)
    col_comp = re.compile(col_pat)

    nstring = sub_comp.sub(" ", string)
    nstring = char_comp.sub(" .", nstring)
    nstring = com_comp.sub(" ,", nstring)
    nstring = ques_comp.sub(" ?", nstring)
    nstring = excl_comp.sub(" !", nstring)
    nstring = sem_comp.sub(" ;", nstring)
    nstring = col_comp.sub(" :", nstring)

    nstring = whitespace_comp.sub(" ", nstring)

    return nstring

def spell_correct(string):
    """
    Uses aspell to spell correct an input string.
    """
    f = open('tmpfile', 'w')
    f.write(string)
    f_path = os.path.abspath(f.name)
    f.close()
    p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
    incorrect = p.readlines()
    p.close()
    incorrect_words = list()
    correct_spelling = list()
    for i in range(1, len(incorrect)):
        if(len(incorrect[i]) > 10):
            match = re.search(":", incorrect[i])
            if hasattr(match, "start"):
                begstring = incorrect[i][2:match.start()]
                begmatch = re.search(" ", begstring)
                begword = begstring[0:begmatch.start()]

                sugstring = incorrect[i][match.start() + 2:]
                sugmatch = re.search(",", sugstring)
                if hasattr(sugmatch, "start"):
                    sug = sugstring[0:sugmatch.start()]

                    incorrect_words.append(begword)
                    correct_spelling.append(sug)
    newstring = string
    for i in range(0, len(incorrect_words)):
        sub_pat = r"\b" + incorrect_words[i] + r"\b"
        sub_comp = re.compile(sub_pat)
        newstring = re.sub(sub_comp, correct_spelling[i], newstring)
    return newstring

def ngrams(tokens, MIN_N, MAX_N):
    all_ngrams = list()
    n_tokens = len(tokens)
    for i in xrange(n_tokens):
        for j in xrange(i + MIN_N, min(n_tokens, i + MAX_N) + 1):
            all_ngrams.append(" ".join(tokens[i:j]))
    return all_ngrams
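As a quick illustration of what ngrams returns (the tokens are made up):

toks = ["the", "cat", "sat"]   # made-up tokens
print ngrams(toks, 2, 3)       # -> ['the cat', 'the cat sat', 'cat sat']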
def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if x not in seen and not seen_add(x)]

def count_list(the_list):
    count = the_list.count
@@ -94,44 +106,47 @@ def count_list(the_list):
    result.sort()
    return result

def regenerate_good_tokens(string):
    toks = nltk.word_tokenize(string)
    pos_string = nltk.pos_tag(toks)
    pos_seq = [tag[1] for tag in pos_string]
    pos_ngrams = ngrams(pos_seq, 2, 4)
    sel_pos_ngrams = f7(pos_ngrams)
    return sel_pos_ngrams

def get_vocab(text, score, max_feats=750, min_length=100):
    dict = CountVectorizer(min_n=1, max_n=2, max_features=max_feats)
    dict_mat = dict.fit_transform(text)
    set_score = numpy.asarray(score, dtype=numpy.int)
    med_score = numpy.median(set_score)
    new_score = set_score
    if(med_score == 0):
        med_score = 1
    new_score[set_score < med_score] = 0
    new_score[set_score >= med_score] = 1

    fish_vals = []
    for col_num in range(0, dict_mat.shape[1]):
        loop_vec = dict_mat.getcol(col_num).toarray()
        good_loop_vec = loop_vec[new_score == 1]
        bad_loop_vec = loop_vec[new_score == 0]
        good_loop_present = len(good_loop_vec[good_loop_vec > 0])
        good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
        bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
        bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
        fish_val = fisher.FishersExactTest.probability_of_table(
            [[good_loop_present, bad_loop_present], [good_loop_missing, bad_loop_missing]])
        fish_vals.append(fish_val)

    cutoff = 1
    if(len(fish_vals) > 200):
        cutoff = sorted(fish_vals)[200]
    good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])

    getVar = lambda searchList, ind: [searchList[i] for i in ind]
    vocab = getVar(dict.get_feature_names(), good_cols)

    return vocab
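A sketch of calling get_vocab on toy data (the essays and scores are invented; in practice it runs over the full training set):

texts = ["clear thesis and strong evidence", "no",
         "strong argument with clear evidence", "bad"]   # made-up essays
scores = [2, 0, 2, 0]                                    # made-up scores
vocab = get_vocab(texts, scores, max_feats=50)
# vocab holds the 1- and 2-grams whose presence best separates above-median from
# below-median scores, ranked by Fisher's exact test probability (roughly the 200 best are kept).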
@@ -140,10 +155,10 @@ def edit_distance(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in xrange(-1, lenstr1 + 1):
        d[(i, -1)] = i + 1
    for j in xrange(-1, lenstr2 + 1):
        d[(-1, j)] = j + 1

    for i in xrange(lenstr1):
        for j in xrange(lenstr2):
@@ -151,66 +166,72 @@ def edit_distance(s1, s2):
                cost = 0
            else:
                cost = 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,  # deletion
                d[(i, j - 1)] + 1,  # insertion
                d[(i - 1, j - 1)] + cost,  # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost)  # transposition

    return d[lenstr1 - 1, lenstr2 - 1]

class Error(Exception):
    pass

class InputError(Error):
    def __init__(self, expr, msg):
        self.expr = expr
        self.msg = msg

def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
    cv_len = int(math.floor(len(sel_score) / num_chunks))
    chunks = []
    for i in range(0, num_chunks):
        range_min = i * cv_len
        range_max = ((i + 1) * cv_len)
        if i == num_chunks - 1:
            range_max = len(sel_score)
        chunks.append(range(range_min, range_max))
    preds = []
    set_score = numpy.asarray(sel_score, dtype=numpy.int)
    chunk_vec = numpy.asarray(range(0, len(chunks)))
    for i in range(0, len(chunks)):
        loop_inds = list(
            chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
        sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
        preds.append(sim_fit.predict(arr[chunks[i]]))
    all_preds = numpy.concatenate((preds[0], preds[1], preds[2]), axis=0)
    return(all_preds)

def gen_model(clf, arr, sel_score, num_chunks=3):
    set_score = numpy.asarray(sel_score, dtype=numpy.int)
    sim_fit = clf.fit(arr, set_score)
    return(sim_fit)

def gen_preds(clf, arr, num_chunks=3):
    if(hasattr(clf, "predict_proba")):
        ret = clf.predict(arr)
        #pred_score=preds.argmax(1)+min(x._score)
    else:
        ret = clf.predict(arr)
    return ret

def calc_list_average(l):
    total = 0.0
    for value in l:
        total += value
    return total / len(l)

stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
@@ -227,7 +248,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating = None, max_rating = N
    numerator = 0.0
    denominator = 0.0

    if(num_ratings > 1):
        for i in range(num_ratings):
            for j in range(num_ratings):
                expected_count = (hist_rater_a[i] * hist_rater_b[j]
@@ -240,6 +261,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating = None, max_rating = N
    else:
        return 1.0

def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
@@ -253,6 +275,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
            conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    if min_rating is None:
        min_rating = min(ratings)
@@ -264,51 +287,56 @@ def histogram(ratings, min_rating=None, max_rating=None):
        hist_ratings[r - min_rating] += 1
    return hist_ratings

def get_wordnet_syns(word):
    synonyms = []
    regex = r"_"
    pat = re.compile(regex)
    synset = nltk.wordnet.wordnet.synsets(word)
    for ss in synset:
        for swords in ss.lemma_names:
            synonyms.append(pat.sub(" ", swords.lower()))
    synonyms = f7(synonyms)
    return synonyms

def get_separator_words(toks1):
    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
    if(os.path.isfile("essay_cor_tokens.p")):
        toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
    else:
        essay_corpus = open("essaycorpus.txt").read()
        essay_corpus = sub_chars(essay_corpus)
        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
        pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
    sep_words = []
    for word in tab_toks1.keys():
        tok1_present = tab_toks1[word]
        if(tok1_present > 2):
            tok1_total = tab_toks1._N
            tok2_present = toks2[word]
            tok2_total = toks2._N
            fish_val = fisher.FishersExactTest.probability_of_table(
                [[tok1_present, tok2_present], [tok1_total, tok2_total]])
            if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
                sep_words.append(word)
    sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
    return sep_words

def encode_plus(s):
    regex = r"\+"
    pat = re.compile(regex)
    return pat.sub("%2B", s)

def getMedian(numericValues):
    theValues = sorted(numericValues)
    if len(theValues) % 2 == 1:
        return theValues[(len(theValues) + 1) / 2 - 1]
    else:
        lower = theValues[len(theValues) / 2 - 1]
        upper = theValues[len(theValues) / 2]
        return (float(lower + upper)) / 2
\ No newline at end of file