./grade.py: W391 blank line at end of file

2e6cb8e5 · Hugh Brown · Vik Paruchuri · 0d7ac804 · 2e6cb8e5 · 2e6cb8e5
Commit 2e6cb8e5, authored Jun 02, 2013 by Hugh Brown; committed by Vik Paruchuri Jun 06, 2013
11 changed files
--- a/ease/.pep8
+++ b/ease/.pep8
+[pep8]
+ignore=E501,E712,E711
--- a/ease/create.py
+++ b/ease/create.py
@@ -7,22 +7,23 @@ import sys
 import logging
 import numpy
-#Define base path and add to sys path
+# Define base path and add to sys path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)
-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import model_creator
 import util_functions
 import predictor_set
 import predictor_extractor
-#Make a log
+# Make a log
 log = logging.getLogger(__name__)
-def create(text,score,prompt_string):
+def create(text, score, prompt_string):
    """
    Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
    TODO: Remove model path argument, it is needed for now to support legacy code
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
    prompt_string - the common prompt for the set of essays
    """
-    #Initialize a results dictionary to return
+    # Initialize a results dictionary to return
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
+               'feature_ext': "", 'classifier': "", 'algorithm': util_functions.AlgorithmTypes.classification,
-               'score' : score, 'text' : text, 'prompt' : prompt_string}
+               'score': score, 'text': text, 'prompt': prompt_string}
-    if len(text)!=len(score):
+    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results
-    #Decide what algorithm to use (regression or classification)
+    # Decide what algorithm to use (regression or classification)
    try:
-        #Count the number of unique score points in the score list
+        # Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score)))>5:
+        if len(util_functions.f7(list(score))) > 5:
            type = util_functions.AlgorithmTypes.regression
        else:
            type = util_functions.AlgorithmTypes.classification
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
        type = util_functions.AlgorithmTypes.regression
    try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
+        # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
    try:
-        #Gets features from the essay set and computes error
+        # Gets features from the essay set and computes error
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
-        results['cv_kappa']=cv_error_results['kappa']
+        results['cv_kappa'] = cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
-        results['feature_ext']=feature_ext
+        results['feature_ext'] = feature_ext
-        results['classifier']=classifier
+        results['classifier'] = classifier
        results['algorithm'] = type
-        results['success']=True
+        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
    return results
-def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Creates a model from a generic list numeric values and text values
    numeric_values - A list of lists that are the predictors
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
    algorithm - the type of algorithm that will be used
    """
-    #Initialize a result dictionary to return.
+    # Initialize a result dictionary to return.
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
+               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
-    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
+    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results
    try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
+        # Initialize a predictor set object that encapsulates all of the text and numeric predictors
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -107,13 +108,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
        log.exception(msg)
    try:
-        #Extract all features and then train a classifier with the features
+        # Extract all features and then train a classifier with the features
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
-        results['cv_kappa']=cv_error_results['kappa']
+        results['cv_kappa'] = cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
-        results['feature_ext']=feature_ext
+        results['feature_ext'] = feature_ext
-        results['classifier']=classifier
+        results['classifier'] = classifier
-        results['success']=True
+        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)

--- a/ease/essay_set.py
+++ b/ease/essay_set.py
@@ -15,11 +15,12 @@ sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"
-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
+MAXIMUM_ESSAY_LENGTH = 20000
-MAXIMUM_ESSAY_LENGTH=20000
 class EssaySet(object):
    def __init__(self, type="train"):
@@ -30,17 +31,17 @@ class EssaySet(object):
            type = "train"
        self._type = type
-        self._score=[]
+        self._score = []
-        self._text=[]
+        self._text = []
-        self._id=[]
+        self._id = []
-        self._clean_text=[]
+        self._clean_text = []
-        self._tokens=[]
+        self._tokens = []
-        self._pos=[]
+        self._pos = []
-        self._clean_stem_text=[]
+        self._clean_stem_text = []
        self._generated = []
        self._prompt = ""
-        self._spelling_errors=[]
+        self._spelling_errors = []
-        self._markup_text=[]
+        self._markup_text = []
    def add_essay(self, essay_text, essay_score, essay_generated=0):
        """
@@ -58,35 +59,35 @@ class EssaySet(object):
            # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
        try:
-            essay_text=essay_text.encode('ascii', 'ignore')
+            essay_text = essay_text.encode('ascii', 'ignore')
-            if len(essay_text)<5:
+            if len(essay_text) < 5:
-                essay_text="Invalid essay."
+                essay_text = "Invalid essay."
        except:
            log.exception("Could not parse essay into ascii.")
        try:
-            #Try conversion of types
+            # Try conversion of types
-            essay_score=int(essay_score)
+            essay_score = int(essay_score)
-            essay_text=str(essay_text)
+            essay_text = str(essay_text)
        except:
-            #Nothing needed here, will return error in any case.
+            # Nothing needed here, will return error in any case.
-            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
+            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
-        if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
+        if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
                and (essay_generated == 0 or essay_generated == 1):
            self._id.append(max_id + 1)
            self._score.append(essay_score)
            # Clean text by removing non digit/work/punctuation characters
            try:
-                essay_text=str(essay_text.encode('ascii', 'ignore'))
+                essay_text = str(essay_text.encode('ascii', 'ignore'))
            except:
-                essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
+                essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
-            cleaned_essay=util_functions.sub_chars(essay_text).lower()
+            cleaned_essay = util_functions.sub_chars(essay_text).lower()
-            if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
+            if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
-                cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
+                cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
            self._text.append(cleaned_essay)
            # Spell correct text using aspell
-            cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
+            cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
            self._clean_text.append(cleaned_text)
            self._spelling_errors.append(spell_errors)
            self._markup_text.append(markup_text)
@@ -112,7 +113,7 @@ class EssaySet(object):
        prompt_text should be a string.
        Returns the prompt as a confirmation.
        """
-        if(type(prompt_text) == type("text")):
+        if(isinstance(prompt_text, type("text"))):
            self._prompt = util_functions.sub_chars(prompt_text)
            ret = self._prompt
        else:

--- a/ease/external_code/fisher/fisher.py
+++ b/ease/external_code/fisher/fisher.py
@@ -21,6 +21,8 @@
 import math
 ## From dendropy.mathlib.probability
 def hypergeometric_pmf(x, m, n, k):
    """
 Given a population consisting of `m` items of class M and `n` items of class N,
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
    # float' with large numbers
    # return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
    a = math.log(binomial_coefficient(m, x))
-    b = math.log(binomial_coefficient(n, k-x))
+    b = math.log(binomial_coefficient(n, k - x))
-    c = math.log(binomial_coefficient(m+n, k))
+    c = math.log(binomial_coefficient(m + n, k))
-    return math.exp(a+b-c)
+    return math.exp(a + b - c)
 ## From dendropy.mathlib.probability
 def binomial_coefficient(population, sample):
    "Returns `population` choose `sample`."
    s = max(sample, population - sample)
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
        return 1
    numerator = 1
    denominator = 1
-    for i in xrange(s+1, population + 1):
+    for i in xrange(s + 1, population + 1):
        numerator *= i
        denominator *= (i - s)
-    return numerator/denominator
+    return numerator / denominator
 ## From dendropy.mathlib.statistics
 class FishersExactTest(object):
    """
 Given a 2x2 table:
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
        b = table[0][1]
        c = table[1][0]
        d = table[1][1]
-        return hypergeometric_pmf(a, a+b, c+d, a+c)
+        return hypergeometric_pmf(a, a + b, c + d, a + c)
    probability_of_table = staticmethod(probability_of_table)
    def __init__(self, table):
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
 Returns a copy of table such that all the values
 are rotated clockwise once.
 """
-        return [ [ table[1][0], table[0][0] ],
+        return [[table[1][0], table[0][0]],
-                [table[1][1], table[0][1] ] ]
+                [table[1][1], table[0][1]]]
    def _min_rotation(self):
        """
@@ -241,8 +247,9 @@ extreme.
                p_vals.append(p)
        return sum(p_vals) + p0
 def assert_almost_equal(v1, v2, prec=8):
-    if abs(v1-v2) <= 10**(-prec):
+    if abs(v1 - v2) <= 10 ** (-prec):
        print "OK: {} == {}".format(v1, v2)
    else:
        print "FAIL: {} != {}".format(v1, v2)

--- a/ease/feature_extractor.py
+++ b/ease/feature_extractor.py
@@ -20,22 +20,23 @@ from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"
 log = logging.getLogger(__name__)
-#Paths to needed data files
+# Paths to needed data files
 NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
 ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
 class FeatureExtractor(object):
    def __init__(self):
        self._good_pos_ngrams = self.get_good_pos_ngrams()
        self.dict_initialized = False
-        self._spell_errors_per_character=0
+        self._spell_errors_per_character = 0
-        self._grammar_errors_per_character=0
+        self._grammar_errors_per_character = 0
-    def initialize_dictionaries(self, e_set, max_feats2 = 200):
+    def initialize_dictionaries(self, e_set, max_feats2=200):
        """
        Initializes dictionaries from an essay set object
        Dictionaries must be initialized prior to using this to extract features
@@ -44,27 +45,27 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
-                #normal text (unstemmed) useful words/bigrams
+                # normal text (unstemmed) useful words/bigrams
-                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
+                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2=max_feats2)
-                #stemmed and spell corrected vocab useful words/ngrams
+                # stemmed and spell corrected vocab useful words/ngrams
-                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
+                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
-                #dictionary trained on proper vocab
+                # dictionary trained on proper vocab
-                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
+                self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=nvocab)
-                #dictionary trained on proper vocab
+                # dictionary trained on proper vocab
-                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
+                self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=svocab)
                self.dict_initialized = True
-                #Average spelling errors in set. needed later for spelling detection
+                # Average spelling errors in set. needed later for spelling detection
-                self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
+                self._mean_spelling_errors = sum(e_set._spelling_errors) / float(len(e_set._spelling_errors))
-                self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
+                self._spell_errors_per_character = sum(e_set._spelling_errors) / float(sum([len(t) for t in e_set._text]))
-                #Gets the number and positions of grammar errors
+                # Gets the number and positions of grammar errors
-                good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
+                good_pos_tags, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
-                self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
+                self._grammar_errors_per_character = (sum(good_pos_tags) / float(sum([len(t) for t in e_set._text])))
-                #Generate bag of words features
+                # Generate bag of words features
-                bag_feats=self.gen_bag_feats(e_set)
+                bag_feats = self.gen_bag_feats(e_set)
-                #Sum of a row of bag of words features (topical words in an essay)
+                # Sum of a row of bag of words features (topical words in an essay)
-                f_row_sum=numpy.sum(bag_feats[:,:])
+                f_row_sum = numpy.sum(bag_feats[:, :])
-                #Average index of how "topical" essays are
+                # Average index of how "topical" essays are
-                self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
+                self._mean_f_prop = f_row_sum / float(sum([len(t) for t in e_set._text]))
                ret = "ok"
            else:
                raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
@@ -85,8 +86,8 @@ class FeatureExtractor(object):
            good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
            pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
        else:
-            #Hard coded list in case the needed files cannot be found
+            # Hard coded list in case the needed files cannot be found
-            good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
+            good_pos_ngrams = ['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
                               '. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
                               'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
                               'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP',
@@ -94,38 +95,38 @@ class FeatureExtractor(object):
        return good_pos_ngrams
-    def _get_grammar_errors(self,pos,text,tokens):
+    def _get_grammar_errors(self, pos, text, tokens):
        """
        Internal function to get the number of grammar errors in given text
        pos - part of speech tagged text (list)
        text - normal text (list)
        tokens - list of lists of tokenized text
        """
-        word_counts = [max(len(t),1) for t in tokens]
+        word_counts = [max(len(t), 1) for t in tokens]
        good_pos_tags = []
-        min_pos_seq=2
+        min_pos_seq = 2
-        max_pos_seq=4
+        max_pos_seq = 4
-        bad_pos_positions=[]
+        bad_pos_positions = []
        for i in xrange(0, len(text)):
            pos_seq = [tag[1] for tag in pos[i]]
            pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
-            long_pos_ngrams=[z for z in pos_ngrams if z.count(' ')==(max_pos_seq-1)]
+            long_pos_ngrams = [z for z in pos_ngrams if z.count(' ') == (max_pos_seq - 1)]
-            bad_pos_tuples=[[z,z+max_pos_seq] for z in xrange(0,len(long_pos_ngrams)) if long_pos_ngrams[z] not in self._good_pos_ngrams]
+            bad_pos_tuples = [[z, z + max_pos_seq] for z in xrange(0, len(long_pos_ngrams)) if long_pos_ngrams[z] not in self._good_pos_ngrams]
            bad_pos_tuples.sort(key=operator.itemgetter(1))
-            to_delete=[]
+            to_delete = []
-            for m in reversed(xrange(len(bad_pos_tuples)-1)):
+            for m in reversed(xrange(len(bad_pos_tuples) - 1)):
                start, end = bad_pos_tuples[m]
-                for j in xrange(m+1, len(bad_pos_tuples)):
+                for j in xrange(m + 1, len(bad_pos_tuples)):
                    lstart, lend = bad_pos_tuples[j]
                    if lstart >= start and lstart <= end:
-                        bad_pos_tuples[m][1]=bad_pos_tuples[j][1]
+                        bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                        to_delete.append(j)
-            fixed_bad_pos_tuples=[bad_pos_tuples[z] for z in xrange(0,len(bad_pos_tuples)) if z not in to_delete]
+            fixed_bad_pos_tuples = [bad_pos_tuples[z] for z in xrange(0, len(bad_pos_tuples)) if z not in to_delete]
            bad_pos_positions.append(fixed_bad_pos_tuples)
            overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
-            if (len(pos_ngrams)-len(overlap_ngrams))>0:
+            if (len(pos_ngrams) - len(overlap_ngrams)) > 0:
-                divisor=len(pos_ngrams)/len(pos_seq)
+                divisor = len(pos_ngrams) / len(pos_seq)
            else:
                divisor=1
            if divisor == 0:
@@ -143,13 +144,13 @@ class FeatureExtractor(object):
        """
        text = e_set._text
        lengths = [len(e) for e in text]
-        word_counts = [max(len(t),1) for t in e_set._tokens]
+        word_counts = [max(len(t), 1) for t in e_set._tokens]
        comma_count = [e.count(",") for e in text]
        ap_count = [e.count("'") for e in text]
        punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
        chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
-        good_pos_tags,bad_pos_positions= self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
+        good_pos_tags, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
        good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
        length_arr = numpy.array((
@@ -203,17 +204,17 @@ class FeatureExtractor(object):
        prompt_overlap = []
        prompt_overlap_prop = []
        for j in e_set._tokens:
-            tok_length=len(j)
+            tok_length = len(j)
-            if(tok_length==0):
+            if(tok_length == 0):
-                tok_length=1
+                tok_length = 1
            prompt_overlap.append(len([i for i in j if i in prompt_toks]))
            prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
        expand_overlap = []
        expand_overlap_prop = []
        for j in e_set._tokens:
-            tok_length=len(j)
+            tok_length = len(j)
-            if(tok_length==0):
+            if(tok_length == 0):
-                tok_length=1
+                tok_length = 1
            expand_overlap.append(len([i for i in j if i in expand_syns]))
            expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -231,62 +232,62 @@ class FeatureExtractor(object):
        e_set - EssaySet object
        """
-        #Set ratio to modify thresholds for grammar/spelling errors
+        # Set ratio to modify thresholds for grammar/spelling errors
-        modifier_ratio=1.05
+        modifier_ratio = 1.05
-        #Calc number of grammar and spelling errors per character
+        # Calc number of grammar and spelling errors per character
-        set_grammar,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
+        set_grammar, bad_pos_positions = self._get_grammar_errors(e_set._pos, e_set._text, e_set._tokens)
-        set_grammar_per_character=[set_grammar[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
+        set_grammar_per_character = [set_grammar[m] / float(len(e_set._text[m]) + .1) for m in xrange(0, len(e_set._text))]
-        set_spell_errors_per_character=[e_set._spelling_errors[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
+        set_spell_errors_per_character = [e_set._spelling_errors[m] / float(len(e_set._text[m]) + .1) for m in xrange(0, len(e_set._text))]
-        #Iterate through essays and create a feedback dict for each
+        # Iterate through essays and create a feedback dict for each
-        all_feedback=[]
+        all_feedback = []
-        for m in xrange(0,len(e_set._text)):
+        for m in xrange(0, len(e_set._text)):
-            #Be very careful about changing these messages!
+            # Be very careful about changing these messages!
-            individual_feedback={'grammar' : "Grammar: Ok.",
+            individual_feedback = {'grammar': "Grammar: Ok.",
-                                 'spelling' : "Spelling: Ok.",
+                                   'spelling': "Spelling: Ok.",
-                                 'markup_text' : "",
+                                   'markup_text': "",
-                                 'grammar_per_char' : set_grammar_per_character[m],
+                                   'grammar_per_char': set_grammar_per_character[m],
-                                 'spelling_per_char' : set_spell_errors_per_character[m],
+                                   'spelling_per_char': set_spell_errors_per_character[m],
-                                 'too_similar_to_prompt' : False,
+                                   'too_similar_to_prompt': False,
                                   }
-            markup_tokens=e_set._markup_text[m].split(" ")
+            markup_tokens = e_set._markup_text[m].split(" ")
-            #This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
+            # This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
-            #disjointed
+            # disjointed
-            bad_pos_starts=[z[0] for z in bad_pos_positions[m]]
+            bad_pos_starts = [z[0] for z in bad_pos_positions[m]]
-            bad_pos_ends=[z[1]-1 for z in bad_pos_positions[m]]
+            bad_pos_ends = [z[1] - 1 for z in bad_pos_positions[m]]
-            for z in xrange(0,len(markup_tokens)):
+            for z in xrange(0, len(markup_tokens)):
                if z in bad_pos_starts:
-                    markup_tokens[z]='<bg>' + markup_tokens[z]
+                    markup_tokens[z] = '<bg>' + markup_tokens[z]
                elif z in bad_pos_ends:
-                    markup_tokens[z]=markup_tokens[z] + "</bg>"
+                    markup_tokens[z] = markup_tokens[z] + "</bg>"
-            if(len(bad_pos_ends)>0 and len(bad_pos_starts)>0 and len(markup_tokens)>1):
+            if(len(bad_pos_ends) > 0 and len(bad_pos_starts) > 0 and len(markup_tokens) > 1):
-                if max(bad_pos_ends)>(len(markup_tokens)-1) and max(bad_pos_starts)<(len(markup_tokens)-1):
+                if max(bad_pos_ends) > (len(markup_tokens) - 1) and max(bad_pos_starts) < (len(markup_tokens) - 1):
-                    markup_tokens[len(markup_tokens)-1]+="</bg>"
+                    markup_tokens[len(markup_tokens) - 1] += "</bg>"
-            #Display messages if grammar/spelling errors greater than average in training set
+            # Display messages if grammar/spelling errors greater than average in training set
-            if set_grammar_per_character[m]>(self._grammar_errors_per_character*modifier_ratio):
+            if set_grammar_per_character[m] > (self._grammar_errors_per_character * modifier_ratio):
-                individual_feedback['grammar']="Grammar: More grammar errors than average."
+                individual_feedback['grammar'] = "Grammar: More grammar errors than average."
-            if set_spell_errors_per_character[m]>(self._spell_errors_per_character*modifier_ratio):
+            if set_spell_errors_per_character[m] > (self._spell_errors_per_character * modifier_ratio):
-                individual_feedback['spelling']="Spelling: More spelling errors than average."
+                individual_feedback['spelling'] = "Spelling: More spelling errors than average."
-            #Test topicality by calculating # of on topic words per character and comparing to the training set
+            # Test topicality by calculating # of on topic words per character and comparing to the training set
-            #mean.  Requires features to be passed in
+            # mean.  Requires features to be passed in
            if features is not None:
-                f_row_sum=numpy.sum(features[m,12:])
+                f_row_sum = numpy.sum(features[m, 12:])
-                f_row_prop=f_row_sum/len(e_set._text[m])
+                f_row_prop = f_row_sum / len(e_set._text[m])
-                if f_row_prop<(self._mean_f_prop/1.5) or len(e_set._text[m])<20:
+                if f_row_prop < (self._mean_f_prop / 1.5) or len(e_set._text[m]) < 20:
-                    individual_feedback['topicality']="Topicality: Essay may be off topic."
+                    individual_feedback['topicality'] = "Topicality: Essay may be off topic."
-                if(features[m,9]>.6):
+                if(features[m, 9] > .6):
-                    individual_feedback['prompt_overlap']="Prompt Overlap: Too much overlap with prompt."
+                    individual_feedback['prompt_overlap'] = "Prompt Overlap: Too much overlap with prompt."
-                    individual_feedback['too_similar_to_prompt']=True
+                    individual_feedback['too_similar_to_prompt'] = True
-                    log.debug(features[m,9])
+                    log.debug(features[m, 9])
-            #Create string representation of markup text
+            # Create string representation of markup text
-            markup_string=" ".join(markup_tokens)
+            markup_string = " ".join(markup_tokens)
-            individual_feedback['markup_text']=markup_string
+            individual_feedback['markup_text'] = markup_string
            all_feedback.append(individual_feedback)
        return all_feedback
--- a/ease/grade.py
+++ b/ease/grade.py
@@ -8,24 +8,25 @@ import os
 import numpy
 import logging
-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 import predictor_extractor
 import predictor_set
 import util_functions
-#Imports needed to unpickle grader data
+# Imports needed to unpickle grader data
 import feature_extractor
 import sklearn.ensemble
 import math
 log = logging.getLogger(__name__)
-def grade(grader_data,submission):
+def grade(grader_data, submission):
    """
    Grades a specified submission using specified models
    grader_data - A dictionary:
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
    submission - The student submission (string)
    """
-    #Initialize result dictionary
+    # Initialize result dictionary
-    results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
+    results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
-    has_error=False
+    has_error = False
-    grader_set=EssaySet(type="test")
+    grader_set = EssaySet(type="test")
-    #This is to preserve legacy functionality
+    # This is to preserve legacy functionality
    if 'algorithm' not in grader_data:
        grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
    try:
-        #Try to add essay to essay set object
+        # Try to add essay to essay set object
-        grader_set.add_essay(str(submission),0)
+        grader_set.add_essay(str(submission), 0)
        grader_set.update_prompt(str(grader_data['prompt']))
    except:
        results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
-        has_error=True
+        has_error = True
-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
    try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
-        feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
+        feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
-        results['score']=int(grader_data['model'].predict(grader_feats)[0])
+        results['score'] = int(grader_data['model'].predict(grader_feats)[0])
-    except :
+    except:
        results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True
-    #Try to determine confidence level
+    # Try to determine confidence level
    try:
        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
    except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")
    if not has_error:
-        #If the essay is just a copy of the prompt, return a 0 as the score
+        # If the essay is just a copy of the prompt, return a 0 as the score
        if(feedback['too_similar_to_prompt']):
-            results['score']=0
+            results['score'] = 0
-            results['correct']=False
+            results['correct'] = False
-        results['success']=True
+        results['success'] = True
-        #Generate short form output--number of problem areas identified in feedback
+        # Generate short form output--number of problem areas identified in feedback
-        #Add feedback to results if available
+        # Add feedback to results if available
        results['feedback'] = {}
        if 'topicality' in feedback and 'prompt_overlap' in feedback:
            results['feedback'].update({
-                'topicality' : feedback['topicality'],
+                'topicality': feedback['topicality'],
-                'prompt-overlap' : feedback['prompt_overlap'],
+                'prompt-overlap': feedback['prompt_overlap'],
            })
        results['feedback'].update(
            {
-                'spelling' : feedback['spelling'],
+                'spelling': feedback['spelling'],
-                'grammar' : feedback['grammar'],
+                'grammar': feedback['grammar'],
-                'markup-text' : feedback['markup_text'],
+                'markup-text': feedback['markup_text'],
            }
        )
    else:
-        #If error, success is False.
+        # If error, success is False.
-        results['success']=False
+        results['success'] = False
    return results
 def grade_generic(grader_data, numeric_features, textual_features):
    """
    Grades a set of numeric and textual features using a generic model
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
    textual_features - list of textual features to predict on
    """
-    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
+    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
-    has_error=False
+    has_error = False
-    #Try to find and load the model file
+    # Try to find and load the model file
-    grader_set=predictor_set.PredictorSet(type="test")
+    grader_set = predictor_set.PredictorSet(type="test")
-    #Try to add essays to essay set object
+    # Try to add essays to essay set object
    try:
-        grader_set.add_row(numeric_features, textual_features,0)
+        grader_set.add_row(numeric_features, textual_features, 0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
-        has_error=True
+        has_error = True
-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
    try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
-        results['score']=grader_data['model'].predict(grader_feats)[0]
+        results['score'] = grader_data['model'].predict(grader_feats)[0]
-    except :
+    except:
        results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True
-    #Try to determine confidence level
+    # Try to determine confidence level
    try:
        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
    except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")
    if not has_error:
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
    return results
-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
    """
    Determines a confidence in a certain score, given proper input parameters
    algorithm- from util_functions.AlgorithmTypes
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
    max_score=max(numpy.asarray(scores))
    if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
        #If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
+        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) -float(min_score))]
        #TODO: Normalize confidence somehow here
        confidence=raw_confidence
    elif hasattr(model, "predict"):
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
        confidence = 0
    return confidence
--- a/ease/model_creator.py
+++ b/ease/model_creator.py
-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
 import numpy
 import re
@@ -19,7 +19,8 @@ import feature_extractor
 import logging
 import predictor_extractor
-log=logging.getLogger()
+log = logging.getLogger()
 def read_in_test_data(filename):
    """
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
    prompt_string = open(filename).read()
    return prompt_string
-def read_in_test_data_twocolumn(filename,sep=","):
+def read_in_test_data_twocolumn(filename, sep=","):
    """
    Reads in a two column version of the test data.
    Filename must point to a delimited file.
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
    return x
-def get_cv_error(clf,feats,scores):
+def get_cv_error(clf, feats, scores):
    """
    Gets cross validated error for a given classifier, set of features, and scores
    clf - classifier
    feats - features to feed into the classifier and cross validate over
    scores - scores associated with the features -- feature row 1 associates with score 1, etc.
    """
-    results={'success' : False, 'kappa' : 0, 'mae' : 0}
+    results = {'success': False, 'kappa': 0, 'mae': 0}
    try:
-        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
+        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
-        err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
+        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
-        kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
+        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
-        results['mae']=err
+        results['mae'] = err
-        results['kappa']=kappa
+        results['kappa'] = kappa
-        results['success']=True
+        results['success'] = True
    except ValueError:
-        #If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
+        # If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
        log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
    except:
        log.exception("Error getting cv error estimates.")
    return results
 def get_algorithms(type):
    """
    Gets two classifiers for each type of algorithm, and returns them.  First for predicting, second for cv error.
@@ -116,14 +120,14 @@ def get_algorithms(type):
    """
    if type == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+                                                          max_depth=4, random_state=1, min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
+        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+                                                           max_depth=4, random_state=1, min_samples_leaf=3)
    else:
        clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+                                                         max_depth=4, random_state=1, min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
+        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+                                                          max_depth=4, random_state=1, min_samples_leaf=3)
    return clf, clf2
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
    train_feats = f.gen_feats(predictor_set)
-    clf,clf2 = get_algorithms(type)
+    clf, clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
+    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
    try:
        set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
+        set_score[0] = 1
-        set_score[1]=0
+        set_score[1] = 0
        clf.fit(train_feats, set_score)
    return f, clf, cv_error_results
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
    train_feats = f.gen_feats(essays)
    set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score)))>5:
+    if len(util_functions.f7(list(set_score))) > 5:
        type = util_functions.AlgorithmTypes.regression
    else:
        type = util_functions.AlgorithmTypes.classification
-    clf,clf2 = get_algorithms(type)
+    clf, clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,essays._score)
+    cv_error_results = get_cv_error(clf2, train_feats, essays._score)
    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
+        set_score[0] = 1
-        set_score[1]=0
+        set_score[1] = 0
        clf.fit(train_feats, set_score)
    return f, clf, cv_error_results
 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
    """
    Writes out a model to a file.
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
    classifier is a trained classifier
    model_path is the path to write the model file out to
    """
-    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
+    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
    pickle.dump(model_file, file=open(model_path, "w"))
-def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
+def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
    """
    Function that creates essay set, extracts features, and writes out model
    See above functions for argument descriptions
    """
-    essay_set=create_essay_set(text_score,prompt)
+    essay_set = create_essay_set(text_score, prompt)
-    feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
+    feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
-    dump_model_to_file(prompt,feature_ext,clf,model_path)
+    dump_model_to_file(prompt, feature_ext, clf, model_path)
--- a/ease/predictor_extractor.py
+++ b/ease/predictor_extractor.py
@@ -16,17 +16,18 @@ import logging
 import math
 from feature_extractor import FeatureExtractor
-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"
 log = logging.getLogger(__name__)
 class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)
-        div_length=len(p_set._essay_sets)
+        div_length = len(p_set._essay_sets)
-        if div_length==0:
+        if div_length == 0:
-            div_length=1
+            div_length = 1
-        #Ensures that even with a large amount of input textual features, training time stays reasonable
+        # Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200/div_length))
+        max_feats2 = int(math.floor(200 / div_length))
-        for i in xrange(0,len(p_set._essay_sets)):
+        for i in xrange(0, len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
            self._initialized = True
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
        Generates features based on an input p_set
        p_set - PredictorSet
        """
-        if self._initialized!=True:
+        if self._initialized != True:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)
        textual_features = []
-        for i in xrange(0,len(p_set._essay_sets)):
+        for i in xrange(0, len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
        textual_matrix = numpy.concatenate(textual_features, axis=1)

--- a/ease/predictor_set.py
+++ b/ease/predictor_set.py
@@ -11,12 +11,13 @@ sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"
+log = logging.getLogger(__name__)
-log=logging.getLogger(__name__)
 class PredictorSet(object):
-    def __init__(self, type = "train"):
+    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type
        """
@@ -24,13 +25,13 @@ class PredictorSet(object):
            type = "train"
        self._type = type
-        self._target=[]
+        self._target = []
-        self._textual_features=[]
+        self._textual_features = []
-        self._numeric_features=[]
+        self._numeric_features = []
-        self._essay_sets=[]
+        self._essay_sets = []
    def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)
@@ -46,8 +47,8 @@ class PredictorSet(object):
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)
-        #Do some length checking for parameters
+        # Do some length checking for parameters
-        if len(self._numeric_features)>0:
+        if len(self._numeric_features) > 0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
@@ -55,7 +56,7 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)
-        if len(self._textual_features)>0:
+        if len(self._textual_features) > 0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
@@ -63,9 +64,9 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)
-        #Now check to see if text features and numeric features are individually correct
+        # Now check to see if text features and numeric features are individually correct
-        for i in xrange(0,len(numeric_features)):
+        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except:
@@ -73,8 +74,7 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)
+        for i in xrange(0, len(textual_features)):
-        for i in xrange(0,len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except:
@@ -82,19 +82,18 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)
-        #Create essay sets for textual features if needed
+        # Create essay sets for textual features if needed
-        if len(self._textual_features)==0:
+        if len(self._textual_features) == 0:
-            for i in xrange(0,len(textual_features)):
+            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))
-        #Add numeric and textual features
+        # Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)
-        #Add targets
+        # Add targets
        self._target.append(target)
-        #Add textual features to essay sets
+        # Add textual features to essay sets
-        for i in xrange(0,len(textual_features)):
+        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
--- a/ease/tests/test_model_accuracy.py
+++ b/ease/tests/test_model_accuracy.py
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 100
 QUICK_TEST_LIMIT = 5
 class DataLoader():
    def load_text_files(self, pathname):
        filenames = os.listdir(pathname)
@@ -28,34 +29,36 @@ class DataLoader():
        """
        pass
 class PolarityLoader(DataLoader):
    def __init__(self, pathname):
        self.pathname = pathname
    def load_data(self):
        filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        # Sort so neg is first
        directories.sort()
-        #We need to have both a positive and a negative folder to classify
+        # We need to have both a positive and a negative folder to classify
-        if len(directories)!=2:
+        if len(directories) != 2:
            raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
        neg = self.load_text_files(directories[0])
        pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
        text = neg + pos
        return scores, text
 class ModelCreator():
    def __init__(self, scores, text):
        self.scores = scores
        self.text = text
-        #Governs which creation function in the ease.create module to use.  See module for info.
+        # Governs which creation function in the ease.create module to use.  See module for info.
        if isinstance(text[0], basestring):
            self.create_model_generic = False
        else:
@@ -67,6 +70,7 @@ class ModelCreator():
        else:
            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
 class Grader():
    def __init__(self, model_data):
        self.model_data = model_data
@@ -77,6 +81,7 @@ class Grader():
        else:
            return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))
 class GenericTest(object):
    loader = DataLoader
    data_path = ""
@@ -87,11 +92,11 @@ class GenericTest(object):
        data_loader = self.loader(os.path.join(TEST_PATH, self.data_path))
        scores, text = data_loader.load_data()
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
        random.seed(1)
        shuffled_scores = []
        shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
        random.shuffle(indices)
        for i in indices:
            shuffled_scores.append(scores[i])
@@ -121,12 +126,13 @@ class GenericTest(object):
        self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
        self.assertLessEqual(cv_mae, self.expected_mae_max)
-class PolarityTest(unittest.TestCase,GenericTest):
+class PolarityTest(unittest.TestCase, GenericTest):
    loader = PolarityLoader
    data_path = "data/polarity"
-    #These will increase if we allow more data in.
+    # These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
    expected_kappa_min = -.2
    expected_mae_max = 1

--- a/ease/util_functions.py
+++ b/ease/util_functions.py
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
+# Requires aspell to be installed and added to the path
 from external_code.fisher import fisher
 aspell_path = "aspell"
@@ -14,17 +14,18 @@ import pickle
 import logging
 import sys
-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"
-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
 class AlgorithmTypes(object):
    """
    Defines what types of algorithm can be used
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
    regression = "regression"
    classification = "classifiction"
 def create_model_path(model_path):
    """
    Creates a path to model files
    model_path - string
    """
    if not model_path.startswith("/") and not model_path.startswith("models/"):
-        model_path="/" + model_path
+        model_path = "/" + model_path
    if not model_path.startswith("models"):
        model_path = "models" + model_path
    if not model_path.endswith(".p"):
-        model_path+=".p"
+        model_path += ".p"
    return model_path
 def sub_chars(string):
    """
    Strips illegal characters from a string.  Used to sanitize input essays.
@@ -53,7 +56,7 @@ def sub_chars(string):
    Returns sanitized string.
    string - string
    """
-    #Define replacement patterns
+    # Define replacement patterns
    sub_pat = r"[^A-Za-z\.\?!,';:]"
    char_pat = r"\."
    com_pat = r","
@@ -63,9 +66,9 @@ def sub_chars(string):
    col_pat = r":"
    whitespace_pat = r"\s{1,}"
-    #Replace text.  Ordering is very important!
+    # Replace text.  Ordering is very important!
    nstring = re.sub(sub_pat, " ", string)
-    nstring = re.sub(char_pat," .", nstring)
+    nstring = re.sub(char_pat, " .", nstring)
    nstring = re.sub(com_pat, " ,", nstring)
    nstring = re.sub(ques_pat, " ?", nstring)
    nstring = re.sub(excl_pat, " !", nstring)
@@ -84,7 +87,7 @@ def spell_correct(string):
    string - string
    """
-    #Create a temp file so that aspell could be used
+    # Create a temp file so that aspell could be used
    f = open('tmpfile', 'w')
    f.write(string)
    f_path = os.path.abspath(f.name)
@@ -93,16 +96,16 @@ def spell_correct(string):
        p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
    except:
        log.exception("Could not find aspell, so could not spell correct!")
-        #Return original string if aspell fails
+        # Return original string if aspell fails
-        return string,0, string
+        return string, 0, string
-    #Aspell returns a list of incorrect words with the above flags
+    # Aspell returns a list of incorrect words with the above flags
    incorrect = p.readlines()
    p.close()
    incorrect_words = list()
    correct_spelling = list()
    for i in range(1, len(incorrect)):
        if(len(incorrect[i]) > 10):
-            #Reformat aspell output to make sense
+            # Reformat aspell output to make sense
            match = re.search(":", incorrect[i])
            if hasattr(match, "start"):
                begstring = incorrect[i][2:match.start()]
@@ -117,19 +120,19 @@ def spell_correct(string):
                    incorrect_words.append(begword)
                    correct_spelling.append(sug)
-    #Create markup based on spelling errors
+    # Create markup based on spelling errors
    newstring = string
    markup_string = string
-    already_subbed=[]
+    already_subbed = []
    for i in range(0, len(incorrect_words)):
        sub_pat = r"\b" + incorrect_words[i] + r"\b"
        sub_comp = re.compile(sub_pat)
        newstring = re.sub(sub_comp, correct_spelling[i], newstring)
        if incorrect_words[i] not in already_subbed:
-            markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
+            markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
            already_subbed.append(incorrect_words[i])
-    return newstring,len(incorrect_words),markup_string
+    return newstring, len(incorrect_words), markup_string
 def ngrams(tokens, min_n, max_n):
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
    max_feats2 is the maximum number of features to consider in the second (final) pass
    Returns a list of words that constitute the significant vocabulary
    """
-    dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
+    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
    dict_mat = dict.fit_transform(text)
    set_score = numpy.asarray(score, dtype=numpy.int)
    med_score = numpy.median(set_score)
@@ -335,6 +338,7 @@ def calc_list_average(l):
 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
 def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates kappa correlation between rater_a and rater_b.