Commit 2e6cb8e5 authored by Hugh Brown, committed by Vik Paruchuri

./grade.py: W391 blank line at end of file

parent 0d7ac804
[pep8]
ignore=E501,E712,E711
@@ -7,22 +7,23 @@ import sys
 import logging
 import numpy

-#Define base path and add to sys path
+# Define base path and add to sys path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)

-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import model_creator
 import util_functions
 import predictor_set
 import predictor_extractor

-#Make a log
+# Make a log
 log = logging.getLogger(__name__)

-def create(text,score,prompt_string):
+
+def create(text, score, prompt_string):
     """
     Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
     TODO: Remove model path argument, it is needed for now to support legacy code
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
     prompt_string - the common prompt for the set of essays
     """

-    #Initialize a results dictionary to return
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
-               'score' : score, 'text' : text, 'prompt' : prompt_string}
+    # Initialize a results dictionary to return
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': util_functions.AlgorithmTypes.classification,
+               'score': score, 'text': text, 'prompt': prompt_string}

-    if len(text)!=len(score):
+    if len(text) != len(score):
         msg = "Target and text lists must be same length."
         results['errors'].append(msg)
         log.exception(msg)
         return results

-    #Decide what algorithm to use (regression or classification)
+    # Decide what algorithm to use (regression or classification)
     try:
-        #Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score)))>5:
+        # Count the number of unique score points in the score list
+        if len(util_functions.f7(list(score))) > 5:
             type = util_functions.AlgorithmTypes.regression
         else:
             type = util_functions.AlgorithmTypes.classification
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
         type = util_functions.AlgorithmTypes.regression

     try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
+        # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
         e_set = model_creator.create_essay_set(text, score, prompt_string)
     except:
         msg = "essay set creation failed."
         results['errors'].append(msg)
         log.exception(msg)
     try:
-        #Gets features from the essay set and computes error
+        # Gets features from the essay set and computes error
         feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
         results['algorithm'] = type
-        results['success']=True
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
     return results
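
For context: `create` is the training entry point that the `ModelCreator` test helper further down in this commit calls. A minimal usage sketch with made-up essays and scores; real training needs far more data for the cross-validated kappa to mean anything:

    import create

    texts = ["The cell membrane controls what enters and leaves the cell."] * 10 + \
            ["Cells are small."] * 10
    scores = [3] * 10 + [1] * 10
    prompt = "Explain the function of the cell membrane."

    results = create.create(texts, scores, prompt)
    if results['success']:
        # feature_ext and classifier are what grade.grade() later consumes
        print results['cv_kappa'], results['cv_mean_absolute_error']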

-def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
     """
     Creates a model from a generic list numeric values and text values
     numeric_values - A list of lists that are the predictors
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     algorithm - the type of algorithm that will be used
     """

-    #Initialize a result dictionary to return.
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
+    # Initialize a result dictionary to return.
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

-    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
+    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
         msg = "Target, numeric features, and text features must all be the same length."
         results['errors'].append(msg)
         log.exception(msg)
         return results

     try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
+        # Initialize a predictor set object that encapsulates all of the text and numeric predictors
         pset = predictor_set.PredictorSet(type="train")
         for i in xrange(0, len(numeric_values)):
             pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -107,13 +108,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
         log.exception(msg)

     try:
-        #Extract all features and then train a classifier with the features
+        # Extract all features and then train a classifier with the features
         feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
-        results['success']=True
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
...
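
`create_generic` is the same flow for mixed predictors: each row pairs a list of numeric features with a list of text features. A hedged sketch of the expected shapes (all values invented):

    import create
    import util_functions

    numeric_values = [[1, 10], [2, 0], [1, 5], [3, 2]]
    textual_values = [["clear and complete answer"], ["off topic"],
                      ["partially correct"], ["strong response"]]
    target = [1, 0, 1, 1]

    results = create.create_generic(numeric_values, textual_values, target,
                                    algorithm=util_functions.AlgorithmTypes.classification)
    print results['success'], results['errors']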
@@ -15,11 +15,12 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)

-MAXIMUM_ESSAY_LENGTH=20000
+MAXIMUM_ESSAY_LENGTH = 20000


 class EssaySet(object):
     def __init__(self, type="train"):
@@ -30,17 +31,17 @@ class EssaySet(object):
             type = "train"

         self._type = type
-        self._score=[]
-        self._text=[]
-        self._id=[]
-        self._clean_text=[]
-        self._tokens=[]
-        self._pos=[]
-        self._clean_stem_text=[]
+        self._score = []
+        self._text = []
+        self._id = []
+        self._clean_text = []
+        self._tokens = []
+        self._pos = []
+        self._clean_stem_text = []
         self._generated = []
         self._prompt = ""
-        self._spelling_errors=[]
-        self._markup_text=[]
+        self._spelling_errors = []
+        self._markup_text = []

     def add_essay(self, essay_text, essay_score, essay_generated=0):
         """
@@ -58,35 +59,35 @@ class EssaySet(object):
         # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
         try:
-            essay_text=essay_text.encode('ascii', 'ignore')
-            if len(essay_text)<5:
-                essay_text="Invalid essay."
+            essay_text = essay_text.encode('ascii', 'ignore')
+            if len(essay_text) < 5:
+                essay_text = "Invalid essay."
         except:
             log.exception("Could not parse essay into ascii.")

         try:
-            #Try conversion of types
-            essay_score=int(essay_score)
-            essay_text=str(essay_text)
+            # Try conversion of types
+            essay_score = int(essay_score)
+            essay_text = str(essay_text)
         except:
-            #Nothing needed here, will return error in any case.
-            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
+            # Nothing needed here, will return error in any case.
+            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))

-        if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
+        if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
             and (essay_generated == 0 or essay_generated == 1):
             self._id.append(max_id + 1)
             self._score.append(essay_score)
             # Clean text by removing non digit/work/punctuation characters
             try:
-                essay_text=str(essay_text.encode('ascii', 'ignore'))
+                essay_text = str(essay_text.encode('ascii', 'ignore'))
             except:
-                essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
-            cleaned_essay=util_functions.sub_chars(essay_text).lower()
-            if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
-                cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
+                essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
+            cleaned_essay = util_functions.sub_chars(essay_text).lower()
+            if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
+                cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
             self._text.append(cleaned_essay)
             # Spell correct text using aspell
-            cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
+            cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
             self._clean_text.append(cleaned_text)
             self._spelling_errors.append(spell_errors)
             self._markup_text.append(markup_text)
@@ -112,7 +113,7 @@ class EssaySet(object):
         prompt_text should be a string.
         Returns the prompt as a confirmation.
         """
-        if(type(prompt_text) == type("text")):
+        if(isinstance(prompt_text, type("text"))):
             self._prompt = util_functions.sub_chars(prompt_text)
             ret = self._prompt
         else:
...
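
For orientation, the parallel lists initialized in `__init__` are appended to in lockstep by `add_essay`: raw text, cleaned text, spell-corrected text, spelling-error counts, and markup all share an index. A brief lifecycle sketch (essays and scores invented):

    from essay_set import EssaySet

    e_set = EssaySet(type="train")
    e_set.update_prompt("Explain the water cycle.")
    e_set.add_essay("Water evaporates, condenses into clouds, and falls as rain.", 2)
    e_set.add_essay("Rain falls down.", 1)
    # all per-essay lists stay aligned index-for-index
    print len(e_set._text), len(e_set._clean_text), e_set._spelling_errors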
@@ -21,6 +21,8 @@
 import math

 ## From dendropy.mathlib.probability
+
+
 def hypergeometric_pmf(x, m, n, k):
     """
     Given a population consisting of `m` items of class M and `n` items of class N,
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
     # float' with large numbers
     # return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
     a = math.log(binomial_coefficient(m, x))
-    b = math.log(binomial_coefficient(n, k-x))
-    c = math.log(binomial_coefficient(m+n, k))
-    return math.exp(a+b-c)
+    b = math.log(binomial_coefficient(n, k - x))
+    c = math.log(binomial_coefficient(m + n, k))
+    return math.exp(a + b - c)

 ## From dendropy.mathlib.probability
+
+
 def binomial_coefficient(population, sample):
     "Returns `population` choose `sample`."
     s = max(sample, population - sample)
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
         return 1
     numerator = 1
     denominator = 1
-    for i in xrange(s+1, population + 1):
+    for i in xrange(s + 1, population + 1):
         numerator *= i
         denominator *= (i - s)
-    return numerator/denominator
+    return numerator / denominator
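
A quick numeric check of the two helpers: the log-space computation returns p(x) = choose(m, x) * choose(n, k-x) / choose(m+n, k). Drawing k=4 items from a population of m=5 class-M and n=5 class-N items, the chance of exactly x=2 class-M items is C(5,2) * C(5,2) / C(10,4) = 100/210, about 0.476:

    print binomial_coefficient(10, 4)     # 210
    print hypergeometric_pmf(2, 5, 5, 4)  # ~0.476190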

 ## From dendropy.mathlib.statistics
+
+
 class FishersExactTest(object):
     """
     Given a 2x2 table:
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
         b = table[0][1]
         c = table[1][0]
         d = table[1][1]
-        return hypergeometric_pmf(a, a+b, c+d, a+c)
+        return hypergeometric_pmf(a, a + b, c + d, a + c)
     probability_of_table = staticmethod(probability_of_table)

     def __init__(self, table):
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
         Returns a copy of table such that all the values
         are rotated clockwise once.
         """
-        return [ [ table[1][0], table[0][0] ],
-                 [ table[1][1], table[0][1] ] ]
+        return [[table[1][0], table[0][0]],
+                [table[1][1], table[0][1]]]

     def _min_rotation(self):
         """
@@ -241,8 +247,9 @@ extreme.
             p_vals.append(p)
         return sum(p_vals) + p0

+
 def assert_almost_equal(v1, v2, prec=8):
-    if abs(v1-v2) <= 10**(-prec):
+    if abs(v1 - v2) <= 10 ** (-prec):
         print "OK: {} == {}".format(v1, v2)
     else:
         print "FAIL: {} != {}".format(v1, v2)
...
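
Worked example for the table probability above: for the 2x2 table [[a, b], [c, d]] = [[1, 2], [3, 4]], p = C(a+b, a) * C(c+d, c) / C(a+b+c+d, a+c) = C(3,1) * C(7,3) / C(10,4) = 3 * 35 / 210 = 0.5, which the module's own test helper can confirm:

    p = FishersExactTest.probability_of_table([[1, 2], [3, 4]])
    assert_almost_equal(p, 0.5)  # prints "OK: 0.5 == 0.5"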
@@ -8,24 +8,25 @@ import os
 import numpy
 import logging

-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 import predictor_extractor
 import predictor_set
 import util_functions

-#Imports needed to unpickle grader data
+# Imports needed to unpickle grader data
 import feature_extractor
 import sklearn.ensemble
 import math

 log = logging.getLogger(__name__)

-def grade(grader_data,submission):
+
+def grade(grader_data, submission):
     """
     Grades a specified submission using specified models
     grader_data - A dictionary:
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
     submission - The student submission (string)
     """

-    #Initialize result dictionary
-    results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
-    has_error=False
+    # Initialize result dictionary
+    results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
+    has_error = False

-    grader_set=EssaySet(type="test")
+    grader_set = EssaySet(type="test")

-    #This is to preserve legacy functionality
+    # This is to preserve legacy functionality
     if 'algorithm' not in grader_data:
         grader_data['algorithm'] = util_functions.AlgorithmTypes.classification

     try:
-        #Try to add essay to essay set object
-        grader_set.add_essay(str(submission),0)
+        # Try to add essay to essay set object
+        grader_set.add_essay(str(submission), 0)
         grader_set.update_prompt(str(grader_data['prompt']))
     except:
         results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
-        has_error=True
+        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
     try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
-        results['score']=int(grader_data['model'].predict(grader_feats)[0])
-    except :
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
+        feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
+        results['score'] = int(grader_data['model'].predict(grader_feats)[0])
+    except:
         results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
     except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

     if not has_error:
-        #If the essay is just a copy of the prompt, return a 0 as the score
+        # If the essay is just a copy of the prompt, return a 0 as the score
         if(feedback['too_similar_to_prompt']):
-            results['score']=0
-            results['correct']=False
+            results['score'] = 0
+            results['correct'] = False

-        results['success']=True
+        results['success'] = True

-        #Generate short form output--number of problem areas identified in feedback
+        # Generate short form output--number of problem areas identified in feedback

-        #Add feedback to results if available
+        # Add feedback to results if available
         results['feedback'] = {}
         if 'topicality' in feedback and 'prompt_overlap' in feedback:
             results['feedback'].update({
-                'topicality' : feedback['topicality'],
-                'prompt-overlap' : feedback['prompt_overlap'],
+                'topicality': feedback['topicality'],
+                'prompt-overlap': feedback['prompt_overlap'],
             })

         results['feedback'].update(
             {
-                'spelling' : feedback['spelling'],
-                'grammar' : feedback['grammar'],
-                'markup-text' : feedback['markup_text'],
+                'spelling': feedback['spelling'],
+                'grammar': feedback['grammar'],
+                'markup-text': feedback['markup_text'],
             }
         )

     else:
-        #If error, success is False.
-        results['success']=False
+        # If error, success is False.
+        results['success'] = False

     return results
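
`grade` consumes the same dictionary that training produces: 'prompt', 'extractor', 'model', and 'score' (the training score list, used for the confidence bounds). A hedged sketch, assuming a model pickled earlier by model_creator.dump_model_to_file; the path is illustrative:

    import pickle
    import grade

    grader_data = pickle.load(open("models/biology_prompt.p"))  # hypothetical file
    results = grade.grade(grader_data, "The cell membrane keeps the inside of the cell separate.")
    if results['success']:
        print results['score'], results['confidence'], results['feedback']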

 def grade_generic(grader_data, numeric_features, textual_features):
     """
     Grades a set of numeric and textual features using a generic model
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
     textual_features - list of textual feature to predict on
     """

-    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
+    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}

-    has_error=False
+    has_error = False

-    #Try to find and load the model file
-    grader_set=predictor_set.PredictorSet(type="test")
+    # Try to find and load the model file
+    grader_set = predictor_set.PredictorSet(type="test")

-    #Try to add essays to essay set object
+    # Try to add essays to essay set object
     try:
-        grader_set.add_row(numeric_features, textual_features,0)
+        grader_set.add_row(numeric_features, textual_features, 0)
     except:
         results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
-        has_error=True
+        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
     try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        results['score']=grader_data['model'].predict(grader_feats)[0]
-    except :
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
+        results['score'] = grader_data['model'].predict(grader_feats)[0]
+    except:
         results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
     except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

     if not has_error:
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
     return results

-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
     """
     Determines a confidence in a certain score, given proper input parameters
     algorithm- from util_functions.AlgorithmTypes
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
     max_score=max(numpy.asarray(scores))
     if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
         #If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
+        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
         #TODO: Normalize confidence somehow here
         confidence=raw_confidence
     elif hasattr(model, "predict"):
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
         confidence = 0

     return confidence
-
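
The classification branch reads the confidence off the `predict_proba` row at column `score - min_score`, i.e. the predicted probability of the score point the model actually chose. (Note that `grade_generic` above calls this with four arguments against a five-parameter signature; the surrounding try/except turns that into a logged exception.) A standalone sketch of the same indexing idea with a toy classifier; the data is invented:

    import numpy
    from sklearn.ensemble import GradientBoostingClassifier

    X = numpy.array([[0.1], [0.2], [0.8], [0.9], [0.15], [0.85]])
    y = numpy.array([0, 0, 1, 1, 0, 1])
    model = GradientBoostingClassifier(n_estimators=10).fit(X, y)

    probs = model.predict_proba([[0.95]])[0]  # one probability per class
    score = model.predict([[0.95]])[0]
    confidence = probs[int(score - y.min())]
    print score, confidence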
-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
 import numpy
 import re
@@ -19,7 +19,8 @@ import feature_extractor
 import logging
 import predictor_extractor

-log=logging.getLogger()
+log = logging.getLogger()
+

 def read_in_test_data(filename):
     """
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
     prompt_string = open(filename).read()
     return prompt_string

-def read_in_test_data_twocolumn(filename,sep=","):
+
+def read_in_test_data_twocolumn(filename, sep=","):
     """
     Reads in a two column version of the test data.
     Filename must point to a delimited file.
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     return x

-def get_cv_error(clf,feats,scores):
+
+def get_cv_error(clf, feats, scores):
     """
     Gets cross validated error for a given classifier, set of features, and scores
     clf - classifier
     feats - features to feed into the classified and cross validate over
     scores - scores associated with the features -- feature row 1 associates with score 1, etc.
     """
-    results={'success' : False, 'kappa' : 0, 'mae' : 0}
+    results = {'success': False, 'kappa': 0, 'mae': 0}
     try:
-        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
-        err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
-        kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-        results['mae']=err
-        results['kappa']=kappa
-        results['success']=True
+        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
+        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
+        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
+        results['mae'] = err
+        results['kappa'] = kappa
+        results['success'] = True
     except ValueError:
-        #If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
+        # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
         log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
     except:
         log.exception("Error getting cv error estimates.")

     return results

+
 def get_algorithms(type):
     """
     Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
@@ -116,14 +120,14 @@ def get_algorithms(type):
     """
     if type == util_functions.AlgorithmTypes.classification:
         clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+            max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
+            max_depth=4, random_state=1, min_samples_leaf=3)
     else:
         clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+            max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
+            max_depth=4, random_state=1, min_samples_leaf=3)

     return clf, clf2
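
One caveat for anyone reusing this: `learn_rate` is the parameter name from the scikit-learn versions this code targets; later releases renamed it. On a current scikit-learn the equivalent construction would be something like the sketch below (not part of this commit):

    from sklearn.ensemble import GradientBoostingClassifier

    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                     max_depth=4, random_state=1, min_samples_leaf=3)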
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
     train_feats = f.gen_feats(predictor_set)

-    clf,clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
+    clf, clf2 = get_algorithms(type)
+    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

     try:
         set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
     train_feats = f.gen_feats(essays)

     set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score)))>5:
+    if len(util_functions.f7(list(set_score))) > 5:
         type = util_functions.AlgorithmTypes.regression
     else:
         type = util_functions.AlgorithmTypes.classification

-    clf,clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,essays._score)
+    clf, clf2 = get_algorithms(type)
+    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

     try:
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results

+
 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
     """
     Writes out a model to a file.
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
     classifier is a trained classifier
     model_path is the path of write out the model file to
     """
-    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
+    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
     pickle.dump(model_file, file=open(model_path, "w"))

-def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
+
+def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
     """
     Function that creates essay set, extracts features, and writes out model
     See above functions for argument descriptions
     """
-    essay_set=create_essay_set(text_score,prompt)
-    feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
-    dump_model_to_file(prompt,feature_ext,clf,model_path)
+    essay_set = create_essay_set(text_score, prompt)
+    feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
+    dump_model_to_file(prompt, feature_ext, clf, model_path)
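
`dump_model_to_file` pickles everything `grade.grade` later needs into one dictionary, so loading is a single pickle.load (the path below is illustrative). As an aside, `create_essay_set_and_dump_model` passes `text_score`, a name not defined in its scope, so this helper appears unexercised; callers presumably compose the pieces directly:

    import pickle

    model_file = pickle.load(open("models/sample_model.p"))  # hypothetical path
    # contains 'prompt', 'extractor', 'model', 'text', 'score'
    print model_file['prompt']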
@@ -16,17 +16,18 @@ import logging
 import math
 from feature_extractor import FeatureExtractor

-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

 log = logging.getLogger(__name__)

+
 class PredictorExtractor(object):
     def __init__(self):
         self._extractors = []
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

-        div_length=len(p_set._essay_sets)
-        if div_length==0:
-            div_length=1
+        div_length = len(p_set._essay_sets)
+        if div_length == 0:
+            div_length = 1

-        #Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200/div_length))
-        for i in xrange(0,len(p_set._essay_sets)):
+        # Ensures that even with a large amount of input textual features, training time stays reasonable
+        max_feats2 = int(math.floor(200 / div_length))
+        for i in xrange(0, len(p_set._essay_sets)):
             self._extractors.append(FeatureExtractor())
             self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
             self._initialized = True
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
         Generates features based on an iput p_set
         p_set - PredictorSet
         """
-        if self._initialized!=True:
+        if self._initialized != True:
             error_message = "Dictionaries have not been initialized."
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

         textual_features = []
-        for i in xrange(0,len(p_set._essay_sets)):
+        for i in xrange(0, len(p_set._essay_sets)):
             textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

         textual_matrix = numpy.concatenate(textual_features, axis=1)
...
@@ -11,12 +11,13 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
+

 class PredictorSet(object):
-    def __init__(self, type = "train"):
+    def __init__(self, type="train"):
         """
         Initialize variables and check essay set type
         """
@@ -24,13 +25,13 @@ class PredictorSet(object):
             type = "train"

         self._type = type
-        self._target=[]
-        self._textual_features=[]
-        self._numeric_features=[]
-        self._essay_sets=[]
+        self._target = []
+        self._textual_features = []
+        self._numeric_features = []
+        self._essay_sets = []

     def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
         if not isinstance(target, (int, long, float)):
             error_message = "Target is not a numeric value."
             log.exception(error_message)
@@ -46,8 +47,8 @@ class PredictorSet(object):
             log.exception(error_message)
             raise util_functions.InputError(textual_features, error_message)

-        #Do some length checking for parameters
-        if len(self._numeric_features)>0:
+        # Do some length checking for parameters
+        if len(self._numeric_features) > 0:
             numeric_length = len(self._numeric_features[-1])
             current_numeric_length = len(numeric_features)
             if numeric_length != current_numeric_length:
@@ -55,7 +56,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-        if len(self._textual_features)>0:
+        if len(self._textual_features) > 0:
             textual_length = len(self._textual_features[-1])
             current_textual_length = len(textual_features)
             if textual_length != current_textual_length:
@@ -63,9 +64,9 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(textual_features, error_message)

-        #Now check to see if text features and numeric features are individually correct
-        for i in xrange(0,len(numeric_features)):
+        # Now check to see if text features and numeric features are individually correct
+        for i in xrange(0, len(numeric_features)):
             try:
                 numeric_features[i] = float(numeric_features[i])
             except:
@@ -73,8 +74,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-        for i in xrange(0,len(textual_features)):
+        for i in xrange(0, len(textual_features)):
             try:
                 textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
             except:
@@ -82,19 +82,18 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(textual_features, error_message)

-        #Create essay sets for textual features if needed
-        if len(self._textual_features)==0:
-            for i in xrange(0,len(textual_features)):
+        # Create essay sets for textual features if needed
+        if len(self._textual_features) == 0:
+            for i in xrange(0, len(textual_features)):
                 self._essay_sets.append(essay_set.EssaySet(type=self._type))

-        #Add numeric and textual features
+        # Add numeric and textual features
         self._numeric_features.append(numeric_features)
         self._textual_features.append(textual_features)

-        #Add targets
+        # Add targets
         self._target.append(target)

-        #Add textual features to essay sets
-        for i in xrange(0,len(textual_features)):
+        # Add textual features to essay sets
+        for i in xrange(0, len(textual_features)):
             self._essay_sets[i].add_essay(textual_features[i], target)
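
`add_row` validates each row, coerces numerics to float and text to ascii, and keeps one `EssaySet` per textual column. A short sketch of feeding it mixed predictors (values invented):

    import predictor_set

    pset = predictor_set.PredictorSet(type="train")
    pset.add_row([1.5, 10], ["first text feature"], 2)
    pset.add_row([2.0, 3], ["second text feature"], 0)
    # one essay set per textual column, one target per row
    print len(pset._essay_sets), len(pset._target)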
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 100
 QUICK_TEST_LIMIT = 5

+
 class DataLoader():
     def load_text_files(self, pathname):
         filenames = os.listdir(pathname)
@@ -28,34 +29,36 @@ class DataLoader():
         """
         pass

+
 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]

-        #Sort so neg is first
+        # Sort so neg is first
         directories.sort()

-        #We need to have both a postive and a negative folder to classify
-        if len(directories)!=2:
+        # We need to have both a postive and a negative folder to classify
+        if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))

         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos

         return scores, text

+
 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text[0], basestring):
             self.create_model_generic = False
         else:
@@ -67,6 +70,7 @@ class ModelCreator():
         else:
             return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)

+
 class Grader():
     def __init__(self, model_data):
         self.model_data = model_data
@@ -77,6 +81,7 @@ class Grader():
         else:
             return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))

+
 class GenericTest(object):
     loader = DataLoader
     data_path = ""
@@ -87,11 +92,11 @@ class GenericTest(object):
         data_loader = self.loader(os.path.join(TEST_PATH, self.data_path))
         scores, text = data_loader.load_data()

-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
        shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -121,12 +126,13 @@ class GenericTest(object):
         self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
         self.assertLessEqual(cv_mae, self.expected_mae_max)

-class PolarityTest(unittest.TestCase,GenericTest):
+
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
...
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Requires aspell to be installed and added to the path
 from external_code.fisher import fisher

 aspell_path = "aspell"
@@ -14,17 +14,18 @@ import pickle
 import logging
 import sys

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"

+
 class AlgorithmTypes(object):
     """
     Defines what types of algorithm can be used
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
     regression = "regression"
     classification = "classifiction"

+
 def create_model_path(model_path):
     """
     Creates a path to model files
     model_path - string
     """
     if not model_path.startswith("/") and not model_path.startswith("models/"):
-        model_path="/" + model_path
+        model_path = "/" + model_path
     if not model_path.startswith("models"):
         model_path = "models" + model_path
     if not model_path.endswith(".p"):
-        model_path+=".p"
+        model_path += ".p"

     return model_path

+
 def sub_chars(string):
     """
     Strips illegal characters from a string. Used to sanitize input essays.
@@ -53,7 +56,7 @@ def sub_chars(string):
     Returns sanitized string.
     string - string
     """
-    #Define replacement patterns
+    # Define replacement patterns
     sub_pat = r"[^A-Za-z\.\?!,';:]"
     char_pat = r"\."
     com_pat = r","
@@ -63,9 +66,9 @@ def sub_chars(string):
     col_pat = r":"
     whitespace_pat = r"\s{1,}"

-    #Replace text. Ordering is very important!
+    # Replace text. Ordering is very important!
     nstring = re.sub(sub_pat, " ", string)
-    nstring = re.sub(char_pat," .", nstring)
+    nstring = re.sub(char_pat, " .", nstring)
     nstring = re.sub(com_pat, " ,", nstring)
     nstring = re.sub(ques_pat, " ?", nstring)
     nstring = re.sub(excl_pat, " !", nstring)
@@ -84,7 +87,7 @@ def spell_correct(string):
     string - string
     """

-    #Create a temp file so that aspell could be used
+    # Create a temp file so that aspell could be used
     f = open('tmpfile', 'w')
     f.write(string)
     f_path = os.path.abspath(f.name)
@@ -93,16 +96,16 @@ def spell_correct(string):
         p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
     except:
         log.exception("Could not find aspell, so could not spell correct!")
-        #Return original string if aspell fails
-        return string,0, string
+        # Return original string if aspell fails
+        return string, 0, string

-    #Aspell returns a list of incorrect words with the above flags
+    # Aspell returns a list of incorrect words with the above flags
     incorrect = p.readlines()
     p.close()
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
         if(len(incorrect[i]) > 10):
-            #Reformat aspell output to make sense
+            # Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
                 begstring = incorrect[i][2:match.start()]
@@ -117,19 +120,19 @@ def spell_correct(string):
                 incorrect_words.append(begword)
                 correct_spelling.append(sug)

-    #Create markup based on spelling errors
+    # Create markup based on spelling errors
     newstring = string
     markup_string = string
-    already_subbed=[]
+    already_subbed = []
     for i in range(0, len(incorrect_words)):
         sub_pat = r"\b" + incorrect_words[i] + r"\b"
         sub_comp = re.compile(sub_pat)
         newstring = re.sub(sub_comp, correct_spelling[i], newstring)
         if incorrect_words[i] not in already_subbed:
-            markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
+            markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
             already_subbed.append(incorrect_words[i])

-    return newstring,len(incorrect_words),markup_string
+    return newstring, len(incorrect_words), markup_string

 def ngrams(tokens, min_n, max_n):
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
     max_feats2 is the maximum number of features to consider in the second (final) pass
     Returns a list of words that constitute the significant vocabulary
     """
-    dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
+    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
     dict_mat = dict.fit_transform(text)
     set_score = numpy.asarray(score, dtype=numpy.int)
     med_score = numpy.median(set_score)
@@ -335,6 +338,7 @@ def calc_list_average(l):

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

+
 def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
     """
     Calculates kappa correlation between rater_a and rater_b.
...
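
`quadratic_weighted_kappa` is the agreement metric `get_cv_error` reports. The standard definition: with ratings binned over N score points, kappa = 1 - (sum_ij w_ij O_ij) / (sum_ij w_ij E_ij), where O counts rating co-occurrences, E is the outer product of the two raters' histograms normalized to the same total, and w_ij = (i - j)^2 / (N - 1)^2. A compact reference sketch of that formula (a restatement, not the module's own code, which is truncated here):

    import numpy

    def qwk_sketch(a, b):
        a, b = numpy.asarray(a), numpy.asarray(b)
        lo, hi = min(a.min(), b.min()), max(a.max(), b.max())
        n = hi - lo + 1
        observed = numpy.zeros((n, n))
        for i, j in zip(a - lo, b - lo):
            observed[i, j] += 1
        expected = numpy.outer(numpy.bincount(a - lo, minlength=n),
                               numpy.bincount(b - lo, minlength=n)) / float(len(a))
        weights = numpy.fromfunction(lambda i, j: ((i - j) ** 2) / float((n - 1) ** 2), (n, n))
        return 1.0 - (weights * observed).sum() / (weights * expected).sum()

    print qwk_sketch([1, 2, 3, 3], [1, 2, 2, 3])  # 0.8; 1.0 means perfect agreement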