Commit da78277e by gradyward

Cleaned up all of the files stylistically

parent a990b25e
......@@ -7,7 +7,7 @@ import sys
import logging
import numpy
#Define base path and add to sys path
# Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
......@@ -24,6 +24,7 @@ import json
#Make a log
log = logging.getLogger(__name__)
def dump_input_data(text, score):
try:
file_path = base_path + "/tests/data/json_data/"
......@@ -32,14 +33,15 @@ def dump_input_data(text, score):
filename = prefix + time_suffix + ".json"
json_data = []
for i in xrange(0, len(text)):
json_data.append({'text' : text[i], 'score' : score[i]})
json_data.append({'text': text[i], 'score': score[i]})
with open(file_path + filename, 'w+') as outfile:
json.dump(json_data, outfile)
except:
error = "Could not dump data to file."
log.exception(error)
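
For context, dump_input_data pairs each essay with its score and serializes the result to JSON. A minimal self-contained sketch of the equivalent logic (the directory and filename scheme here are illustrative; the real prefix and time suffix are defined in the elided lines above):

import json
import os
import time

def dump_input_data_sketch(text, score, file_path="tests/data/json_data/"):
    # Pair each essay with its score, mirroring the loop above
    json_data = [{'text': t, 'score': s} for t, s in zip(text, score)]
    # Timestamped filename; the real prefix/suffix scheme is elided above
    filename = "input_" + time.strftime("%Y%m%d%H%M%S") + ".json"
    with open(os.path.join(file_path, filename), 'w+') as outfile:
        json.dump(json_data, outfile)
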
def create(text,score,prompt_string, dump_data=False):
def create(text, score, prompt_string, dump_data=False):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
......@@ -53,11 +55,11 @@ def create(text,score,prompt_string, dump_data=False):
algorithm = select_algorithm(score)
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm,
'score' : score, 'text' : text, 'prompt' : prompt_string}
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
'score': score, 'text': text, 'prompt': prompt_string}
if len(text)!=len(score):
if len(text) != len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
......@@ -72,13 +74,14 @@ def create(text,score,prompt_string, dump_data=False):
log.exception(msg)
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm = algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
algorithm=algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['algorithm'] = algorithm
results['success']=True
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
......@@ -87,7 +90,7 @@ def create(text,score,prompt_string, dump_data=False):
return results
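
A hypothetical call, to make the input shapes and the result dictionary concrete (all data here is made up):

training_texts = ["The first training essay ...", "The second training essay ..."]
training_scores = [1, 2]
results = create(training_texts, training_scores, "Describe a book that changed your thinking.")
if results['success']:
    model = results['classifier']        # fitted sklearn estimator
    extractor = results['feature_ext']   # fitted feature extractor
    kappa = results['cv_kappa']          # cross-validated agreement estimate
else:
    failure_messages = results['errors']
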
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors
......@@ -99,10 +102,10 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
'feature_ext': "", 'classifier': "", 'algorithm': algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
......@@ -120,12 +123,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
results['classifier'] = classifier
results['success'] = True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
......@@ -133,11 +137,12 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
return results
def select_algorithm(score_list):
#Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score_list)))>5:
if len(util_functions.f7(list(score_list))) > 5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
......
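
The rule above keys off the number of distinct score points (f7, defined later in util_functions, is an order-preserving dedupe). Illustrative behavior, assuming the truncated body returns algorithm:

select_algorithm([0, 1, 0, 2, 1, 2])   # 3 distinct points  -> classification
select_algorithm(range(1, 12))         # 11 distinct points -> regression
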
......@@ -27,7 +27,7 @@ class EssaySet(object):
"""
Initialize variables and check essay set type
"""
if(essaytype != "train" and essaytype != "test"):
if (essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = essaytype
......@@ -52,7 +52,7 @@ class EssaySet(object):
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if(len(self._id) > 0):
if (len(self._id) > 0):
max_id = max(self._id)
else:
max_id = 0
......@@ -71,9 +71,10 @@ class EssaySet(object):
essay_text = str(essay_text)
except:
# Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
log.exception(
"Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
if isinstance(essay_score, int) and isinstance(essay_text, basestring) \
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
......@@ -83,7 +84,7 @@ class EssaySet(object):
except:
essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
cleaned_essay = util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
......@@ -113,7 +114,7 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(isinstance(prompt_text, basestring)):
if (isinstance(prompt_text, basestring)):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
......@@ -134,7 +135,7 @@ class EssaySet(object):
all_syns = []
for word in e_toks:
synonyms = util_functions.get_wordnet_syns(word)
if(len(synonyms) > max_syns):
if (len(synonyms) > max_syns):
synonyms = random.sample(synonyms, max_syns)
all_syns.append(synonyms)
new_essays = []
......
......@@ -8,7 +8,7 @@ import os
import numpy
import logging
#Append sys to base path to import the following modules
# Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
......@@ -25,7 +25,8 @@ import math
log = logging.getLogger(__name__)
def grade(grader_data,submission):
def grade(grader_data, submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
......@@ -39,10 +40,10 @@ def grade(grader_data,submission):
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
has_error = False
grader_set=EssaySet(essaytype="test")
grader_set = EssaySet(essaytype="test")
feedback = {}
model, extractor = get_classifier_and_ext(grader_data)
......@@ -53,28 +54,29 @@ def grade(grader_data,submission):
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
grader_set.add_essay(str(submission), 0)
grader_set.update_prompt(str(grader_data['prompt']))
except Exception:
error_message = "Essay could not be added to essay set:{0}".format(submission)
log.exception(error_message)
results['errors'].append(error_message)
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
try:
grader_feats=extractor.gen_feats(grader_set)
feedback=extractor.gen_feedback(grader_set,grader_feats)[0]
results['score']=int(model.predict(grader_feats)[0])
grader_feats = extractor.gen_feats(grader_set)
feedback = extractor.gen_feedback(grader_set, grader_feats)[0]
results['score'] = int(model.predict(grader_feats)[0])
except Exception:
error_message = "Could not extract features and score essay."
log.exception(error_message)
results['errors'].append(error_message)
has_error=True
has_error = True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'], grader_data['score'])
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'],
grader_data['score'])
except Exception:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -82,11 +84,11 @@ def grade(grader_data,submission):
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if( 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
if 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']:
results['score'] = 0
results['correct'] = False
results['success']=True
results['success'] = True
#Generate short form output--number of problem areas identified in feedback
......@@ -94,24 +96,25 @@ def grade(grader_data,submission):
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
'topicality': feedback['topicality'],
'prompt-overlap': feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
}
'spelling': feedback['spelling'],
'grammar': feedback['grammar'],
'markup-text': feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
results['success'] = False
return results
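
Pulling the branches above together, a successful grade() call returns a dictionary shaped roughly as follows (values are made up; the topicality and prompt-overlap keys appear only when the extractor produced them):

{
    'errors': [],
    'tests': [],
    'score': 2,
    'confidence': 0.85,
    'success': True,
    'feedback': {
        'spelling': '...',
        'grammar': '...',
        'markup-text': '...',
    },
}
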
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
......@@ -123,38 +126,38 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
has_error=False
has_error = False
#Try to find and load the model file
grader_set=predictor_set.PredictorSet(essaytype="test")
grader_set = predictor_set.PredictorSet(essaytype="test")
model, extractor = get_classifier_and_ext(grader_data)
#Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
grader_set.add_row(numeric_features, textual_features, 0)
except Exception:
error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
log.exception(error_msg)
results['errors'].append(error_msg)
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
try:
grader_feats=extractor.gen_feats(grader_set)
results['score']=model.predict(grader_feats)[0]
grader_feats = extractor.gen_feats(grader_set)
results['score'] = model.predict(grader_feats)[0]
except Exception:
error_msg = "Could not extract features and score essay."
log.exception(error_msg)
results['errors'].append(error_msg)
has_error=True
has_error = True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'],model, grader_feats, results['score'])
results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
except Exception:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
......@@ -164,7 +167,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return results
def get_confidence_value(algorithm,model,grader_feats,score, scores):
def get_confidence_value(algorithm, model, grader_feats, score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
......@@ -172,21 +176,23 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
grader_feats - a row of features used by the model for classification/regression
score - The score assigned to the submission by a prior model
"""
min_score=min(numpy.asarray(scores))
max_score=max(numpy.asarray(scores))
min_score = min(numpy.asarray(scores))
max_score = max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
confidence = raw_confidence
elif hasattr(model, "predict"):
raw_confidence = model.predict(grader_feats)[0]
confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)), math.ceil(float(raw_confidence)) - float(raw_confidence))
confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)),
math.ceil(float(raw_confidence)) - float(raw_confidence))
else:
confidence = 0
return confidence
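
Worked example of the regression branch: confidence is the distance from the raw prediction to the farther of its two neighboring integers, so predictions near a score point are high-confidence. A minimal sketch of just that branch:

import math

def regression_confidence(raw_prediction):
    # 0.5 when exactly between two score points, approaching 1.0 near a
    # score point (and 0.0 at an exact integer, where floor == ceil)
    raw = float(raw_prediction)
    return max(raw - math.floor(raw), math.ceil(raw) - raw)

regression_confidence(2.9)   # 0.9
regression_confidence(2.5)   # 0.5
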
def get_classifier_and_ext(grader_data):
if 'classifier' in grader_data:
model = grader_data['classifier']
......
#Provides interface functions to create and save models
# Provides interface functions to create and save models
import numpy
import re
......@@ -19,7 +19,8 @@ import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
log = logging.getLogger()
def read_in_test_data(filename):
"""
......@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
def read_in_test_data_twocolumn(filename, sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
......@@ -86,21 +88,22 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
def get_cv_error(clf, feats, scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
results = {'success': False, 'kappa': 0, 'mae': 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
results['mae'] = err
results['kappa'] = kappa
results['success'] = True
except ValueError as ex:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
......@@ -110,6 +113,7 @@ def get_cv_error(clf,feats,scores):
return results
def get_algorithms(algorithm):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
......@@ -117,14 +121,14 @@ def get_algorithms(algorithm):
"""
if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
max_depth=4, random_state=1, min_samples_leaf=3)
clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2
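
These constructors target the old scikit-learn release this code was written against; later releases renamed the keyword from learn_rate to learning_rate. For reference, an equivalent construction on a current scikit-learn would look like this (a sketch, not part of the commit):

clf = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=100, learning_rate=.05,
    max_depth=4, random_state=1, min_samples_leaf=3)
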
......@@ -134,7 +138,7 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmType
"""
if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
......@@ -142,8 +146,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
clf, clf2 = get_algorithms(algorithm)
cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
......@@ -151,8 +155,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score = predictor_set._target
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
......@@ -172,25 +176,26 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
train_feats = f.gen_feats(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
if len(util_functions.f7(list(set_score))) > 5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
clf,clf2 = get_algorithms(algorithm)
clf, clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
cv_error_results = get_cv_error(clf2, train_feats, essays._score)
try:
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
......@@ -199,16 +204,17 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
classifier is a trained classifier
model_path is the path to write the model file out to
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
pickle.dump(model_file, file=open(model_path, "w"))
def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
essay_set=create_essay_set(text,score,prompt)
feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
dump_model_to_file(prompt,feature_ext,clf,model_path)
essay_set = create_essay_set(text, score, prompt)
feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
dump_model_to_file(prompt, feature_ext, clf, model_path)
......@@ -16,17 +16,18 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
......@@ -48,13 +49,13 @@ class PredictorExtractor(object):
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
div_length=len(p_set._essay_sets)
if div_length==0:
div_length=1
div_length = len(p_set._essay_sets)
if div_length == 0:
div_length = 1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
max_feats2 = int(math.floor(200 / div_length))
for i in xrange(0, len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
......@@ -66,13 +67,13 @@ class PredictorExtractor(object):
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True:
if self._initialized != True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
textual_features = []
for i in xrange(0,len(p_set._essay_sets)):
for i in xrange(0, len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
......
......@@ -11,26 +11,27 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log = logging.getLogger(__name__)
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, essaytype = "train"):
def __init__(self, essaytype="train"):
"""
Initialize variables and check essay set type
"""
if(essaytype != "train" and essaytype != "test"):
if (essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = essaytype
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self._essay_sets=[]
self._target = []
self._textual_features = []
self._numeric_features = []
self._essay_sets = []
def add_row(self, numeric_features, textual_features, target):
#Basic input checking
# Basic input checking
if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value."
log.exception(error_message)
......@@ -47,16 +48,16 @@ class PredictorSet(object):
raise util_functions.InputError(textual_features, error_message)
#Do some length checking for parameters
if len(self._numeric_features)>0:
numeric_length = len(self._numeric_features[-1])
if len(self._numeric_features) > 0:
numeric_length = len(self._numeric_features[-1])
current_numeric_length = len(numeric_features)
if numeric_length != current_numeric_length:
error_message = "Numeric features are an improper length."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if len(self._textual_features)>0:
textual_length = len(self._textual_features[-1])
if len(self._textual_features) > 0:
textual_length = len(self._textual_features[-1])
current_textual_length = len(textual_features)
if textual_length != current_textual_length:
error_message = "Textual features are an improper length."
......@@ -65,7 +66,7 @@ class PredictorSet(object):
#Now check to see if text features and numeric features are individually correct
for i in xrange(0,len(numeric_features)):
for i in xrange(0, len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except:
......@@ -73,8 +74,7 @@ class PredictorSet(object):
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
for i in xrange(0,len(textual_features)):
for i in xrange(0, len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except:
......@@ -83,8 +83,8 @@ class PredictorSet(object):
raise util_functions.InputError(textual_features, error_message)
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))
#Add numeric and textual features
......@@ -95,6 +95,6 @@ class PredictorSet(object):
self._target.append(target)
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
for i in xrange(0, len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
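
A hypothetical add_row usage, to make the expected argument shapes concrete (feature values are made up): each row supplies one list of numeric predictors, one list of textual predictors, and a numeric target, with lengths consistent across rows.

pset = PredictorSet(essaytype="train")
pset.add_row([12.5, 3.0], ["first text field", "second text field"], 1)
pset.add_row([10.0, 4.5], ["another text field", "and another"], 0)
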
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from fisher import pvalue
......@@ -15,17 +15,18 @@ import logging
import sys
import tempfile
log=logging.getLogger(__name__)
log = logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
#Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
"""
Defines what types of algorithm can be used
......@@ -33,20 +34,22 @@ class AlgorithmTypes(object):
regression = "regression"
classification = "classifiction"
def create_model_path(model_path):
"""
Creates a path to model files
model_path - string
"""
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
model_path = "/" + model_path
if not model_path.startswith("models"):
model_path = "models" + model_path
if not model_path.endswith(".p"):
model_path+=".p"
model_path += ".p"
return model_path
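
Tracing the three normalization steps gives, for example:

create_model_path("my_model")      # -> "models/my_model.p"
create_model_path("models/m1.p")   # -> "models/m1.p"  (already normalized)
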
def sub_chars(string):
"""
Strips illegal characters from a string. Used to sanitize input essays.
......@@ -66,7 +69,7 @@ def sub_chars(string):
#Replace text. Ordering is very important!
nstring = re.sub(sub_pat, " ", string)
nstring = re.sub(char_pat," .", nstring)
nstring = re.sub(char_pat, " .", nstring)
nstring = re.sub(com_pat, " ,", nstring)
nstring = re.sub(ques_pat, " ?", nstring)
nstring = re.sub(excl_pat, " !", nstring)
......@@ -101,7 +104,7 @@ def spell_correct(string):
except Exception:
log.exception("aspell process failed; could not spell check")
# Return original string if aspell fails
return string,0, string
return string, 0, string
finally:
f.close()
......@@ -109,7 +112,7 @@ def spell_correct(string):
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
if (len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
......@@ -128,16 +131,16 @@ def spell_correct(string):
#Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
already_subbed = []
for i in range(0, len(incorrect_words)):
sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring)
if incorrect_words[i] not in already_subbed:
markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
already_subbed.append(incorrect_words[i])
return newstring,len(incorrect_words),markup_string
return newstring, len(incorrect_words), markup_string
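
So spell_correct returns the corrected text, the number of misspellings, and a copy with each misspelled word wrapped in <bs> tags. Hypothetical behavior, assuming aspell flags "recieve" and suggests "receive":

spell_correct("I recieve mail")
# -> ("I receive mail", 1, "I <bs>recieve</bs> mail")
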
def ngrams(tokens, min_n, max_n):
......@@ -162,6 +165,7 @@ def f7(seq):
"""
seen = set()
seen_add = seen.add
#TODO Potential Improvement Here
return [x for x in seq if x not in seen and not seen_add(x)]
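
f7 deduplicates while preserving first-seen order: seen_add(x) always returns None (falsy), so it records x as a side effect without filtering it out, and binding seen.add to a local name avoids an attribute lookup per element. For example:

f7([3, 1, 3, 2, 1])   # -> [3, 1, 2]
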
......@@ -200,12 +204,12 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int)
med_score = numpy.median(set_score)
new_score = set_score
if(med_score == 0):
if (med_score == 0):
med_score = 1
new_score[set_score < med_score] = 0
new_score[set_score >= med_score] = 1
......@@ -223,7 +227,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals.append(fish_val)
cutoff = 1
if(len(fish_vals) > max_feats2):
if (len(fish_vals) > max_feats2):
cutoff = sorted(fish_vals)[max_feats2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
......@@ -253,12 +257,12 @@ def edit_distance(s1, s2):
else:
cost = 1
d[(i, j)] = min(
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
)
if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
return d[lenstr1 - 1, lenstr2 - 1]
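
The hunk above shows only the inner recurrence; a self-contained sketch of the optimal-string-alignment (Damerau-Levenshtein) distance it implements, with the standard border initialization filled in as an assumption:

def damerau_levenshtein_sketch(s1, s2):
    d = {}
    # Border cells: transforming a prefix to or from the empty string
    for i in range(-1, len(s1)):
        d[(i, -1)] = i + 1
    for j in range(-1, len(s2)):
        d[(-1, j)] = j + 1
    for i in range(len(s1)):
        for j in range(len(s2)):
            cost = 0 if s1[i] == s2[j] else 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,            # deletion
                d[(i, j - 1)] + 1,            # insertion
                d[(i - 1, j - 1)] + cost,     # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[(i - 2, j - 2)] + cost)  # transposition
    return d[(len(s1) - 1, len(s2) - 1)]

damerau_levenshtein_sketch("ab", "ba")   # 1 -- a single transposition
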
......@@ -299,7 +303,7 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
preds.append(list(sim_fit.predict(arr[chunks[i]])))
all_preds = list(chain(*preds))
return(all_preds)
return (all_preds)
def gen_model(clf, arr, sel_score):
......@@ -312,7 +316,7 @@ def gen_model(clf, arr, sel_score):
"""
set_score = numpy.asarray(sel_score, dtype=numpy.int)
sim_fit = clf.fit(arr, set_score)
return(sim_fit)
return (sim_fit)
def gen_preds(clf, arr):
......@@ -322,7 +326,7 @@ def gen_preds(clf, arr):
arr is a data array identical in dimension to the array clf was trained on
Returns the array of predictions.
"""
if(hasattr(clf, "predict_proba")):
if (hasattr(clf, "predict_proba")):
ret = clf.predict(arr)
# pred_score=preds.argmax(1)+min(x._score)
else:
......@@ -340,8 +344,10 @@ def calc_list_average(l):
total += value
return total / len(l)
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
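
The stdev lambda computes a Bessel-corrected sample standard deviation; an equivalent spelled-out version, for readability:

import math

def sample_stdev(d):
    mean = sum(d) / float(len(d))
    # n - 1 in the denominator: sample (not population) variance
    return math.sqrt(sum((x - mean) ** 2 for x in d) / (len(d) - 1))
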
def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Calculates kappa correlation between rater_a and rater_b.
......@@ -352,7 +358,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert(len(rater_a) == len(rater_b))
assert (len(rater_a) == len(rater_b))
rater_a = [int(a) for a in rater_a]
rater_b = [int(b) for b in rater_b]
if min_rating is None:
......@@ -360,7 +366,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
if max_rating is None:
max_rating = max(rater_a + rater_b)
conf_mat = confusion_matrix(rater_a, rater_b,
min_rating, max_rating)
min_rating, max_rating)
num_ratings = len(conf_mat)
num_scored_items = float(len(rater_a))
......@@ -370,7 +376,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
numerator = 0.0
denominator = 0.0
if(num_ratings > 1):
if (num_ratings > 1):
for i in range(num_ratings):
for j in range(num_ratings):
expected_count = (hist_rater_a[i] * hist_rater_b[j]
......@@ -390,7 +396,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
A confusion matrix shows how often 2 values agree and disagree
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
assert (len(rater_a) == len(rater_b))
rater_a = [int(a) for a in rater_a]
rater_b = [int(b) for b in rater_b]
min_rating = int(min_rating)
......@@ -450,7 +456,7 @@ def get_separator_words(toks1):
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
if (os.path.isfile(ESSAY_COR_TOKENS_PATH)):
toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
else:
essay_corpus = open(ESSAY_CORPUS_PATH).read()
......@@ -460,12 +466,12 @@ def get_separator_words(toks1):
sep_words = []
for word in tab_toks1.keys():
tok1_present = tab_toks1[word]
if(tok1_present > 2):
if (tok1_present > 2):
tok1_total = tab_toks1._N
tok2_present = toks2[word]
tok2_total = toks2._N
fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
if (fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
sep_words.append(word)
sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
return sep_words
......