Commit 31d4363d by gradyward

Stylistic cleanup

parent 9c16fbbe
@@ -3,10 +3,13 @@ Functions that create a machine learning model from training data
 """
 import os
-import sys
 import logging
 import numpy
+import sys
 # Constructs a log
 log = logging.getLogger(__name__)
 # Setup base path so that we can import modules who are dependent on it
@@ -15,7 +18,7 @@ sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)
-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import util_functions
 from errors import *
 from datetime import datetime
@@ -179,11 +182,11 @@ def _extract_features_and_generate_model(essay_set):
     # We cannot be sure what kind of errors .fit could throw at us. Memory, Type, Interrupt, etc.
     except Exception as ex:
-        str = (
+        msg = (
             "predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}"
         ).format(ex)
-        log.exception(str)
-        raise ClassifierTrainingInternalError(str)
+        log.exception(msg)
+        raise ClassifierTrainingInternalError(msg)
     return feat_extractor, predict_classifier, cv_error_results
......
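The `str` → `msg` rename above matters beyond style: rebinding `str` inside a handler shadows the builtin for the rest of that scope. A minimal standalone sketch (illustrative names, not repository code) of the failure mode:

```python
def report_error(ex):
    str = "training failed: {}".format(ex)  # rebinds and shadows the builtin str
    return str(ex)                          # raises TypeError: 'str' object is not callable


def report_error_fixed(ex):
    msg = "training failed: {}".format(ex)  # no shadowing; the builtin stays usable
    return msg
```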
@@ -2,6 +2,7 @@
 Errors for the EASE repository
 """
+
 class EaseError(Exception):
     pass
@@ -45,6 +46,7 @@ class InputError(EaseError):
     """
     The user supplied an argument which was incorrect.
     """
+
     def __init__(self, expr, msg):
         self.expr = expr
         self.msg = msg
......
@@ -3,20 +3,21 @@ Defines an essay set object, which encapsulates essays from training and test se
 Performs spell and grammar checking, tokenization, and stemming.
 """
-import nltk
-import sys
 import random
 import os
 import logging
+from ease.errors import InputError
+import nltk
+import sys
 from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -97,9 +98,9 @@ class EssaySet(object):
         try:
             essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
         except UnicodeError as ex:
-            str = "Could not parse essay text into ascii: {}".format(ex)
-            log.exception(str)
-            raise EssaySetRequestError(ex)
+            msg = "Could not parse essay text into ascii: {}".format(ex)
+            log.exception(msg)
+            raise EssaySetRequestError(msg)
         # Validates that score is an integer and essay_text is a string.
         try:
@@ -107,9 +108,9 @@ class EssaySet(object):
             essay_text = str(essay_text)
             essay_generated = int(essay_generated)
         except TypeError:
-            str = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
-            log.exception(str)
-            raise EssaySetRequestError(str)
+            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
+            log.exception(ex)
+            raise EssaySetRequestError(ex)
         # Validates that essay generated is 0 or 1
         if essay_generated != 0 and essay_generated != 1:
......
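The `raise EssaySetRequestError(ex)` → `raise EssaySetRequestError(msg)` change in the first hunk above also changes what callers see: the raised error now carries the formatted context string rather than only the original `UnicodeError` text. A standalone sketch (illustrative values, not repository code) of the difference:

```python
class EssaySetRequestError(Exception):
    pass

ex = UnicodeError("ordinal not in range(128)")
msg = "Could not parse essay text into ascii: {}".format(ex)

print(str(EssaySetRequestError(ex)))   # ordinal not in range(128)
print(str(EssaySetRequestError(msg)))  # Could not parse essay text into ascii: ordinal not in range(128)
```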
@@ -2,24 +2,26 @@
 Extracts features from training set and test set essays
 """
-import numpy
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 import os
 from itertools import chain
 import operator
 import logging
+import numpy
+import nltk
+import sys
+from sklearn.feature_extraction.text import CountVectorizer
 from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -79,9 +81,8 @@ class FeatureExtractor(object):
             sum([len(essay) for essay in essay_set._cleaned_essays]))
         # Gets the number and positions of grammar errors
-        good_pos_tags, bad_pos_positions = self._get_grammar_errors(
-            essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens
-        )
+        good_pos_tags, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
+                                                                    essay_set._cleaned_essays)
         # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
         # grammatical errors. Though this was not what the original author used, it is clearly what his code
         # implies, as if this is intended to be a true "grammar errors per character", we should have that
@@ -154,7 +155,7 @@ class FeatureExtractor(object):
         # SEE COMMENT AROUND LINE 85
         good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
-                                                                          essay_set._cleaned_essays, essay_set._tokens)
+                                                                          essay_set._cleaned_essays)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]
         length_array = numpy.array((
@@ -204,7 +205,7 @@ class FeatureExtractor(object):
         prompt_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             prompt_overlap.append(len([i for i in j if i in prompt_toks]))
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
@@ -212,7 +213,7 @@ class FeatureExtractor(object):
         expand_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             expand_overlap.append(len([i for i in j if i in expand_syns]))
             expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -221,7 +222,7 @@ class FeatureExtractor(object):
         return prompt_arr.copy()
-    def _get_grammar_errors(self, pos, essays, tokens):
+    def _get_grammar_errors(self, pos, essays):
         """
         Internal function to get the number of grammar errors in given text
@@ -251,7 +252,7 @@ class FeatureExtractor(object):
             start, end = bad_pos_tuples[m]
             for j in xrange(m + 1, len(bad_pos_tuples)):
                 lstart, lend = bad_pos_tuples[j]
-                if lstart >= start and lstart <= end:
+                if start <= lstart <= end:
                     bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                     to_delete.append(j)
@@ -268,7 +269,8 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions
-    def _get_good_pos_ngrams(self):
+    @staticmethod
+    def _get_good_pos_ngrams():
         """
         Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
......
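The chained comparison introduced around `bad_pos_tuples` (`start <= lstart <= end`) is the interval-overlap test used when merging adjacent error spans. A minimal, self-contained sketch of that merge pattern (illustrative names; the repository operates on POS-tag positions):

```python
def merge_overlapping(spans):
    """Merge [start, end] spans that overlap, assuming they are sorted by start."""
    merged = []
    for start, end in spans:
        if merged and merged[-1][0] <= start <= merged[-1][1]:
            # Current span begins inside the previous one: extend the previous span.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged


print(merge_overlapping([[0, 4], [2, 6], [9, 11]]))  # [[0, 6], [9, 11]]
```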
@@ -8,11 +8,12 @@ import logging
 import sys
 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 from errors import *
@@ -45,7 +46,6 @@ def grade(grader_data, submission):
     # Instantiates the Essay set which will carry our essay while it is being classified and graded.
     grader_set = EssaySet(essay_type="test")
-    feedback = {}
     # Retrieves the model and extractor we will be using
     model, extractor = _get_classifier_and_extractor(grader_data)
......
 import unittest
 import os
-from ease import create, grade
 import random
 import logging
 import json
+from ease import create, grade
 log = logging.getLogger(__name__)
 ROOT_PATH = os.path.abspath(__file__)
@@ -14,8 +16,10 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 50
 QUICK_TEST_LIMIT = 5
+# noinspection PyClassHasNoInit
 class DataLoader():
-    def load_text_files(self, pathname):
+    @staticmethod
+    def load_text_files(pathname):
         filenames = os.listdir(pathname)
         text = []
         for filename in filenames:
@@ -23,7 +27,8 @@ class DataLoader():
             text.append(data[:CHARACTER_LIMIT])
         return text
-    def load_json_file(self, filename):
+    @staticmethod
+    def load_json_file(filename):
         datafile = open(os.path.join(filename))
         data = json.load(datafile)
         return data
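Converting `load_text_files` and `load_json_file` to `@staticmethod` is safe because neither body touches `self`. A minimal sketch (not repository code) of the pattern:

```python
class Loader(object):
    @staticmethod
    def read_lines(path):
        # No use of self or cls, so the method can be static.
        with open(path) as handle:
            return handle.read().splitlines()

# Callable on the class or on an instance:
#   Loader.read_lines("some_file.txt")
#   Loader().read_lines("some_file.txt")
```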
@@ -34,38 +39,42 @@
         """
         pass
 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname
     def load_data(self):
         filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                       not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
+        # Sort so neg is first
         directories.sort()
-        #We need to have both a postive and a negative folder to classify
-        if len(directories)!=2:
+        # We need to have both a postive and a negative folder to classify
+        if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos
         return scores, text
 class JSONLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname
     def load_data(self):
         filenames = os.listdir(self.pathname)
-        files = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if os.path.isfile(os.path.join(self.pathname,f)) if f.endswith(".json")]
+        files = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                 os.path.isfile(os.path.join(self.pathname, f)) if f.endswith(".json")]
         files.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(files) == 0:
             return [], []
@@ -76,19 +85,19 @@ class JSONLoader(DataLoader):
         all_scores = []
         all_text = []
-        for i in xrange(0,len(data)):
+        for i in xrange(0, len(data)):
             scores = [d['score'] for d in data[i]]
             text = [d['text'] for d in data[i]]
             if isinstance(scores[0], list):
                 new_text = []
                 new_scores = []
-                for i in xrange(0,len(scores)):
-                    text = scores[i]
-                    s = scores[i]
-                    for j in s:
+                for j in xrange(0, len(scores)):
+                    text = scores[j]
+                    s = scores[j]
+                    for k in s:
                         new_text.append(text)
-                        new_scores.append(j)
+                        new_scores.append(k)
                 text = new_text
                 scores = new_scores
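The renames in the hunk above (`i` → `j`, `j` → `k`) stop the nested loops from rebinding the outer loop variable. A standalone sketch (illustrative data, not repository code) of why that rebinding is dangerous:

```python
data = [[10, 20, 30], [40, 50]]
for i in range(len(data)):
    for i in range(len(data[i])):  # rebinds i; the outer index is lost
        pass
    print(i)  # prints 2, then 1 -- the inner loop's last value, not the outer indices 0 and 1
```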
@@ -97,12 +106,13 @@
         return all_scores, all_text
 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text
-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text, list):
             self.create_model_generic = False
         else:
@@ -112,7 +122,9 @@ class ModelCreator():
         if not self.create_model_generic:
             return create.create(self.text, self.scores, "")
         else:
-            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
+            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []),
+                                         self.scores)
 class Grader():
     def __init__(self, model_data):
@@ -122,7 +134,9 @@ class Grader():
         if isinstance(submission, basestring):
             return grade.grade(self.model_data, submission)
         else:
-            return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
+            return grade.grade_generic(self.model_data, submission.get('numeric_values', []),
+                                       submission.get('textual_values', []))
 class GenericTest(object):
     loader = DataLoader
@@ -137,11 +151,11 @@ class GenericTest(object):
         return scores, text
     def generic_setup(self, scores, text):
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
         shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -159,45 +173,46 @@ class GenericTest(object):
         grader = Grader(results)
         results = grader.grade(self.text[0])
-        assert results['success']==True
+        assert results['success'] == True
     def scoring_accuracy(self):
         random.seed(1)
         model_creator = ModelCreator(self.scores, self.text)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         cv_kappa = results['cv_kappa']
         cv_mae = results['cv_mean_absolute_error']
-        assert cv_kappa>=self.expected_kappa_min
-        assert cv_mae <=self.expected_mae_max
+        assert cv_kappa >= self.expected_kappa_min
+        assert cv_mae <= self.expected_mae_max
     def generic_model_creation_and_grading(self):
         log.info(self.scores)
         log.info(self.text)
-        score_subset = [random.randint(0,100) for i in xrange(0,min([QUICK_TEST_LIMIT, len(self.scores)]))]
+        score_subset = [random.randint(0, 100) for i in xrange(0, min([QUICK_TEST_LIMIT, len(self.scores)]))]
         text_subset = self.text[:QUICK_TEST_LIMIT]
         text_subset = {
-            'textual_values' : [[t] for t in text_subset],
-            'numeric_values' : [[1] for i in xrange(0,len(text_subset))]
+            'textual_values': [[t] for t in text_subset],
+            'numeric_values': [[1] for i in xrange(0, len(text_subset))]
         }
         model_creator = ModelCreator(score_subset, text_subset)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         grader = Grader(results)
         test_text = {
-            'textual_values' : [self.text[0]],
-            'numeric_values' : [1]
+            'textual_values': [self.text[0]],
+            'numeric_values': [1]
         }
         results = grader.grade(test_text)
-        assert results['success']==True
+        assert results['success'] == True
-class PolarityTest(unittest.TestCase,GenericTest):
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"
-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -214,12 +229,13 @@ class PolarityTest(unittest.TestCase,GenericTest):
     def test_generic_model_creation_and_grading(self):
         self.generic_model_creation_and_grading()
 class JSONTest(GenericTest):
     loader = JSONLoader
     data_path = "data/json_data"
-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -227,10 +243,11 @@ class JSONTest(GenericTest):
         self.scores, self.text = self.load_data()
         return self.scores, self.text
 def test_loop():
     json_test = JSONTest()
     scores, text = json_test.setUp()
-    for i in xrange(0,len(scores)):
+    for i in xrange(0, len(scores)):
         json_test.generic_setup(scores[i], text[i])
         yield json_test.model_creation_and_grading
         yield json_test.scoring_accuracy
......
 from unittest import TestCase
 from nose.tools import assert_equal
 from mock import patch
+from ease.util_functions import spell_correct
@@ -35,7 +36,6 @@ class SpellCheckUnitTest(TestCase):
     @patch("util_functions.os.popen")
     def test_aspell_not_found(self, popen_mock):
         # Expected behavior when aspell is not installed is to return the original
         # string with no corrections.
         popen_mock.side_effect = OSError
......
@@ -23,9 +23,9 @@ log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
@@ -100,7 +100,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if (len(incorrect[i]) > 10):
+        if len(incorrect[i]) > 10:
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -167,12 +167,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
     """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
-    dict_matrix = dict.fit_transform(essays)
+    dictionary = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dictionary.fit_transform(essays)
     set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if (med_score == 0):
+    if med_score == 0:
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -190,12 +190,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
         fish_vals.append(fish_val)
     cutoff = 1
-    if (len(fish_vals) > max_features_pass_2):
+    if len(fish_vals) > max_features_pass_2:
         cutoff = sorted(fish_vals)[max_features_pass_2]
     good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
+    get_var = lambda search_list, ind: [search_list[i] for i in ind]
+    vocab = get_var(dictionary.get_feature_names(), good_cols)
     return vocab
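The `getVar` → `get_var` rename fixes the naming style, though PEP 8 also discourages binding a lambda to a name at all; a plain comprehension does the same column selection. A self-contained sketch (illustrative data, not repository code):

```python
feature_names = ["alpha", "beta", "gamma", "delta"]
good_cols = [0, 2]

get_var = lambda search_list, ind: [search_list[i] for i in ind]  # the form the diff keeps
subset = [feature_names[i] for i in good_cols]                    # equivalent comprehension

assert get_var(feature_names, good_cols) == subset == ["alpha", "gamma"]
```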
@@ -219,14 +219,13 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         chunks.append(range(range_min, range_max))
     preds = []
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    chunk_vec = numpy.asarray(range(0, len(chunks)))
     for i in xrange(0, len(chunks)):
         loop_inds = list(
             chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return (all_preds)
+    return all_preds
 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
@@ -260,7 +259,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0
-    if (num_ratings > 1):
+    if num_ratings > 1:
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
......