Commit 31d4363d by gradyward

Stylistic cleanup

parent 9c16fbbe
@@ -3,10 +3,13 @@ Functions that create a machine learning model from training data
 """
 import os
-import sys
 import logging
 import numpy
+import sys
 # Constructs a log
 log = logging.getLogger(__name__)
 # Setup base path so that we can import modules who are dependent on it
@@ -15,7 +18,7 @@ sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)
-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import util_functions
 from errors import *
 from datetime import datetime
@@ -179,11 +182,11 @@ def _extract_features_and_generate_model(essay_set):
     # We cannot be sure what kind of errors .fit could throw at us. Memory, Type, Interrupt, etc.
     except Exception as ex:
-        str = (
+        msg = (
             "predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}"
         ).format(ex)
-        log.exception(str)
-        raise ClassifierTrainingInternalError(str)
+        log.exception(msg)
+        raise ClassifierTrainingInternalError(msg)
     return feat_extractor, predict_classifier, cv_error_results
......
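The `str` → `msg` rename above matters beyond style: rebinding `str` inside a handler shadows the builtin for the rest of that scope. A minimal standalone sketch (illustrative names, not repository code) of the failure mode:

```python
def report_error(ex):
    str = "training failed: {}".format(ex)  # rebinds and shadows the builtin str
    return str(ex)                          # raises TypeError: 'str' object is not callable


def report_error_fixed(ex):
    msg = "training failed: {}".format(ex)  # no shadowing; the builtin stays usable
    return msg
```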
@@ -2,6 +2,7 @@
 Errors for the EASE repository
 """
+
 class EaseError(Exception):
     pass
@@ -45,6 +46,7 @@ class InputError(EaseError):
     """
     The user supplied an argument which was incorrect.
     """
+
     def __init__(self, expr, msg):
         self.expr = expr
         self.msg = msg
......
@@ -3,20 +3,21 @@ Defines an essay set object, which encapsulates essays from training and test se
 Performs spell and grammar checking, tokenization, and stemming.
 """
-import nltk
-import sys
 import random
 import os
 import logging
+from ease.errors import InputError
+import nltk
+import sys
 from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -97,9 +98,9 @@ class EssaySet(object):
         try:
             essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
         except UnicodeError as ex:
-            str = "Could not parse essay text into ascii: {}".format(ex)
-            log.exception(str)
-            raise EssaySetRequestError(ex)
+            msg = "Could not parse essay text into ascii: {}".format(ex)
+            log.exception(msg)
+            raise EssaySetRequestError(msg)
         # Validates that score is an integer and essay_text is a string.
         try:
@@ -107,9 +108,9 @@ class EssaySet(object):
             essay_text = str(essay_text)
             essay_generated = int(essay_generated)
         except TypeError:
-            str = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
-            log.exception(str)
-            raise EssaySetRequestError(str)
+            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
+            log.exception(ex)
+            raise EssaySetRequestError(ex)
         # Validates that essay generated is 0 or 1
         if essay_generated != 0 and essay_generated != 1:
......
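The `raise EssaySetRequestError(ex)` → `raise EssaySetRequestError(msg)` change in the first hunk above also changes what callers see: the raised error now carries the formatted context string rather than only the original `UnicodeError` text. A standalone sketch (illustrative values, not repository code) of the difference:

```python
class EssaySetRequestError(Exception):
    pass

ex = UnicodeError("ordinal not in range(128)")
msg = "Could not parse essay text into ascii: {}".format(ex)

print(str(EssaySetRequestError(ex)))   # ordinal not in range(128)
print(str(EssaySetRequestError(msg)))  # Could not parse essay text into ascii: ordinal not in range(128)
```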
@@ -2,24 +2,26 @@
 Extracts features from training set and test set essays
 """
-import numpy
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 import os
 from itertools import chain
 import operator
 import logging
+import numpy
+import nltk
+import sys
+from sklearn.feature_extraction.text import CountVectorizer
 from errors import *
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -79,9 +81,8 @@ class FeatureExtractor(object):
             sum([len(essay) for essay in essay_set._cleaned_essays]))
         # Gets the number and positions of grammar errors
-        good_pos_tags, bad_pos_positions = self._get_grammar_errors(
-            essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens
-        )
+        good_pos_tags, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
+                                                                    essay_set._cleaned_essays)
         # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
         # grammatical errors. Though this was not what the original author used, it is clearly what his code
         # implies, as if this is intended to be a true "grammar errors per character", we should have that
@@ -154,7 +155,7 @@ class FeatureExtractor(object):
         # SEE COMMENT AROUND LINE 85
         good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
-                                                                          essay_set._cleaned_essays, essay_set._tokens)
+                                                                          essay_set._cleaned_essays)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]
         length_array = numpy.array((
@@ -204,7 +205,7 @@ class FeatureExtractor(object):
         prompt_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             prompt_overlap.append(len([i for i in j if i in prompt_toks]))
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
@@ -212,7 +213,7 @@ class FeatureExtractor(object):
         expand_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             expand_overlap.append(len([i for i in j if i in expand_syns]))
             expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -221,7 +222,7 @@ class FeatureExtractor(object):
         return prompt_arr.copy()
-    def _get_grammar_errors(self, pos, essays, tokens):
+    def _get_grammar_errors(self, pos, essays):
         """
         Internal function to get the number of grammar errors in given text
@@ -251,7 +252,7 @@ class FeatureExtractor(object):
             start, end = bad_pos_tuples[m]
             for j in xrange(m + 1, len(bad_pos_tuples)):
                 lstart, lend = bad_pos_tuples[j]
-                if lstart >= start and lstart <= end:
+                if start <= lstart <= end:
                     bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                     to_delete.append(j)
@@ -268,7 +269,8 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions
-    def _get_good_pos_ngrams(self):
+    @staticmethod
+    def _get_good_pos_ngrams():
         """
         Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
......
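The chained comparison introduced around `bad_pos_tuples` (`start <= lstart <= end`) is the interval-overlap test used when merging adjacent error spans. A minimal, self-contained sketch of that merge pattern (illustrative names; the repository operates on POS-tag positions):

```python
def merge_overlapping(spans):
    """Merge [start, end] spans that overlap, assuming they are sorted by start."""
    merged = []
    for start, end in spans:
        if merged and merged[-1][0] <= start <= merged[-1][1]:
            # Current span begins inside the previous one: extend the previous span.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged


print(merge_overlapping([[0, 4], [2, 6], [9, 11]]))  # [[0, 6], [9, 11]]
```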
@@ -8,11 +8,12 @@ import logging
 import sys
 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 from errors import *
@@ -45,7 +46,6 @@ def grade(grader_data, submission):
     # Instantiates the Essay set which will carry our essay while it is being classified and graded.
     grader_set = EssaySet(essay_type="test")
-    feedback = {}
     # Retrieves the model and extractor we will be using
     model, extractor = _get_classifier_and_extractor(grader_data)
......
 import unittest
 import os
-from ease import create, grade
 import random
 import logging
 import json
+from ease import create, grade
 log = logging.getLogger(__name__)
 ROOT_PATH = os.path.abspath(__file__)
@@ -14,8 +16,10 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 50
 QUICK_TEST_LIMIT = 5
+# noinspection PyClassHasNoInit
 class DataLoader():
-    def load_text_files(self, pathname):
+    @staticmethod
+    def load_text_files(pathname):
         filenames = os.listdir(pathname)
         text = []
         for filename in filenames:
@@ -23,7 +27,8 @@ class DataLoader():
             text.append(data[:CHARACTER_LIMIT])
         return text
-    def load_json_file(self, filename):
+    @staticmethod
+    def load_json_file(filename):
         datafile = open(os.path.join(filename))
         data = json.load(datafile)
         return data
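Converting `load_text_files` and `load_json_file` to `@staticmethod` is safe because neither body touches `self`. A minimal sketch (not repository code) of the pattern:

```python
class Loader(object):
    @staticmethod
    def read_lines(path):
        # No use of self or cls, so the method can be static.
        with open(path) as handle:
            return handle.read().splitlines()

# Callable on the class or on an instance:
#   Loader.read_lines("some_file.txt")
#   Loader().read_lines("some_file.txt")
```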
@@ -34,38 +39,42 @@
         """
         pass
 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname
     def load_data(self):
         filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                       not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
+        # Sort so neg is first
         directories.sort()
-        #We need to have both a postive and a negative folder to classify
-        if len(directories)!=2:
+        # We need to have both a postive and a negative folder to classify
+        if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos
         return scores, text
 class JSONLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname
     def load_data(self):
         filenames = os.listdir(self.pathname)
-        files = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if os.path.isfile(os.path.join(self.pathname,f)) if f.endswith(".json")]
+        files = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if
+                 os.path.isfile(os.path.join(self.pathname, f)) if f.endswith(".json")]
         files.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(files) == 0:
             return [], []
@@ -76,19 +85,19 @@ class JSONLoader(DataLoader):
         all_scores = []
         all_text = []
-        for i in xrange(0,len(data)):
+        for i in xrange(0, len(data)):
             scores = [d['score'] for d in data[i]]
             text = [d['text'] for d in data[i]]
             if isinstance(scores[0], list):
                 new_text = []
                 new_scores = []
-                for i in xrange(0,len(scores)):
-                    text = scores[i]
-                    s = scores[i]
-                    for j in s:
+                for j in xrange(0, len(scores)):
+                    text = scores[j]
+                    s = scores[j]
+                    for k in s:
                         new_text.append(text)
-                        new_scores.append(j)
+                        new_scores.append(k)
                 text = new_text
                 scores = new_scores
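The renames in the hunk above (`i` → `j`, `j` → `k`) stop the nested loops from rebinding the outer loop variable. A standalone sketch (illustrative data, not repository code) of why that rebinding is dangerous:

```python
data = [[10, 20, 30], [40, 50]]
for i in range(len(data)):
    for i in range(len(data[i])):  # rebinds i; the outer index is lost
        pass
    print(i)  # prints 2, then 1 -- the inner loop's last value, not the outer indices 0 and 1
```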
@@ -97,12 +106,13 @@
         return all_scores, all_text
 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text
-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text, list):
             self.create_model_generic = False
         else:
@@ -112,7 +122,9 @@ class ModelCreator():
         if not self.create_model_generic:
             return create.create(self.text, self.scores, "")
         else:
-            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
+            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []),
+                                         self.scores)
 class Grader():
     def __init__(self, model_data):
@@ -122,7 +134,9 @@ class Grader():
         if isinstance(submission, basestring):
             return grade.grade(self.model_data, submission)
         else:
-            return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
+            return grade.grade_generic(self.model_data, submission.get('numeric_values', []),
+                                       submission.get('textual_values', []))
 class GenericTest(object):
     loader = DataLoader
@@ -137,11 +151,11 @@ class GenericTest(object):
         return scores, text
     def generic_setup(self, scores, text):
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
         shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -159,45 +173,46 @@ class GenericTest(object):
         grader = Grader(results)
         results = grader.grade(self.text[0])
-        assert results['success']==True
+        assert results['success'] == True
     def scoring_accuracy(self):
         random.seed(1)
         model_creator = ModelCreator(self.scores, self.text)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         cv_kappa = results['cv_kappa']
         cv_mae = results['cv_mean_absolute_error']
-        assert cv_kappa>=self.expected_kappa_min
-        assert cv_mae <=self.expected_mae_max
+        assert cv_kappa >= self.expected_kappa_min
+        assert cv_mae <= self.expected_mae_max
     def generic_model_creation_and_grading(self):
         log.info(self.scores)
         log.info(self.text)
-        score_subset = [random.randint(0,100) for i in xrange(0,min([QUICK_TEST_LIMIT, len(self.scores)]))]
+        score_subset = [random.randint(0, 100) for i in xrange(0, min([QUICK_TEST_LIMIT, len(self.scores)]))]
         text_subset = self.text[:QUICK_TEST_LIMIT]
         text_subset = {
-            'textual_values' : [[t] for t in text_subset],
-            'numeric_values' : [[1] for i in xrange(0,len(text_subset))]
+            'textual_values': [[t] for t in text_subset],
+            'numeric_values': [[1] for i in xrange(0, len(text_subset))]
         }
         model_creator = ModelCreator(score_subset, text_subset)
         results = model_creator.create_model()
-        assert results['success']==True
+        assert results['success'] == True
         grader = Grader(results)
         test_text = {
-            'textual_values' : [self.text[0]],
-            'numeric_values' : [1]
+            'textual_values': [self.text[0]],
+            'numeric_values': [1]
         }
         results = grader.grade(test_text)
-        assert results['success']==True
+        assert results['success'] == True
-class PolarityTest(unittest.TestCase,GenericTest):
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"
-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -214,12 +229,13 @@ class PolarityTest(unittest.TestCase,GenericTest):
     def test_generic_model_creation_and_grading(self):
         self.generic_model_creation_and_grading()
 class JSONTest(GenericTest):
     loader = JSONLoader
     data_path = "data/json_data"
-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -227,10 +243,11 @@ class JSONTest(GenericTest):
         self.scores, self.text = self.load_data()
         return self.scores, self.text
 def test_loop():
     json_test = JSONTest()
     scores, text = json_test.setUp()
-    for i in xrange(0,len(scores)):
+    for i in xrange(0, len(scores)):
         json_test.generic_setup(scores[i], text[i])
         yield json_test.model_creation_and_grading
         yield json_test.scoring_accuracy
......
 from unittest import TestCase
 from nose.tools import assert_equal
 from mock import patch
+from ease.util_functions import spell_correct
@@ -35,7 +36,6 @@ class SpellCheckUnitTest(TestCase):
     @patch("util_functions.os.popen")
     def test_aspell_not_found(self, popen_mock):
         # Expected behavior when aspell is not installed is to return the original
         # string with no corrections.
         popen_mock.side_effect = OSError
......
@@ -23,9 +23,9 @@ log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
@@ -100,7 +100,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if (len(incorrect[i]) > 10):
+        if len(incorrect[i]) > 10:
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -167,12 +167,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
     """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
-    dict_matrix = dict.fit_transform(essays)
+    dictionary = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dictionary.fit_transform(essays)
     set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if (med_score == 0):
+    if med_score == 0:
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -190,12 +190,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
         fish_vals.append(fish_val)
     cutoff = 1
-    if (len(fish_vals) > max_features_pass_2):
+    if len(fish_vals) > max_features_pass_2:
         cutoff = sorted(fish_vals)[max_features_pass_2]
     good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])
-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
+    get_var = lambda search_list, ind: [search_list[i] for i in ind]
+    vocab = get_var(dictionary.get_feature_names(), good_cols)
     return vocab
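The `getVar` → `get_var` rename fixes the naming style, though PEP 8 also discourages binding a lambda to a name at all; a plain comprehension does the same column selection. A self-contained sketch (illustrative data, not repository code):

```python
feature_names = ["alpha", "beta", "gamma", "delta"]
good_cols = [0, 2]

get_var = lambda search_list, ind: [search_list[i] for i in ind]  # the form the diff keeps
subset = [feature_names[i] for i in good_cols]                    # equivalent comprehension

assert get_var(feature_names, good_cols) == subset == ["alpha", "gamma"]
```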
@@ -219,14 +219,13 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         chunks.append(range(range_min, range_max))
     preds = []
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    chunk_vec = numpy.asarray(range(0, len(chunks)))
     for i in xrange(0, len(chunks)):
         loop_inds = list(
             chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return (all_preds)
+    return all_preds
 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
@@ -260,7 +259,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0
-    if (num_ratings > 1):
+    if num_ratings > 1:
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
......