Commit 2e6cb8e5 authored by Hugh Brown, committed by Vik Paruchuri

./grade.py: W391 blank line at end of file

parent 0d7ac804
[pep8]
ignore=E501,E712,E711
@@ -7,22 +7,23 @@ import sys
 import logging
 import numpy

-#Define base path and add to sys path
+# Define base path and add to sys path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)

-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import model_creator
 import util_functions
 import predictor_set
 import predictor_extractor

-#Make a log
+# Make a log
 log = logging.getLogger(__name__)

-def create(text,score,prompt_string):
+
+def create(text, score, prompt_string):
     """
     Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
     TODO: Remove model path argument, it is needed for now to support legacy code
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
     prompt_string - the common prompt for the set of essays
     """

-    #Initialize a results dictionary to return
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
-               'score' : score, 'text' : text, 'prompt' : prompt_string}
+    # Initialize a results dictionary to return
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': util_functions.AlgorithmTypes.classification,
+               'score': score, 'text': text, 'prompt': prompt_string}

-    if len(text)!=len(score):
+    if len(text) != len(score):
         msg = "Target and text lists must be same length."
         results['errors'].append(msg)
         log.exception(msg)
         return results

-    #Decide what algorithm to use (regression or classification)
+    # Decide what algorithm to use (regression or classification)
     try:
-        #Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score)))>5:
+        # Count the number of unique score points in the score list
+        if len(util_functions.f7(list(score))) > 5:
             type = util_functions.AlgorithmTypes.regression
         else:
             type = util_functions.AlgorithmTypes.classification
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
         type = util_functions.AlgorithmTypes.regression

     try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
+        # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
         e_set = model_creator.create_essay_set(text, score, prompt_string)
     except:
         msg = "essay set creation failed."
         results['errors'].append(msg)
         log.exception(msg)
     try:
-        #Gets features from the essay set and computes error
+        # Gets features from the essay set and computes error
         feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
         results['algorithm'] = type
-        results['success']=True
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
     return results
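
For context: `create` is the training entry point that the `ModelCreator` test helper further down in this commit calls. A minimal usage sketch with made-up essays and scores; real training needs far more data for the cross-validated kappa to mean anything:

    import create

    texts = ["The cell membrane controls what enters and leaves the cell."] * 10 + \
            ["Cells are small."] * 10
    scores = [3] * 10 + [1] * 10
    prompt = "Explain the function of the cell membrane."

    results = create.create(texts, scores, prompt)
    if results['success']:
        # feature_ext and classifier are what grade.grade() later consumes
        print results['cv_kappa'], results['cv_mean_absolute_error']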

-def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
     """
     Creates a model from a generic list numeric values and text values
     numeric_values - A list of lists that are the predictors
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     algorithm - the type of algorithm that will be used
     """

-    #Initialize a result dictionary to return.
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
+    # Initialize a result dictionary to return.
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

-    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
+    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
         msg = "Target, numeric features, and text features must all be the same length."
         results['errors'].append(msg)
         log.exception(msg)
         return results

     try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
+        # Initialize a predictor set object that encapsulates all of the text and numeric predictors
         pset = predictor_set.PredictorSet(type="train")
         for i in xrange(0, len(numeric_values)):
             pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -107,13 +108,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
         log.exception(msg)

     try:
-        #Extract all features and then train a classifier with the features
+        # Extract all features and then train a classifier with the features
         feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
-        results['success']=True
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
...
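
`create_generic` is the same flow for mixed predictors: each row pairs a list of numeric features with a list of text features. A hedged sketch of the expected shapes (all values invented):

    import create
    import util_functions

    numeric_values = [[1, 10], [2, 0], [1, 5], [3, 2]]
    textual_values = [["clear and complete answer"], ["off topic"],
                      ["partially correct"], ["strong response"]]
    target = [1, 0, 1, 1]

    results = create.create_generic(numeric_values, textual_values, target,
                                    algorithm=util_functions.AlgorithmTypes.classification)
    print results['success'], results['errors']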
@@ -15,11 +15,12 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)

-MAXIMUM_ESSAY_LENGTH=20000
+MAXIMUM_ESSAY_LENGTH = 20000


 class EssaySet(object):
     def __init__(self, type="train"):
@@ -30,17 +31,17 @@ class EssaySet(object):
             type = "train"

         self._type = type
-        self._score=[]
-        self._text=[]
-        self._id=[]
-        self._clean_text=[]
-        self._tokens=[]
-        self._pos=[]
-        self._clean_stem_text=[]
+        self._score = []
+        self._text = []
+        self._id = []
+        self._clean_text = []
+        self._tokens = []
+        self._pos = []
+        self._clean_stem_text = []
         self._generated = []
         self._prompt = ""
-        self._spelling_errors=[]
-        self._markup_text=[]
+        self._spelling_errors = []
+        self._markup_text = []

     def add_essay(self, essay_text, essay_score, essay_generated=0):
         """
@@ -58,35 +59,35 @@ class EssaySet(object):
         # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
         try:
-            essay_text=essay_text.encode('ascii', 'ignore')
-            if len(essay_text)<5:
-                essay_text="Invalid essay."
+            essay_text = essay_text.encode('ascii', 'ignore')
+            if len(essay_text) < 5:
+                essay_text = "Invalid essay."
         except:
             log.exception("Could not parse essay into ascii.")

         try:
-            #Try conversion of types
-            essay_score=int(essay_score)
-            essay_text=str(essay_text)
+            # Try conversion of types
+            essay_score = int(essay_score)
+            essay_text = str(essay_text)
         except:
-            #Nothing needed here, will return error in any case.
-            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
+            # Nothing needed here, will return error in any case.
+            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))

-        if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
+        if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
             and (essay_generated == 0 or essay_generated == 1):
             self._id.append(max_id + 1)
             self._score.append(essay_score)
             # Clean text by removing non digit/work/punctuation characters
             try:
-                essay_text=str(essay_text.encode('ascii', 'ignore'))
+                essay_text = str(essay_text.encode('ascii', 'ignore'))
             except:
-                essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
-            cleaned_essay=util_functions.sub_chars(essay_text).lower()
-            if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
-                cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
+                essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
+            cleaned_essay = util_functions.sub_chars(essay_text).lower()
+            if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
+                cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
             self._text.append(cleaned_essay)
             # Spell correct text using aspell
-            cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
+            cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
             self._clean_text.append(cleaned_text)
             self._spelling_errors.append(spell_errors)
             self._markup_text.append(markup_text)
@@ -112,7 +113,7 @@ class EssaySet(object):
         prompt_text should be a string.
         Returns the prompt as a confirmation.
         """
-        if(type(prompt_text) == type("text")):
+        if(isinstance(prompt_text, type("text"))):
             self._prompt = util_functions.sub_chars(prompt_text)
             ret = self._prompt
         else:
...
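
For orientation, the parallel lists initialized in `__init__` are appended to in lockstep by `add_essay`: raw text, cleaned text, spell-corrected text, spelling-error counts, and markup all share an index. A brief lifecycle sketch (essays and scores invented):

    from essay_set import EssaySet

    e_set = EssaySet(type="train")
    e_set.update_prompt("Explain the water cycle.")
    e_set.add_essay("Water evaporates, condenses into clouds, and falls as rain.", 2)
    e_set.add_essay("Rain falls down.", 1)
    # all per-essay lists stay aligned index-for-index
    print len(e_set._text), len(e_set._clean_text), e_set._spelling_errors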
@@ -21,6 +21,8 @@
 import math

 ## From dendropy.mathlib.probability
+
+
 def hypergeometric_pmf(x, m, n, k):
     """
     Given a population consisting of `m` items of class M and `n` items of class N,
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
     # float' with large numbers
     # return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
     a = math.log(binomial_coefficient(m, x))
-    b = math.log(binomial_coefficient(n, k-x))
-    c = math.log(binomial_coefficient(m+n, k))
-    return math.exp(a+b-c)
+    b = math.log(binomial_coefficient(n, k - x))
+    c = math.log(binomial_coefficient(m + n, k))
+    return math.exp(a + b - c)

 ## From dendropy.mathlib.probability
+
+
 def binomial_coefficient(population, sample):
     "Returns `population` choose `sample`."
     s = max(sample, population - sample)
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
         return 1
     numerator = 1
     denominator = 1
-    for i in xrange(s+1, population + 1):
+    for i in xrange(s + 1, population + 1):
         numerator *= i
         denominator *= (i - s)
-    return numerator/denominator
+    return numerator / denominator
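
A quick numeric check of the two helpers: the log-space computation returns p(x) = choose(m, x) * choose(n, k-x) / choose(m+n, k). Drawing k=4 items from a population of m=5 class-M and n=5 class-N items, the chance of exactly x=2 class-M items is C(5,2) * C(5,2) / C(10,4) = 100/210, about 0.476:

    print binomial_coefficient(10, 4)     # 210
    print hypergeometric_pmf(2, 5, 5, 4)  # ~0.476190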

 ## From dendropy.mathlib.statistics
+
+
 class FishersExactTest(object):
     """
     Given a 2x2 table:
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
         b = table[0][1]
         c = table[1][0]
         d = table[1][1]
-        return hypergeometric_pmf(a, a+b, c+d, a+c)
+        return hypergeometric_pmf(a, a + b, c + d, a + c)
     probability_of_table = staticmethod(probability_of_table)

     def __init__(self, table):
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
         Returns a copy of table such that all the values
         are rotated clockwise once.
         """
-        return [ [ table[1][0], table[0][0] ],
-                 [ table[1][1], table[0][1] ] ]
+        return [[table[1][0], table[0][0]],
+                [table[1][1], table[0][1]]]

     def _min_rotation(self):
         """
@@ -241,8 +247,9 @@ extreme.
             p_vals.append(p)
         return sum(p_vals) + p0

+
 def assert_almost_equal(v1, v2, prec=8):
-    if abs(v1-v2) <= 10**(-prec):
+    if abs(v1 - v2) <= 10 ** (-prec):
         print "OK: {} == {}".format(v1, v2)
     else:
         print "FAIL: {} != {}".format(v1, v2)
...
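
Worked example for the table probability above: for the 2x2 table [[a, b], [c, d]] = [[1, 2], [3, 4]], p = C(a+b, a) * C(c+d, c) / C(a+b+c+d, a+c) = C(3,1) * C(7,3) / C(10,4) = 3 * 35 / 210 = 0.5, which the module's own test helper can confirm:

    p = FishersExactTest.probability_of_table([[1, 2], [3, 4]])
    assert_almost_equal(p, 0.5)  # prints "OK: 0.5 == 0.5"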
@@ -8,24 +8,25 @@ import os
 import numpy
 import logging

-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 import predictor_extractor
 import predictor_set
 import util_functions

-#Imports needed to unpickle grader data
+# Imports needed to unpickle grader data
 import feature_extractor
 import sklearn.ensemble
 import math

 log = logging.getLogger(__name__)

-def grade(grader_data,submission):
+
+def grade(grader_data, submission):
     """
     Grades a specified submission using specified models
     grader_data - A dictionary:
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
     submission - The student submission (string)
     """

-    #Initialize result dictionary
-    results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
-    has_error=False
+    # Initialize result dictionary
+    results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
+    has_error = False

-    grader_set=EssaySet(type="test")
+    grader_set = EssaySet(type="test")

-    #This is to preserve legacy functionality
+    # This is to preserve legacy functionality
     if 'algorithm' not in grader_data:
         grader_data['algorithm'] = util_functions.AlgorithmTypes.classification

     try:
-        #Try to add essay to essay set object
-        grader_set.add_essay(str(submission),0)
+        # Try to add essay to essay set object
+        grader_set.add_essay(str(submission), 0)
         grader_set.update_prompt(str(grader_data['prompt']))
     except:
         results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
-        has_error=True
+        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
     try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
-        results['score']=int(grader_data['model'].predict(grader_feats)[0])
-    except :
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
+        feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
+        results['score'] = int(grader_data['model'].predict(grader_feats)[0])
+    except:
         results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
     except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

     if not has_error:
-        #If the essay is just a copy of the prompt, return a 0 as the score
+        # If the essay is just a copy of the prompt, return a 0 as the score
         if(feedback['too_similar_to_prompt']):
-            results['score']=0
-            results['correct']=False
+            results['score'] = 0
+            results['correct'] = False

-        results['success']=True
+        results['success'] = True

-        #Generate short form output--number of problem areas identified in feedback
+        # Generate short form output--number of problem areas identified in feedback

-        #Add feedback to results if available
+        # Add feedback to results if available
         results['feedback'] = {}
         if 'topicality' in feedback and 'prompt_overlap' in feedback:
             results['feedback'].update({
-                'topicality' : feedback['topicality'],
-                'prompt-overlap' : feedback['prompt_overlap'],
+                'topicality': feedback['topicality'],
+                'prompt-overlap': feedback['prompt_overlap'],
             })

         results['feedback'].update(
             {
-                'spelling' : feedback['spelling'],
-                'grammar' : feedback['grammar'],
-                'markup-text' : feedback['markup_text'],
+                'spelling': feedback['spelling'],
+                'grammar': feedback['grammar'],
+                'markup-text': feedback['markup_text'],
             }
         )

     else:
-        #If error, success is False.
-        results['success']=False
+        # If error, success is False.
+        results['success'] = False

     return results
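
`grade` consumes the same dictionary that training produces: 'prompt', 'extractor', 'model', and 'score' (the training score list, used for the confidence bounds). A hedged sketch, assuming a model pickled earlier by model_creator.dump_model_to_file; the path is illustrative:

    import pickle
    import grade

    grader_data = pickle.load(open("models/biology_prompt.p"))  # hypothetical file
    results = grade.grade(grader_data, "The cell membrane keeps the inside of the cell separate.")
    if results['success']:
        print results['score'], results['confidence'], results['feedback']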

 def grade_generic(grader_data, numeric_features, textual_features):
     """
     Grades a set of numeric and textual features using a generic model
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
     textual_features - list of textual feature to predict on
     """

-    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
+    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}

-    has_error=False
+    has_error = False

-    #Try to find and load the model file
-    grader_set=predictor_set.PredictorSet(type="test")
+    # Try to find and load the model file
+    grader_set = predictor_set.PredictorSet(type="test")

-    #Try to add essays to essay set object
+    # Try to add essays to essay set object
     try:
-        grader_set.add_row(numeric_features, textual_features,0)
+        grader_set.add_row(numeric_features, textual_features, 0)
     except:
         results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
-        has_error=True
+        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
     try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        results['score']=grader_data['model'].predict(grader_feats)[0]
-    except :
+        grader_feats = grader_data['extractor'].gen_feats(grader_set)
+        results['score'] = grader_data['model'].predict(grader_feats)[0]
+    except:
         results['errors'].append("Could not extract features and score essay.")
-        has_error=True
+        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
     except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

     if not has_error:
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
     return results

-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
     """
     Determines a confidence in a certain score, given proper input parameters
     algorithm- from util_functions.AlgorithmTypes
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
     max_score=max(numpy.asarray(scores))
     if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
         #If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
+        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
         #TODO: Normalize confidence somehow here
         confidence=raw_confidence
     elif hasattr(model, "predict"):
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
         confidence = 0

     return confidence
-
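
The classification branch reads the confidence off the `predict_proba` row at column `score - min_score`, i.e. the predicted probability of the score point the model actually chose. (Note that `grade_generic` above calls this with four arguments against a five-parameter signature; the surrounding try/except turns that into a logged exception.) A standalone sketch of the same indexing idea with a toy classifier; the data is invented:

    import numpy
    from sklearn.ensemble import GradientBoostingClassifier

    X = numpy.array([[0.1], [0.2], [0.8], [0.9], [0.15], [0.85]])
    y = numpy.array([0, 0, 1, 1, 0, 1])
    model = GradientBoostingClassifier(n_estimators=10).fit(X, y)

    probs = model.predict_proba([[0.95]])[0]  # one probability per class
    score = model.predict([[0.95]])[0]
    confidence = probs[int(score - y.min())]
    print score, confidence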
-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
 import numpy
 import re
@@ -19,7 +19,8 @@ import feature_extractor
 import logging
 import predictor_extractor

-log=logging.getLogger()
+log = logging.getLogger()
+

 def read_in_test_data(filename):
     """
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
     prompt_string = open(filename).read()
     return prompt_string

-def read_in_test_data_twocolumn(filename,sep=","):
+
+def read_in_test_data_twocolumn(filename, sep=","):
     """
     Reads in a two column version of the test data.
     Filename must point to a delimited file.
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     return x

-def get_cv_error(clf,feats,scores):
+
+def get_cv_error(clf, feats, scores):
     """
     Gets cross validated error for a given classifier, set of features, and scores
     clf - classifier
     feats - features to feed into the classified and cross validate over
     scores - scores associated with the features -- feature row 1 associates with score 1, etc.
     """
-    results={'success' : False, 'kappa' : 0, 'mae' : 0}
+    results = {'success': False, 'kappa': 0, 'mae': 0}
     try:
-        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
-        err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
-        kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-        results['mae']=err
-        results['kappa']=kappa
-        results['success']=True
+        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
+        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
+        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
+        results['mae'] = err
+        results['kappa'] = kappa
+        results['success'] = True
     except ValueError:
-        #If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
+        # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
         log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
     except:
         log.exception("Error getting cv error estimates.")

     return results

+
 def get_algorithms(type):
     """
     Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
@@ -116,14 +120,14 @@ def get_algorithms(type):
     """
     if type == util_functions.AlgorithmTypes.classification:
         clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+            max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
+            max_depth=4, random_state=1, min_samples_leaf=3)
     else:
         clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-            max_depth=4, random_state=1,min_samples_leaf=3)
+            max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
+            max_depth=4, random_state=1, min_samples_leaf=3)

     return clf, clf2
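
One caveat for anyone reusing this: `learn_rate` is the parameter name from the scikit-learn versions this code targets; later releases renamed it. On a current scikit-learn the equivalent construction would be something like the sketch below (not part of this commit):

    from sklearn.ensemble import GradientBoostingClassifier

    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                     max_depth=4, random_state=1, min_samples_leaf=3)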
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
     train_feats = f.gen_feats(predictor_set)

-    clf,clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
+    clf, clf2 = get_algorithms(type)
+    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

     try:
         set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
     train_feats = f.gen_feats(essays)

     set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score)))>5:
+    if len(util_functions.f7(list(set_score))) > 5:
         type = util_functions.AlgorithmTypes.regression
     else:
         type = util_functions.AlgorithmTypes.classification

-    clf,clf2 = get_algorithms(type)
-    cv_error_results=get_cv_error(clf2,train_feats,essays._score)
+    clf, clf2 = get_algorithms(type)
+    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

     try:
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results

+
 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
     """
     Writes out a model to a file.
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
     classifier is a trained classifier
     model_path is the path of write out the model file to
     """
-    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
+    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
     pickle.dump(model_file, file=open(model_path, "w"))

-def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
+
+def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
     """
     Function that creates essay set, extracts features, and writes out model
     See above functions for argument descriptions
     """
-    essay_set=create_essay_set(text_score,prompt)
-    feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
-    dump_model_to_file(prompt,feature_ext,clf,model_path)
+    essay_set = create_essay_set(text_score, prompt)
+    feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
+    dump_model_to_file(prompt, feature_ext, clf, model_path)
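
`dump_model_to_file` pickles everything `grade.grade` later needs into one dictionary, so loading is a single pickle.load (the path below is illustrative). As an aside, `create_essay_set_and_dump_model` passes `text_score`, a name not defined in its scope, so this helper appears unexercised; callers presumably compose the pieces directly:

    import pickle

    model_file = pickle.load(open("models/sample_model.p"))  # hypothetical path
    # contains 'prompt', 'extractor', 'model', 'text', 'score'
    print model_file['prompt']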
@@ -16,17 +16,18 @@ import logging
 import math
 from feature_extractor import FeatureExtractor

-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

 log = logging.getLogger(__name__)

+
 class PredictorExtractor(object):
     def __init__(self):
         self._extractors = []
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

-        div_length=len(p_set._essay_sets)
-        if div_length==0:
-            div_length=1
+        div_length = len(p_set._essay_sets)
+        if div_length == 0:
+            div_length = 1

-        #Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200/div_length))
-        for i in xrange(0,len(p_set._essay_sets)):
+        # Ensures that even with a large amount of input textual features, training time stays reasonable
+        max_feats2 = int(math.floor(200 / div_length))
+        for i in xrange(0, len(p_set._essay_sets)):
             self._extractors.append(FeatureExtractor())
             self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
             self._initialized = True
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
         Generates features based on an iput p_set
         p_set - PredictorSet
         """
-        if self._initialized!=True:
+        if self._initialized != True:
             error_message = "Dictionaries have not been initialized."
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

         textual_features = []
-        for i in xrange(0,len(p_set._essay_sets)):
+        for i in xrange(0, len(p_set._essay_sets)):
             textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

         textual_matrix = numpy.concatenate(textual_features, axis=1)
...
@@ -11,12 +11,13 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
+

 class PredictorSet(object):
-    def __init__(self, type = "train"):
+    def __init__(self, type="train"):
         """
         Initialize variables and check essay set type
         """
@@ -24,13 +25,13 @@ class PredictorSet(object):
             type = "train"

         self._type = type
-        self._target=[]
-        self._textual_features=[]
-        self._numeric_features=[]
-        self._essay_sets=[]
+        self._target = []
+        self._textual_features = []
+        self._numeric_features = []
+        self._essay_sets = []

     def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
         if not isinstance(target, (int, long, float)):
             error_message = "Target is not a numeric value."
             log.exception(error_message)
@@ -46,8 +47,8 @@ class PredictorSet(object):
             log.exception(error_message)
             raise util_functions.InputError(textual_features, error_message)

-        #Do some length checking for parameters
-        if len(self._numeric_features)>0:
+        # Do some length checking for parameters
+        if len(self._numeric_features) > 0:
             numeric_length = len(self._numeric_features[-1])
             current_numeric_length = len(numeric_features)
             if numeric_length != current_numeric_length:
@@ -55,7 +56,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-        if len(self._textual_features)>0:
+        if len(self._textual_features) > 0:
             textual_length = len(self._textual_features[-1])
             current_textual_length = len(textual_features)
             if textual_length != current_textual_length:
@@ -63,9 +64,9 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(textual_features, error_message)

-        #Now check to see if text features and numeric features are individually correct
-        for i in xrange(0,len(numeric_features)):
+        # Now check to see if text features and numeric features are individually correct
+        for i in xrange(0, len(numeric_features)):
             try:
                 numeric_features[i] = float(numeric_features[i])
             except:
@@ -73,8 +74,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-        for i in xrange(0,len(textual_features)):
+        for i in xrange(0, len(textual_features)):
             try:
                 textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
             except:
@@ -82,19 +82,18 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(textual_features, error_message)

-        #Create essay sets for textual features if needed
-        if len(self._textual_features)==0:
-            for i in xrange(0,len(textual_features)):
+        # Create essay sets for textual features if needed
+        if len(self._textual_features) == 0:
+            for i in xrange(0, len(textual_features)):
                 self._essay_sets.append(essay_set.EssaySet(type=self._type))

-        #Add numeric and textual features
+        # Add numeric and textual features
         self._numeric_features.append(numeric_features)
         self._textual_features.append(textual_features)

-        #Add targets
+        # Add targets
         self._target.append(target)

-        #Add textual features to essay sets
-        for i in xrange(0,len(textual_features)):
+        # Add textual features to essay sets
+        for i in xrange(0, len(textual_features)):
             self._essay_sets[i].add_essay(textual_features[i], target)
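
`add_row` validates each row, coerces numerics to float and text to ascii, and keeps one `EssaySet` per textual column. A short sketch of feeding it mixed predictors (values invented):

    import predictor_set

    pset = predictor_set.PredictorSet(type="train")
    pset.add_row([1.5, 10], ["first text feature"], 2)
    pset.add_row([2.0, 3], ["second text feature"], 0)
    # one essay set per textual column, one target per row
    print len(pset._essay_sets), len(pset._target)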
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 100
 QUICK_TEST_LIMIT = 5

+
 class DataLoader():
     def load_text_files(self, pathname):
         filenames = os.listdir(pathname)
@@ -28,34 +29,36 @@ class DataLoader():
         """
         pass

+
 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
-        directories = [os.path.abspath(os.path.join(self.pathname,f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname,f)) and f in ["neg", "pos"]]
+        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]

-        #Sort so neg is first
+        # Sort so neg is first
         directories.sort()

-        #We need to have both a postive and a negative folder to classify
-        if len(directories)!=2:
+        # We need to have both a postive and a negative folder to classify
+        if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))

         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
-        scores = [0 for i in xrange(0,len(neg))] + [1 for i in xrange(0,len(pos))]
+        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos

         return scores, text

+
 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text[0], basestring):
             self.create_model_generic = False
         else:
@@ -67,6 +70,7 @@ class ModelCreator():
         else:
             return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)

+
 class Grader():
     def __init__(self, model_data):
         self.model_data = model_data
@@ -77,6 +81,7 @@ class Grader():
         else:
             return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))

+
 class GenericTest(object):
     loader = DataLoader
     data_path = ""
@@ -87,11 +92,11 @@ class GenericTest(object):
         data_loader = self.loader(os.path.join(TEST_PATH, self.data_path))
         scores, text = data_loader.load_data()

-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
        shuffled_text = []
-        indices = [i for i in xrange(0,len(scores))]
+        indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -121,12 +126,13 @@ class GenericTest(object):
         self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
         self.assertLessEqual(cv_mae, self.expected_mae_max)

-class PolarityTest(unittest.TestCase,GenericTest):
+
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
...
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Requires aspell to be installed and added to the path
 from external_code.fisher import fisher

 aspell_path = "aspell"
@@ -14,17 +14,18 @@ import pickle
 import logging
 import sys

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"

+
 class AlgorithmTypes(object):
     """
     Defines what types of algorithm can be used
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
     regression = "regression"
     classification = "classifiction"

+
 def create_model_path(model_path):
     """
     Creates a path to model files
     model_path - string
     """
     if not model_path.startswith("/") and not model_path.startswith("models/"):
-        model_path="/" + model_path
+        model_path = "/" + model_path
     if not model_path.startswith("models"):
         model_path = "models" + model_path
     if not model_path.endswith(".p"):
-        model_path+=".p"
+        model_path += ".p"

     return model_path

+
 def sub_chars(string):
     """
     Strips illegal characters from a string. Used to sanitize input essays.
@@ -53,7 +56,7 @@ def sub_chars(string):
     Returns sanitized string.
     string - string
     """
-    #Define replacement patterns
+    # Define replacement patterns
     sub_pat = r"[^A-Za-z\.\?!,';:]"
     char_pat = r"\."
     com_pat = r","
@@ -63,9 +66,9 @@ def sub_chars(string):
     col_pat = r":"
     whitespace_pat = r"\s{1,}"

-    #Replace text. Ordering is very important!
+    # Replace text. Ordering is very important!
     nstring = re.sub(sub_pat, " ", string)
-    nstring = re.sub(char_pat," .", nstring)
+    nstring = re.sub(char_pat, " .", nstring)
     nstring = re.sub(com_pat, " ,", nstring)
     nstring = re.sub(ques_pat, " ?", nstring)
     nstring = re.sub(excl_pat, " !", nstring)
@@ -84,7 +87,7 @@ def spell_correct(string):
     string - string
     """

-    #Create a temp file so that aspell could be used
+    # Create a temp file so that aspell could be used
     f = open('tmpfile', 'w')
     f.write(string)
     f_path = os.path.abspath(f.name)
@@ -93,16 +96,16 @@ def spell_correct(string):
         p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
     except:
         log.exception("Could not find aspell, so could not spell correct!")
-        #Return original string if aspell fails
-        return string,0, string
+        # Return original string if aspell fails
+        return string, 0, string

-    #Aspell returns a list of incorrect words with the above flags
+    # Aspell returns a list of incorrect words with the above flags
     incorrect = p.readlines()
     p.close()
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
         if(len(incorrect[i]) > 10):
-            #Reformat aspell output to make sense
+            # Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
                 begstring = incorrect[i][2:match.start()]
@@ -117,19 +120,19 @@ def spell_correct(string):
                 incorrect_words.append(begword)
                 correct_spelling.append(sug)

-    #Create markup based on spelling errors
+    # Create markup based on spelling errors
     newstring = string
     markup_string = string
-    already_subbed=[]
+    already_subbed = []
     for i in range(0, len(incorrect_words)):
         sub_pat = r"\b" + incorrect_words[i] + r"\b"
         sub_comp = re.compile(sub_pat)
         newstring = re.sub(sub_comp, correct_spelling[i], newstring)
         if incorrect_words[i] not in already_subbed:
-            markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
+            markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
             already_subbed.append(incorrect_words[i])

-    return newstring,len(incorrect_words),markup_string
+    return newstring, len(incorrect_words), markup_string

 def ngrams(tokens, min_n, max_n):
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
     max_feats2 is the maximum number of features to consider in the second (final) pass
     Returns a list of words that constitute the significant vocabulary
     """
-    dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
+    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
     dict_mat = dict.fit_transform(text)
     set_score = numpy.asarray(score, dtype=numpy.int)
     med_score = numpy.median(set_score)
@@ -335,6 +338,7 @@ def calc_list_average(l):

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

+
 def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
     """
     Calculates kappa correlation between rater_a and rater_b.
...
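
`quadratic_weighted_kappa` is the agreement metric `get_cv_error` reports. The standard definition: with ratings binned over N score points, kappa = 1 - (sum_ij w_ij O_ij) / (sum_ij w_ij E_ij), where O counts rating co-occurrences, E is the outer product of the two raters' histograms normalized to the same total, and w_ij = (i - j)^2 / (N - 1)^2. A compact reference sketch of that formula (a restatement, not the module's own code, which is truncated here):

    import numpy

    def qwk_sketch(a, b):
        a, b = numpy.asarray(a), numpy.asarray(b)
        lo, hi = min(a.min(), b.min()), max(a.max(), b.max())
        n = hi - lo + 1
        observed = numpy.zeros((n, n))
        for i, j in zip(a - lo, b - lo):
            observed[i, j] += 1
        expected = numpy.outer(numpy.bincount(a - lo, minlength=n),
                               numpy.bincount(b - lo, minlength=n)) / float(len(a))
        weights = numpy.fromfunction(lambda i, j: ((i - j) ** 2) / float((n - 1) ** 2), (n, n))
        return 1.0 - (weights * observed).sum() / (weights * expected).sum()

    print qwk_sketch([1, 2, 3, 3], [1, 2, 2, 3])  # 0.8; 1.0 means perfect agreement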