Commit da78277e by gradyward

Cleaned up all of the files stylistically

parent a990b25e
@@ -7,7 +7,7 @@ import sys
 import logging
 import numpy
-#Define base path and add to sys path
+# Define base path and add to sys path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
@@ -24,6 +24,7 @@ import json
 #Make a log
 log = logging.getLogger(__name__)

+
 def dump_input_data(text, score):
     try:
         file_path = base_path + "/tests/data/json_data/"
@@ -32,14 +33,15 @@ def dump_input_data(text, score):
         filename = prefix + time_suffix + ".json"
         json_data = []
         for i in xrange(0, len(text)):
-            json_data.append({'text' : text[i], 'score' : score[i]})
+            json_data.append({'text': text[i], 'score': score[i]})
         with open(file_path + filename, 'w+') as outfile:
             json.dump(json_data, outfile)
     except:
         error = "Could not dump data to file."
         log.exception(error)

-def create(text,score,prompt_string, dump_data=False):
+
+def create(text, score, prompt_string, dump_data=False):
     """
     Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
     TODO: Remove model path argument, it is needed for now to support legacy code
@@ -53,11 +55,11 @@ def create(text,score,prompt_string, dump_data=False):
     algorithm = select_algorithm(score)
     #Initialize a results dictionary to return
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm,
-               'score' : score, 'text' : text, 'prompt' : prompt_string}
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
+               'score': score, 'text': text, 'prompt': prompt_string}

-    if len(text)!=len(score):
+    if len(text) != len(score):
         msg = "Target and text lists must be same length."
         results['errors'].append(msg)
         log.exception(msg)
@@ -72,13 +74,14 @@ def create(text,score,prompt_string, dump_data=False):
         log.exception(msg)

     try:
         #Gets features from the essay set and computes error
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm = algorithm)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
+                                                                                                      algorithm=algorithm)
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
         results['algorithm'] = algorithm
-        results['success']=True
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
@@ -87,7 +90,7 @@ def create(text,score,prompt_string, dump_data=False):
     return results

-def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
     """
     Creates a model from a generic list numeric values and text values
     numeric_values - A list of lists that are the predictors
@@ -99,10 +102,10 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     algorithm = select_algorithm(target)
     #Initialize a result dictionary to return.
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
+    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
+               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

-    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
+    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
         msg = "Target, numeric features, and text features must all be the same length."
         results['errors'].append(msg)
         log.exception(msg)
@@ -120,12 +123,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     try:
         #Extract all features and then train a classifier with the features
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
-        results['cv_kappa']=cv_error_results['kappa']
-        results['cv_mean_absolute_error']=cv_error_results['mae']
-        results['feature_ext']=feature_ext
-        results['classifier']=classifier
-        results['success']=True
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
+                                                                                                                 algorithm)
+        results['cv_kappa'] = cv_error_results['kappa']
+        results['cv_mean_absolute_error'] = cv_error_results['mae']
+        results['feature_ext'] = feature_ext
+        results['classifier'] = classifier
+        results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)
@@ -133,11 +137,12 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     return results

+
 def select_algorithm(score_list):
     #Decide what algorithm to use (regression or classification)
     try:
         #Count the number of unique score points in the score list
-        if len(util_functions.f7(list(score_list)))>5:
+        if len(util_functions.f7(list(score_list))) > 5:
             algorithm = util_functions.AlgorithmTypes.regression
         else:
             algorithm = util_functions.AlgorithmTypes.classification
...
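For orientation, a minimal sketch of how `create` is typically called. Assumptions, not shown in the diff: the module is importable as `create`, the interpreter is Python 2 (consistent with the `xrange`/`basestring` usage above), and the essays and scores are invented placeholder data:

```python
# Hypothetical usage of create(); texts and scores are placeholder data.
from create import create

texts = ["The cell wall protects the plant cell.",
         "Mitochondria produce energy for the cell.",
         "I like turtles."]
scores = [2, 2, 0]
prompt = "Explain the function of one cell organelle."

results = create(texts, scores, prompt)
if results['success']:
    print results['cv_kappa'], results['cv_mean_absolute_error']
else:
    print results['errors']
```

In practice a real training set needs enough essays per score point for the cross-validation behind `cv_kappa` to see every class in each fold.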
@@ -27,7 +27,7 @@ class EssaySet(object):
         """
         Initialize variables and check essay set type
         """
-        if(essaytype != "train" and essaytype != "test"):
+        if (essaytype != "train" and essaytype != "test"):
             essaytype = "train"

         self._type = essaytype
@@ -52,7 +52,7 @@ class EssaySet(object):
         Returns a confirmation that essay was added.
         """
         # Get maximum current essay id, or set to 0 if this is the first essay added
-        if(len(self._id) > 0):
+        if (len(self._id) > 0):
             max_id = max(self._id)
         else:
             max_id = 0
@@ -71,9 +71,10 @@ class EssaySet(object):
             essay_text = str(essay_text)
         except:
             # Nothing needed here, will return error in any case.
-            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
+            log.exception(
+                "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))

-        if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
+        if isinstance(essay_score, int) and isinstance(essay_text, basestring) \
                 and (essay_generated == 0 or essay_generated == 1):
             self._id.append(max_id + 1)
             self._score.append(essay_score)
@@ -83,7 +84,7 @@ class EssaySet(object):
             except:
                 essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
             cleaned_essay = util_functions.sub_chars(essay_text).lower()
-            if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
+            if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
                 cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
             self._text.append(cleaned_essay)
             # Spell correct text using aspell
@@ -113,7 +114,7 @@ class EssaySet(object):
         prompt_text should be a string.
         Returns the prompt as a confirmation.
         """
-        if(isinstance(prompt_text, basestring)):
+        if (isinstance(prompt_text, basestring)):
             self._prompt = util_functions.sub_chars(prompt_text)
             ret = self._prompt
         else:
@@ -134,7 +135,7 @@ class EssaySet(object):
         all_syns = []
         for word in e_toks:
             synonyms = util_functions.get_wordnet_syns(word)
-            if(len(synonyms) > max_syns):
+            if (len(synonyms) > max_syns):
                 synonyms = random.sample(synonyms, max_syns)
             all_syns.append(synonyms)
         new_essays = []
...
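The `EssaySet` surface used by the rest of this commit is small; a sketch of the training-side flow, assuming the module imports as `essay_set` (as `predictor_extractor.py` below does):

```python
# Sketch: building a training EssaySet by hand.
from essay_set import EssaySet

es = EssaySet(essaytype="train")  # anything other than "train"/"test" falls back to "train"
es.update_prompt("Explain the function of one cell organelle.")
es.add_essay("Mitochondria produce energy for the cell.", 2)  # text, integer score
```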
@@ -8,7 +8,7 @@ import os
 import numpy
 import logging
-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
@@ -25,7 +25,8 @@ import math
 log = logging.getLogger(__name__)

-def grade(grader_data,submission):
+
+def grade(grader_data, submission):
     """
     Grades a specified submission using specified models
     grader_data - A dictionary:
@@ -39,10 +40,10 @@ def grade(grader_data,submission):
     """
     #Initialize result dictionary
-    results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
-    has_error=False
-    grader_set=EssaySet(essaytype="test")
+    results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
+    has_error = False
+    grader_set = EssaySet(essaytype="test")
     feedback = {}

     model, extractor = get_classifier_and_ext(grader_data)
@@ -53,28 +54,29 @@ def grade(grader_data,submission):
     try:
         #Try to add essay to essay set object
-        grader_set.add_essay(str(submission),0)
+        grader_set.add_essay(str(submission), 0)
         grader_set.update_prompt(str(grader_data['prompt']))
     except Exception:
         error_message = "Essay could not be added to essay set:{0}".format(submission)
         log.exception(error_message)
         results['errors'].append(error_message)
-        has_error=True
+        has_error = True

     #Try to extract features from submission and assign score via the model
     try:
-        grader_feats=extractor.gen_feats(grader_set)
-        feedback=extractor.gen_feedback(grader_set,grader_feats)[0]
-        results['score']=int(model.predict(grader_feats)[0])
+        grader_feats = extractor.gen_feats(grader_set)
+        feedback = extractor.gen_feedback(grader_set, grader_feats)[0]
+        results['score'] = int(model.predict(grader_feats)[0])
     except Exception:
         error_message = "Could not extract features and score essay."
         log.exception(error_message)
         results['errors'].append(error_message)
-        has_error=True
+        has_error = True

     #Try to determine confidence level
     try:
-        results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'], grader_data['score'])
+        results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'],
+                                                     grader_data['score'])
     except Exception:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")
@@ -82,11 +84,11 @@ def grade(grader_data,submission):
     if not has_error:
         #If the essay is just a copy of the prompt, return a 0 as the score
-        if( 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
-            results['score']=0
-            results['correct']=False
+        if 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']:
+            results['score'] = 0
+            results['correct'] = False

-        results['success']=True
+        results['success'] = True

         #Generate short form output--number of problem areas identified in feedback
@@ -94,24 +96,25 @@ def grade(grader_data,submission):
         results['feedback'] = {}
         if 'topicality' in feedback and 'prompt_overlap' in feedback:
             results['feedback'].update({
-                'topicality' : feedback['topicality'],
-                'prompt-overlap' : feedback['prompt_overlap'],
+                'topicality': feedback['topicality'],
+                'prompt-overlap': feedback['prompt_overlap'],
             })

         results['feedback'].update(
             {
-                'spelling' : feedback['spelling'],
-                'grammar' : feedback['grammar'],
-                'markup-text' : feedback['markup_text'],
+                'spelling': feedback['spelling'],
+                'grammar': feedback['grammar'],
+                'markup-text': feedback['markup_text'],
             }
         )
     else:
         #If error, success is False.
-        results['success']=False
+        results['success'] = False

     return results

+
 def grade_generic(grader_data, numeric_features, textual_features):
     """
     Grades a set of numeric and textual features using a generic model
@@ -123,38 +126,38 @@ def grade_generic(grader_data, numeric_features, textual_features):
     textual_features - list of textual feature to predict on
     """
-    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
-    has_error=False
+    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
+    has_error = False

     #Try to find and load the model file
-    grader_set=predictor_set.PredictorSet(essaytype="test")
+    grader_set = predictor_set.PredictorSet(essaytype="test")
     model, extractor = get_classifier_and_ext(grader_data)

     #Try to add essays to essay set object
     try:
-        grader_set.add_row(numeric_features, textual_features,0)
+        grader_set.add_row(numeric_features, textual_features, 0)
     except Exception:
         error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
         log.exception(error_msg)
         results['errors'].append(error_msg)
-        has_error=True
+        has_error = True

     #Try to extract features from submission and assign score via the model
     try:
-        grader_feats=extractor.gen_feats(grader_set)
-        results['score']=model.predict(grader_feats)[0]
+        grader_feats = extractor.gen_feats(grader_set)
+        results['score'] = model.predict(grader_feats)[0]
     except Exception:
         error_msg = "Could not extract features and score essay."
         log.exception(error_msg)
         results['errors'].append(error_msg)
-        has_error=True
+        has_error = True

     #Try to determine confidence level
     try:
-        results['confidence'] = get_confidence_value(grader_data['algorithm'],model, grader_feats, results['score'])
+        results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
     except Exception:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")
@@ -164,7 +167,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
     return results

-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
     """
     Determines a confidence in a certain score, given proper input parameters
     algorithm- from util_functions.AlgorithmTypes
@@ -172,21 +176,23 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
     grader_feats - a row of features used by the model for classification/regression
     score - The score assigned to the submission by a prior model
     """
-    min_score=min(numpy.asarray(scores))
-    max_score=max(numpy.asarray(scores))
+    min_score = min(numpy.asarray(scores))
+    max_score = max(numpy.asarray(scores))
     if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
         #If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
+        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
         #TODO: Normalize confidence somehow here
-        confidence=raw_confidence
+        confidence = raw_confidence
     elif hasattr(model, "predict"):
         raw_confidence = model.predict(grader_feats)[0]
-        confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)), math.ceil(float(raw_confidence)) - float(raw_confidence))
+        confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)),
+                         math.ceil(float(raw_confidence)) - float(raw_confidence))
     else:
         confidence = 0

     return confidence

+
 def get_classifier_and_ext(grader_data):
     if 'classifier' in grader_data:
         model = grader_data['classifier']
...
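A sketch of the grading side. This assumes (as the keys read by `grade` and `get_classifier_and_ext` suggest, though the diff does not show it outright) that the `results` dictionary returned by `create` can serve directly as `grader_data`, since it carries `classifier`, `feature_ext`, `prompt`, `algorithm`, and `score`:

```python
# Hypothetical: score one new submission with a model trained by create().
from grade import grade

graded = grade(results, "Mitochondria supply the cell with energy.")
if graded['success']:
    print graded['score'], graded['confidence']
else:
    print graded['errors']
```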
-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
 import numpy
 import re
@@ -19,7 +19,8 @@ import feature_extractor
 import logging
 import predictor_extractor

-log=logging.getLogger()
+log = logging.getLogger()
+

 def read_in_test_data(filename):
     """
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
     prompt_string = open(filename).read()
     return prompt_string

-def read_in_test_data_twocolumn(filename,sep=","):
+
+def read_in_test_data_twocolumn(filename, sep=","):
     """
     Reads in a two column version of the test data.
     Filename must point to a delimited file.
@@ -86,21 +88,22 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     return x

-def get_cv_error(clf,feats,scores):
+
+def get_cv_error(clf, feats, scores):
     """
     Gets cross validated error for a given classifier, set of features, and scores
     clf - classifier
     feats - features to feed into the classified and cross validate over
     scores - scores associated with the features -- feature row 1 associates with score 1, etc.
     """
-    results={'success' : False, 'kappa' : 0, 'mae' : 0}
+    results = {'success': False, 'kappa': 0, 'mae': 0}
     try:
-        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
-        err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
-        kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-        results['mae']=err
-        results['kappa']=kappa
-        results['success']=True
+        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
+        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
+        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
+        results['mae'] = err
+        results['kappa'] = kappa
+        results['success'] = True
     except ValueError as ex:
         # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
         msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
@@ -110,6 +113,7 @@ def get_cv_error(clf,feats,scores):
     return results

+
 def get_algorithms(algorithm):
     """
     Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
@@ -117,14 +121,14 @@ def get_algorithms(algorithm):
     """
     if algorithm == util_functions.AlgorithmTypes.classification:
         clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-                                                          max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
-                                                         max_depth=4, random_state=1,min_samples_leaf=3)
+                                                          max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
+                                                           max_depth=4, random_state=1, min_samples_leaf=3)
     else:
         clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-                                                         max_depth=4, random_state=1,min_samples_leaf=3)
-        clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
-                                                        max_depth=4, random_state=1,min_samples_leaf=3)
+                                                         max_depth=4, random_state=1, min_samples_leaf=3)
+        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
+                                                          max_depth=4, random_state=1, min_samples_leaf=3)
     return clf, clf2
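One caveat worth noting here: `learn_rate` is the old scikit-learn spelling of this keyword; newer releases renamed it to `learning_rate` and eventually removed the old name. On a current install the equivalent construction would be (an assumption about the reader's sklearn version, not a change this commit makes):

```python
# Equivalent constructor call for newer scikit-learn (learn_rate was renamed).
import sklearn.ensemble

clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=.05,
                                                  max_depth=4, random_state=1, min_samples_leaf=3)
```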
@@ -134,7 +138,7 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     predictor_set - a PredictorSet object that has been initialized with data
     type - one of util_functions.AlgorithmType
     """
-    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
+    if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
         algorithm = util_functions.AlgorithmTypes.regression

     f = predictor_extractor.PredictorExtractor()
@@ -142,8 +146,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     train_feats = f.gen_feats(predictor_set)

-    clf,clf2 = get_algorithms(algorithm)
-    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
+    clf, clf2 = get_algorithms(algorithm)
+    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

     try:
         set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
@@ -151,8 +155,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
         set_score = predictor_set._target
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results
@@ -172,25 +176,26 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
     train_feats = f.gen_feats(essays)

     set_score = numpy.asarray(essays._score, dtype=numpy.int)
-    if len(util_functions.f7(list(set_score)))>5:
+    if len(util_functions.f7(list(set_score))) > 5:
         algorithm = util_functions.AlgorithmTypes.regression
     else:
         algorithm = util_functions.AlgorithmTypes.classification

-    clf,clf2 = get_algorithms(algorithm)
-    cv_error_results=get_cv_error(clf2,train_feats,essays._score)
+    clf, clf2 = get_algorithms(algorithm)
+    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

     try:
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
-        set_score[0]=1
-        set_score[1]=0
+        set_score[0] = 1
+        set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results

+
 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
     """
     Writes out a model to a file.
@@ -199,16 +204,17 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
     classifier is a trained classifier
     model_path is the path of write out the model file to
     """
-    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
+    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
     pickle.dump(model_file, file=open(model_path, "w"))

-def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
+
+def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
     """
     Function that creates essay set, extracts features, and writes out model
     See above functions for argument descriptions
     """
-    essay_set=create_essay_set(text,score,prompt)
-    feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
-    dump_model_to_file(prompt,feature_ext,clf,model_path)
+    essay_set = create_essay_set(text, score, prompt)
+    feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
+    dump_model_to_file(prompt, feature_ext, clf, model_path)
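Round-tripping the pickled model file is straightforward; a sketch (the path is a made-up example, and the dictionary keys follow the `model_file` literal in `dump_model_to_file` above):

```python
# Hypothetical: load a model file written by dump_model_to_file().
import pickle

model_file = pickle.load(open("models/demo_prompt.p", "r"))
classifier = model_file['model']      # the trained classifier
extractor = model_file['extractor']   # the fitted feature extractor
```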
@@ -16,17 +16,18 @@ import logging
 import math
 from feature_extractor import FeatureExtractor

-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

 log = logging.getLogger(__name__)

+
 class PredictorExtractor(object):
     def __init__(self):
         self._extractors = []
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

-        div_length=len(p_set._essay_sets)
-        if div_length==0:
-            div_length=1
+        div_length = len(p_set._essay_sets)
+        if div_length == 0:
+            div_length = 1

         #Ensures that even with a large amount of input textual features, training time stays reasonable
-        max_feats2 = int(math.floor(200/div_length))
-        for i in xrange(0,len(p_set._essay_sets)):
+        max_feats2 = int(math.floor(200 / div_length))
+        for i in xrange(0, len(p_set._essay_sets)):
             self._extractors.append(FeatureExtractor())
             self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
         self._initialized = True
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
         Generates features based on an iput p_set
         p_set - PredictorSet
         """
-        if self._initialized!=True:
+        if self._initialized != True:
             error_message = "Dictionaries have not been initialized."
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

         textual_features = []
-        for i in xrange(0,len(p_set._essay_sets)):
+        for i in xrange(0, len(p_set._essay_sets)):
             textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

         textual_matrix = numpy.concatenate(textual_features, axis=1)
...
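A worked instance of the feature cap computed above: with four textual columns, each per-column `FeatureExtractor` is held to `floor(200 / 4) = 50` vocabulary features, so the total textual feature count stays near 200 no matter how many text columns a predictor set has.

```python
# Worked example of the per-extractor cap in initialize_dictionaries().
import math

div_length = 4                                  # four textual feature columns
max_feats2 = int(math.floor(200 / div_length))  # cap per extractor
assert max_feats2 == 50
```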
@@ -11,26 +11,27 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
+

 class PredictorSet(object):
-    def __init__(self, essaytype = "train"):
+    def __init__(self, essaytype="train"):
         """
         Initialize variables and check essay set type
         """
-        if(essaytype != "train" and essaytype != "test"):
+        if (essaytype != "train" and essaytype != "test"):
             essaytype = "train"

         self._type = essaytype
-        self._target=[]
-        self._textual_features=[]
-        self._numeric_features=[]
-        self._essay_sets=[]
+        self._target = []
+        self._textual_features = []
+        self._numeric_features = []
+        self._essay_sets = []

     def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
         if not isinstance(target, (int, long, float)):
             error_message = "Target is not a numeric value."
             log.exception(error_message)
@@ -47,16 +48,16 @@ class PredictorSet(object):
             raise util_functions.InputError(textual_features, error_message)

         #Do some length checking for parameters
-        if len(self._numeric_features)>0:
+        if len(self._numeric_features) > 0:
             numeric_length = len(self._numeric_features[-1])
             current_numeric_length = len(numeric_features)
             if numeric_length != current_numeric_length:
                 error_message = "Numeric features are an improper length."
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-        if len(self._textual_features)>0:
+        if len(self._textual_features) > 0:
             textual_length = len(self._textual_features[-1])
             current_textual_length = len(textual_features)
             if textual_length != current_textual_length:
                 error_message = "Textual features are an improper length."
@@ -65,7 +66,7 @@ class PredictorSet(object):
         #Now check to see if text features and numeric features are individually correct
-        for i in xrange(0,len(numeric_features)):
+        for i in xrange(0, len(numeric_features)):
             try:
                 numeric_features[i] = float(numeric_features[i])
             except:
@@ -73,8 +74,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

-
-        for i in xrange(0,len(textual_features)):
+        for i in xrange(0, len(textual_features)):
             try:
                 textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
             except:
@@ -83,8 +83,8 @@ class PredictorSet(object):
                 raise util_functions.InputError(textual_features, error_message)

         #Create essay sets for textual features if needed
-        if len(self._textual_features)==0:
-            for i in xrange(0,len(textual_features)):
+        if len(self._textual_features) == 0:
+            for i in xrange(0, len(textual_features)):
                 self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))

         #Add numeric and textual features
@@ -95,6 +95,6 @@ class PredictorSet(object):
         self._target.append(target)

         #Add textual features to essay sets
-        for i in xrange(0,len(textual_features)):
+        for i in xrange(0, len(textual_features)):
             self._essay_sets[i].add_essay(textual_features[i], target)
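A sketch of feeding a `PredictorSet`, with argument shapes inferred from the checks above (numeric features are a list of numbers, textual features a list of strings, target a number; the rows are invented examples):

```python
# Hypothetical rows for a generic predictor set.
import predictor_set

pset = predictor_set.PredictorSet(essaytype="train")
pset.add_row([1, 5.0], ["Mitochondria produce energy.", "It is an organelle."], 2)
pset.add_row([0, 2.0], ["I like turtles.", "Turtles are green."], 0)
```

Every row must keep the same numeric and textual lengths as the previous rows, or `add_row` raises `util_functions.InputError`.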
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
 #Requires aspell to be installed and added to the path
 from fisher import pvalue
@@ -15,17 +15,18 @@ import logging
 import sys
 import tempfile

-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

 if not base_path.endswith("/"):
-    base_path=base_path+"/"
+    base_path = base_path + "/"

 #Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"

+
 class AlgorithmTypes(object):
     """
     Defines what types of algorithm can be used
@@ -33,20 +34,22 @@ class AlgorithmTypes(object):
     regression = "regression"
     classification = "classifiction"

+
 def create_model_path(model_path):
     """
     Creates a path to model files
     model_path - string
     """
     if not model_path.startswith("/") and not model_path.startswith("models/"):
-        model_path="/" + model_path
+        model_path = "/" + model_path
     if not model_path.startswith("models"):
         model_path = "models" + model_path
     if not model_path.endswith(".p"):
-        model_path+=".p"
+        model_path += ".p"

     return model_path

+
 def sub_chars(string):
     """
     Strips illegal characters from a string. Used to sanitize input essays.
@@ -66,7 +69,7 @@ def sub_chars(string):
     #Replace text. Ordering is very important!
     nstring = re.sub(sub_pat, " ", string)
-    nstring = re.sub(char_pat," .", nstring)
+    nstring = re.sub(char_pat, " .", nstring)
     nstring = re.sub(com_pat, " ,", nstring)
     nstring = re.sub(ques_pat, " ?", nstring)
     nstring = re.sub(excl_pat, " !", nstring)
@@ -101,7 +104,7 @@ def spell_correct(string):
     except Exception:
         log.exception("aspell process failed; could not spell check")
         # Return original string if aspell fails
-        return string,0, string
+        return string, 0, string
     finally:
         f.close()
@@ -109,7 +112,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if(len(incorrect[i]) > 10):
+        if (len(incorrect[i]) > 10):
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -128,16 +131,16 @@ def spell_correct(string):
     #Create markup based on spelling errors
     newstring = string
     markup_string = string
-    already_subbed=[]
+    already_subbed = []
     for i in range(0, len(incorrect_words)):
         sub_pat = r"\b" + incorrect_words[i] + r"\b"
         sub_comp = re.compile(sub_pat)
         newstring = re.sub(sub_comp, correct_spelling[i], newstring)
         if incorrect_words[i] not in already_subbed:
-            markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
+            markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
             already_subbed.append(incorrect_words[i])

-    return newstring,len(incorrect_words),markup_string
+    return newstring, len(incorrect_words), markup_string

 def ngrams(tokens, min_n, max_n):
@@ -162,6 +165,7 @@ def f7(seq):
     """
     seen = set()
     seen_add = seen.add
+    #TODO Potential Improvment Here
     return [x for x in seq if x not in seen and not seen_add(x)]
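`f7` is an order-preserving de-duplication (the `seen_add` binding is a micro-optimization that skips the attribute lookup inside the loop); for example:

```python
# f7([3, 1, 3, 2, 1]) keeps first occurrences in order -> [3, 1, 2]
seen = set()
seen_add = seen.add
print [x for x in [3, 1, 3, 2, 1] if x not in seen and not seen_add(x)]
```

A plain `set()` would lose the input order; for `select_algorithm` only the count of unique score points matters, but `f7` keeps the order wherever it does.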
@@ -200,12 +204,12 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
     max_feats2 is the maximum number of features to consider in the second (final) pass
     Returns a list of words that constitute the significant vocabulary
     """
-    dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
+    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
     dict_mat = dict.fit_transform(text)
     set_score = numpy.asarray(score, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if(med_score == 0):
+    if (med_score == 0):
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -223,7 +227,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
         fish_vals.append(fish_val)

     cutoff = 1
-    if(len(fish_vals) > max_feats2):
+    if (len(fish_vals) > max_feats2):
         cutoff = sorted(fish_vals)[max_feats2]
     good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
@@ -253,12 +257,12 @@ def edit_distance(s1, s2):
             else:
                 cost = 1
             d[(i, j)] = min(
-                d[(i - 1, j)] + 1, # deletion
-                d[(i, j - 1)] + 1, # insertion
-                d[(i - 1, j - 1)] + cost, # substitution
+                d[(i - 1, j)] + 1,  # deletion
+                d[(i, j - 1)] + 1,  # insertion
+                d[(i - 1, j - 1)] + cost,  # substitution
             )
             if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
-                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
+                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost)  # transposition

     return d[lenstr1 - 1, lenstr2 - 1]
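As the transposition clause indicates, `edit_distance` implements Damerau-Levenshtein rather than plain Levenshtein, so a swap of adjacent characters costs 1 instead of 2. Expected behavior on standard examples (assuming the usual base-case initialization in the lines elided above):

```python
print edit_distance("abcd", "acbd")       # 1: one adjacent transposition
print edit_distance("kitten", "sitting")  # 3: substitute, substitute, insert
```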
@@ -299,7 +303,7 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return(all_preds)
+    return (all_preds)

 def gen_model(clf, arr, sel_score):
@@ -312,7 +316,7 @@ def gen_model(clf, arr, sel_score):
     """
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
     sim_fit = clf.fit(arr, set_score)
-    return(sim_fit)
+    return (sim_fit)

 def gen_preds(clf, arr):
@@ -322,7 +326,7 @@ def gen_preds(clf, arr):
     arr is a data array identical in dimension to the array clf was trained on
     Returns the array of predictions.
     """
-    if(hasattr(clf, "predict_proba")):
+    if (hasattr(clf, "predict_proba")):
         ret = clf.predict(arr)
         # pred_score=preds.argmax(1)+min(x._score)
     else:
@@ -340,8 +344,10 @@ def calc_list_average(l):
         total += value
     return total / len(l)

+
 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

+
 def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
     """
     Calculates kappa correlation between rater_a and rater_b.
@@ -352,7 +358,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     max_rating is an optional argument describing the maximum rating possible on the data set
     Returns a float corresponding to the kappa correlation
     """
-    assert(len(rater_a) == len(rater_b))
+    assert (len(rater_a) == len(rater_b))
     rater_a = [int(a) for a in rater_a]
     rater_b = [int(b) for b in rater_b]
     if min_rating is None:
@@ -360,7 +366,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     if max_rating is None:
         max_rating = max(rater_a + rater_b)
     conf_mat = confusion_matrix(rater_a, rater_b,
-                            min_rating, max_rating)
+                                min_rating, max_rating)
     num_ratings = len(conf_mat)
     num_scored_items = float(len(rater_a))
@@ -370,7 +376,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0

-    if(num_ratings > 1):
+    if (num_ratings > 1):
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
@@ -390,7 +396,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
     A confusion matrix shows how often 2 values agree and disagree
     See quadratic_weighted_kappa for argument descriptions
     """
-    assert(len(rater_a) == len(rater_b))
+    assert (len(rater_a) == len(rater_b))
     rater_a = [int(a) for a in rater_a]
     rater_b = [int(b) for b in rater_b]
     min_rating = int(min_rating)
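For reference, the quantity the two loops above accumulate is the standard quadratic weighted kappa. With `O` the confusion matrix of the two raters, `E` the outer product of their rating histograms scaled to the same total, and `N` the number of distinct ratings, this is a restatement of the code, not new behavior:

```latex
w_{ij} = \frac{(i - j)^2}{(N - 1)^2}, \qquad
\kappa = 1 - \frac{\sum_{i,j} w_{ij}\, O_{ij}}{\sum_{i,j} w_{ij}\, E_{ij}}
```

A kappa of 1 means perfect agreement between the raters; 0 means agreement at chance level.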
@@ -450,7 +456,7 @@ def get_separator_words(toks1):
     Returns a list of separator words
     """
     tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
-    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
+    if (os.path.isfile(ESSAY_COR_TOKENS_PATH)):
         toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
     else:
         essay_corpus = open(ESSAY_CORPUS_PATH).read()
@@ -460,12 +466,12 @@ def get_separator_words(toks1):
     sep_words = []
     for word in tab_toks1.keys():
         tok1_present = tab_toks1[word]
-        if(tok1_present > 2):
+        if (tok1_present > 2):
             tok1_total = tab_toks1._N
             tok2_present = toks2[word]
             tok2_total = toks2._N
             fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
-            if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
+            if (fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
                 sep_words.append(word)
     sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
     return sep_words
...