Commit b118aba5 by gradyward

Finishing Touches

parent 6cac95c1
......@@ -28,7 +28,7 @@ from ease import feature_extractor
from ease.essay_set import EssaySet
def create(examples, scores, prompt_string, dump_data=False):
def create(examples, scores, prompt_string):
"""
Creates a machine learning model from basic inputs (essays, associated scores and a prompt) and trains the model.
......@@ -39,9 +39,6 @@ def create(examples, scores, prompt_string, dump_data=False):
scores (list of int): the associated scores that correspond to the essays.
prompt_string (str): the common prompt for all of the example essays.
Kwargs:
dump_data (bool): whether or not the examples and scores should be written out as a JSON data dump
Returns:
(dict): Has the following keys:
'errors' (list of Exception): List of all errors that occurred during training
......@@ -52,11 +49,7 @@ def create(examples, scores, prompt_string, dump_data=False):
'success' (bool): Whether or not the training of the classifier was successful.
"""
# If dump_data is true, then the examples and scores are dumped to a JSON data file.
if dump_data:
_dump_input_data(examples, scores)
# Selects the appropriate ML algorithm to use to train the classifier
# Selects the appropriate ML algorithm to use to train (Classification or Regression)
algorithm = _determine_algorithm(scores)
#Initialize a results dictionary to return
......@@ -114,7 +107,7 @@ def _determine_algorithm(score_list):
The ML algorithm used to train the classifier set and feature extractor
"""
#Count the number of unique score points in the score list
#Count the number of unique score values in the score list
if len(set(score_list)) > 5:
return util_functions.AlgorithmTypes.regression
else:
......@@ -249,33 +242,8 @@ def _get_cv_error(classifier, features, scores):
results['success'] = True
except ValueError as ex:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
# TODO Figure out why this error would occur in the first place.
# TODO Figure out why this error would occur in the first place. ^^^ THIS IS NOT ACCEPTABLE
msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
log.debug(msg)
return results
def _dump_input_data(essays, scores):
    """
    Writes the given essays and scores to a timestamped JSON file.

    Every essay/score pair is serialized as an object of the form
    {'text': essay, 'score': score}, and the whole list is written as one JSON array.

    Args:
        essays (list of str): The essays to dump
        scores (list of int): The scores associated with the essays (same order as essays)

    Raises:
        CreateRequestError: if an IOError occurs while writing the file
    """
    # The output file lives under the test data directory and is named after the current time.
    dump_directory = base_path + "/tests/data/json_data/"
    dump_name = "test-case-" + datetime.now().strftime("%H%M%S%d%m%Y") + ".json"

    try:
        # Indexing (rather than zipping) keeps the IndexError when scores is shorter than essays.
        records = [{'text': essays[index], 'score': scores[index]} for index in xrange(0, len(essays))]
        with open(dump_directory + dump_name, 'w+') as outfile:
            json.dump(records, outfile)
    except IOError as ex:
        error = "An IO error occurred while trying to dump JSON data to a file: {ex}".format(ex=ex)
        log.exception(error)
        raise CreateRequestError(error)
return results
\ No newline at end of file
......@@ -102,22 +102,17 @@ class EssaySet(object):
log.exception(msg)
raise EssaySetRequestError(msg)
# Validates that score is an integer and essay_text is a string.
# Validates that score is an integer and essay_text is a string and essay_generated is a 0 or a 1.
try:
essay_score = int(essay_score)
essay_text = str(essay_text)
essay_generated = int(essay_generated)
bool(essay_generated)
except TypeError:
ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
log.exception(ex)
raise EssaySetRequestError(ex)
# Validates that essay generated is 0 or 1
if essay_generated != 0 and essay_generated != 1:
ex = "Invalid value for essay_generated ({}). Value must be 0 or 1.".format(essay_generated)
log.exception(ex)
raise EssaySetRequestError(ex)
# Validates to make sure that the essay is at least five characters long.
if len(essay_text) < 5:
essay_text = "Invalid essay."
......
......@@ -117,19 +117,19 @@ class FeatureExtractor(object):
Array of features with the following included:
- Length Features
- Vocabulary Features (both Normal and Stemmed Vocabulary)
- Prompt Features
- EDIT: Prompt Features were effectively being ignored (the prompt was passed in as an empty string),
so they are no longer included.
"""
try:
vocabulary_features = self._generate_vocabulary_features(essay_set)
length_features = self._generate_length_features(essay_set)
prompt_features = self._generate_prompt_features(essay_set)
except Exception as ex:
msg = "An unexpected error occurred during feature extraction: {}".format(ex)
log.exception(msg)
raise FeatureExtractionInternalError(msg)
# Lumps them all together, copies to solidify, and returns
overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
overall_features = numpy.concatenate((length_features, vocabulary_features), axis=1)
overall_features = overall_features.copy()
return overall_features
......@@ -183,45 +183,6 @@ class FeatureExtractor(object):
bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
return bag_features.copy()
def _generate_prompt_features(self, essay_set):
    """
    Generates prompt based features from an essay set object and internal prompt variable.

    Called internally by generate_features

    For every tokenized essay, four values are produced:
        - the number of essay tokens that also appear in the prompt
        - that count divided by the number of essay tokens
        - the number of essay tokens that appear among the synonyms of the prompt words
        - that count divided by the number of essay tokens
    An essay with no tokens is treated as having length 1 to avoid dividing by zero.

    Args:
        essay_set (EssaySet): an essay set object that is manipulated to generate prompt features

    Returns:
        an array of prompt features, one row per essay and one column per value listed above
    """
    prompt_toks = nltk.word_tokenize(essay_set._prompt)

    # Membership is tested once per essay token, so build the lookup sets a single time up front
    # instead of scanning a list for every token.
    # NOTE(review): assumes tokens and synonyms are hashable (strings) -- confirm against
    # util_functions.get_wordnet_syns.
    prompt_vocab = set(prompt_toks)
    synonym_vocab = set(
        chain.from_iterable(util_functions.get_wordnet_syns(word) for word in prompt_toks)
    )

    prompt_overlap = []
    prompt_overlap_prop = []
    expand_overlap = []
    expand_overlap_prop = []
    for essay_tokens in essay_set._tokens:
        # Guard against empty essays so the proportions below never divide by zero
        tok_length = float(len(essay_tokens) or 1)

        direct_hits = len([tok for tok in essay_tokens if tok in prompt_vocab])
        prompt_overlap.append(direct_hits)
        prompt_overlap_prop.append(direct_hits / tok_length)

        synonym_hits = len([tok for tok in essay_tokens if tok in synonym_vocab])
        expand_overlap.append(synonym_hits)
        expand_overlap_prop.append(synonym_hits / tok_length)

    # Stack the four feature lists as columns, then copy to solidify before returning
    prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
    return prompt_arr.copy()
def _get_grammar_errors(self, pos, essays):
"""
Internal function to get the number of grammar errors in given text
......
......@@ -8,7 +8,6 @@ import logging
import sys
# Append the base path to sys.path so that the following modules can be imported
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment