Commit b118aba5 by gradyward

Finishing Touches

parent 6cac95c1
...@@ -28,7 +28,7 @@ from ease import feature_extractor ...@@ -28,7 +28,7 @@ from ease import feature_extractor
from ease.essay_set import EssaySet from ease.essay_set import EssaySet
def create(examples, scores, prompt_string, dump_data=False): def create(examples, scores, prompt_string):
""" """
Creates a machine learning model from basic inputs (essays, associated scores and a prompt) and trains the model. Creates a machine learning model from basic inputs (essays, associated scores and a prompt) and trains the model.
...@@ -39,9 +39,6 @@ def create(examples, scores, prompt_string, dump_data=False): ...@@ -39,9 +39,6 @@ def create(examples, scores, prompt_string, dump_data=False):
scores (list of int): the associated scores that correspond to the essays. scores (list of int): the associated scores that correspond to the essays.
prompt_string (str): the common prompt for all of the example essays. prompt_string (str): the common prompt for all of the example essays.
Kwargs:
dump_data (bool): whether or not the examples and scores should be dumped to a JSON data file
Returns: Returns:
(dict): Has the following keys: (dict): Has the following keys:
'errors' (list of Exception): List of all errors that occurred during training 'errors' (list of Exception): List of all errors that occurred during training
...@@ -52,11 +49,7 @@ def create(examples, scores, prompt_string, dump_data=False): ...@@ -52,11 +49,7 @@ def create(examples, scores, prompt_string, dump_data=False):
'success' (bool): Whether or not the training of the classifier was successful. 'success' (bool): Whether or not the training of the classifier was successful.
""" """
# If dump_data is true, then the examples and scores are loaded from json data. # Selects the appropriate ML algorithm to use to train (Classification or Regression)
if dump_data:
_dump_input_data(examples, scores)
# Selects the appropriate ML algorithm to use to train the classifier
algorithm = _determine_algorithm(scores) algorithm = _determine_algorithm(scores)
#Initialize a results dictionary to return #Initialize a results dictionary to return
...@@ -114,7 +107,7 @@ def _determine_algorithm(score_list): ...@@ -114,7 +107,7 @@ def _determine_algorithm(score_list):
The ML algorithm used to train the classifier set and feature extractor The ML algorithm used to train the classifier set and feature extractor
""" """
#Count the number of unique score points in the score list #Count the number of unique score values in the score list
if len(set(score_list)) > 5: if len(set(score_list)) > 5:
return util_functions.AlgorithmTypes.regression return util_functions.AlgorithmTypes.regression
else: else:
...@@ -249,33 +242,8 @@ def _get_cv_error(classifier, features, scores): ...@@ -249,33 +242,8 @@ def _get_cv_error(classifier, features, scores):
results['success'] = True results['success'] = True
except ValueError as ex: except ValueError as ex:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal. # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
# TODO Figure out why this error would occur in the first place. # TODO Figure out why this error would occur in the first place. ^^^ THIS IS NOT ACCEPTABLE
msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex) msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
log.debug(msg) log.debug(msg)
return results return results
\ No newline at end of file
def _dump_input_data(essays, scores):
    """
    Writes the given essays and scores to a timestamped JSON file.

    The file holds a JSON list of objects of the form {'text': essay, 'score': score} and is created
    under base_path + "/tests/data/json_data/" with a name of the form test-case-<HHMMSSddmmYYYY>.json.

    Args:
        essays (list of str): A list of essays to dump
        scores (list of int): An associated list of scores; scores[i] is paired with essays[i]

    Raises:
        CreateRequestError: if an IOError occurs while opening or writing the file
    """
    # The current time (hour, minute, second, day, month, year) is embedded in the file name.
    timestamp = datetime.now().strftime("%H%M%S%d%m%Y")
    target = base_path + "/tests/data/json_data/" + "test-case-" + timestamp + ".json"

    try:
        # scores is indexed (not zipped) so a scores list shorter than essays still raises IndexError.
        records = [{'text': essay, 'score': scores[index]} for index, essay in enumerate(essays)]
        with open(target, 'w+') as outfile:
            json.dump(records, outfile)
    except IOError as ex:
        error = "An IO error occurred while trying to dump JSON data to a file: {ex}".format(ex=ex)
        log.exception(error)
        raise CreateRequestError(error)
...@@ -102,22 +102,17 @@ class EssaySet(object): ...@@ -102,22 +102,17 @@ class EssaySet(object):
log.exception(msg) log.exception(msg)
raise EssaySetRequestError(msg) raise EssaySetRequestError(msg)
# Validates that score is an integer and essay_text is a string. # Validates that score is an integer and essay_text is a string and essay_generated is a 0 or a 1.
try: try:
essay_score = int(essay_score) essay_score = int(essay_score)
essay_text = str(essay_text) essay_text = str(essay_text)
essay_generated = int(essay_generated) essay_generated = int(essay_generated)
bool(essay_generated)
except TypeError: except TypeError:
ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)) ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
log.exception(ex) log.exception(ex)
raise EssaySetRequestError(ex) raise EssaySetRequestError(ex)
# Validates that essay generated is 0 or 1
if essay_generated != 0 and essay_generated != 1:
ex = "Invalid value for essay_generated ({}). Value must be 0 or 1.".format(essay_generated)
log.exception(ex)
raise EssaySetRequestError(ex)
# Validates to make sure that the essay is at least five characters long. # Validates to make sure that the essay is at least five characters long.
if len(essay_text) < 5: if len(essay_text) < 5:
essay_text = "Invalid essay." essay_text = "Invalid essay."
......
...@@ -117,19 +117,19 @@ class FeatureExtractor(object): ...@@ -117,19 +117,19 @@ class FeatureExtractor(object):
Array of features with the following included: Array of features with the following included:
- Length Features - Length Features
- Vocabulary Features (both Normal and Stemmed Vocabulary) - Vocabulary Features (both Normal and Stemmed Vocabulary)
- Prompt Features - EDIT: Prompt Features were being ignored (passed in an empty string), so for posterity we are ignoring
them.
""" """
try: try:
vocabulary_features = self._generate_vocabulary_features(essay_set) vocabulary_features = self._generate_vocabulary_features(essay_set)
length_features = self._generate_length_features(essay_set) length_features = self._generate_length_features(essay_set)
prompt_features = self._generate_prompt_features(essay_set)
except Exception as ex: except Exception as ex:
msg = "An unexpected error occurred during feature extraction: {}".format(ex) msg = "An unexpected error occurred during feature extraction: {}".format(ex)
log.exception(msg) log.exception(msg)
raise FeatureExtractionInternalError(msg) raise FeatureExtractionInternalError(msg)
# Lumps them all together, copies to solidify, and returns # Lumps them all together, copies to solidify, and returns
overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1) overall_features = numpy.concatenate((length_features, vocabulary_features), axis=1)
overall_features = overall_features.copy() overall_features = overall_features.copy()
return overall_features return overall_features
...@@ -183,45 +183,6 @@ class FeatureExtractor(object): ...@@ -183,45 +183,6 @@ class FeatureExtractor(object):
bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1) bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
return bag_features.copy() return bag_features.copy()
def _generate_prompt_features(self, essay_set):
    """
    Generates prompt based features from an essay set object and internal prompt variable.

    Called internally by generate_features

    For every entry of essay_set._tokens, four values are computed:
        - the number of tokens that also occur in the tokenized prompt
        - that count divided by the number of tokens in the essay
        - the number of tokens that occur among the synonyms which util_functions.get_wordnet_syns
          returns for the prompt tokens
        - that count divided by the number of tokens in the essay

    Args:
        essay_set (EssaySet): an essay set object whose _prompt and _tokens are read

    Returns:
        a numpy array with one row per essay and the four columns listed above, in that order
    """
    prompt_toks = nltk.word_tokenize(essay_set._prompt)

    # Every essay token is tested for membership, so both vocabularies are kept as sets rather than
    # lists; the resulting counts are unchanged.
    # NOTE(review): assumes tokens and synonyms are hashable (strings) -- confirm against callers.
    prompt_vocab = set(prompt_toks)
    synonym_vocab = set(chain.from_iterable(util_functions.get_wordnet_syns(word) for word in prompt_toks))

    prompt_overlap = []
    prompt_overlap_prop = []
    expand_overlap = []
    expand_overlap_prop = []
    for essay_tokens in essay_set._tokens:
        # An essay without tokens is treated as having length 1 so the proportions never divide by zero.
        tok_length = float(len(essay_tokens) or 1)

        prompt_hits = sum(1 for token in essay_tokens if token in prompt_vocab)
        synonym_hits = sum(1 for token in essay_tokens if token in synonym_vocab)

        prompt_overlap.append(prompt_hits)
        prompt_overlap_prop.append(prompt_hits / tok_length)
        expand_overlap.append(synonym_hits)
        expand_overlap_prop.append(synonym_hits / tok_length)

    # Stack the four feature lists as rows, then transpose so that each essay becomes one row.
    prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
    return prompt_arr.copy()
def _get_grammar_errors(self, pos, essays): def _get_grammar_errors(self, pos, essays):
""" """
Internal function to get the number of grammar errors in given text Internal function to get the number of grammar errors in given text
......
...@@ -8,7 +8,6 @@ import logging ...@@ -8,7 +8,6 @@ import logging
import sys import sys
# Append sys to base path to import the following modules # Append sys to base path to import the following modules
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment