Finishing Touches

b118aba5 · gradyward · 6cac95c1 · b118aba5 · b118aba5 · b118aba5
Commit b118aba5 authored Jun 13, 2014 by gradyward
Hide whitespace changes
Inline Side-by-side

Showing 4 changed files with 11 additions and 87 deletions

ease/create.py
+6 -37

ease/essay_set.py
+2 -7

ease/feature_extractor.py
+3 -42

ease/grade.py
+0 -1

No files found.
--- a/ease/create.py
+++ b/ease/create.py
@@ -28,7 +28,7 @@ from ease import feature_extractor
 from ease.essay_set import EssaySet


-def create(examples, scores, prompt_string, dump_data=False):
+def create(examples, scores, prompt_string):
    """
    Creates a machine learning model from basic inputs (essays, associated scores and a prompt) and trains the model.

@@ -39,9 +39,6 @@ def create(examples, scores, prompt_string, dump_data=False):
        scores (list of int): the associated scores that correspond to the essays.
        prompt_string (str): the common prompt for all of the example essays.

-    Kwargs:
-        dump_data (bool): whether or not a examples and scores should be set via a data input dump
-
    Returns:
        (dict): Has the following keys:
            'errors' (list of Exception): List of all errors that occurred during training
@@ -52,11 +49,7 @@ def create(examples, scores, prompt_string, dump_data=False):
            'success' (bool): Whether or not the training of the classifier was successful.
    """

-    # If dump_data is true, then the examples and scores are loaded from json data.
-    if dump_data:
-        _dump_input_data(examples, scores)
-
-    # Selects the appropriate ML algorithm to use to train the classifier
+    # Selects the appropriate ML algorithm to use to train (Classification or Regression)
    algorithm = _determine_algorithm(scores)

    #Initialize a results dictionary to return
@@ -114,7 +107,7 @@ def _determine_algorithm(score_list):
        The ML algorithm used to train the classifier set and feature extractor
    """

-    #Count the number of unique score points in the score list
+    #Count the number of unique score values in the score list
    if len(set(score_list)) > 5:
        return util_functions.AlgorithmTypes.regression
    else:
@@ -249,33 +242,8 @@ def _get_cv_error(classifier, features, scores):
        results['success'] = True
    except ValueError as ex:
        # If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
-        # TODO Figure out why this error would occur in the first place.
+        # TODO Figure out why this error occurs in the first place; ignoring it without understanding it is not acceptable.
        msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
        log.debug(msg)

-    return results
-
-
-def _dump_input_data(essays, scores):
-    """
-    Dumps input data using json serialized objects of the form {'text': essay, 'score': score}
-
-    Args:
-        essays (list of str): A list of essays to dump
-        scores (list of int): An associated list of scores
-    """
-
-    file_path = base_path + "/tests/data/json_data/"
-    time_suffix = datetime.now().strftime("%H%M%S%d%m%Y")
-    prefix = "test-case-"
-    filename = prefix + time_suffix + ".json"
-    json_data = []
-    try:
-        for i in xrange(0, len(essays)):
-            json_data.append({'text': essays[i], 'score': scores[i]})
-        with open(file_path + filename, 'w+') as outfile:
-            json.dump(json_data, outfile)
-    except IOError as ex:
-        error = "An IO error occurred while trying to dump JSON data to a file: {ex}".format(ex=ex)
-        log.exception(error)
-        raise CreateRequestError(error)
+    return results
\ No newline at end of file
--- a/ease/essay_set.py
+++ b/ease/essay_set.py
@@ -102,22 +102,17 @@ class EssaySet(object):
                log.exception(msg)
                raise EssaySetRequestError(msg)

-        # Validates that score is an integer and essay_text is a string.
+        # Converts score and essay_generated to int and essay_text to str; does not check that essay_generated is 0 or 1.
        try:
            essay_score = int(essay_score)
            essay_text = str(essay_text)
            essay_generated = int(essay_generated)
+            bool(essay_generated)
        except TypeError:
            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
            log.exception(ex)
            raise EssaySetRequestError(ex)

-        # Validates that essay generated is 0 or 1
-        if essay_generated != 0 and essay_generated != 1:
-            ex = "Invalid value for essay_generated ({}).  Value must be 0 or 1.".format(essay_generated)
-            log.exception(ex)
-            raise EssaySetRequestError(ex)
-
        # Validates to make sure that the essay is at least five characters long.
        if len(essay_text) < 5:
            essay_text = "Invalid essay."

--- a/ease/feature_extractor.py
+++ b/ease/feature_extractor.py
@@ -117,19 +117,19 @@ class FeatureExtractor(object):
            Array of features with the following included:
                - Length Features
                - Vocabulary Features (both Normal and Stemmed Vocabulary)
-                - Prompt Features
+                - NOTE: Prompt Features are no longer included. The prompt was always passed in as an empty string,
+                    so those features were effectively being ignored already.
        """
        try:
            vocabulary_features = self._generate_vocabulary_features(essay_set)
            length_features = self._generate_length_features(essay_set)
-            prompt_features = self._generate_prompt_features(essay_set)
        except Exception as ex:
            msg = "An unexpected error occurred during feature extraction: {}".format(ex)
            log.exception(msg)
            raise FeatureExtractionInternalError(msg)

        # Lumps them all together, copies to solidify, and returns
-        overall_features = numpy.concatenate((length_features, prompt_features, vocabulary_features), axis=1)
+        overall_features = numpy.concatenate((length_features, vocabulary_features), axis=1)
        overall_features = overall_features.copy()
        return overall_features

@@ -183,45 +183,6 @@ class FeatureExtractor(object):
        bag_features = numpy.concatenate((stem_features.toarray(), normal_features.toarray()), axis=1)
        return bag_features.copy()

-    def _generate_prompt_features(self, essay_set):
-        """
-        Generates prompt based features from an essay set object and internal prompt variable.
-
-        Called internally by generate_features
-
-        Args:
-            essay_set (EssaySet): an essay set object that is manipulated to generate prompt features
-
-        Returns:
-            an array of prompt features
-        """
-        prompt_toks = nltk.word_tokenize(essay_set._prompt)
-        expand_syns = []
-        for word in prompt_toks:
-            synonyms = util_functions.get_wordnet_syns(word)
-            expand_syns.append(synonyms)
-        expand_syns = list(chain.from_iterable(expand_syns))
-        prompt_overlap = []
-        prompt_overlap_prop = []
-        for j in essay_set._tokens:
-            tok_length = len(j)
-            if tok_length == 0:
-                tok_length = 1
-            prompt_overlap.append(len([i for i in j if i in prompt_toks]))
-            prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
-        expand_overlap = []
-        expand_overlap_prop = []
-        for j in essay_set._tokens:
-            tok_length = len(j)
-            if tok_length == 0:
-                tok_length = 1
-            expand_overlap.append(len([i for i in j if i in expand_syns]))
-            expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
-
-        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
-
-        return prompt_arr.copy()
-
    def _get_grammar_errors(self, pos, essays):
        """
        Internal function to get the number of grammar errors in given text

--- a/ease/grade.py
+++ b/ease/grade.py
@@ -8,7 +8,6 @@ import logging
 import sys


-
 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)