Docs, remove old variables from create and grade interfaces

92d51e67 · Vik Paruchuri · 3e7457ae · 92d51e67 · 92d51e67 · 92d51e67
Commit 92d51e67 authored Feb 26, 2013 by Vik Paruchuri
Hide whitespace changes
Inline Side-by-side

Showing with 25 additions and 8 deletions

create.py
+2 -4

grade.py
+2 -4

predictor_extractor.py
+14 -0

util_functions.py
+7 -0

No files found.
--- a/create.py
+++ b/create.py
@@ -24,14 +24,13 @@ import predictor_extractor
 log = logging.getLogger(__name__)

 @statsd.timed('open_ended_assessment.machine_learning.creator.time')
-def create(text,score,prompt_string,model_path = None):
+def create(text,score,prompt_string):
    """
    Creates a machine learning model from input text, associated scores, and a prompt
    NOTE: The legacy model path argument has been removed; models are no longer written to a caller-supplied path
    text - A list of strings containing the text of the essays
    score - a list of integers containing score values
    prompt_string - the common prompt for the set of essays
-    model_path - Deprecated, not needed
    """

    #Initialize a results dictionary to return
@@ -81,14 +80,13 @@ def create(text,score,prompt_string,model_path = None):
    return results


-def create_generic(numeric_values, textual_values, target, model_path = None, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
    """
    Creates a model from a generic list of numeric values and text values
    numeric_values - A list of lists that are the predictors
    textual_values - A list of lists that are the predictors
    (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
    target - The variable that we are trying to predict.  A list of integers.
-    model_path - deprecated, kept for legacy code.  Do not use.
    algorithm - the type of algorithm that will be used
    """


--- a/grade.py
+++ b/grade.py
@@ -27,7 +27,7 @@ import math
 log = logging.getLogger(__name__)

 @statsd.timed('open_ended_assessment.machine_learning.grader.time')
-def grade(grader_data,grader_config,submission):
+def grade(grader_data,submission):
    """
    Grades a specified submission using specified models
    grader_data - A dictionary:
@@ -37,7 +37,6 @@ def grade(grader_data,grader_config,submission):
        'prompt' : prompt for the question,
        'algorithm' : algorithm for the question,
    }
-    grader_config - Legacy, kept for compatibility with old code.  Need to remove.
    submission - The student submission (string)
    """

@@ -112,14 +111,13 @@ def grade(grader_data,grader_config,submission):

    return results

-def grade_generic(grader_data, grader_config, numeric_features, textual_features):
+def grade_generic(grader_data, numeric_features, textual_features):
    """
    Grades a set of numeric and textual features using a generic model
    grader_data -- dictionary containing:
    {
        'algorithm' - Type of algorithm to use to score
    }
-    grader_config - legacy, kept for compatibility with old code.  Need to remove.
    numeric_features - list of numeric features to predict on
    textual_features - list of textual features to predict on


--- a/predictor_extractor.py
+++ b/predictor_extractor.py
+"""
+Extracts features for an arbitrary set of textual and numeric inputs
+"""
+
 import numpy
 import re
 import nltk
@@ -12,6 +16,7 @@ import logging
 import math
 from feature_extractor import FeatureExtractor

+#Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
@@ -28,6 +33,10 @@ class PredictorExtractor(object):
        self._initialized = False

    def initialize_dictionaries(self, p_set):
+        """
+        Initialize dictionaries with the textual inputs in the PredictorSet object
+        p_set - PredictorSet object that has had data fed in
+        """
        success = False
        if not (hasattr(p_set, '_type')):
            error_message = "needs to be an essay set of the train type."
@@ -43,6 +52,7 @@ class PredictorExtractor(object):
        if div_length==0:
            div_length=1

+        #Ensures that even with a large amount of input textual features, training time stays reasonable
        max_feats2 = int(math.floor(200/div_length))
        for i in xrange(0,len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
@@ -52,6 +62,10 @@ class PredictorExtractor(object):
        return success

    def gen_feats(self, p_set):
+        """
+        Generates features based on an input p_set
+        p_set - PredictorSet
+        """
        if self._initialized!=True:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)

--- a/util_functions.py
+++ b/util_functions.py
@@ -83,6 +83,8 @@ def spell_correct(string):
    Returns the spell corrected string if aspell is found, original string if not.
    string - string
    """
+
+    #Create a temp file so that aspell could be used
    f = open('tmpfile', 'w')
    f.write(string)
    f_path = os.path.abspath(f.name)
@@ -91,13 +93,16 @@ def spell_correct(string):
        p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
    except:
        log.exception("Could not find aspell, so could not spell correct!")
+        #Return original string if aspell fails
        return string,0, string
+    #Aspell returns a list of incorrect words with the above flags
    incorrect = p.readlines()
    p.close()
    incorrect_words = list()
    correct_spelling = list()
    for i in range(1, len(incorrect)):
        if(len(incorrect[i]) > 10):
+            #Reformat aspell output to make sense
            match = re.search(":", incorrect[i])
            if hasattr(match, "start"):
                begstring = incorrect[i][2:match.start()]
@@ -111,6 +116,8 @@ def spell_correct(string):

                    incorrect_words.append(begword)
                    correct_spelling.append(sug)
+
+    #Create markup based on spelling errors
    newstring = string
    markup_string = string
    already_subbed=[]