Commit 92d51e67 by Vik Paruchuri

Docs, remove old variables from create and grade interfaces

parent 3e7457ae
......@@ -24,14 +24,13 @@ import predictor_extractor
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path = None):
def create(text,score,prompt_string):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
model_path - Deprecated, not needed
"""
#Initialize a results dictionary to return
......@@ -81,14 +80,13 @@ def create(text,score,prompt_string,model_path = None):
return results
def create_generic(numeric_values, textual_values, target, model_path = None, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
model_path - deprecated, kept for legacy code. Do not use.
algorithm - the type of algorithm that will be used
"""
......
......@@ -27,7 +27,7 @@ import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission):
def grade(grader_data,submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
......@@ -37,7 +37,6 @@ def grade(grader_data,grader_config,submission):
'prompt' : prompt for the question,
'algorithm' : algorithm for the question,
}
grader_config - Legacy, kept for compatibility with old code. Need to remove.
submission - The student submission (string)
"""
......@@ -112,14 +111,13 @@ def grade(grader_data,grader_config,submission):
return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
grader_data -- dictionary containing:
{
'algorithm' - Type of algorithm to use to score
}
grader_config - legacy, kept for compatibility with old code. Need to remove.
numeric_features - list of numeric features to predict on
textual_features - list of textual features to predict on
......
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
......@@ -12,6 +16,7 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
......@@ -28,6 +33,10 @@ class PredictorExtractor(object):
self._initialized = False
def initialize_dictionaries(self, p_set):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
"""
success = False
if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type."
......@@ -43,6 +52,7 @@ class PredictorExtractor(object):
if div_length==0:
div_length=1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
......@@ -52,6 +62,10 @@ class PredictorExtractor(object):
return success
def gen_feats(self, p_set):
"""
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
......
......@@ -83,6 +83,8 @@ def spell_correct(string):
Returns the spell corrected string if aspell is found, original string if not.
string - string
"""
#Create a temp file so that aspell could be used
f = open('tmpfile', 'w')
f.write(string)
f_path = os.path.abspath(f.name)
......@@ -91,13 +93,16 @@ def spell_correct(string):
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except:
log.exception("Could not find aspell, so could not spell correct!")
#Return original string if aspell fails
return string,0, string
#Aspell returns a list of incorrect words with the above flags
incorrect = p.readlines()
p.close()
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()]
......@@ -111,6 +116,8 @@ def spell_correct(string):
incorrect_words.append(begword)
correct_spelling.append(sug)
#Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment