Commit 92d51e67 by Vik Paruchuri

Docs, remove old variables from create and grade interfaces

parent 3e7457ae
...@@ -24,14 +24,13 @@ import predictor_extractor ...@@ -24,14 +24,13 @@ import predictor_extractor
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time') @statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path = None): def create(text,score,prompt_string):
""" """
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays text - A list of strings containing the text of the essays
score - a list of integers containing score values score - a list of integers containing score values
prompt_string - the common prompt for the set of essays prompt_string - the common prompt for the set of essays
model_path - Deprecated, not needed
""" """
#Initialize a results dictionary to return #Initialize a results dictionary to return
...@@ -81,14 +80,13 @@ def create(text,score,prompt_string,model_path = None): ...@@ -81,14 +80,13 @@ def create(text,score,prompt_string,model_path = None):
return results return results
def create_generic(numeric_values, textual_values, target, model_path = None, algorithm = util_functions.AlgorithmTypes.regression): def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
""" """
Creates a model from a generic list of numeric values and text values Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors textual_values - A list of lists that are the predictors
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values) (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers. target - The variable that we are trying to predict. A list of integers.
model_path - deprecated, kept for legacy code. Do not use.
algorithm - the type of algorithm that will be used algorithm - the type of algorithm that will be used
""" """
......
...@@ -27,7 +27,7 @@ import math ...@@ -27,7 +27,7 @@ import math
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time') @statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission): def grade(grader_data,submission):
""" """
Grades a specified submission using specified models Grades a specified submission using specified models
grader_data - A dictionary: grader_data - A dictionary:
...@@ -37,7 +37,6 @@ def grade(grader_data,grader_config,submission): ...@@ -37,7 +37,6 @@ def grade(grader_data,grader_config,submission):
'prompt' : prompt for the question, 'prompt' : prompt for the question,
'algorithm' : algorithm for the question, 'algorithm' : algorithm for the question,
} }
grader_config - Legacy, kept for compatibility with old code. Need to remove.
submission - The student submission (string) submission - The student submission (string)
""" """
...@@ -112,14 +111,13 @@ def grade(grader_data,grader_config,submission): ...@@ -112,14 +111,13 @@ def grade(grader_data,grader_config,submission):
return results return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features): def grade_generic(grader_data, numeric_features, textual_features):
""" """
Grades a set of numeric and textual features using a generic model Grades a set of numeric and textual features using a generic model
grader_data -- dictionary containing: grader_data -- dictionary containing:
{ {
'algorithm' - Type of algorithm to use to score 'algorithm' - Type of algorithm to use to score
} }
grader_config - legacy, kept for compatibility with old code. Need to remove.
numeric_features - list of numeric features to predict on numeric_features - list of numeric features to predict on
textual_features - list of textual features to predict on textual_features - list of textual features to predict on
......
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy import numpy
import re import re
import nltk import nltk
...@@ -12,6 +16,7 @@ import logging ...@@ -12,6 +16,7 @@ import logging
import math import math
from feature_extractor import FeatureExtractor from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
base_path = os.path.dirname(__file__) base_path = os.path.dirname(__file__)
sys.path.append(base_path) sys.path.append(base_path)
from essay_set import EssaySet from essay_set import EssaySet
...@@ -28,6 +33,10 @@ class PredictorExtractor(object): ...@@ -28,6 +33,10 @@ class PredictorExtractor(object):
self._initialized = False self._initialized = False
def initialize_dictionaries(self, p_set): def initialize_dictionaries(self, p_set):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
"""
success = False success = False
if not (hasattr(p_set, '_type')): if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type." error_message = "needs to be an essay set of the train type."
...@@ -43,6 +52,7 @@ class PredictorExtractor(object): ...@@ -43,6 +52,7 @@ class PredictorExtractor(object):
if div_length==0: if div_length==0:
div_length=1 div_length=1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length)) max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)): for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor()) self._extractors.append(FeatureExtractor())
...@@ -52,6 +62,10 @@ class PredictorExtractor(object): ...@@ -52,6 +62,10 @@ class PredictorExtractor(object):
return success return success
def gen_feats(self, p_set): def gen_feats(self, p_set):
"""
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True: if self._initialized!=True:
error_message = "Dictionaries have not been initialized." error_message = "Dictionaries have not been initialized."
log.exception(error_message) log.exception(error_message)
......
...@@ -83,6 +83,8 @@ def spell_correct(string): ...@@ -83,6 +83,8 @@ def spell_correct(string):
Returns the spell corrected string if aspell is found, original string if not. Returns the spell corrected string if aspell is found, original string if not.
string - string string - string
""" """
#Create a temp file so that aspell could be used
f = open('tmpfile', 'w') f = open('tmpfile', 'w')
f.write(string) f.write(string)
f_path = os.path.abspath(f.name) f_path = os.path.abspath(f.name)
...@@ -91,13 +93,16 @@ def spell_correct(string): ...@@ -91,13 +93,16 @@ def spell_correct(string):
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra") p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except: except:
log.exception("Could not find aspell, so could not spell correct!") log.exception("Could not find aspell, so could not spell correct!")
#Return original string if aspell fails
return string,0, string return string,0, string
#Aspell returns a list of incorrect words with the above flags
incorrect = p.readlines() incorrect = p.readlines()
p.close() p.close()
incorrect_words = list() incorrect_words = list()
correct_spelling = list() correct_spelling = list()
for i in range(1, len(incorrect)): for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10): if(len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i]) match = re.search(":", incorrect[i])
if hasattr(match, "start"): if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()] begstring = incorrect[i][2:match.start()]
...@@ -111,6 +116,8 @@ def spell_correct(string): ...@@ -111,6 +116,8 @@ def spell_correct(string):
incorrect_words.append(begword) incorrect_words.append(begword)
correct_spelling.append(sug) correct_spelling.append(sug)
#Create markup based on spelling errors
newstring = string newstring = string
markup_string = string markup_string = string
already_subbed=[] already_subbed=[]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment