Commit 279d3a0c by gradyward

A handful of small changes.

A final commit before I attempt to deprecate all of the unused predictor stuff.

Stand with me now.
parent b32d5674
......@@ -95,7 +95,7 @@ def create(examples, scores, prompt_string, dump_data=False):
# Gets the features and classifiers from the essay set and computes the error
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
essay_set, algorithm=algorithm
essay_set
)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
......@@ -103,10 +103,11 @@ def create(examples, scores, prompt_string, dump_data=False):
results['classifier'] = classifier
results['algorithm'] = algorithm
results['success'] = True
except:
except Exception as ex:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
log.exception(ex)
return results
......@@ -153,7 +154,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
# Gets the features and classifiers from the essay set and computes the error
try:
feature_ext, classifier, cv_error_results = \
model_creator.extract_features_and_generate_model_predictors(predictor, algorithm)
model_creator.extract_features_and_generate_model_from_predictors(predictor, algorithm)
results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext
......
......@@ -25,9 +25,18 @@ log = logging.getLogger()
def read_in_test_data(filename):
"""
Reads in test data file found at filename.
Reads in tab delimited test data file found at filename for training purposes.
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
Args:
filename (str): The path to the data
Return:
Tuple of the form (score, text), where:
The former is the list of scores assigned to the essays in the file (int)
The latter is the list of essays in the file
"""
tid, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
......@@ -45,8 +54,13 @@ def read_in_test_data(filename):
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file
Returns string
Reads in the prompt from a file.
Args:
filename (str): the name of the file
Returns:
(str): the prompt as a string.
"""
prompt_string = open(filename).read()
return prompt_string
......@@ -55,10 +69,16 @@ def read_in_test_prompt(filename):
def read_in_test_data_twocolumn(filename, sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
In filename, the first column should be integer score data.
The second column should be string text data.
Sep specifies the type of separator between fields.
Return:
Tuple of the form (score, text), where:
The former is the list of scores assigned to the essays in the file (int)
The latter is the list of essays in the file
"""
score, text = [], []
combined_raw = open(filename).read()
......@@ -132,8 +152,7 @@ def get_algorithms(algorithm):
max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2
#TODO RENAME train_from_predictors
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
def extract_features_and_generate_model_from_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
......@@ -162,40 +181,49 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
return f, clf, cv_error_results
def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression):
def extract_features_and_generate_model(essays):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
additional array is an optional argument that can specify
a numpy array of values to add in
returns a trained FeatureExtractor object and a trained classifier
Args:
essays (EssaySet): The essay set to construct the feature extractor and model off of
Returns:
A tuple with the following elements in the following order:
- The Trained Feature extractor
- The Trained Classifier
- Any Cross Validation results
"""
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
feat_extractor = feature_extractor.FeatureExtractor(essays)
train_feats = f.generate_features(essays)
features = feat_extractor.generate_features(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
algorithm = create.select_algorithm(set_score)
clf, clf2 = get_algorithms(algorithm)
predict_classifier, cv_error_classifier = get_algorithms(algorithm)
cv_error_results = get_cv_error(clf2, train_feats, essays._score)
cv_error_results = get_cv_error(cv_error_classifier, features, essays._score)
try:
clf.fit(train_feats, set_score)
except ValueError:
predict_classifier.fit(features, set_score)
except:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0] = 1
set_score[1] = 0
clf.fit(train_feats, set_score)
predict_classifier.fit(features, set_score)
return f, clf, cv_error_results
return feat_extractor, predict_classifier, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
Args:
prompt_string (str): The prompt for the set of essays
feature_ext (FeatureExtractor): a trained FeatureExtractor Object
classifier : a trained Classifier Object
prompt string is a string containing the prompt
feature_ext is a trained FeatureExtractor object
classifier is a trained classifier
......
......@@ -83,7 +83,7 @@ class PredictorSet(object):
except UnicodeError:
raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))
# Create essay sets for textual features if needed
# Create essay sets for textual features
# TODO Understand this logic and change it, I don't think it is right.
if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)):
......
# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
"""
Collection of misc functions needed to support essay_set.py and feature_extractor.py.
Requires aspell to be installed and added to the path
The cleanup of this file is far beyond the scope of this hackathon --GBW--
"""
from fisher import pvalue
aspell_path = "aspell"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment