Commit 279d3a0c by gradyward

A handful of small changes.

A final commit before I attempt to deprecate all of the unused predictor stuff.

Stand with me now.
parent b32d5674
...@@ -95,7 +95,7 @@ def create(examples, scores, prompt_string, dump_data=False): ...@@ -95,7 +95,7 @@ def create(examples, scores, prompt_string, dump_data=False):
# Gets the features and classifiers from the essay set and computes the error # Gets the features and classifiers from the essay set and computes the error
try: try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model( feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(
essay_set, algorithm=algorithm essay_set
) )
results['cv_kappa'] = cv_error_results['kappa'] results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae'] results['cv_mean_absolute_error'] = cv_error_results['mae']
...@@ -103,10 +103,11 @@ def create(examples, scores, prompt_string, dump_data=False): ...@@ -103,10 +103,11 @@ def create(examples, scores, prompt_string, dump_data=False):
results['classifier'] = classifier results['classifier'] = classifier
results['algorithm'] = algorithm results['algorithm'] = algorithm
results['success'] = True results['success'] = True
except: except Exception as ex:
msg = "feature extraction and model creation failed." msg = "feature extraction and model creation failed."
results['errors'].append(msg) results['errors'].append(msg)
log.exception(msg) log.exception(msg)
log.exception(ex)
return results return results
...@@ -153,7 +154,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi ...@@ -153,7 +154,7 @@ def create_generic(numeric_values, textual_values, target, algorithm=util_functi
# Gets the features and classifiers from the essay set and computes the error # Gets the features and classifiers from the essay set and computes the error
try: try:
feature_ext, classifier, cv_error_results = \ feature_ext, classifier, cv_error_results = \
model_creator.extract_features_and_generate_model_predictors(predictor, algorithm) model_creator.extract_features_and_generate_model_from_predictors(predictor, algorithm)
results['cv_kappa'] = cv_error_results['kappa'] results['cv_kappa'] = cv_error_results['kappa']
results['cv_mean_absolute_error'] = cv_error_results['mae'] results['cv_mean_absolute_error'] = cv_error_results['mae']
results['feature_ext'] = feature_ext results['feature_ext'] = feature_ext
......
...@@ -25,9 +25,18 @@ log = logging.getLogger() ...@@ -25,9 +25,18 @@ log = logging.getLogger()
def read_in_test_data(filename): def read_in_test_data(filename):
""" """
Reads in test data file found at filename. Reads in tab delimited test data file found at filename for training purposes.
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
Args:
filename (str): The path to the data
Return:
Tuple of the form (score, text), where:
The former is the list of scores assigned to the essays in the file (int)
The latter is the list of essays in the file
""" """
tid, e_set, score, score2, text = [], [], [], [], [] tid, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read() combined_raw = open(filename).read()
...@@ -45,8 +54,13 @@ def read_in_test_data(filename): ...@@ -45,8 +54,13 @@ def read_in_test_data(filename):
def read_in_test_prompt(filename): def read_in_test_prompt(filename):
""" """
Reads in the prompt from a text file Reads in the prompt from a file.
Returns string
Args:
filename (str): the name of the file
Returns:
(str): the prompt as a string.
""" """
prompt_string = open(filename).read() prompt_string = open(filename).read()
return prompt_string return prompt_string
...@@ -55,10 +69,16 @@ def read_in_test_prompt(filename): ...@@ -55,10 +69,16 @@ def read_in_test_prompt(filename):
def read_in_test_data_twocolumn(filename, sep=","): def read_in_test_data_twocolumn(filename, sep=","):
""" """
Reads in a two column version of the test data. Reads in a two column version of the test data.
Filename must point to a delimited file.
In filename, the first column should be integer score data. In filename, the first column should be integer score data.
The second column should be string text data. The second column should be string text data.
Sep specifies the type of separator between fields. Sep specifies the type of separator between fields.
Return:
Tuple of the form (score, text), where:
The former is the list of scores assigned to the essays in the file (int)
The latter is the list of essays in the file
""" """
score, text = [], [] score, text = [], []
combined_raw = open(filename).read() combined_raw = open(filename).read()
...@@ -132,8 +152,7 @@ def get_algorithms(algorithm): ...@@ -132,8 +152,7 @@ def get_algorithms(algorithm):
max_depth=4, random_state=1, min_samples_leaf=3) max_depth=4, random_state=1, min_samples_leaf=3)
return clf, clf2 return clf, clf2
#TODO RENAME train_from_predictors def extract_features_and_generate_model_from_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
""" """
Extracts features and generates predictors based on a given predictor set Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data predictor_set - a PredictorSet object that has been initialized with data
...@@ -162,40 +181,49 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util ...@@ -162,40 +181,49 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
return f, clf, cv_error_results return f, clf, cv_error_results
def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression): def extract_features_and_generate_model(essays):
""" """
Feed in an essay set to get feature vector and classifier Feed in an essay set to get feature vector and classifier
essays must be an essay set object
additional array is an optional argument that can specify Args:
a numpy array of values to add in essays (EssaySet): The essay set to construct the feature extractor and model off of
returns a trained FeatureExtractor object and a trained classifier
Returns:
A tuple with the following elements in the following order:
- The Trained Feature extractor
- The Trained Classifier
- Any Cross Validation results
""" """
f = feature_extractor.FeatureExtractor() feat_extractor = feature_extractor.FeatureExtractor(essays)
f.initialize_dictionaries(essays)
train_feats = f.generate_features(essays) features = feat_extractor.generate_features(essays)
set_score = numpy.asarray(essays._score, dtype=numpy.int) set_score = numpy.asarray(essays._score, dtype=numpy.int)
algorithm = create.select_algorithm(set_score) algorithm = create.select_algorithm(set_score)
clf, clf2 = get_algorithms(algorithm) predict_classifier, cv_error_classifier = get_algorithms(algorithm)
cv_error_results = get_cv_error(clf2, train_feats, essays._score) cv_error_results = get_cv_error(cv_error_classifier, features, essays._score)
try: try:
clf.fit(train_feats, set_score) predict_classifier.fit(features, set_score)
except ValueError: except:
log.exception("Not enough classes (0,1,etc) in sample.") log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0] = 1 set_score[0] = 1
set_score[1] = 0 set_score[1] = 0
clf.fit(train_feats, set_score) predict_classifier.fit(features, set_score)
return f, clf, cv_error_results return feat_extractor, predict_classifier, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path): def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
""" """
Writes out a model to a file. Writes out a model to a file.
Args:
prompt_string (str): The prompt for the set of essays
feature_ext (FeatureExtractor): a trained FeatureExtractor Object
classifier : a trained Classifier Object
prompt string is a string containing the prompt prompt string is a string containing the prompt
feature_ext is a trained FeatureExtractor object feature_ext is a trained FeatureExtractor object
classifier is a trained classifier classifier is a trained classifier
......
...@@ -83,7 +83,7 @@ class PredictorSet(object): ...@@ -83,7 +83,7 @@ class PredictorSet(object):
except UnicodeError: except UnicodeError:
raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i])) raise log_error(textual_features,"Textual feature {} could not be decoded.".format(textual_features[i]))
# Create essay sets for textual features if needed # Create essay sets for textual features
# TODO Understand this logic and change it, I don't think it is right. # TODO Understand this logic and change it, I don't think it is right.
if len(self._textual_features) == 0: if len(self._textual_features) == 0:
for i in xrange(0, len(textual_features)): for i in xrange(0, len(textual_features)):
......
# Collection of misc functions needed to support essay_set.py and feature_extractor.py. """
#Requires aspell to be installed and added to the path Collection of misc functions needed to support essay_set.py and feature_extractor.py.
Requires aspell to be installed and added to the path
The cleanup of this file is far beyond the scope of this hackathon --GBW--
"""
from fisher import pvalue from fisher import pvalue
aspell_path = "aspell" aspell_path = "aspell"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment