Commit aae59858 by Vik Paruchuri

Generate some documentation

parent 2c7214d8
@@ -24,6 +24,7 @@ if not base_path.endswith("/"):
log = logging.getLogger(__name__)
#Paths to needed data files
NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
@@ -43,17 +44,26 @@ class FeatureExtractor(object):
"""
if(hasattr(e_set, '_type')):
if(e_set._type == "train"):
#Useful words/bigrams from the normal (unstemmed) text
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
#Useful words/ngrams from the stemmed, spell-corrected text
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
#Count dictionary fixed to the normal-text vocabulary
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
#Count dictionary fixed to the stemmed vocabulary
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True
#Average spelling errors in the set; needed later for spelling detection
self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
#Gets the number and positions of grammar errors
good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
#Generate bag of words features
bag_feats=self.gen_bag_feats(e_set)
#Sum over all bag of words features (count of topical words in the set)
f_row_sum=numpy.sum(bag_feats[:,:])
#Average proportion of topical words per character of essay text
self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
ret = "ok"
else:
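The training branch above fixes two CountVectorizer vocabularies and records set-level error statistics. A minimal, hypothetical sketch of the fixed-vocabulary behavior (the essays and vocabulary below are invented; get_vocab is not reproduced here):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Invented essays and vocabulary, standing in for e_set._text and nvocab.
essays = ["the cell membrane controls transport",
          "transport across the membrane is selective"]
vocab = ["membrane", "transport", "cell membrane"]  # unigrams plus a bigram

# ngram_range=(1,2) counts single words and adjacent pairs, but only those
# listed in the supplied vocabulary, so no fitting step is required.
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=vocab)
counts = vectorizer.transform(essays)
print(counts.toarray())  # one row per essay, one column per vocabulary entry
```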
@@ -87,6 +97,9 @@ class FeatureExtractor(object):
def _get_grammar_errors(self,pos,text,tokens):
"""
Internal function to get the number of grammar errors in the given text
pos - part of speech tagged text (list)
text - normal text (list)
tokens - list of lists of tokenized text
"""
word_counts = [max(len(t),1) for t in tokens]
good_pos_tags = []
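The grammar check works by comparing part-of-speech n-grams in the essay against a list of sequences seen in well-formed text (loaded from NGRAM_PATH above). A rough sketch of that idea, with an invented stand-in for the pickled n-gram list:

```python
import nltk  # assumes the punkt and averaged_perceptron_tagger data are installed

# Invented stand-in for the good_pos_ngrams list pickled at NGRAM_PATH.
good_pos_ngrams = {"DT NN VBZ", "NN VBZ DT", "VBZ DT NN"}

def pos_trigram_errors(sentence):
    """Count POS trigrams that never appear in the 'good' list."""
    tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(sentence))]
    trigrams = [" ".join(tags[i:i + 3]) for i in range(len(tags) - 2)]
    bad = [(i, t) for i, t in enumerate(trigrams) if t not in good_pos_ngrams]
    return len(bad), bad

num_errors, positions = pos_trigram_errors("The cat sits on the mat.")
```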
@@ -123,6 +136,7 @@ class FeatureExtractor(object):
Generates length based features from an essay set
Generally an internal function called by gen_feats
Returns an array of length features
e_set - EssaySet object
"""
text = e_set._text
lengths = [len(e) for e in text]
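gen_length_feats reduces each essay to simple surface statistics. A hedged sketch of the kind of row it produces (the exact feature list is assumed, not taken from the elided code):

```python
import numpy

def length_features(texts):
    """One row per essay: characters, words, commas, apostrophes, word length."""
    rows = []
    for t in texts:
        words = t.split()
        rows.append([len(t),                              # character count
                     len(words),                          # word count
                     t.count(","),                        # comma count
                     t.count("'"),                        # apostrophe count
                     len(t) / float(max(len(words), 1))]) # average word length
    return numpy.array(rows)

feats = length_features(["Short essay.", "A longer essay, with a comma."])
```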
@@ -146,6 +160,7 @@ class FeatureExtractor(object):
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
e_set - EssaySet object
"""
if(hasattr(self, '_stem_dict')):
sfeats = self._stem_dict.transform(e_set._clean_stem_text)
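When the extractor has been trained, gen_bag_feats transforms both the stemmed and the normal text through their respective dictionaries and joins the results. A minimal sketch with invented vocabularies standing in for svocab and nvocab:

```python
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

# Invented vocabularies standing in for svocab and nvocab above.
stem_vec = CountVectorizer(ngram_range=(1, 2), vocabulary=["cel", "membran"])
normal_vec = CountVectorizer(ngram_range=(1, 2), vocabulary=["cell", "membrane"])

stemmed = ["the cel membran"]   # plays the role of e_set._clean_stem_text
essays = ["the cell membrane"]  # plays the role of e_set._text

# The bag of words features are the two count matrices placed side by side.
bag_feats = hstack([stem_vec.transform(stemmed), normal_vec.transform(essays)])
```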
@@ -159,6 +174,7 @@ class FeatureExtractor(object):
"""
Generates bag of words, length, and prompt features from an essay set object
Returns an array of features
e_set - EssaySet object
"""
bag_feats = self.gen_bag_feats(e_set)
length_feats = self.gen_length_feats(e_set)
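gen_feats then stacks the three feature groups column-wise so each essay keeps a single feature row. A toy illustration of the concatenation (the group order is assumed):

```python
import numpy

length_feats = numpy.array([[120, 24]])  # from gen_length_feats
prompt_feats = numpy.array([[0.3]])      # from gen_prompt_feats
bag_feats = numpy.array([[1, 0, 2]])     # from gen_bag_feats, densified

all_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
```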
@@ -173,6 +189,7 @@ class FeatureExtractor(object):
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Returns an array of prompt features
e_set - EssaySet object
"""
prompt_toks = nltk.word_tokenize(e_set._prompt)
expand_syns = []
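The prompt features measure how much an essay draws on the prompt's own wording, with the token list expanded through synonyms. A hedged sketch using WordNet (the real expansion logic is elided above):

```python
import nltk
from nltk.corpus import wordnet  # assumes the wordnet nltk data package

prompt = "Explain how plants convert light into energy."
prompt_toks = nltk.word_tokenize(prompt)

# Expand the prompt tokens with their WordNet synonyms.
expanded = set(tok.lower() for tok in prompt_toks)
for tok in prompt_toks:
    for syn in wordnet.synsets(tok):
        expanded.update(lemma.name().lower() for lemma in syn.lemmas())

essay_toks = [t.lower() for t in
              nltk.word_tokenize("Plants use sunlight to make energy.")]
overlap = sum(t in expanded for t in essay_toks) / float(len(essay_toks))
```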
@@ -208,6 +225,7 @@ class FeatureExtractor(object):
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
in order to get off topic feedback.
Returns a list of lists (one list per essay in e_set)
e_set - EssaySet object
"""
#Set ratio to modify thresholds for grammar/spelling errors
......
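The ratio mentioned in the comment above scales the training-set means (_mean_spelling_errors, _grammar_errors_per_character) into feedback thresholds. A hedged sketch of that comparison; the 1.05 modifier is an invented placeholder, not the repository's constant:

```python
MODIFIER_RATIO = 1.05  # invented placeholder for the real threshold modifier

def needs_feedback(essay_error_rate, training_mean, ratio=MODIFIER_RATIO):
    """Flag an essay whose error rate clearly exceeds the training mean."""
    return essay_error_rate > training_mean * ratio

spelling_flag = needs_feedback(essay_error_rate=0.012, training_mean=0.008)
```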
@@ -174,6 +174,7 @@ def get_confidence_value(algorithm,model,grader_feats,score):
min_score=min(numpy.asarray(score))
max_score=max(numpy.asarray(score))
if algorithm == util_functions.AlgorithmTypes.classification:
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
......
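The predict_proba indexing above relies on the classifier's probability columns being the sorted score points, so score - min_score picks out the probability the model assigns to a given score. A small self-contained illustration with invented data:

```python
import numpy
from sklearn.ensemble import GradientBoostingClassifier

X = numpy.array([[0.1], [0.9], [0.5], [0.7]])
y = numpy.array([1, 3, 2, 3])  # invented scores in the range 1..3

model = GradientBoostingClassifier(n_estimators=10).fit(X, y)
probs = model.predict_proba(numpy.array([[0.6]]))  # shape (1, n_score_points)
score, min_score = 2, y.min()
confidence = probs[0, score - min_score]  # probability assigned to score 2
```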
@@ -87,6 +87,12 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classifier and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
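gen_cv_preds is elided here, but the idea is standard cross-validated prediction: score every row with a model that never saw it during training. A hedged sketch using sklearn's cross_val_predict as a stand-in for the utility function:

```python
import numpy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict

feats = numpy.random.rand(30, 4)
scores = numpy.tile([1, 2, 3], 10)  # invented scores, ten of each point

clf = GradientBoostingClassifier(n_estimators=10)
cv_preds = cross_val_predict(clf, feats, scores, cv=3)
mae = numpy.mean(numpy.abs(cv_preds - scores))  # mean absolute error
```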
@@ -104,6 +110,10 @@ def get_cv_error(clf,feats,scores):
return results
def get_algorithms(type):
"""
Gets two classifiers for the given algorithm type and returns them: the first for prediction, the second for cross-validated error.
type - one of util_functions.AlgorithmTypes
"""
if type == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
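Both branches return two identically configured models: one is fit on all of the data for prediction, the other is kept for cross-validation error. A sketch of the classification pair (note that learn_rate in the hunk above is the old sklearn spelling of what is now learning_rate):

```python
import sklearn.ensemble

def make_classification_pair():
    """Two identically configured boosters: one to fit for prediction,
    one reserved for cross-validated error estimation."""
    params = dict(n_estimators=100, learning_rate=0.05, max_depth=4,
                  random_state=1, min_samples_leaf=3)
    clf = sklearn.ensemble.GradientBoostingClassifier(**params)
    cv_clf = sklearn.ensemble.GradientBoostingClassifier(**params)
    return clf, cv_clf
```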
@@ -118,6 +128,11 @@ def get_algorithms(type):
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmTypes
"""
if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
type = util_functions.AlgorithmTypes.regression
......
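The guard at the end of the hunk above silently falls back to regression for any unrecognized algorithm type; a one-line sketch of the same pattern with invented string constants standing in for util_functions.AlgorithmTypes:

```python
def normalize(algorithm_type, valid=("regression", "classification")):
    # Unknown types fall back to regression rather than raising an error.
    return algorithm_type if algorithm_type in valid else "regression"
```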