Commit aae59858 by Vik Paruchuri

Generate some documentation

parent 2c7214d8
@@ -24,6 +24,7 @@ if not base_path.endswith("/"):
 log = logging.getLogger(__name__)
+#Paths to needed data files
 NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
 ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
@@ -43,17 +44,26 @@ class FeatureExtractor(object):
         """
         if(hasattr(e_set, '_type')):
             if(e_set._type == "train"):
+                #Useful words/bigrams from the normal (unstemmed) text
                 nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
+                #Useful words/ngrams from the stemmed, spell-corrected text
                 svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
+                #Dictionary (vectorizer) trained on the normal vocabulary
                 self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
+                #Dictionary (vectorizer) trained on the stemmed vocabulary
                 self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                 self.dict_initialized = True
+                #Average spelling errors in the set; needed later for spelling detection
                 self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
                 self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
+                #Get the number and positions of grammar errors
                 good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
                 self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
+                #Generate bag of words features
                 bag_feats=self.gen_bag_feats(e_set)
+                #Sum of a row of bag of words features (topical words in an essay)
                 f_row_sum=numpy.sum(bag_feats[:,:])
+                #Average index of how "topical" essays are
                 self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
                 ret = "ok"
             else:
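
Note: `_normal_dict` and `_stem_dict` above are CountVectorizers locked to vocabularies mined from the training set, so later transforms only count those exact words/bigrams. A minimal standalone sketch of that behavior, with a hypothetical vocabulary and text (not from this repo):

#Minimal sketch: a CountVectorizer built with a fixed vocabulary only counts
#the supplied unigrams/bigrams and silently ignores everything else
from sklearn.feature_extraction.text import CountVectorizer

vocab = ["cell", "membrane", "cell membrane"]  #hypothetical mined vocabulary
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=vocab)
counts = vectorizer.transform(["The cell membrane surrounds the cell."])
print(counts.toarray())  #one row per essay, one column per vocabulary entry -> [[2 1 1]]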
@@ -87,6 +97,9 @@ class FeatureExtractor(object):
     def _get_grammar_errors(self,pos,text,tokens):
         """
         Internal function to get the number of grammar errors in given text
+        pos - part of speech tagged text (list)
+        text - normal text (list)
+        tokens - list of lists of tokenized text
         """
         word_counts = [max(len(t),1) for t in tokens]
         good_pos_tags = []
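
The body of `_get_grammar_errors` is mostly elided in this hunk. The general technique -- POS-tag n-grams that never appear in a corpus of well-formed text are flagged as likely errors -- can be sketched roughly as follows, with hypothetical names (cf. good_pos_ngrams.p above):

#Rough sketch of the idea only -- not the repo's implementation
def count_bad_pos_ngrams(pos_tags, good_ngrams, n=2):
    #Form POS n-grams and count those absent from the known-good set
    ngrams = [" ".join(pos_tags[i:i + n]) for i in range(len(pos_tags) - n + 1)]
    return sum(1 for g in ngrams if g not in good_ngrams)

good = {"DT NN", "NN VBZ", "VBZ JJ"}  #hypothetical whitelist
print(count_bad_pos_ngrams(["DT", "NN", "VBZ", "NN"], good))  #-> 1 ("VBZ NN")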
@@ -123,6 +136,7 @@ class FeatureExtractor(object):
         Generates length based features from an essay set
         Generally an internal function called by gen_feats
         Returns an array of length features
+        e_set - EssaySet object
         """
         text = e_set._text
         lengths = [len(e) for e in text]
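
For reference, length features of this kind usually reduce to simple per-essay counts; a hedged, simplified sketch (a hypothetical feature list, not the exact one used here):

#Simplified sketch of per-essay length features
def simple_length_feats(essays):
    feats = []
    for e in essays:
        words = e.split()
        feats.append([len(e),                               #characters
                      len(words),                           #words
                      len(e) / float(max(len(words), 1))])  #average word length
    return feats

print(simple_length_feats(["A short essay.", "A slightly longer essay here."]))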
@@ -146,6 +160,7 @@ class FeatureExtractor(object):
         Generates bag of words features from an input essay set and trained FeatureExtractor
         Generally called by gen_feats
         Returns an array of features
+        e_set - EssaySet object
         """
         if(hasattr(self, '_stem_dict')):
             sfeats = self._stem_dict.transform(e_set._clean_stem_text)
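
Note the `hasattr` guard: `_stem_dict` only exists once the extractor has been trained. A self-contained sketch of the transform-and-stack pattern, with hypothetical vocabularies and text:

#Sketch: transform stemmed and unstemmed text separately, then stack the counts
from sklearn.feature_extraction.text import CountVectorizer
import numpy

stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=["cell", "membran"])    #hypothetical
normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=["cell", "membrane"]) #hypothetical
sfeats = stem_dict.transform(["the cell membran"]).toarray()
nfeats = normal_dict.transform(["The cell membrane."]).toarray()
bag_feats = numpy.concatenate((sfeats, nfeats), axis=1)  #one row per essay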
@@ -159,6 +174,7 @@ class FeatureExtractor(object):
         """
         Generates bag of words, length, and prompt features from an essay set object
         returns an array of features
+        e_set - EssaySet object
         """
         bag_feats = self.gen_bag_feats(e_set)
         length_feats = self.gen_length_feats(e_set)
@@ -173,6 +189,7 @@ class FeatureExtractor(object):
         Generates prompt based features from an essay set object and internal prompt variable.
         Generally called internally by gen_feats
         Returns an array of prompt features
+        e_set - EssaySet object
         """
         prompt_toks = nltk.word_tokenize(e_set._prompt)
         expand_syns = []
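
`gen_prompt_feats` starts by tokenizing the prompt and building a synonym expansion list. A rough sketch of prompt-overlap counting with WordNet (a hypothetical helper, not the repo's code; requires the NLTK WordNet corpus to be downloaded):

#Rough sketch: count essay tokens that hit the prompt vocabulary
#after WordNet synonym expansion
from nltk.corpus import wordnet  #needs nltk.download("wordnet") beforehand

def prompt_overlap(essay_toks, prompt_toks):
    expanded = set(prompt_toks)
    for tok in prompt_toks:
        for syn in wordnet.synsets(tok):
            expanded.update(lemma.name() for lemma in syn.lemmas())
    return sum(1 for t in essay_toks if t in expanded)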
@@ -208,6 +225,7 @@ class FeatureExtractor(object):
         features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
             in order to get off topic feedback.
         Returns a list of lists (one list per essay in e_set)
+        e_set - EssaySet object
         """
         #Set ratio to modify thresholds for grammar/spelling errors
...
@@ -174,6 +174,7 @@ def get_confidence_value(algorithm,model,grader_feats,score):
     min_score=min(numpy.asarray(score))
     max_score=max(numpy.asarray(score))
     if algorithm == util_functions.AlgorithmTypes.classification:
+        #For classification, predict with probabilities, which gives a matrix of confidences per score point
         raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
         #TODO: Normalize confidence somehow here
         confidence=raw_confidence
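
The `predict_proba` indexing above relies on the classifier's classes being consecutive score points, so column `score - min_score` of the probability matrix is the probability assigned to the predicted score. In isolation, with a hypothetical probability matrix:

#Sketch of the confidence lookup
import numpy

proba = numpy.array([[0.1, 0.7, 0.2]])  #one essay, columns are min_score..max_score
min_score, predicted_score = 1, 2
confidence = proba[0, predicted_score - min_score]  #-> 0.7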
...
@@ -87,6 +87,12 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     return x

 def get_cv_error(clf,feats,scores):
+    """
+    Gets cross-validated error for a given classifier, set of features, and scores
+    clf - classifier to use
+    feats - features to feed into the classifier and cross-validate over
+    scores - scores associated with the features -- row 1 of feats corresponds to score 1, etc.
+    """
     results={'success' : False, 'kappa' : 0, 'mae' : 0}
     try:
         cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
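
`gen_cv_preds` and the kappa/mae computation are elided in this hunk; an equivalent hedged sketch using stock scikit-learn (current API names, not the repo's helpers, with hypothetical data):

#Hedged sketch of cross-validated kappa/MAE with stock scikit-learn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
from sklearn.model_selection import cross_val_predict
import numpy

numpy.random.seed(1)
feats = numpy.random.rand(60, 5)         #hypothetical feature matrix
scores = numpy.random.randint(0, 3, 60)  #hypothetical scores, one per row
cv_preds = cross_val_predict(GradientBoostingClassifier(), feats, scores, cv=3)
print(cohen_kappa_score(scores, cv_preds, weights="quadratic"),
      mean_absolute_error(scores, cv_preds))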
@@ -104,6 +110,10 @@ def get_cv_error(clf,feats,scores):
     return results

 def get_algorithms(type):
+    """
+    Gets two classifiers for a given algorithm type and returns them: the first is used to predict, the second to estimate cross-validated error.
+    type - one of util_functions.AlgorithmTypes
+    """
     if type == util_functions.AlgorithmTypes.classification:
         clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
             max_depth=4, random_state=1,min_samples_leaf=3)
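
For reference, a sketch of the two-classifier pattern the docstring describes, written against current scikit-learn (where `learn_rate` has since been renamed `learning_rate`):

#Sketch: identical model twice -- one fit on all data to predict, one refit
#repeatedly during cross-validation to estimate error
from sklearn.ensemble import GradientBoostingClassifier

def make_classifiers():
    params = dict(n_estimators=100, learning_rate=.05,
                  max_depth=4, random_state=1, min_samples_leaf=3)
    return GradientBoostingClassifier(**params), GradientBoostingClassifier(**params)

clf, cv_clf = make_classifiers()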
@@ -118,6 +128,11 @@ def get_algorithms(type):
 def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
+    """
+    Extracts features and generates predictors based on a given predictor set
+    predictor_set - a PredictorSet object that has been initialized with data
+    type - one of util_functions.AlgorithmTypes
+    """
     if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
         type = util_functions.AlgorithmTypes.regression
...