Generate some documentation

aae59858 · Vik Paruchuri · 2c7214d8 · aae59858 · aae59858 · aae59858
Commit aae59858 authored Feb 26, 2013 by Vik Paruchuri
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 0 deletions

feature_extractor.py
+18 -0

grade.py
+1 -0

model_creator.py
+15 -0

No files found.
--- a/feature_extractor.py
+++ b/feature_extractor.py
@@ -24,6 +24,7 @@ if not base_path.endswith("/"):
 log = logging.getLogger(__name__)
+#Paths to needed data files
 NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
 ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
@@ -43,17 +44,26 @@ class FeatureExtractor(object):
        """
        if(hasattr(e_set, '_type')):
            if(e_set._type == "train"):
+                #normal text (unstemmed) useful words/bigrams
                nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
+                #stemmed and spell corrected vocab useful words/ngrams
                svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
+                #dictionary trained on proper vocab
                self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
+                #dictionary trained on stemmed and spell corrected vocab
                self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
                self.dict_initialized = True
+                #Average spelling errors in set. needed later for spelling detection
                self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
                self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
+                #Gets the number and positions of grammar errors
                good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
                self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
+                #Generate bag of words features
                bag_feats=self.gen_bag_feats(e_set)
+                #Sum over all bag of words features (topical words across the whole essay set)
                f_row_sum=numpy.sum(bag_feats[:,:])
+                #Average index of how "topical" essays are
                self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
                ret = "ok"
            else:
@@ -87,6 +97,9 @@ class FeatureExtractor(object):
    def _get_grammar_errors(self,pos,text,tokens):
        """
        Internal function to get the number of grammar errors in given text
+        pos - part of speech tagged text (list)
+        text - normal text (list)
+        tokens - list of lists of tokenized text
        """
        word_counts = [max(len(t),1) for t in tokens]
        good_pos_tags = []
@@ -123,6 +136,7 @@ class FeatureExtractor(object):
        Generates length based features from an essay set
        Generally an internal function called by gen_feats
        Returns an array of length features
+        e_set - EssaySet object
        """
        text = e_set._text
        lengths = [len(e) for e in text]
@@ -146,6 +160,7 @@ class FeatureExtractor(object):
        Generates bag of words features from an input essay set and trained FeatureExtractor
        Generally called by gen_feats
        Returns an array of features
+        e_set - EssaySet object
        """
        if(hasattr(self, '_stem_dict')):
            sfeats = self._stem_dict.transform(e_set._clean_stem_text)
@@ -159,6 +174,7 @@ class FeatureExtractor(object):
        """
        Generates bag of words, length, and prompt features from an essay set object
        returns an array of features
+        e_set - EssaySet object
        """
        bag_feats = self.gen_bag_feats(e_set)
        length_feats = self.gen_length_feats(e_set)
@@ -173,6 +189,7 @@ class FeatureExtractor(object):
        Generates prompt based features from an essay set object and internal prompt variable.
        Generally called internally by gen_feats
        Returns an array of prompt features
+        e_set - EssaySet object
        """
        prompt_toks = nltk.word_tokenize(e_set._prompt)
        expand_syns = []
@@ -208,6 +225,7 @@ class FeatureExtractor(object):
        features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
        in order to get off topic feedback.
        Returns a list of lists (one list per essay in e_set)
+        e_set - EssaySet object
        """
        #Set ratio to modify thresholds for grammar/spelling errors

--- a/grade.py
+++ b/grade.py
@@ -174,6 +174,7 @@ def get_confidence_value(algorithm,model,grader_feats,score):
    min_score=min(numpy.asarray(score))
    max_score=max(numpy.asarray(score))
    if algorithm == util_functions.AlgorithmTypes.classification:
+        #If classification, predict with probability, which gives you a matrix of confidences per score point
        raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
        #TODO: Normalize confidence somehow here
        confidence=raw_confidence

--- a/model_creator.py
+++ b/model_creator.py
@@ -87,6 +87,12 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
    return x
 def get_cv_error(clf,feats,scores):
+    """
+    Gets cross validated error for a given classifier, set of features, and scores
+    clf - classifier
+    feats - features to feed into the classified and cross validate over
+    scores - scores associated with the features -- feature row 1 associates with score 1, etc.
+    """
    results={'success' : False, 'kappa' : 0, 'mae' : 0}
    try:
        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
@@ -104,6 +110,10 @@ def get_cv_error(clf,feats,scores):
    return results
 def get_algorithms(type):
+    """
+    Gets two classifiers for each type of algorithm, and returns them.  First for predicting, second for cv error.
+    type - one of util_functions.AlgorithmTypes
+    """
    if type == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
            max_depth=4, random_state=1,min_samples_leaf=3)
@@ -118,6 +128,11 @@ def get_algorithms(type):
 def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
+    """
+    Extracts features and generates predictors based on a given predictor set
+    predictor_set - a PredictorSet object that has been initialized with data
+    type - one of util_functions.AlgorithmTypes
+    """
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression