Commit 51d33f29 by VikParuchuri

Merge pull request #9 from MITx/vik/deployment_work

Vik/deployment work
parents ed930658 18fdc2ab
@@ -3,3 +3,4 @@ __pycache__/
models/
*.pyc
*~
tests/
aspell
\ No newline at end of file
"""
Functions that create a machine learning model from training data
"""
import os
import sys
import logging
log = logging.getLogger(__name__)
from statsd import statsd
import numpy
#Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
#Import modules that are dependent on the base path
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
#Make a log
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path):
def create(text,score,prompt_string):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
"""
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""}
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
'score' : score, 'text' : text, 'prompt' : prompt_string}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
@@ -28,18 +44,30 @@ def create(text,score,prompt_string,model_path):
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
try:
if len(util_functions.f7(list(score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
except:
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
except:
msg = "essay set creation failed."
results['errors'].append(msg)
log.exception(msg)
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['algorithm'] = type
results['success']=True
except:
msg = "feature extraction and model creation failed."
@@ -53,7 +81,17 @@ def create(text,score,prompt_string,model_path):
return results
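#Hedged usage sketch (editorial addition, not part of this commit). The essay
#texts, scores, and prompt below are invented placeholders; in practice they
#come from a training set of human-graded essays.
sample_text = ["The experiment needed a control group.", "i dont know"]
sample_score = [2, 0]
sample_prompt = "Describe how you would replicate the experiment."
model_results = create(sample_text, sample_score, sample_prompt)
if model_results['success']:
    print model_results['cv_kappa'], model_results['cv_mean_absolute_error']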
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
algorithm - the type of algorithm that will be used
"""
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
@@ -64,6 +102,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
return results
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -73,6 +112,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
log.exception(msg)
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
......
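#Hedged usage sketch (editorial addition): each numeric row pairs with the
#textual row at the same index, and target holds one integer score per row.
#All values below are invented.
numeric_values = [[1, 5], [2, 3], [4, 1]]
textual_values = [["first short answer"], ["second short answer"], ["third short answer"]]
target = [0, 1, 2]
generic_results = create_generic(numeric_values, textual_values, target,
                                 algorithm=util_functions.AlgorithmTypes.regression)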
cv_pred actual
\ No newline at end of file
sudo apt-get update
sudo apt-get upgrade gcc
sudo xargs -a apt-packages.txt apt-get install
sudo pip install virtualenv
sudo mkdir /opt/edx
sudo virtualenv /opt/edx
source /opt/edx/bin/activate
cd /opt/wwc/machine-learning
pip install numpy
pip install scipy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
\ No newline at end of file
@@ -77,7 +77,10 @@ class EssaySet(object):
self._id.append(max_id + 1)
self._score.append(essay_score)
# Clean text by removing non digit/word/punctuation characters
essay_text=str(essay_text.encode('ascii', 'ignore'))
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
......
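#Editorial note on the try/except above: in Python 2, calling
#.encode('ascii', 'ignore') on a byte string implicitly decodes it as ASCII
#first, so non-ASCII bytes raise UnicodeDecodeError. A minimal sketch:
byte_text = 'caf\xc3\xa9'  #utf-8 bytes for "cafe" with an accented e
#byte_text.encode('ascii', 'ignore') would raise UnicodeDecodeError here
ascii_text = byte_text.decode('utf-8', 'replace').encode('ascii', 'ignore')  #-> 'caf'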
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
@@ -24,6 +24,9 @@ if not base_path.endswith("/"):
log = logging.getLogger(__name__)
#Paths to needed data files
NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
class FeatureExtractor(object):
def __init__(self):
@@ -41,17 +44,26 @@ class FeatureExtractor(object):
"""
if(hasattr(e_set, '_type')):
if(e_set._type == "train"):
#normal text (unstemmed) useful words/bigrams
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
#stemmed and spell corrected vocab useful words/ngrams
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
#dictionary trained on proper vocab
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
#dictionary trained on proper vocab
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True
#Average spelling errors in set, needed later for spelling detection
self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
#Gets the number and positions of grammar errors
good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
#Generate bag of words features
bag_feats=self.gen_bag_feats(e_set)
#Sum of a row of bag of words features (topical words in an essay)
f_row_sum=numpy.sum(bag_feats[:,:])
#Average index of how "topical" essays are
self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
ret = "ok"
else:
@@ -65,13 +77,13 @@ class FeatureExtractor(object):
Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file
"""
if(os.path.isfile(base_path + "good_pos_ngrams.p")):
good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
elif os.path.isfile(base_path + "essaycorpus.txt"):
essay_corpus = open(base_path + "essaycorpus.txt").read()
if(os.path.isfile(NGRAM_PATH)):
good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
elif os.path.isfile(ESSAY_CORPUS_PATH):
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = util_functions.sub_chars(essay_corpus)
good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
else:
#Hard coded list in case the needed files cannot be found
good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
@@ -85,6 +97,9 @@ class FeatureExtractor(object):
def _get_grammar_errors(self,pos,text,tokens):
"""
Internal function to get the number of grammar errors in given text
pos - part of speech tagged text (list)
text - normal text (list)
tokens - list of lists of tokenized text
"""
word_counts = [max(len(t),1) for t in tokens]
good_pos_tags = []
@@ -121,6 +136,7 @@ class FeatureExtractor(object):
Generates length based features from an essay set
Generally an internal function called by gen_feats
Returns an array of length features
e_set - EssaySet object
"""
text = e_set._text
lengths = [len(e) for e in text]
@@ -144,6 +160,7 @@ class FeatureExtractor(object):
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
e_set - EssaySet object
"""
if(hasattr(self, '_stem_dict')):
sfeats = self._stem_dict.transform(e_set._clean_stem_text)
@@ -157,6 +174,7 @@ class FeatureExtractor(object):
"""
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
e_set - EssaySet object
"""
bag_feats = self.gen_bag_feats(e_set)
length_feats = self.gen_length_feats(e_set)
@@ -171,6 +189,7 @@ class FeatureExtractor(object):
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Returns an array of prompt features
e_set - EssaySet object
"""
prompt_toks = nltk.word_tokenize(e_set._prompt)
expand_syns = []
@@ -206,6 +225,7 @@ class FeatureExtractor(object):
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
in order to get off topic feedback.
Returns a list of lists (one list per essay in e_set)
e_set - EssaySet object
"""
#Set ratio to modify thresholds for grammar/spelling errors
@@ -220,9 +240,9 @@ class FeatureExtractor(object):
all_feedback=[]
for m in xrange(0,len(e_set._text)):
#Be very careful about changing these messages!
individual_feedback={'grammar' : "Grammar: Ok.", 'spelling' : "Spelling: Ok.",
'topicality' : "Topicality: Ok.", 'markup_text' : "",
'prompt_overlap' : "Prompt Overlap: Ok.",
individual_feedback={'grammar' : "Grammar: Ok.",
'spelling' : "Spelling: Ok.",
'markup_text' : "",
'grammar_per_char' : set_grammar_per_character[m],
'spelling_per_char' : set_spell_errors_per_character[m],
'too_similar_to_prompt' : False,
......
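#Hedged end-to-end sketch of the extractor above (editorial addition; the
#essays are invented, and EssaySet is assumed importable from essay_set as in
#this repo's test scripts).
train_set = essay_set.EssaySet(type="train")
train_set.add_essay("The experiment needed a control group.", 2)
train_set.add_essay("i dont know", 0)
train_set.update_prompt("Describe the experiment.")
extractor = FeatureExtractor()
extractor.initialize_dictionaries(train_set)
train_feats = extractor.gen_feats(train_set)  #bag of words + length + prompt features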
#Grader called by pyxserver_wsgi.py
#Loads a grader file, which is a dict containing the prompt of the question,
#a feature extractor object, and a trained model.
#Extracts features and runs trained model on the submission to produce a final score.
#Correctness determined by ratio of score to max possible score.
#Requires aspell to be installed and added to the path.
"""
Functions to score specified data using specified ML models
"""
import sys
import pickle
@@ -12,9 +9,11 @@ import numpy
import logging
from statsd import statsd
#Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
#Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
@@ -28,18 +27,31 @@ import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission):
def grade(grader_data,submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
{
'model' : trained model,
'extractor' : trained feature extractor,
'prompt' : prompt for the question,
'algorithm' : algorithm for the question,
}
submission - The student submission (string)
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=EssaySet(type="test")
#Try to add essays to essay set object
#This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
@@ -57,17 +69,14 @@ def grade(grader_data,grader_config,submission):
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
@@ -75,24 +84,23 @@ def grade(grader_data,grader_config,submission):
results['success']=True
#Generate short form output--number of problem areas identified in feedback
problem_areas=0
for tag in feedback:
if tag in ['topicality', 'prompt-overlap', 'spelling', 'grammar']:
problem_areas+=len(feedback[tag])>5
#Add feedback to results
results['feedback']={
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
}
if results['score']/float(max_score)<.33:
results['feedback'].update(
{'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
#Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
@@ -103,7 +111,17 @@ def grade(grader_data,grader_config,submission):
return results
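#Hedged usage sketch (editorial addition): grader_data mirrors the dictionary
#documented in the grade() docstring. trained_model and trained_extractor are
#placeholders for objects produced by create(), typically unpickled from a
#stored model file.
grader_data = {
    'model': trained_model,
    'extractor': trained_extractor,
    'prompt': "Describe how you would replicate the experiment.",
    'algorithm': util_functions.AlgorithmTypes.classification,
    'score': [0, 1, 2, 3],  #training-set scores, used for the confidence range
}
grade_results = grade(grader_data, "The experiment needed a control group.")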
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
grader_data -- dictionary containing:
{
'algorithm' - Type of algorithm to use to score
}
numeric_features - list of numeric features to predict on
textual_features - list of textual features to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
@@ -129,16 +147,7 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
else:
raw_confidence = grader_data['model'].predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
results['confidence'] = confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
@@ -151,3 +160,25 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
results['success'] = True
return results
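#Hedged usage sketch (editorial addition): the feature lists are invented, and
#grader_data is assumed to also carry the trained generic 'model' and
#'extractor' alongside the 'algorithm' key documented above.
generic_grader_data = {
    'model': trained_model,          #placeholder trained model
    'extractor': trained_extractor,  #placeholder trained PredictorExtractor
    'algorithm': util_functions.AlgorithmTypes.regression,
    'score': [0, 1, 2],              #training-set scores
}
generic_results = grade_generic(generic_grader_data, [1, 5], ["a short textual answer"])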
def get_confidence_value(algorithm,model,grader_feats,score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
model - a trained model
grader_feats - a row of features used by the model for classification/regression
score - The score assigned to the submission by a prior model
scores - a list of the scores (from the training set) that the model can assign
"""
min_score=min(numpy.asarray(scores))
max_score=max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification:
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
else:
raw_confidence = model.predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
return confidence
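#Editorial worked example of the regression branch above: a raw prediction of
#2.7 yields max(2.7 - 2.0, 3.0 - 2.7) = max(0.7, 0.3) = 0.7, so confidence
#approaches 1.0 as the prediction nears an integer score and bottoms out at
#0.5 midway between two adjacent scores.
raw = 2.7
conf = max(raw - math.floor(raw), math.ceil(raw) - raw)  #0.7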
python-pip
python-scipy
python-mysqldb
ipython
nginx
git
redis-server
libmysqlclient-dev
gfortran
libblas3gf
libblas-dev
liblapack3gf
liblapack-dev
libatlas-base-dev
libxml2-dev
libxslt1-dev
libreadline6
libreadline6-dev
build-essential
curl
aspell
python
\ No newline at end of file
@@ -87,10 +87,16 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classifier and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(cv_preds-scores))
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
@@ -103,15 +109,11 @@ def get_cv_error(clf,feats,scores):
return results
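#Hedged usage sketch (editorial addition): clf2, train_feats, and e_set are
#placeholders for the CV model from get_algorithms() below, the feature matrix
#from a FeatureExtractor, and a trained EssaySet, as wired together in
#extract_features_and_generate_model().
cv_results = get_cv_error(clf2, train_feats, e_set._score)
if cv_results['success']:
    print cv_results['kappa'], cv_results['mae']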
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
def get_algorithms(type):
"""
Gets two models for a given algorithm type and returns them: the first for predicting, the second for cv error.
type - one of util_functions.AlgorithmTypes
"""
if type == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
@@ -122,7 +124,24 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
return clf, clf2
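#Hedged usage note (editorial addition): one call picks both the predictor and
#the model used for cv error. learn_rate is the parameter name in the old
#scikit-learn releases this code targets; modern releases renamed it to
#learning_rate.
clf, clf2 = get_algorithms(util_functions.AlgorithmTypes.classification)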
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmTypes
"""
if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
type = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
@@ -137,7 +156,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None):
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
@@ -149,20 +168,18 @@ def extract_features_and_generate_model(essays,additional_array=None):
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
if(additional_array!=None and type(additional_array)==type(numpy.array([1]))):
if(additional_array.shape[0]==train_feats.shape[0]):
train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
try:
set_score = numpy.asarray(essays._score, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
......
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
@@ -12,6 +16,7 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
@@ -28,6 +33,10 @@ class PredictorExtractor(object):
self._initialized = False
def initialize_dictionaries(self, p_set):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
"""
success = False
if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type."
@@ -43,6 +52,7 @@
if div_length==0:
div_length=1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
@@ -52,6 +62,10 @@
return success
def gen_feats(self, p_set):
"""
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
......
#!/usr/bin/python
#------------------------------------------------------------
# Run me with (may need su privilege for logging):
# gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application
#------------------------------------------------------------
import cgi # for the escape() function
import json
import logging
import os
import os.path
import sys
from time import localtime, strftime
script_dir = os.path.dirname(__file__)
sys.path.append(script_dir)
import settings # Not django, but do something similar
# make sure we can find the grader files
sys.path.append(settings.GRADER_ROOT)
import grade
results_template = """
<div class="test">
<header>Test results</header>
<section>
<div class="shortform">
{status}
</div>
<div class="longform">
{errors}
{results}
</div>
</section>
</div>
"""
results_correct_template = """
<div class="result-output result-correct">
<h4>{short-description}</h4>
<p>{long-description}</p>
<dl>
<dt>Output:</dt>
<dd class="result-actual-output">
<pre>{actual-output}</pre>
</dd>
</dl>
</div>
"""
results_incorrect_template = """
<div class="result-output result-incorrect">
<h4>{short-description}</h4>
<p>{long-description}</p>
<dl>
<dt>Your output:</dt>
<dd class="result-actual-output"><pre>{actual-output}</pre></dd>
<dt>Correct output:</dt>
<dd><pre>{expected-output}</pre></dd>
</dl>
</div>
"""
def format_errors(errors):
esc = cgi.escape
error_string = ''
error_list = [esc(e) for e in errors or []]
if error_list:
items = '\n'.join(['<li><pre>{0}</pre></li>\n'.format(e) for e in error_list])
error_string = '<ul>\n{0}</ul>\n'.format(items)
error_string = '<div class="result-errors">{0}</div>'.format(error_string)
return error_string
def to_dict(result):
# long description may or may not be provided. If not, don't display it.
# TODO: replace with mako template
esc = cgi.escape
if result[1]:
long_desc = '<p>{0}</p>'.format(esc(result[1]))
else:
long_desc = ''
return {'short-description': esc(result[0]),
'long-description': long_desc,
'correct': result[2], # Boolean; don't escape.
'expected-output': esc(result[3]),
'actual-output': esc(result[4])
}
def render_results(results):
output = []
test_results = [to_dict(r) for r in results['tests']]
for result in test_results:
if result['correct']:
template = results_correct_template
else:
template = results_incorrect_template
output.append(template.format(**result))
errors = format_errors(results['errors'])
status = 'INCORRECT'
if errors:
status = 'ERROR'
elif results['correct']:
status = 'CORRECT'
return results_template.format(status=status,
errors=errors,
results=''.join(output))
def do_GET(data):
return "Hey, the time is %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())
def do_POST(data):
# This server expects jobs to be pushed to it from the queue
xpackage = json.loads(data)
body = xpackage['xqueue_body']
# Delivery from the lms
body = json.loads(body)
student_response = body['student_response']
payload = body['grader_payload']
try:
grader_config = json.loads(payload)
except ValueError as err:
# If parsing json fails, erroring is fine--something is wrong in the content.
# However, for debugging, still want to see what the problem is
raise
relative_grader_path = grader_config['grader']
grader_path = os.path.join(settings.GRADER_ROOT, relative_grader_path)
results = grade.grade(grader_path, student_response)
# Make valid JSON message
reply = { 'correct': results['correct'],
'score': results['score'],
'msg': render_results(results) }
return json.dumps(reply)
# Entry point
def application(env, start_response):
# Handle request
method = env['REQUEST_METHOD']
data = env['wsgi.input'].read()
def post_wrapper(data):
try:
return do_POST(data)
except:
return None
handlers = {'GET': do_GET,
'POST': post_wrapper,
}
if method in handlers.keys():
reply = handlers[method](data)
if reply is not None:
start_response('200 OK', [('Content-Type', 'text/html')])
return reply
# If we fell through to here, complain.
start_response('404 Not Found', [('Content-Type', 'text/plain')])
return ''
# Not django (for now), but use the same settings format anyway
import json
import os
from path import path
import sys
ROOT_PATH = path(__file__).dirname()
REPO_PATH = ROOT_PATH
ENV_ROOT = REPO_PATH.dirname()
# DEFAULTS
DEBUG = False
# Must end in '/'
RUN_URL = 'http://127.0.0.1:3031/' # Victor's VM ...
RUN_URL = 'http://sandbox-runserver-001.m.edx.org:8080/'
RUN_URL = 'http://sandbox-runserver.elb.edx.org:80/'
GRADER_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
# AWS
if os.path.isfile(ENV_ROOT / "env.json"):
print "Opening env.json file"
with open(ENV_ROOT / "env.json") as env_file:
ENV_TOKENS = json.load(env_file)
RUN_URL = ENV_TOKENS['RUN_URL']
LOG_DIR = ENV_TOKENS['LOG_DIR']
# Should be absolute path to 6.00 grader dir.
# NOTE: This means we only get one version of 6.00 graders available--has to
# be the same for internal and external class. Not critical -- can always
# use different grader file if want different problems.
GRADER_ROOT = ENV_TOKENS.get('GRADER_ROOT')
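# A hypothetical env.json matching the keys read above (all values are
# invented examples, not taken from any real deployment):
# {
#     "RUN_URL": "http://127.0.0.1:3031/",
#     "LOG_DIR": "/var/log/xserver",
#     "GRADER_ROOT": "/opt/wwc/machine-learning"
# }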
this is an incorrect response
\ No newline at end of file
,vik,vikp,02.11.2012 17:19,file:///home/vik/.config/libreoffice/3;
\ No newline at end of file
This experement didn't have a controle and the grupe didn't do multiple triles. You would also may need to know what tempriture the rome is.
{"grader":"tests/models/essay_set_1.p"}
In order for I for replicate this expirement I woukd need to know what are the reaserching with this expirement what kind of result are being booked at and the mass of each sample at the end of expirment theie results.
<b><fg>In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard&&&&&and many other sample materials that they&;;;& didn't use and would get different results. Also I would also<>>> need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell//////where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.<b><b>
{"grader":"tests/models/essay_set_1.p"}
In order to conduct the experiment, the students would need to know the mass of the marble, the height of the drop, and the air temperature.
"A group of students wrote the following procedure for their investigation. Procedure: 1. Determine the mass of four different samples. 2. Pour vinegar in each of four separate, but identical, containers. 3. Place a sample of one material into one container and label. Repeat with remaining samples, placing a single sample into a single container. 4. After 24 hours, remove the samples from the containers and rinse each sample with distilled water. 5. Allow the samples to sit and dry for 30 minutes. 6. Determine the mass of each sample. The students’ data are recorded in the table below. Sample Starting Mass (g) Ending Mass (g) Difference in Mass (g) Marble 9.8 9.4 –0.4 Limestone 10.4 9.1 –1.3 Wood 11.2 11.2 0.0 Plastic 7.2 7.1 –0.1"
In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard and many other sample materials that they didn't use and would get different results. Also I would also need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.
{"grader":"tests/models/essay_set_1.p"}
this is an incorrect response
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import util_functions
import essay_set
import feature_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME="sa_data.tsv"
all_err=[]
all_kappa=[]
for t_len in [0,50,100,200,300]:
sa_val = file(FILENAME)
scores=[]
texts=[]
lines=sa_val.readlines()
eset=essay_set.EssaySet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
if len(text)>t_len:
scores.append(int(score))
texts.append(text)
eset.add_essay(text,int(score))
#if int(score)==0:
# eset.generate_additional_essays(text,int(score))
extractor=feature_extractor.FeatureExtractor()
extractor.initialize_dictionaries(eset)
train_feats=extractor.gen_feats(eset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
all_err.append(err)
all_kappa.append(kappa)
"""
outfile=open("full_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}".format(cv_preds[i],scores[i]))
"""
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import util_functions
import essay_set
import feature_extractor
import numpy
import math
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
filenames = ['LSQ_W09_60_MLT.tsv',
'LSQ_W10_22_a.tsv',
'LSQ_W11_21_MLT.tsv',
]
for filename in filenames:
base_name = base_path + filename
print base_name
sa_val = file(base_name)
scores=[]
texts=[]
lines=sa_val.readlines()
eset=essay_set.EssaySet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
scores.append(int(score))
texts.append(text)
eset.add_essay(text,int(score))
#if int(score)==0:
# eset.generate_additional_essays(text,int(score))
extractor=feature_extractor.FeatureExtractor()
extractor.initialize_dictionaries(eset)
train_feats=extractor.gen_feats(eset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores, num_chunks = int(math.floor(len(texts)/2)))
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
outfile=open(filename + "_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual\n")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}\n".format(str(cv_preds[i]),str(scores[i])))
outfile.close()
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)
import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME="sa_data.tsv"
t_len = 0 #Minimum text length for a row to be used; undefined in the original script, assumed 0 here
sa_val = file(FILENAME)
scores=[]
texts=[]
lines=sa_val.readlines()
pset = predictor_set.PredictorSet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
if len(text)>t_len:
scores.append(int(score))
texts.append(text)
pset.add_row([1],[text],int(score))
extractor=predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats=extractor.gen_feats(pset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
\ No newline at end of file
#!/usr/bin/env python
"""
Send some test programs to an xserver.
For each dir in the current directory, send the contents of payload.json and each
of the answer*.txt, right*.py and wrong*.txt files.
"""
import argparse
import glob
import json
import os
import os.path
from path import path
import requests
import sys
import time
xserver = 'http://127.0.0.1:3031/'
def send(payload, answer):
"""
Send a grading request to the xserver
"""
body = {'grader_payload': payload,
'student_response': answer}
data = {'xqueue_body': json.dumps(body),
'xqueue_files': ''}
start = time.time()
r = requests.post(xserver, data=json.dumps(data))
end = time.time()
print "Request took %.03f sec" % (end - start)
if r.status_code != requests.codes.ok:
print "Request error:{0},{1},{2}".format(r.headers,payload,answer)
parsed_text=json.loads(r.text)
print("\nAnswer: {0}\nScore: {1} Correct: {2} \nFeedback: {3}"
.format(answer,parsed_text['score'],parsed_text['correct'],
parsed_text['feedback']))
#print "Score:{0} {1}".format(parsed_text['score'],parsed_text['correct'])
return r.text
def check_contains(string, substr):
if not substr in string:
print "ERROR: Expected to be {0}".format(substr)
return False
else:
return True
def check_not_contains(string, substr):
if substr in string:
print "ERROR: Expected to be {0}".format(substr)
return False
else:
return True
def check_right(string):
return check_contains(string, '\"correct\": true')
def check_wrong(string):
return check_contains(string, '\"correct\": false')
def globs(dirname, *patterns):
"""
Produce a sequence of all the files matching any of our patterns in dirname.
"""
for pat in patterns:
for fname in glob.glob(os.path.join(dirname, pat)):
yield fname
def contents(fname):
"""
Return the contents of the file `fname`.
"""
with open(fname) as f:
return f.read()
def check(dirname,type):
"""
Look for payload.json, answer*.py, right*.py, wrong*.py, run tests.
"""
payload_file = os.path.join(dirname, 'payload.json')
if os.path.isfile(payload_file):
payload = contents(payload_file)
print("found payload: " + payload)
else:
graders = list(globs(dirname, 'grade*.py'))
if not graders:
#print "No payload.json or grade*.py in {0}".format(dirname)
return
if len(graders) > 1:
print "More than one grader in {0}".format(dirname)
return
payload = json.dumps({'grader': os.path.abspath(graders[0])})
for name in globs(dirname, 'answer*.txt', 'right*.py'):
#print "Checking correct response from {0}".format(name)
answer = contents(name)
right=check_right(send(payload, answer))
for name in globs(dirname, 'wrong*.txt'):
#print "Checking wrong response from {0}".format(name)
answer = contents(name)
wrong=check_wrong(send(payload, answer))
if(type=="test"):
assert wrong and right
def main(argv):
global xserver
#parser = argparse.ArgumentParser(description="Send dummy requests to a qserver")
#parser.add_argument('server')
#parser.add_argument('root', nargs='?')
#args = parser.parse_args(argv)
#xserver = args.server
if not xserver.endswith('/'):
xserver += '/'
#root = args.root or '.'
root=os.path.dirname( os.path.abspath(__file__ ))
for dirpath, _, _ in os.walk(root):
print("checking" + dirpath)
check(dirpath,"normal")
if __name__=="__main__":
main(sys.argv[1:])
def test_graders():
root=os.path.dirname( os.path.abspath(__file__ ))
for dirpath, _, _ in os.walk(root):
print("checking" + dirpath)
yield check, dirpath, "test"
def test_model_creation():
model_creator_dir=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
# Run with arguments train_file prompt_file model_path to generate a sample model file
import os
import sys
import argparse
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import model_creator
def main(argv):
parser = argparse.ArgumentParser(description="Generate model from test data files")
parser.add_argument('train_file')
parser.add_argument('prompt_file')
parser.add_argument('model_path')
args = parser.parse_args(argv)
score, text = model_creator.read_in_test_data(args.train_file)
prompt_string = model_creator.read_in_test_prompt(args.prompt_file)
print("data read")
e_set = model_creator.create_essay_set(text, score, prompt_string)
print("essay set created")
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
print("features pulled out and model generated")
model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, text, score, args.model_path)
print("model file written")
if __name__ == "__main__":
main(sys.argv[1:])
def test_model_creation():
try:
score, text = model_creator.read_in_test_data("train.tsv")
prompt_string = model_creator.read_in_test_prompt("prompt.txt")
e_set = model_creator.create_essay_set(text, score, prompt_string)
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, text, score, "models/test_model.p") #Placeholder path; the original passed an undefined args.model_path
assert True
except:
assert False
in order to replicate this experiment , we would need to know the temperature of the vinegar as well as how much vinegar to put in . both of these could vary and therefore change the result of the experiment .
\ No newline at end of file
in order for i for replicate this expirement i woukd need to know what are the reaserching with this expirement what kind of result are being booked at and the mass of each sample at the end of expirment theie results . i didn't know what the answer is .
\ No newline at end of file
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from external_code.fisher import fisher
aspell_path = "aspell"
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
import fisher
import numpy
from itertools import chain
import math
import nltk
import random
import pickle
from path import path
import logging
import sys
log=logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
#Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
"""
Defines what types of algorithm can be used
"""
regression = "regression"
classification = "classifiction"
def create_model_path(model_path):
"""
Creates a path to model files
model_path - string
"""
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
if not model_path.startswith("models"):
@@ -36,7 +51,9 @@ def sub_chars(string):
Strips illegal characters from a string. Used to sanitize input essays.
Removes all characters other than letters and basic punctuation (digits and symbols are stripped).
Returns sanitized string.
string - string
"""
#Define replacement patterns
sub_pat = r"[^A-Za-z\.\?!,';:]"
char_pat = r"\."
com_pat = r","
@@ -44,26 +61,18 @@ def sub_chars(string):
excl_pat = r"!"
sem_pat = r";"
col_pat = r":"
whitespace_pat = r"\s{1,}"
whitespace_comp = re.compile(whitespace_pat)
sub_comp = re.compile(sub_pat)
char_comp = re.compile(char_pat)
com_comp = re.compile(com_pat)
ques_comp = re.compile(ques_pat)
excl_comp = re.compile(excl_pat)
sem_comp = re.compile(sem_pat)
col_comp = re.compile(col_pat)
nstring = sub_comp.sub(" ", string)
nstring = char_comp.sub(" .", nstring)
nstring = com_comp.sub(" ,", nstring)
nstring = ques_comp.sub(" ?", nstring)
nstring = excl_comp.sub(" !", nstring)
nstring = sem_comp.sub(" ;", nstring)
nstring = col_comp.sub(" :", nstring)
nstring = whitespace_comp.sub(" ", nstring)
#Replace text. Ordering is very important!
nstring = re.sub(sub_pat, " ", string)
nstring = re.sub(char_pat," .", nstring)
nstring = re.sub(com_pat, " ,", nstring)
nstring = re.sub(ques_pat, " ?", nstring)
nstring = re.sub(excl_pat, " !", nstring)
nstring = re.sub(sem_pat, " ;", nstring)
nstring = re.sub(col_pat, " :", nstring)
nstring = re.sub(whitespace_pat, " ", nstring)
return nstring
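#Editorial worked example: digits and symbols are dropped, kept punctuation is
#padded with a leading space, and runs of whitespace collapse to one space.
#sub_chars("It's 90% effective, right?") -> "It's effective , right ?"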
@@ -72,7 +81,10 @@ def spell_correct(string):
Uses aspell to spell correct an input string.
Requires aspell to be installed and added to the path.
Returns the spell corrected string if aspell is found, original string if not.
string - string
"""
#Create a temp file so that aspell could be used
f = open('tmpfile', 'w')
f.write(string)
f_path = os.path.abspath(f.name)
@@ -81,13 +93,16 @@ def spell_correct(string):
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except:
log.exception("Could not find aspell, so could not spell correct!")
#Return original string if aspell fails
return string,0, string
#Aspell returns a list of incorrect words with the above flags
incorrect = p.readlines()
p.close()
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()]
@@ -101,6 +116,8 @@ def spell_correct(string):
incorrect_words.append(begword)
correct_spelling.append(sug)
#Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
@@ -419,13 +436,13 @@ def get_separator_words(toks1):
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile("essay_cor_tokens.p")):
toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
else:
essay_corpus = open("essaycorpus.txt").read()
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = sub_chars(essay_corpus)
toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
sep_words = []
for word in tab_toks1.keys():
tok1_present = tab_toks1[word]
......
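#Hedged usage sketch (editorial addition): judging from the fallback return
#above, spell_correct appears to return (corrected_string, error_count,
#markup_string). It requires aspell on the path; without aspell the input
#string comes back unchanged.
corrected, num_errors, markup = spell_correct("Ths is a tst sentence")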