Commit 4286f7e9 by Vik Paruchuri

Add files back in

parent ade012fc
.idea/
__pycache__/
models/
*.pyc
*~
tests/
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the path to run. numpy, scipy, sklearn, and nltk also need to be installed.
NLTK also requires the treebank maxent tagger and wordnet to be installed. These can be installed through the NLTK downloader (nltk.download()), or programmatically through `python -m nltk.downloader maxent_treebank_pos_tagger wordnet`.
Runnable files:
1. tests/create_test_models.py
Generates test models when used like: `python create_test_models.py train_file prompt_file model_path`. Use `python create_test_models.py train.tsv prompt.txt models/essay_set_1.p` to generate a model using sample data.
2. test_server_code/pyxserver_wsgi.py
Starts a server instance to which answers can be submitted for scoring. Calls grade.py to score responses. Run the server with `gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application`.
3. tests/test.py
Submits test data found in directories within the tests folder to the xserver and displays results. See tests/simple_essay for an example of how to format files. You need payload.json, wrong.txt, and answer.txt to make a test.
Testing:
Run nosetests from the tests directory. Make sure the test server is running first!
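For example (assuming the server from step 2 is started from the directory containing pyxserver_wsgi.py):
`gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application &`
`cd tests && nosetests`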
import os
import sys
import logging
log = logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path):
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
return results
    try:
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        # Without a valid essay set there is nothing to train on, so return early.
        return results
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
return results
    try:
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        # Without a valid predictor set there is nothing to train on, so return early.
        return results
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
\ No newline at end of file
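A minimal usage sketch for create() (the essays, scores, and prompt below are invented; model_path is accepted but not used by the function):

results = create(["first sample essay text", "second, longer sample essay text"],
                 [1, 2], "Describe your favorite book.", "models/sample.p")
if results['success']:
    print results['cv_kappa'], results['cv_mean_absolute_error']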
Old steps (deprecated):
source /opt/edx/bin/activate
sudo apt-get upgrade gcc
sudo pip install numpy
#in ml dir
sudo pip install -r requirements.txt
#in controller dir
sudo pip install -r requirements.txt
mkdir log in both repos
mkdir /opt/wwc/log
touch /opt/wwc/log/edx.log
sudo su makeitso
django-admin syncdb
django-admin migrate
sudo su makeitso
bash
source /opt/edx/bin/activate
cd /opt/wwc/grading-controller
pip install numpy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
sudo apt-get install aspell
python /opt/wwc/grading-controller/manage.py update_users
Copy auth.json and env.json into the grading-controller folder (no idea why this is needed, but django-admin doesn't find them at ENV_ROOT)
chown grading-controller/edx.log to makeitso
make db file (for sqlite)
chown db file to makeitso
For some reason, have to copy auth.json and env.json to both opt/wwc/grading-controller and opt/wwc/ dirs
sudo apt-get install super
pip install MySQL-python
In upstart files, be sure to specify settings file and full python path! (/opt/wwc/grading-controller)
How to run:
/opt/edx/bin/gunicorn --preload -b 127.0.0.1:8000 -w 4 --timeout=300 --pythonpath=/opt/wwc/grading-controller grading_controller.wsgi
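For reference, a minimal upstart job wrapping the command above might look like this (the file name and description are assumptions):
# /etc/init/grading-controller.conf (hypothetical)
description "grading controller"
start on runlevel [2345]
stop on runlevel [016]
respawn
exec /opt/edx/bin/gunicorn --preload -b 127.0.0.1:8000 -w 4 --timeout=300 --pythonpath=/opt/wwc/grading-controller grading_controller.wsgi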
Upstart tasks:
grader
ml_grader
ml_creator
pull_from_xqueue
expire_old
"DATABASES": {
"default": {
"ENGINE": "django.db.backends.mysql",
"NAME": "sandbox_grader",
"USERNAME": "sandbox_grader",
"PORT": "3306",
"PASSWORD": "faarg16ren",
"HOST": "sandbox.rds.edx.org"
}
},
sudo apt-get update
sudo apt-get upgrade gcc
sudo xargs -a apt-packages.txt apt-get install
sudo pip install virtualenv
sudo mkdir /opt/edx
source /opt/edx/bin/activate
cd /opt/wwc/machine-learning
pip install numpy
pip install scipy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
\ No newline at end of file
"""
Defines an essay set object, which encapsulates essays from training and test sets.
Performs spell and grammar checking, tokenization, and stemming.
"""
import numpy
import nltk
import sys
import random
import os
import logging
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object):
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
self._type = type
self._score=[]
self._text=[]
self._id=[]
self._clean_text=[]
self._tokens=[]
self._pos=[]
self._clean_stem_text=[]
self._generated = []
self._prompt = ""
self._spelling_errors=[]
self._markup_text=[]
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
Add new (essay_text,essay_score) pair to the essay set.
essay_text must be a string.
essay_score must be an int.
essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if(len(self._id) > 0):
max_id = max(self._id)
else:
max_id = 0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
essay_text=essay_text.encode('ascii', 'ignore')
if len(essay_text)<5:
essay_text="Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
#Try conversion of types
essay_score=int(essay_score)
essay_text=str(essay_text)
except:
#Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
            # Clean text by removing non digit/word/punctuation characters
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
# Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
# Part of speech tag text
self._pos.append(nltk.pos_tag(self._clean_text[len(self._clean_text) - 1].split(" ")))
self._generated.append(essay_generated)
# Stem spell corrected text
porter = nltk.PorterStemmer()
por_toks = " ".join([porter.stem(w) for w in self._tokens[len(self._tokens) - 1]])
self._clean_stem_text.append(por_toks)
ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score)
else:
raise util_functions.InputError(essay_text, "arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int.")
def update_prompt(self, prompt_text):
"""
Update the default prompt string, which is "".
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
        if isinstance(prompt_text, basestring):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret
def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
dict is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
"""
random.seed(1)
e_toks = nltk.word_tokenize(e_text)
all_syns = []
for word in e_toks:
synonyms = util_functions.get_wordnet_syns(word)
if(len(synonyms) > max_syns):
synonyms = random.sample(synonyms, max_syns)
all_syns.append(synonyms)
new_essays = []
for i in range(0, max_syns):
            syn_toks = list(e_toks)  # copy the token list so substitutions don't mutate the original
for z in range(0, len(e_toks)):
if len(all_syns[z]) > i and (dict == None or e_toks[z] in dict):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
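A minimal usage sketch for EssaySet (sample essays, scores, and prompt are invented):

from essay_set import EssaySet

e_set = EssaySet(type="train")
e_set.add_essay("This is the first sample essay text.", 1)
e_set.add_essay("This is another, somewhat longer sample essay text.", 2)
e_set.update_prompt("Describe your favorite book.")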
__author__ = 'vik'
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
#! /usr/bin/env python
##############################################################################
# The following functions have been taken from the DendroPy library:
##
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.txt" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
this returns the probability of observing `x` items of class M when sampling
`k` times without replacement from the entire population (i.e., {M,N})
p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
"""
# following fails with 'OverflowError: long int too large to convert to
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
assert s <= population
assert population > -1
if s == population:
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this calculates the sum of the probability of this table and all others
more extreme under the null hypothesis that there is no association between
the categories represented by the vertical and horizontal axes.
"""
def probability_of_table(table):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this returns the probability of this table under the null hypothesis of
no association between rows and columns, which was shown by Fisher to be
a hypergeometric distribution:
p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
"""
a = table[0][0]
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
self.table = table
self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
self.min_value = min(self.flat_table)
self.max_value = max(self.flat_table)
def _rotate_cw(self, table):
"""
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
def _min_rotation(self):
"""
Returns copy of self.table such that the smallest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.min_value:
table = self._rotate_cw(table)
return table
def _max_rotation(self):
"""
Returns copy of self.table such that the largest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.max_value:
table = self._rotate_cw(table)
return table
def _sum_left_tail(self):
# left_tail_tables = self._get_left_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
p_vals = self._get_left_tail_probs()
return sum(p_vals)
def _sum_right_tail(self):
# right_tail_tables = self._get_right_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
p_vals = self._get_right_tail_probs()
return sum(p_vals)
def _get_left_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_right_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_left_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
left_tail_tables = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
left_tail_tables.append([list(table[0]), list(table[1])])
return left_tail_tables
def _get_right_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
right_tail_tables = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
right_tail_tables.append([list(table[0]), list(table[1])])
return right_tail_tables
def left_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_left_tail()
def right_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_right_tail()
def two_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
p0 = self.probability_of_table(self.table)
all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
p_vals = []
for p in all_p_vals:
if p <= p0:
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
if __name__ == "__main__":
table = [[12, 5], [29, 2]]
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
\ No newline at end of file
"""
Extracts features from training set and test set essays
"""
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log = logging.getLogger(__name__)
NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
class FeatureExtractor(object):
def __init__(self):
self._good_pos_ngrams = self.get_good_pos_ngrams()
self.dict_initialized = False
self._spell_errors_per_character=0
self._grammar_errors_per_character=0
def initialize_dictionaries(self, e_set, max_feats2 = 200):
"""
Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features
e_set is an input essay set
returns a confirmation of initialization
"""
if(hasattr(e_set, '_type')):
if(e_set._type == "train"):
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True
self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
bag_feats=self.gen_bag_feats(e_set)
f_row_sum=numpy.sum(bag_feats[:,:])
self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
ret = "ok"
else:
raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
else:
raise util_functions.InputError(e_set, "wrong input. need an essay set object")
return ret
def get_good_pos_ngrams(self):
"""
        Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file
"""
if(os.path.isfile(NGRAM_PATH)):
good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
elif os.path.isfile(ESSAY_CORPUS_PATH):
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = util_functions.sub_chars(essay_corpus)
good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
else:
#Hard coded list in case the needed files cannot be found
good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
'. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP',
'TO NNP', 'TO NNP NNP']
return good_pos_ngrams
def _get_grammar_errors(self,pos,text,tokens):
"""
Internal function to get the number of grammar errors in given text
"""
word_counts = [max(len(t),1) for t in tokens]
good_pos_tags = []
min_pos_seq=2
max_pos_seq=4
bad_pos_positions=[]
for i in xrange(0, len(text)):
pos_seq = [tag[1] for tag in pos[i]]
pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
long_pos_ngrams=[z for z in pos_ngrams if z.count(' ')==(max_pos_seq-1)]
bad_pos_tuples=[[z,z+max_pos_seq] for z in xrange(0,len(long_pos_ngrams)) if long_pos_ngrams[z] not in self._good_pos_ngrams]
bad_pos_tuples.sort(key=operator.itemgetter(1))
to_delete=[]
for m in reversed(xrange(len(bad_pos_tuples)-1)):
start, end = bad_pos_tuples[m]
for j in xrange(m+1, len(bad_pos_tuples)):
lstart, lend = bad_pos_tuples[j]
if lstart >= start and lstart <= end:
bad_pos_tuples[m][1]=bad_pos_tuples[j][1]
to_delete.append(j)
fixed_bad_pos_tuples=[bad_pos_tuples[z] for z in xrange(0,len(bad_pos_tuples)) if z not in to_delete]
bad_pos_positions.append(fixed_bad_pos_tuples)
overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
            if (len(pos_ngrams)-len(overlap_ngrams))>0:
                # max() guards against integer division rounding the divisor down to zero
                divisor=max(len(pos_ngrams)/len(pos_seq),1)
            else:
                divisor=1
            good_pos_tags.append((len(pos_ngrams)-len(overlap_ngrams))/divisor)
return good_pos_tags,bad_pos_positions
def gen_length_feats(self, e_set):
"""
Generates length based features from an essay set
Generally an internal function called by gen_feats
Returns an array of length features
"""
text = e_set._text
lengths = [len(e) for e in text]
word_counts = [max(len(t),1) for t in e_set._tokens]
comma_count = [e.count(",") for e in text]
ap_count = [e.count("'") for e in text]
punc_count = [e.count(".") + e.count("?") + e.count("!") for e in text]
chars_per_word = [lengths[m] / float(word_counts[m]) for m in xrange(0, len(text))]
good_pos_tags,bad_pos_positions= self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
good_pos_tag_prop = [good_pos_tags[m] / float(word_counts[m]) for m in xrange(0, len(text))]
length_arr = numpy.array((
lengths, word_counts, comma_count, ap_count, punc_count, chars_per_word, good_pos_tags,
good_pos_tag_prop)).transpose()
return length_arr.copy()
def gen_bag_feats(self, e_set):
"""
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
"""
if(hasattr(self, '_stem_dict')):
sfeats = self._stem_dict.transform(e_set._clean_stem_text)
nfeats = self._normal_dict.transform(e_set._text)
bag_feats = numpy.concatenate((sfeats.toarray(), nfeats.toarray()), axis=1)
else:
raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")
return bag_feats.copy()
def gen_feats(self, e_set):
"""
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
"""
bag_feats = self.gen_bag_feats(e_set)
length_feats = self.gen_length_feats(e_set)
prompt_feats = self.gen_prompt_feats(e_set)
overall_feats = numpy.concatenate((length_feats, prompt_feats, bag_feats), axis=1)
overall_feats = overall_feats.copy()
return overall_feats
def gen_prompt_feats(self, e_set):
"""
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Returns an array of prompt features
"""
prompt_toks = nltk.word_tokenize(e_set._prompt)
expand_syns = []
for word in prompt_toks:
synonyms = util_functions.get_wordnet_syns(word)
expand_syns.append(synonyms)
expand_syns = list(chain.from_iterable(expand_syns))
prompt_overlap = []
prompt_overlap_prop = []
for j in e_set._tokens:
tok_length=len(j)
if(tok_length==0):
tok_length=1
prompt_overlap.append(len([i for i in j if i in prompt_toks]))
prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
expand_overlap = []
expand_overlap_prop = []
for j in e_set._tokens:
tok_length=len(j)
if(tok_length==0):
tok_length=1
expand_overlap.append(len([i for i in j if i in expand_syns]))
expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
return prompt_arr.copy()
def gen_feedback(self, e_set, features=None):
"""
Generate feedback for a given set of essays
e_set - EssaySet object
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
in order to get off topic feedback.
Returns a list of lists (one list per essay in e_set)
"""
#Set ratio to modify thresholds for grammar/spelling errors
modifier_ratio=1.05
#Calc number of grammar and spelling errors per character
set_grammar,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
set_grammar_per_character=[set_grammar[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
set_spell_errors_per_character=[e_set._spelling_errors[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
#Iterate through essays and create a feedback dict for each
all_feedback=[]
for m in xrange(0,len(e_set._text)):
#Be very careful about changing these messages!
individual_feedback={'grammar' : "Grammar: Ok.",
'spelling' : "Spelling: Ok.",
'markup_text' : "",
'grammar_per_char' : set_grammar_per_character[m],
'spelling_per_char' : set_spell_errors_per_character[m],
'too_similar_to_prompt' : False,
}
markup_tokens=e_set._markup_text[m].split(" ")
#This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#disjointed
bad_pos_starts=[z[0] for z in bad_pos_positions[m]]
bad_pos_ends=[z[1]-1 for z in bad_pos_positions[m]]
for z in xrange(0,len(markup_tokens)):
if z in bad_pos_starts:
markup_tokens[z]='<bg>' + markup_tokens[z]
elif z in bad_pos_ends:
markup_tokens[z]=markup_tokens[z] + "</bg>"
if(len(bad_pos_ends)>0 and len(bad_pos_starts)>0 and len(markup_tokens)>1):
if max(bad_pos_ends)>(len(markup_tokens)-1) and max(bad_pos_starts)<(len(markup_tokens)-1):
markup_tokens[len(markup_tokens)-1]+="</bg>"
#Display messages if grammar/spelling errors greater than average in training set
if set_grammar_per_character[m]>(self._grammar_errors_per_character*modifier_ratio):
individual_feedback['grammar']="Grammar: More grammar errors than average."
if set_spell_errors_per_character[m]>(self._spell_errors_per_character*modifier_ratio):
individual_feedback['spelling']="Spelling: More spelling errors than average."
#Test topicality by calculating # of on topic words per character and comparing to the training set
#mean. Requires features to be passed in
if features is not None:
f_row_sum=numpy.sum(features[m,12:])
f_row_prop=f_row_sum/len(e_set._text[m])
if f_row_prop<(self._mean_f_prop/1.5) or len(e_set._text[m])<20:
individual_feedback['topicality']="Topicality: Essay may be off topic."
if(features[m,9]>.6):
individual_feedback['prompt_overlap']="Prompt Overlap: Too much overlap with prompt."
individual_feedback['too_similar_to_prompt']=True
log.debug(features[m,9])
#Create string representation of markup text
markup_string=" ".join(markup_tokens)
individual_feedback['markup_text']=markup_string
all_feedback.append(individual_feedback)
return all_feedback
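A minimal sketch of how FeatureExtractor is used (assumes e_set is a populated "train" EssaySet, as in the sketch above):

from feature_extractor import FeatureExtractor

extractor = FeatureExtractor()
extractor.initialize_dictionaries(e_set)
feats = extractor.gen_feats(e_set)
feedback = extractor.gen_feedback(e_set, feats)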
#Grader called by pyxserver_wsgi.py
#Loads a grader file, which is a dict containing the prompt of the question,
#a feature extractor object, and a trained model.
#Extracts features and runs trained model on the submission to produce a final score.
#Correctness determined by ratio of score to max possible score.
#Requires aspell to be installed and added to the path.
import sys
import pickle
import os
import numpy
import logging
from statsd import statsd
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission):
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=EssaySet(type="test")
#Try to add essays to essay set object
try:
grader_set.add_essay(str(submission),0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
results['score']=int(grader_data['model'].predict(grader_feats)[0])
except :
results['errors'].append("Could not extract features and score essay.")
has_error=True
    #Try to determine confidence level
    #Default bounds in case grader_data has no score list and the block below fails
    min_score = 0
    max_score = 1
    try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
results['success']=True
#Generate short form output--number of problem areas identified in feedback
problem_areas=0
for tag in feedback:
            if tag in ['topicality', 'prompt_overlap', 'spelling', 'grammar']:
problem_areas+=len(feedback[tag])>5
#Add feedback to results
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
if results['score']/float(max_score)<.33:
results['feedback'].update(
{'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
})
else:
#If error, success is False.
results['success']=False
#Count number of successful/unsuccessful gradings
statsd.increment("open_ended_assessment.machine_learning.grader_count",
tags=["success:{0}".format(results['success'])])
return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=predictor_set.PredictorSet(type="test")
#Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
results['score']=grader_data['model'].predict(grader_feats)[0]
except :
results['errors'].append("Could not extract features and score essay.")
has_error=True
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
else:
raw_confidence = grader_data['model'].predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
results['confidence'] = confidence
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
    if not has_error:
        results['success'] = True
    #Count number of successful/unsuccessful gradings (success must be set first)
    statsd.increment("open_ended_assessment.machine_learning.grader_count",
                     tags=["success:{0}".format(results['success'])])
    return results
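A hypothetical driver for grade() (the model path follows the README example; the submission text is invented, and grader_config is unused by the function, so None is passed):

import pickle
from grade import grade

grader_data = pickle.load(open("models/essay_set_1.p", "r"))
results = grade(grader_data, None, "The text of a student response.")
print results['score'], results['success']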
python-pip
python-scipy
python-mysqldb
ipython
nginx
git
redis-server
libmysqlclient-dev
gfortran
libblas3gf
libblas-dev
liblapack3gf
liblapack-dev
libatlas-base-dev
libxml2-dev
libxslt1-dev
libreadline6
libreadline6-dev
build-essential
curl
aspell
python
\ No newline at end of file
#!/usr/bin/env bash
# posix compliant sanity check
if [ -z "$BASH" ] || [ "$BASH" = "/bin/sh" ]; then
echo "Please use the bash interpreter to run this script"
exit 1
fi
error() {
printf '\E[31m'; echo "$@"; printf '\E[0m'
}
output() {
printf '\E[36m'; echo "$@"; printf '\E[0m'
}
### START
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
BREW_FILE=$DIR/"brew-formulas.txt"
APT_PKGS_FILE=$DIR/"apt-packages.txt"
case `uname -s` in
[Ll]inux)
command -v lsb_release &>/dev/null || {
error "Please install lsb-release."
exit 1
}
distro=`lsb_release -cs`
case $distro in
maya|lisa|natty|oneiric|precise|quantal)
output "Installing Ubuntu requirements"
# DEBIAN_FRONTEND=noninteractive is required for silent mysql-server installation
export DEBIAN_FRONTEND=noninteractive
# install packages listed in APT_PKGS_FILE
cat $APT_PKGS_FILE | xargs sudo apt-get -y install
;;
*)
error "Unsupported distribution - $distro"
exit 1
;;
esac
;;
Darwin)
if [[ ! -w /usr/local ]]; then
cat<<EO
You need to be able to write to /usr/local for
the installation of brew and brew packages.
Either make sure the group you are in (most likely 'staff')
can write to that directory or simply execute the following
and re-run the script:
$ sudo chown -R $USER /usr/local
EO
exit 1
fi
output "Installing OSX requirements"
if [[ ! -r $BREW_FILE ]]; then
error "$BREW_FILE does not exist, needed to install brew"
exit 1
fi
# brew errors if the package is already installed
for pkg in $(cat $BREW_FILE); do
grep $pkg <(brew list) &>/dev/null || {
output "Installing $pkg"
brew install $pkg
}
done
# paths where brew likes to install python scripts
PATH=/usr/local/share/python:/usr/local/bin:$PATH
command -v pip &>/dev/null || {
output "Installing pip"
easy_install pip
}
if ! grep -Eq ^1.7 <(virtualenv --version 2>/dev/null); then
output "Installing virtualenv >1.7"
pip install 'virtualenv>1.7' virtualenvwrapper
fi
command -v coffee &>/dev/null || {
output "Installing coffee script"
curl --insecure https://npmjs.org/install.sh | sh
npm install -g coffee-script
}
;;
*)
error "Unsupported platform"
exit 1
;;
esac
boto==2.6.0
coverage==3.5.3
dogstatsd-python==0.2
lxml==3.0.1
mock==0.8.0
nltk==2.0.3
nose==1.2.1
scipy==0.11.0
path.py
pip
pygraphviz==1.1
pylint==0.26.0
pytz==2012h
scikit-learn==0.12.1
#Provides interface functions to create and save models
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
import sklearn.ensemble
from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
def read_in_test_data(filename):
"""
Reads in test data file found at filename.
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
"""
id, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
id.append(int(id1))
text.append(text1)
e_set.append(int(set1))
score.append(int(score1))
score2.append(int(score12))
return score, text
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file
Returns string
"""
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
In filename, the first column should be integer score data.
The second column should be string text data.
Sep specifies the type of separator between fields.
"""
score, text = [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
        score1, text1 = raw_lines[row].strip().split(sep)
text.append(text1)
score.append(int(score1))
return score, text
def create_essay_set(text, score, prompt_string, generate_additional=True):
"""
Creates an essay set from given data.
Text should be a list of strings corresponding to essay text.
Score should be a list of scores where score[n] corresponds to text[n]
Prompt string is just a string containing the essay prompt.
Generate_additional indicates whether to generate additional essays at the minimum score point or not.
"""
x = EssaySet()
for i in xrange(0, len(text)):
x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional == True:
x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i])
x.update_prompt(prompt_string)
return x
def get_cv_error(clf,feats,scores):
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
except ValueError:
#If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
except:
log.exception("Error getting cv error estimates.")
return results
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
    if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
additional array is an optional argument that can specify
a numpy array of values to add in
returns a trained FeatureExtractor object and a trained classifier
"""
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
    if additional_array is not None and isinstance(additional_array, numpy.ndarray):
if(additional_array.shape[0]==train_feats.shape[0]):
train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
try:
set_score = numpy.asarray(essays._score, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
prompt string is a string containing the prompt
feature_ext is a trained FeatureExtractor object
classifier is a trained classifier
    model_path is the path to write the model file out to
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
    pickle.dump(model_file, open(model_path, "w"))
def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
    essay_set=create_essay_set(text,score,prompt)
    feature_ext,clf,cv_error_results=extract_features_and_generate_model(essay_set,additional_array)
    dump_model_to_file(prompt,feature_ext,clf,text,score,model_path)
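A training-flow sketch tying these functions together (file names follow the README sample data):

score, text = read_in_test_data("train.tsv")
prompt = read_in_test_prompt("prompt.txt")
create_essay_set_and_dump_model(text, score, prompt, "models/essay_set_1.p")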
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
self._initialized = False
def initialize_dictionaries(self, p_set):
success = False
if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
if not (p_set._type == "train"):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
div_length=len(p_set._essay_sets)
if div_length==0:
div_length=1
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
success = True
return success
def gen_feats(self, p_set):
if self._initialized!=True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
textual_features = []
for i in xrange(0,len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features)
        log.debug(textual_matrix.shape)
        log.debug(predictor_matrix.shape)
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
return overall_matrix.copy()
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, type = "train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
self._type = type
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self._essay_sets=[]
def add_row(self, numeric_features, textual_features, target):
#Basic input checking
if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value."
log.exception(error_message)
raise util_functions.InputError(target, error_message)
if not isinstance(numeric_features, list):
error_message = "Numeric features are not a list."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if not isinstance(textual_features, list):
error_message = "Textual features are not a list."
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Do some length checking for parameters
if len(self._numeric_features)>0:
numeric_length = len(self._numeric_features[-1])
current_numeric_length = len(numeric_features)
if numeric_length != current_numeric_length:
error_message = "Numeric features are an improper length."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if len(self._textual_features)>0:
textual_length = len(self._textual_features[-1])
current_textual_length = len(textual_features)
if textual_length != current_textual_length:
error_message = "Textual features are an improper length."
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Now check to see if text features and numeric features are individually correct
for i in xrange(0,len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except:
error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
for i in xrange(0,len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except:
error_message = "Textual feature {0} not string.".format(textual_features[i])
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(type=self._type))
#Add numeric and textual features
self._numeric_features.append(numeric_features)
self._textual_features.append(textual_features)
#Add targets
self._target.append(target)
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
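A minimal usage sketch for PredictorSet together with PredictorExtractor (the feature values are invented):

import predictor_set
import predictor_extractor

pset = predictor_set.PredictorSet(type="train")
pset.add_row([1.0, 2.5], ["a first sample text response"], 1)
pset.add_row([0.5, 1.0], ["a second sample text response"], 0)
extractor = predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
feats = extractor.gen_feats(pset)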
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from external_code.fisher import fisher
aspell_path = "aspell"
import re
import sys
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy
from itertools import chain
import math
import nltk
import pickle
import logging
log=logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
regression = "regression"
    classification = "classification"
def create_model_path(model_path):
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
if not model_path.startswith("models"):
model_path = "models" + model_path
if not model_path.endswith(".p"):
model_path+=".p"
return model_path
def sub_chars(string):
"""
Strips illegal characters from a string. Used to sanitize input essays.
Removes all non-punctuation, digit, or letter characters.
Returns sanitized string.
"""
sub_pat = r"[^A-Za-z\.\?!,';:]"
char_pat = r"\."
com_pat = r","
ques_pat = r"\?"
excl_pat = r"!"
sem_pat = r";"
col_pat = r":"
whitespace_pat = r"\s{1,}"
whitespace_comp = re.compile(whitespace_pat)
sub_comp = re.compile(sub_pat)
char_comp = re.compile(char_pat)
com_comp = re.compile(com_pat)
ques_comp = re.compile(ques_pat)
excl_comp = re.compile(excl_pat)
sem_comp = re.compile(sem_pat)
col_comp = re.compile(col_pat)
nstring = sub_comp.sub(" ", string)
nstring = char_comp.sub(" .", nstring)
nstring = com_comp.sub(" ,", nstring)
nstring = ques_comp.sub(" ?", nstring)
nstring = excl_comp.sub(" !", nstring)
nstring = sem_comp.sub(" ;", nstring)
nstring = col_comp.sub(" :", nstring)
nstring = whitespace_comp.sub(" ", nstring)
return nstring
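# For example (quick sanity check):
#   sub_chars("Hello, world!") -> "Hello , world !"
# (punctuation gets a leading space; whitespace runs are collapsed)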
def spell_correct(string):
"""
Uses aspell to spell correct an input string.
Requires aspell to be installed and added to the path.
Returns the spell corrected string if aspell is found, original string if not.
"""
f = open('tmpfile', 'w')
f.write(string)
f_path = os.path.abspath(f.name)
f.close()
try:
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except:
log.exception("Could not find aspell, so could not spell correct!")
return string,0, string
incorrect = p.readlines()
p.close()
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()]
begmatch = re.search(" ", begstring)
begword = begstring[0:begmatch.start()]
sugstring = incorrect[i][match.start() + 2:]
sugmatch = re.search(",", sugstring)
if hasattr(sugmatch, "start"):
sug = sugstring[0:sugmatch.start()]
incorrect_words.append(begword)
correct_spelling.append(sug)
newstring = string
markup_string = string
already_subbed=[]
for i in range(0, len(incorrect_words)):
sub_pat = r"\b" + incorrect_words[i] + r"\b"
sub_comp = re.compile(sub_pat)
newstring = re.sub(sub_comp, correct_spelling[i], newstring)
if incorrect_words[i] not in already_subbed:
markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
already_subbed.append(incorrect_words[i])
return newstring,len(incorrect_words),markup_string
def ngrams(tokens, min_n, max_n):
"""
    Generates ngrams (word sequences of length min_n to max_n) from an input token sequence.
tokens is a list of words.
min_n is the minimum length of an ngram to return.
max_n is the maximum length of an ngram to return.
returns a list of ngrams (words separated by a space)
"""
all_ngrams = list()
n_tokens = len(tokens)
for i in xrange(n_tokens):
for j in xrange(i + min_n, min(n_tokens, i + max_n) + 1):
all_ngrams.append(" ".join(tokens[i:j]))
return all_ngrams
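# For example (quick sanity check):
#   ngrams(["the", "cat", "sat"], 2, 3) -> ["the cat", "the cat sat", "cat sat"]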
def f7(seq):
"""
Makes a list unique
"""
seen = set()
seen_add = seen.add
return [x for x in seq if x not in seen and not seen_add(x)]
def count_list(the_list):
"""
Generates a count of the number of times each unique item appears in a list
"""
count = the_list.count
result = [(item, count(item)) for item in set(the_list)]
result.sort()
return result
def regenerate_good_tokens(string):
"""
Given an input string, part of speech tags the string, then generates a list of
ngrams that appear in the string.
Used to define grammatically correct part of speech tag sequences.
Returns a list of part of speech tag sequences.
"""
toks = nltk.word_tokenize(string)
pos_string = nltk.pos_tag(toks)
pos_seq = [tag[1] for tag in pos_string]
pos_ngrams = ngrams(pos_seq, 2, 4)
sel_pos_ngrams = f7(pos_ngrams)
return sel_pos_ngrams
def get_vocab(text, score, max_feats=750, max_feats2=200):
"""
Uses a fisher test to find words that are significant in that they separate
high scoring essays from low scoring essays.
text is a list of input essays.
score is a list of scores, with score[n] corresponding to text[n]
max_feats is the maximum number of features to consider in the first pass
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
dict_mat = dict.fit_transform(text)
set_score = numpy.asarray(score, dtype=numpy.int)
med_score = numpy.median(set_score)
new_score = set_score
if(med_score == 0):
med_score = 1
new_score[set_score < med_score] = 0
new_score[set_score >= med_score] = 1
fish_vals = []
for col_num in range(0, dict_mat.shape[1]):
loop_vec = dict_mat.getcol(col_num).toarray()
good_loop_vec = loop_vec[new_score == 1]
bad_loop_vec = loop_vec[new_score == 0]
good_loop_present = len(good_loop_vec[good_loop_vec > 0])
good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
fish_val = fisher.FishersExactTest.probability_of_table(
[[good_loop_present, bad_loop_present], [good_loop_missing, bad_loop_missing]])
fish_vals.append(fish_val)
cutoff = 1
if(len(fish_vals) > max_feats2):
cutoff = sorted(fish_vals)[max_feats2]
good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
getVar = lambda searchList, ind: [searchList[i] for i in ind]
vocab = getVar(dict.get_feature_names(), good_cols)
return vocab
def edit_distance(s1, s2):
"""
Calculates string edit distance between string 1 and string 2.
Deletion, insertion, substitution, and transposition all increase edit distance.
"""
d = {}
lenstr1 = len(s1)
lenstr2 = len(s2)
for i in xrange(-1, lenstr1 + 1):
d[(i, -1)] = i + 1
for j in xrange(-1, lenstr2 + 1):
d[(-1, j)] = j + 1
for i in xrange(lenstr1):
for j in xrange(lenstr2):
if s1[i] == s2[j]:
cost = 0
else:
cost = 1
d[(i, j)] = min(
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
)
if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
return d[lenstr1 - 1, lenstr2 - 1]
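# For example (quick sanity checks):
#   edit_distance("cat", "hat") -> 1  (one substitution)
#   edit_distance("ab", "ba") -> 1    (one transposition)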
class Error(Exception):
pass
class InputError(Error):
def __init__(self, expr, msg):
self.expr = expr
self.msg = msg
def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
"""
Generates cross validated predictions using an input classifier and data.
    clf is a classifier that implements the fit and predict methods.
arr is the input data array (X)
sel_score is the target list (y). y[n] corresponds to X[n,:]
num_chunks is the number of cross validation folds to use
Returns an array of the predictions where prediction[n] corresponds to X[n,:]
"""
cv_len = int(math.floor(len(sel_score) / num_chunks))
chunks = []
for i in range(0, num_chunks):
range_min = i * cv_len
range_max = ((i + 1) * cv_len)
if i == num_chunks - 1:
range_max = len(sel_score)
chunks.append(range(range_min, range_max))
preds = []
set_score = numpy.asarray(sel_score, dtype=numpy.int)
chunk_vec = numpy.asarray(range(0, len(chunks)))
for i in xrange(0, len(chunks)):
loop_inds = list(
chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
preds.append(list(sim_fit.predict(arr[chunks[i]])))
all_preds = list(chain(*preds))
return(all_preds)
def gen_model(clf, arr, sel_score):
"""
Fits a classifier to data and a target score
clf is an input classifier that implements the fit method.
arr is a data array(X)
sel_score is the target list (y) where y[n] corresponds to X[n,:]
sim_fit is not a useful return value. Instead the clf is the useful output.
"""
set_score = numpy.asarray(sel_score, dtype=numpy.int)
sim_fit = clf.fit(arr, set_score)
return(sim_fit)
def gen_preds(clf, arr):
"""
Generates predictions on a novel data array using a fit classifier
clf is a classifier that has already been fit
arr is a data array identical in dimension to the array clf was trained on
Returns the array of predictions.
"""
if(hasattr(clf, "predict_proba")):
ret = clf.predict(arr)
# pred_score=preds.argmax(1)+min(x._score)
else:
ret = clf.predict(arr)
return ret
def calc_list_average(l):
"""
Calculates the average value of a list of numbers
Returns a float
"""
total = 0.0
for value in l:
total += value
return total / len(l)
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Calculates kappa correlation between rater_a and rater_b.
Kappa measures how well 2 quantities vary together.
rater_a is a list of rater a scores
rater_b is a list of rater b scores
min_rating is an optional argument describing the minimum rating possible on the data set
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
min_rating = min(rater_a + rater_b)
if max_rating is None:
max_rating = max(rater_a + rater_b)
conf_mat = confusion_matrix(rater_a, rater_b,
min_rating, max_rating)
num_ratings = len(conf_mat)
num_scored_items = float(len(rater_a))
hist_rater_a = histogram(rater_a, min_rating, max_rating)
hist_rater_b = histogram(rater_b, min_rating, max_rating)
numerator = 0.0
denominator = 0.0
if(num_ratings > 1):
for i in range(num_ratings):
for j in range(num_ratings):
expected_count = (hist_rater_a[i] * hist_rater_b[j]
/ num_scored_items)
d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
numerator += d * conf_mat[i][j] / num_scored_items
denominator += d * expected_count / num_scored_items
return 1.0 - numerator / denominator
else:
return 1.0
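# Quick sanity check: identical rating lists give perfect agreement.
#   quadratic_weighted_kappa([1, 2, 3, 3], [1, 2, 3, 3]) -> 1.0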
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
"""
Generates a confusion matrix between rater_a and rater_b
A confusion matrix shows how often 2 values agree and disagree
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
if min_rating is None:
min_rating = min(rater_a)
if max_rating is None:
max_rating = max(rater_a)
num_ratings = int(max_rating - min_rating + 1)
conf_mat = [[0 for i in range(num_ratings)]
for j in range(num_ratings)]
for a, b in zip(rater_a, rater_b):
conf_mat[a - min_rating][b - min_rating] += 1
return conf_mat
def histogram(ratings, min_rating=None, max_rating=None):
"""
Generates a frequency count of each rating on the scale
ratings is a list of scores
Returns a list of frequencies
"""
if min_rating is None:
min_rating = min(ratings)
if max_rating is None:
max_rating = max(ratings)
num_ratings = int(max_rating - min_rating + 1)
hist_ratings = [0 for x in range(num_ratings)]
for r in ratings:
hist_ratings[r - min_rating] += 1
return hist_ratings
def get_wordnet_syns(word):
"""
Utilize wordnet (installed with nltk) to get synonyms for words
word is the input word
returns a list of unique synonyms
"""
synonyms = []
regex = r"_"
pat = re.compile(regex)
synset = nltk.wordnet.wordnet.synsets(word)
for ss in synset:
for swords in ss.lemma_names:
synonyms.append(pat.sub(" ", swords.lower()))
synonyms = f7(synonyms)
return synonyms
def get_separator_words(toks1):
"""
Finds the words that separate a list of tokens from a background corpus
Basically this generates a list of informative/interesting words in a set
toks1 is a list of words
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
else:
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = sub_chars(essay_corpus)
toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
sep_words = []
for word in tab_toks1.keys():
tok1_present = tab_toks1[word]
if(tok1_present > 2):
tok1_total = tab_toks1._N
tok2_present = toks2[word]
tok2_total = toks2._N
fish_val = fisher.FishersExactTest.probability_of_table(
[[tok1_present, tok2_present], [tok1_total, tok2_total]])
if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
sep_words.append(word)
sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
return sep_words
def encode_plus(s):
"""
Literally encodes the plus sign
input is a string
returns the string with plus signs encoded
"""
regex = r"\+"
pat = re.compile(regex)
return pat.sub("%2B", s)
def getMedian(numericValues):
"""
Gets the median of a list of values
Returns a float/int
"""
theValues = sorted(numericValues)
if len(theValues) % 2 == 1:
return theValues[(len(theValues) + 1) / 2 - 1]
else:
lower = theValues[len(theValues) / 2 - 1]
upper = theValues[len(theValues) / 2]
return (float(lower + upper)) / 2
\ No newline at end of file