Remove all

ade012fc · Vik Paruchuri · ec214872 · ec214872 · ec214872 · ec214872
Commit ade012fc authored Feb 26, 2013 by Vik Paruchuri
43 changed files
--- a/.gitignore
+++ b/.gitignore
-.idea/
-__pycache__/
-models/
-*.pyc
-*~
-tests/
--- a/Readme.md
+++ b/Readme.md
-Project to integrate machine learning based essay scoring with xserver. Aspell must be installed and added to path to run.  numpy, scipy, sklearn, and nltk also need to be installed.
-
-NLTK also requires the treebank maxent tagger and wordnet to be installed.  These can be installed through the NLTK downloader (`nltk.download()`), or programmatically through `python -m nltk.downloader maxent_treebank_pos_tagger wordnet`.
-
-Runnable files:
-
-1. tests/test_models.py 
-
-	Generates test models when used like: `python create_test_models.py train_file prompt_file model_path`.  Use `python create_test_models.py train.tsv prompt.txt models/essay_set_1.p` to generate a model using sample data.
-
-2. test_server_code/pyxserver_wsgi.py
-
-	Starts a server instance that can be sent answers to score.  Calls grade.py to score responses.  Run server with `gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application` . 
-
-3. tests/test.py
-
-	Submits test data found in directories within the tests folder to the xserver and displays results.  See tests/simple_essay for an example of how to format files.  You need payload.json, wrong.txt, and answer.txt to make a test.
-
-Testing:
-
-Tests can be run by running nosetests in the tests directory.  Make sure the test server is running first! 
--- a/create.py
+++ b/create.py
-import os
-import sys
-import logging
-log = logging.getLogger(__name__)
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
-sys.path.append(one_up_path)
-
-import model_creator
-import util_functions
-import predictor_set
-import predictor_extractor
-
-from statsd import statsd
-
-@statsd.timed('open_ended_assessment.machine_learning.creator.time')
-def create(text,score,prompt_string,model_path):
-    """
-    Build an essay scoring model from parallel lists of essays and scores.
-
-    text - list of essay strings
-    score - list of scores, one per entry in text
-    prompt_string - the prompt the essays were written for
-    model_path - kept for interface compatibility; not read by this function
-
-    Training failures are not raised.  Returns a dict with the keys 'errors'
-    (list of messages), 'success', 'cv_kappa', 'cv_mean_absolute_error',
-    'feature_ext' and 'classifier'.
-    """
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : ""}
-
-    if len(text)!=len(score):
-        msg = "Target and text lists must be same length."
-        results['errors'].append(msg)
-        #No exception is active here, so use log.error instead of log.exception
-        log.error(msg)
-        return results
-
-    try:
-        e_set = model_creator.create_essay_set(text, score, prompt_string)
-    except Exception:
-        msg = "essay set creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-    else:
-        #Only train when the essay set exists; otherwise e_set is undefined
-        try:
-            feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
-            results['cv_kappa']=cv_error_results['kappa']
-            results['cv_mean_absolute_error']=cv_error_results['mae']
-            results['feature_ext']=feature_ext
-            results['classifier']=classifier
-            results['success']=True
-        except Exception:
-            msg = "feature extraction and model creation failed."
-            results['errors'].append(msg)
-            log.exception(msg)
-
-    #Count number of successful/unsuccessful creations
-    statsd.increment("open_ended_assessment.machine_learning.creator_count",
-        tags=["success:{0}".format(results['success'])])
-
-    return results
-
-
-def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
-    """
-    Build a model from generic numeric and textual predictors.
-
-    numeric_values - list of numeric feature rows
-    textual_values - list of textual feature rows, same length as numeric_values
-    target - list of target values, same length as numeric_values
-    model_path - kept for interface compatibility; not read by this function
-    algorithm - a util_functions.AlgorithmTypes value, passed through to model creation
-
-    Training failures are not raised.  Returns a dict with the keys 'errors'
-    (list of messages), 'success', 'cv_kappa', 'cv_mean_absolute_error',
-    'feature_ext', 'classifier' and 'algorithm'.
-    """
-    results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-               'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
-
-    if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
-        msg = "Target, numeric features, and text features must all be the same length."
-        results['errors'].append(msg)
-        #No exception is active here, so use log.error instead of log.exception
-        log.error(msg)
-        return results
-
-    try:
-        pset = predictor_set.PredictorSet(type="train")
-        for i in xrange(0, len(numeric_values)):
-            pset.add_row(numeric_values[i], textual_values[i], target[i])
-    except Exception:
-        msg = "predictor set creation failed."
-        results['errors'].append(msg)
-        log.exception(msg)
-    else:
-        #Only train when the predictor set was built completely
-        try:
-            feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
-            results['cv_kappa']=cv_error_results['kappa']
-            results['cv_mean_absolute_error']=cv_error_results['mae']
-            results['feature_ext']=feature_ext
-            results['classifier']=classifier
-            results['success']=True
-        except Exception:
-            msg = "feature extraction and model creation failed."
-            results['errors'].append(msg)
-            log.exception(msg)
-
-    #Count number of successful/unsuccessful creations
-    statsd.increment("open_ended_assessment.machine_learning.creator_count",
-        tags=["success:{0}".format(results['success'])])
-
-    return results
\ No newline at end of file
--- a/data/essaycorpus.txt
+++ b/data/essaycorpus.txt
--- a/data/good_pos_ngrams.p
+++ b/data/good_pos_ngrams.p
--- a/documentation/deployment_steps.txt
+++ b/documentation/deployment_steps.txt
-Old steps (deprecated):
-
-source opt/edx/bin/activate
-sudo apt-get upgrade gcc
-sudo pip install numpy
-#in ml dir
-sudo pip install -r requirements.txt
-#in controller dir
-sudo pip install -r requirements.txt
-mkdir log in both repos
-mkdir /opt/wwc/log
-touch /opt/wwc/log/edx.log
-sudo su makeitso
-django-admin syncdb
-django-admin migrate
-
-
-
-sudo su makeitso
-bash
-source /opt/edx/bin/activate
-cd /opt/wwc/grading-controller
-pip install numpy
-pip install -r requirements.txt
-cd /opt/wwc/machine-learning
-pip install -r requirements.txt
-python -m nltk.downloader maxent_treebank_pos_tagger wordnet
-sudo mv /path/to/nltk_data /usr/share
-sudo apt-get install aspell
-
-python /opt/wwc/grading-controller/manage.py update_users
-
-Copy auth.json and env.json into the grading-controller folder (no idea why this is needed, but django-admin doesn't find them at ENV_ROOT)
-chown grading-controller/edx.log to makeitso
-
-make db file (for sqlite)
-chown db file to makeitso
-
-For some reason, have to copy auth.json and env.json to both opt/wwc/grading-controller and opt/wwc/ dirs
-
-sudo apt-get install super
-
-pip install MySQL-python
-
-In upstart files, be sure to specify settings file and full python path! (/opt/wwc/grading-controller)
-
-
-How to run:
-/opt/edx/bin/gunicorn --preload -b 127.0.0.1:8000 -w 4 --timeout=300 --pythonpath=/opt/wwc/grading-controller grading_controller.wsgi
-
-
-
-Upstart tasks:
-
-grader
-ml_grader
-ml_creator
-pull_from_xqueue
-expire_old
-
-
-
-
-  "DATABASES": {
-    "default": {
-      "ENGINE": "django.db.backends.mysql",
-      "NAME": "sandbox_grader",
-      "USERNAME": "sandbox_grader",
-      "PORT": "3306",
-      "PASSWORD": "faarg16ren",
-      "HOST": "sandbox.rds.edx.org"
-    }
-  },
-
--- a/documentation/install.txt
+++ b/documentation/install.txt
-sudo apt-get update
-sudo apt-get upgrade gcc
-sudo xargs -a apt-packages.txt apt-get install
-sudo pip install virtualenv
-sudo mkdir /opt/edx
-source /opt/edx/bin/activate
-cd /opt/wwc/machine-learning
-pip install numpy
-pip install scipy
-pip install -r requirements.txt
-cd /opt/wwc/machine-learning
-pip install -r requirements.txt
-python -m nltk.downloader maxent_treebank_pos_tagger wordnet
-sudo mv /path/to/nltk_data /usr/share
\ No newline at end of file
--- a/essay_set.py
+++ b/essay_set.py
-"""
-Defines an essay set object, which encapsulates essays from training and test sets.
-Performs spell and grammar checking, tokenization, and stemming.
-"""
-
-import numpy
-import nltk
-import sys
-import random
-import os
-import logging
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-import util_functions
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-log=logging.getLogger(__name__)
-
-MAXIMUM_ESSAY_LENGTH=20000
-
-class EssaySet(object):
-    """
-    Holds essays, their scores, and the derived forms built when an essay is added:
-    cleaned text, spell corrected text, tokens, part of speech tags and stemmed text.
-    """
-    def __init__(self, type="train"):
-        """
-        Initialize variables and check essay set type.
-        type should be "train" or "test"; any other value falls back to "train".
-        """
-        if(type != "train" and type != "test"):
-            type = "train"
-
-        self._type = type
-        self._score=[]
-        self._text=[]
-        self._id=[]
-        self._clean_text=[]
-        self._tokens=[]
-        self._pos=[]
-        self._clean_stem_text=[]
-        self._generated = []
-        self._prompt = ""
-        self._spelling_errors=[]
-        self._markup_text=[]
-
-    def add_essay(self, essay_text, essay_score, essay_generated=0):
-        """
-        Add new (essay_text,essay_score) pair to the essay set.
-        essay_text must be a string.
-        essay_score must be an int, or convertible to one.
-        essay_generated should not be changed by the user (1 marks a generated essay).
-        Returns a confirmation string "text: <cleaned text> score: <score>".
-        Raises util_functions.InputError when the arguments have the wrong types.
-        """
-        # Get maximum current essay id, or set to 0 if this is the first essay added
-        if(len(self._id) > 0):
-            max_id = max(self._id)
-        else:
-            max_id = 0
-
-        # Drop non-ascii characters; very short essays are replaced by a placeholder
-        try:
-            essay_text=essay_text.encode('ascii', 'ignore')
-            if len(essay_text)<5:
-                essay_text="Invalid essay."
-        except Exception:
-            log.exception("Could not parse essay into ascii.")
-
-        try:
-            #Try conversion of types
-            essay_score=int(essay_score)
-            essay_text=str(essay_text)
-        except Exception:
-            #Nothing needed here, will return error in any case.
-            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
-
-        # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
-        if not (isinstance(essay_score,int) and isinstance(essay_text, basestring)
-                and (essay_generated == 0 or essay_generated == 1)):
-            raise util_functions.InputError(essay_text, "arguments need to be in format "
-                                                        "(text,score). text needs to be string,"
-                                                        " score needs to be int.")
-
-        self._id.append(max_id + 1)
-        self._score.append(essay_score)
-        # Clean text by removing non digit/word/punctuation characters
-        try:
-            essay_text=str(essay_text.encode('ascii', 'ignore'))
-        except Exception:
-            essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
-        cleaned_essay=util_functions.sub_chars(essay_text).lower()
-        if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
-            cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
-        self._text.append(cleaned_essay)
-        # Spell correct text using aspell
-        cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[-1])
-        self._clean_text.append(cleaned_text)
-        self._spelling_errors.append(spell_errors)
-        self._markup_text.append(markup_text)
-        # Tokenize text
-        self._tokens.append(nltk.word_tokenize(self._clean_text[-1]))
-        # Part of speech tag text
-        self._pos.append(nltk.pos_tag(self._clean_text[-1].split(" ")))
-        self._generated.append(essay_generated)
-        # Stem spell corrected text
-        porter = nltk.PorterStemmer()
-        por_toks = " ".join([porter.stem(w) for w in self._tokens[-1]])
-        self._clean_stem_text.append(por_toks)
-
-        # The confirmation was built but never returned before; return it as documented
-        ret = "text: " + self._text[-1] + " score: " + str(essay_score)
-        return ret
-
-    def update_prompt(self, prompt_text):
-        """
-        Update the default prompt string, which is "".
-        prompt_text should be a string.
-        Returns the prompt as a confirmation.
-        Raises util_functions.InputError when prompt_text is not a str.
-        """
-        if(type(prompt_text) == type("text")):
-            self._prompt = util_functions.sub_chars(prompt_text)
-            ret = self._prompt
-        else:
-            raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
-        return ret
-
-    def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
-        """
-        Substitute synonyms to generate extra essays from existing ones.
-        This is done to increase the amount of training data.
-        Should only be used with lowest scoring essays.
-        e_text is the text of the original essay.
-        e_score is the score of the original essay.
-        dict is a fixed dictionary (list) of words to replace.
-        max_syns defines the maximum number of additional essays to generate.  Do not set too high.
-        Each generated essay is added to this set with essay_generated=1.
-        Note that this reseeds the global random module with a fixed seed.
-        """
-        random.seed(1)
-        e_toks = nltk.word_tokenize(e_text)
-        all_syns = []
-        for word in e_toks:
-            synonyms = util_functions.get_wordnet_syns(word)
-            if(len(synonyms) > max_syns):
-                synonyms = random.sample(synonyms, max_syns)
-            all_syns.append(synonyms)
-        new_essays = []
-        for i in range(0, max_syns):
-            # Copy the tokens so every generated essay starts from the original
-            # text; aliasing e_toks carried substitutions over from earlier rounds
-            syn_toks = list(e_toks)
-            for z in range(0, len(e_toks)):
-                if len(all_syns[z]) > i and (dict is None or e_toks[z] in dict):
-                    syn_toks[z] = all_syns[z][i]
-            new_essays.append(" ".join(syn_toks))
-        for new_essay in new_essays:
-            self.add_essay(new_essay, e_score, 1)
\ No newline at end of file
--- a/external_code/__init__.py
+++ b/external_code/__init__.py
-__author__ = 'vik'
--- a/external_code/fisher/LICENSE.txt
+++ b/external_code/fisher/LICENSE.txt
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
--- a/external_code/fisher/__init__.py
+++ b/external_code/fisher/__init__.py
-__author__ = 'vik'
--- a/external_code/fisher/fisher.py
+++ b/external_code/fisher/fisher.py
-#! /usr/bin/env python
-
-##############################################################################
-# Following functions have been taken from the DendroPy library from:
-##
-## DendroPy Phylogenetic Computing Library.
-##
-## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
-## All rights reserved.
-##
-## See "LICENSE.txt" for terms and conditions of usage.
-##
-## If you use this work or any portion thereof in published work,
-## please cite it as:
-##
-## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
-## for phylogenetic computing. Bioinformatics 26: 1569-1571.
-##
-##############################################################################
-
-import math
-
-## From dendropy.mathlib.probability
-def hypergeometric_pmf(x, m, n, k):
-    """
-Probability of seeing exactly `x` items of class M in `k` draws without
-replacement from a population holding `m` items of class M and `n` items
-of class N:
-
-p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
-"""
-    # The ratio is evaluated in log space: dividing the raw coefficients fails
-    # with 'OverflowError: long int too large to convert to float' for large
-    # numbers.
-    log_ways_m = math.log(binomial_coefficient(m, x))
-    log_ways_n = math.log(binomial_coefficient(n, k-x))
-    log_ways_all = math.log(binomial_coefficient(m+n, k))
-    log_p = log_ways_m + log_ways_n - log_ways_all
-    return math.exp(log_p)
-
-## From dendropy.mathlib.probability
-def binomial_coefficient(population, sample):
-    """
-Returns `population` choose `sample` as an exact integer.
-
-Both arguments must be non-negative integers with sample <= population;
-anything else trips the assertions below.
-"""
-    # choose(n, k) == choose(n, n-k); taking the larger of the two shortens the loop
-    s = max(sample, population - sample)
-    assert s <= population
-    assert population > -1
-    if s == population:
-        return 1
-    numerator = 1
-    denominator = 1
-    for i in xrange(s+1, population + 1):
-        numerator *= i
-        denominator *= (i - s)
-    # The quotient is always a whole number; floor division keeps it an exact
-    # int even when true division is in effect
-    return numerator // denominator
-
-## From dendropy.mathlib.statistics
-class FishersExactTest(object):
-    """
-Given a 2x2 table:
-
-+---+---+
-| a | b |
-+---+---+
-| c | d |
-+---+---+
-
-represented by a list of lists::
-
-[[a,b],[c,d]]
-
-this calculates the sum of the probability of this table and all others
-more extreme under the null hypothesis that there is no association between
-the categories represented by the vertical and horizontal axes.
-"""
-
-    def probability_of_table(table):
-        """
-Given a 2x2 table:
-
-+---+---+
-| a | b |
-+---+---+
-| c | d |
-+---+---+
-
-represented by a list of lists::
-
-[[a,b],[c,d]]
-
-this returns the probability of this table under the null hypothesis of
-no association between rows and columns, which was shown by Fisher to be
-a hypergeometric distribution:
-
-p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
-
-"""
-        a = table[0][0]
-        b = table[0][1]
-        c = table[1][0]
-        d = table[1][1]
-        # x = a, first row total, second row total, first column total
-        return hypergeometric_pmf(a, a+b, c+d, a+c)
-    # Pre-decorator spelling of @staticmethod: callable on the class or an instance
-    probability_of_table = staticmethod(probability_of_table)
-
-    def __init__(self, table):
-        # table is the 2x2 list of lists [[a,b],[c,d]]
-        self.table = table
-        # The four cells in row-major order, used to find the extreme values
-        self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
-        self.min_value = min(self.flat_table)
-        self.max_value = max(self.flat_table)
-
-    def _rotate_cw(self, table):
-        """
-Returns a copy of table such that all the values
-are rotated clockwise once.
-"""
-        # [[a,b],[c,d]] becomes [[c,a],[d,b]]
-        return [ [ table[1][0], table[0][0] ],
-                [table[1][1], table[0][1] ] ]
-
-    def _min_rotation(self):
-        """
-Returns copy of self.table such that the smallest value is in the first
-(upper left) cell.
-"""
-        # Copy both rows so that callers can mutate the result freely
-        table = [list(self.table[0]), list(self.table[1])]
-        while table[0][0] != self.min_value:
-            table = self._rotate_cw(table)
-        return table
-
-    def _max_rotation(self):
-        """
-Returns copy of self.table such that the largest value is in the first
-(upper left) cell.
-"""
-        table = [list(self.table[0]), list(self.table[1])]
-        while table[0][0] != self.max_value:
-            table = self._rotate_cw(table)
-        return table
-
-    def _sum_left_tail(self):
-        # Sum of the probabilities of every table in the left tail
-        # (the observed table itself is not included)
-        # left_tail_tables = self._get_left_tail_tables()
-        # p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
-        p_vals = self._get_left_tail_probs()
-        return sum(p_vals)
-
-    def _sum_right_tail(self):
-        # Sum of the probabilities of every table in the right tail
-        # (the observed table itself is not included)
-        # right_tail_tables = self._get_right_tail_tables()
-        # p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
-        p_vals = self._get_right_tail_probs()
-        return sum(p_vals)
-
-    def _get_left_tail_probs(self):
-        # Starts from the rotation with the smallest cell in the upper left and
-        # steps that cell down to 0, holding the row and column totals fixed.
-        # Returns the probability of each table visited.
-        table = self._min_rotation()
-        row_totals = [sum(table[0]), sum(table[1])]
-        col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
-        p_vals = []
-        while True:
-            table[0][0] -= 1
-            if table[0][0] < 0:
-                break
-            # The other three cells follow from the fixed margins
-            table[0][1] = row_totals[0] - table[0][0]
-            table[1][0] = col_totals[0] - table[0][0]
-            table[1][1] = row_totals[1] - table[1][0]
-            p_vals.append(self.probability_of_table(table))
-        return p_vals
-
-    def _get_right_tail_probs(self):
-        # Starts from the same min rotation as the left tail, but steps the
-        # upper left cell up until any other cell would become negative.
-        # Returns the probability of each table visited.
-        table = self._min_rotation()
-        row_totals = [sum(table[0]), sum(table[1])]
-        col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
-        p_vals = []
-        while True:
-            table[0][0] += 1
-            table[0][1] = row_totals[0] - table[0][0]
-            if table[0][1] < 0:
-                break
-            table[1][0] = col_totals[0] - table[0][0]
-            if table[1][0] < 0:
-                break
-            table[1][1] = row_totals[1] - table[1][0]
-            if table[1][1] < 0:
-                break
-            p_vals.append(self.probability_of_table(table))
-        return p_vals
-
-    def _get_left_tail_tables(self):
-        # Same enumeration as _get_left_tail_probs, but collects a copy of each
-        # table instead of its probability.  Its only visible call site is
-        # commented out in _sum_left_tail.
-        table = self._min_rotation()
-        row_totals = [sum(table[0]), sum(table[1])]
-        col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
-        left_tail_tables = []
-        while True:
-            table[0][0] -= 1
-            if table[0][0] < 0:
-                break
-            table[0][1] = row_totals[0] - table[0][0]
-            table[1][0] = col_totals[0] - table[0][0]
-            table[1][1] = row_totals[1] - table[1][0]
-            # Copy the rows: table is mutated again on the next iteration
-            left_tail_tables.append([list(table[0]), list(table[1])])
-        return left_tail_tables
-
-    def _get_right_tail_tables(self):
-        # Same enumeration as _get_right_tail_probs, but collects a copy of
-        # each table instead of its probability.  Its only visible call site is
-        # commented out in _sum_right_tail.
-        table = self._min_rotation()
-        row_totals = [sum(table[0]), sum(table[1])]
-        col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
-        right_tail_tables = []
-        while True:
-            table[0][0] += 1
-            table[0][1] = row_totals[0] - table[0][0]
-            if table[0][1] < 0:
-                break
-            table[1][0] = col_totals[0] - table[0][0]
-            if table[1][0] < 0:
-                break
-            table[1][1] = row_totals[1] - table[1][0]
-            if table[1][1] < 0:
-                break
-            # Copy the rows: table is mutated again on the next iteration
-            right_tail_tables.append([list(table[0]), list(table[1])])
-        return right_tail_tables
-
-    def left_tail_p(self):
-        """
-Returns the sum of probabilities of this table and all others more
-extreme.
-"""
-        # NOTE(review): the tail is enumerated from the min rotation, so "left"
-        # is relative to the smallest cell rather than to table[0][0]
-        return self.probability_of_table(self.table) + self._sum_left_tail()
-
-    def right_tail_p(self):
-        """
-Returns the sum of probabilities of this table and all others more
-extreme.
-"""
-        return self.probability_of_table(self.table) + self._sum_right_tail()
-
-    def two_tail_p(self):
-        """
-Returns the sum of probabilities of this table and all others more
-extreme.
-"""
-        p0 = self.probability_of_table(self.table)
-        all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
-        # Keep only the tables that are no more likely than the observed one
-        p_vals = []
-        for p in all_p_vals:
-            if p <= p0:
-                p_vals.append(p)
-        return sum(p_vals) + p0
-
-def assert_almost_equal(v1, v2, prec=8):
-    """
-Print whether v1 and v2 agree to within 10**(-prec).
-
-Despite the name nothing is asserted: an "OK" or "FAIL" line is printed
-and the function returns None either way.
-"""
-    tolerance = 10**(-prec)
-    within_tolerance = abs(v1-v2) <= tolerance
-    template = "OK: {} == {}" if within_tolerance else "FAIL: {} != {}"
-    print template.format(v1, v2)
-
-if __name__ == "__main__":
-    # Self-check against known p-values for one example table
-    table = [[12, 5], [29, 2]]
-    ft = FishersExactTest(table)
-    expected_p_values = (
-        (ft.left_tail_p, 0.044554737835078267),
-        (ft.right_tail_p, 0.99452520602190897),
-        (ft.two_tail_p, 0.08026855207410688),
-    )
-    for compute_p, expected in expected_p_values:
-        assert_almost_equal(compute_p(), expected)
\ No newline at end of file
--- a/feature_extractor.py
+++ b/feature_extractor.py
-"""
-Extracts features from training set and test set essays
-"""
-
-import numpy
-import re
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
-import pickle
-import os
-from itertools import chain
-import copy
-import operator
-import logging
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-from essay_set import EssaySet
-import util_functions
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-log = logging.getLogger(__name__)
-
-NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
-ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
-
-class FeatureExtractor(object):
-    def __init__(self):
-        """
-        Create an extractor whose dictionaries have not been initialized yet.
-        The good part of speech ngrams are loaded here through get_good_pos_ngrams.
-        """
-        self.dict_initialized = False
-        self._spell_errors_per_character=0
-        self._grammar_errors_per_character=0
-        self._good_pos_ngrams = self.get_good_pos_ngrams()
-
-    def initialize_dictionaries(self, e_set, max_feats2 = 200):
-        """
-        Fits the normal and stemmed vocabularies on a training essay set and
-        records per-set baselines (spelling, grammar and bag of words rates).
-        Must be called before features can be extracted.
-        e_set is an essay set of the "train" type
-        max_feats2 is passed through to util_functions.get_vocab
-        returns "ok" on success, raises util_functions.InputError for any other input
-        """
-        if not hasattr(e_set, '_type'):
-            raise util_functions.InputError(e_set, "wrong input. need an essay set object")
-        if e_set._type != "train":
-            raise util_functions.InputError(e_set, "needs to be an essay set of the train type.")
-
-        normal_vocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
-        stem_vocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
-        self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=normal_vocab)
-        self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=stem_vocab)
-        self.dict_initialized = True
-
-        spelling_total = sum(e_set._spelling_errors)
-        self._mean_spelling_errors = spelling_total/float(len(e_set._spelling_errors))
-        # Total characters across all essays, the denominator of the per-character rates
-        total_chars = float(sum([len(t) for t in e_set._text]))
-        self._spell_errors_per_character = spelling_total/total_chars
-        grammar_counts = self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)[0]
-        self._grammar_errors_per_character = (sum(grammar_counts)/total_chars)
-        bag_total = numpy.sum(self.gen_bag_feats(e_set)[:,:])
-        self._mean_f_prop = bag_total/total_chars
-        return "ok"
-
-    def get_good_pos_ngrams(self):
-        """
-        Gets a list of grammatically correct part of speech sequences.
-        Loads the pickled cache at NGRAM_PATH when it exists.  Otherwise the list is
-        regenerated from the essay corpus at ESSAY_CORPUS_PATH and written to the cache.
-        When neither file exists a small hard coded list is used.
-        Returns the list
-        """
-        if(os.path.isfile(NGRAM_PATH)):
-            #Close the file handle deterministically instead of leaking it
-            #NOTE(review): pickle.load must only ever be pointed at a trusted file
-            with open(NGRAM_PATH, 'rb') as ngram_file:
-                good_pos_ngrams = pickle.load(ngram_file)
-        elif os.path.isfile(ESSAY_CORPUS_PATH):
-            with open(ESSAY_CORPUS_PATH) as corpus_file:
-                essay_corpus = corpus_file.read()
-            essay_corpus = util_functions.sub_chars(essay_corpus)
-            good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
-            with open(NGRAM_PATH, 'wb') as ngram_file:
-                pickle.dump(good_pos_ngrams, ngram_file)
-        else:
-            #Hard coded list in case the needed files cannot be found
-            good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
-             '. DT NNP', '. DT NNP NNP', 'DT NNP', 'DT NNP NNP', 'DT NNP NNP NNP', 'NNP NNP',
-             'NNP NNP NNP', 'NNP NNP NNP NNP', 'NNP NNP NNP .', 'NNP NNP .', 'NNP NNP . TO',
-             'NNP .', 'NNP . TO', 'NNP . TO NNP', '. TO', '. TO NNP', '. TO NNP NNP',
-             'TO NNP', 'TO NNP NNP']
-
-        return good_pos_ngrams
-
-    def _get_grammar_errors(self,pos,text,tokens):
-        """
-        Internal function to get the number of grammar errors in given text
-        pos - one list of (token, tag) pairs per essay
-        text - list of essay texts; only its length is used here
-        tokens - one token list per essay
-        Returns (good_pos_tags, bad_pos_positions), both with one entry per essay:
-        a count derived from the pos ngrams missing from the good list, and a
-        list of merged [start, end] ranges of unrecognized ngrams
-        """
-        #NOTE(review): word_counts is computed but never read in this function
-        word_counts = [max(len(t),1) for t in tokens]
-        good_pos_tags = []
-        min_pos_seq=2
-        max_pos_seq=4
-        bad_pos_positions=[]
-        for i in xrange(0, len(text)):
-            pos_seq = [tag[1] for tag in pos[i]]
-            pos_ngrams = util_functions.ngrams(pos_seq, min_pos_seq, max_pos_seq)
-            #Keep only the ngrams that contain max_pos_seq-1 spaces
-            long_pos_ngrams=[z for z in pos_ngrams if z.count(' ')==(max_pos_seq-1)]
-            #[start, end] index pairs for every long ngram missing from the good list
-            bad_pos_tuples=[[z,z+max_pos_seq] for z in xrange(0,len(long_pos_ngrams)) if long_pos_ngrams[z] not in self._good_pos_ngrams]
-            bad_pos_tuples.sort(key=operator.itemgetter(1))
-            to_delete=[]
-            #Walk backwards, extending a range over later ranges that start inside it;
-            #the absorbed ranges are dropped below
-            for m in reversed(xrange(len(bad_pos_tuples)-1)):
-                start, end = bad_pos_tuples[m]
-                for j in xrange(m+1, len(bad_pos_tuples)):
-                    lstart, lend = bad_pos_tuples[j]
-                    if lstart >= start and lstart <= end:
-                        bad_pos_tuples[m][1]=bad_pos_tuples[j][1]
-                        to_delete.append(j)
-
-            fixed_bad_pos_tuples=[bad_pos_tuples[z] for z in xrange(0,len(bad_pos_tuples)) if z not in to_delete]
-            bad_pos_positions.append(fixed_bad_pos_tuples)
-            overlap_ngrams = [z for z in pos_ngrams if z in self._good_pos_ngrams]
-            if (len(pos_ngrams)-len(overlap_ngrams))>0:
-                #NOTE(review): integer division under Python 2; this looks like it is 0
-                #when there are fewer ngrams than tags, which would raise
-                #ZeroDivisionError below - confirm against util_functions.ngrams
-                divisor=len(pos_ngrams)/len(pos_seq)
-            else:
-                divisor=1
-            good_pos_tags.append((len(pos_ngrams)-len(overlap_ngrams))/divisor)
-        return good_pos_tags,bad_pos_positions
-
-    def gen_length_feats(self, e_set):
-        """
-        Builds the length based features for every essay in an essay set
-        Normally reached through gen_feats
-        Returns an array with one row per essay and the columns: characters, words,
-        commas, apostrophes, sentence punctuation, characters per word,
-        grammar error count and grammar error count per word
-        """
-        essays = e_set._text
-        essay_indices = range(len(essays))
-        char_counts = [len(essay) for essay in essays]
-        #An essay with no tokens counts as one word so the ratios stay defined
-        word_counts = [max(len(toks),1) for toks in e_set._tokens]
-        comma_count = [essay.count(",") for essay in essays]
-        ap_count = [essay.count("'") for essay in essays]
-        punc_count = [essay.count(".") + essay.count("?") + essay.count("!") for essay in essays]
-        chars_per_word = [char_counts[m] / float(word_counts[m]) for m in essay_indices]
-
-        grammar_counts = self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)[0]
-        grammar_per_word = [grammar_counts[m] / float(word_counts[m]) for m in essay_indices]
-
-        columns = (char_counts, word_counts, comma_count, ap_count, punc_count,
-                   chars_per_word, grammar_counts, grammar_per_word)
-        return numpy.array(columns).transpose().copy()
-
-    def gen_bag_feats(self, e_set):
-        """
-        Builds the bag of words features for an essay set with the fitted dictionaries
-        Normally reached through gen_feats
-        Returns an array with the stemmed text counts followed by the normal text counts
-        Raises util_functions.InputError if the dictionaries have not been initialized
-        """
-        if not hasattr(self, '_stem_dict'):
-            raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")
-
-        stem_counts = self._stem_dict.transform(e_set._clean_stem_text)
-        normal_counts = self._normal_dict.transform(e_set._text)
-        combined = numpy.concatenate((stem_counts.toarray(), normal_counts.toarray()), axis=1)
-        return combined.copy()
-
-    def gen_feats(self, e_set):
-        """
-        Builds the full feature matrix for an essay set object
-        Columns are the length features, then the prompt features, then the bag of words features
-        returns an array of features
-        """
-        bag_feats = self.gen_bag_feats(e_set)
-        length_feats = self.gen_length_feats(e_set)
-        prompt_feats = self.gen_prompt_feats(e_set)
-        feature_groups = (length_feats, prompt_feats, bag_feats)
-        return numpy.concatenate(feature_groups, axis=1).copy()
-
-    def gen_prompt_feats(self, e_set):
-        """
-        Generates prompt based features from an essay set object and its prompt.
-        Generally called internally by gen_feats
-        Returns an array with one row per essay and four columns: number of tokens
-        found in the prompt, that number per token, number of tokens found among the
-        wordnet synonyms of the prompt tokens, and that number per token
-        """
-        prompt_toks = nltk.word_tokenize(e_set._prompt)
-        #Sets give constant time membership tests; the counts are unchanged
-        prompt_vocab = set(prompt_toks)
-        expanded_vocab = set(chain.from_iterable(
-            util_functions.get_wordnet_syns(word) for word in prompt_toks))
-
-        prompt_overlap = []
-        prompt_overlap_prop = []
-        expand_overlap = []
-        expand_overlap_prop = []
-        for essay_toks in e_set._tokens:
-            #An essay with no tokens counts as length 1 to avoid dividing by zero
-            tok_length = float(max(len(essay_toks), 1))
-            prompt_hits = len([tok for tok in essay_toks if tok in prompt_vocab])
-            expand_hits = len([tok for tok in essay_toks if tok in expanded_vocab])
-            prompt_overlap.append(prompt_hits)
-            prompt_overlap_prop.append(prompt_hits / tok_length)
-            expand_overlap.append(expand_hits)
-            expand_overlap_prop.append(expand_hits / tok_length)
-
-        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()
-
-        return prompt_arr.copy()
-
-    def gen_feedback(self, e_set, features=None):
-        """
-        Generate feedback for a given set of essays
-        e_set - EssaySet object
-        features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
-        in order to get off topic feedback.
-        Returns a list of dicts (one dict per essay in e_set).  Each dict always has the keys
-        grammar, spelling, markup_text, grammar_per_char, spelling_per_char and too_similar_to_prompt;
-        topicality and prompt_overlap are only added when the corresponding problem is detected.
-        Reads the thresholds _grammar_errors_per_character, _spell_errors_per_character and _mean_f_prop
-        from self (they are not set in this method).
-        """
-
-        #Set ratio to modify thresholds for grammar/spelling errors
-        modifier_ratio=1.05
-
-        #Calc number of grammar and spelling errors per character
-        #The +.1 keeps the denominator non-zero for empty essays
-        set_grammar,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
-        set_grammar_per_character=[set_grammar[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
-        set_spell_errors_per_character=[e_set._spelling_errors[m]/float(len(e_set._text[m])+.1) for m in xrange(0,len(e_set._text))]
-
-        #Iterate through essays and create a feedback dict for each
-        all_feedback=[]
-        for m in xrange(0,len(e_set._text)):
-            #Be very careful about changing these messages!
-            individual_feedback={'grammar' : "Grammar: Ok.",
-                                 'spelling' : "Spelling: Ok.",
-                                 'markup_text' : "",
-                                 'grammar_per_char' : set_grammar_per_character[m],
-                                 'spelling_per_char' : set_spell_errors_per_character[m],
-                                 'too_similar_to_prompt' : False,
-                                 }
-            markup_tokens=e_set._markup_text[m].split(" ")
-
-            #This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
-            #disjointed
-            #Each entry of bad_pos_positions[m] is indexed as (start, end); end-1 is used as the closing token
-            #(presumably end is exclusive - TODO confirm against _get_grammar_errors)
-            bad_pos_starts=[z[0] for z in bad_pos_positions[m]]
-            bad_pos_ends=[z[1]-1 for z in bad_pos_positions[m]]
-            for z in xrange(0,len(markup_tokens)):
-                #Because of the elif, a token that is both a start and an end only receives the opening tag
-                if z in bad_pos_starts:
-                    markup_tokens[z]='<bg>' + markup_tokens[z]
-                elif z in bad_pos_ends:
-                    markup_tokens[z]=markup_tokens[z] + "</bg>"
-            #If the largest end index lies past the last markup token while the largest start index lies
-            #before it, append the closing tag to the last token
-            if(len(bad_pos_ends)>0 and len(bad_pos_starts)>0 and len(markup_tokens)>1):
-                if max(bad_pos_ends)>(len(markup_tokens)-1) and max(bad_pos_starts)<(len(markup_tokens)-1):
-                    markup_tokens[len(markup_tokens)-1]+="</bg>"
-
-            #Display messages if grammar/spelling errors greater than average in training set
-            if set_grammar_per_character[m]>(self._grammar_errors_per_character*modifier_ratio):
-                individual_feedback['grammar']="Grammar: More grammar errors than average."
-            if set_spell_errors_per_character[m]>(self._spell_errors_per_character*modifier_ratio):
-                individual_feedback['spelling']="Spelling: More spelling errors than average."
-
-            #Test topicality by calculating # of on topic words per character and comparing to the training set
-            #mean.  Requires features to be passed in
-            #Column indices below assume features came from gen_feats: 8 length columns, then 4 prompt columns,
-            #then the bag of words columns - so 12: is the bag of words block and 9 is the prompt overlap proportion
-            if features is not None:
-                f_row_sum=numpy.sum(features[m,12:])
-                #NOTE(review): unlike the per-character ratios above, this divides by the raw text length,
-                #which is zero for an empty essay - confirm how that case should be handled
-                f_row_prop=f_row_sum/len(e_set._text[m])
-                if f_row_prop<(self._mean_f_prop/1.5) or len(e_set._text[m])<20:
-                    individual_feedback['topicality']="Topicality: Essay may be off topic."
-
-                if(features[m,9]>.6):
-                    individual_feedback['prompt_overlap']="Prompt Overlap: Too much overlap with prompt."
-                    individual_feedback['too_similar_to_prompt']=True
-                    log.debug(features[m,9])
-
-            #Create string representation of markup text
-            markup_string=" ".join(markup_tokens)
-            individual_feedback['markup_text']=markup_string
-            all_feedback.append(individual_feedback)
-
-        return all_feedback
--- a/grade.py
+++ b/grade.py
-#Grader called by pyxserver_wsgi.py
-#Loads a grader file, which is a dict containing the prompt of the question,
-#a feature extractor object, and a trained model.
-#Extracts features and runs trained model on the submission to produce a final score.
-#Correctness determined by ratio of score to max possible score.
-#Requires aspell to be installed and added to the path.
-
-import sys
-import pickle
-import os
-import numpy
-import logging
-from statsd import statsd
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-from essay_set import EssaySet
-import predictor_extractor
-import predictor_set
-import util_functions
-
-#Imports needed to unpickle grader data
-import feature_extractor
-import sklearn.ensemble
-import math
-
-log = logging.getLogger(__name__)
-
-@statsd.timed('open_ended_assessment.machine_learning.grader.time')
-def grade(grader_data,grader_config,submission):
-    """
-    Scores a single essay submission with a trained grader.
-    grader_data - dict with the keys 'prompt', 'extractor', 'model' and 'score' (the training scores)
-    grader_config - not used by this function
-    submission - the essay text to score
-    Returns a results dict with the keys errors, tests, score, feedback, success and confidence.
-    'correct' is only added (as False) when the essay is too similar to the prompt.
-    Errors are reported through results['errors'] and results['success'] rather than raised.
-    """
-
-    results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
-
-    has_error=False
-
-    #Pre-bind everything that is assigned inside a try block so later code never hits an unbound name
-    grader_feats=None
-    feedback=None
-    max_score=None
-
-    grader_set=EssaySet(type="test")
-
-    #Try to add essays to essay set object
-    try:
-        grader_set.add_essay(str(submission),0)
-        grader_set.update_prompt(str(grader_data['prompt']))
-    except Exception:
-        log.exception("Essay could not be added to essay set.")
-        results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
-        has_error=True
-
-    #Try to extract features from submission and assign score via the model
-    try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
-        results['score']=int(grader_data['model'].predict(grader_feats)[0])
-    except Exception:
-        log.exception("Could not extract features and score essay.")
-        results['errors'].append("Could not extract features and score essay.")
-        has_error=True
-
-    #Try to determine confidence level
-    try:
-        training_scores=numpy.asarray(grader_data['score'])
-        min_score=min(training_scores)
-        max_score=max(training_scores)
-        #The probability column is the predicted score offset by the lowest training score
-        raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
-        #TODO: Normalize confidence somehow here
-        results['confidence']=raw_confidence
-    except Exception:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
-        log.exception("Problem generating confidence value")
-
-    if not has_error:
-
-        if(feedback['too_similar_to_prompt']):
-            results['score']=0
-            results['correct']=False
-
-        results['success']=True
-
-        #Add feedback to results
-        results['feedback'] = {}
-        if 'topicality' in feedback and 'prompt_overlap' in feedback:
-            results['feedback'].update({
-                'topicality' : feedback['topicality'],
-                'prompt-overlap' : feedback['prompt_overlap'],
-            })
-
-        #Spelling/grammar feedback is only shown for low scoring essays.  When the maximum score is
-        #unknown or zero the ratio is undefined, so that feedback is left out instead of crashing.
-        if max_score is not None and max_score != 0 and results['score']/float(max_score)<.33:
-            results['feedback'].update(
-                {'spelling' : feedback['spelling'],
-            'grammar' : feedback['grammar'],
-            'markup-text' : feedback['markup_text'],
-            })
-
-    else:
-        #If error, success is False.
-        results['success']=False
-
-    #Count number of successful/unsuccessful gradings
-    statsd.increment("open_ended_assessment.machine_learning.grader_count",
-        tags=["success:{0}".format(results['success'])])
-
-    return results
-
-def grade_generic(grader_data, grader_config, numeric_features, textual_features):
-    """
-    Scores one row of numeric and textual predictors with a trained generic grader.
-    grader_data - dict with the keys 'extractor', 'model', 'score' (training scores) and 'algorithm'
-    grader_config - not used by this function
-    numeric_features, textual_features - lists passed on to PredictorSet.add_row
-    Returns a results dict with the keys errors, tests, score, success and confidence.
-    Errors are reported through results['errors'] and results['success'] rather than raised.
-    """
-    results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
-
-    has_error=False
-
-    #Pre-bind so the confidence step fails inside its own try block if feature extraction failed
-    grader_feats=None
-
-    grader_set=predictor_set.PredictorSet(type="test")
-
-    #Try to add the row to the predictor set object
-    try:
-        grader_set.add_row(numeric_features, textual_features,0)
-    except Exception:
-        log.exception("Row could not be added to predictor set.")
-        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
-        has_error=True
-
-    #Try to extract features from submission and assign score via the model
-    try:
-        grader_feats=grader_data['extractor'].gen_feats(grader_set)
-        results['score']=grader_data['model'].predict(grader_feats)[0]
-    except Exception:
-        log.exception("Could not extract features and score essay.")
-        results['errors'].append("Could not extract features and score essay.")
-        has_error=True
-
-    #Try to determine confidence level
-    try:
-        if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
-            min_score=min(numpy.asarray(grader_data['score']))
-            #The probability column is the predicted score offset by the lowest training score;
-            #int() guards against a non-integer prediction being used as an index
-            score_column=int(results['score']-min_score)
-            raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,score_column]
-            #TODO: Normalize confidence somehow here
-            results['confidence']=raw_confidence
-        else:
-            #For regression, confidence is the larger distance from the raw prediction to a neighbouring integer
-            raw_confidence = grader_data['model'].predict(grader_feats)[0]
-            confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
-            results['confidence'] = confidence
-    except Exception:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
-        log.exception("Problem generating confidence value")
-
-    if not has_error:
-        results['success'] = True
-
-    #Count number of successful/unsuccessful gradings.  This must run after success is final,
-    #otherwise every grading is reported as a failure.
-    statsd.increment("open_ended_assessment.machine_learning.grader_count",
-        tags=["success:{0}".format(results['success'])])
-
-    return results
--- a/install/apt-packages.txt
+++ b/install/apt-packages.txt
-python-pip
-python-scipy
-python-mysqldb
-ipython
-nginx
-git
-redis-server
-libmysqlclient-dev
-gfortran
-libblas3gf
-libblas-dev
-liblapack3gf
-liblapack-dev
-libatlas-base-dev
-libxml2-dev
-libxslt1-dev
-libreadline6
-libreadline6-dev
-build-essential
-curl
-aspell
-python
\ No newline at end of file
--- a/install/install_system_req.sh
+++ b/install/install_system_req.sh
-#!/usr/bin/env bash
-
-# posix compliant sanity check: refuse to run unless the interpreter is bash.
-# $BASH is quoted so the tests stay well-formed when it is unset or empty.
-if [ -z "$BASH" ] || [ "$BASH" = "/bin/sh" ]; then
-    echo "Please use the bash interpreter to run this script"
-    exit 1
-fi
-
-# Print the arguments in red, then reset the terminal colour.
-error() {
-    printf '\E[31m'
-    echo "$@"
-    printf '\E[0m'
-}
-# Print the arguments in cyan, then reset the terminal colour.
-output() {
-    printf '\E[36m'
-    echo "$@"
-    printf '\E[0m'
-}
-
-
-### START
-
-# Directory containing this script; the package lists live next to it.
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-BREW_FILE="$DIR/brew-formulas.txt"
-APT_PKGS_FILE="$DIR/apt-packages.txt"
-
-case "$(uname -s)" in
-    [Ll]inux)
-        command -v lsb_release &>/dev/null || {
-            error "Please install lsb-release."
-            exit 1
-        }
-
-        distro="$(lsb_release -cs)"
-        case "$distro" in
-            maya|lisa|natty|oneiric|precise|quantal)
-                output "Installing Ubuntu requirements"
-
-                if [[ ! -r "$APT_PKGS_FILE" ]]; then
-                    error "$APT_PKGS_FILE does not exist, needed to install apt packages"
-                    exit 1
-                fi
-
-                # DEBIAN_FRONTEND=noninteractive is required for silent mysql-server installation
-                export DEBIAN_FRONTEND=noninteractive
-
-                # install packages listed in APT_PKGS_FILE
-                xargs sudo apt-get -y install < "$APT_PKGS_FILE"
-                ;;
-            *)
-                error "Unsupported distribution - $distro"
-                exit 1
-               ;;
-        esac
-        ;;
-    Darwin)
-
-        if [[ ! -w /usr/local ]]; then
-            cat<<EO
-
-        You need to be able to write to /usr/local for
-        the installation of brew and brew packages.
-
-        Either make sure the group you are in (most likely 'staff')
-        can write to that directory or simply execute the following
-        and re-run the script:
-
-        $ sudo chown -R $USER /usr/local
-EO
-
-            exit 1
-
-        fi
-
-        output "Installing OSX requirements"
-        if [[ ! -r "$BREW_FILE" ]]; then
-            error "$BREW_FILE does not exist, needed to install brew"
-            exit 1
-        fi
-
-        # brew errors if the package is already installed, so only install missing formulas.
-        # -x -F matches the whole line literally; a plain substring match would treat
-        # e.g. "git" as installed whenever "git-flow" is.
-        for pkg in $(cat "$BREW_FILE"); do
-            brew list 2>/dev/null | grep -qxF -- "$pkg" || {
-                output "Installing $pkg"
-                brew install "$pkg"
-            }
-        done
-
-        # paths where brew likes to install python scripts
-        PATH=/usr/local/share/python:/usr/local/bin:$PATH
-
-        command -v pip &>/dev/null || {
-            output "Installing pip"
-            easy_install pip
-        }
-
-        # Install unless the reported virtualenv version starts with "1.7"
-        # (the dot is escaped so it only matches a literal ".")
-        if ! grep -Eq '^1\.7' <(virtualenv --version 2>/dev/null); then
-            output "Installing virtualenv >1.7"
-            pip install 'virtualenv>1.7' virtualenvwrapper
-        fi
-
-        command -v coffee &>/dev/null || {
-            output "Installing coffee script"
-            # NOTE(security): --insecure disables TLS certificate verification and the
-            # downloaded script is piped straight into sh - review before relying on this
-            curl --insecure https://npmjs.org/install.sh | sh
-            npm install -g coffee-script
-        }
-        ;;
-    *)
-        error "Unsupported platform"
-        exit 1
-        ;;
-esac
--- a/install/pre-requirements.txt
+++ b/install/pre-requirements.txt
-numpy==1.6.2
--- a/install/requirements.txt
+++ b/install/requirements.txt
-boto==2.6.0
-coverage==3.5.3
-dogstatsd-python==0.2
-lxml==3.0.1
-mock==0.8.0
-nltk==2.0.3
-nose==1.2.1
-scipy==0.11.0
-path.py
-pip
-pygraphviz==1.1
-pylint==0.26.0
-pytz==2012h
-scikit-learn==0.12.1
-
--- a/model_creator.py
+++ b/model_creator.py
-#Provides interface functions to create and save models
-
-import numpy
-import re
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
-import pickle
-import os
-import sklearn.ensemble
-from itertools import chain
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-from essay_set import EssaySet
-import util_functions
-import feature_extractor
-import logging
-import predictor_extractor
-
-log=logging.getLogger()
-
-def read_in_test_data(filename):
-    """
-    Reads in test data file found at filename.
-    filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
-    The first line is treated as a header and skipped.
-    returns the score and the text (two lists, one entry per data row)
-    Raises ValueError if a row does not have exactly five fields or a numeric column is not an integer
-    """
-    score, text = [], []
-    #The with block closes the file even if reading fails
-    with open(filename) as data_file:
-        raw_lines = data_file.read().splitlines()
-
-    for line in raw_lines[1:]:
-        id1, set1, score1, score12, text1 = line.strip().split("\t")
-        #id, set and second score are not returned; they are still parsed so malformed rows are rejected
-        int(id1)
-        int(set1)
-        int(score12)
-        score.append(int(score1))
-        text.append(text1)
-
-    return score, text
-
-
-def read_in_test_prompt(filename):
-    """
-    Reads in the prompt from a text file
-    Returns the whole file content as a string
-    """
-    #The with block closes the file once it has been read
-    with open(filename) as prompt_file:
-        return prompt_file.read()
-
-def read_in_test_data_twocolumn(filename,sep=","):
-    """
-    Reads in a two column version of the test data.
-    Filename must point to a delimited file.
-    In filename, the first column should be integer score data.
-    The second column should be string text data.
-    The first line is treated as a header and skipped.
-    NOTE: sep is accepted but not applied - rows are always split on tab characters.  Honouring
-    the "," default would break existing callers that read tab separated files, so this is
-    documented here rather than changed.
-    Returns the score and the text (two lists, one entry per data row)
-    """
-    score, text = [], []
-    #The with block closes the file even if reading fails
-    with open(filename) as data_file:
-        raw_lines = data_file.read().splitlines()
-
-    for line in raw_lines[1:]:
-        score1, text1 = line.strip().split("\t")
-        text.append(text1)
-        score.append(int(score1))
-
-    return score, text
-
-
-def create_essay_set(text, score, prompt_string, generate_additional=True):
-    """
-    Creates an essay set from given data.
-    Text should be a list of strings corresponding to essay text.
-    Score should be a list of scores where score[n] corresponds to text[n]
-    Prompt string is just a string containing the essay prompt.
-    Generate_additional indicates whether to generate additional essays at the minimum score point or not.
-    Returns the populated EssaySet.
-    """
-    x = EssaySet()
-    #Computed once up front; it used to be recomputed for every essay
-    min_score = min(score) if score else None
-    for i in xrange(0, len(text)):
-        x.add_essay(text[i], score[i])
-        #generate_additional is compared with == True on purpose so only values equal to True enable it
-        if score[i] == min_score and generate_additional == True:
-            #Generate additional essays from the cleaned text of the essay that was just added
-            x.generate_additional_essays(x._clean_text[-1], score[i])
-
-    x.update_prompt(prompt_string)
-
-    return x
-
-def get_cv_error(clf,feats,scores):
-    """
-    Estimates cross validation error for a classifier.
-    clf - the model passed on to util_functions.gen_cv_preds
-    feats - feature matrix
-    scores - the true scores matching the rows of feats
-    Returns a dict with 'success', 'kappa' (quadratic weighted kappa) and 'mae' (mean absolute error).
-    Failures are logged and reported through success=False rather than raised.
-    """
-    results={'success' : False, 'kappa' : 0, 'mae' : 0}
-    try:
-        cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
-        err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
-        kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-        results['mae']=err
-        results['kappa']=kappa
-        results['success']=True
-    except ValueError:
-        #If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
-        log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
-    except Exception:
-        #Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
-        log.exception("Error getting cv error estimates.")
-
-    return results
-
-def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
-    """
-    Feed in a predictor set to get a feature extractor and a trained model
-    predictor_set - a train type PredictorSet object
-    type - util_functions.AlgorithmTypes.regression or .classification; any other value falls back to regression
-    Returns a trained PredictorExtractor, the fitted model and the cross validation results from get_cv_error
-    """
-    #The original check read an undefined name (algorithm) and raised on every call
-    if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
-        type = util_functions.AlgorithmTypes.regression
-
-    f = predictor_extractor.PredictorExtractor()
-    f.initialize_dictionaries(predictor_set)
-
-    train_feats = f.gen_feats(predictor_set)
-
-    if type == util_functions.AlgorithmTypes.classification:
-        model_class = sklearn.ensemble.GradientBoostingClassifier
-    else:
-        model_class = sklearn.ensemble.GradientBoostingRegressor
-
-    #learn_rate is the keyword used by the scikit-learn release pinned in install/requirements.txt
-    model_params = dict(n_estimators=100, learn_rate=.05,
-        max_depth=4, random_state=1,min_samples_leaf=3)
-    #clf is the model that gets returned, clf2 is only used for the cross validation estimate
-    clf = model_class(**model_params)
-    clf2 = model_class(**model_params)
-
-    cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
-
-    #Converted outside the try so the fallback below can never see an unbound set_score
-    set_score = numpy.asarray(predictor_set._target, dtype=int)
-    try:
-        clf.fit(train_feats, set_score)
-    except ValueError:
-        log.exception("Not enough classes (0,1,etc) in sample.")
-        #Force two distinct classes into the targets and retry
-        set_score[0]=1
-        set_score[1]=0
-        clf.fit(train_feats, set_score)
-
-    return f, clf, cv_error_results
-
-
-def extract_features_and_generate_model(essays,additional_array=None):
-    """
-    Feed in an essay set to get feature vector and classifier
-    essays must be an essay set object
-    additional array is an optional argument that can specify
-    a numpy array of values to add in; it is only used when it is a numpy array
-    with the same number of rows as the extracted features
-    returns a trained FeatureExtractor object, a trained classifier and the
-    cross validation results from get_cv_error
-    """
-    f = feature_extractor.FeatureExtractor()
-    f.initialize_dictionaries(essays)
-
-    train_feats = f.gen_feats(essays)
-    #Identity/isinstance checks: comparing an array to None with != is evaluated elementwise
-    if additional_array is not None and isinstance(additional_array, numpy.ndarray):
-        if(additional_array.shape[0]==train_feats.shape[0]):
-            train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
-
-    #learn_rate is the keyword used by the scikit-learn release pinned in install/requirements.txt
-    model_params = dict(n_estimators=100, learn_rate=.05,
-        max_depth=4, random_state=1,min_samples_leaf=3)
-    #clf is the model that gets returned, clf2 is only used for the cross validation estimate
-    clf = sklearn.ensemble.GradientBoostingClassifier(**model_params)
-    clf2 = sklearn.ensemble.GradientBoostingClassifier(**model_params)
-
-    cv_error_results=get_cv_error(clf2,train_feats,essays._score)
-
-    #Converted outside the try so the fallback below can never see an unbound set_score
-    set_score = numpy.asarray(essays._score, dtype=int)
-    try:
-        clf.fit(train_feats, set_score)
-    except ValueError:
-        log.exception("Not enough classes (0,1,etc) in sample.")
-        #Force two distinct classes into the targets and retry
-        set_score[0]=1
-        set_score[1]=0
-        clf.fit(train_feats, set_score)
-
-    return f, clf, cv_error_results
-
-def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
-    """
-    Writes out a model to a file.
-    prompt string is a string containing the prompt
-    feature_ext is a trained FeatureExtractor object
-    classifier is a trained classifier
-    text and score are stored in the model file alongside the model
-    model_path is the path of write out the model file to
-    """
-    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
-    #Pickle data is binary, so open in "wb"; the with block closes the file when the dump finishes
-    with open(model_path, "wb") as model_handle:
-        pickle.dump(model_file, model_handle)
-
-def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
-    """
-    Function that creates essay set, extracts features, and writes out model
-    See above functions for argument descriptions
-    """
-    #text and score are separate arguments (the original passed an undefined name text_score)
-    essay_set=create_essay_set(text,score,prompt)
-    #extract_features_and_generate_model returns three values; the cv results are not stored
-    feature_ext,clf,cv_error_results=extract_features_and_generate_model(essay_set,additional_array)
-    dump_model_to_file(prompt,feature_ext,clf,text,score,model_path)
-
-
--- a/predictor_extractor.py
+++ b/predictor_extractor.py
-import numpy
-import re
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
-import pickle
-import os
-from itertools import chain
-import copy
-import operator
-import logging
-import math
-from feature_extractor import FeatureExtractor
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-from essay_set import EssaySet
-import util_functions
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-log = logging.getLogger(__name__)
-
-class PredictorExtractor(object):
-    """
-    Feature extractor for predictor sets, which combine numeric features with
-    one essay set per textual feature.  Holds one FeatureExtractor per essay set.
-    """
-    def __init__(self):
-        #One FeatureExtractor per essay set, filled in by initialize_dictionaries
-        self._extractors = []
-        self._initialized = False
-
-    def initialize_dictionaries(self, p_set):
-        """
-        Creates and initializes one FeatureExtractor for each essay set in p_set.
-        p_set - predictor set object; must have _type == "train"
-        Returns True if at least one extractor was initialized, False otherwise
-        Raises util_functions.InputError if p_set is not a train type set
-        """
-        if not hasattr(p_set, '_type') or p_set._type != "train":
-            error_message = "needs to be an essay set of the train type."
-            #log.error, because there is no active exception to attach a traceback to
-            log.error(error_message)
-            raise util_functions.InputError(p_set, error_message)
-
-        essay_sets = p_set._essay_sets
-        #200 is divided evenly between the essay sets and forwarded as max_feats2
-        #(presumably a cap on the features per extractor - confirm in FeatureExtractor)
-        max_feats2 = int(math.floor(200 // max(len(essay_sets), 1)))
-
-        #Build into a fresh list so a repeated call does not leave stale extractors at the front
-        extractors = []
-        for essay_set in essay_sets:
-            extractor = FeatureExtractor()
-            extractor.initialize_dictionaries(essay_set, max_feats2=max_feats2)
-            extractors.append(extractor)
-
-        self._extractors = extractors
-        self._initialized = len(extractors) > 0
-        return self._initialized
-
-    def gen_feats(self, p_set):
-        """
-        Generates the full feature matrix for a predictor set.
-        p_set - predictor set object providing _essay_sets and _numeric_features
-        Returns a copy of the textual feature columns followed by the numeric feature columns
-        Raises util_functions.InputError if initialize_dictionaries has not succeeded yet
-        """
-        if self._initialized!=True:
-            error_message = "Dictionaries have not been initialized."
-            log.error(error_message)
-            raise util_functions.InputError(p_set, error_message)
-
-        #Essay set i is always handled by the extractor that was trained on essay set i
-        textual_features = []
-        for i, essay_set in enumerate(p_set._essay_sets):
-            textual_features.append(self._extractors[i].gen_feats(essay_set))
-
-        textual_matrix = numpy.concatenate(textual_features, axis=1)
-        predictor_matrix = numpy.array(p_set._numeric_features)
-
-        #Shapes go to the debug log instead of being printed to stdout
-        log.debug("Textual feature matrix shape: %s", textual_matrix.shape)
-        log.debug("Numeric feature matrix shape: %s", predictor_matrix.shape)
-
-        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
-
-        return overall_matrix.copy()
--- a/predictor_set.py
+++ b/predictor_set.py
-import numpy
-import nltk
-import sys
-import random
-import os
-import logging
-import essay_set
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-import util_functions
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-log=logging.getLogger(__name__)
-
-class PredictorSet(object):
-    """
-    Container for rows made of numeric features, textual features and a target value.
-    Each textual feature position gets its own EssaySet, to which the text of every row is added.
-    """
-    def __init__(self, type = "train"):
-        """
-        Initialize variables and check essay set type
-        type - "train" or "test"; any other value falls back to "train"
-        """
-        if(type != "train" and type != "test"):
-            type = "train"
-
-        self._type = type
-        self._target=[]
-        self._textual_features=[]
-        self._numeric_features=[]
-        self._essay_sets=[]
-
-    def add_row(self, numeric_features, textual_features, target):
-        """
-        Validates one row and adds it to the set.
-        numeric_features - list of values convertible to float
-        textual_features - list of strings
-        target - numeric target value for the row
-        The converted values are stored in new lists; the caller's lists are left untouched.
-        Raises util_functions.InputError if an argument has the wrong type, a list length differs
-        from the previously added row, or a single feature cannot be converted
-        """
-        #Basic input checking
-        #log.error is used where no exception is active; log.exception only inside except blocks
-        if not isinstance(target, (int, long, float)):
-            error_message = "Target is not a numeric value."
-            log.error(error_message)
-            raise util_functions.InputError(target, error_message)
-
-        if not isinstance(numeric_features, list):
-            error_message = "Numeric features are not a list."
-            log.error(error_message)
-            raise util_functions.InputError(numeric_features, error_message)
-
-        if not isinstance(textual_features, list):
-            error_message = "Textual features are not a list."
-            log.error(error_message)
-            raise util_functions.InputError(textual_features, error_message)
-
-        #Do some length checking for parameters: each row must match the length of the previous row
-        if len(self._numeric_features)>0:
-            if len(self._numeric_features[-1]) != len(numeric_features):
-                error_message = "Numeric features are an improper length."
-                log.error(error_message)
-                raise util_functions.InputError(numeric_features, error_message)
-
-        if len(self._textual_features)>0:
-            if len(self._textual_features[-1]) != len(textual_features):
-                error_message = "Textual features are an improper length."
-                log.error(error_message)
-                raise util_functions.InputError(textual_features, error_message)
-
-        #Now check to see if text features and numeric features are individually correct
-        converted_numeric = []
-        for value in numeric_features:
-            try:
-                converted_numeric.append(float(value))
-            except Exception:
-                error_message = "Numeric feature {0} not numeric.".format(value)
-                log.exception(error_message)
-                raise util_functions.InputError(numeric_features, error_message)
-
-        converted_textual = []
-        for value in textual_features:
-            try:
-                #Non-ascii characters are dropped from the text
-                converted_textual.append(str(value.encode('ascii', 'ignore')))
-            except Exception:
-                error_message = "Textual feature {0} not string.".format(value)
-                log.exception(error_message)
-                raise util_functions.InputError(textual_features, error_message)
-
-        #Create essay sets for textual features if needed (only when the first row is added)
-        if len(self._textual_features)==0:
-            for _ in converted_textual:
-                self._essay_sets.append(essay_set.EssaySet(type=self._type))
-
-        #Add numeric and textual features
-        self._numeric_features.append(converted_numeric)
-        self._textual_features.append(converted_textual)
-
-        #Add targets
-        self._target.append(target)
-
-        #Add textual features to essay sets
-        for i, text_value in enumerate(converted_textual):
-            self._essay_sets[i].add_essay(text_value, target)
-
--- a/tests/__init__.py
+++ b/tests/__init__.py
-__author__ = 'vik'
--- a/tests/bad_grammar_and_spelling/answer.txt
+++ b/tests/bad_grammar_and_spelling/answer.txt
-This experement didn't have a controle and the grupe didn't do multiple triles. You would also may need to know what tempriture the rome is.
-
--- a/tests/bad_grammar_and_spelling/payload.json
+++ b/tests/bad_grammar_and_spelling/payload.json
-{"grader":"tests/models/essay_set_1.p"}
--- a/tests/bad_grammar_and_spelling/wrong.txt
+++ b/tests/bad_grammar_and_spelling/wrong.txt
-In order for I for  replicate this expirement I woukd need to know what are the reaserching with this expirement what kind of result are  being booked at and the mass of each sample at the end of expirment  theie results.
-
--- a/tests/essaycorpus.txt
+++ b/tests/essaycorpus.txt
--- a/tests/essays_with_symbols/answer.txt
+++ b/tests/essays_with_symbols/answer.txt
-<b><fg>In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard&&&&&and many other sample materials that they&;;;& didn't use and would get different results. Also I would also<>>> need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell//////where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.<b><b>
--- a/tests/essays_with_symbols/payload.json
+++ b/tests/essays_with_symbols/payload.json
-{"grader":"tests/models/essay_set_1.p"}
--- a/tests/essays_with_symbols/wrong.txt
+++ b/tests/essays_with_symbols/wrong.txt
-In order to conduct the experiment, the students would need to know the mass of the marble, the height of the drop, and the air temperature.
--- a/tests/good_pos_ngrams.p
+++ b/tests/good_pos_ngrams.p
--- a/tests/prompt.txt
+++ b/tests/prompt.txt
-"A group of students wrote the following procedure for their investigation. Procedure: 1. Determine the mass of four different samples. 2. Pour vinegar in each of four separate, but identical, containers. 3. Place a sample of one material into one container and label. Repeat with remaining samples, placing a single sample into a single container. 4. After 24 hours, remove the samples from the containers and rinse each sample with distilled water. 5. Allow the samples to sit and dry for 30 minutes. 6. Determine the mass of each sample. The students’ data are recorded in the table below. Sample Starting Mass (g) Ending Mass (g) Difference in Mass (g) Marble 9.8 9.4 –0.4 Limestone 10.4 9.1 –1.3 Wood 11.2 11.2 0.0 Plastic 7.2 7.1 –0.1"
-
--- a/tests/sa_data.tsv
+++ b/tests/sa_data.tsv
-"score"	"answer"
-1	""
-1	""
-2	"The unfilled pband of Mg can be used exactly the same way as the unfilled sband of Na is used to be filled with excited electrons."
-2	"The unfilled pband of Mg can be used exactly the same way as the unfilled sband of Na is used to be filled with excited electrons."
-0	"Magnesium is a metal inspite of its fully filled sband, because of the availability of the empty Porbitals that make up the Pband which can grow close enough to overlap with some of the sorbitals in the sband and in so doing gives rise to hybridisation. Mixing of bands of the s and p orbitals is responsible for the metal properties exhibited by Mg."
-3	"Magnesium is a metal inspite of its fully filled sband, because of the availability of the empty Porbitals that make up the Pband which can grow close enough to overlap with some of the sorbitals in the sband and in so doing gives rise to hybridisation. Mixing of bands of the s and p orbitals is responsible for the metal properties exhibited by Mg.nnNa has empty sorbitals in the half filled sband that means availability states to transit or move for electronsn"
-3	"the overlapping pband and sband of Mg metal ensure it has unoccupied energy levels it only takes up 14 of the valence orbitals "
-3	"the overlapping pband and sband of Mg metal ensure it has unoccupied energy levels it only takes up 14 of the valence orbitals nSodium has only 1 valence electron in the sorbital and so, it has halffilled sband"
-3	" Metals tend to be good electronic conductors, meaning that they have a large number of electrons which are able to access empty mobile energy states within the material.n Sodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.n Magnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-2	"Sodium behaves as a metal because the electrons in the sband are able to accept energy from an electric field so can move into the unfilled sbands. In the case of Magnesium even though the s band is filled the electrons can still accept energy from an electric field but instead of moving to an unfilled sband it moves to an unfilled pband still enabling it to move around the structure. This gives Magnesium its metallic properties."
-3	"All metals tend to conduct electricity as the electrons in the structure can change energy states. Sodium acts as a metal as the sband is only half filled so the electrons can change energy states into the unfilled bands. Magnesium acts as a metal because even though the sband is full the s and p bands overlap so the electrons in the sband can change energy states into the p band allowing them the conduct electricity "
-1	"the energy difference the filled s band and the empty band is negligible.nhence a small amount of energy can excite electrons to the empty band.nhence they conduct electricity and act as metalsn"
-3	"Hi"
-3	"Metals are good electronic conductors as they have a large number of electrons which are able to access empty mobile energy states within the material.nSodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.nMagnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-2	"The p orbitals of Mg are very close in energy levels to the s orbitals so the electrons can jump from the s to p orbitals in much the same way that tha Na electrons can move to the upper unfilled s orbitals."
-2	"Because the highest filled sband molecular orbital in Mg metal is situated near the empty pband molecular orbital, so electrons need really a little amount of energy to go from one molecular orbital to another. Just like in Na metal."
-3	"Both elements have filled unfilled orbitals.nIn Na, electrons can be excited from the highest filled bonding s orbitals Valence bands into the lowest unfilled s orbitals antibonding u2013 Conduction bands.nIn Mg, the 3s 3p valence orbitals interact much more strongly with each other than with the filled core shells the Valence bands have a large band width u2013 so large, in fact, that the band width is wider than the energy gap the bands overlap. Thus it requires only a small amount of energy to excite an electron from a filled s orbital s band into an unfilled p orbital p band. This electron can then migrate through the crystal in response to an applied electric field thus conduct electricity.n"
-3	"Metallic behavior can arise either from the presence of a single partially filled band or from two overlapping bands one full and one empty."
-2	""
-1	"The electrons at the top of the filled sband in Mg can move into the empty pband."
-3	"First, we will define a metal as being a good conductor of electricity, that is, a material where the electric charges move through easily. We know that the mobile charges are the electrons.nIn order for a solid to be a good conductor, the electrons need to dispose of empty extended molecular orbitals to which they can access with the small amounts of energy that are provided by an external applied electric field the electrons move then through these extended orbitals.nIn the case of Na atomic structure Ne 3s1, the system presents an sband, that is, a u201ccontinuousu201d bundle of extended molecular orbitals, which in this case is half filled. In this way, when an electric field is applied to the system, the electrons placed in the upper occupied band levels of the sband take the transferred energy and move to band levels with slightly higher energy, and then they move through the extended molecular levels linked with these band levels.nIn the case of Mg Ne 3s2, the sband is completely filled, so there is no possibility of transitions inside this band. However, the p atomic orbitals of Mg form a pband which overlaps with the sband, in such a way that the electrons in the upper part of the sband can jump to the pband with the amounts of energy provided by the applied electric field, and move through the pband extended molecular orbitals."
-1	"The upper sband orbitals of magnesium have pband orbitals close to them. So the electrons can move to the porbitals accepting energy of the electric field."
-2	"The electronic configuration of magnesium is 1s2 2s2 2p6 3s2. In this structure it appears that all the energy bands, 1s, 2s, 2p and 3s are completely filled, thus not satisfying the metallic feature according to band theory, however the magnesium is a metal. This is due to the fact that the 3s and 3p bands overlap in energy and consequently some electrons enter from 3s to 3p band before all the allowed states in 3s band are occupied. Therefore the current flow is possible and therefore the magnesium has a metallic behaviour."
-2	"The electronic configuration of magnesium is 1s2 2s2 2p6 3s2. In this structure it appears that all the energy bands, 1s, 2s, 2p and 3s are completely filled, thus not satisfying the metallic feature according to band theory, however the magnesium is a metal. This is due to the fact that the 3s and 3p bands overlap in energy and consequently some electrons enter from 3s to 3p band before all the allowed states in 3s band are occupied. Therefore the current flow is possible and therefore the magnesium has a metallic behaviour."
-3	"p orbitals overlap with a orbitals"
-2	"THEY HAVE EMPTY SHELLS AT THE TOP FOR MOVING ELECTRONS "
-2	"The electronic configuration of sodium is 1s2 2s2 2p6 3s1 and the electronic configuration of magnesium is 1s2 2s2 2p6 3s2. All the energy bands are completely filled. The magnesium has a metallic behaviour because this is due to the fact that the 3s and 3p bands overlap in energy and consequently some electrons enter from 3s to 3p band before all the allowed states in 3s band are occupied. "
-2	"Because the electrons can be affected by an electric field and go to the higher level "
-1	"Electrons at the top of the sband of Mg, when influenced by the electric field, have enough energy to jump to the bottom of the pband, thus displaying metallic properties."
-1	"At Sodium has at last level of 1 s electron, and it is not u0440 electron. At Magnesium has at last level of 2 s electron, and it is not u0440 electron, but can pass 1 electron with s on u0440"
-3	""
-3	"In both cases, there are electrons whose energy levels are located such that there is another energy level only a little bit above them, thus allowing them to absorb the tiny amount of energy imparted by an electric field in the presence of collisions."
-2	"As these two bands overlape they become free electrons and act as conductors.rising levels of these electrons acquire an additional energy which enables them to move higher to unaccupied levels,transforming into kentic energy.thus behaving as metals."
-3	"The sband is complete in the Mg because the Mg has two electrons of valence and the sband of Na only has one electron.nnThe electrons in the Na, can move in the sband but the electrons in the Mg only can move to pband."
-2	"Na is a metal based on the previous lecture. Mg is a metal because the p and s bands overlap so electrons from the full sband can move into the pband with no energy barrier. Once in the pband, the Mg electrons are free to conduct by gaining minute amounts of energy from the electric field."
-2	"The sband of Magnesium is filled because it has double electrons than Na.nMagnesium is metal because the electrons closed to the filled boundary can move to the pband."
-2	"This is because the energy levels between the molecular orbital s band and the p band are very close to each other. And therefore any disturbance caused by an electric field is felt by the electrons present in these bands, who respond to this stimulus moving between spaces available."
-2	""
-2	"Sodium is a simple metal we have a halffilled sband.nMagnesium The sband and the pband are overlapped. Althought the sband is filled, we have empty orbitals in the pband.n"
-1	"Because there is no gap between the s and p bands of Mg"
-3	"Sodium is behaving like a metal because it have a half band occupied, and its possible that some electron can jump to unoccupied orbitals easily.nnMagnesium is behaving like a metal because it have two band very closed, and its possible that some electron of sband can jump to pband easily.nnIf the gap between bands is greatest, the element is behaving like nometal, because the promotion of electrons would be difficult"
-0	""
-1	"Magnesium behaves as a metal even though its sband is filled because the 3s and 3p bands overlap. Which allows some electrons to enter the 3p band before all the states in the 3s band have been filled. Making both bands partially filled and giving electrons access to empty states close to the top of the nearly filled boundary."
-2	"An atom of sodium has electronic configuration 1s2 2s2 2p6 3s1 and therefore, the energy levels containing electrons are slevel, 2slevel, 2plevel and 3slevels. of these 1s, 2s, 2p levels are completely filled, but the 3s level is only half filled. accordingly in a solid sodium piece containing Natoms, there would be a 1sband, a 2sband, a 2pband and a 3sband of energy containing 2N, 2N, 6N, 2N, electronic states.of these 1s, 2s and 2pbands are completely filled but the 3sband is only halffilled.nNow if an electric field is applied across the piece of solid sodium, then the electrons in the valence band easily acquire additional energy to move to the higher unoccupied levels within the same band without acrossing any energy gap. the additional energy is in the form of kinetic energy and the moving electron consititutes electric current.nFor the matel magnesium whose electronic configuration 1s2 2s2 2p6 3s2. in this structure it appears that all the energy bands are completely filled, thus not satisfy ing the metalic feature according to band theory, but magnesium is a metal. this is due to the fact that the 3s and the 3p bands overlap is energy as shown in the figure above, and consequently some electrons enter from 3s to 3pband before all the allowed states in 3sband are occupied. this makes the 3sband partially filled and 3pband partially occupied. therefore the current flow is possible. this is explaine the metallic behaviour of magnesium."
-2	"An atom of sodium has electronic configuration 1s2 2s2 2p6 3s1 and therefore, the energy levels containing electrons are slevel, 2slevel, 2plevel and 3slevels. of these 1s, 2s, 2p levels are completely filled, but the 3s level is only half filled. accordingly in a solid sodium piece containing Natoms, there would be a 1sband, a 2sband, a 2pband and a 3sband of energy containing 2N, 2N, 6N, 2N, electronic states.of these 1s, 2s and 2pbands are completely filled but the 3sband is only halffilled.nNow if an electric field is applied across the piece of solid sodium, then the electrons in the valence band easily acquire additional energy to move to the higher unoccupied levels within the same band without acrossing any energy gap. the additional energy is in the form of kinetic energy and the moving electron consititutes electric current.nFor the matel magnesium whose electronic configuration 1s2 2s2 2p6 3s2. in this structure it appears that all the energy bands are completely filled, thus not satisfy ing the metalic feature according to band theory, but magnesium is a metal. this is due to the fact that the 3s and the 3p bands overlap is energy as shown in the figure above, and consequently some electrons enter from 3s to 3pband before all the allowed states in 3sband are occupied. this makes the 3sband partially filled and 3pband partially occupied. therefore the current flow is possible. this is explaine the metallic behaviour of magnesium."
-2	"An atom of sodium has electronic configuration 1s2 2s2 2p6 3s1 and therefore, the energy levels containing electrons are slevel, 2slevel, 2plevel and 3slevels. of these 1s, 2s, 2p levels are completely filled, but the 3s level is only half filled. accordingly in a solid sodium piece containing Natoms, there would be a 1sband, a 2sband, a 2pband and a 3sband of energy containing 2N, 2N, 6N, 2N, electronic states.of these 1s, 2s and 2pbands are completely filled but the 3sband is only halffilled.nNow if an electric field is applied across the piece of solid sodium, then the electrons in the valence band easily acquire additional energy to move to the higher unoccupied levels within the same band without acrossing any energy gap. the additional energy is in the form of kinetic energy and the moving electron consititutes electric current.nFor the matel magnesium whose electronic configuration 1s2 2s2 2p6 3s2. in this structure it appears that all the energy bands are completely filled, thus not satisfy ing the metalic feature according to band theory, but magnesium is a metal. this is due to the fact that the 3s and the 3p bands overlap is energy as shown in the figure above, and consequently some electrons enter from 3s to 3pband before all the allowed states in 3sband are occupied. this makes the 3sband partially filled and 3pband partially occupied. therefore the current flow is possible. this is explaine the metallic behaviour of magnesium."
-3	""
-3	"In all metals, a large number of electrons are free to move about the crystal at room temperature. Sodium is a metal because the valence band half of the sorbitals is filled, and it only takes an incremental amount of energy to excite an electron from the valence band the conduction band. Magnesium is a metal because the sorbitals are fully filled, but they overlap with the porbitals, so it is easy to move electrons from the valence band to the conduction band."
-2	"The sband of the sodium metal is only halffilled, therefore the electrons may move within the band to the higher energy levels.nnDue to the fact that the sband and the pband of the magnesium metal overlap, there is no significant difference in energy between these two bands, and the electrons may move from the sband to the higher energy levels of the pband."
-0	"It is because both have one electron in the outer band. whatever. huh"
-3	"High energy sband electrons in Mg metal may be excited to the pband orbitals, hence meaning they are able to accept the energy imparted by the electric field and move hence conducting a current."
-2	"While the 3s band could be filled, the upper range of energies in this band overlap with the lower range of energies in the 3p bands. It requires little energy for electrons to move into vacant conduction levels."
-3	"magnesium Mg has an electron configuration 1s2 2s2 2p6 3s2, it has a filled 3s shelln therefore the filled 3s shell on its own would not allow for electrons to gain energy, if the 3pnband was separated by a gap from the 3s bandn in Mg however, the 3s and 3p bands overlap in energy the 3p band can accommodate 22l1 6 electrons per atom, . jointly the 3s and the 3pnnorbitals form a band that can accommodate 8 N electronsn thus the conduction band is only 25 filled rendering Mg a good conductor and magnesium behave as metalnnso in sodium a good metal conductor there are 22l1 with l 0navailable electron states per atom in the 3s shell only one of which isnfilled with the single valence electronn as a result the 3s shell of sodium is only half filled n thus the electrons are free to change their energies within the 3snbandn this allows electrons to pick up a kinetic energy from an appliednelectric field leading to a electron drift velocity generating currentnthat makes Na a good metallic conductor"
-3	"Magnesium conducts because of the overlapping of s and p energy bands. So even thought 3sband is completely filled, 3pband become aviable. Given that i havent any energy gap between bonding and conducting band,because now electrons can fill free 3pband ,Magnesium exhibits conduction."
-3	"Sodium can conduct due to halffilles 3sband,so providing an external source of energy such as heat or elettric fields,electrons are free to occupy higher energy sband level.nMagnesium conducts because of the overlapping of s and p energy bands. So even thought 3sband is completely filled, 3pband become aviable. Given that i havent any energy gap between bonding and conducting band,because now electrons can fill free 3pband which is just a little over the highest sband energy,Magnesium exhibits conduction."
-1	"There is a partial overlap of the 3s and 3p bands in Mg, therefore if there is enough energy the electron can transfer from valence band to the empty 3 p conduction band."
-1	"Electronic configuration of Mg 3s2e 3p0e. 3s and 3p orbitals have very close energy, so electrons can occupy free 3porbitals, move there freely."
-2	"Because the metallic behavior can arise from two overlapping bands one full and one empty, that is the case for Magnesium."
-3	"Na metal sband not fully completed, an e can go upnMg metal sband and pband overlap an e can go upn"
-1	"Magnesium is showing metal properties because it has pband which is overlaping with filled sband and thanks to this it is accesible to the highest energetic electrons from this sband"
-3	"because p and s waves are superimposed"
-1	"The filled sband is overlaped by empty pband. The electrons from the sband have access to empty state very close to the top of the sband. "
-0	""
-1	"The pband of Mg overlap the s band, creating a way for the electrons."
-3	" Metals tend to be good electronic conductors, meaning that they have a large number of electrons which are able to access empty mobile energy states within the material.n Sodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.n Magnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-1	"In magnesium, the s and p orbitals are very close in energy so there is an overlap of orbitals and the electrons can easily move from the filled sband to the empty pband"
-3	"For mg, the sband electrons bordering the pband behave similar to the electrons in the middle of the halffilled sband."
-0	""
-0	""
-3	"The difference in energy between s antibonding orbitals and p bonding orbitals is small they are overlaped, so electrons can go from s band to p band when are influenced by a electric field."
-0	"The metallic bond is independent of the valence electrons."
-0	""
-3	"Metals are a good electronic conductors.nBoth of metals have energy levels available"
-3	"Orbitale Mg mogu0105 przyju0105u0107 8 elektronu00f3w zaju0119te su0105 tylko przez 2. Tym samym pozostaje au017c 75 wolnego miejsca na wzbudzanie elektronu00f3w. Orbitale Na 4 elektrony su0105 zaju0119te przez 2. Maju0105 50 miejsca na wzbudzone elektrony."
-1	"The upper s band of Mg overlaps its pband thus providing a partiallyfilled joint band that allows electron movement. "
-3	"In sodium, the valence and conduction bands are very close in energy so that even the small amount of energy an electron will receive is enough to excite the electrons to the conduction band.nIn magnesium, the sband is filled. However, the empty pband orbitals are now available for the electrons to be excited into some, in fact, are lower than the maximum energy level of the filled sband."
-3	"The magnesium behaves as a metal because it has an open pband overlapping with the filled sband. Therefore the electrons that are near the border between the two bands can absorb energy and be influenced by an electric field. This mimics the half filled sband present in Sodium metal."
-3	"They both behave as metals because they are metals. nnAlthough the outer 3s energy band in magnesium is filled, these metalnhas good electrical conductivity because their 3s bands overlap their 3p bands. In the case of magnesium, the empty 3p band combines with the 3s band to form a partially filled 3sp band.nnSodium atoms form an extended array in which the valence electron can be delocalized generally across the array. The ionization energy is sufficiently low to allow this. The underlying reason is that its electrons are delocalized , that is to say they are present more as a cloud than as in association with specific atoms.nnA metal is defined as an element that readily loses electrons to form positive ions cations and forms metallic bonds with other metal atoms. Metals form ionic bonds with nonmetals.nn"
-1	"Magnesium has an unfilled p band, in which electrons may get excited and jump to. This frees up the s band a bit too."
-3	"Metals have lots of empty energy levels hence being a good conductor.nnMagnesium has s and p overlap a higher energy state is right above the s for electrons to jump to.nnSodium is half filled s, so there is an empty higher energy state available immediately above."
-3	"Both have available low lying available energy levels near their highest occupied molecular orbital. In the case of sodium, those available levels are part of the same band formed from the superposition of s orbitals, while in the case of magnesium, the low lying energy level is part of a pand formed from p orbitals."
-2	"In Mg, the empty p band overlaps in energy with the full s band, allowing mobility of electrons in the same way that a partially full band would."
-2	"Bottom of pbang in Mg is close enough in energy to the top of the sband for the electrons to jump there. "
-3	"Na has a halffull sband, so there are a lot of electrons to move and an available energy state for them to move to.nnThe sband of Mg is full, but the bottom of pband is close enough in energy to the top of the sband for the electrons to jump there. Again, there are plenty of electrons available for this jump. "
-2	"Because they have the half band full nnTheir orbital mixes "
-3	"Sodium behaves as a metal as it has a half filled sband and the electrons on the highest level of the filled states can accept the energy and move into higher states to exhibit metallic properties.nIn case of magnesium, though the sband is completely filled, it overlaps with the pband of the crystal. As a result of which the electrons on the outer level can accept the energy to move into the empty pband and hence behave as a metal."
-1	""
-3	"Conduction in overlapping pband."
-3	"Engery of lower pband orbitals are very similar and even lower than the highest sband orbitals. This allows very the electrons to absorb very small amounts of energy as then get occupy orbitals with slightly higher energy."
-1	"Even though the 3s orbitals of Mg is filled which can occupy the sband where as empty 3p orbitals available for conduction. "
-2	"both the Na and the Mg have a half filled band so electrons can only get a higher energy at the boundary level between the filled part of the band and the unfilled part of the other band"
-2	"Metals have partially filled bands of molecular orbitals that allow electrons to absorb very small amounts of energy relative to non metals. e.g. electrons can travel through metal easily because it can absorb very low amounts of energy in an electric field by being excited to a higher molecular orbital. nNa has a half filled sband made up of 3s orbitals and it takes very small amounts of energy to excite the electron to the next molecular orbital in the band.nMg has enough valence electrons to fill up the entire sband but because the s and p bands overlap there is still a very small energy gap between molecular orbitals and the Mg crystal still exhibits metallic behaviour."
-3	"In Mg, some electrons enter from 3s to 3p band before all the allowed states in 3s band are occupied. this makes the 3s band partially filled and 3p band partially occupied. therefore the current flow is possible."
-3	"The bands derived from the 3s and 3p atomic orbitals are wider than the energy gap between then resulting in overlapping bands. This causes a combined band from the overlap of the 3s and the 3p orbitals with rooms for 8 electrons. This means the combined band of Mg is only partially filled which is crucial for the metallic behavious because there are unoccupied energy levels at an infinitesimally small energy above the highest occupied level."
-1	"Because in both cases, there are unoccupied orbitals that are close in energy to filled orbitals. Electrons in these nearby filled orbitals can accept small amounts of energy and move into these unoccupied orbitals.nnIt is the ability of electrons to accept small amounts of energy that defines metallic behaviour"
-1	"The molecular orbitals relating to the 4s and 3p subshells overlap as there is a relatively small energy difference between these levels when viewed as atomic orbitals. This overlap in energy levels means the two subshells merge to form a single band and this allows spare energy levels for the electrons to move into."
-3	"We saw that the half empty sband gives the higher energy electrons opportunity to be excited and form a current. This principle explains the metallic characteristics of Na. For Mg, the sband is filled, but the empty pband, which overlaps the top of the sband in terms of energy level, functions as the next energy level to be excited to. From this theory, we can further assume that the reason elements such as P are non metals, is that their pband and sband differs greatly in energy."
-2	"in soduim, half of the sband is filled and as we can see from the picture, we need a small amount of energy the make the valence electrons jump from the bonding band to the antibonding band.nIn magnesuim, even if the sband is filled, the pband is overlapped with the sband, so we only need a small amount of energy the make the valence electrons jump from the s band to the p band."
-2	"If the pband energies are close to the sband energies, or even overlapping as the diagram suggests, the high energy electrons in the sband can easily move to an unoccupied pband."
-2	"The metallic properties of sodium are obvious, due to the half filled sband, which provides the amount of electrons near the edge of filling, occupying the energy states which enable the electrons to accept the energy from the field and move to a higher energy state in the sband.nnLooking at the band diagram of magnesium we can see that even though the sband is completely filled, the bands have a small overlap. Thus it gives the ability for the highenergysband electrons to be affected by the electric field the ability to change their energy, jumping to the empty pband."
-3	"They are metals because electrons can go to an empty band with a really little jump in energy.nnNa is a metal because the s band in halffilled, so electrons can move on the bandnnMg is a metal because even having a full sband, the pband overlaps it, so electrons can go to the pband as if it was a continuous halffilled band"
-3	"band theory can easily explain the electrical conductivity of the metal as evidenced by the corresponding example sodium, sodium in the inner bands are completely filled, while the bands originated from the atomic orbitals of the valence shell 3s and 3p bands, is half full 3s, 3p empty, being 3s the valence band and the next higher conduction band. magnesium has completely filled valence band you would expect in principle that magnesium ho not have driver for free energy levels in the valence band. but the fact that for the equilibrium internuclear distance of the valence band overlaps the conduction band levels makes this easily accessible to the valence electrons, thus favoring metallic conduction."
-1	" Even if the sband of Mg is completely filled it behaves as a metal, because the only prerequisite for behaving as metal is at least some of the electrons in its filled band must have access to an empty higher energy band .n In Mg even if the s band is completely filled some of the electrons in this completely filled sband present at the boundary is having access to the subsequent higher energy empty pband since both these bands overlaps each other.n Hence it act as metal."
-3	"Becouse both of them have e near the boundary."
-1	"3s shell of sodium is half filled, and 3s2 of magnesium is completely filled with electrons."
-3	"Magnesium behaves like metal due to superposition of 3s and 3p orbitals due to the small energy difference between them. Low Band Gap"
-3	"Both have partially filled conduction bands. Its just that the Mg Conduction band is made of overlapping s p bands."
-2	"Because the magnesiums 3selectron can replace to one of the 3porbitals."
-2	"Because the sband for the magnesium is in contact with its pband, so some electrons can go from the highest orbitals in the sband to the lowest ones in the pband."
-2	"Because the sband for the magnesium is in contact with its pband, so some electrons can go from the highest orbitals in the sband to the lowest ones in the pband."
-2	"Magnesium behave as a metal because of a partial overlap of the 3s and the empty 3p bands. With this overlap, electrons can be activated into empty 3p states and exhibit conduction, as in the partly filled s band in Na."
-1	"Because of overlap between pband and sband , the high energy level electron in magnesium can move up to unoccupied pband."
-2	"Because the combination of the p atomic orbitals of Magnesium generates a band of p molecular orbitals that can be occupied by the electrons when excited"
-3	"Because the combination of the p atomic orbitals of Magnesium generates a band of p molecular orbitals that can be occupied by the electrons when excited"
-3	"sodium behaves as metal because electrons can move from one molecular orbital to another molecular orbital in sband , thus allowing conduction . in case of magnesium the energy gap between sband and pband is very low , so even though the sband is filled conduction occurs by movement of electrons from sband to pband , thus magnesium also behaves as metal."
-3	"Because, the electrons at the top energy level are able to move to a empty level which has a higher energy. For sodium, its s band is not full so for some electrons they can get energy to another higher level. As for magnesium, although the sband is filled, the pband has the near energy, or in other words, the two energy band have intersect. So the electrons at the sband can get energy and move to the pband.nFrom the analysis above, we can see that both sodium and magnesium behave as metals"
-2	"To be a metal electrons need to be able to move. They do this by moving from one energy band to that if a higher energy band.nIn Na the other s orbital is vacant, so electrons can be promoted into these.nIs 2s. In 2 orbital are also spaces for 6 p electrons. nMg can be a metal as electrons promoted from s to p"
-3	"Although the sband of magnesium is filled, the band gap between the sband and the pband is very small, this allows the electrons from the sband jump to the pband. The electrons in Na also have this movility in the sband, this movility gives the material the proporty of conduct electricity typicall of metals "
-2	"The pband of magnesium is so close to the sband they actually overlap that it takes nearly no energy to excite an electron into the pband. Therefore, electrons near the pband can accept energies that are released from colliding with nuclei and can be accelerated by an electric field. Also, they can absorb light of many different wavelengths and reemitt light of many different wavelengths, giving the metal its shiny appearance."
-0	"Na and Md have selectrons on the valence electron level. Na 1e and Mg 2e. But in Mg crystal electrons can occupai pband too becaus they can emigrate from slevel to plevel. And Na have only sband, but Mg can have s and pbands together.n Electrons from Na and Mg crystals can be cut of easy from sband and pband."
-1	"It is obvious that the magnesiums pband orbitals overlap with the sband. This provides the possibility for the electrons from the sband to diffuse into higher energy orbitals, thus every metal atom with nearly energetically allocated other types of bands can behave as metal conduct current."
-2	"Thats because the pband of magnesium is close to its sband and nottaken such that the electrons from the latter can move into the pband when electric field is applied."
-2	"Both sodium and magnesium behave as metals, even though the sband of magnesium is filled because the 3p orbitals of magnesium, while empty in an single atom, expand into a band that overlaps the 3s band. As magnesium atoms push together, the electrons initially enter the 3s band, but ultimately the highest energy electrons in the 3s band spill over into the lower energy levels in the 3p band. Sodium, instead, behaves as a metal due to the half filled 3s band that forms the cohesive energy."
-2	"Sodiums electrons can move because sband is halffilled. Mg has an overlap between pband and sband, so that electrons have access to the higher energy state from sband to pband."
-2	"The sband of Mg is filled with electrons 3s2. However, the s and p band energies have started to overlap.Electrons from the upper energy levels of the sband can jump into the pband and have freedom to move. As they can move about they can behave as a metal and conduct electricity."
-2	"electron configuration of Mg is Ne3s2.and we know tow energy levels 3s and 3p are close. however sbanding is filled, as schematic diagram shows sband and pband over lap energy, so electrons can change energy easily and get pband which is empty. in fact, Mg has conductive band .as result of it, Mg behave a metal.nn Na has an electron configuration Ne 3s1 so half of s shell is occupied. the schematic diagram shows just half of sband is filled therefor Na has conductive band and electrons have opportunity to change energy easily. because they dont a lot of energy so, it is a good electron conductor. "
-2	"The Na would because of the 3s1 orbital,this would allow the electrons to have access to the boundary. The Mg because it overlaps into the 3P orbitalnallowing room for electrons energy gain. "
-1	"Electrons can easily move into empty pband, hence conductive,"
-3	"Part of the bonding region in the pband of Magnesium is around the same energy as part of the antibonding region of the sband. The sband of Magnesium is completely filled, while the pband is completely empty. Therefore, electrons in the s subshell can accept energy and transfer into the p subshell. This allows for conductivity and Magnesium behaves like a metal. Sodium behaves like a metal because of energy transfer within the s subshell that occur between the bonding and antibonding band regions."
-1	"Because still only few electrons compared to the whole neighbours with the pband.n"
-0	""
-3	"Sband is halffilled in Na and empty bands are very close to full bands so electrons can jump to conduction bands. In Mg pband overlaps with sband to give similar properties.n"
-3	"For a material to behave as a metal or exhibit properties of a metal, it is required to have an empty band of molecular orbitals with slightly higher energy. This is because, the properties of a metal are on account of the existence of free electrons within the structure. Now, In sodium metal, the band of molecular orbitals formed is from the 2s atomic orbitals only. This band is halffilled, and as a result, the electrons with the energies equivalent to or just equivalent to the level of the halffilled band can be excited and sent to the unfilled half part of the band. However, in case of Magnesium metal, in spite of both the electrons in a single atom of Magnesium being in the 2s atomic orbital, the molecular orbitals band is formed from the 2s as well as the empty 2p atomic orbitals. The electrons completely fill up the s part of the band, but the p part remains unfilled. This facilitates for some of the electrons to be excited and sent to a higher energy band, where they can be free to some extent. It is this availability of the empty part in the band of molecular orbitals, and thus of electrons to be able to excite, that enables Magnesium metal also to behave as a metal, justlike Sodium. "
-2	"In Mg metal electrons from the filled sband can move into the lower levels of the empty pband, as its energy level overlaps with the top of the sband. This movement of electrons in energy bands allows the metallic behaviour. "
-3	"According to Aufbaus principle, the lowest energy states of the band are filled first and the upper states remain empty u2013 but can readily be occupied by electrons upon thermal excitation or the application of an electric field. In other words, to be a metal, it requires some of the electrons in the electronic structure to have access to empty states very close to the top of this filled boundary. nHowever, in the case of Magnesium where the sband is filled, there is the pband for electrons to move to. There are unoccupied orbitals just on top of the 3sband which is the 3pband. As the sband and pband meet, they mix therefore giving electrons enough space to roam."
-2	"Because there is an overlap between the S band and the P band. Thus, it is possible for the electrons to get excited into the p band in the magnesium metal."
-3	""
-2	"Sodium is located in one family and this group all elements are metals, are excellent conductors of electricity, soft and highly reactive. Have outermost electron in an electron weakly bonded to the core and generally forms univalent compounds, ionic and colorless. He has one valence electron in the outermost orbital s an electron, which occupies a spherical orbital. Ignoring the internal filled electron shell, their electronic configurations can be written as 2s1. The valence electron is quite removed from the core. Thus, the core is loosely bound and can be removed with ease. In contrast, the remaining electrons are closer to the core are more firmly attached and removed with difficulty. At room temperature adopt the bodycentered cubic structure, with coordination number 8.nNow magnesium is less reactive than sodium. he is bivalent and form colorless ionic compounds, have two electrons in the outermost electron level is a strong reducing agent, the fact that it has two electrons in the outer shell gives a distinctive feature to the metal."
-2	"Magnesium behaves as metal even though the sband is filled because the pband is close in energy to the sband, so the electrons in the higher energy orbitals of the sband can move to the pband."
-2	"This is because the magnesium metal has low energy empty orbitals of the pi band that can accept electron density form the s band orbitals. This is, even though the s band is full, Mg metal can donate electrons to the p band and be modifyed by an electric field as any other metal with empty s band orbitals."
-2	"Magnesium can behave as a metal since, though to 3s subshell is full, the 3p subshell is not, and so electrons can jump to those orbitals and move freely. Thus magnesium can also display metallic properties, despite having a filled outermost orbital."
-3	"because there is empty energy states or unfilled orbitals very close to the top of the filled boundary"
-3	""
-1	"Perhaps Mg creates an sp2 hybrid orbital that eventually forms the molecular orbitals. Which can therefore be halffilled."
-1	"The overlap in s and pbands in Mg allows for the small energy transitions of electrons necessary for metals."
-1	"."
-2	"Magnesium behaves as metal, because the filled sband and the empty pband overlap. The pband in Magnesium is partially filled. "
-1	"This effect is due to the fact that in the magnesium metal, even though the sband is completely filled, the pband is empty and close enough in energy that the electron, when supplied energy, will jump from the sband to the pband."
-2	"Na only half of the sband is filled with e. With little energy gaining by an Efield, for example some e can reach empty sorbitals and travel around the cristal.nnMg the whole sband is filled with e. But the free pband overlaps on the energyscale and so some e can occupy pbandorbitals and freeing sbandorbitals and lowering the overall energy of the cristal. Now, with little energy gaining by an Efield, for example some e can reach empty s and porbitals and travel around the cristal."
-3	" The explanation is that, since 3p and 3s valence orbitals of adjacent atoms are farther from their nucleus, their interaction is stronger than that of filled inner orbitals in the core. This results in valence bands to have a larger bandwidth causing 3s and 3p orbitals to overlap and the energy gap between them to disappear. The consequence is that a new band with a total capacity of eight electrons is obtained.n This way Nas band and Mgs band, both formed by delocalized orbitals, are partially filled 18th and 14th respectively and comply with the two requisites to behave as a metal."
-1	"Because in both of the cases, there are electrons that have access to the empty states at the top."
-2	"Magnesium is a metal with the structure 1su00b2 2su00b2 2pu2076 3su00b2. For the quantum orbits with n3 it seems to be that the upper part of the 3s band created by all the atoms together in the solid overlaps with the lower part of its 3p band. That overlapping makes possible for eu207b to jump to the 3p as it was the conduction band empty of eu207b, with the result that it behaves like the antibonding band of the Sodium. In that way eu207b can travel through the valence band upper 3s band to the conduction band lower 3p band and the Mg behaves as a metal element. That implies the possibility to transmit a current through Mg and its metal behaviour."
-1	"The sband of Mg, though filled, overlaps with the pband orbitals. Electrons at the top of the sband are able to use orbitals in the pband."
-2	"The excited electrons from Magnesium would not be able to create a molecular orbital sshell if it is already filled, so they must jump into the porbital."
-1	"The molecular orbitals of Magnesium can be formed by both s and p orbitals i.e, s and p orbitals can hybridize because they are close in energy. Thus Magnesium will also have partially filled band and can behave as a metal."
-1	"In the magnesium, the sband and the pband overlap. Electrons are able to move from the sband to the pband the same way electrons are able to move higher in the sband in the sodium. "
-1	"Magnesium behave live metal because have empty orbitals in the pband that are closely the boundary filled orbitals of the sband. That way, the electrons of that boundary orbitals, can easily be influenced by the electric field."
-2	"By linear combination of atomic orbitals LCAO to determine the MO state, we obtain the number of electronic states depends on the chain of atoms within upper and lower limit an energy band. According to the Pauli principle, the electronic states orbitals within an energy band are filled progressively by pairs of electron.nWe can see that each 3s state of valence shell in the Na atoms is halffilled 3s1 while Mg atoms is doubly occupied 3s2 therefore the sband of Na will be halffilled and sband of Mg is filled actually there is a partially overlap of the 3s and 3p."
-3	" Metals tend to be good electronic conductors, meaning that they have a large number of electrons which are able to access empty mobile energy states within the material.n Sodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.n Magnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-2	"At interaction of atoms Mg participate not only ssubshells, but psubshells too,the continuous power zone is as a result formed, this zone is not fulled by electrons, pband adjoins to sband, and we observe u0435u0440u0444u0435 magnesium behave as metal"
-1	"Even though the sband of magnesium is filled, pband has plenty of empty orbitals, and there is no gap between the s and pbands, therefore electrons can behave in the same way, leading to metallic behavior."
-3	"Sodium behaves as metal, because it has a half filled sbandwith and enough free electronic states to which electrons can be excited by an electric field. In Magnesium the sband is completely filled. But since the sband overlaps with the pband, the small amount of energy supplied by an electric field is sufficient to excite selectrons to electronic states in the empty pband. So also magnesium is metallic. Just as in sodium, an electric field can induce an electric current."
-3	"p orbital is empty"
-3	"In Metals electrons are mobile.nIn Na, the higher energy electrons located in the valance band can easily transition to the conduction band. nnIn Mg the only difference is that the conduction band is a pband, not an sband. Mg highest energy electrons can easily move to this pband the conduction band."
-0	"Because the difference between states even in magnesium is very little and all the electrons from s band of magnesium act as free electrons somehow and they can move also with little energy."
-1	"The overlapping region between the sband and the pband confers metallic properties to magnesium."
-3	"Magnesium has all the 3s energy level completed with electrons, so it wouldnt be a conductor in normal cases , but in this , p band is very close in energy so the electrons can be excited and go to the p band making it conductive metal , if the gap was larger , we will have a semiconductor , and if it was more larger we will have non conductor . In Na metal the 3s level is not filled so electron can move in the last ocuppied level is half full so we have a good conductor ."
-2	"Because in Mg electrons are able to reach empty porbitals and thus accept the energy on applying electric field."
-2	"Because the electrons of sband of Mg, even this band is filled, can accept the energy of the electric field and fill the pband that is empty."
-1	"The p and sband in Mg overlap, so the electrons in the sband can be excited to an Elevel in the pband which serves as the conduction band. "
-2	"There is some overlap between the s and p band, so we get a hybridised sp band. This is only 14 full, so conduction can occur"
-3	"Magnesium behave as metal even though the sband is filled, because it has the pband completely empty. So, the electrons that are on the threshold between the sband and pband are able to acquire more energy by moving to the pband."
-3	"Metals tend to be good electronic conductors, meaning that they have a large number of electrons which are able to access empty mobile energy states within the material.nSodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.nMagnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-2	"The electrons are able to move into an unoccupied shell where they are free to move. "
-3	""
-2	"sodium has a half filled band and conducts by its antibonding half or conducting band, so itu00b4s a metal, in the Mg case we have an overlaping of the s and p bands so electrons can use the p orbitals to travel so the p band is the conduction band.nIm portuguese so sorry for any typos, and i saw all the classes before answering, i usualy take all the classes first"
-3	""
-3	"They are both metals because the electrons that are on the border of the Sband either S to p band as in Mg, or half the Sband as in Na. These electrons on the border are going to be affected by an electron field, causing them to move up into the empty shells. This will cause energy to be released, giving metallic properties. "
-2	"Sodium behaves as a metal because the electrons on the border of the filled and empty orbitals can be influenced by the electric field and can move up to a higher energy state. Magnesium behaves as a metal because the electrons in the overlap between the s and p orbitals have an ability to be influenced by the electric field and therefore move up to a higher energy state. This is why both sodium and magnesium behave as metals."
-2	"The atom of Na has next electronic configuration Ne 3s1. Sodium behaves as metal because the net of nucleons has a halffilled sband with 3selectrons, where there are filled the 3sbonding orbitals .nThe Mg with next electronic configuration Ne 3s2, has completed the 3s orbital, but still there are empty 3porbitals to be filled for any 3selectron in interaction with a electric field."
-1	"The sband and pband overlap in magnesium"
-3	"Both behave as metals because there are enough free carrier energy levels inmediately above the fermi level. No energy gap exists between both bands in the Mg metal because of the overlap between the s and pband. Consequently, any electron of these energy bands can move freely occupying empty levels inmediately above the fermi level and, thus, showing metal behavior."
-3	"Both Na and Mg behave as metals because they have empty MO close in energy to the filled MO. This means that the e at the highest energy filled MO can absorb tiny amounts of energy, so they can interact with an electric field when it is applied to the material."
-3	""
-3	"In both cases electrons can gain a little part of energy be accelerated by the electric field and occupied a higher state. Sodium sband is half full and electron around middle can be excited to the higher state in sband. In Magnesium case full filled sband, the higher states in sband overlap the state in pband. Energy of highest state in sband is higher then lowest state in pband. Electron of highest state in sband can get a little part of energy and occupied a pband state, so it can be accelerated by the electric field."
-3	"Sodium has a completely filled s valence band, but the s conduction band is empty and very close in energy to the valence band, so electrons at room temperature have enough energy to move into the conduction band where they can flow freely. Magnesium, in contrast, has a completely filled s band, but the empty p band overlaps with the s band, again allowing the electrons to transition from one to the other and flow."
-2	""
-0	""
-3	"Metals tend to be good electronic conductors, meaning that they have a large number of electrons which are able to access empty mobile energy states within the material.nnSodium has a halffilled sband, so there are a number of empty states immediately above the highest occupied energy levels within the band.nnMagnesium has a full sband, but the the sband and pband overlap in magnesium. Thus are still a large number of available energy states immediately above the sband highest occupied energy level."
-1	"because the pband is very close in energy to the s band and we have a lot of empty p orbitals close in energy to the sband and because the p orbitals are partly filled."
-2	"in sodium 3s orbit is half filled and in magnisium electrons jumps from 3s to vacant 3p orbit."
-3	"Nes turi laisvu0105 p band p orbitales u012f kurias gali peru0161okti elektronas."
-1	"The different level of energy are very similar for Mg as seen, there is some overlapp that allows electron to move into higher energy simply with room temperature."
-2	"Thinking only about the sband in a magnesium crystal, its not clear how this metal have a behavior ofa metal. The full band shows that we cant have charge carriers, so, the eletrical condutivity, based on this facts, must be zero.nFor another point, we need to take care with the pband energy structure. If we look to the figure, we can see that the top of th sband have the same energy of the bottom of the pband. This means that, for thermal excitations, the charge carriers electrons can jump for the pband, where they are free to move as charge carriers."
-1	"Magnesium electrons still have unfilled energy states that are relatively close to the top of the sband."
-1	"Band s and band p overlap, so the electron can go fron s band into p band, because there is no gap between s and p band."
-1	"because the electron can be excited to the p band."
-3	"The bands overlap, so no problem if an electron from the s band gets some small energy. It just moves to pband. In any case the electron only needs a valid orbital to move to and is pretty much indifferent on how we name it."
-3	"The Na has the s band half filled, so the electron can move trough the half band that is empty, while the Mg, in spite of having the s band filled, is also a metal because there are not energy gap between the s band and the p band, so the electron can move trough the p band"
-1	"Because the sband and pband of Mg overlap, i.e. they are partially degenerate, electrons in orbitals close in energy to the pband can accept energy and move into pband orbitals. If the s and p bands did not overlap, the electrons from the s band would not be able to cross the band gap they would not have enough energy."
-2	"Its because the energy level of the free pband begins under the top of sband, so the electrons can freely move from one band to another and use free pband as the extension of the filled sband. "
-0	"Sodium and Magnesium both behave as metals, even though the sband of magnesium is filled because the band is the same size with similar magnitude despite Magnesium having a mixture of a pband and a sband slightly overlapping each other as same diameter in this comparison showing same pressure because Magnesium usually has a larger atomic radius."
-2	"Sodium behaves as a metal because even though the molecular orbitals are filled, the electrons near the boundary between the bonding and anti bonding orbitals are able to move into the anti bonding orbitals a higher orbital when an electric field is applied because they are supplied with energy which accelerates the electrons through the crystal structure. The same applies for Magnesium Metal because all though the sband is full the electrons near the boundary are able to move into the higher empty porbitals."
-3	"In order for the elements to behave as metals, both Na and Mg must have empty states in their respective bands for electrons to occupy when accepting energy from an electric field. Sodium has a halfempty sband, which is occupied by boundary electrons when they accept energy. Although the sband of the Mg metal is filled, there is an overlap between the pband and the sband. Boundary electrons in the Mg metals sband are therefore able to occupy empty states in the pband when accepting energy from an electric field, which is the behavior necessary to be a metal."
-1	"Because the distance in energy between s and p orbitals in Mg is small, and the higher energy s electrons can propell themselves under the force of an electric field to the higher in energy p band. So p band in a way is acting as an empty continuation of s band.nThe low electronegativity of Mg helps this,too."
-2	"In both cases, there are unoccupied orbitals withinreach of occupied orbitals. In sodiums case, it is the upper half of the sband within reach of the lower half in magnesiums case, it is the pband within reach of the sband."
-1	"Mg Electrons can move into the unfilled Pband where they can conduct. The filled SBand overlaps with the unfilled PBand and so not much energy is needed to achieve a conducting state."
-3	"Presumably the electrons in Mg near the surface can move into the p orbitals when they gain energy"
-0	"Presumably the electrons in Mg near the surface can move into the p orbitals when they gain energynNa has s orbitals that the energised electrons can move inton"
-3	""
-1	"Because e in magnesium could jump out to the pband"
-1	"Because the metals need some free space to move electrons and they both have enough free space."
-1	"Because the sband and the pbands overlap thus the electrons in that overlapping area can move to the pband."
-2	"Both elements have valence bands that are partly filled, so in both cases electrons can be promoted to other energies within the band for a very low deltaE. "
-2	"There are orbitals on top of the upper most filled molecule orbitals."
-2	"Because energy levels of p and sbands in Mg overlap no band gap, electrons can accelerate from valence to the conductivity zone."
-3	"As we have seen in the sreenshot before, for an electron in order to accept the energy given by an electric field, it needs to be able to move up to another unoccupied level of energy. Only the electrons near the boundary between the filled molecular orbitals and the empty molecular orbitals can do so. This means that for a given element to have metallic character, it requires some of the electrons in the electronic structure to have access to empty states very close to the top of this filled boundary. For both Na and Mg, this is the casenNa has access to the non filled orbitals in the s band, while Mg has access to the non filled orbitals of the pband, that are that close in energy with its Sband that both sband and pband are able to overlap, allowing the electrons to reach the nonfilled energy levels."
-1	"Because the electrons would gain enough energy to jump orbitals if there is an electric field."
-2	""
-3	"fre"
-3	"Because the s and p bands in magnesium slightly overlap, and therefore the electrons in states closely below the Fermi level have states closely above it readily available, which is a requisite for metallic electron transport."
-3	"Electron conduct under an external excitation such as a bias voltage occurs if there exist vacant states closely above the Fermi level readily available to the electrons occupying the band states closely below the Fermi level.nnThis requisite is obviously satisfied in sodium, as the sband is halffilled and it is also satisfied in magnesium, for which the the s and p bands show a slight overlap in their upper and lower ends, and regardless of the fact that the sband is completely filled by electrons, theres still vacant states readily available for these electrons."
-3	""
-3	"firts Metals tend to be good electronic conductors. Second Sodium has a halffilled sband. Third Magnesium has a full sbandn"
-2	"Because sand pband acts as continous band therefore, the upper sband electrons have access to empty states pband lower states located very close to them. This access to very close empity states defines a metal."
-2	"Sodium has a half filled sband and so the electron can mobilize throughout the sband characteristic of metals.nMagnesium sband and pbands overlap meaning they are energetically similar and the electron from the sband can mobilize within the pband which is characteristic of a metal."
--- a/tests/simple_essay/answer.txt
+++ b/tests/simple_essay/answer.txt
-In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard and many other sample materials that they didn't use and would get different results. Also I would also need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.
--- a/tests/simple_essay/payload.json
+++ b/tests/simple_essay/payload.json
-{"grader":"tests/models/essay_set_1.p"}
--- a/tests/simple_essay/wrong.txt
+++ b/tests/simple_essay/wrong.txt
-this is an incorrect response
--- a/tests/test_cv_accuracy.py
+++ b/tests/test_cv_accuracy.py
-import os
-import sys
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
-sys.path.append(one_up_path)
-
-import util_functions
-import essay_set
-import feature_extractor
-import numpy
-
-from sklearn.ensemble import GradientBoostingClassifier
-
-# Cross-validation accuracy check: for several minimum text lengths, train a
-# gradient boosting classifier on the essays longer than that length and
-# print the mean absolute error and quadratic weighted kappa of its
-# cross-validated predictions.
-
-# Normalise base_path so that it always ends with "/".
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-# Relative name, so it is resolved against the current working directory.
-FILENAME="sa_data.tsv"
-
-# Read the rows once, up front: they do not depend on the length threshold,
-# and the with-block closes the handle instead of leaking one per iteration.
-with open(FILENAME) as sa_val:
-    lines=sa_val.readlines()
-
-# One entry per threshold, in loop order.
-all_err=[]
-all_kappa=[]
-
-for t_len in [0,50,100,200,300]:
-    scores=[]
-    texts=[]
-    eset=essay_set.EssaySet(type="train")
-    # The first line is skipped (presumably a header row -- confirm against
-    # the data file).
-    for line in lines[1:]:
-        # Rows are split on a tab followed by a double quote; nothing is
-        # stripped from the remainder, so len(text) counts every trailing
-        # character of the row as well.
-        score,text=line.split("\t\"")
-        if len(text)>t_len:
-            scores.append(int(score))
-            texts.append(text)
-            eset.add_essay(text,int(score))
-            #if int(score)==0:
-            #    eset.generate_additional_essays(text,int(score))
-    extractor=feature_extractor.FeatureExtractor()
-    extractor.initialize_dictionaries(eset)
-    train_feats=extractor.gen_feats(eset)
-    clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
-    cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
-    # Mean absolute difference between predicted and actual scores.
-    err=numpy.mean(numpy.abs(cv_preds-scores))
-    print err
-    kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-    print kappa
-    all_err.append(err)
-    all_kappa.append(kappa)
-
-    """
-    outfile=open("full_cvout.tsv",'w+')
-    outfile.write("cv_pred" + "\t" + "actual")
-    for i in xrange(0,len(cv_preds)):
-        outfile.write("{0}\t{1}".format(cv_preds[i],scores[i]))
-    """
-
-
-
--- a/tests/test_cv_full.py
+++ b/tests/test_cv_full.py
-import os
-import sys
-#base_path = os.path.dirname(__file__)
-base_path = "/home/vik/mitx_all/machine-learning"
-sys.path.append(base_path)
-
-one_up_path=os.path.abspath(os.path.join(base_path,'..'))
-sys.path.append(one_up_path)
-
-import util_functions
-import essay_set
-import feature_extractor
-import numpy
-import math
-from multiprocessing import Pool
-
-from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
-
-# Module-level configuration for the full cross-validation run.
-# Both directory names are normalised to end with "/" so that file names can
-# be appended by plain string concatenation.
-base_path = base_path if base_path.endswith("/") else base_path+"/"
-
-data_path = "/home/vik/mitx_all/vik_sandbox/hewlett_essay_data/split_data"
-data_path = data_path if data_path.endswith("/") else data_path+"/"
-# Input files are named 1.tsv through 18.tsv.
-filenames = ["{0}.tsv".format(i) for i in xrange(1,19)]
-
-# Unpacked by run_single_worker from its args tuple; selects which of its two
-# evaluation branches runs.
-run_cv = False
-
-def run_single_worker(args):
-    filename,data_path,run_cv = args
-    base_name = data_path + filename
-    print base_name
-    sa_val = file(base_name)
-    id_vals=[]
-    essay_set_nums=[]
-    score1s=[]
-    score2s=[]
-    texts=[]
-    lines=sa_val.readlines()
-    eset=essay_set.EssaySet(type="train")
-    for i in xrange(1,len(lines)):
-        id_val,essay_set_num,score1,score2,text=lines[i].split("\t")
-        score1s.append(int(score1))
-        score2s.append(int(score2))
-        texts.append(text)
-        essay_set_nums.append(essay_set_num)
-        id_vals.append(id_val)
-        eset.add_essay(text,int(score1))
-        #if int(score)==0:
-        #    eset.generate_additional_essays(text,int(score))
-    extractor=feature_extractor.FeatureExtractor()
-    extractor.initialize_dictionaries(eset)
-    train_feats=extractor.gen_feats(eset)
-    print(max(score1s))
-    if max(score1s)<=3:
-        clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
-    else:
-        clf=GradientBoostingRegressor(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)
-
-    if run_cv:
-        try:
-            cv_preds=util_functions.gen_cv_preds(clf,train_feats,score1s, num_chunks = 10) # int(math.floor(len(texts)/2)
-        except:
-            cv_preds = score1s
-    else:
-        try:
-            count_to_train_on = 100
-            random_nums = list(numpy.random.random_integers(0, train_feats.shape[0], count_to_train_on))
-            out_group_rows = [row for row in xrange(0,train_feats.shape[0]) if row not in random_nums]
-            in_group_scores = list(numpy.array(score1s)[random_nums])
-            out_group_scores = list(numpy.array(score1s)[out_group_rows])
-            out_group_score2s = list(numpy.array(score2s)[out_group_rows])
-            score1s = out_group_scores
-            score2s = out_group_score2s
-            model = util_functions.gen_model(clf,train_feats[random_nums,:],in_group_scores)
-            cv_preds = util_functions.gen_preds(model,train_feats[out_group_rows,:])
-        except:
-            print "Error with generating cv preds"
-            random_nums = list(numpy.random.random_integers(0, train_feats.shape[0], count_to_train_on))
-            out_group_rows = [row for row in xrange(0,train_feats.shape[0]) if row not in random_nums]
-            in_group_scores = list(numpy.array(score1s)[random_nums])
-            out_group_scores = list(numpy.array(score1s)[out_group_rows])
-            out_group_score2s = list(numpy.array(score2s)[out_group_rows])
-            score1s = out_group_scores
-            score2s = out_group_score2s
-            cv_preds = score1s
-
-    rounded_cv = [int(round(cv)) for cv in list(cv_preds)]
-    added_score1 = [s1+1 for s1 in score1s]
-    err=numpy.mean(numpy.abs(numpy.array(cv_preds)-score1s))
-    kappa=util_functions.quadratic_weighted_kappa(rounded_cv, score1s)
-    percent_error = numpy.mean(numpy.abs(score1s - numpy.array(cv_preds))/added_score1)
-    human_err=numpy.mean(numpy.abs(numpy.array(score2s)-score1s))
-    human_kappa=util_functions.quadratic_weighted_kappa(list(score2s),score1s)
-    human_percent_error = numpy.mean(numpy.abs(score1s - numpy.array(score2s))/added_score1)
-
-    outfile=open(data_path + "outdata/" + filename,'w+')
-    outfile.write("cv_pred" + "\t" + "actual1\t" + "actual2\n")
-    for i in xrange(0,len(cv_preds)):
-        outfile.write("{0}\t{1}\t{2}\n".format(str(cv_preds[i]),str(score1s[i]), str(score2s[i])))
-    outfile.close()
-
-    return err, kappa,percent_error,human_err,human_kappa,human_percent_error
-
-length = len(filenames)
-np=8
-p = Pool(processes=np)
-errs, kappas,percent_errors,human_errs,human_kappas,human_percent_errors = zip(*p.map(run_single_worker,[(filenames[i],data_path,run_cv) for i in xrange(0,length)]))
-
-outfile=open(data_path + "outdata/summary.tsv",'w+')
-outfile.write("set\terr\tkappa\tpercent_error\thuman_err\thuman_kappa\thuman_percent_error\n")
-for i in xrange(0,len(errs)):
-    outfile.write("{set}\t{err}\t{kappa}\t{percent_error}\t{human_err}\t{human_kappa}\t{human_percent_error}\n".format(
-        set=i+1,err=errs[i],kappa=kappas[i],percent_error=percent_errors[i], human_err=human_errs[i],
-        human_kappa=human_kappas[i], human_percent_error=human_percent_errors[i]))
-outfile.close()
-
-
-
-
-
-
--- a/tests/test_cv_single.py
+++ b/tests/test_cv_single.py
-import os
-import sys
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
-sys.path.append(one_up_path)
-
-import util_functions
-import essay_set
-import feature_extractor
-import numpy
-import math
-
-from sklearn.ensemble import GradientBoostingClassifier
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-filenames = ['LSQ_W09_60_MLT.tsv',
-             'LSQ_W10_22_a.tsv',
-             'LSQ_W11_21_MLT.tsv',
-             ]
-
-for filename in filenames:
-    base_name = base_path + filename
-    print base_name
-    sa_val = file(base_name)
-    scores=[]
-    texts=[]
-    lines=sa_val.readlines()
-    eset=essay_set.EssaySet(type="train")
-    for i in xrange(1,len(lines)):
-        score,text=lines[i].split("\t\"")
-        scores.append(int(score))
-        texts.append(text)
-        eset.add_essay(text,int(score))
-        #if int(score)==0:
-        #    eset.generate_additional_essays(text,int(score))
-    extractor=feature_extractor.FeatureExtractor()
-    extractor.initialize_dictionaries(eset)
-    train_feats=extractor.gen_feats(eset)
-    clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
-    cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores, num_chunks = int(math.floor(len(texts)/2)))
-    err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
-    print err
-    kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-    print kappa
-
-    outfile=open(filename + "_cvout.tsv",'w+')
-    outfile.write("cv_pred" + "\t" + "actual\n")
-    for i in xrange(0,len(cv_preds)):
-        outfile.write("{0}\t{1}\n".format(str(cv_preds[i]),str(scores[i])))
-    outfile.close()
\ No newline at end of file
--- a/tests/test_generic_ml.py
+++ b/tests/test_generic_ml.py
-import os
-import sys
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-one_up_path=os.path.abspath(os.path.join(base_path,'..'))
-sys.path.append(one_up_path)
-
-import util_functions
-import predictor_set
-import predictor_extractor
-import numpy
-
-from sklearn.ensemble import GradientBoostingClassifier
-
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-FILENAME="sa_data.tsv"
-
-
-sa_val = file(FILENAME)
-scores=[]
-texts=[]
-lines=sa_val.readlines()
-pset = predictor_set.PredictorSet(type="train")
-for i in xrange(1,len(lines)):
-    score,text=lines[i].split("\t\"")
-    if len(text)>t_len:
-        scores.append(int(score))
-        texts.append(text)
-        pset.add_row([1],[text],int(score))
-extractor=predictor_extractor.PredictorExtractor()
-extractor.initialize_dictionaries(pset)
-train_feats=extractor.gen_feats(pset)
-
-clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
-cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
-err=numpy.mean(numpy.abs(cv_preds-scores))
-print err
-kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
-print kappa
\ No newline at end of file
--- a/tests/test_graders.py
+++ b/tests/test_graders.py
-#!/usr/bin/env python
-"""
-Send some test programs to an xserver.
-
-For the directory containing this file and every directory below it, send the
-contents of payload.json together with each of the answer*.txt, right*.py and
-wrong*.txt files.
-"""
-
-import argparse
-import glob
-import json
-import os
-import os.path
-from path import path
-import requests
-import sys
-import time
-
-# Base URL of the xserver instance that grading requests are posted to.
-xserver = 'http://127.0.0.1:3031/'
-
-def send(payload, answer):
-    """
-    Send a grading request to the xserver
-    """
-
-    body = {'grader_payload': payload,
-            'student_response': answer}
-
-    data = {'xqueue_body': json.dumps(body),
-            'xqueue_files': ''}
-
-    start = time.time()
-    r = requests.post(xserver, data=json.dumps(data))
-    end = time.time()
-    print "Request took %.03f sec" % (end - start)
-
-    if r.status_code != requests.codes.ok:
-        print "Request error:{0},{1},{2}".format(r.headers,payload,answer)
-
-    parsed_text=json.loads(r.text)
-    print("\nAnswer: {0}\nScore: {1} Correct: {2} \nFeedback: {3}"
-          .format(answer,parsed_text['score'],parsed_text['correct'],
-          parsed_text['feedback']))
-    #print "Score:{0} {1}".format(parsed_text['score'],parsed_text['correct'])
-    return r.text
-
-
-def check_contains(string, substr):
-    if not substr in string:
-        print "ERROR: Expected to be {0}".format(substr)
-        return False
-    else:
-        return True
-
-def check_not_contains(string, substr):
-    if substr in string:
-        print "ERROR: Expected to be {0}".format(substr)
-        return False
-    else:
-        return True
-
-def check_right(string):
-    """
-    Returns True if the response text contains the marker for a correct answer.
-    """
-    return check_contains(string, '\"correct\": true')
-
-def check_wrong(string):
-    """
-    Returns True if the response text contains the marker for an incorrect answer.
-    """
-    return check_contains(string, '\"correct\": false')
-
-def globs(dirname, *patterns):
-    """
-    Produce a sequence of all the files matching any of our patterns in dirname.
-    """
-    for pat in patterns:
-        for fname in glob.glob(os.path.join(dirname, pat)):
-            yield fname
-
-def contents(fname):
-    """
-    Return the contents of the file `fname`.
-    """
-    with open(fname) as f:
-        return f.read()
-
-def check(dirname,type):
-    """
-    Look for payload.json, answer*.py, right*.py, wrong*.py, run tests.
-    """
-    payload_file = os.path.join(dirname, 'payload.json')
-    if os.path.isfile(payload_file):
-        payload = contents(payload_file)
-        print("found payload: " + payload)
-    else:
-        graders = list(globs(dirname, 'grade*.py'))
-        if not graders:
-            #print "No payload.json or grade*.py in {0}".format(dirname)
-            return
-        if len(graders) > 1:
-            print "More than one grader in {0}".format(dirname)
-            return
-        payload = json.dumps({'grader': os.path.abspath(graders[0])})
-
-    for name in globs(dirname, 'answer*.txt', 'right*.py'):
-        #print "Checking correct response from {0}".format(name)
-        answer = contents(name)
-        right=check_right(send(payload, answer))
-
-    for name in globs(dirname, 'wrong*.txt'):
-        #print "Checking wrong response from {0}".format(name)
-        answer = contents(name)
-        wrong=check_wrong(send(payload, answer))
-    if(type=="test"):
-        assert wrong and right
-
-def main(argv):
-    global xserver
-
-    #parser = argparse.ArgumentParser(description="Send dummy requests to a qserver")
-    #parser.add_argument('server')
-    #parser.add_argument('root', nargs='?')
-
-    #args = parser.parse_args(argv)
-
-    #xserver = args.server
-    if not xserver.endswith('/'):
-        xserver += '/'
-
-    #root = args.root or '.'
-    root=os.path.dirname( os.path.abspath(__file__ ))
-    for dirpath, _, _ in os.walk(root):
-        print("checking" + dirpath)
-        check(dirpath,"normal")
-
-if __name__=="__main__":
-    main(sys.argv[1:])
-
-def test_graders():
-    root=os.path.dirname( os.path.abspath(__file__ ))
-    for dirpath, _, _ in os.walk(root):
-        print("checking" + dirpath)
-        yield check, dirpath, "test"
-
-def test_model_creation():
-    # Only computes the parent directory of this file; the value is not used
-    # and nothing is asserted.
-    model_creator_dir=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
-
--- a/tests/test_models.py
+++ b/tests/test_models.py
-# Run with arguments train_file prompt_file model_path to generate a sample model file
-
-import os
-import sys
-import argparse
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-
-one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
-sys.path.append(one_up_path)
-
-import model_creator
-
-
-def main(argv):
-    parser = argparse.ArgumentParser(description="Generate model from test data files")
-    parser.add_argument('train_file')
-    parser.add_argument('prompt_file')
-    parser.add_argument('model_path')
-
-    args = parser.parse_args(argv)
-
-    score, text = model_creator.read_in_test_data(args.train_file)
-    prompt_string = model_creator.read_in_test_prompt(args.prompt_file)
-    print("data read")
-    e_set = model_creator.create_essay_set(text, score, prompt_string)
-    print("essay set created")
-    feature_ext, classifier = model_creator.extract_features_and_generate_model(e_set)
-    print("features pulled out and model generated")
-    model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, text, score, args.model_path)
-    print("model file written")
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
-
-
-def test_model_creation():
-    try:
-        score, text = model_creator.read_in_test_data("train.tsv")
-        prompt_string = model_creator.read_in_test_prompt("prompt.txt")
-        e_set = model_creator.create_essay_set(text, score, prompt_string)
-        feature_ext, classifier = model_creator.extract_features_and_generate_model(e_set)
-        model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, args.model_path)
-        assert True
-    except:
-        assert False
--- a/tests/train.tsv
+++ b/tests/train.tsv
--- a/util_functions.py
+++ b/util_functions.py
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
-import logging
-import math
-import os
-import pickle
-import re
-import subprocess
-import sys
-from collections import Counter
-from itertools import chain
-
-import nltk
-import numpy
-from sklearn.feature_extraction.text import CountVectorizer
-
-from external_code.fisher import fisher
-
-aspell_path = "aspell"
-
-log=logging.getLogger(__name__)
-
-base_path = os.path.dirname(__file__)
-sys.path.append(base_path)
-if not base_path.endswith("/"):
-    base_path=base_path+"/"
-
-ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
-ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
-
-class AlgorithmTypes(object):
-    # String constants identifying the type of algorithm.
-    regression = "regression"
-    # NOTE(review): the value is spelled "classifiction".  It is left untouched
-    # because stored data or other modules may compare against this exact
-    # string; confirm before correcting it.
-    classification = "classifiction"
-
-def create_model_path(model_path):
-    if not model_path.startswith("/") and not model_path.startswith("models/"):
-        model_path="/" + model_path
-    if not model_path.startswith("models"):
-        model_path = "models" + model_path
-    if not model_path.endswith(".p"):
-        model_path+=".p"
-
-    return model_path
-
-def sub_chars(string):
-    """
-    Strips illegal characters from a string.  Used to sanitize input essays.
-    Removes all non-punctuation, digit, or letter characters.
-    Returns sanitized string.
-    """
-    sub_pat = r"[^A-Za-z\.\?!,';:]"
-    char_pat = r"\."
-    com_pat = r","
-    ques_pat = r"\?"
-    excl_pat = r"!"
-    sem_pat = r";"
-    col_pat = r":"
-
-    whitespace_pat = r"\s{1,}"
-    whitespace_comp = re.compile(whitespace_pat)
-    sub_comp = re.compile(sub_pat)
-    char_comp = re.compile(char_pat)
-    com_comp = re.compile(com_pat)
-    ques_comp = re.compile(ques_pat)
-    excl_comp = re.compile(excl_pat)
-    sem_comp = re.compile(sem_pat)
-    col_comp = re.compile(col_pat)
-
-    nstring = sub_comp.sub(" ", string)
-    nstring = char_comp.sub(" .", nstring)
-    nstring = com_comp.sub(" ,", nstring)
-    nstring = ques_comp.sub(" ?", nstring)
-    nstring = excl_comp.sub(" !", nstring)
-    nstring = sem_comp.sub(" ;", nstring)
-    nstring = col_comp.sub(" :", nstring)
-
-    nstring = whitespace_comp.sub(" ", nstring)
-    return nstring
-
-
-def spell_correct(string):
-    """
-    Uses aspell to spell correct an input string.
-    Requires aspell to be installed and added to the path.
-    Returns the spell corrected string if aspell is found, original string if not.
-    """
-    f = open('tmpfile', 'w')
-    f.write(string)
-    f_path = os.path.abspath(f.name)
-    f.close()
-    try:
-        p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
-    except:
-        log.exception("Could not find aspell, so could not spell correct!")
-        return string,0, string
-    incorrect = p.readlines()
-    p.close()
-    incorrect_words = list()
-    correct_spelling = list()
-    for i in range(1, len(incorrect)):
-        if(len(incorrect[i]) > 10):
-            match = re.search(":", incorrect[i])
-            if hasattr(match, "start"):
-                begstring = incorrect[i][2:match.start()]
-                begmatch = re.search(" ", begstring)
-                begword = begstring[0:begmatch.start()]
-
-                sugstring = incorrect[i][match.start() + 2:]
-                sugmatch = re.search(",", sugstring)
-                if hasattr(sugmatch, "start"):
-                    sug = sugstring[0:sugmatch.start()]
-
-                    incorrect_words.append(begword)
-                    correct_spelling.append(sug)
-    newstring = string
-    markup_string = string
-    already_subbed=[]
-    for i in range(0, len(incorrect_words)):
-        sub_pat = r"\b" + incorrect_words[i] + r"\b"
-        sub_comp = re.compile(sub_pat)
-        newstring = re.sub(sub_comp, correct_spelling[i], newstring)
-        if incorrect_words[i] not in already_subbed:
-            markup_string=re.sub(sub_comp,'<bs>' + incorrect_words[i] + "</bs>", markup_string)
-            already_subbed.append(incorrect_words[i])
-
-    return newstring,len(incorrect_words),markup_string
-
-
-def ngrams(tokens, min_n, max_n):
-    """
-    Generates ngrams(word sequences of fixed length) from an input token sequence.
-    tokens is a list of words.
-    min_n is the minimum length of an ngram to return.
-    max_n is the maximum length of an ngram to return.
-    returns a list of ngrams (words separated by a space)
-    """
-    all_ngrams = list()
-    n_tokens = len(tokens)
-    for i in xrange(n_tokens):
-        for j in xrange(i + min_n, min(n_tokens, i + max_n) + 1):
-            all_ngrams.append(" ".join(tokens[i:j]))
-    return all_ngrams
-
-
-def f7(seq):
-    """
-    Makes a list unique
-    """
-    seen = set()
-    seen_add = seen.add
-    return [x for x in seq if x not in seen and not seen_add(x)]
-
-
-def count_list(the_list):
-    """
-    Generates a count of the number of times each unique item appears in a list
-    """
-    count = the_list.count
-    result = [(item, count(item)) for item in set(the_list)]
-    result.sort()
-    return result
-
-
-def regenerate_good_tokens(string):
-    """
-    Given an input string, part of speech tags the string, then generates a list of
-    ngrams that appear in the string.
-    Used to define grammatically correct part of speech tag sequences.
-    Returns a list of part of speech tag sequences.
-    """
-    toks = nltk.word_tokenize(string)
-    pos_string = nltk.pos_tag(toks)
-    pos_seq = [tag[1] for tag in pos_string]
-    pos_ngrams = ngrams(pos_seq, 2, 4)
-    sel_pos_ngrams = f7(pos_ngrams)
-    return sel_pos_ngrams
-
-
-def get_vocab(text, score, max_feats=750, max_feats2=200):
-    """
-    Uses a fisher test to find words that are significant in that they separate
-    high scoring essays from low scoring essays.
-    text is a list of input essays.
-    score is a list of scores, with score[n] corresponding to text[n]
-    max_feats is the maximum number of features to consider in the first pass
-    max_feats2 is the maximum number of features to consider in the second (final) pass
-    Returns a list of words that constitute the significant vocabulary
-    """
-    dict = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
-    dict_mat = dict.fit_transform(text)
-    set_score = numpy.asarray(score, dtype=numpy.int)
-    med_score = numpy.median(set_score)
-    new_score = set_score
-    if(med_score == 0):
-        med_score = 1
-    new_score[set_score < med_score] = 0
-    new_score[set_score >= med_score] = 1
-
-    fish_vals = []
-    for col_num in range(0, dict_mat.shape[1]):
-        loop_vec = dict_mat.getcol(col_num).toarray()
-        good_loop_vec = loop_vec[new_score == 1]
-        bad_loop_vec = loop_vec[new_score == 0]
-        good_loop_present = len(good_loop_vec[good_loop_vec > 0])
-        good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
-        bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
-        bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
-        fish_val = fisher.FishersExactTest.probability_of_table(
-            [[good_loop_present, bad_loop_present], [good_loop_missing, bad_loop_missing]])
-        fish_vals.append(fish_val)
-
-    cutoff = 1
-    if(len(fish_vals) > max_feats2):
-        cutoff = sorted(fish_vals)[max_feats2]
-    good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])
-
-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
-
-    return vocab
-
-
-def edit_distance(s1, s2):
-    """
-    Calculates string edit distance between string 1 and string 2.
-    Deletion, insertion, substitution, and transposition all increase edit distance.
-    """
-    d = {}
-    lenstr1 = len(s1)
-    lenstr2 = len(s2)
-    for i in xrange(-1, lenstr1 + 1):
-        d[(i, -1)] = i + 1
-    for j in xrange(-1, lenstr2 + 1):
-        d[(-1, j)] = j + 1
-
-    for i in xrange(lenstr1):
-        for j in xrange(lenstr2):
-            if s1[i] == s2[j]:
-                cost = 0
-            else:
-                cost = 1
-            d[(i, j)] = min(
-                d[(i - 1, j)] + 1, # deletion
-                d[(i, j - 1)] + 1, # insertion
-                d[(i - 1, j - 1)] + cost, # substitution
-            )
-            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
-                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
-
-    return d[lenstr1 - 1, lenstr2 - 1]
-
-
-class Error(Exception):
-    # Base class for the exceptions defined in this module.
-    pass
-
-
-class InputError(Error):
-    # Stores an expression (expr) and a message (msg) on the exception instance.
-    def __init__(self, expr, msg):
-        self.expr = expr
-        self.msg = msg
-
-
-def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
-    """
-    Generates cross validated predictions using an input classifier and data.
-    clf is a classifier that implements that implements the fit and predict methods.
-    arr is the input data array (X)
-    sel_score is the target list (y).  y[n] corresponds to X[n,:]
-    num_chunks is the number of cross validation folds to use
-    Returns an array of the predictions where prediction[n] corresponds to X[n,:]
-    """
-    cv_len = int(math.floor(len(sel_score) / num_chunks))
-    chunks = []
-    for i in range(0, num_chunks):
-        range_min = i * cv_len
-        range_max = ((i + 1) * cv_len)
-        if i == num_chunks - 1:
-            range_max = len(sel_score)
-        chunks.append(range(range_min, range_max))
-    preds = []
-    set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    chunk_vec = numpy.asarray(range(0, len(chunks)))
-    for i in xrange(0, len(chunks)):
-        loop_inds = list(
-            chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
-        sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
-        preds.append(list(sim_fit.predict(arr[chunks[i]])))
-    all_preds = list(chain(*preds))
-    return(all_preds)
-
-
-def gen_model(clf, arr, sel_score):
-    """
-    Fits a classifier to data and a target score
-    clf is an input classifier that implements the fit method.
-    arr is a data array(X)
-    sel_score is the target list (y) where y[n] corresponds to X[n,:]
-    sim_fit is not a useful return value.  Instead the clf is the useful output.
-    """
-    set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    sim_fit = clf.fit(arr, set_score)
-    return(sim_fit)
-
-
-def gen_preds(clf, arr):
-    """
-    Generates predictions on a novel data array using a fit classifier
-    clf is a classifier that has already been fit
-    arr is a data array identical in dimension to the array clf was trained on
-    Returns the array of predictions.
-    """
-    if(hasattr(clf, "predict_proba")):
-        ret = clf.predict(arr)
-        # pred_score=preds.argmax(1)+min(x._score)
-    else:
-        ret = clf.predict(arr)
-    return ret
-
-
-def calc_list_average(l):
-    """
-    Calculates the average value of a list of numbers
-    Returns a float
-    """
-    total = 0.0
-    for value in l:
-        total += value
-    return total / len(l)
-
-stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
-
-def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
-    """
-    Calculates kappa correlation between rater_a and rater_b.
-    Kappa measures how well 2 quantities vary together.
-    rater_a is a list of rater a scores
-    rater_b is a list of rater b scores
-    min_rating is an optional argument describing the minimum rating possible on the data set
-    max_rating is an optional argument describing the maximum rating possible on the data set
-    Returns a float corresponding to the kappa correlation
-    """
-    assert(len(rater_a) == len(rater_b))
-    if min_rating is None:
-        min_rating = min(rater_a + rater_b)
-    if max_rating is None:
-        max_rating = max(rater_a + rater_b)
-    conf_mat = confusion_matrix(rater_a, rater_b,
-        min_rating, max_rating)
-    num_ratings = len(conf_mat)
-    num_scored_items = float(len(rater_a))
-
-    hist_rater_a = histogram(rater_a, min_rating, max_rating)
-    hist_rater_b = histogram(rater_b, min_rating, max_rating)
-
-    numerator = 0.0
-    denominator = 0.0
-
-    if(num_ratings > 1):
-        for i in range(num_ratings):
-            for j in range(num_ratings):
-                expected_count = (hist_rater_a[i] * hist_rater_b[j]
-                                  / num_scored_items)
-                d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
-                numerator += d * conf_mat[i][j] / num_scored_items
-                denominator += d * expected_count / num_scored_items
-
-        return 1.0 - numerator / denominator
-    else:
-        return 1.0
-
-
-def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
-    """
-    Generates a confusion matrix between rater_a and rater_b
-    A confusion matrix shows how often 2 values agree and disagree
-    See quadratic_weighted_kappa for argument descriptions
-    """
-    assert(len(rater_a) == len(rater_b))
-    if min_rating is None:
-        min_rating = min(rater_a)
-    if max_rating is None:
-        max_rating = max(rater_a)
-    num_ratings = int(max_rating - min_rating + 1)
-    conf_mat = [[0 for i in range(num_ratings)]
-                for j in range(num_ratings)]
-    for a, b in zip(rater_a, rater_b):
-        conf_mat[a - min_rating][b - min_rating] += 1
-    return conf_mat
-
-
-def histogram(ratings, min_rating=None, max_rating=None):
-    """
-    Generates a frequency count of each rating on the scale
-    ratings is a list of scores
-    Returns a list of frequencies
-    """
-    if min_rating is None:
-        min_rating = min(ratings)
-    if max_rating is None:
-        max_rating = max(ratings)
-    num_ratings = int(max_rating - min_rating + 1)
-    hist_ratings = [0 for x in range(num_ratings)]
-    for r in ratings:
-        hist_ratings[r - min_rating] += 1
-    return hist_ratings
-
-
-def get_wordnet_syns(word):
-    """
-    Utilize wordnet (installed with nltk) to get synonyms for words
-    word is the input word
-    returns a list of unique synonyms
-    """
-    synonyms = []
-    regex = r"_"
-    pat = re.compile(regex)
-    synset = nltk.wordnet.wordnet.synsets(word)
-    for ss in synset:
-        for swords in ss.lemma_names:
-            synonyms.append(pat.sub(" ", swords.lower()))
-    synonyms = f7(synonyms)
-    return synonyms
-
-
-def get_separator_words(toks1):
-    """
-    Finds the words that separate a list of tokens from a background corpus
-    Basically this generates a list of informative/interesting words in a set
-    toks1 is a list of words
-    Returns a list of separator words
-    """
-    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
-    if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
-        toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
-    else:
-        essay_corpus = open(ESSAY_CORPUS_PATH).read()
-        essay_corpus = sub_chars(essay_corpus)
-        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
-        pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
-    sep_words = []
-    for word in tab_toks1.keys():
-        tok1_present = tab_toks1[word]
-        if(tok1_present > 2):
-            tok1_total = tab_toks1._N
-            tok2_present = toks2[word]
-            tok2_total = toks2._N
-            fish_val = fisher.FishersExactTest.probability_of_table(
-                [[tok1_present, tok2_present], [tok1_total, tok2_total]])
-            if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
-                sep_words.append(word)
-    sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
-    return sep_words
-
-
-def encode_plus(s):
-    """
-    Literally encodes the plus sign
-    input is a string
-    returns the string with plus signs encoded
-    """
-    regex = r"\+"
-    pat = re.compile(regex)
-    return pat.sub("%2B", s)
-
-
-def getMedian(numericValues):
-    """
-    Gets the median of a list of values
-    Returns a float/int
-    """
-    theValues = sorted(numericValues)
-
-    if len(theValues) % 2 == 1:
-        return theValues[(len(theValues) + 1) / 2 - 1]
-    else:
-        lower = theValues[len(theValues) / 2 - 1]
-        upper = theValues[len(theValues) / 2]
-
-        return (float(lower + upper)) / 2 
\ No newline at end of file