Commit 4286f7e9 by Vik Paruchuri

Add files back in

parent ade012fc
.idea/
__pycache__/
models/
*.pyc
*~
tests/
Project to integrate machine-learning-based essay scoring with xserver. Aspell must be installed and added to the PATH to run. numpy, scipy, sklearn, and nltk also need to be installed.
NLTK also requires the maxent treebank tagger and wordnet corpora. These can be installed through the NLTK downloader (`nltk.download()`), or programmatically via `python -m nltk.downloader maxent_treebank_pos_tagger wordnet`.
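The programmatic route can also be run from inside Python with the standard NLTK downloader API (a minimal sketch; the two package ids match the command above):

import nltk
# Fetch the POS tagger model and the wordnet corpus used for feature extraction.
nltk.download('maxent_treebank_pos_tagger')
nltk.download('wordnet')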
Runnable files:
1. create_test_models.py
Generates test models when run like: `python create_test_models.py train_file prompt_file model_path`. Use `python create_test_models.py train.tsv prompt.txt models/essay_set_1.p` to generate a model using the sample data.
2. test_server_code/pyxserver_wsgi.py
Starts a server instance to which answers can be sent for scoring. Calls grade.py to score responses. Run the server with `gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application`.
3. tests/test.py
Submits test data found in directories within the tests folder to the xserver and displays results. See tests/simple_essay for an example of how to format files. You need payload.json, wrong.txt, and answer.txt to make a test.
Testing:
Run `nosetests` from the tests directory. Make sure the test server is running first!
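For a quick manual check without the test harness, a submission can be POSTed to the running server directly. The sketch below is hedged: the endpoint path and the payload field names are assumptions based on the general xserver convention, so consult tests/simple_essay/payload.json for the authoritative format.

import json
import urllib2

# Hypothetical payload shape -- adjust to match tests/simple_essay/payload.json.
body = json.dumps({
    "grader_payload": open("tests/simple_essay/payload.json").read(),
    "student_response": open("tests/simple_essay/answer.txt").read(),
})
req = urllib2.Request("http://127.0.0.1:3031/", body, {"Content-Type": "application/json"})
print urllib2.urlopen(req).read()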
import os
import sys
import logging
log = logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path):
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
return results
try:
e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        return results
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
return results
try:
pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        return results
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
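# Minimal usage sketch for this module (toy data; a real model needs many
# scored essays, and the paths mirror the README example, not fixed values).
if __name__ == "__main__":
    demo_text = ["The first sample essay about the topic.",
                 "A second, slightly different sample essay."]
    demo_scores = [0, 1]
    demo_results = create(demo_text, demo_scores, "Write about the topic.", "models/demo.p")
    print demo_results['success'], demo_results['errors']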
Old steps (deprecated):
source /opt/edx/bin/activate
sudo apt-get upgrade gcc
sudo pip install numpy
#in ml dir
sudo pip install -r requirements.txt
#in controller dir
sudo pip install -r requirements.txt
mkdir log in both repos
mkdir /opt/wwc/log
touch /opt/wwc/log/edx.log
sudo su makeitso
django-admin syncdb
django-admin migrate
sudo su makeitso
bash
source /opt/edx/bin/activate
cd /opt/wwc/grading-controller
pip install numpy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
sudo apt-get install aspell
python /opt/wwc/grading-controller/manage.py update_users
Copy auth.json and env.json into the grading-controller folder (no idea why this is needed, but django-admin doesn't find them at ENV_ROOT)
chown grading-controller/edx.log to makeitso
make db file (for sqlite)
chown db file to makeitso
For some reason, have to copy auth.json and env.json to both opt/wwc/grading-controller and opt/wwc/ dirs
sudo apt-get install super
pip install MySQL-python
In upstart files, be sure to specify settings file and full python path! (/opt/wwc/grading-controller)
How to run:
/opt/edx/bin/gunicorn --preload -b 127.0.0.1:8000 -w 4 --timeout=300 --pythonpath=/opt/wwc/grading-controller grading_controller.wsgi
Upstart tasks:
grader
ml_grader
ml_creator
pull_from_xqueue
expire_old
"DATABASES": {
"default": {
"ENGINE": "django.db.backends.mysql",
"NAME": "sandbox_grader",
"USERNAME": "sandbox_grader",
"PORT": "3306",
"PASSWORD": "faarg16ren",
"HOST": "sandbox.rds.edx.org"
}
},
sudo apt-get update
sudo apt-get upgrade gcc
sudo xargs -a apt-packages.txt apt-get install
sudo pip install virtualenv
sudo mkdir /opt/edx
source /opt/edx/bin/activate
cd /opt/wwc/machine-learning
pip install numpy
pip install scipy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
"""
Defines an essay set object, which encapsulates essays from training and test sets.
Performs spell and grammar checking, tokenization, and stemming.
"""
import numpy
import nltk
import sys
import random
import os
import logging
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object):
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
self._type = type
self._score=[]
self._text=[]
self._id=[]
self._clean_text=[]
self._tokens=[]
self._pos=[]
self._clean_stem_text=[]
self._generated = []
self._prompt = ""
self._spelling_errors=[]
self._markup_text=[]
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
Add new (essay_text,essay_score) pair to the essay set.
essay_text must be a string.
essay_score must be an int.
essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if(len(self._id) > 0):
max_id = max(self._id)
else:
max_id = 0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
essay_text=essay_text.encode('ascii', 'ignore')
if len(essay_text)<5:
essay_text="Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
#Try conversion of types
essay_score=int(essay_score)
essay_text=str(essay_text)
except:
#Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
            # Clean text by removing non digit/word/punctuation characters
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
# Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
# Part of speech tag text
self._pos.append(nltk.pos_tag(self._clean_text[len(self._clean_text) - 1].split(" ")))
self._generated.append(essay_generated)
# Stem spell corrected text
porter = nltk.PorterStemmer()
por_toks = " ".join([porter.stem(w) for w in self._tokens[len(self._tokens) - 1]])
self._clean_stem_text.append(por_toks)
ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score)
else:
raise util_functions.InputError(essay_text, "arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int.")
def update_prompt(self, prompt_text):
"""
Update the default prompt string, which is "".
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(type(prompt_text) == type("text")):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret
def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
dict is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
"""
random.seed(1)
e_toks = nltk.word_tokenize(e_text)
all_syns = []
for word in e_toks:
synonyms = util_functions.get_wordnet_syns(word)
if(len(synonyms) > max_syns):
synonyms = random.sample(synonyms, max_syns)
all_syns.append(synonyms)
new_essays = []
for i in range(0, max_syns):
            syn_toks = list(e_toks)  # copy so substitutions do not accumulate across generated essays
for z in range(0, len(e_toks)):
                if len(all_syns[z]) > i and (dict is None or e_toks[z] in dict):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
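# Minimal usage sketch for this module (toy data; aspell must be on the PATH
# for spell correction, as noted in the README).
if __name__ == "__main__":
    demo_set = EssaySet(type="train")
    demo_set.add_essay("This is a short example essay about dogs.", 0)
    demo_set.add_essay("A slightly longer example essay about cats and dogs.", 1)
    demo_set.update_prompt("Write about pets.")
    # Synthesize extra essays at the lowest score point to pad training data.
    demo_set.generate_additional_essays(demo_set._clean_text[0], 0)
    print len(demo_set._text), "essays in the set"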
__author__ = 'vik'
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#! /usr/bin/env python
##############################################################################
# Following functions have been taken from the DendroPy library from:
##
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.txt" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
this returns the probability of observing `x` items of class M when sampling
`k` times without replacement from the entire population (i.e., {M,N})
p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
"""
# following fails with 'OverflowError: long int too large to convert to
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
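# Worked check (hand arithmetic): with m=2, n=3, k=2 and x=1,
#   p = choose(2,1) * choose(3,1) / choose(5,2) = 2 * 3 / 10 = 0.6,
# so hypergeometric_pmf(1, 2, 3, 2) should return 0.6 (up to float rounding).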
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
assert s <= population
assert population > -1
if s == population:
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this calculates the sum of the probability of this table and all others
more extreme under the null hypothesis that there is no association between
the categories represented by the vertical and horizontal axes.
"""
def probability_of_table(table):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this returns the probability of this table under the null hypothesis of
no association between rows and columns, which was shown by Fisher to be
a hypergeometric distribution:
p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
"""
a = table[0][0]
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
self.table = table
self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
self.min_value = min(self.flat_table)
self.max_value = max(self.flat_table)
def _rotate_cw(self, table):
"""
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
def _min_rotation(self):
"""
Returns copy of self.table such that the smallest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.min_value:
table = self._rotate_cw(table)
return table
def _max_rotation(self):
"""
Returns copy of self.table such that the largest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.max_value:
table = self._rotate_cw(table)
return table
def _sum_left_tail(self):
# left_tail_tables = self._get_left_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
p_vals = self._get_left_tail_probs()
return sum(p_vals)
def _sum_right_tail(self):
# right_tail_tables = self._get_right_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
p_vals = self._get_right_tail_probs()
return sum(p_vals)
def _get_left_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_right_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_left_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
left_tail_tables = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
left_tail_tables.append([list(table[0]), list(table[1])])
return left_tail_tables
def _get_right_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
right_tail_tables = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
right_tail_tables.append([list(table[0]), list(table[1])])
return right_tail_tables
def left_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_left_tail()
def right_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_right_tail()
def two_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
p0 = self.probability_of_table(self.table)
all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
p_vals = []
for p in all_p_vals:
if p <= p0:
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
if __name__ == "__main__":
table = [[12, 5], [29, 2]]
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
#Grader called by pyxserver_wsgi.py
#Loads a grader file, which is a dict containing the prompt of the question,
#a feature extractor object, and a trained model.
#Extracts features and runs trained model on the submission to produce a final score.
#Correctness determined by ratio of score to max possible score.
#Requires aspell to be installed and added to the path.
import sys
import pickle
import os
import numpy
import logging
from statsd import statsd
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission):
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=EssaySet(type="test")
#Try to add essays to essay set object
try:
grader_set.add_essay(str(submission),0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
results['score']=int(grader_data['model'].predict(grader_feats)[0])
    except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
results['success']=True
#Generate short form output--number of problem areas identified in feedback
problem_areas=0
for tag in feedback:
            if tag in ['topicality', 'prompt_overlap', 'spelling', 'grammar']:
problem_areas+=len(feedback[tag])>5
#Add feedback to results
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
if results['score']/float(max_score)<.33:
results['feedback'].update(
{'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
})
else:
#If error, success is False.
results['success']=False
#Count number of successful/unsuccessful gradings
statsd.increment("open_ended_assessment.machine_learning.grader_count",
tags=["success:{0}".format(results['success'])])
return results
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=predictor_set.PredictorSet(type="test")
#Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
results['score']=grader_data['model'].predict(grader_feats)[0]
    except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
else:
raw_confidence = grader_data['model'].predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
results['confidence'] = confidence
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
#Count number of successful/unsuccessful gradings
statsd.increment("open_ended_assessment.machine_learning.grader_count",
tags=["success:{0}".format(results['success'])])
if not has_error:
results['success'] = True
return results
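# Minimal usage sketch: load a previously pickled model file and grade one
# submission (the model path is an assumption; see the README's
# create_test_models.py example for how such a file is produced).
if __name__ == "__main__":
    demo_grader_data = pickle.load(open("models/essay_set_1.p"))
    print grade(demo_grader_data, None, "The student response to score goes here.")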
python-pip
python-scipy
python-mysqldb
ipython
nginx
git
redis-server
libmysqlclient-dev
gfortran
libblas3gf
libblas-dev
liblapack3gf
liblapack-dev
libatlas-base-dev
libxml2-dev
libxslt1-dev
libreadline6
libreadline6-dev
build-essential
curl
aspell
python
#!/usr/bin/env bash
# posix compliant sanity check
if [ -z $BASH ] || [ $BASH = "/bin/sh" ]; then
echo "Please use the bash interpreter to run this script"
exit 1
fi
error() {
printf '\E[31m'; echo "$@"; printf '\E[0m'
}
output() {
printf '\E[36m'; echo "$@"; printf '\E[0m'
}
### START
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
BREW_FILE=$DIR/"brew-formulas.txt"
APT_PKGS_FILE=$DIR/"apt-packages.txt"
case `uname -s` in
[Ll]inux)
command -v lsb_release &>/dev/null || {
error "Please install lsb-release."
exit 1
}
distro=`lsb_release -cs`
case $distro in
maya|lisa|natty|oneiric|precise|quantal)
output "Installing Ubuntu requirements"
# DEBIAN_FRONTEND=noninteractive is required for silent mysql-server installation
export DEBIAN_FRONTEND=noninteractive
# install packages listed in APT_PKGS_FILE
cat $APT_PKGS_FILE | xargs sudo apt-get -y install
;;
*)
error "Unsupported distribution - $distro"
exit 1
;;
esac
;;
Darwin)
if [[ ! -w /usr/local ]]; then
cat<<EO
You need to be able to write to /usr/local for
the installation of brew and brew packages.
Either make sure the group you are in (most likely 'staff')
can write to that directory or simply execute the following
and re-run the script:
$ sudo chown -R $USER /usr/local
EO
exit 1
fi
output "Installing OSX requirements"
if [[ ! -r $BREW_FILE ]]; then
error "$BREW_FILE does not exist, needed to install brew"
exit 1
fi
# brew errors if the package is already installed
for pkg in $(cat $BREW_FILE); do
grep $pkg <(brew list) &>/dev/null || {
output "Installing $pkg"
brew install $pkg
}
done
# paths where brew likes to install python scripts
PATH=/usr/local/share/python:/usr/local/bin:$PATH
command -v pip &>/dev/null || {
output "Installing pip"
easy_install pip
}
if ! grep -Eq ^1.7 <(virtualenv --version 2>/dev/null); then
output "Installing virtualenv >1.7"
pip install 'virtualenv>1.7' virtualenvwrapper
fi
command -v coffee &>/dev/null || {
output "Installing coffee script"
curl --insecure https://npmjs.org/install.sh | sh
npm install -g coffee-script
}
;;
*)
error "Unsupported platform"
exit 1
;;
esac
boto==2.6.0
coverage==3.5.3
dogstatsd-python==0.2
lxml==3.0.1
mock==0.8.0
nltk==2.0.3
nose==1.2.1
scipy==0.11.0
path.py
pip
pygraphviz==1.1
pylint==0.26.0
pytz==2012h
scikit-learn==0.12.1
#Provides interface functions to create and save models
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
import sklearn.ensemble
from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
def read_in_test_data(filename):
"""
Reads in test data file found at filename.
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
"""
id, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
id.append(int(id1))
text.append(text1)
e_set.append(int(set1))
score.append(int(score1))
score2.append(int(score12))
return score, text
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file
Returns string
"""
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
In filename, the first column should be integer score data.
The second column should be string text data.
Sep specifies the type of separator between fields.
"""
score, text = [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
        score1, text1 = raw_lines[row].strip().split(sep)
text.append(text1)
score.append(int(score1))
return score, text
def create_essay_set(text, score, prompt_string, generate_additional=True):
"""
Creates an essay set from given data.
Text should be a list of strings corresponding to essay text.
Score should be a list of scores where score[n] corresponds to text[n]
Prompt string is just a string containing the essay prompt.
Generate_additional indicates whether to generate additional essays at the minimum score point or not.
"""
x = EssaySet()
for i in xrange(0, len(text)):
x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional == True:
x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i])
x.update_prompt(prompt_string)
return x
def get_cv_error(clf,feats,scores):
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
except ValueError:
#If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
except:
log.exception("Error getting cv error estimates.")
return results
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
    if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
    if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
additional array is an optional argument that can specify
a numpy array of values to add in
returns a trained FeatureExtractor object and a trained classifier
"""
f = feature_extractor.FeatureExtractor()
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
    if(additional_array is not None and isinstance(additional_array, numpy.ndarray)):
if(additional_array.shape[0]==train_feats.shape[0]):
train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
try:
set_score = numpy.asarray(essays._score, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
"""
Writes out a model to a file.
prompt string is a string containing the prompt
feature_ext is a trained FeatureExtractor object
classifier is a trained classifier
    model_path is the path to write the model file out to
"""
model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text' : text, 'score' : score}
pickle.dump(model_file, file=open(model_path, "w"))
def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_array=None):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
    essay_set = create_essay_set(text, score, prompt)
    feature_ext, clf, cv_error_results = extract_features_and_generate_model(essay_set, additional_array)
    dump_model_to_file(prompt, feature_ext, clf, text, score, model_path)
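# End-to-end usage sketch: read the sample data mentioned in the README,
# train a model, and pickle it (the train.tsv / prompt.txt paths are
# assumptions taken from the README example).
if __name__ == "__main__":
    demo_scores, demo_texts = read_in_test_data("train.tsv")
    demo_prompt = read_in_test_prompt("prompt.txt")
    create_essay_set_and_dump_model(demo_texts, demo_scores, demo_prompt, "models/essay_set_1.p")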
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log = logging.getLogger(__name__)
class PredictorExtractor(object):
def __init__(self):
self._extractors = []
self._initialized = False
def initialize_dictionaries(self, p_set):
success = False
if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
if not (p_set._type == "train"):
error_message = "needs to be an essay set of the train type."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
div_length=len(p_set._essay_sets)
if div_length==0:
div_length=1
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
self._initialized = True
success = True
return success
def gen_feats(self, p_set):
if self._initialized!=True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
raise util_functions.InputError(p_set, error_message)
textual_features = []
for i in xrange(0,len(p_set._essay_sets)):
textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
textual_matrix = numpy.concatenate(textual_features, axis=1)
predictor_matrix = numpy.array(p_set._numeric_features)
        log.debug("Textual matrix shape: {0}".format(textual_matrix.shape))
        log.debug("Predictor matrix shape: {0}".format(predictor_matrix.shape))
overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
return overall_matrix.copy()
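# Minimal usage sketch: build a tiny predictor set and extract its combined
# numeric + textual feature matrix (toy rows; real training needs many more).
if __name__ == "__main__":
    import predictor_set
    demo_pset = predictor_set.PredictorSet(type="train")
    demo_pset.add_row([1.0], ["The first short answer."], 0)
    demo_pset.add_row([2.0], ["The second short answer."], 1)
    demo_extractor = PredictorExtractor()
    demo_extractor.initialize_dictionaries(demo_pset)
    print demo_extractor.gen_feats(demo_pset).shape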
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, type = "train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
self._type = type
self._target=[]
self._textual_features=[]
self._numeric_features=[]
self._essay_sets=[]
def add_row(self, numeric_features, textual_features, target):
#Basic input checking
if not isinstance(target, (int, long, float)):
error_message = "Target is not a numeric value."
log.exception(error_message)
raise util_functions.InputError(target, error_message)
if not isinstance(numeric_features, list):
error_message = "Numeric features are not a list."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if not isinstance(textual_features, list):
error_message = "Textual features are not a list."
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Do some length checking for parameters
if len(self._numeric_features)>0:
numeric_length = len(self._numeric_features[-1])
current_numeric_length = len(numeric_features)
if numeric_length != current_numeric_length:
error_message = "Numeric features are an improper length."
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
if len(self._textual_features)>0:
textual_length = len(self._textual_features[-1])
current_textual_length = len(textual_features)
if textual_length != current_textual_length:
error_message = "Textual features are an improper length."
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Now check to see if text features and numeric features are individually correct
for i in xrange(0,len(numeric_features)):
try:
numeric_features[i] = float(numeric_features[i])
except:
error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
log.exception(error_message)
raise util_functions.InputError(numeric_features, error_message)
for i in xrange(0,len(textual_features)):
try:
textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
except:
error_message = "Textual feature {0} not string.".format(textual_features[i])
log.exception(error_message)
raise util_functions.InputError(textual_features, error_message)
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(type=self._type))
#Add numeric and textual features
self._numeric_features.append(numeric_features)
self._textual_features.append(textual_features)
#Add targets
self._target.append(target)
#Add textual features to essay sets
for i in xrange(0,len(textual_features)):
self._essay_sets[i].add_essay(textual_features[i], target)
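# Minimal sketch of add_row's input validation: rows must keep a consistent
# shape, otherwise util_functions.InputError is raised.
if __name__ == "__main__":
    demo_pset = PredictorSet(type="train")
    demo_pset.add_row([3.0], ["A valid textual feature."], 1)
    try:
        demo_pset.add_row([3.0, 4.0], ["Numeric features now have the wrong length."], 1)
    except util_functions.InputError:
        print "InputError raised for mismatched numeric feature length."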