Commit 51d33f29 by VikParuchuri

Merge pull request #9 from MITx/vik/deployment_work

Vik/deployment work
parents ed930658 18fdc2ab
@@ -3,3 +3,4 @@ __pycache__/
models/
*.pyc
*~
tests/
aspell
\ No newline at end of file
"""
Functions that create a machine learning model from training data
"""
import os
import sys
import logging
log = logging.getLogger(__name__)
from statsd import statsd
import numpy
#Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
#Import modules that are dependent on the base path
import model_creator
import util_functions
import predictor_set
import predictor_extractor
from statsd import statsd
#Make a log
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string,model_path):
def create(text,score,prompt_string):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
"""
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : ""}
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
'score' : score, 'text' : text, 'prompt' : prompt_string}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
@@ -28,18 +44,30 @@ def create(text,score,prompt_string,model_path):
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
try:
if len(util_functions.f7(list(score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
except:
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
except:
msg = "essay set creation failed."
results['errors'].append(msg)
log.exception(msg)
try:
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['algorithm'] = type
results['success']=True
except:
msg = "feature extraction and model creation failed."
@@ -53,7 +81,17 @@ def create(text,score,prompt_string,model_path):
return results
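#Hedged usage sketch (editorial addition, not part of this commit). The essay
#texts, scores, and prompt below are invented placeholders; in practice they
#come from a training set of human-graded essays.
sample_text = ["The experiment needed a control group.", "i dont know"]
sample_score = [2, 0]
sample_prompt = "Describe how you would replicate the experiment."
model_results = create(sample_text, sample_score, sample_prompt)
if model_results['success']:
    print model_results['cv_kappa'], model_results['cv_mean_absolute_error']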
def create_generic(numeric_values, textual_values, target, model_path, algorithm = util_functions.AlgorithmTypes.regression):
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
"""
Creates a model from a generic list of numeric values and text values
numeric_values - A list of lists that are the predictors
textual_values - A list of lists that are the predictors
(each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
algorithm - the type of algorithm that will be used
"""
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
@@ -64,6 +102,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
return results
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
@@ -73,6 +112,7 @@ def create_generic(numeric_values, textual_values, target, model_path, algorithm
log.exception(msg)
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
......
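#Hedged usage sketch (editorial addition): each numeric row pairs with the
#textual row at the same index, and target holds one integer score per row.
#All values below are invented.
numeric_values = [[1, 5], [2, 3], [4, 1]]
textual_values = [["first short answer"], ["second short answer"], ["third short answer"]]
target = [0, 1, 2]
generic_results = create_generic(numeric_values, textual_values, target,
                                 algorithm=util_functions.AlgorithmTypes.regression)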
cv_pred actual
\ No newline at end of file
sudo apt-get update
sudo apt-get upgrade gcc
sudo xargs -a apt-packages.txt apt-get install
sudo pip install virtualenv
sudo mkdir /opt/edx
sudo virtualenv /opt/edx
source /opt/edx/bin/activate
cd /opt/wwc/machine-learning
pip install numpy
pip install scipy
pip install -r requirements.txt
cd /opt/wwc/machine-learning
pip install -r requirements.txt
python -m nltk.downloader maxent_treebank_pos_tagger wordnet
sudo mv /path/to/nltk_data /usr/share
\ No newline at end of file
@@ -77,7 +77,10 @@ class EssaySet(object):
self._id.append(max_id + 1)
self._score.append(essay_score)
# Clean text by removing non digit/word/punctuation characters
essay_text=str(essay_text.encode('ascii', 'ignore'))
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
......
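#Editorial note on the try/except above: in Python 2, calling
#.encode('ascii', 'ignore') on a byte string implicitly decodes it as ASCII
#first, so non-ASCII bytes raise UnicodeDecodeError. A minimal sketch:
byte_text = 'caf\xc3\xa9'  #utf-8 bytes for "cafe" with an accented e
#byte_text.encode('ascii', 'ignore') would raise UnicodeDecodeError here
ascii_text = byte_text.decode('utf-8', 'replace').encode('ascii', 'ignore')  #-> 'caf'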
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
@@ -24,6 +24,9 @@ if not base_path.endswith("/"):
log = logging.getLogger(__name__)
#Paths to needed data files
NGRAM_PATH = base_path + "data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH = util_functions.ESSAY_CORPUS_PATH
class FeatureExtractor(object):
def __init__(self):
@@ -41,17 +44,26 @@ class FeatureExtractor(object):
"""
if(hasattr(e_set, '_type')):
if(e_set._type == "train"):
#normal text (unstemmed) useful words/bigrams
nvocab = util_functions.get_vocab(e_set._text, e_set._score, max_feats2 = max_feats2)
#stemmed and spell corrected vocab useful words/ngrams
svocab = util_functions.get_vocab(e_set._clean_stem_text, e_set._score, max_feats2 = max_feats2)
#dictionary trained on proper vocab
self._normal_dict = CountVectorizer(ngram_range=(1,2), vocabulary=nvocab)
#dictionary trained on proper vocab
self._stem_dict = CountVectorizer(ngram_range=(1,2), vocabulary=svocab)
self.dict_initialized = True
#Average spelling errors in set, needed later for spelling detection
self._mean_spelling_errors=sum(e_set._spelling_errors)/float(len(e_set._spelling_errors))
self._spell_errors_per_character=sum(e_set._spelling_errors)/float(sum([len(t) for t in e_set._text]))
#Gets the number and positions of grammar errors
good_pos_tags,bad_pos_positions=self._get_grammar_errors(e_set._pos,e_set._text,e_set._tokens)
self._grammar_errors_per_character=(sum(good_pos_tags)/float(sum([len(t) for t in e_set._text])))
#Generate bag of words features
bag_feats=self.gen_bag_feats(e_set)
#Sum of a row of bag of words features (topical words in an essay)
f_row_sum=numpy.sum(bag_feats[:,:])
#Average index of how "topical" essays are
self._mean_f_prop=f_row_sum/float(sum([len(t) for t in e_set._text]))
ret = "ok"
else:
@@ -65,13 +77,13 @@ class FeatureExtractor(object):
Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
Returns the list and caches the file
"""
if(os.path.isfile(base_path + "good_pos_ngrams.p")):
good_pos_ngrams = pickle.load(open(base_path + 'good_pos_ngrams.p', 'rb'))
elif os.path.isfile(base_path + "essaycorpus.txt"):
essay_corpus = open(base_path + "essaycorpus.txt").read()
if(os.path.isfile(NGRAM_PATH)):
good_pos_ngrams = pickle.load(open(NGRAM_PATH, 'rb'))
elif os.path.isfile(ESSAY_CORPUS_PATH):
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = util_functions.sub_chars(essay_corpus)
good_pos_ngrams = util_functions.regenerate_good_tokens(essay_corpus)
pickle.dump(good_pos_ngrams, open(base_path + 'good_pos_ngrams.p', 'wb'))
pickle.dump(good_pos_ngrams, open(NGRAM_PATH, 'wb'))
else:
#Hard coded list in case the needed files cannot be found
good_pos_ngrams=['NN PRP', 'NN PRP .', 'NN PRP . DT', 'PRP .', 'PRP . DT', 'PRP . DT NNP', '. DT',
@@ -85,6 +97,9 @@ class FeatureExtractor(object):
def _get_grammar_errors(self,pos,text,tokens):
"""
Internal function to get the number of grammar errors in given text
pos - part of speech tagged text (list)
text - normal text (list)
tokens - list of lists of tokenized text
"""
word_counts = [max(len(t),1) for t in tokens]
good_pos_tags = []
@@ -121,6 +136,7 @@ class FeatureExtractor(object):
Generates length based features from an essay set
Generally an internal function called by gen_feats
Returns an array of length features
e_set - EssaySet object
"""
text = e_set._text
lengths = [len(e) for e in text]
@@ -144,6 +160,7 @@ class FeatureExtractor(object):
Generates bag of words features from an input essay set and trained FeatureExtractor
Generally called by gen_feats
Returns an array of features
e_set - EssaySet object
"""
if(hasattr(self, '_stem_dict')):
sfeats = self._stem_dict.transform(e_set._clean_stem_text)
@@ -157,6 +174,7 @@ class FeatureExtractor(object):
"""
Generates bag of words, length, and prompt features from an essay set object
returns an array of features
e_set - EssaySet object
"""
bag_feats = self.gen_bag_feats(e_set)
length_feats = self.gen_length_feats(e_set)
@@ -171,6 +189,7 @@ class FeatureExtractor(object):
Generates prompt based features from an essay set object and internal prompt variable.
Generally called internally by gen_feats
Returns an array of prompt features
e_set - EssaySet object
"""
prompt_toks = nltk.word_tokenize(e_set._prompt)
expand_syns = []
@@ -206,6 +225,7 @@ class FeatureExtractor(object):
features - optionally, pass in a matrix of features extracted from e_set using FeatureExtractor
in order to get off topic feedback.
Returns a list of lists (one list per essay in e_set)
e_set - EssaySet object
"""
#Set ratio to modify thresholds for grammar/spelling errors
@@ -220,9 +240,9 @@ class FeatureExtractor(object):
all_feedback=[]
for m in xrange(0,len(e_set._text)):
#Be very careful about changing these messages!
individual_feedback={'grammar' : "Grammar: Ok.", 'spelling' : "Spelling: Ok.",
'topicality' : "Topicality: Ok.", 'markup_text' : "",
'prompt_overlap' : "Prompt Overlap: Ok.",
individual_feedback={'grammar' : "Grammar: Ok.",
'spelling' : "Spelling: Ok.",
'markup_text' : "",
'grammar_per_char' : set_grammar_per_character[m],
'spelling_per_char' : set_spell_errors_per_character[m],
'too_similar_to_prompt' : False,
......
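#Hedged end-to-end sketch of the extractor above (editorial addition; the
#essays are invented, and EssaySet is assumed importable from essay_set as in
#this repo's test scripts).
train_set = essay_set.EssaySet(type="train")
train_set.add_essay("The experiment needed a control group.", 2)
train_set.add_essay("i dont know", 0)
train_set.update_prompt("Describe the experiment.")
extractor = FeatureExtractor()
extractor.initialize_dictionaries(train_set)
train_feats = extractor.gen_feats(train_set)  #bag of words + length + prompt features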
#Grader called by pyxserver_wsgi.py
#Loads a grader file, which is a dict containing the prompt of the question,
#a feature extractor object, and a trained model.
#Extracts features and runs trained model on the submission to produce a final score.
#Correctness determined by ratio of score to max possible score.
#Requires aspell to be installed and added to the path.
"""
Functions to score specified data using specified ML models
"""
import sys
import pickle
@@ -12,9 +9,11 @@ import numpy
import logging
from statsd import statsd
#Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
#Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
@@ -28,18 +27,31 @@ import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,grader_config,submission):
def grade(grader_data,submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
{
'model' : trained model,
'extractor' : trained feature extractor,
'prompt' : prompt for the question,
'algorithm' : algorithm for the question,
}
submission - The student submission (string)
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=EssaySet(type="test")
#Try to add essays to essay set object
#This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
@@ -57,17 +69,14 @@ def grade(grader_data,grader_config,submission):
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
@@ -75,24 +84,23 @@ def grade(grader_data,grader_config,submission):
results['success']=True
#Generate short form output--number of problem areas identified in feedback
problem_areas=0
for tag in feedback:
if tag in ['topicality', 'prompt-overlap', 'spelling', 'grammar']:
problem_areas+=len(feedback[tag])>5
#Add feedback to results
results['feedback']={
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
}
if results['score']/float(max_score)<.33:
results['feedback'].update(
{'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
#Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
@@ -103,7 +111,17 @@ def grade(grader_data,grader_config,submission):
return results
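#Hedged usage sketch (editorial addition): grader_data mirrors the dictionary
#documented in the grade() docstring. trained_model and trained_extractor are
#placeholders for objects produced by create(), typically unpickled from a
#stored model file.
grader_data = {
    'model': trained_model,
    'extractor': trained_extractor,
    'prompt': "Describe how you would replicate the experiment.",
    'algorithm': util_functions.AlgorithmTypes.classification,
    'score': [0, 1, 2, 3],  #training-set scores, used for the confidence range
}
grade_results = grade(grader_data, "The experiment needed a control group.")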
def grade_generic(grader_data, grader_config, numeric_features, textual_features):
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
grader_data -- dictionary containing:
{
'algorithm' - Type of algorithm to use to score
}
numeric_features - list of numeric features to predict on
textual_features - list of textual features to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
@@ -129,16 +147,7 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
#Try to determine confidence level
try:
min_score=min(numpy.asarray(grader_data['score']))
max_score=max(numpy.asarray(grader_data['score']))
if grader_data['algorithm'] == util_functions.AlgorithmTypes.classification:
raw_confidence=grader_data['model'].predict_proba(grader_feats)[0,(results['score']-min_score)]
#TODO: Normalize confidence somehow here
results['confidence']=raw_confidence
else:
raw_confidence = grader_data['model'].predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
results['confidence'] = confidence
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
@@ -151,3 +160,25 @@ def grade_generic(grader_data, grader_config, numeric_features, textual_features
results['success'] = True
return results
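#Hedged usage sketch (editorial addition): the feature lists are invented, and
#grader_data is assumed to also carry the trained generic 'model' and
#'extractor' alongside the 'algorithm' key documented above.
generic_grader_data = {
    'model': trained_model,          #placeholder trained model
    'extractor': trained_extractor,  #placeholder trained PredictorExtractor
    'algorithm': util_functions.AlgorithmTypes.regression,
    'score': [0, 1, 2],              #training-set scores
}
generic_results = grade_generic(generic_grader_data, [1, 5], ["a short textual answer"])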
def get_confidence_value(algorithm,model,grader_feats,score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
model - a trained model
grader_feats - a row of features used by the model for classification/regression
score - The score assigned to the submission by a prior model
scores - a list of the scores (from the training set) that the model can assign
"""
min_score=min(numpy.asarray(scores))
max_score=max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification:
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(score-min_score)]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
else:
raw_confidence = model.predict(grader_feats)[0]
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
return confidence
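#Editorial worked example of the regression branch above: a raw prediction of
#2.7 yields max(2.7 - 2.0, 3.0 - 2.7) = max(0.7, 0.3) = 0.7, so confidence
#approaches 1.0 as the prediction nears an integer score and bottoms out at
#0.5 midway between two adjacent scores.
raw = 2.7
conf = max(raw - math.floor(raw), math.ceil(raw) - raw)  #0.7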
python-pip
python-scipy
python-mysqldb
ipython
nginx
git
redis-server
libmysqlclient-dev
gfortran
libblas3gf
libblas-dev
liblapack3gf
liblapack-dev
libatlas-base-dev
libxml2-dev
libxslt1-dev
libreadline6
libreadline6-dev
build-essential
curl
aspell
python
\ No newline at end of file
@@ -87,10 +87,16 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return x
def get_cv_error(clf,feats,scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classifier and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(cv_preds-scores))
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
@@ -103,15 +109,11 @@ def get_cv_error(clf,feats,scores):
return results
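#Hedged usage sketch (editorial addition): clf2, train_feats, and e_set are
#placeholders for the CV model from get_algorithms() below, the feature matrix
#from a FeatureExtractor, and a trained EssaySet, as wired together in
#extract_features_and_generate_model().
cv_results = get_cv_error(clf2, train_feats, e_set._score)
if cv_results['success']:
    print cv_results['kappa'], cv_results['mae']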
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
if(algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
algorithm = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
def get_algorithms(type):
"""
Gets two models for a given algorithm type and returns them: the first for predicting, the second for cv error.
type - one of util_functions.AlgorithmTypes
"""
if type == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
@@ -122,7 +124,24 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
return clf, clf2
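#Hedged usage note (editorial addition): one call picks both the predictor and
#the model used for cv error. learn_rate is the parameter name in the old
#scikit-learn releases this code targets; modern releases renamed it to
#learning_rate.
clf, clf2 = get_algorithms(util_functions.AlgorithmTypes.classification)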
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmTypes
"""
if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
type = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
@@ -137,7 +156,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
return f, clf, cv_error_results
def extract_features_and_generate_model(essays,additional_array=None):
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
@@ -149,20 +168,18 @@ def extract_features_and_generate_model(essays,additional_array=None):
f.initialize_dictionaries(essays)
train_feats = f.gen_feats(essays)
if(additional_array!=None and type(additional_array)==type(numpy.array([1]))):
if(additional_array.shape[0]==train_feats.shape[0]):
train_feats=numpy.concatenate((train_feats,additional_array),axis=1)
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
try:
set_score = numpy.asarray(essays._score, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
......
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
@@ -12,6 +16,7 @@ import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
@@ -28,6 +33,10 @@ class PredictorExtractor(object):
self._initialized = False
def initialize_dictionaries(self, p_set):
"""
Initialize dictionaries with the textual inputs in the PredictorSet object
p_set - PredictorSet object that has had data fed in
"""
success = False
if not (hasattr(p_set, '_type')):
error_message = "needs to be an essay set of the train type."
@@ -43,6 +52,7 @@
if div_length==0:
div_length=1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2 = int(math.floor(200/div_length))
for i in xrange(0,len(p_set._essay_sets)):
self._extractors.append(FeatureExtractor())
@@ -52,6 +62,10 @@
return success
def gen_feats(self, p_set):
"""
Generates features based on an input p_set
p_set - PredictorSet
"""
if self._initialized!=True:
error_message = "Dictionaries have not been initialized."
log.exception(error_message)
......
#!/usr/bin/python
#------------------------------------------------------------
# Run me with (may need su privilege for logging):
# gunicorn -w 4 -b 127.0.0.1:3031 pyxserver_wsgi:application
#------------------------------------------------------------
import cgi # for the escape() function
import json
import logging
import os
import os.path
import sys
from time import localtime, strftime
script_dir = os.path.dirname(__file__)
sys.path.append(script_dir)
import settings # Not django, but do something similar
# make sure we can find the grader files
sys.path.append(settings.GRADER_ROOT)
import grade
results_template = """
<div class="test">
<header>Test results</header>
<section>
<div class="shortform">
{status}
</div>
<div class="longform">
{errors}
{results}
</div>
</section>
</div>
"""
results_correct_template = """
<div class="result-output result-correct">
<h4>{short-description}</h4>
<p>{long-description}</p>
<dl>
<dt>Output:</dt>
<dd class="result-actual-output">
<pre>{actual-output}</pre>
</dd>
</dl>
</div>
"""
results_incorrect_template = """
<div class="result-output result-incorrect">
<h4>{short-description}</h4>
<p>{long-description}</p>
<dl>
<dt>Your output:</dt>
<dd class="result-actual-output"><pre>{actual-output}</pre></dd>
<dt>Correct output:</dt>
<dd><pre>{expected-output}</pre></dd>
</dl>
</div>
"""
def format_errors(errors):
esc = cgi.escape
error_string = ''
error_list = [esc(e) for e in errors or []]
if error_list:
items = '\n'.join(['<li><pre>{0}</pre></li>\n'.format(e) for e in error_list])
error_string = '<ul>\n{0}</ul>\n'.format(items)
error_string = '<div class="result-errors">{0}</div>'.format(error_string)
return error_string
def to_dict(result):
# long description may or may not be provided. If not, don't display it.
# TODO: replace with mako template
esc = cgi.escape
if result[1]:
long_desc = '<p>{0}</p>'.format(esc(result[1]))
else:
long_desc = ''
return {'short-description': esc(result[0]),
'long-description': long_desc,
'correct': result[2], # Boolean; don't escape.
'expected-output': esc(result[3]),
'actual-output': esc(result[4])
}
def render_results(results):
output = []
test_results = [to_dict(r) for r in results['tests']]
for result in test_results:
if result['correct']:
template = results_correct_template
else:
template = results_incorrect_template
output.append(template.format(**result))
errors = format_errors(results['errors'])
status = 'INCORRECT'
if errors:
status = 'ERROR'
elif results['correct']:
status = 'CORRECT'
return results_template.format(status=status,
errors=errors,
results=''.join(output))
def do_GET(data):
return "Hey, the time is %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())
def do_POST(data):
# This server expects jobs to be pushed to it from the queue
xpackage = json.loads(data)
body = xpackage['xqueue_body']
# Delivery from the lms
body = json.loads(body)
student_response = body['student_response']
payload = body['grader_payload']
try:
grader_config = json.loads(payload)
except ValueError as err:
# If parsing json fails, erroring is fine--something is wrong in the content.
# However, for debugging, still want to see what the problem is
raise
relative_grader_path = grader_config['grader']
grader_path = os.path.join(settings.GRADER_ROOT, relative_grader_path)
results = grade.grade(grader_path, student_response)
# Make valid JSON message
reply = { 'correct': results['correct'],
'score': results['score'],
'msg': render_results(results) }
return json.dumps(reply)
# Entry point
def application(env, start_response):
# Handle request
method = env['REQUEST_METHOD']
data = env['wsgi.input'].read()
def post_wrapper(data):
try:
return do_POST(data)
except:
return None
handlers = {'GET': do_GET,
'POST': post_wrapper,
}
if method in handlers.keys():
reply = handlers[method](data)
if reply is not None:
start_response('200 OK', [('Content-Type', 'text/html')])
return reply
# If we fell through to here, complain.
start_response('404 Not Found', [('Content-Type', 'text/plain')])
return ''
# Not django (for now), but use the same settings format anyway
import json
import os
from path import path
import sys
ROOT_PATH = path(__file__).dirname()
REPO_PATH = ROOT_PATH
ENV_ROOT = REPO_PATH.dirname()
# DEFAULTS
DEBUG = False
# Must end in '/'
RUN_URL = 'http://127.0.0.1:3031/' # Victor's VM ...
RUN_URL = 'http://sandbox-runserver-001.m.edx.org:8080/'
RUN_URL = 'http://sandbox-runserver.elb.edx.org:80/'
GRADER_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
# AWS
if os.path.isfile(ENV_ROOT / "env.json"):
print "Opening env.json file"
with open(ENV_ROOT / "env.json") as env_file:
ENV_TOKENS = json.load(env_file)
RUN_URL = ENV_TOKENS['RUN_URL']
LOG_DIR = ENV_TOKENS['LOG_DIR']
# Should be absolute path to 6.00 grader dir.
# NOTE: This means we only get one version of 6.00 graders available--has to
# be the same for internal and external class. Not critical -- can always
# use different grader file if want different problems.
GRADER_ROOT = ENV_TOKENS.get('GRADER_ROOT')
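# A hypothetical env.json matching the keys read above (all values are
# invented examples, not taken from any real deployment):
# {
#     "RUN_URL": "http://127.0.0.1:3031/",
#     "LOG_DIR": "/var/log/xserver",
#     "GRADER_ROOT": "/opt/wwc/machine-learning"
# }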
this is an incorrect response
\ No newline at end of file
,vik,vikp,02.11.2012 17:19,file:///home/vik/.config/libreoffice/3;
\ No newline at end of file
This experement didn't have a controle and the grupe didn't do multiple triles. You would also may need to know what tempriture the rome is.
{"grader":"tests/models/essay_set_1.p"}
In order for I for replicate this expirement I woukd need to know what are the reaserching with this expirement what kind of result are being booked at and the mass of each sample at the end of expirment theie results.
<b><fg>In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard&&&&&and many other sample materials that they&;;;& didn't use and would get different results. Also I would also<>>> need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell//////where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.<b><b>
{"grader":"tests/models/essay_set_1.p"}
In order to conduct the experiment, the students would need to know the mass of the marble, the height of the drop, and the air temperature.
"A group of students wrote the following procedure for their investigation. Procedure: 1. Determine the mass of four different samples. 2. Pour vinegar in each of four separate, but identical, containers. 3. Place a sample of one material into one container and label. Repeat with remaining samples, placing a single sample into a single container. 4. After 24 hours, remove the samples from the containers and rinse each sample with distilled water. 5. Allow the samples to sit and dry for 30 minutes. 6. Determine the mass of each sample. The students’ data are recorded in the table below. Sample Starting Mass (g) Ending Mass (g) Difference in Mass (g) Marble 9.8 9.4 –0.4 Limestone 10.4 9.1 –1.3 Wood 11.2 11.2 0.0 Plastic 7.2 7.1 –0.1"
In order to replicate this experiment, I would need to know additional information such as the four different samples that they used (because I could have choosen metal, carbboard and many other sample materials that they didn't use and would get different results. Also I would also need to know the amount of vinegar to pour because this can caute a major change. Lastly, they might want to tell where to sit the samples while they dry for 30 minutes because if they are sitting in room temp. or by a light source makes a difference too.
{"grader":"tests/models/essay_set_1.p"}
this is an incorrect response
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import util_functions
import essay_set
import feature_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME="sa_data.tsv"
all_err=[]
all_kappa=[]
for t_len in [0,50,100,200,300]:
sa_val = file(FILENAME)
scores=[]
texts=[]
lines=sa_val.readlines()
eset=essay_set.EssaySet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
if len(text)>t_len:
scores.append(int(score))
texts.append(text)
eset.add_essay(text,int(score))
#if int(score)==0:
# eset.generate_additional_essays(text,int(score))
extractor=feature_extractor.FeatureExtractor()
extractor.initialize_dictionaries(eset)
train_feats=extractor.gen_feats(eset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
all_err.append(err)
all_kappa.append(kappa)
"""
outfile=open("full_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}".format(cv_preds[i],scores[i]))
"""
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import util_functions
import essay_set
import feature_extractor
import numpy
import math
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
filenames = ['LSQ_W09_60_MLT.tsv',
'LSQ_W10_22_a.tsv',
'LSQ_W11_21_MLT.tsv',
]
for filename in filenames:
base_name = base_path + filename
print base_name
sa_val = file(base_name)
scores=[]
texts=[]
lines=sa_val.readlines()
eset=essay_set.EssaySet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
scores.append(int(score))
texts.append(text)
eset.add_essay(text,int(score))
#if int(score)==0:
# eset.generate_additional_essays(text,int(score))
extractor=feature_extractor.FeatureExtractor()
extractor.initialize_dictionaries(eset)
train_feats=extractor.gen_feats(eset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores, num_chunks = int(math.floor(len(texts)/2)))
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
outfile=open(filename + "_cvout.tsv",'w+')
outfile.write("cv_pred" + "\t" + "actual\n")
for i in xrange(0,len(cv_preds)):
outfile.write("{0}\t{1}\n".format(str(cv_preds[i]),str(scores[i])))
outfile.close()
import os
import sys
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path=os.path.abspath(os.path.join(base_path,'..'))
sys.path.append(one_up_path)
import util_functions
import predictor_set
import predictor_extractor
import numpy
from sklearn.ensemble import GradientBoostingClassifier
if not base_path.endswith("/"):
base_path=base_path+"/"
FILENAME="sa_data.tsv"
t_len = 0 #Minimum text length for a row to be used; undefined in the original script, assumed 0 here
sa_val = file(FILENAME)
scores=[]
texts=[]
lines=sa_val.readlines()
pset = predictor_set.PredictorSet(type="train")
for i in xrange(1,len(lines)):
score,text=lines[i].split("\t\"")
if len(text)>t_len:
scores.append(int(score))
texts.append(text)
pset.add_row([1],[text],int(score))
extractor=predictor_extractor.PredictorExtractor()
extractor.initialize_dictionaries(pset)
train_feats=extractor.gen_feats(pset)
clf=GradientBoostingClassifier(n_estimators=100, learn_rate=.05,max_depth=4, random_state=1,min_samples_leaf=3)
cv_preds=util_functions.gen_cv_preds(clf,train_feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
print err
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
print kappa
\ No newline at end of file
#!/usr/bin/env python
"""
Send some test programs to an xserver.
For each dir in the current directory, send the contents of payload.json and each
of the answer*.txt, right*.py and wrong*.txt files.
"""
import argparse
import glob
import json
import os
import os.path
from path import path
import requests
import sys
import time
xserver = 'http://127.0.0.1:3031/'
def send(payload, answer):
"""
Send a grading request to the xserver
"""
body = {'grader_payload': payload,
'student_response': answer}
data = {'xqueue_body': json.dumps(body),
'xqueue_files': ''}
start = time.time()
r = requests.post(xserver, data=json.dumps(data))
end = time.time()
print "Request took %.03f sec" % (end - start)
if r.status_code != requests.codes.ok:
print "Request error:{0},{1},{2}".format(r.headers,payload,answer)
parsed_text=json.loads(r.text)
print("\nAnswer: {0}\nScore: {1} Correct: {2} \nFeedback: {3}"
.format(answer,parsed_text['score'],parsed_text['correct'],
parsed_text['feedback']))
#print "Score:{0} {1}".format(parsed_text['score'],parsed_text['correct'])
return r.text
def check_contains(string, substr):
if not substr in string:
print "ERROR: Expected to be {0}".format(substr)
return False
else:
return True
def check_not_contains(string, substr):
if substr in string:
print "ERROR: Expected to be {0}".format(substr)
return False
else:
return True
def check_right(string):
return check_contains(string, '\"correct\": true')
def check_wrong(string):
return check_contains(string, '\"correct\": false')
def globs(dirname, *patterns):
"""
Produce a sequence of all the files matching any of our patterns in dirname.
"""
for pat in patterns:
for fname in glob.glob(os.path.join(dirname, pat)):
yield fname
def contents(fname):
"""
Return the contents of the file `fname`.
"""
with open(fname) as f:
return f.read()
def check(dirname,type):
"""
Look for payload.json, answer*.py, right*.py, wrong*.py, run tests.
"""
payload_file = os.path.join(dirname, 'payload.json')
if os.path.isfile(payload_file):
payload = contents(payload_file)
print("found payload: " + payload)
else:
graders = list(globs(dirname, 'grade*.py'))
if not graders:
#print "No payload.json or grade*.py in {0}".format(dirname)
return
if len(graders) > 1:
print "More than one grader in {0}".format(dirname)
return
payload = json.dumps({'grader': os.path.abspath(graders[0])})
for name in globs(dirname, 'answer*.txt', 'right*.py'):
#print "Checking correct response from {0}".format(name)
answer = contents(name)
right=check_right(send(payload, answer))
for name in globs(dirname, 'wrong*.txt'):
#print "Checking wrong response from {0}".format(name)
answer = contents(name)
wrong=check_wrong(send(payload, answer))
if(type=="test"):
assert wrong and right
def main(argv):
global xserver
#parser = argparse.ArgumentParser(description="Send dummy requests to a qserver")
#parser.add_argument('server')
#parser.add_argument('root', nargs='?')
#args = parser.parse_args(argv)
#xserver = args.server
if not xserver.endswith('/'):
xserver += '/'
#root = args.root or '.'
root=os.path.dirname( os.path.abspath(__file__ ))
for dirpath, _, _ in os.walk(root):
print("checking" + dirpath)
check(dirpath,"normal")
if __name__=="__main__":
main(sys.argv[1:])
def test_graders():
root=os.path.dirname( os.path.abspath(__file__ ))
for dirpath, _, _ in os.walk(root):
print("checking" + dirpath)
yield check, dirpath, "test"
def test_model_creation():
model_creator_dir=os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
# Run with arguments train_file prompt_file model_path to generate a sample model file
import os
import sys
import argparse
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..'))
sys.path.append(one_up_path)
import model_creator
def main(argv):
parser = argparse.ArgumentParser(description="Generate model from test data files")
parser.add_argument('train_file')
parser.add_argument('prompt_file')
parser.add_argument('model_path')
args = parser.parse_args(argv)
score, text = model_creator.read_in_test_data(args.train_file)
prompt_string = model_creator.read_in_test_prompt(args.prompt_file)
print("data read")
e_set = model_creator.create_essay_set(text, score, prompt_string)
print("essay set created")
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
print("features pulled out and model generated")
model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, text, score, args.model_path)
print("model file written")
if __name__ == "__main__":
main(sys.argv[1:])
def test_model_creation():
try:
score, text = model_creator.read_in_test_data("train.tsv")
prompt_string = model_creator.read_in_test_prompt("prompt.txt")
e_set = model_creator.create_essay_set(text, score, prompt_string)
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set)
model_creator.dump_model_to_file(prompt_string, feature_ext, classifier, text, score, "models/test_model.p") #Placeholder path; the original passed an undefined args.model_path
assert True
except:
assert False
in order to replicate this experiment , we would need to know the temperature of the vinegar as well as how much vinegar to put in . both of these could vary and therefore change the result of the experiment .
\ No newline at end of file
in order for i for replicate this expirement i woukd need to know what are the reaserching with this expirement what kind of result are being booked at and the mass of each sample at the end of expirment theie results . i didn't know what the answer is .
\ No newline at end of file
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from external_code.fisher import fisher
aspell_path = "aspell"
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
import fisher
import numpy
from itertools import chain
import math
import nltk
import random
import pickle
from path import path
import logging
import sys
log=logging.getLogger(__name__)
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
base_path=base_path+"/"
#Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
class AlgorithmTypes(object):
"""
Defines what types of algorithm can be used
"""
regression = "regression"
classification = "classifiction"
def create_model_path(model_path):
"""
Creates a path to model files
model_path - string
"""
if not model_path.startswith("/") and not model_path.startswith("models/"):
model_path="/" + model_path
if not model_path.startswith("models"):
@@ -36,7 +51,9 @@ def sub_chars(string):
Strips illegal characters from a string. Used to sanitize input essays.
Removes all characters other than letters and basic punctuation (digits and symbols are stripped).
Returns sanitized string.
string - string
"""
#Define replacement patterns
sub_pat = r"[^A-Za-z\.\?!,';:]"
char_pat = r"\."
com_pat = r","
@@ -44,26 +61,18 @@ def sub_chars(string):
excl_pat = r"!"
sem_pat = r";"
col_pat = r":"
whitespace_pat = r"\s{1,}"
whitespace_comp = re.compile(whitespace_pat)
sub_comp = re.compile(sub_pat)
char_comp = re.compile(char_pat)
com_comp = re.compile(com_pat)
ques_comp = re.compile(ques_pat)
excl_comp = re.compile(excl_pat)
sem_comp = re.compile(sem_pat)
col_comp = re.compile(col_pat)
nstring = sub_comp.sub(" ", string)
nstring = char_comp.sub(" .", nstring)
nstring = com_comp.sub(" ,", nstring)
nstring = ques_comp.sub(" ?", nstring)
nstring = excl_comp.sub(" !", nstring)
nstring = sem_comp.sub(" ;", nstring)
nstring = col_comp.sub(" :", nstring)
nstring = whitespace_comp.sub(" ", nstring)
#Replace text. Ordering is very important!
nstring = re.sub(sub_pat, " ", string)
nstring = re.sub(char_pat," .", nstring)
nstring = re.sub(com_pat, " ,", nstring)
nstring = re.sub(ques_pat, " ?", nstring)
nstring = re.sub(excl_pat, " !", nstring)
nstring = re.sub(sem_pat, " ;", nstring)
nstring = re.sub(col_pat, " :", nstring)
nstring = re.sub(whitespace_pat, " ", nstring)
return nstring
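#Editorial worked example: digits and symbols are dropped, kept punctuation is
#padded with a leading space, and runs of whitespace collapse to one space.
#sub_chars("It's 90% effective, right?") -> "It's effective , right ?"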
@@ -72,7 +81,10 @@ def spell_correct(string):
Uses aspell to spell correct an input string.
Requires aspell to be installed and added to the path.
Returns the spell corrected string if aspell is found, original string if not.
string - string
"""
#Create a temp file so that aspell could be used
f = open('tmpfile', 'w')
f.write(string)
f_path = os.path.abspath(f.name)
@@ -81,13 +93,16 @@ def spell_correct(string):
p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
except:
log.exception("Could not find aspell, so could not spell correct!")
#Return original string if aspell fails
return string,0, string
#Aspell returns a list of incorrect words with the above flags
incorrect = p.readlines()
p.close()
incorrect_words = list()
correct_spelling = list()
for i in range(1, len(incorrect)):
if(len(incorrect[i]) > 10):
#Reformat aspell output to make sense
match = re.search(":", incorrect[i])
if hasattr(match, "start"):
begstring = incorrect[i][2:match.start()]
@@ -101,6 +116,8 @@ def spell_correct(string):
incorrect_words.append(begword)
correct_spelling.append(sug)
#Create markup based on spelling errors
newstring = string
markup_string = string
already_subbed=[]
@@ -419,13 +436,13 @@ def get_separator_words(toks1):
Returns a list of separator words
"""
tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
if(os.path.isfile("essay_cor_tokens.p")):
toks2 = pickle.load(open('essay_cor_tokens.p', 'rb'))
if(os.path.isfile(ESSAY_COR_TOKENS_PATH)):
toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
else:
essay_corpus = open("essaycorpus.txt").read()
essay_corpus = open(ESSAY_CORPUS_PATH).read()
essay_corpus = sub_chars(essay_corpus)
toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
pickle.dump(toks2, open('essay_cor_tokens.p', 'wb'))
pickle.dump(toks2, open(ESSAY_COR_TOKENS_PATH, 'wb'))
sep_words = []
for word in tab_toks1.keys():
tok1_present = tab_toks1[word]
......
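#Hedged usage sketch (editorial addition): judging from the fallback return
#above, spell_correct appears to return (corrected_string, error_count,
#markup_string). It requires aspell on the path; without aspell the input
#string comes back unchanged.
corrected, num_errors, markup = spell_correct("Ths is a tst sentence")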