Commit f238e7db by Vik Paruchuri

Remove fisher, fix generic model generation, update tests

parent 5c6a7ad7
......@@ -10,4 +10,5 @@ machine_learning.egg-info/
ease.egg-info/
*.egg
.coverage
*.orig
......@@ -4,3 +4,4 @@ nose==1.2.1
path.py==3.0
pylint==0.26.0
pytz==2012h
fisher==0.1.4
......@@ -31,9 +31,10 @@ def create(text,score,prompt_string):
prompt_string - the common prompt for the set of essays
"""
algorithm = select_algorithm(score)
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm,
'score' : score, 'text' : text, 'prompt' : prompt_string}
if len(text)!=len(score):
......@@ -42,16 +43,6 @@ def create(text,score,prompt_string):
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
except:
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
......@@ -61,12 +52,12 @@ def create(text,score,prompt_string):
log.exception(msg)
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm = algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['algorithm'] = type
results['algorithm'] = algorithm
results['success']=True
except:
msg = "feature extraction and model creation failed."
......@@ -86,6 +77,7 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm - the type of algorithm that will be used
"""
algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
......@@ -98,7 +90,7 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train")
pset = predictor_set.PredictorSet(essaytype="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
except:
......@@ -119,4 +111,17 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
results['errors'].append(msg)
log.exception(msg)
return results
\ No newline at end of file
return results
def select_algorithm(score_list):
#Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score_list)))>5:
algorithm = util_functions.AlgorithmTypes.regression
else:
algorithm = util_functions.AlgorithmTypes.classification
except:
algorithm = util_functions.AlgorithmTypes.regression
return algorithm
\ No newline at end of file
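A minimal sketch of the new select_algorithm() helper factored out above (the import path is an assumption, not part of this diff; the behavior mirrors the function body): more than five unique score points selects regression, otherwise classification, and create()/create_generic() now record the choice in results['algorithm'].
# Illustrative sketch only: the import path is assumed.
from ease import create
create.select_algorithm([1, 2, 3, 1, 2])        # 3 unique score points -> AlgorithmTypes.classification
create.select_algorithm([0, 1, 2, 3, 4, 5, 6])  # more than 5 unique points -> AlgorithmTypes.regression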
......@@ -15,32 +15,33 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
base_path = base_path + "/"
log=logging.getLogger(__name__)
log = logging.getLogger(__name__)
MAXIMUM_ESSAY_LENGTH = 20000
MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object):
def __init__(self, type="train"):
def __init__(self, essaytype="train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
if(essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = type
self._score=[]
self._text=[]
self._id=[]
self._clean_text=[]
self._tokens=[]
self._pos=[]
self._clean_stem_text=[]
self._type = essaytype
self._score = []
self._text = []
self._id = []
self._clean_text = []
self._tokens = []
self._pos = []
self._clean_stem_text = []
self._generated = []
self._prompt = ""
self._spelling_errors=[]
self._markup_text=[]
self._spelling_errors = []
self._markup_text = []
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
......@@ -58,35 +59,35 @@ class EssaySet(object):
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
essay_text=essay_text.encode('ascii', 'ignore')
if len(essay_text)<5:
essay_text="Invalid essay."
essay_text = essay_text.encode('ascii', 'ignore')
if len(essay_text) < 5:
essay_text = "Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
#Try conversion of types
essay_score=int(essay_score)
essay_text=str(essay_text)
# Try conversion of types
essay_score = int(essay_score)
essay_text = str(essay_text)
except:
#Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
# Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
            # Clean text by removing non digit/word/punctuation characters
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
essay_text = str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
cleaned_essay = util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
......@@ -112,21 +113,21 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(type(prompt_text) == type("text")):
if(isinstance(prompt_text, basestring)):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret
def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
def generate_additional_essays(self, e_text, e_score, dictionary=None, max_syns=3):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
dict is a fixed dictionary (list) of words to replace.
dictionary is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
"""
random.seed(1)
......@@ -141,8 +142,8 @@ class EssaySet(object):
for i in range(0, max_syns):
syn_toks = e_toks
for z in range(0, len(e_toks)):
if len(all_syns[z]) > i and (dict == None or e_toks[z] in dict):
if len(all_syns[z]) > i and (dictionary == None or e_toks[z] in dictionary):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
self.add_essay(new_essays[z], e_score, 1)
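A small usage sketch of the renamed keyword arguments (sample strings are invented; the renames avoid shadowing the type and dict builtins, and the import matches the one used elsewhere in this repo):
# Hypothetical usage; only the signatures come from the code above.
from essay_set import EssaySet
es = EssaySet(essaytype="train")                       # was EssaySet(type="train")
es.add_essay("A short sample essay about photosynthesis.", 2)
es.update_prompt("Explain how plants make their own food.")
es.generate_additional_essays(es._clean_text[0], 2, dictionary=None)  # was dict=None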
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
#! /usr/bin/env python
##############################################################################
# Following functions have been taken from the DendroPy library from:
##
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.txt" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
this returns the probability of observing `x` items of class M when sampling
`k` times without replacement from the entire population (i.e., {M,N})
p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
"""
# following fails with 'OverflowError: long int too large to convert to
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
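A quick numeric check of the formula in the docstring (illustrative only): with m=2, n=3, k=2 and x=1, choose(2,1)*choose(3,1)/choose(5,2) = 6/10, and the log-space version above returns the same value while avoiding the overflow noted in the comment.
# Example check, not part of the library code.
assert abs(hypergeometric_pmf(1, 2, 3, 2) - 0.6) < 1e-9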
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
assert s <= population
assert population > -1
if s == population:
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this calculates the sum of the probability of this table and all others
more extreme under the null hypothesis that there is no association between
the categories represented by the vertical and horizontal axes.
"""
def probability_of_table(table):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this returns the probability of this table under the null hypothesis of
no association between rows and columns, which was shown by Fisher to be
a hypergeometric distribution:
p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
"""
a = table[0][0]
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
self.table = table
self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
self.min_value = min(self.flat_table)
self.max_value = max(self.flat_table)
def _rotate_cw(self, table):
"""
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
def _min_rotation(self):
"""
Returns copy of self.table such that the smallest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.min_value:
table = self._rotate_cw(table)
return table
def _max_rotation(self):
"""
Returns copy of self.table such that the largest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.max_value:
table = self._rotate_cw(table)
return table
def _sum_left_tail(self):
# left_tail_tables = self._get_left_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
p_vals = self._get_left_tail_probs()
return sum(p_vals)
def _sum_right_tail(self):
# right_tail_tables = self._get_right_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
p_vals = self._get_right_tail_probs()
return sum(p_vals)
def _get_left_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_right_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_left_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
left_tail_tables = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
left_tail_tables.append([list(table[0]), list(table[1])])
return left_tail_tables
def _get_right_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
right_tail_tables = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
right_tail_tables.append([list(table[0]), list(table[1])])
return right_tail_tables
def left_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_left_tail()
def right_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_right_tail()
def two_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
p0 = self.probability_of_table(self.table)
all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
p_vals = []
for p in all_p_vals:
if p <= p0:
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
if __name__ == "__main__":
table = [[12, 5], [29, 2]]
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
\ No newline at end of file
......@@ -8,24 +8,25 @@ import os
import numpy
import logging
#Append sys to base path to import the following modules
# Append the base path to sys.path so the modules below can be imported
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
#Depend on base path to be imported
# Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
# Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
def grade(grader_data,submission):
def grade(grader_data, submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
......@@ -38,73 +39,75 @@ def grade(grader_data,submission):
submission - The student submission (string)
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
# Initialize result dictionary
results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
has_error = False
grader_set=EssaySet(type="test")
grader_set = EssaySet(essaytype="test")
feedback = {}
#This is to preserve legacy functionality
# This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
# Try to add essay to essay set object
grader_set.add_essay(str(submission), 0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
# Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
results['score']=int(grader_data['model'].predict(grader_feats)[0])
except :
grader_feats = grader_data['extractor'].gen_feats(grader_set)
feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
results['score'] = int(grader_data['model'].predict(grader_feats)[0])
except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
has_error = True
#Try to determine confidence level
# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
# If the essay is just a copy of the prompt, return a 0 as the score
if('too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
results['score'] = 0
results['correct'] = False
results['success']=True
results['success'] = True
#Generate short form output--number of problem areas identified in feedback
# Generate short form output--number of problem areas identified in feedback
#Add feedback to results if available
# Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
'topicality': feedback['topicality'],
'prompt-overlap': feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
'spelling': feedback['spelling'],
'grammar': feedback['grammar'],
'markup-text': feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
# If error, success is False.
results['success'] = False
return results
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
......@@ -116,34 +119,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
has_error=False
has_error = False
#Try to find and load the model file
# Try to find and load the model file
grader_set=predictor_set.PredictorSet(type="test")
grader_set = predictor_set.PredictorSet(essaytype="test")
#Try to add essays to essay set object
# Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
grader_set.add_row(numeric_features, textual_features, 0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
has_error=True
has_error = True
#Try to extract features from submission and assign score via the model
# Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
results['score']=grader_data['model'].predict(grader_feats)[0]
except :
grader_feats = grader_data['extractor'].gen_feats(grader_set)
results['score'] = grader_data['model'].predict(grader_feats)[0]
except:
results['errors'].append("Could not extract features and score essay.")
has_error=True
has_error = True
#Try to determine confidence level
# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
......@@ -151,7 +154,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return results
def get_confidence_value(algorithm,model,grader_feats,score, scores):
def get_confidence_value(algorithm, model, grader_feats, score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
......@@ -163,7 +167,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
max_score=max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
raw_confidence = model.predict_proba(grader_feats)[0, (float(score) -float(min_score))]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
elif hasattr(model, "predict"):
......@@ -173,4 +177,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
confidence = 0
return confidence
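A sketch of what the classification branch of get_confidence_value() reads out of predict_proba (the probability row is invented; only the indexing convention comes from the code above): the row holds one probability per score point, and the entry at score - min_score is the confidence assigned to the predicted score.
# Hypothetical values: score points 0..3, predicted score 2, min_score 0.
import numpy
proba = numpy.array([[0.05, 0.10, 0.70, 0.15]])   # shape of model.predict_proba(grader_feats)
raw_confidence = proba[0, 2 - 0]                   # 0.70, the probability of the predicted score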
......@@ -27,12 +27,12 @@ def read_in_test_data(filename):
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
"""
id, e_set, score, score2, text = [], [], [], [], []
tid, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
id.append(int(id1))
tid1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
tid.append(int(tid1))
text.append(text1)
e_set.append(int(set1))
score.append(int(score1))
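For reference, an invented line in the layout read_in_test_data() expects (the first line of the file is skipped, so it can hold headers): tab-separated id, essay set, score, a second score, and the essay text.
# Illustrative only: one data row in the expected tab-delimited format.
line = "2\t1\t3\t3\tThe essay text for this row goes here."
tid1, set1, score1, score12, text1 = line.strip().split("\t")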
......@@ -109,12 +109,12 @@ def get_cv_error(clf,feats,scores):
return results
def get_algorithms(type):
def get_algorithms(algorithm):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
    algorithm - one of util_functions.AlgorithmTypes
"""
if type == util_functions.AlgorithmTypes.classification:
if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
......@@ -127,7 +127,7 @@ def get_algorithms(type):
return clf, clf2
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
......@@ -141,7 +141,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(type)
clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
......@@ -149,6 +149,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score = predictor_set._target
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
......@@ -156,7 +157,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
return f, clf, cv_error_results
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
......@@ -171,11 +172,11 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
type = util_functions.AlgorithmTypes.regression
algorithm = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
algorithm = util_functions.AlgorithmTypes.classification
clf,clf2 = get_algorithms(type)
clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
......@@ -205,7 +206,7 @@ def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_arra
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
essay_set=create_essay_set(text_score,prompt)
essay_set=create_essay_set(text,score,prompt)
feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
dump_model_to_file(prompt,feature_ext,clf,model_path)
......
......@@ -16,14 +16,14 @@ if not base_path.endswith("/"):
log=logging.getLogger(__name__)
class PredictorSet(object):
def __init__(self, type = "train"):
def __init__(self, essaytype = "train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
if(essaytype != "train" and essaytype != "test"):
essaytype = "train"
self._type = type
self._type = essaytype
self._target=[]
self._textual_features=[]
self._numeric_features=[]
......@@ -85,7 +85,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
self._essay_sets.append(essay_set.EssaySet(type=self._type))
self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))
#Add numeric and textual features
self._numeric_features.append(numeric_features)
......
......@@ -56,7 +56,7 @@ class ModelCreator():
self.text = text
#Governs which creation function in the ease.create module to use. See module for info.
if isinstance(text[0], basestring):
if isinstance(text, list):
self.create_model_generic = False
else:
self.create_model_generic = True
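The isinstance check above is what now routes input to the standard or the generic create path; a sketch of the two accepted shapes (field names match the test further down, the values are invented):
# A plain list of essay texts -> create_model_generic = False (standard essay model).
standard_text = ["The first sample essay.", "The second sample essay."]
# A dict of predictors -> create_model_generic = True (generic model).
generic_text = {
    'textual_values': [["The first sample essay."], ["The second sample essay."]],
    'numeric_values': [[1], [1]],
}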
......@@ -75,7 +75,7 @@ class Grader():
if isinstance(submission, basestring):
return grade.grade(self.model_data, submission)
else:
return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))
return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
class GenericTest(object):
loader = DataLoader
......@@ -121,6 +121,25 @@ class GenericTest(object):
self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
self.assertLessEqual(cv_mae, self.expected_mae_max)
def test_generic_model_creation_and_grading(self):
score_subset = [random.randint(0,100) for i in xrange(0,min([QUICK_TEST_LIMIT, len(self.scores)]))]
text_subset = self.text[:QUICK_TEST_LIMIT]
text_subset = {
'textual_values' : [[t] for t in text_subset],
'numeric_values' : [[1] for i in xrange(0,len(text_subset))]
}
model_creator = ModelCreator(score_subset, text_subset)
results = model_creator.create_model()
self.assertTrue(results['success'])
grader = Grader(results)
test_text = {
'textual_values' : [[self.text[0]]],
'numeric_values' : [[1]]
}
grader.grade(test_text)
self.assertTrue(results['success'])
class PolarityTest(unittest.TestCase,GenericTest):
loader = PolarityLoader
data_path = "data/polarity"
......@@ -132,3 +151,5 @@ class PolarityTest(unittest.TestCase,GenericTest):
def setUp(self):
self.generic_setup()
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from external_code.fisher import fisher
from fisher import pvalue
aspell_path = "aspell"
import re
......@@ -211,8 +211,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
fish_val = fisher.FishersExactTest.probability_of_table(
[[good_loop_present, bad_loop_present], [good_loop_missing, bad_loop_missing]])
fish_val = pvalue(good_loop_present, bad_loop_present, good_loop_missing, bad_loop_missing).two_tail
fish_vals.append(fish_val)
cutoff = 1
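The call above is the drop-in for the removed FishersExactTest module: the fisher package added to requirements.txt exposes pvalue(), which takes the four cells of the 2x2 table, and its two_tail attribute replaces probability_of_table here. A minimal sketch, reusing the table from the removed module's self-test:
# Sketch of the replacement call; the table values come from the removed fisher.py self-test.
from fisher import pvalue
fish_val = pvalue(12, 5, 29, 2).two_tail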
......@@ -382,6 +381,8 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
rater_a = [int(a) for a in rater_a]
rater_b = [int(b) for b in rater_b]
if min_rating is None:
min_rating = min(rater_a)
if max_rating is None:
......@@ -400,6 +401,7 @@ def histogram(ratings, min_rating=None, max_rating=None):
ratings is a list of scores
Returns a list of frequencies
"""
ratings = [int(r) for r in ratings]
if min_rating is None:
min_rating = min(ratings)
if max_rating is None:
......@@ -450,8 +452,7 @@ def get_separator_words(toks1):
tok1_total = tab_toks1._N
tok2_present = toks2[word]
tok2_total = toks2._N
fish_val = fisher.FishersExactTest.probability_of_table(
[[tok1_present, tok2_present], [tok1_total, tok2_total]])
fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
sep_words.append(word)
sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
......