Commit f238e7db by Vik Paruchuri

Remove fisher, fix generic model generation, update tests

parent 5c6a7ad7
@@ -10,4 +10,5 @@ machine_learning.egg-info/
ease.egg-info/
*.egg
.coverage
+*.orig
@@ -4,3 +4,4 @@ nose==1.2.1
path.py==3.0
pylint==0.26.0
pytz==2012h
+fisher==0.1.4
@@ -31,9 +31,10 @@ def create(text,score,prompt_string):
prompt_string - the common prompt for the set of essays
"""
+algorithm = select_algorithm(score)
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
-'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
+'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm,
'score' : score, 'text' : text, 'prompt' : prompt_string}
if len(text)!=len(score):
@@ -42,16 +43,6 @@ def create(text,score,prompt_string):
log.exception(msg)
return results
-#Decide what algorithm to use (regression or classification)
-try:
-#Count the number of unique score points in the score list
-if len(util_functions.f7(list(score)))>5:
-type = util_functions.AlgorithmTypes.regression
-else:
-type = util_functions.AlgorithmTypes.classification
-except:
-type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
@@ -61,12 +52,12 @@ def create(text,score,prompt_string):
log.exception(msg)
try:
#Gets features from the essay set and computes error
-feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
+feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm = algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
-results['algorithm'] = type
+results['algorithm'] = algorithm
results['success']=True
except:
msg = "feature extraction and model creation failed."
@@ -86,6 +77,7 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm - the type of algorithm that will be used
"""
+algorithm = select_algorithm(target)
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
@@ -98,7 +90,7 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
-pset = predictor_set.PredictorSet(type="train")
+pset = predictor_set.PredictorSet(essaytype="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
except:
@@ -119,4 +111,17 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
results['errors'].append(msg)
log.exception(msg)
return results
\ No newline at end of file
+def select_algorithm(score_list):
+#Decide what algorithm to use (regression or classification)
+try:
+#Count the number of unique score points in the score list
+if len(util_functions.f7(list(score_list)))>5:
+algorithm = util_functions.AlgorithmTypes.regression
+else:
+algorithm = util_functions.AlgorithmTypes.classification
+except:
+algorithm = util_functions.AlgorithmTypes.regression
+return algorithm
\ No newline at end of file
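The inline regression-vs-classification decision removed from create() above now lives in the new select_algorithm() helper, which both create() and create_generic() call on their score/target lists. Below is a minimal standalone sketch of the same heuristic; it assumes util_functions.f7 is an order-preserving deduplication helper and that AlgorithmTypes is a simple enum-like holder, which is how they appear to be used in this diff.

# Illustrative sketch only (not part of the commit).
class AlgorithmTypes(object):
    regression = "regression"
    classification = "classification"

def f7(seq):
    # Order-preserving deduplication, as f7 appears to be used here.
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]

def select_algorithm(score_list):
    # More than 5 distinct score points: treat scoring as regression,
    # otherwise as classification over a small set of labels.
    try:
        if len(f7(list(score_list))) > 5:
            return AlgorithmTypes.regression
        return AlgorithmTypes.classification
    except Exception:
        return AlgorithmTypes.regression

# e.g. select_algorithm([0, 1, 2, 3]) -> classification; select_algorithm(range(0, 20)) -> regression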
@@ -15,32 +15,33 @@ sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
-base_path=base_path+"/"
+base_path = base_path + "/"
-log=logging.getLogger(__name__)
+log = logging.getLogger(__name__)
-MAXIMUM_ESSAY_LENGTH=20000
+MAXIMUM_ESSAY_LENGTH = 20000
class EssaySet(object):
-def __init__(self, type="train"):
+def __init__(self, essaytype="train"):
"""
Initialize variables and check essay set type
"""
-if(type != "train" and type != "test"):
+if(essaytype != "train" and essaytype != "test"):
-type = "train"
+essaytype = "train"
-self._type = type
+self._type = essaytype
-self._score=[]
+self._score = []
-self._text=[]
+self._text = []
-self._id=[]
+self._id = []
-self._clean_text=[]
+self._clean_text = []
-self._tokens=[]
+self._tokens = []
-self._pos=[]
+self._pos = []
-self._clean_stem_text=[]
+self._clean_stem_text = []
self._generated = []
self._prompt = ""
-self._spelling_errors=[]
+self._spelling_errors = []
-self._markup_text=[]
+self._markup_text = []
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
@@ -58,35 +59,35 @@ class EssaySet(object):
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
-essay_text=essay_text.encode('ascii', 'ignore')
+essay_text = essay_text.encode('ascii', 'ignore')
-if len(essay_text)<5:
+if len(essay_text) < 5:
-essay_text="Invalid essay."
+essay_text = "Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
-#Try conversion of types
+# Try conversion of types
-essay_score=int(essay_score)
+essay_score = int(essay_score)
-essay_text=str(essay_text)
+essay_text = str(essay_text)
except:
-#Nothing needed here, will return error in any case.
+# Nothing needed here, will return error in any case.
-log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
+log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))
-if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
+if isinstance(essay_score, int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
# Clean text by removing non digit/work/punctuation characters
try:
-essay_text=str(essay_text.encode('ascii', 'ignore'))
+essay_text = str(essay_text.encode('ascii', 'ignore'))
except:
-essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
+essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
-cleaned_essay=util_functions.sub_chars(essay_text).lower()
+cleaned_essay = util_functions.sub_chars(essay_text).lower()
-if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
+if(len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
-cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
+cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
-cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
+cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
@@ -112,21 +113,21 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
-if(type(prompt_text) == type("text")):
+if(isinstance(prompt_text, basestring)):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret
-def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
+def generate_additional_essays(self, e_text, e_score, dictionary=None, max_syns=3):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
-dict is a fixed dictionary (list) of words to replace.
+dictionary is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
"""
random.seed(1)
@@ -141,8 +142,8 @@ class EssaySet(object):
for i in range(0, max_syns):
syn_toks = e_toks
for z in range(0, len(e_toks)):
-if len(all_syns[z]) > i and (dict == None or e_toks[z] in dict):
+if len(all_syns[z]) > i and (dictionary == None or e_toks[z] in dictionary):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
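Most of the churn above is PEP 8 spacing plus renaming parameters that shadowed Python built-ins: type becomes essaytype and dict becomes dictionary (id becomes tid in the test-data reader further down). A tiny illustrative sketch of why the rename matters, using hypothetical names:

def describe(value, essaytype="train"):
    # With the parameter no longer named 'type', the built-in type() is still
    # callable inside the body (a parameter named 'type' would shadow it).
    return "{0} ({1})".format(type(value).__name__, essaytype)

print(describe("an essay"))  # "str (train)"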
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
#! /usr/bin/env python
##############################################################################
# Following functions have been taken from the DendroPy library from:
##
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.txt" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
this returns the probability of observing `x` items of class M when sampling
`k` times without replacement from the entire population (i.e., {M,N})
p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
"""
# following fails with 'OverflowError: long int too large to convert to
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
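# Illustrative check (not part of the original module): the log-space computation
# above is just a numerically safer evaluation of
# p(x) = C(m, x) * C(n, k - x) / C(m + n, k).
import math

def _choose(n, r):
    # Small helper used only for this check.
    return math.factorial(n) // (math.factorial(r) * math.factorial(n - r))

# Population: 2 items of class M and 3 of class N; sample k = 2 without replacement.
# P(x = 1 item of class M) = C(2,1) * C(3,1) / C(5,2) = 6 / 10 = 0.6
assert abs(float(_choose(2, 1) * _choose(3, 1)) / _choose(5, 2) - 0.6) < 1e-12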
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
assert s <= population
assert population > -1
if s == population:
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this calculates the sum of the probability of this table and all others
more extreme under the null hypothesis that there is no association between
the categories represented by the vertical and horizontal axes.
"""
def probability_of_table(table):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this returns the probability of this table under the null hypothesis of
no association between rows and columns, which was shown by Fisher to be
a hypergeometric distribution:
p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
"""
a = table[0][0]
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
self.table = table
self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
self.min_value = min(self.flat_table)
self.max_value = max(self.flat_table)
def _rotate_cw(self, table):
"""
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
def _min_rotation(self):
"""
Returns copy of self.table such that the smallest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.min_value:
table = self._rotate_cw(table)
return table
def _max_rotation(self):
"""
Returns copy of self.table such that the largest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.max_value:
table = self._rotate_cw(table)
return table
def _sum_left_tail(self):
# left_tail_tables = self._get_left_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
p_vals = self._get_left_tail_probs()
return sum(p_vals)
def _sum_right_tail(self):
# right_tail_tables = self._get_right_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
p_vals = self._get_right_tail_probs()
return sum(p_vals)
def _get_left_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_right_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_left_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
left_tail_tables = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
left_tail_tables.append([list(table[0]), list(table[1])])
return left_tail_tables
def _get_right_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
right_tail_tables = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
right_tail_tables.append([list(table[0]), list(table[1])])
return right_tail_tables
def left_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_left_tail()
def right_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_right_tail()
def two_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
p0 = self.probability_of_table(self.table)
all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
p_vals = []
for p in all_p_vals:
if p <= p0:
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
if __name__ == "__main__":
table = [[12, 5], [29, 2]]
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
\ No newline at end of file
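The deleted module above ships its own self-test. As a cross-check (an assumption, since scipy is not a dependency of this repository), scipy.stats.fisher_exact with alternative='two-sided' uses the same sum-of-no-more-probable-tables definition and should agree with the two_tail_p value asserted above:

from scipy import stats

# Same 2x2 table as the self-test above; p_two_sided should be roughly 0.0803.
odds_ratio, p_two_sided = stats.fisher_exact([[12, 5], [29, 2]], alternative='two-sided')
print(p_two_sided)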
@@ -8,24 +8,25 @@ import os
import numpy
import logging
-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
-#Depend on base path to be imported
+# Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
-#Imports needed to unpickle grader data
+# Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
-def grade(grader_data,submission):
+def grade(grader_data, submission):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
@@ -38,73 +39,75 @@ def grade(grader_data,submission):
submission - The student submission (string)
"""
-#Initialize result dictionary
+# Initialize result dictionary
-results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
+results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
-has_error=False
+has_error = False
-grader_set=EssaySet(type="test")
+grader_set = EssaySet(essaytype="test")
+feedback = {}
-#This is to preserve legacy functionality
+# This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
-#Try to add essay to essay set object
+# Try to add essay to essay set object
-grader_set.add_essay(str(submission),0)
+grader_set.add_essay(str(submission), 0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
-has_error=True
+has_error = True
-#Try to extract features from submission and assign score via the model
+# Try to extract features from submission and assign score via the model
try:
-grader_feats=grader_data['extractor'].gen_feats(grader_set)
+grader_feats = grader_data['extractor'].gen_feats(grader_set)
-feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
+feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
-results['score']=int(grader_data['model'].predict(grader_feats)[0])
+results['score'] = int(grader_data['model'].predict(grader_feats)[0])
-except :
+except:
results['errors'].append("Could not extract features and score essay.")
-has_error=True
+has_error = True
-#Try to determine confidence level
+# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
-#If there is an error getting confidence, it is not a show-stopper, so just log
+# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
-#If the essay is just a copy of the prompt, return a 0 as the score
+# If the essay is just a copy of the prompt, return a 0 as the score
-if(feedback['too_similar_to_prompt']):
+if('too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
-results['score']=0
+results['score'] = 0
-results['correct']=False
+results['correct'] = False
-results['success']=True
+results['success'] = True
-#Generate short form output--number of problem areas identified in feedback
+# Generate short form output--number of problem areas identified in feedback
-#Add feedback to results if available
+# Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
-'topicality' : feedback['topicality'],
+'topicality': feedback['topicality'],
-'prompt-overlap' : feedback['prompt_overlap'],
+'prompt-overlap': feedback['prompt_overlap'],
})
results['feedback'].update(
{
-'spelling' : feedback['spelling'],
+'spelling': feedback['spelling'],
-'grammar' : feedback['grammar'],
+'grammar': feedback['grammar'],
-'markup-text' : feedback['markup_text'],
+'markup-text': feedback['markup_text'],
}
)
else:
-#If error, success is False.
+# If error, success is False.
-results['success']=False
+results['success'] = False
return results
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
@@ -116,34 +119,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
-results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
+results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
-has_error=False
+has_error = False
-#Try to find and load the model file
+# Try to find and load the model file
-grader_set=predictor_set.PredictorSet(type="test")
+grader_set = predictor_set.PredictorSet(essaytype="test")
-#Try to add essays to essay set object
+# Try to add essays to essay set object
try:
-grader_set.add_row(numeric_features, textual_features,0)
+grader_set.add_row(numeric_features, textual_features, 0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
-has_error=True
+has_error = True
-#Try to extract features from submission and assign score via the model
+# Try to extract features from submission and assign score via the model
try:
-grader_feats=grader_data['extractor'].gen_feats(grader_set)
+grader_feats = grader_data['extractor'].gen_feats(grader_set)
-results['score']=grader_data['model'].predict(grader_feats)[0]
+results['score'] = grader_data['model'].predict(grader_feats)[0]
-except :
+except:
results['errors'].append("Could not extract features and score essay.")
-has_error=True
+has_error = True
-#Try to determine confidence level
+# Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
-#If there is an error getting confidence, it is not a show-stopper, so just log
+# If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
@@ -151,7 +154,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return results
-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
@@ -163,7 +167,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
max_score=max(numpy.asarray(scores))
if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
#If classification, predict with probability, which gives you a matrix of confidences per score point
-raw_confidence=model.predict_proba(grader_feats)[0,(float(score)-float(min_score))]
+raw_confidence = model.predict_proba(grader_feats)[0, (float(score) -float(min_score))]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
elif hasattr(model, "predict"):
@@ -173,4 +177,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
confidence = 0
return confidence
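In the classification branch of get_confidence_value() above, the confidence is simply the predicted-class probability read out of predict_proba at the column offset by min_score. A minimal sketch of that indexing with hypothetical numbers (the real code gets the matrix from a fitted sklearn model):

import numpy

# Hypothetical predict_proba output for one essay over score points 1..4 (min_score = 1).
probabilities = numpy.array([[0.05, 0.10, 0.60, 0.25]])
score, min_score = 3, 1

# Same lookup as the classification branch above: row 0, column (score - min_score).
confidence = probabilities[0, int(score - min_score)]
assert abs(confidence - 0.60) < 1e-12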
@@ -27,12 +27,12 @@ def read_in_test_data(filename):
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
"""
-id, e_set, score, score2, text = [], [], [], [], []
+tid, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
-id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
+tid1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
-id.append(int(id1))
+tid.append(int(tid1))
text.append(text1)
e_set.append(int(set1))
score.append(int(score1))
@@ -109,12 +109,12 @@ def get_cv_error(clf,feats,scores):
return results
-def get_algorithms(type):
+def get_algorithms(algorithm):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
type - one of util_functions.AlgorithmTypes
"""
-if type == util_functions.AlgorithmTypes.classification:
+if algorithm == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
@@ -127,7 +127,7 @@ def get_algorithms(type):
return clf, clf2
-def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
+def extract_features_and_generate_model_predictors(predictor_set, algorithm=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
@@ -141,7 +141,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
train_feats = f.gen_feats(predictor_set)
-clf,clf2 = get_algorithms(type)
+clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
@@ -149,6 +149,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
+set_score = predictor_set._target
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
@@ -156,7 +157,7 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
return f, clf, cv_error_results
-def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
+def extract_features_and_generate_model(essays, algorithm=util_functions.AlgorithmTypes.regression):
"""
Feed in an essay set to get feature vector and classifier
essays must be an essay set object
@@ -171,11 +172,11 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
set_score = numpy.asarray(essays._score, dtype=numpy.int)
if len(util_functions.f7(list(set_score)))>5:
-type = util_functions.AlgorithmTypes.regression
+algorithm = util_functions.AlgorithmTypes.regression
else:
-type = util_functions.AlgorithmTypes.classification
+algorithm = util_functions.AlgorithmTypes.classification
-clf,clf2 = get_algorithms(type)
+clf,clf2 = get_algorithms(algorithm)
cv_error_results=get_cv_error(clf2,train_feats,essays._score)
@@ -205,7 +206,7 @@ def create_essay_set_and_dump_model(text,score,prompt,model_path,additional_arra
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
-essay_set=create_essay_set(text_score,prompt)
+essay_set=create_essay_set(text,score,prompt)
feature_ext,clf=extract_features_and_generate_model(essay_set,additional_array)
dump_model_to_file(prompt,feature_ext,clf,model_path)
...
@@ -16,14 +16,14 @@ if not base_path.endswith("/"):
log=logging.getLogger(__name__)
class PredictorSet(object):
-def __init__(self, type = "train"):
+def __init__(self, essaytype = "train"):
"""
Initialize variables and check essay set type
"""
-if(type != "train" and type != "test"):
+if(essaytype != "train" and essaytype != "test"):
-type = "train"
+essaytype = "train"
-self._type = type
+self._type = essaytype
self._target=[]
self._textual_features=[]
self._numeric_features=[]
@@ -85,7 +85,7 @@ class PredictorSet(object):
#Create essay sets for textual features if needed
if len(self._textual_features)==0:
for i in xrange(0,len(textual_features)):
-self._essay_sets.append(essay_set.EssaySet(type=self._type))
+self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))
#Add numeric and textual features
self._numeric_features.append(numeric_features)
...
@@ -56,7 +56,7 @@ class ModelCreator():
self.text = text
#Governs which creation function in the ease.create module to use. See module for info.
-if isinstance(text[0], basestring):
+if isinstance(text, list):
self.create_model_generic = False
else:
self.create_model_generic = True
@@ -75,7 +75,7 @@ class Grader():
if isinstance(submission, basestring):
return grade.grade(self.model_data, submission)
else:
-return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))
+return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
class GenericTest(object):
loader = DataLoader
@@ -121,6 +121,25 @@ class GenericTest(object):
self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
self.assertLessEqual(cv_mae, self.expected_mae_max)
+def test_generic_model_creation_and_grading(self):
+score_subset = [random.randint(0,100) for i in xrange(0,min([QUICK_TEST_LIMIT, len(self.scores)]))]
+text_subset = self.text[:QUICK_TEST_LIMIT]
+text_subset = {
+'textual_values' : [[t] for t in text_subset],
+'numeric_values' : [[1] for i in xrange(0,len(text_subset))]
+}
+model_creator = ModelCreator(score_subset, text_subset)
+results = model_creator.create_model()
+self.assertTrue(results['success'])
+grader = Grader(results)
+test_text = {
+'textual_values' : [[self.text[0]]],
+'numeric_values' : [[1]]
+}
+grader.grade(test_text)
+self.assertTrue(results['success'])
class PolarityTest(unittest.TestCase,GenericTest):
loader = PolarityLoader
data_path = "data/polarity"
@@ -132,3 +151,5 @@ class PolarityTest(unittest.TestCase,GenericTest):
def setUp(self):
self.generic_setup()
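The new test_generic_model_creation_and_grading exercises the generic path end to end: ModelCreator receives a dict of per-row textual_values and numeric_values, and Grader.grade() is handed a dict of the same shape for one row, which now routes to grade.grade_generic() via the 'numeric_values'/'textual_values' keys fixed above. A minimal sketch of those shapes, with hypothetical data:

# Hypothetical inputs mirroring the shapes used by the new test (illustration only).
training_input = {
    'textual_values': [["first response text"], ["second response text"]],  # one list of strings per row
    'numeric_values': [[1], [2]],                                           # one list of numbers per row
}
scores = [0, 100]

# A single submission: because it is a dict rather than a string, Grader.grade()
# dispatches it to grade.grade_generic().
submission = {
    'textual_values': [["first response text"]],
    'numeric_values': [[1]],
}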
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
-from external_code.fisher import fisher
+from fisher import pvalue
aspell_path = "aspell"
import re
@@ -211,8 +211,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
-fish_val = fisher.FishersExactTest.probability_of_table(
-    [[good_loop_present, bad_loop_present], [good_loop_missing, bad_loop_missing]])
+fish_val = pvalue(good_loop_present, bad_loop_present, good_loop_missing, bad_loop_missing).two_tail
fish_vals.append(fish_val)
cutoff = 1
@@ -382,6 +381,8 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
See quadratic_weighted_kappa for argument descriptions
"""
assert(len(rater_a) == len(rater_b))
+rater_a = [int(a) for a in rater_a]
+rater_b = [int(b) for b in rater_b]
if min_rating is None:
min_rating = min(rater_a)
if max_rating is None:
@@ -400,6 +401,7 @@ def histogram(ratings, min_rating=None, max_rating=None):
ratings is a list of scores
Returns a list of frequencies
"""
+ratings = [int(r) for r in ratings]
if min_rating is None:
min_rating = min(ratings)
if max_rating is None:
@@ -450,8 +452,7 @@ def get_separator_words(toks1):
tok1_total = tab_toks1._N
tok2_present = toks2[word]
tok2_total = toks2._N
-fish_val = fisher.FishersExactTest.probability_of_table(
-    [[tok1_present, tok2_present], [tok1_total, tok2_total]])
+fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
if(fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
sep_words.append(word)
sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]
...
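The import swap at the top of this file is the heart of the commit: the vendored external_code.fisher module deleted above is replaced by the fisher package added to requirements (fisher==0.1.4). Note the call sites also change meaning slightly: the old code stored the probability of the single observed table, while the new code stores a two-tailed p-value. A short sketch of the new API as this diff uses it, assuming the pvalue result exposes left_tail, right_tail and two_tail attributes:

from fisher import pvalue

# 2x2 table passed as pvalue(a, b, c, d), i.e. [[a, b], [c, d]], matching the
# argument order used in get_vocab() and get_separator_words() above.
result = pvalue(12, 5, 29, 2)
fish_val = result.two_tail   # what the updated code appends to fish_vals
left, right = result.left_tail, result.right_tail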