Commit 2e6cb8e5 in edx / ease
Authored Jun 02, 2013 by Hugh Brown; committed by Vik Paruchuri on Jun 06, 2013
./grade.py: W391 blank line at end of file
Parent commit: 0d7ac804
Showing 11 changed files with 256 additions and 232 deletions.
ease/.pep8                              +3    -0
ease/create.py                          +33   -33
ease/essay_set.py                       +31   -31
ease/external_code/fisher/fisher.py     +17   -11
ease/feature_extractor.py               +0    -0
ease/grade.py                           +52   -50
ease/model_creator.py                   +38   -34
ease/predictor_extractor.py             +11   -10
ease/predictor_set.py                   +24   -25
ease/tests/test_model_accuracy.py       +17   -11
ease/util_functions.py                  +30   -27
ease/.pep8 @ 2e6cb8e5 (new file, mode 100644)

[pep8]
ignore=E501,E712,E711
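For orientation: E501 is the line-length check, and E712/E711 flag comparisons to True/False/None written with == or !=, so this config silences those three while leaving everything else enabled, including W391 (trailing blank line at end of file), the warning named in the commit message. A minimal sketch of applying the same ignore list through the pep8 library's Python API, assuming the pep8 package is installed and the command is run from the repository root (the path argument is illustrative, not part of this commit):

    import pep8

    # Mirror ease/.pep8: skip only the long-line and True/False/None comparison checks.
    style = pep8.StyleGuide(ignore=['E501', 'E712', 'E711'])
    report = style.check_files(['ease'])
    print "{0} remaining style issue(s)".format(report.total_errors)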
ease/create.py @ 2e6cb8e5

@@ -7,22 +7,23 @@ import sys
import logging
import numpy

-#Define base path and add to sys path
+# Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
sys.path.append(one_up_path)

-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
import model_creator
import util_functions
import predictor_set
import predictor_extractor

-#Make a log
+# Make a log
log = logging.getLogger(__name__)

-def create(text,score,prompt_string):
+def create(text, score, prompt_string):
    """
    Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
    TODO: Remove model path argument, it is needed for now to support legacy code

@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
    prompt_string - the common prompt for the set of essays
    """

-    #Initialize a results dictionary to return
+    # Initialize a results dictionary to return
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': util_functions.AlgorithmTypes.classification,
               'score': score, 'text': text, 'prompt': prompt_string}

    if len(text) != len(score):
        msg = "Target and text lists must be same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

-    #Decide what algorithm to use (regression or classification)
+    # Decide what algorithm to use (regression or classification)
    try:
-        #Count the number of unique score points in the score list
+        # Count the number of unique score points in the score list
        if len(util_functions.f7(list(score))) > 5:
            type = util_functions.AlgorithmTypes.regression
        else:
            type = util_functions.AlgorithmTypes.classification

@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
        type = util_functions.AlgorithmTypes.regression

    try:
-        #Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
+        # Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
        e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
    try:
-        #Gets features from the essay set and computes error
+        # Gets features from the essay set and computes error
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['algorithm'] = type
        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)

@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
    return results

-def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
+def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
    """
    Creates a model from a generic list numeric values and text values
    numeric_values - A list of lists that are the predictors

@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
    algorithm - the type of algorithm that will be used
    """

-    #Initialize a result dictionary to return.
+    # Initialize a result dictionary to return.
    results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
               'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

    if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
        msg = "Target, numeric features, and text features must all be the same length."
        results['errors'].append(msg)
        log.exception(msg)
        return results

    try:
-        #Initialize a predictor set object that encapsulates all of the text and numeric predictors
+        # Initialize a predictor set object that encapsulates all of the text and numeric predictors
        pset = predictor_set.PredictorSet(type="train")
        for i in xrange(0, len(numeric_values)):
            pset.add_row(numeric_values[i], textual_values[i], target[i])

@@ -107,16 +108,16 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
        log.exception(msg)

    try:
-        #Extract all features and then train a classifier with the features
+        # Extract all features and then train a classifier with the features
        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
        results['cv_kappa'] = cv_error_results['kappa']
        results['cv_mean_absolute_error'] = cv_error_results['mae']
        results['feature_ext'] = feature_ext
        results['classifier'] = classifier
        results['success'] = True
    except:
        msg = "feature extraction and model creation failed."
        results['errors'].append(msg)
        log.exception(msg)

-    return results
\ No newline at end of file
+    return results
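For readers skimming the diff, a minimal usage sketch of the create module as it stands after this commit. The essay texts, scores, and prompt below are invented for illustration, and the import path assumes the package is used as ease.create; a real call needs far more than two essays for the cross-validation estimates to mean anything:

    from ease import create

    # Hypothetical training data: parallel lists of essay texts and integer scores.
    essays = ["The cell wall gives the plant cell its shape.",
              "Photosynthesis turns sunlight into sugar in the leaves."]
    scores = [1, 2]
    prompt = "Explain one way plants make or use energy."

    results = create.create(essays, scores, prompt)
    if results['success']:
        print results['cv_kappa'], results['cv_mean_absolute_error']
    else:
        print results['errors']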
ease/essay_set.py @ 2e6cb8e5

@@ -15,11 +15,12 @@ sys.path.append(base_path)
import util_functions

if not base_path.endswith("/"):
    base_path = base_path + "/"

log = logging.getLogger(__name__)

MAXIMUM_ESSAY_LENGTH = 20000

class EssaySet(object):
    def __init__(self, type="train"):

@@ -30,17 +31,17 @@ class EssaySet(object):
            type = "train"

        self._type = type
        self._score = []
        self._text = []
        self._id = []
        self._clean_text = []
        self._tokens = []
        self._pos = []
        self._clean_stem_text = []
        self._generated = []
        self._prompt = ""
        self._spelling_errors = []
        self._markup_text = []

    def add_essay(self, essay_text, essay_score, essay_generated=0):
        """

@@ -58,35 +59,35 @@ class EssaySet(object):
        # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
        try:
            essay_text = essay_text.encode('ascii', 'ignore')
            if len(essay_text) < 5:
                essay_text = "Invalid essay."
        except:
            log.exception("Could not parse essay into ascii.")

        try:
-            #Try conversion of types
+            # Try conversion of types
            essay_score = int(essay_score)
            essay_text = str(essay_text)
        except:
-            #Nothing needed here, will return error in any case.
+            # Nothing needed here, will return error in any case.
            log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))

        if isinstance(essay_score, int) and isinstance(essay_text, basestring) \
                and (essay_generated == 0 or essay_generated == 1):
            self._id.append(max_id + 1)
            self._score.append(essay_score)
            # Clean text by removing non digit/work/punctuation characters
            try:
                essay_text = str(essay_text.encode('ascii', 'ignore'))
            except:
                essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
            cleaned_essay = util_functions.sub_chars(essay_text).lower()
            if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
                cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
            self._text.append(cleaned_essay)
            # Spell correct text using aspell
            cleaned_text, spell_errors, markup_text = util_functions.spell_correct(self._text[len(self._text) - 1])
            self._clean_text.append(cleaned_text)
            self._spelling_errors.append(spell_errors)
            self._markup_text.append(markup_text)

@@ -112,7 +113,7 @@ class EssaySet(object):
        prompt_text should be a string.
        Returns the prompt as a confirmation.
        """
-        if(type(prompt_text) == type("text")):
+        if (isinstance(prompt_text, type("text"))):
            self._prompt = util_functions.sub_chars(prompt_text)
            ret = self._prompt
        else:

@@ -145,4 +146,4 @@ class EssaySet(object):
                syn_toks[z] = all_syns[z][i]
            new_essays.append(" ".join(syn_toks))
        for z in xrange(0, len(new_essays)):
-            self.add_essay(new_essays[z], e_score, 1)
\ No newline at end of file
+            self.add_essay(new_essays[z], e_score, 1)
ease/external_code/fisher/fisher.py @ 2e6cb8e5

@@ -21,6 +21,8 @@
import math

## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
    """
    Given a population consisting of `m` items of class M and `n` items of class N,

@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
    # float' with large numbers
    # return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
    a = math.log(binomial_coefficient(m, x))
    b = math.log(binomial_coefficient(n, k - x))
    c = math.log(binomial_coefficient(m + n, k))
    return math.exp(a + b - c)

## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
    "Returns `population` choose `sample`."
    s = max(sample, population - sample)

@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
        return 1
    numerator = 1
    denominator = 1
    for i in xrange(s + 1, population + 1):
        numerator *= i
        denominator *= (i - s)
    return numerator / denominator

## From dendropy.mathlib.statistics
class FishersExactTest(object):
    """
    Given a 2x2 table:

@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
        b = table[0][1]
        c = table[1][0]
        d = table[1][1]
        return hypergeometric_pmf(a, a + b, c + d, a + c)
    probability_of_table = staticmethod(probability_of_table)

    def __init__(self, table):

@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
        Returns a copy of table such that all the values
        are rotated clockwise once.
        """
        return [[table[1][0], table[0][0]],
                [table[1][1], table[0][1]]]

    def _min_rotation(self):
        """

@@ -241,8 +247,9 @@ extreme.
            p_vals.append(p)
        return sum(p_vals) + p0

def assert_almost_equal(v1, v2, prec=8):
    if abs(v1 - v2) <= 10 ** (-prec):
        print "OK: {} == {}".format(v1, v2)
    else:
        print "FAIL: {} != {}".format(v1, v2)

@@ -252,4 +259,4 @@ if __name__ == "__main__":
    ft = FishersExactTest(table)
    assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
    assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
-    assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
\ No newline at end of file
+    assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
ease/feature_extractor.py @ 2e6cb8e5
(Diff collapsed in the original view.)
ease/grade.py @ 2e6cb8e5

@@ -8,24 +8,25 @@ import os
import numpy
import logging

-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
base_path = os.path.dirname(__file__)
sys.path.append(base_path)

-#Depend on base path to be imported
+# Depend on base path to be imported
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions

-#Imports needed to unpickle grader data
+# Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math

log = logging.getLogger(__name__)

-def grade(grader_data,submission):
+def grade(grader_data, submission):
    """
    Grades a specified submission using specified models
    grader_data - A dictionary:

@@ -38,73 +39,74 @@ def grade(grader_data,submission):
    submission - The student submission (string)
    """

-    #Initialize result dictionary
+    # Initialize result dictionary
    results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
    has_error = False

    grader_set = EssaySet(type="test")

-    #This is to preserve legacy functionality
+    # This is to preserve legacy functionality
    if 'algorithm' not in grader_data:
        grader_data['algorithm'] = util_functions.AlgorithmTypes.classification

    try:
-        #Try to add essay to essay set object
+        # Try to add essay to essay set object
        grader_set.add_essay(str(submission), 0)
        grader_set.update_prompt(str(grader_data['prompt']))
    except:
        results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
    try:
        grader_feats = grader_data['extractor'].gen_feats(grader_set)
        feedback = grader_data['extractor'].gen_feedback(grader_set, grader_feats)[0]
        results['score'] = int(grader_data['model'].predict(grader_feats)[0])
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
    try:
        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
    except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    if not has_error:
-        #If the essay is just a copy of the prompt, return a 0 as the score
+        # If the essay is just a copy of the prompt, return a 0 as the score
        if (feedback['too_similar_to_prompt']):
            results['score'] = 0
            results['correct'] = False

        results['success'] = True

-        #Generate short form output--number of problem areas identified in feedback
+        # Generate short form output--number of problem areas identified in feedback

-        #Add feedback to results if available
+        # Add feedback to results if available
        results['feedback'] = {}
        if 'topicality' in feedback and 'prompt_overlap' in feedback:
            results['feedback'].update({
                'topicality': feedback['topicality'],
                'prompt-overlap': feedback['prompt_overlap'],
            })

        results['feedback'].update({
            'spelling': feedback['spelling'],
            'grammar': feedback['grammar'],
            'markup-text': feedback['markup_text'],
        })

    else:
-        #If error, success is False.
+        # If error, success is False.
        results['success'] = False

    return results

def grade_generic(grader_data, numeric_features, textual_features):
    """
    Grades a set of numeric and textual features using a generic model

@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
    textual_features - list of textual feature to predict on
    """

    results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
    has_error = False

-    #Try to find and load the model file
+    # Try to find and load the model file
    grader_set = predictor_set.PredictorSet(type="test")

-    #Try to add essays to essay set object
+    # Try to add essays to essay set object
    try:
        grader_set.add_row(numeric_features, textual_features, 0)
    except:
        results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
        has_error = True

-    #Try to extract features from submission and assign score via the model
+    # Try to extract features from submission and assign score via the model
    try:
        grader_feats = grader_data['extractor'].gen_feats(grader_set)
        results['score'] = grader_data['model'].predict(grader_feats)[0]
    except:
        results['errors'].append("Could not extract features and score essay.")
        has_error = True

-    #Try to determine confidence level
+    # Try to determine confidence level
    try:
        results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
    except:
-        #If there is an error getting confidence, it is not a show-stopper, so just log
+        # If there is an error getting confidence, it is not a show-stopper, so just log
        log.exception("Problem generating confidence value")

    if not has_error:

@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
    return results

-def get_confidence_value(algorithm,model,grader_feats,score, scores):
+def get_confidence_value(algorithm, model, grader_feats, score, scores):
    """
    Determines a confidence in a certain score, given proper input parameters
    algorithm- from util_functions.AlgorithmTypes

@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
    max_score = max(numpy.asarray(scores))
    if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
        #If classification, predict with probability, which gives you a matrix of confidences per score point
        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
        #TODO: Normalize confidence somehow here
        confidence = raw_confidence
    elif hasattr(model, "predict"):

@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
        confidence = 0

    return confidence
-
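A companion sketch showing how the dictionary returned by create.create can be rearranged into the grader_data dictionary that grade.grade reads ('model', 'extractor', 'prompt', 'algorithm', 'score'). The key mapping is inferred from the lookups visible in this file and in create.py, so treat it as an assumption rather than a documented API:

    from ease import create, grade

    model = create.create(essays, scores, prompt)  # as in the create.py sketch above

    # grade() looks up the trained artifacts under these keys.
    grader_data = {
        'model': model['classifier'],
        'extractor': model['feature_ext'],
        'prompt': prompt,
        'algorithm': model['algorithm'],
        'score': scores,
    }

    result = grade.grade(grader_data, "Plants use sunlight to build sugars in their leaves.")
    print result['score'], result['confidence']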
ease/model_creator.py @ 2e6cb8e5

-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
import numpy
import re

@@ -19,7 +19,8 @@ import feature_extractor
import logging
import predictor_extractor

log = logging.getLogger()

def read_in_test_data(filename):
    """

@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
    prompt_string = open(filename).read()
    return prompt_string

def read_in_test_data_twocolumn(filename, sep=","):
    """
    Reads in a two column version of the test data.
    Filename must point to a delimited file.

@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
    return x

def get_cv_error(clf, feats, scores):
    """
    Gets cross validated error for a given classifier, set of features, and scores
    clf - classifier
    feats - features to feed into the classified and cross validate over
    scores - scores associated with the features -- feature row 1 associates with score 1, etc.
    """
    results = {'success': False, 'kappa': 0, 'mae': 0}
    try:
        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
        results['mae'] = err
        results['kappa'] = kappa
        results['success'] = True
    except ValueError:
-        #If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
+        # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
        log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
    except:
        log.exception("Error getting cv error estimates.")

    return results

def get_algorithms(type):
    """
    Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.

@@ -116,14 +120,14 @@ def get_algorithms(type):
    """
    if type == util_functions.AlgorithmTypes.classification:
        clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                           max_depth=4, random_state=1, min_samples_leaf=3)
    else:
        clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                         max_depth=4, random_state=1, min_samples_leaf=3)
        clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
    return clf, clf2

@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
    train_feats = f.gen_feats(predictor_set)

    clf, clf2 = get_algorithms(type)
    cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

    try:
        set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results

@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
    train_feats = f.gen_feats(essays)

    set_score = numpy.asarray(essays._score, dtype=numpy.int)
    if len(util_functions.f7(list(set_score))) > 5:
        type = util_functions.AlgorithmTypes.regression
    else:
        type = util_functions.AlgorithmTypes.classification

    clf, clf2 = get_algorithms(type)
    cv_error_results = get_cv_error(clf2, train_feats, essays._score)

    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)

    return f, clf, cv_error_results

def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
    """
    Writes out a model to a file.

@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
    classifier is a trained classifier
    model_path is the path of write out the model file to
    """
    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
    pickle.dump(model_file, file=open(model_path, "w"))

def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
    """
    Function that creates essay set, extracts features, and writes out model
    See above functions for argument descriptions
    """
    essay_set = create_essay_set(text_score, prompt)
    feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
    dump_model_to_file(prompt, feature_ext, clf, model_path)
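Both branches of get_algorithms pass learn_rate to the gradient-boosting estimators, which only works on the old scikit-learn releases this code targets; on current scikit-learn that keyword is named learning_rate. A hedged sketch of the equivalent construction on a modern install (not part of this commit):

    import sklearn.ensemble

    # Same settings as get_algorithms, with the renamed learning_rate keyword.
    clf = sklearn.ensemble.GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=4,
        random_state=1, min_samples_leaf=3)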
ease/predictor_extractor.py @ 2e6cb8e5

@@ -16,17 +16,18 @@ import logging
import math

from feature_extractor import FeatureExtractor

-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions

if not base_path.endswith("/"):
    base_path = base_path + "/"

log = logging.getLogger(__name__)

class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []

@@ -48,13 +49,13 @@ class PredictorExtractor(object):
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        div_length = len(p_set._essay_sets)
        if div_length == 0:
            div_length = 1

-        #Ensures that even with a large amount of input textual features, training time stays reasonable
+        # Ensures that even with a large amount of input textual features, training time stays reasonable
        max_feats2 = int(math.floor(200 / div_length))

        for i in xrange(0, len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
            self._initialized = True

@@ -66,13 +67,13 @@ class PredictorExtractor(object):
        Generates features based on an iput p_set
        p_set - PredictorSet
        """
        if self._initialized != True:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)

        textual_features = []
        for i in xrange(0, len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

        textual_matrix = numpy.concatenate(textual_features, axis=1)
ease/predictor_set.py @ 2e6cb8e5

@@ -11,12 +11,13 @@ sys.path.append(base_path)
import util_functions

if not base_path.endswith("/"):
    base_path = base_path + "/"

log = logging.getLogger(__name__)

class PredictorSet(object):
    def __init__(self, type="train"):
        """
        Initialize variables and check essay set type
        """

@@ -24,13 +25,13 @@ class PredictorSet(object):
            type = "train"

        self._type = type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)

@@ -46,26 +47,26 @@ class PredictorSet(object):
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

-        #Do some length checking for parameters
+        # Do some length checking for parameters
        if len(self._numeric_features) > 0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
                error_message = "Numeric features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        if len(self._textual_features) > 0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
                error_message = "Textual features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

-        #Now check to see if text features and numeric features are individually correct
+        # Now check to see if text features and numeric features are individually correct
        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except:

@@ -73,8 +74,7 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)

        for i in xrange(0, len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except:

@@ -82,19 +82,18 @@ class PredictorSet(object):
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)

-        #Create essay sets for textual features if needed
+        # Create essay sets for textual features if needed
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))

-        #Add numeric and textual features
+        # Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)

-        #Add targets
+        # Add targets
        self._target.append(target)

-        #Add textual features to essay sets
+        # Add textual features to essay sets
        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
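As the length and type checks above suggest, every call to PredictorSet.add_row supplies one list of numeric predictors, one list of textual predictors, and a numeric target, with lengths kept consistent from row to row. A small sketch with invented values:

    import predictor_set

    pset = predictor_set.PredictorSet(type="train")
    # Each row: numeric predictors, textual predictors, and a numeric target.
    pset.add_row([3.0, 120.0], ["Clear thesis and two supporting examples."], 1)
    pset.add_row([1.0, 45.0], ["Off-topic and very short."], 0)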
ease/tests/test_model_accuracy.py @ 2e6cb8e5

@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
TRAINING_LIMIT = 100
QUICK_TEST_LIMIT = 5

class DataLoader():
    def load_text_files(self, pathname):
        filenames = os.listdir(pathname)

@@ -28,34 +29,36 @@ class DataLoader():
        """
        pass

class PolarityLoader(DataLoader):
    def __init__(self, pathname):
        self.pathname = pathname

    def load_data(self):
        filenames = os.listdir(self.pathname)
        directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames
                       if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]

-        #Sort so neg is first
+        # Sort so neg is first
        directories.sort()

-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
        if len(directories) != 2:
            raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))

        neg = self.load_text_files(directories[0])
        pos = self.load_text_files(directories[1])
        scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
        text = neg + pos

        return scores, text

class ModelCreator():
    def __init__(self, scores, text):
        self.scores = scores
        self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
        if isinstance(text[0], basestring):
            self.create_model_generic = False
        else:

@@ -67,6 +70,7 @@ class ModelCreator():
        else:
            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)

class Grader():
    def __init__(self, model_data):
        self.model_data = model_data

@@ -77,6 +81,7 @@ class Grader():
        else:
            return grade.grade_generic(self.model_data, submission.get('numeric_features', []), submission.get('textual_features', []))

class GenericTest(object):
    loader = DataLoader
    data_path = ""

@@ -87,11 +92,11 @@ class GenericTest(object):
        data_loader = self.loader(os.path.join(TEST_PATH, self.data_path))
        scores, text = data_loader.load_data()

-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
        random.seed(1)
        shuffled_scores = []
        shuffled_text = []
        indices = [i for i in xrange(0, len(scores))]
        random.shuffle(indices)
        for i in indices:
            shuffled_scores.append(scores[i])

@@ -121,12 +126,13 @@ class GenericTest(object):
        self.assertGreaterEqual(cv_kappa, self.expected_kappa_min)
        self.assertLessEqual(cv_mae, self.expected_mae_max)

class PolarityTest(unittest.TestCase, GenericTest):
    loader = PolarityLoader
    data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
    expected_kappa_min = -.2
    expected_mae_max = 1
ease/util_functions.py @ 2e6cb8e5

-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
-#Requires aspell to be installed and added to the path
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Requires aspell to be installed and added to the path
from external_code.fisher import fisher

aspell_path = "aspell"

@@ -14,17 +14,18 @@ import pickle
import logging
import sys

log = logging.getLogger(__name__)

base_path = os.path.dirname(__file__)
sys.path.append(base_path)
if not base_path.endswith("/"):
    base_path = base_path + "/"

-#Paths to needed data files
+# Paths to needed data files
ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"

class AlgorithmTypes(object):
    """
    Defines what types of algorithm can be used

@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
    regression = "regression"
    classification = "classifiction"

def create_model_path(model_path):
    """
    Creates a path to model files
    model_path - string
    """
    if not model_path.startswith("/") and not model_path.startswith("models/"):
        model_path = "/" + model_path
    if not model_path.startswith("models"):
        model_path = "models" + model_path
    if not model_path.endswith(".p"):
        model_path += ".p"

    return model_path

def sub_chars(string):
    """
    Strips illegal characters from a string. Used to sanitize input essays.

@@ -53,7 +56,7 @@ def sub_chars(string):
    Returns sanitized string.
    string - string
    """
-    #Define replacement patterns
+    # Define replacement patterns
    sub_pat = r"[^A-Za-z\.\?!,';:]"
    char_pat = r"\."
    com_pat = r","

@@ -63,9 +66,9 @@ def sub_chars(string):
    col_pat = r":"
    whitespace_pat = r"\s{1,}"

-    #Replace text. Ordering is very important!
+    # Replace text. Ordering is very important!
    nstring = re.sub(sub_pat, " ", string)
    nstring = re.sub(char_pat, " .", nstring)
    nstring = re.sub(com_pat, " ,", nstring)
    nstring = re.sub(ques_pat, " ?", nstring)
    nstring = re.sub(excl_pat, " !", nstring)

@@ -84,7 +87,7 @@ def spell_correct(string):
    string - string
    """

-    #Create a temp file so that aspell could be used
+    # Create a temp file so that aspell could be used
    f = open('tmpfile', 'w')
    f.write(string)
    f_path = os.path.abspath(f.name)

@@ -93,16 +96,16 @@ def spell_correct(string):
        p = os.popen(aspell_path + " -a < " + f_path + " --sug-mode=ultra")
    except:
        log.exception("Could not find aspell, so could not spell correct!")
-        #Return original string if aspell fails
+        # Return original string if aspell fails
        return string, 0, string

-    #Aspell returns a list of incorrect words with the above flags
+    # Aspell returns a list of incorrect words with the above flags
    incorrect = p.readlines()
    p.close()
    incorrect_words = list()
    correct_spelling = list()
    for i in range(1, len(incorrect)):
        if (len(incorrect[i]) > 10):
-            #Reformat aspell output to make sense
+            # Reformat aspell output to make sense
            match = re.search(":", incorrect[i])
            if hasattr(match, "start"):
                begstring = incorrect[i][2:match.start()]

@@ -117,19 +120,19 @@ def spell_correct(string):
            incorrect_words.append(begword)
            correct_spelling.append(sug)

-    #Create markup based on spelling errors
+    # Create markup based on spelling errors
    newstring = string
    markup_string = string
    already_subbed = []
    for i in range(0, len(incorrect_words)):
        sub_pat = r"\b" + incorrect_words[i] + r"\b"
        sub_comp = re.compile(sub_pat)
        newstring = re.sub(sub_comp, correct_spelling[i], newstring)
        if incorrect_words[i] not in already_subbed:
            markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
            already_subbed.append(incorrect_words[i])

    return newstring, len(incorrect_words), markup_string

def ngrams(tokens, min_n, max_n):

@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
    max_feats2 is the maximum number of features to consider in the second (final) pass
    Returns a list of words that constitute the significant vocabulary
    """
    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
    dict_mat = dict.fit_transform(text)
    set_score = numpy.asarray(score, dtype=numpy.int)
    med_score = numpy.median(set_score)

@@ -246,12 +249,12 @@ def edit_distance(s1, s2):
            else:
                cost = 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,         # deletion
                d[(i, j - 1)] + 1,         # insertion
                d[(i - 1, j - 1)] + cost,  # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost)  # transposition

    return d[lenstr1 - 1, lenstr2 - 1]

@@ -335,6 +338,7 @@ def calc_list_average(l):
stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates kappa correlation between rater_a and rater_b.

@@ -351,7 +355,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

@@ -482,4 +486,4 @@ def getMedian(numericValues):
        lower = theValues[len(theValues) / 2 - 1]
        upper = theValues[len(theValues) / 2]
-        return (float(lower + upper)) / 2
\ No newline at end of file
+        return (float(lower + upper)) / 2
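As a final illustration, spell_correct returns a corrected string, the number of misspellings found, and a marked-up copy of the input; it shells out to aspell, so the sketch below only behaves as shown when aspell is on the PATH (the sample sentence is invented):

    import util_functions

    corrected, n_errors, markup = util_functions.spell_correct("Ths is a tst sentance.")
    print n_errors    # number of words aspell flagged
    print corrected   # input with suggested corrections substituted
    print markup      # input with misspellings wrapped in <bs>...</bs>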