Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
2e6cb8e5
Commit
2e6cb8e5
authored
Jun 02, 2013
by
Hugh Brown
Committed by
Vik Paruchuri
Jun 06, 2013
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
./grade.py: W391 blank line at end of file
parent
0d7ac804
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
345 additions
and
316 deletions
+345
-316
ease/.pep8
+3
-0
ease/create.py
+32
-31
ease/essay_set.py
+29
-28
ease/external_code/fisher/fisher.py
+16
-9
ease/feature_extractor.py
+101
-100
ease/grade.py
+52
-50
ease/model_creator.py
+38
-34
ease/predictor_extractor.py
+11
-10
ease/predictor_set.py
+22
-23
ease/tests/test_model_accuracy.py
+17
-11
ease/util_functions.py
+24
-20
No files found.
ease/.pep8
0 → 100644
View file @
2e6cb8e5
[pep8]
ignore=E501,E712,E711
ease/create.py
View file @
2e6cb8e5
...
@@ -7,22 +7,23 @@ import sys
...
@@ -7,22 +7,23 @@ import sys
import
logging
import
logging
import
numpy
import
numpy
#Define base path and add to sys path
#
Define base path and add to sys path
base_path
=
os
.
path
.
dirname
(
__file__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
sys
.
path
.
append
(
base_path
)
one_up_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'..//'
))
one_up_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'..//'
))
sys
.
path
.
append
(
one_up_path
)
sys
.
path
.
append
(
one_up_path
)
#Import modules that are dependent on the base path
#
Import modules that are dependent on the base path
import
model_creator
import
model_creator
import
util_functions
import
util_functions
import
predictor_set
import
predictor_set
import
predictor_extractor
import
predictor_extractor
#Make a log
#
Make a log
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
def
create
(
text
,
score
,
prompt_string
):
def
create
(
text
,
score
,
prompt_string
):
"""
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
TODO: Remove model path argument, it is needed for now to support legacy code
...
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
...
@@ -31,21 +32,21 @@ def create(text,score,prompt_string):
prompt_string - the common prompt for the set of essays
prompt_string - the common prompt for the set of essays
"""
"""
#Initialize a results dictionary to return
#
Initialize a results dictionary to return
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
util_functions
.
AlgorithmTypes
.
classification
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
util_functions
.
AlgorithmTypes
.
classification
,
'score'
:
score
,
'text'
:
text
,
'prompt'
:
prompt_string
}
'score'
:
score
,
'text'
:
text
,
'prompt'
:
prompt_string
}
if
len
(
text
)
!=
len
(
score
):
if
len
(
text
)
!=
len
(
score
):
msg
=
"Target and text lists must be same length."
msg
=
"Target and text lists must be same length."
results
[
'errors'
]
.
append
(
msg
)
results
[
'errors'
]
.
append
(
msg
)
log
.
exception
(
msg
)
log
.
exception
(
msg
)
return
results
return
results
#Decide what algorithm to use (regression or classification)
#
Decide what algorithm to use (regression or classification)
try
:
try
:
#Count the number of unique score points in the score list
#
Count the number of unique score points in the score list
if
len
(
util_functions
.
f7
(
list
(
score
)))
>
5
:
if
len
(
util_functions
.
f7
(
list
(
score
)))
>
5
:
type
=
util_functions
.
AlgorithmTypes
.
regression
type
=
util_functions
.
AlgorithmTypes
.
regression
else
:
else
:
type
=
util_functions
.
AlgorithmTypes
.
classification
type
=
util_functions
.
AlgorithmTypes
.
classification
...
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
...
@@ -53,21 +54,21 @@ def create(text,score,prompt_string):
type
=
util_functions
.
AlgorithmTypes
.
regression
type
=
util_functions
.
AlgorithmTypes
.
regression
try
:
try
:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
#
Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set
=
model_creator
.
create_essay_set
(
text
,
score
,
prompt_string
)
e_set
=
model_creator
.
create_essay_set
(
text
,
score
,
prompt_string
)
except
:
except
:
msg
=
"essay set creation failed."
msg
=
"essay set creation failed."
results
[
'errors'
]
.
append
(
msg
)
results
[
'errors'
]
.
append
(
msg
)
log
.
exception
(
msg
)
log
.
exception
(
msg
)
try
:
try
:
#Gets features from the essay set and computes error
#
Gets features from the essay set and computes error
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model
(
e_set
,
type
=
type
)
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model
(
e_set
,
type
=
type
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
results
[
'classifier'
]
=
classifier
results
[
'algorithm'
]
=
type
results
[
'algorithm'
]
=
type
results
[
'success'
]
=
True
results
[
'success'
]
=
True
except
:
except
:
msg
=
"feature extraction and model creation failed."
msg
=
"feature extraction and model creation failed."
results
[
'errors'
]
.
append
(
msg
)
results
[
'errors'
]
.
append
(
msg
)
...
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
...
@@ -76,7 +77,7 @@ def create(text,score,prompt_string):
return
results
return
results
def
create_generic
(
numeric_values
,
textual_values
,
target
,
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
):
def
create_generic
(
numeric_values
,
textual_values
,
target
,
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
):
"""
"""
Creates a model from a generic list numeric values and text values
Creates a model from a generic list numeric values and text values
numeric_values - A list of lists that are the predictors
numeric_values - A list of lists that are the predictors
...
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
...
@@ -86,18 +87,18 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm - the type of algorithm that will be used
algorithm - the type of algorithm that will be used
"""
"""
#Initialize a result dictionary to return.
#
Initialize a result dictionary to return.
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
}
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
}
if
len
(
numeric_values
)
!=
len
(
textual_values
)
or
len
(
numeric_values
)
!=
len
(
target
):
if
len
(
numeric_values
)
!=
len
(
textual_values
)
or
len
(
numeric_values
)
!=
len
(
target
):
msg
=
"Target, numeric features, and text features must all be the same length."
msg
=
"Target, numeric features, and text features must all be the same length."
results
[
'errors'
]
.
append
(
msg
)
results
[
'errors'
]
.
append
(
msg
)
log
.
exception
(
msg
)
log
.
exception
(
msg
)
return
results
return
results
try
:
try
:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
#
Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset
=
predictor_set
.
PredictorSet
(
type
=
"train"
)
pset
=
predictor_set
.
PredictorSet
(
type
=
"train"
)
for
i
in
xrange
(
0
,
len
(
numeric_values
)):
for
i
in
xrange
(
0
,
len
(
numeric_values
)):
pset
.
add_row
(
numeric_values
[
i
],
textual_values
[
i
],
target
[
i
])
pset
.
add_row
(
numeric_values
[
i
],
textual_values
[
i
],
target
[
i
])
...
@@ -107,13 +108,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
...
@@ -107,13 +108,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
log
.
exception
(
msg
)
log
.
exception
(
msg
)
try
:
try
:
#Extract all features and then train a classifier with the features
#
Extract all features and then train a classifier with the features
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model_predictors
(
pset
,
algorithm
)
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model_predictors
(
pset
,
algorithm
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
results
[
'classifier'
]
=
classifier
results
[
'success'
]
=
True
results
[
'success'
]
=
True
except
:
except
:
msg
=
"feature extraction and model creation failed."
msg
=
"feature extraction and model creation failed."
results
[
'errors'
]
.
append
(
msg
)
results
[
'errors'
]
.
append
(
msg
)
...
...
ease/essay_set.py
View file @
2e6cb8e5
...
@@ -15,11 +15,12 @@ sys.path.append(base_path)
...
@@ -15,11 +15,12 @@ sys.path.append(base_path)
import
util_functions
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
MAXIMUM_ESSAY_LENGTH
=
20000
MAXIMUM_ESSAY_LENGTH
=
20000
class
EssaySet
(
object
):
class
EssaySet
(
object
):
def
__init__
(
self
,
type
=
"train"
):
def
__init__
(
self
,
type
=
"train"
):
...
@@ -30,17 +31,17 @@ class EssaySet(object):
...
@@ -30,17 +31,17 @@ class EssaySet(object):
type
=
"train"
type
=
"train"
self
.
_type
=
type
self
.
_type
=
type
self
.
_score
=
[]
self
.
_score
=
[]
self
.
_text
=
[]
self
.
_text
=
[]
self
.
_id
=
[]
self
.
_id
=
[]
self
.
_clean_text
=
[]
self
.
_clean_text
=
[]
self
.
_tokens
=
[]
self
.
_tokens
=
[]
self
.
_pos
=
[]
self
.
_pos
=
[]
self
.
_clean_stem_text
=
[]
self
.
_clean_stem_text
=
[]
self
.
_generated
=
[]
self
.
_generated
=
[]
self
.
_prompt
=
""
self
.
_prompt
=
""
self
.
_spelling_errors
=
[]
self
.
_spelling_errors
=
[]
self
.
_markup_text
=
[]
self
.
_markup_text
=
[]
def
add_essay
(
self
,
essay_text
,
essay_score
,
essay_generated
=
0
):
def
add_essay
(
self
,
essay_text
,
essay_score
,
essay_generated
=
0
):
"""
"""
...
@@ -58,35 +59,35 @@ class EssaySet(object):
...
@@ -58,35 +59,35 @@ class EssaySet(object):
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try
:
try
:
essay_text
=
essay_text
.
encode
(
'ascii'
,
'ignore'
)
essay_text
=
essay_text
.
encode
(
'ascii'
,
'ignore'
)
if
len
(
essay_text
)
<
5
:
if
len
(
essay_text
)
<
5
:
essay_text
=
"Invalid essay."
essay_text
=
"Invalid essay."
except
:
except
:
log
.
exception
(
"Could not parse essay into ascii."
)
log
.
exception
(
"Could not parse essay into ascii."
)
try
:
try
:
#Try conversion of types
#
Try conversion of types
essay_score
=
int
(
essay_score
)
essay_score
=
int
(
essay_score
)
essay_text
=
str
(
essay_text
)
essay_text
=
str
(
essay_text
)
except
:
except
:
#Nothing needed here, will return error in any case.
#
Nothing needed here, will return error in any case.
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
and
(
essay_generated
==
0
or
essay_generated
==
1
):
and
(
essay_generated
==
0
or
essay_generated
==
1
):
self
.
_id
.
append
(
max_id
+
1
)
self
.
_id
.
append
(
max_id
+
1
)
self
.
_score
.
append
(
essay_score
)
self
.
_score
.
append
(
essay_score
)
# Clean text by removing non digit/work/punctuation characters
# Clean text by removing non digit/work/punctuation characters
try
:
try
:
essay_text
=
str
(
essay_text
.
encode
(
'ascii'
,
'ignore'
))
essay_text
=
str
(
essay_text
.
encode
(
'ascii'
,
'ignore'
))
except
:
except
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
self
.
_text
.
append
(
cleaned_essay
)
self
.
_text
.
append
(
cleaned_essay
)
# Spell correct text using aspell
# Spell correct text using aspell
cleaned_text
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
self
.
_text
[
len
(
self
.
_text
)
-
1
])
cleaned_text
,
spell_errors
,
markup_text
=
util_functions
.
spell_correct
(
self
.
_text
[
len
(
self
.
_text
)
-
1
])
self
.
_clean_text
.
append
(
cleaned_text
)
self
.
_clean_text
.
append
(
cleaned_text
)
self
.
_spelling_errors
.
append
(
spell_errors
)
self
.
_spelling_errors
.
append
(
spell_errors
)
self
.
_markup_text
.
append
(
markup_text
)
self
.
_markup_text
.
append
(
markup_text
)
...
@@ -112,7 +113,7 @@ class EssaySet(object):
...
@@ -112,7 +113,7 @@ class EssaySet(object):
prompt_text should be a string.
prompt_text should be a string.
Returns the prompt as a confirmation.
Returns the prompt as a confirmation.
"""
"""
if
(
type
(
prompt_text
)
==
type
(
"text"
)):
if
(
isinstance
(
prompt_text
,
type
(
"text"
)
)):
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
ret
=
self
.
_prompt
ret
=
self
.
_prompt
else
:
else
:
...
...
ease/external_code/fisher/fisher.py
View file @
2e6cb8e5
...
@@ -21,6 +21,8 @@
...
@@ -21,6 +21,8 @@
import
math
import
math
## From dendropy.mathlib.probability
## From dendropy.mathlib.probability
def
hypergeometric_pmf
(
x
,
m
,
n
,
k
):
def
hypergeometric_pmf
(
x
,
m
,
n
,
k
):
"""
"""
Given a population consisting of `m` items of class M and `n` items of class N,
Given a population consisting of `m` items of class M and `n` items of class N,
...
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
...
@@ -33,11 +35,13 @@ p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
# float' with large numbers
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a
=
math
.
log
(
binomial_coefficient
(
m
,
x
))
a
=
math
.
log
(
binomial_coefficient
(
m
,
x
))
b
=
math
.
log
(
binomial_coefficient
(
n
,
k
-
x
))
b
=
math
.
log
(
binomial_coefficient
(
n
,
k
-
x
))
c
=
math
.
log
(
binomial_coefficient
(
m
+
n
,
k
))
c
=
math
.
log
(
binomial_coefficient
(
m
+
n
,
k
))
return
math
.
exp
(
a
+
b
-
c
)
return
math
.
exp
(
a
+
b
-
c
)
## From dendropy.mathlib.probability
## From dendropy.mathlib.probability
def
binomial_coefficient
(
population
,
sample
):
def
binomial_coefficient
(
population
,
sample
):
"Returns `population` choose `sample`."
"Returns `population` choose `sample`."
s
=
max
(
sample
,
population
-
sample
)
s
=
max
(
sample
,
population
-
sample
)
...
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
...
@@ -47,12 +51,14 @@ def binomial_coefficient(population, sample):
return
1
return
1
numerator
=
1
numerator
=
1
denominator
=
1
denominator
=
1
for
i
in
xrange
(
s
+
1
,
population
+
1
):
for
i
in
xrange
(
s
+
1
,
population
+
1
):
numerator
*=
i
numerator
*=
i
denominator
*=
(
i
-
s
)
denominator
*=
(
i
-
s
)
return
numerator
/
denominator
return
numerator
/
denominator
## From dendropy.mathlib.statistics
## From dendropy.mathlib.statistics
class
FishersExactTest
(
object
):
class
FishersExactTest
(
object
):
"""
"""
Given a 2x2 table:
Given a 2x2 table:
...
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
...
@@ -97,7 +103,7 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
b
=
table
[
0
][
1
]
b
=
table
[
0
][
1
]
c
=
table
[
1
][
0
]
c
=
table
[
1
][
0
]
d
=
table
[
1
][
1
]
d
=
table
[
1
][
1
]
return
hypergeometric_pmf
(
a
,
a
+
b
,
c
+
d
,
a
+
c
)
return
hypergeometric_pmf
(
a
,
a
+
b
,
c
+
d
,
a
+
c
)
probability_of_table
=
staticmethod
(
probability_of_table
)
probability_of_table
=
staticmethod
(
probability_of_table
)
def
__init__
(
self
,
table
):
def
__init__
(
self
,
table
):
...
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
...
@@ -111,8 +117,8 @@ p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
Returns a copy of table such that all the values
Returns a copy of table such that all the values
are rotated clockwise once.
are rotated clockwise once.
"""
"""
return
[
[
table
[
1
][
0
],
table
[
0
][
0
]
],
return
[
[
table
[
1
][
0
],
table
[
0
][
0
]
],
[
table
[
1
][
1
],
table
[
0
][
1
]
]
]
[
table
[
1
][
1
],
table
[
0
][
1
]
]
]
def
_min_rotation
(
self
):
def
_min_rotation
(
self
):
"""
"""
...
@@ -241,8 +247,9 @@ extreme.
...
@@ -241,8 +247,9 @@ extreme.
p_vals
.
append
(
p
)
p_vals
.
append
(
p
)
return
sum
(
p_vals
)
+
p0
return
sum
(
p_vals
)
+
p0
def
assert_almost_equal
(
v1
,
v2
,
prec
=
8
):
def
assert_almost_equal
(
v1
,
v2
,
prec
=
8
):
if
abs
(
v1
-
v2
)
<=
10
**
(
-
prec
):
if
abs
(
v1
-
v2
)
<=
10
**
(
-
prec
):
print
"OK: {} == {}"
.
format
(
v1
,
v2
)
print
"OK: {} == {}"
.
format
(
v1
,
v2
)
else
:
else
:
print
"FAIL: {} != {}"
.
format
(
v1
,
v2
)
print
"FAIL: {} != {}"
.
format
(
v1
,
v2
)
...
...
ease/feature_extractor.py
View file @
2e6cb8e5
...
@@ -20,22 +20,23 @@ from essay_set import EssaySet
...
@@ -20,22 +20,23 @@ from essay_set import EssaySet
import
util_functions
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
#Paths to needed data files
#
Paths to needed data files
NGRAM_PATH
=
base_path
+
"data/good_pos_ngrams.p"
NGRAM_PATH
=
base_path
+
"data/good_pos_ngrams.p"
ESSAY_CORPUS_PATH
=
util_functions
.
ESSAY_CORPUS_PATH
ESSAY_CORPUS_PATH
=
util_functions
.
ESSAY_CORPUS_PATH
class
FeatureExtractor
(
object
):
class
FeatureExtractor
(
object
):
def
__init__
(
self
):
def
__init__
(
self
):
self
.
_good_pos_ngrams
=
self
.
get_good_pos_ngrams
()
self
.
_good_pos_ngrams
=
self
.
get_good_pos_ngrams
()
self
.
dict_initialized
=
False
self
.
dict_initialized
=
False
self
.
_spell_errors_per_character
=
0
self
.
_spell_errors_per_character
=
0
self
.
_grammar_errors_per_character
=
0
self
.
_grammar_errors_per_character
=
0
def
initialize_dictionaries
(
self
,
e_set
,
max_feats2
=
200
):
def
initialize_dictionaries
(
self
,
e_set
,
max_feats2
=
200
):
"""
"""
Initializes dictionaries from an essay set object
Initializes dictionaries from an essay set object
Dictionaries must be initialized prior to using this to extract features
Dictionaries must be initialized prior to using this to extract features
...
@@ -44,27 +45,27 @@ class FeatureExtractor(object):
...
@@ -44,27 +45,27 @@ class FeatureExtractor(object):
"""
"""
if
(
hasattr
(
e_set
,
'_type'
)):
if
(
hasattr
(
e_set
,
'_type'
)):
if
(
e_set
.
_type
==
"train"
):
if
(
e_set
.
_type
==
"train"
):
#normal text (unstemmed) useful words/bigrams
#
normal text (unstemmed) useful words/bigrams
nvocab
=
util_functions
.
get_vocab
(
e_set
.
_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
nvocab
=
util_functions
.
get_vocab
(
e_set
.
_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
#stemmed and spell corrected vocab useful words/ngrams
#
stemmed and spell corrected vocab useful words/ngrams
svocab
=
util_functions
.
get_vocab
(
e_set
.
_clean_stem_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
svocab
=
util_functions
.
get_vocab
(
e_set
.
_clean_stem_text
,
e_set
.
_score
,
max_feats2
=
max_feats2
)
#dictionary trained on proper vocab
#
dictionary trained on proper vocab
self
.
_normal_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
nvocab
)
self
.
_normal_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
nvocab
)
#dictionary trained on proper vocab
#
dictionary trained on proper vocab
self
.
_stem_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
svocab
)
self
.
_stem_dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
vocabulary
=
svocab
)
self
.
dict_initialized
=
True
self
.
dict_initialized
=
True
#Average spelling errors in set. needed later for spelling detection
#
Average spelling errors in set. needed later for spelling detection
self
.
_mean_spelling_errors
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
len
(
e_set
.
_spelling_errors
))
self
.
_mean_spelling_errors
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
len
(
e_set
.
_spelling_errors
))
self
.
_spell_errors_per_character
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
self
.
_spell_errors_per_character
=
sum
(
e_set
.
_spelling_errors
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
#Gets the number and positions of grammar errors
#
Gets the number and positions of grammar errors
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
self
.
_grammar_errors_per_character
=
(
sum
(
good_pos_tags
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
])))
self
.
_grammar_errors_per_character
=
(
sum
(
good_pos_tags
)
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
])))
#Generate bag of words features
#
Generate bag of words features
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
bag_feats
=
self
.
gen_bag_feats
(
e_set
)
#Sum of a row of bag of words features (topical words in an essay)
#
Sum of a row of bag of words features (topical words in an essay)
f_row_sum
=
numpy
.
sum
(
bag_feats
[:,
:])
f_row_sum
=
numpy
.
sum
(
bag_feats
[:,
:])
#Average index of how "topical" essays are
#
Average index of how "topical" essays are
self
.
_mean_f_prop
=
f_row_sum
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
self
.
_mean_f_prop
=
f_row_sum
/
float
(
sum
([
len
(
t
)
for
t
in
e_set
.
_text
]))
ret
=
"ok"
ret
=
"ok"
else
:
else
:
raise
util_functions
.
InputError
(
e_set
,
"needs to be an essay set of the train type."
)
raise
util_functions
.
InputError
(
e_set
,
"needs to be an essay set of the train type."
)
...
@@ -85,8 +86,8 @@ class FeatureExtractor(object):
...
@@ -85,8 +86,8 @@ class FeatureExtractor(object):
good_pos_ngrams
=
util_functions
.
regenerate_good_tokens
(
essay_corpus
)
good_pos_ngrams
=
util_functions
.
regenerate_good_tokens
(
essay_corpus
)
pickle
.
dump
(
good_pos_ngrams
,
open
(
NGRAM_PATH
,
'wb'
))
pickle
.
dump
(
good_pos_ngrams
,
open
(
NGRAM_PATH
,
'wb'
))
else
:
else
:
#Hard coded list in case the needed files cannot be found
#
Hard coded list in case the needed files cannot be found
good_pos_ngrams
=
[
'NN PRP'
,
'NN PRP .'
,
'NN PRP . DT'
,
'PRP .'
,
'PRP . DT'
,
'PRP . DT NNP'
,
'. DT'
,
good_pos_ngrams
=
[
'NN PRP'
,
'NN PRP .'
,
'NN PRP . DT'
,
'PRP .'
,
'PRP . DT'
,
'PRP . DT NNP'
,
'. DT'
,
'. DT NNP'
,
'. DT NNP NNP'
,
'DT NNP'
,
'DT NNP NNP'
,
'DT NNP NNP NNP'
,
'NNP NNP'
,
'. DT NNP'
,
'. DT NNP NNP'
,
'DT NNP'
,
'DT NNP NNP'
,
'DT NNP NNP NNP'
,
'NNP NNP'
,
'NNP NNP NNP'
,
'NNP NNP NNP NNP'
,
'NNP NNP NNP .'
,
'NNP NNP .'
,
'NNP NNP . TO'
,
'NNP NNP NNP'
,
'NNP NNP NNP NNP'
,
'NNP NNP NNP .'
,
'NNP NNP .'
,
'NNP NNP . TO'
,
'NNP .'
,
'NNP . TO'
,
'NNP . TO NNP'
,
'. TO'
,
'. TO NNP'
,
'. TO NNP NNP'
,
'NNP .'
,
'NNP . TO'
,
'NNP . TO NNP'
,
'. TO'
,
'. TO NNP'
,
'. TO NNP NNP'
,
...
@@ -94,38 +95,38 @@ class FeatureExtractor(object):
...
@@ -94,38 +95,38 @@ class FeatureExtractor(object):
return
good_pos_ngrams
return
good_pos_ngrams
def
_get_grammar_errors
(
self
,
pos
,
text
,
tokens
):
def
_get_grammar_errors
(
self
,
pos
,
text
,
tokens
):
"""
"""
Internal function to get the number of grammar errors in given text
Internal function to get the number of grammar errors in given text
pos - part of speech tagged text (list)
pos - part of speech tagged text (list)
text - normal text (list)
text - normal text (list)
tokens - list of lists of tokenized text
tokens - list of lists of tokenized text
"""
"""
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
tokens
]
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
tokens
]
good_pos_tags
=
[]
good_pos_tags
=
[]
min_pos_seq
=
2
min_pos_seq
=
2
max_pos_seq
=
4
max_pos_seq
=
4
bad_pos_positions
=
[]
bad_pos_positions
=
[]
for
i
in
xrange
(
0
,
len
(
text
)):
for
i
in
xrange
(
0
,
len
(
text
)):
pos_seq
=
[
tag
[
1
]
for
tag
in
pos
[
i
]]
pos_seq
=
[
tag
[
1
]
for
tag
in
pos
[
i
]]
pos_ngrams
=
util_functions
.
ngrams
(
pos_seq
,
min_pos_seq
,
max_pos_seq
)
pos_ngrams
=
util_functions
.
ngrams
(
pos_seq
,
min_pos_seq
,
max_pos_seq
)
long_pos_ngrams
=
[
z
for
z
in
pos_ngrams
if
z
.
count
(
' '
)
==
(
max_pos_seq
-
1
)]
long_pos_ngrams
=
[
z
for
z
in
pos_ngrams
if
z
.
count
(
' '
)
==
(
max_pos_seq
-
1
)]
bad_pos_tuples
=
[[
z
,
z
+
max_pos_seq
]
for
z
in
xrange
(
0
,
len
(
long_pos_ngrams
))
if
long_pos_ngrams
[
z
]
not
in
self
.
_good_pos_ngrams
]
bad_pos_tuples
=
[[
z
,
z
+
max_pos_seq
]
for
z
in
xrange
(
0
,
len
(
long_pos_ngrams
))
if
long_pos_ngrams
[
z
]
not
in
self
.
_good_pos_ngrams
]
bad_pos_tuples
.
sort
(
key
=
operator
.
itemgetter
(
1
))
bad_pos_tuples
.
sort
(
key
=
operator
.
itemgetter
(
1
))
to_delete
=
[]
to_delete
=
[]
for
m
in
reversed
(
xrange
(
len
(
bad_pos_tuples
)
-
1
)):
for
m
in
reversed
(
xrange
(
len
(
bad_pos_tuples
)
-
1
)):
start
,
end
=
bad_pos_tuples
[
m
]
start
,
end
=
bad_pos_tuples
[
m
]
for
j
in
xrange
(
m
+
1
,
len
(
bad_pos_tuples
)):
for
j
in
xrange
(
m
+
1
,
len
(
bad_pos_tuples
)):
lstart
,
lend
=
bad_pos_tuples
[
j
]
lstart
,
lend
=
bad_pos_tuples
[
j
]
if
lstart
>=
start
and
lstart
<=
end
:
if
lstart
>=
start
and
lstart
<=
end
:
bad_pos_tuples
[
m
][
1
]
=
bad_pos_tuples
[
j
][
1
]
bad_pos_tuples
[
m
][
1
]
=
bad_pos_tuples
[
j
][
1
]
to_delete
.
append
(
j
)
to_delete
.
append
(
j
)
fixed_bad_pos_tuples
=
[
bad_pos_tuples
[
z
]
for
z
in
xrange
(
0
,
len
(
bad_pos_tuples
))
if
z
not
in
to_delete
]
fixed_bad_pos_tuples
=
[
bad_pos_tuples
[
z
]
for
z
in
xrange
(
0
,
len
(
bad_pos_tuples
))
if
z
not
in
to_delete
]
bad_pos_positions
.
append
(
fixed_bad_pos_tuples
)
bad_pos_positions
.
append
(
fixed_bad_pos_tuples
)
overlap_ngrams
=
[
z
for
z
in
pos_ngrams
if
z
in
self
.
_good_pos_ngrams
]
overlap_ngrams
=
[
z
for
z
in
pos_ngrams
if
z
in
self
.
_good_pos_ngrams
]
if
(
len
(
pos_ngrams
)
-
len
(
overlap_ngrams
))
>
0
:
if
(
len
(
pos_ngrams
)
-
len
(
overlap_ngrams
))
>
0
:
divisor
=
len
(
pos_ngrams
)
/
len
(
pos_seq
)
divisor
=
len
(
pos_ngrams
)
/
len
(
pos_seq
)
else
:
else
:
divisor
=
1
divisor
=
1
if
divisor
==
0
:
if
divisor
==
0
:
...
@@ -143,13 +144,13 @@ class FeatureExtractor(object):
...
@@ -143,13 +144,13 @@ class FeatureExtractor(object):
"""
"""
text
=
e_set
.
_text
text
=
e_set
.
_text
lengths
=
[
len
(
e
)
for
e
in
text
]
lengths
=
[
len
(
e
)
for
e
in
text
]
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
e_set
.
_tokens
]
word_counts
=
[
max
(
len
(
t
),
1
)
for
t
in
e_set
.
_tokens
]
comma_count
=
[
e
.
count
(
","
)
for
e
in
text
]
comma_count
=
[
e
.
count
(
","
)
for
e
in
text
]
ap_count
=
[
e
.
count
(
"'"
)
for
e
in
text
]
ap_count
=
[
e
.
count
(
"'"
)
for
e
in
text
]
punc_count
=
[
e
.
count
(
"."
)
+
e
.
count
(
"?"
)
+
e
.
count
(
"!"
)
for
e
in
text
]
punc_count
=
[
e
.
count
(
"."
)
+
e
.
count
(
"?"
)
+
e
.
count
(
"!"
)
for
e
in
text
]
chars_per_word
=
[
lengths
[
m
]
/
float
(
word_counts
[
m
])
for
m
in
xrange
(
0
,
len
(
text
))]
chars_per_word
=
[
lengths
[
m
]
/
float
(
word_counts
[
m
])
for
m
in
xrange
(
0
,
len
(
text
))]
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
good_pos_tags
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
good_pos_tag_prop
=
[
good_pos_tags
[
m
]
/
float
(
word_counts
[
m
])
for
m
in
xrange
(
0
,
len
(
text
))]
good_pos_tag_prop
=
[
good_pos_tags
[
m
]
/
float
(
word_counts
[
m
])
for
m
in
xrange
(
0
,
len
(
text
))]
length_arr
=
numpy
.
array
((
length_arr
=
numpy
.
array
((
...
@@ -203,17 +204,17 @@ class FeatureExtractor(object):
...
@@ -203,17 +204,17 @@ class FeatureExtractor(object):
prompt_overlap
=
[]
prompt_overlap
=
[]
prompt_overlap_prop
=
[]
prompt_overlap_prop
=
[]
for
j
in
e_set
.
_tokens
:
for
j
in
e_set
.
_tokens
:
tok_length
=
len
(
j
)
tok_length
=
len
(
j
)
if
(
tok_length
==
0
):
if
(
tok_length
==
0
):
tok_length
=
1
tok_length
=
1
prompt_overlap
.
append
(
len
([
i
for
i
in
j
if
i
in
prompt_toks
]))
prompt_overlap
.
append
(
len
([
i
for
i
in
j
if
i
in
prompt_toks
]))
prompt_overlap_prop
.
append
(
prompt_overlap
[
len
(
prompt_overlap
)
-
1
]
/
float
(
tok_length
))
prompt_overlap_prop
.
append
(
prompt_overlap
[
len
(
prompt_overlap
)
-
1
]
/
float
(
tok_length
))
expand_overlap
=
[]
expand_overlap
=
[]
expand_overlap_prop
=
[]
expand_overlap_prop
=
[]
for
j
in
e_set
.
_tokens
:
for
j
in
e_set
.
_tokens
:
tok_length
=
len
(
j
)
tok_length
=
len
(
j
)
if
(
tok_length
==
0
):
if
(
tok_length
==
0
):
tok_length
=
1
tok_length
=
1
expand_overlap
.
append
(
len
([
i
for
i
in
j
if
i
in
expand_syns
]))
expand_overlap
.
append
(
len
([
i
for
i
in
j
if
i
in
expand_syns
]))
expand_overlap_prop
.
append
(
expand_overlap
[
len
(
expand_overlap
)
-
1
]
/
float
(
tok_length
))
expand_overlap_prop
.
append
(
expand_overlap
[
len
(
expand_overlap
)
-
1
]
/
float
(
tok_length
))
...
@@ -231,62 +232,62 @@ class FeatureExtractor(object):
...
@@ -231,62 +232,62 @@ class FeatureExtractor(object):
e_set - EssaySet object
e_set - EssaySet object
"""
"""
#Set ratio to modify thresholds for grammar/spelling errors
#
Set ratio to modify thresholds for grammar/spelling errors
modifier_ratio
=
1.05
modifier_ratio
=
1.05
#Calc number of grammar and spelling errors per character
#
Calc number of grammar and spelling errors per character
set_grammar
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
set_grammar
,
bad_pos_positions
=
self
.
_get_grammar_errors
(
e_set
.
_pos
,
e_set
.
_text
,
e_set
.
_tokens
)
set_grammar_per_character
=
[
set_grammar
[
m
]
/
float
(
len
(
e_set
.
_text
[
m
])
+.
1
)
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
))]
set_grammar_per_character
=
[
set_grammar
[
m
]
/
float
(
len
(
e_set
.
_text
[
m
])
+
.
1
)
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
))]
set_spell_errors_per_character
=
[
e_set
.
_spelling_errors
[
m
]
/
float
(
len
(
e_set
.
_text
[
m
])
+.
1
)
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
))]
set_spell_errors_per_character
=
[
e_set
.
_spelling_errors
[
m
]
/
float
(
len
(
e_set
.
_text
[
m
])
+
.
1
)
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
))]
#Iterate through essays and create a feedback dict for each
#
Iterate through essays and create a feedback dict for each
all_feedback
=
[]
all_feedback
=
[]
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
)):
for
m
in
xrange
(
0
,
len
(
e_set
.
_text
)):
#Be very careful about changing these messages!
#
Be very careful about changing these messages!
individual_feedback
=
{
'grammar'
:
"Grammar: Ok."
,
individual_feedback
=
{
'grammar'
:
"Grammar: Ok."
,
'spelling'
:
"Spelling: Ok."
,
'spelling'
:
"Spelling: Ok."
,
'markup_text'
:
""
,
'markup_text'
:
""
,
'grammar_per_char'
:
set_grammar_per_character
[
m
],
'grammar_per_char'
:
set_grammar_per_character
[
m
],
'spelling_per_char'
:
set_spell_errors_per_character
[
m
],
'spelling_per_char'
:
set_spell_errors_per_character
[
m
],
'too_similar_to_prompt'
:
False
,
'too_similar_to_prompt'
:
False
,
}
}
markup_tokens
=
e_set
.
_markup_text
[
m
]
.
split
(
" "
)
markup_tokens
=
e_set
.
_markup_text
[
m
]
.
split
(
" "
)
#This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#
This loop ensures that sequences of bad grammar get put together into one sequence instead of staying
#disjointed
#
disjointed
bad_pos_starts
=
[
z
[
0
]
for
z
in
bad_pos_positions
[
m
]]
bad_pos_starts
=
[
z
[
0
]
for
z
in
bad_pos_positions
[
m
]]
bad_pos_ends
=
[
z
[
1
]
-
1
for
z
in
bad_pos_positions
[
m
]]
bad_pos_ends
=
[
z
[
1
]
-
1
for
z
in
bad_pos_positions
[
m
]]
for
z
in
xrange
(
0
,
len
(
markup_tokens
)):
for
z
in
xrange
(
0
,
len
(
markup_tokens
)):
if
z
in
bad_pos_starts
:
if
z
in
bad_pos_starts
:
markup_tokens
[
z
]
=
'<bg>'
+
markup_tokens
[
z
]
markup_tokens
[
z
]
=
'<bg>'
+
markup_tokens
[
z
]
elif
z
in
bad_pos_ends
:
elif
z
in
bad_pos_ends
:
markup_tokens
[
z
]
=
markup_tokens
[
z
]
+
"</bg>"
markup_tokens
[
z
]
=
markup_tokens
[
z
]
+
"</bg>"
if
(
len
(
bad_pos_ends
)
>
0
and
len
(
bad_pos_starts
)
>
0
and
len
(
markup_tokens
)
>
1
):
if
(
len
(
bad_pos_ends
)
>
0
and
len
(
bad_pos_starts
)
>
0
and
len
(
markup_tokens
)
>
1
):
if
max
(
bad_pos_ends
)
>
(
len
(
markup_tokens
)
-
1
)
and
max
(
bad_pos_starts
)
<
(
len
(
markup_tokens
)
-
1
):
if
max
(
bad_pos_ends
)
>
(
len
(
markup_tokens
)
-
1
)
and
max
(
bad_pos_starts
)
<
(
len
(
markup_tokens
)
-
1
):
markup_tokens
[
len
(
markup_tokens
)
-
1
]
+=
"</bg>"
markup_tokens
[
len
(
markup_tokens
)
-
1
]
+=
"</bg>"
#Display messages if grammar/spelling errors greater than average in training set
#
Display messages if grammar/spelling errors greater than average in training set
if
set_grammar_per_character
[
m
]
>
(
self
.
_grammar_errors_per_character
*
modifier_ratio
):
if
set_grammar_per_character
[
m
]
>
(
self
.
_grammar_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'grammar'
]
=
"Grammar: More grammar errors than average."
individual_feedback
[
'grammar'
]
=
"Grammar: More grammar errors than average."
if
set_spell_errors_per_character
[
m
]
>
(
self
.
_spell_errors_per_character
*
modifier_ratio
):
if
set_spell_errors_per_character
[
m
]
>
(
self
.
_spell_errors_per_character
*
modifier_ratio
):
individual_feedback
[
'spelling'
]
=
"Spelling: More spelling errors than average."
individual_feedback
[
'spelling'
]
=
"Spelling: More spelling errors than average."
#Test topicality by calculating # of on topic words per character and comparing to the training set
#
Test topicality by calculating # of on topic words per character and comparing to the training set
#mean. Requires features to be passed in
#
mean. Requires features to be passed in
if
features
is
not
None
:
if
features
is
not
None
:
f_row_sum
=
numpy
.
sum
(
features
[
m
,
12
:])
f_row_sum
=
numpy
.
sum
(
features
[
m
,
12
:])
f_row_prop
=
f_row_sum
/
len
(
e_set
.
_text
[
m
])
f_row_prop
=
f_row_sum
/
len
(
e_set
.
_text
[
m
])
if
f_row_prop
<
(
self
.
_mean_f_prop
/
1.5
)
or
len
(
e_set
.
_text
[
m
])
<
20
:
if
f_row_prop
<
(
self
.
_mean_f_prop
/
1.5
)
or
len
(
e_set
.
_text
[
m
])
<
20
:
individual_feedback
[
'topicality'
]
=
"Topicality: Essay may be off topic."
individual_feedback
[
'topicality'
]
=
"Topicality: Essay may be off topic."
if
(
features
[
m
,
9
]
>
.
6
):
if
(
features
[
m
,
9
]
>
.
6
):
individual_feedback
[
'prompt_overlap'
]
=
"Prompt Overlap: Too much overlap with prompt."
individual_feedback
[
'prompt_overlap'
]
=
"Prompt Overlap: Too much overlap with prompt."
individual_feedback
[
'too_similar_to_prompt'
]
=
True
individual_feedback
[
'too_similar_to_prompt'
]
=
True
log
.
debug
(
features
[
m
,
9
])
log
.
debug
(
features
[
m
,
9
])
#Create string representation of markup text
#
Create string representation of markup text
markup_string
=
" "
.
join
(
markup_tokens
)
markup_string
=
" "
.
join
(
markup_tokens
)
individual_feedback
[
'markup_text'
]
=
markup_string
individual_feedback
[
'markup_text'
]
=
markup_string
all_feedback
.
append
(
individual_feedback
)
all_feedback
.
append
(
individual_feedback
)
return
all_feedback
return
all_feedback
ease/grade.py
View file @
2e6cb8e5
...
@@ -8,24 +8,25 @@ import os
...
@@ -8,24 +8,25 @@ import os
import
numpy
import
numpy
import
logging
import
logging
#Append sys to base path to import the following modules
#
Append sys to base path to import the following modules
base_path
=
os
.
path
.
dirname
(
__file__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
sys
.
path
.
append
(
base_path
)
#Depend on base path to be imported
#
Depend on base path to be imported
from
essay_set
import
EssaySet
from
essay_set
import
EssaySet
import
predictor_extractor
import
predictor_extractor
import
predictor_set
import
predictor_set
import
util_functions
import
util_functions
#Imports needed to unpickle grader data
#
Imports needed to unpickle grader data
import
feature_extractor
import
feature_extractor
import
sklearn.ensemble
import
sklearn.ensemble
import
math
import
math
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
def
grade
(
grader_data
,
submission
):
def
grade
(
grader_data
,
submission
):
"""
"""
Grades a specified submission using specified models
Grades a specified submission using specified models
grader_data - A dictionary:
grader_data - A dictionary:
...
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
...
@@ -38,73 +39,74 @@ def grade(grader_data,submission):
submission - The student submission (string)
submission - The student submission (string)
"""
"""
#Initialize result dictionary
#
Initialize result dictionary
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
has_error
=
False
grader_set
=
EssaySet
(
type
=
"test"
)
grader_set
=
EssaySet
(
type
=
"test"
)
#This is to preserve legacy functionality
#
This is to preserve legacy functionality
if
'algorithm'
not
in
grader_data
:
if
'algorithm'
not
in
grader_data
:
grader_data
[
'algorithm'
]
=
util_functions
.
AlgorithmTypes
.
classification
grader_data
[
'algorithm'
]
=
util_functions
.
AlgorithmTypes
.
classification
try
:
try
:
#Try to add essay to essay set object
#
Try to add essay to essay set object
grader_set
.
add_essay
(
str
(
submission
),
0
)
grader_set
.
add_essay
(
str
(
submission
),
0
)
grader_set
.
update_prompt
(
str
(
grader_data
[
'prompt'
]))
grader_set
.
update_prompt
(
str
(
grader_data
[
'prompt'
]))
except
:
except
:
results
[
'errors'
]
.
append
(
"Essay could not be added to essay set:{0}"
.
format
(
submission
))
results
[
'errors'
]
.
append
(
"Essay could not be added to essay set:{0}"
.
format
(
submission
))
has_error
=
True
has_error
=
True
#Try to extract features from submission and assign score via the model
#
Try to extract features from submission and assign score via the model
try
:
try
:
grader_feats
=
grader_data
[
'extractor'
]
.
gen_feats
(
grader_set
)
grader_feats
=
grader_data
[
'extractor'
]
.
gen_feats
(
grader_set
)
feedback
=
grader_data
[
'extractor'
]
.
gen_feedback
(
grader_set
,
grader_feats
)[
0
]
feedback
=
grader_data
[
'extractor'
]
.
gen_feedback
(
grader_set
,
grader_feats
)[
0
]
results
[
'score'
]
=
int
(
grader_data
[
'model'
]
.
predict
(
grader_feats
)[
0
])
results
[
'score'
]
=
int
(
grader_data
[
'model'
]
.
predict
(
grader_feats
)[
0
])
except
:
except
:
results
[
'errors'
]
.
append
(
"Could not extract features and score essay."
)
results
[
'errors'
]
.
append
(
"Could not extract features and score essay."
)
has_error
=
True
has_error
=
True
#Try to determine confidence level
#
Try to determine confidence level
try
:
try
:
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
grader_data
[
'model'
],
grader_feats
,
results
[
'score'
],
grader_data
[
'score'
])
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
grader_data
[
'model'
],
grader_feats
,
results
[
'score'
],
grader_data
[
'score'
])
except
:
except
:
#If there is an error getting confidence, it is not a show-stopper, so just log
#
If there is an error getting confidence, it is not a show-stopper, so just log
log
.
exception
(
"Problem generating confidence value"
)
log
.
exception
(
"Problem generating confidence value"
)
if
not
has_error
:
if
not
has_error
:
#If the essay is just a copy of the prompt, return a 0 as the score
#
If the essay is just a copy of the prompt, return a 0 as the score
if
(
feedback
[
'too_similar_to_prompt'
]):
if
(
feedback
[
'too_similar_to_prompt'
]):
results
[
'score'
]
=
0
results
[
'score'
]
=
0
results
[
'correct'
]
=
False
results
[
'correct'
]
=
False
results
[
'success'
]
=
True
results
[
'success'
]
=
True
#Generate short form output--number of problem areas identified in feedback
#
Generate short form output--number of problem areas identified in feedback
#Add feedback to results if available
#
Add feedback to results if available
results
[
'feedback'
]
=
{}
results
[
'feedback'
]
=
{}
if
'topicality'
in
feedback
and
'prompt_overlap'
in
feedback
:
if
'topicality'
in
feedback
and
'prompt_overlap'
in
feedback
:
results
[
'feedback'
]
.
update
({
results
[
'feedback'
]
.
update
({
'topicality'
:
feedback
[
'topicality'
],
'topicality'
:
feedback
[
'topicality'
],
'prompt-overlap'
:
feedback
[
'prompt_overlap'
],
'prompt-overlap'
:
feedback
[
'prompt_overlap'
],
})
})
results
[
'feedback'
]
.
update
(
results
[
'feedback'
]
.
update
(
{
{
'spelling'
:
feedback
[
'spelling'
],
'spelling'
:
feedback
[
'spelling'
],
'grammar'
:
feedback
[
'grammar'
],
'grammar'
:
feedback
[
'grammar'
],
'markup-text'
:
feedback
[
'markup_text'
],
'markup-text'
:
feedback
[
'markup_text'
],
}
}
)
)
else
:
else
:
#If error, success is False.
#
If error, success is False.
results
[
'success'
]
=
False
results
[
'success'
]
=
False
return
results
return
results
def
grade_generic
(
grader_data
,
numeric_features
,
textual_features
):
def
grade_generic
(
grader_data
,
numeric_features
,
textual_features
):
"""
"""
Grades a set of numeric and textual features using a generic model
Grades a set of numeric and textual features using a generic model
...
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
...
@@ -116,34 +118,34 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
textual_features - list of textual feature to predict on
"""
"""
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'success'
:
False
,
'confidence'
:
0
}
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
has_error
=
False
#Try to find and load the model file
#
Try to find and load the model file
grader_set
=
predictor_set
.
PredictorSet
(
type
=
"test"
)
grader_set
=
predictor_set
.
PredictorSet
(
type
=
"test"
)
#Try to add essays to essay set object
#
Try to add essays to essay set object
try
:
try
:
grader_set
.
add_row
(
numeric_features
,
textual_features
,
0
)
grader_set
.
add_row
(
numeric_features
,
textual_features
,
0
)
except
:
except
:
results
[
'errors'
]
.
append
(
"Row could not be added to predictor set:{0} {1}"
.
format
(
numeric_features
,
textual_features
))
results
[
'errors'
]
.
append
(
"Row could not be added to predictor set:{0} {1}"
.
format
(
numeric_features
,
textual_features
))
has_error
=
True
has_error
=
True
#Try to extract features from submission and assign score via the model
#
Try to extract features from submission and assign score via the model
try
:
try
:
grader_feats
=
grader_data
[
'extractor'
]
.
gen_feats
(
grader_set
)
grader_feats
=
grader_data
[
'extractor'
]
.
gen_feats
(
grader_set
)
results
[
'score'
]
=
grader_data
[
'model'
]
.
predict
(
grader_feats
)[
0
]
results
[
'score'
]
=
grader_data
[
'model'
]
.
predict
(
grader_feats
)[
0
]
except
:
except
:
results
[
'errors'
]
.
append
(
"Could not extract features and score essay."
)
results
[
'errors'
]
.
append
(
"Could not extract features and score essay."
)
has_error
=
True
has_error
=
True
#Try to determine confidence level
#
Try to determine confidence level
try
:
try
:
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
grader_data
[
'model'
],
grader_feats
,
results
[
'score'
])
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
grader_data
[
'model'
],
grader_feats
,
results
[
'score'
])
except
:
except
:
#If there is an error getting confidence, it is not a show-stopper, so just log
#
If there is an error getting confidence, it is not a show-stopper, so just log
log
.
exception
(
"Problem generating confidence value"
)
log
.
exception
(
"Problem generating confidence value"
)
if
not
has_error
:
if
not
has_error
:
...
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
...
@@ -151,7 +153,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return
results
return
results
def
get_confidence_value
(
algorithm
,
model
,
grader_feats
,
score
,
scores
):
def
get_confidence_value
(
algorithm
,
model
,
grader_feats
,
score
,
scores
):
"""
"""
Determines a confidence in a certain score, given proper input parameters
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
algorithm- from util_functions.AlgorithmTypes
...
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
...
@@ -163,7 +166,7 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
max_score
=
max
(
numpy
.
asarray
(
scores
))
max_score
=
max
(
numpy
.
asarray
(
scores
))
if
algorithm
==
util_functions
.
AlgorithmTypes
.
classification
and
hasattr
(
model
,
"predict_proba"
):
if
algorithm
==
util_functions
.
AlgorithmTypes
.
classification
and
hasattr
(
model
,
"predict_proba"
):
#If classification, predict with probability, which gives you a matrix of confidences per score point
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence
=
model
.
predict_proba
(
grader_feats
)[
0
,(
float
(
score
)
-
float
(
min_score
))]
raw_confidence
=
model
.
predict_proba
(
grader_feats
)[
0
,
(
float
(
score
)
-
float
(
min_score
))]
#TODO: Normalize confidence somehow here
#TODO: Normalize confidence somehow here
confidence
=
raw_confidence
confidence
=
raw_confidence
elif
hasattr
(
model
,
"predict"
):
elif
hasattr
(
model
,
"predict"
):
...
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
...
@@ -173,4 +176,3 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
confidence
=
0
confidence
=
0
return
confidence
return
confidence
ease/model_creator.py
View file @
2e6cb8e5
#Provides interface functions to create and save models
#
Provides interface functions to create and save models
import
numpy
import
numpy
import
re
import
re
...
@@ -19,7 +19,8 @@ import feature_extractor
...
@@ -19,7 +19,8 @@ import feature_extractor
import
logging
import
logging
import
predictor_extractor
import
predictor_extractor
log
=
logging
.
getLogger
()
log
=
logging
.
getLogger
()
def
read_in_test_data
(
filename
):
def
read_in_test_data
(
filename
):
"""
"""
...
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
...
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
prompt_string
=
open
(
filename
)
.
read
()
prompt_string
=
open
(
filename
)
.
read
()
return
prompt_string
return
prompt_string
def
read_in_test_data_twocolumn
(
filename
,
sep
=
","
):
def
read_in_test_data_twocolumn
(
filename
,
sep
=
","
):
"""
"""
Reads in a two column version of the test data.
Reads in a two column version of the test data.
Filename must point to a delimited file.
Filename must point to a delimited file.
...
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
...
@@ -86,29 +88,31 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return
x
return
x
def
get_cv_error
(
clf
,
feats
,
scores
):
def
get_cv_error
(
clf
,
feats
,
scores
):
"""
"""
Gets cross validated error for a given classifier, set of features, and scores
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
clf - classifier
feats - features to feed into the classified and cross validate over
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
"""
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
try
:
try
:
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
err
=
numpy
.
mean
(
numpy
.
abs
(
numpy
.
array
(
cv_preds
)
-
scores
))
err
=
numpy
.
mean
(
numpy
.
abs
(
numpy
.
array
(
cv_preds
)
-
scores
))
kappa
=
util_functions
.
quadratic_weighted_kappa
(
list
(
cv_preds
),
scores
)
kappa
=
util_functions
.
quadratic_weighted_kappa
(
list
(
cv_preds
),
scores
)
results
[
'mae'
]
=
err
results
[
'mae'
]
=
err
results
[
'kappa'
]
=
kappa
results
[
'kappa'
]
=
kappa
results
[
'success'
]
=
True
results
[
'success'
]
=
True
except
ValueError
:
except
ValueError
:
#If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
#
If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
log
.
exception
(
"Not enough classes (0,1,etc) in each cross validation fold."
)
log
.
exception
(
"Not enough classes (0,1,etc) in each cross validation fold."
)
except
:
except
:
log
.
exception
(
"Error getting cv error estimates."
)
log
.
exception
(
"Error getting cv error estimates."
)
return
results
return
results
def
get_algorithms
(
type
):
def
get_algorithms
(
type
):
"""
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
...
@@ -116,14 +120,14 @@ def get_algorithms(type):
...
@@ -116,14 +120,14 @@ def get_algorithms(type):
"""
"""
if
type
==
util_functions
.
AlgorithmTypes
.
classification
:
if
type
==
util_functions
.
AlgorithmTypes
.
classification
:
clf
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
clf
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
clf2
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
else
:
else
:
clf
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
clf
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
clf2
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
return
clf
,
clf2
return
clf
,
clf2
...
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
...
@@ -141,16 +145,16 @@ def extract_features_and_generate_model_predictors(predictor_set, type=util_func
train_feats
=
f
.
gen_feats
(
predictor_set
)
train_feats
=
f
.
gen_feats
(
predictor_set
)
clf
,
clf2
=
get_algorithms
(
type
)
clf
,
clf2
=
get_algorithms
(
type
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
try
:
try
:
set_score
=
numpy
.
asarray
(
predictor_set
.
_target
,
dtype
=
numpy
.
int
)
set_score
=
numpy
.
asarray
(
predictor_set
.
_target
,
dtype
=
numpy
.
int
)
clf
.
fit
(
train_feats
,
set_score
)
clf
.
fit
(
train_feats
,
set_score
)
except
ValueError
:
except
ValueError
:
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
set_score
[
0
]
=
1
set_score
[
0
]
=
1
set_score
[
1
]
=
0
set_score
[
1
]
=
0
clf
.
fit
(
train_feats
,
set_score
)
clf
.
fit
(
train_feats
,
set_score
)
return
f
,
clf
,
cv_error_results
return
f
,
clf
,
cv_error_results
...
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
...
@@ -170,25 +174,26 @@ def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTyp
train_feats
=
f
.
gen_feats
(
essays
)
train_feats
=
f
.
gen_feats
(
essays
)
set_score
=
numpy
.
asarray
(
essays
.
_score
,
dtype
=
numpy
.
int
)
set_score
=
numpy
.
asarray
(
essays
.
_score
,
dtype
=
numpy
.
int
)
if
len
(
util_functions
.
f7
(
list
(
set_score
)))
>
5
:
if
len
(
util_functions
.
f7
(
list
(
set_score
)))
>
5
:
type
=
util_functions
.
AlgorithmTypes
.
regression
type
=
util_functions
.
AlgorithmTypes
.
regression
else
:
else
:
type
=
util_functions
.
AlgorithmTypes
.
classification
type
=
util_functions
.
AlgorithmTypes
.
classification
clf
,
clf2
=
get_algorithms
(
type
)
clf
,
clf2
=
get_algorithms
(
type
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
essays
.
_score
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
essays
.
_score
)
try
:
try
:
clf
.
fit
(
train_feats
,
set_score
)
clf
.
fit
(
train_feats
,
set_score
)
except
ValueError
:
except
ValueError
:
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
set_score
[
0
]
=
1
set_score
[
0
]
=
1
set_score
[
1
]
=
0
set_score
[
1
]
=
0
clf
.
fit
(
train_feats
,
set_score
)
clf
.
fit
(
train_feats
,
set_score
)
return
f
,
clf
,
cv_error_results
return
f
,
clf
,
cv_error_results
def
dump_model_to_file
(
prompt_string
,
feature_ext
,
classifier
,
text
,
score
,
model_path
):
def
dump_model_to_file
(
prompt_string
,
feature_ext
,
classifier
,
text
,
score
,
model_path
):
"""
"""
Writes out a model to a file.
Writes out a model to a file.
...
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
...
@@ -197,16 +202,15 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
classifier is a trained classifier
classifier is a trained classifier
model_path is the path of write out the model file to
model_path is the path of write out the model file to
"""
"""
model_file
=
{
'prompt'
:
prompt_string
,
'extractor'
:
feature_ext
,
'model'
:
classifier
,
'text'
:
text
,
'score'
:
score
}
model_file
=
{
'prompt'
:
prompt_string
,
'extractor'
:
feature_ext
,
'model'
:
classifier
,
'text'
:
text
,
'score'
:
score
}
pickle
.
dump
(
model_file
,
file
=
open
(
model_path
,
"w"
))
pickle
.
dump
(
model_file
,
file
=
open
(
model_path
,
"w"
))
def
create_essay_set_and_dump_model
(
text
,
score
,
prompt
,
model_path
,
additional_array
=
None
):
def
create_essay_set_and_dump_model
(
text
,
score
,
prompt
,
model_path
,
additional_array
=
None
):
"""
"""
Function that creates essay set, extracts features, and writes out model
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
See above functions for argument descriptions
"""
"""
essay_set
=
create_essay_set
(
text_score
,
prompt
)
essay_set
=
create_essay_set
(
text_score
,
prompt
)
feature_ext
,
clf
=
extract_features_and_generate_model
(
essay_set
,
additional_array
)
feature_ext
,
clf
=
extract_features_and_generate_model
(
essay_set
,
additional_array
)
dump_model_to_file
(
prompt
,
feature_ext
,
clf
,
model_path
)
dump_model_to_file
(
prompt
,
feature_ext
,
clf
,
model_path
)
ease/predictor_extractor.py
View file @
2e6cb8e5
...
@@ -16,17 +16,18 @@ import logging
...
@@ -16,17 +16,18 @@ import logging
import
math
import
math
from
feature_extractor
import
FeatureExtractor
from
feature_extractor
import
FeatureExtractor
#Append to path and then import things that depend on path
#
Append to path and then import things that depend on path
base_path
=
os
.
path
.
dirname
(
__file__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
sys
.
path
.
append
(
base_path
)
from
essay_set
import
EssaySet
from
essay_set
import
EssaySet
import
util_functions
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
class
PredictorExtractor
(
object
):
class
PredictorExtractor
(
object
):
def
__init__
(
self
):
def
__init__
(
self
):
self
.
_extractors
=
[]
self
.
_extractors
=
[]
...
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
...
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
div_length
=
len
(
p_set
.
_essay_sets
)
div_length
=
len
(
p_set
.
_essay_sets
)
if
div_length
==
0
:
if
div_length
==
0
:
div_length
=
1
div_length
=
1
#Ensures that even with a large amount of input textual features, training time stays reasonable
#
Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
self
.
_extractors
.
append
(
FeatureExtractor
())
self
.
_extractors
.
append
(
FeatureExtractor
())
self
.
_extractors
[
i
]
.
initialize_dictionaries
(
p_set
.
_essay_sets
[
i
],
max_feats2
=
max_feats2
)
self
.
_extractors
[
i
]
.
initialize_dictionaries
(
p_set
.
_essay_sets
[
i
],
max_feats2
=
max_feats2
)
self
.
_initialized
=
True
self
.
_initialized
=
True
...
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
...
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
Generates features based on an iput p_set
Generates features based on an iput p_set
p_set - PredictorSet
p_set - PredictorSet
"""
"""
if
self
.
_initialized
!=
True
:
if
self
.
_initialized
!=
True
:
error_message
=
"Dictionaries have not been initialized."
error_message
=
"Dictionaries have not been initialized."
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
textual_features
=
[]
textual_features
=
[]
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
textual_features
.
append
(
self
.
_extractors
[
i
]
.
gen_feats
(
p_set
.
_essay_sets
[
i
]))
textual_features
.
append
(
self
.
_extractors
[
i
]
.
gen_feats
(
p_set
.
_essay_sets
[
i
]))
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
...
...
ease/predictor_set.py
View file @
2e6cb8e5
...
@@ -11,12 +11,13 @@ sys.path.append(base_path)
...
@@ -11,12 +11,13 @@ sys.path.append(base_path)
import
util_functions
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
class
PredictorSet
(
object
):
class
PredictorSet
(
object
):
def
__init__
(
self
,
type
=
"train"
):
def
__init__
(
self
,
type
=
"train"
):
"""
"""
Initialize variables and check essay set type
Initialize variables and check essay set type
"""
"""
...
@@ -24,13 +25,13 @@ class PredictorSet(object):
...
@@ -24,13 +25,13 @@ class PredictorSet(object):
type
=
"train"
type
=
"train"
self
.
_type
=
type
self
.
_type
=
type
self
.
_target
=
[]
self
.
_target
=
[]
self
.
_textual_features
=
[]
self
.
_textual_features
=
[]
self
.
_numeric_features
=
[]
self
.
_numeric_features
=
[]
self
.
_essay_sets
=
[]
self
.
_essay_sets
=
[]
def
add_row
(
self
,
numeric_features
,
textual_features
,
target
):
def
add_row
(
self
,
numeric_features
,
textual_features
,
target
):
#Basic input checking
#
Basic input checking
if
not
isinstance
(
target
,
(
int
,
long
,
float
)):
if
not
isinstance
(
target
,
(
int
,
long
,
float
)):
error_message
=
"Target is not a numeric value."
error_message
=
"Target is not a numeric value."
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
...
@@ -46,8 +47,8 @@ class PredictorSet(object):
...
@@ -46,8 +47,8 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
#Do some length checking for parameters
#
Do some length checking for parameters
if
len
(
self
.
_numeric_features
)
>
0
:
if
len
(
self
.
_numeric_features
)
>
0
:
numeric_length
=
len
(
self
.
_numeric_features
[
-
1
])
numeric_length
=
len
(
self
.
_numeric_features
[
-
1
])
current_numeric_length
=
len
(
numeric_features
)
current_numeric_length
=
len
(
numeric_features
)
if
numeric_length
!=
current_numeric_length
:
if
numeric_length
!=
current_numeric_length
:
...
@@ -55,7 +56,7 @@ class PredictorSet(object):
...
@@ -55,7 +56,7 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
if
len
(
self
.
_textual_features
)
>
0
:
if
len
(
self
.
_textual_features
)
>
0
:
textual_length
=
len
(
self
.
_textual_features
[
-
1
])
textual_length
=
len
(
self
.
_textual_features
[
-
1
])
current_textual_length
=
len
(
textual_features
)
current_textual_length
=
len
(
textual_features
)
if
textual_length
!=
current_textual_length
:
if
textual_length
!=
current_textual_length
:
...
@@ -63,9 +64,9 @@ class PredictorSet(object):
...
@@ -63,9 +64,9 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
#Now check to see if text features and numeric features are individually correct
#
Now check to see if text features and numeric features are individually correct
for
i
in
xrange
(
0
,
len
(
numeric_features
)):
for
i
in
xrange
(
0
,
len
(
numeric_features
)):
try
:
try
:
numeric_features
[
i
]
=
float
(
numeric_features
[
i
])
numeric_features
[
i
]
=
float
(
numeric_features
[
i
])
except
:
except
:
...
@@ -73,8 +74,7 @@ class PredictorSet(object):
...
@@ -73,8 +74,7 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
try
:
try
:
textual_features
[
i
]
=
str
(
textual_features
[
i
]
.
encode
(
'ascii'
,
'ignore'
))
textual_features
[
i
]
=
str
(
textual_features
[
i
]
.
encode
(
'ascii'
,
'ignore'
))
except
:
except
:
...
@@ -82,19 +82,18 @@ class PredictorSet(object):
...
@@ -82,19 +82,18 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
#Create essay sets for textual features if needed
#
Create essay sets for textual features if needed
if
len
(
self
.
_textual_features
)
==
0
:
if
len
(
self
.
_textual_features
)
==
0
:
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
type
=
self
.
_type
))
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
type
=
self
.
_type
))
#Add numeric and textual features
#
Add numeric and textual features
self
.
_numeric_features
.
append
(
numeric_features
)
self
.
_numeric_features
.
append
(
numeric_features
)
self
.
_textual_features
.
append
(
textual_features
)
self
.
_textual_features
.
append
(
textual_features
)
#Add targets
#
Add targets
self
.
_target
.
append
(
target
)
self
.
_target
.
append
(
target
)
#Add textual features to essay sets
#
Add textual features to essay sets
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
[
i
]
.
add_essay
(
textual_features
[
i
],
target
)
self
.
_essay_sets
[
i
]
.
add_essay
(
textual_features
[
i
],
target
)
ease/tests/test_model_accuracy.py
View file @
2e6cb8e5
...
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
...
@@ -13,6 +13,7 @@ CHARACTER_LIMIT = 1000
TRAINING_LIMIT
=
100
TRAINING_LIMIT
=
100
QUICK_TEST_LIMIT
=
5
QUICK_TEST_LIMIT
=
5
class
DataLoader
():
class
DataLoader
():
def
load_text_files
(
self
,
pathname
):
def
load_text_files
(
self
,
pathname
):
filenames
=
os
.
listdir
(
pathname
)
filenames
=
os
.
listdir
(
pathname
)
...
@@ -28,34 +29,36 @@ class DataLoader():
...
@@ -28,34 +29,36 @@ class DataLoader():
"""
"""
pass
pass
class
PolarityLoader
(
DataLoader
):
class
PolarityLoader
(
DataLoader
):
def
__init__
(
self
,
pathname
):
def
__init__
(
self
,
pathname
):
self
.
pathname
=
pathname
self
.
pathname
=
pathname
def
load_data
(
self
):
def
load_data
(
self
):
filenames
=
os
.
listdir
(
self
.
pathname
)
filenames
=
os
.
listdir
(
self
.
pathname
)
directories
=
[
os
.
path
.
abspath
(
os
.
path
.
join
(
self
.
pathname
,
f
))
for
f
in
filenames
if
not
os
.
path
.
isfile
(
os
.
path
.
join
(
self
.
pathname
,
f
))
and
f
in
[
"neg"
,
"pos"
]]
directories
=
[
os
.
path
.
abspath
(
os
.
path
.
join
(
self
.
pathname
,
f
))
for
f
in
filenames
if
not
os
.
path
.
isfile
(
os
.
path
.
join
(
self
.
pathname
,
f
))
and
f
in
[
"neg"
,
"pos"
]]
#Sort so neg is first
#
Sort so neg is first
directories
.
sort
()
directories
.
sort
()
#We need to have both a positive and a negative folder to classify
#
We need to have both a positive and a negative folder to classify
if
len
(
directories
)
!=
2
:
if
len
(
directories
)
!=
2
:
raise
Exception
(
"Need a pos and a neg directory in {0}"
.
format
(
self
.
pathname
))
raise
Exception
(
"Need a pos and a neg directory in {0}"
.
format
(
self
.
pathname
))
neg
=
self
.
load_text_files
(
directories
[
0
])
neg
=
self
.
load_text_files
(
directories
[
0
])
pos
=
self
.
load_text_files
(
directories
[
1
])
pos
=
self
.
load_text_files
(
directories
[
1
])
scores
=
[
0
for
i
in
xrange
(
0
,
len
(
neg
))]
+
[
1
for
i
in
xrange
(
0
,
len
(
pos
))]
scores
=
[
0
for
i
in
xrange
(
0
,
len
(
neg
))]
+
[
1
for
i
in
xrange
(
0
,
len
(
pos
))]
text
=
neg
+
pos
text
=
neg
+
pos
return
scores
,
text
return
scores
,
text
class
ModelCreator
():
class
ModelCreator
():
def
__init__
(
self
,
scores
,
text
):
def
__init__
(
self
,
scores
,
text
):
self
.
scores
=
scores
self
.
scores
=
scores
self
.
text
=
text
self
.
text
=
text
#Governs which creation function in the ease.create module to use. See module for info.
#
Governs which creation function in the ease.create module to use. See module for info.
if
isinstance
(
text
[
0
],
basestring
):
if
isinstance
(
text
[
0
],
basestring
):
self
.
create_model_generic
=
False
self
.
create_model_generic
=
False
else
:
else
:
...
@@ -67,6 +70,7 @@ class ModelCreator():
...
@@ -67,6 +70,7 @@ class ModelCreator():
else
:
else
:
return
create
.
create_generic
(
self
.
text
.
get
(
'numeric_values'
,
[]),
self
.
text
.
get
(
'textual_values'
,
[]),
self
.
scores
)
return
create
.
create_generic
(
self
.
text
.
get
(
'numeric_values'
,
[]),
self
.
text
.
get
(
'textual_values'
,
[]),
self
.
scores
)
class
Grader
():
class
Grader
():
def
__init__
(
self
,
model_data
):
def
__init__
(
self
,
model_data
):
self
.
model_data
=
model_data
self
.
model_data
=
model_data
...
@@ -77,6 +81,7 @@ class Grader():
...
@@ -77,6 +81,7 @@ class Grader():
else
:
else
:
return
grade
.
grade_generic
(
self
.
model_data
,
submission
.
get
(
'numeric_features'
,
[]),
submission
.
get
(
'textual_features'
,
[]))
return
grade
.
grade_generic
(
self
.
model_data
,
submission
.
get
(
'numeric_features'
,
[]),
submission
.
get
(
'textual_features'
,
[]))
class
GenericTest
(
object
):
class
GenericTest
(
object
):
loader
=
DataLoader
loader
=
DataLoader
data_path
=
""
data_path
=
""
...
@@ -87,11 +92,11 @@ class GenericTest(object):
...
@@ -87,11 +92,11 @@ class GenericTest(object):
data_loader
=
self
.
loader
(
os
.
path
.
join
(
TEST_PATH
,
self
.
data_path
))
data_loader
=
self
.
loader
(
os
.
path
.
join
(
TEST_PATH
,
self
.
data_path
))
scores
,
text
=
data_loader
.
load_data
()
scores
,
text
=
data_loader
.
load_data
()
#Shuffle to mix up the classes, set seed to make it repeatable
#
Shuffle to mix up the classes, set seed to make it repeatable
random
.
seed
(
1
)
random
.
seed
(
1
)
shuffled_scores
=
[]
shuffled_scores
=
[]
shuffled_text
=
[]
shuffled_text
=
[]
indices
=
[
i
for
i
in
xrange
(
0
,
len
(
scores
))]
indices
=
[
i
for
i
in
xrange
(
0
,
len
(
scores
))]
random
.
shuffle
(
indices
)
random
.
shuffle
(
indices
)
for
i
in
indices
:
for
i
in
indices
:
shuffled_scores
.
append
(
scores
[
i
])
shuffled_scores
.
append
(
scores
[
i
])
...
@@ -121,12 +126,13 @@ class GenericTest(object):
...
@@ -121,12 +126,13 @@ class GenericTest(object):
self
.
assertGreaterEqual
(
cv_kappa
,
self
.
expected_kappa_min
)
self
.
assertGreaterEqual
(
cv_kappa
,
self
.
expected_kappa_min
)
self
.
assertLessEqual
(
cv_mae
,
self
.
expected_mae_max
)
self
.
assertLessEqual
(
cv_mae
,
self
.
expected_mae_max
)
class
PolarityTest
(
unittest
.
TestCase
,
GenericTest
):
class
PolarityTest
(
unittest
.
TestCase
,
GenericTest
):
loader
=
PolarityLoader
loader
=
PolarityLoader
data_path
=
"data/polarity"
data_path
=
"data/polarity"
#These will increase if we allow more data in.
#
These will increase if we allow more data in.
#I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
#
I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
expected_kappa_min
=
-.
2
expected_kappa_min
=
-.
2
expected_mae_max
=
1
expected_mae_max
=
1
...
...
ease/util_functions.py
View file @
2e6cb8e5
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#
Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
#
Requires aspell to be installed and added to the path
from
external_code.fisher
import
fisher
from
external_code.fisher
import
fisher
aspell_path
=
"aspell"
aspell_path
=
"aspell"
...
@@ -14,17 +14,18 @@ import pickle
...
@@ -14,17 +14,18 @@ import pickle
import
logging
import
logging
import
sys
import
sys
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
sys
.
path
.
append
(
base_path
)
if
not
base_path
.
endswith
(
"/"
):
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
#Paths to needed data files
#
Paths to needed data files
ESSAY_CORPUS_PATH
=
base_path
+
"data/essaycorpus.txt"
ESSAY_CORPUS_PATH
=
base_path
+
"data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH
=
base_path
+
"data/essay_cor_tokens.p"
ESSAY_COR_TOKENS_PATH
=
base_path
+
"data/essay_cor_tokens.p"
class
AlgorithmTypes
(
object
):
class
AlgorithmTypes
(
object
):
"""
"""
Defines what types of algorithm can be used
Defines what types of algorithm can be used
...
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
...
@@ -32,20 +33,22 @@ class AlgorithmTypes(object):
regression
=
"regression"
regression
=
"regression"
classification
=
"classifiction"
classification
=
"classifiction"
def
create_model_path
(
model_path
):
def
create_model_path
(
model_path
):
"""
"""
Creates a path to model files
Creates a path to model files
model_path - string
model_path - string
"""
"""
if
not
model_path
.
startswith
(
"/"
)
and
not
model_path
.
startswith
(
"models/"
):
if
not
model_path
.
startswith
(
"/"
)
and
not
model_path
.
startswith
(
"models/"
):
model_path
=
"/"
+
model_path
model_path
=
"/"
+
model_path
if
not
model_path
.
startswith
(
"models"
):
if
not
model_path
.
startswith
(
"models"
):
model_path
=
"models"
+
model_path
model_path
=
"models"
+
model_path
if
not
model_path
.
endswith
(
".p"
):
if
not
model_path
.
endswith
(
".p"
):
model_path
+=
".p"
model_path
+=
".p"
return
model_path
return
model_path
def
sub_chars
(
string
):
def
sub_chars
(
string
):
"""
"""
Strips illegal characters from a string. Used to sanitize input essays.
Strips illegal characters from a string. Used to sanitize input essays.
...
@@ -53,7 +56,7 @@ def sub_chars(string):
...
@@ -53,7 +56,7 @@ def sub_chars(string):
Returns sanitized string.
Returns sanitized string.
string - string
string - string
"""
"""
#Define replacement patterns
#
Define replacement patterns
sub_pat
=
r"[^A-Za-z\.\?!,';:]"
sub_pat
=
r"[^A-Za-z\.\?!,';:]"
char_pat
=
r"\."
char_pat
=
r"\."
com_pat
=
r","
com_pat
=
r","
...
@@ -63,9 +66,9 @@ def sub_chars(string):
...
@@ -63,9 +66,9 @@ def sub_chars(string):
col_pat
=
r":"
col_pat
=
r":"
whitespace_pat
=
r"\s{1,}"
whitespace_pat
=
r"\s{1,}"
#Replace text. Ordering is very important!
#
Replace text. Ordering is very important!
nstring
=
re
.
sub
(
sub_pat
,
" "
,
string
)
nstring
=
re
.
sub
(
sub_pat
,
" "
,
string
)
nstring
=
re
.
sub
(
char_pat
,
" ."
,
nstring
)
nstring
=
re
.
sub
(
char_pat
,
" ."
,
nstring
)
nstring
=
re
.
sub
(
com_pat
,
" ,"
,
nstring
)
nstring
=
re
.
sub
(
com_pat
,
" ,"
,
nstring
)
nstring
=
re
.
sub
(
ques_pat
,
" ?"
,
nstring
)
nstring
=
re
.
sub
(
ques_pat
,
" ?"
,
nstring
)
nstring
=
re
.
sub
(
excl_pat
,
" !"
,
nstring
)
nstring
=
re
.
sub
(
excl_pat
,
" !"
,
nstring
)
...
@@ -84,7 +87,7 @@ def spell_correct(string):
...
@@ -84,7 +87,7 @@ def spell_correct(string):
string - string
string - string
"""
"""
#Create a temp file so that aspell could be used
#
Create a temp file so that aspell could be used
f
=
open
(
'tmpfile'
,
'w'
)
f
=
open
(
'tmpfile'
,
'w'
)
f
.
write
(
string
)
f
.
write
(
string
)
f_path
=
os
.
path
.
abspath
(
f
.
name
)
f_path
=
os
.
path
.
abspath
(
f
.
name
)
...
@@ -93,16 +96,16 @@ def spell_correct(string):
...
@@ -93,16 +96,16 @@ def spell_correct(string):
p
=
os
.
popen
(
aspell_path
+
" -a < "
+
f_path
+
" --sug-mode=ultra"
)
p
=
os
.
popen
(
aspell_path
+
" -a < "
+
f_path
+
" --sug-mode=ultra"
)
except
:
except
:
log
.
exception
(
"Could not find aspell, so could not spell correct!"
)
log
.
exception
(
"Could not find aspell, so could not spell correct!"
)
#Return original string if aspell fails
#
Return original string if aspell fails
return
string
,
0
,
string
return
string
,
0
,
string
#Aspell returns a list of incorrect words with the above flags
#
Aspell returns a list of incorrect words with the above flags
incorrect
=
p
.
readlines
()
incorrect
=
p
.
readlines
()
p
.
close
()
p
.
close
()
incorrect_words
=
list
()
incorrect_words
=
list
()
correct_spelling
=
list
()
correct_spelling
=
list
()
for
i
in
range
(
1
,
len
(
incorrect
)):
for
i
in
range
(
1
,
len
(
incorrect
)):
if
(
len
(
incorrect
[
i
])
>
10
):
if
(
len
(
incorrect
[
i
])
>
10
):
#Reformat aspell output to make sense
#
Reformat aspell output to make sense
match
=
re
.
search
(
":"
,
incorrect
[
i
])
match
=
re
.
search
(
":"
,
incorrect
[
i
])
if
hasattr
(
match
,
"start"
):
if
hasattr
(
match
,
"start"
):
begstring
=
incorrect
[
i
][
2
:
match
.
start
()]
begstring
=
incorrect
[
i
][
2
:
match
.
start
()]
...
@@ -117,19 +120,19 @@ def spell_correct(string):
...
@@ -117,19 +120,19 @@ def spell_correct(string):
incorrect_words
.
append
(
begword
)
incorrect_words
.
append
(
begword
)
correct_spelling
.
append
(
sug
)
correct_spelling
.
append
(
sug
)
#Create markup based on spelling errors
#
Create markup based on spelling errors
newstring
=
string
newstring
=
string
markup_string
=
string
markup_string
=
string
already_subbed
=
[]
already_subbed
=
[]
for
i
in
range
(
0
,
len
(
incorrect_words
)):
for
i
in
range
(
0
,
len
(
incorrect_words
)):
sub_pat
=
r"\b"
+
incorrect_words
[
i
]
+
r"\b"
sub_pat
=
r"\b"
+
incorrect_words
[
i
]
+
r"\b"
sub_comp
=
re
.
compile
(
sub_pat
)
sub_comp
=
re
.
compile
(
sub_pat
)
newstring
=
re
.
sub
(
sub_comp
,
correct_spelling
[
i
],
newstring
)
newstring
=
re
.
sub
(
sub_comp
,
correct_spelling
[
i
],
newstring
)
if
incorrect_words
[
i
]
not
in
already_subbed
:
if
incorrect_words
[
i
]
not
in
already_subbed
:
markup_string
=
re
.
sub
(
sub_comp
,
'<bs>'
+
incorrect_words
[
i
]
+
"</bs>"
,
markup_string
)
markup_string
=
re
.
sub
(
sub_comp
,
'<bs>'
+
incorrect_words
[
i
]
+
"</bs>"
,
markup_string
)
already_subbed
.
append
(
incorrect_words
[
i
])
already_subbed
.
append
(
incorrect_words
[
i
])
return
newstring
,
len
(
incorrect_words
),
markup_string
return
newstring
,
len
(
incorrect_words
),
markup_string
def
ngrams
(
tokens
,
min_n
,
max_n
):
def
ngrams
(
tokens
,
min_n
,
max_n
):
...
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
...
@@ -192,7 +195,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
Returns a list of words that constitute the significant vocabulary
"""
"""
dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
max_features
=
max_feats
)
dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
max_features
=
max_feats
)
dict_mat
=
dict
.
fit_transform
(
text
)
dict_mat
=
dict
.
fit_transform
(
text
)
set_score
=
numpy
.
asarray
(
score
,
dtype
=
numpy
.
int
)
set_score
=
numpy
.
asarray
(
score
,
dtype
=
numpy
.
int
)
med_score
=
numpy
.
median
(
set_score
)
med_score
=
numpy
.
median
(
set_score
)
...
@@ -335,6 +338,7 @@ def calc_list_average(l):
...
@@ -335,6 +338,7 @@ def calc_list_average(l):
stdev
=
lambda
d
:
(
sum
((
x
-
1.
*
sum
(
d
)
/
len
(
d
))
**
2
for
x
in
d
)
/
(
1.
*
(
len
(
d
)
-
1
)))
**
.
5
stdev
=
lambda
d
:
(
sum
((
x
-
1.
*
sum
(
d
)
/
len
(
d
))
**
2
for
x
in
d
)
/
(
1.
*
(
len
(
d
)
-
1
)))
**
.
5
def
quadratic_weighted_kappa
(
rater_a
,
rater_b
,
min_rating
=
None
,
max_rating
=
None
):
def
quadratic_weighted_kappa
(
rater_a
,
rater_b
,
min_rating
=
None
,
max_rating
=
None
):
"""
"""
Calculates kappa correlation between rater_a and rater_b.
Calculates kappa correlation between rater_a and rater_b.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment