edx / ease

Commit 31d4363d
authored Jun 13, 2014 by gradyward
parent 9c16fbbe

Stylistic cleanup

Showing 8 changed files with 107 additions and 83 deletions (+107 -83):
ease/create.py                       +8   -5
ease/errors.py                       +2   -0
ease/essay_set.py                    +11  -10
ease/feature_extractor.py            +17  -15
ease/grade.py                        +2   -2
ease/tests/test_model_accuracy.py    +55  -38
ease/tests/test_spellcheck.py        +1   -1
ease/util_functions.py               +11  -12
ease/create.py

@@ -3,10 +3,13 @@ Functions that create a machine learning model from training data
 """
 import os
-import sys
 import logging
 import numpy
+import sys

 # Constructs a log
 log = logging.getLogger(__name__)

 # Setup base path so that we can import modules who are dependent on it
@@ -15,7 +18,7 @@ sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)

-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import util_functions
 from errors import *
 from datetime import datetime
@@ -179,11 +182,11 @@ def _extract_features_and_generate_model(essay_set):
     # We cannot be sure what kind of errors .fit could throw at us. Memory, Type, Interrupt, etc.
     except Exception as ex:
-        str = (
+        msg = (
             "predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}"
         ).format(ex)
-        log.exception(str)
-        raise ClassifierTrainingInternalError(str)
+        log.exception(msg)
+        raise ClassifierTrainingInternalError(msg)

     return feat_extractor, predict_classifier, cv_error_results
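The str to msg rename above is the substantive part of the last hunk: the old code rebound the builtin name str inside the except block, so any later str(...) call in that scope would fail. A minimal standalone sketch of the failure mode the rename avoids (illustrative, not code from this commit):

    # Rebinding the builtin name breaks later conversions in the same scope.
    def report(code):
        str = "error code: {}".format(code)    # shadows the builtin str()
        return str(code)                       # TypeError: 'str' object is not callable

    def report_fixed(code):
        msg = "error code: {}".format(code)    # builtin str() stays usable
        return msg + " / " + str(code)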
ease/errors.py

@@ -2,6 +2,7 @@
 Errors for the EASE repository
 """
+
 class EaseError(Exception):
     pass
@@ -45,6 +46,7 @@ class InputError(EaseError):
     """
    The user supplied an argument which was incorrect.
     """
+
     def __init__(self, expr, msg):
         self.expr = expr
         self.msg = msg
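InputError keeps both the offending expression and a human-readable message on the instance. A hedged usage sketch of how a caller might raise and inspect it (the check_score helper and its message are hypothetical, not part of this commit):

    # Hypothetical call site for the InputError defined in ease/errors.py.
    from ease.errors import InputError

    def check_score(score):
        if not isinstance(score, int):
            raise InputError(expr=score, msg="score must be an integer")

    try:
        check_score("3")
    except InputError as err:
        # Both attributes assigned in InputError.__init__ are available to the caller.
        details = "{0}: {1!r}".format(err.msg, err.expr)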
ease/essay_set.py

@@ -3,20 +3,21 @@ Defines an essay set object, which encapsulates essays from training and test se
 Performs spell and grammar checking, tokenization, and stemming.
 """
-import nltk
-import sys
 import random
 import os
 import logging
+from ease.errors import InputError
+import nltk
+import sys
 from errors import *

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -97,9 +98,9 @@ class EssaySet(object):
         try:
             essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
         except UnicodeError as ex:
-            str = "Could not parse essay text into ascii: {}".format(ex)
-            log.exception(str)
-            raise EssaySetRequestError(ex)
+            msg = "Could not parse essay text into ascii: {}".format(ex)
+            log.exception(msg)
+            raise EssaySetRequestError(msg)

         # Validates that score is an integer and essay_text is a string.
         try:
@@ -107,9 +108,9 @@ class EssaySet(object):
             essay_text = str(essay_text)
             essay_generated = int(essay_generated)
         except TypeError:
-            str = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
-            log.exception(str)
-            raise EssaySetRequestError(str)
+            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
+            log.exception(ex)
+            raise EssaySetRequestError(ex)

         # Validates that essay generated is 0 or 1
         if essay_generated != 0 and essay_generated != 1:
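The normalization kept unchanged in the first EssaySet hunk, (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore'), is Python 2 byte-string handling: decode the incoming bytes as UTF-8 (replacing malformed sequences), then drop every character outside ASCII. A small standalone sketch of that behaviour (Python 2, illustrative data, not from the commit):

    # -*- coding: utf-8 -*-
    # Python 2: UTF-8 encoded bytes in, ASCII-only bytes out.
    raw = 'caf\xc3\xa9 r\xc3\xa9sum\xc3\xa9'   # UTF-8 bytes for "cafe/resume" with accents
    cleaned = (raw.decode('utf-8', 'replace')).encode('ascii', 'ignore')
    assert cleaned == 'caf rsum'               # non-ASCII characters are silently dropped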
ease/feature_extractor.py

@@ -2,24 +2,26 @@
 Extracts features from training set and test set essays
 """
-import numpy
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 import os
 from itertools import chain
 import operator
 import logging
+import numpy
+import nltk
+import sys
+from sklearn.feature_extraction.text import CountVectorizer
 from errors import *

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-from essay_set import EssaySet
 import util_functions
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"
 log = logging.getLogger(__name__)
@@ -79,9 +81,8 @@ class FeatureExtractor(object):
                 sum([len(essay) for essay in essay_set._cleaned_essays]))
         # Gets the number and positions of grammar errors
         good_pos_tags, bad_pos_positions = self._get_grammar_errors(
             essay_set._pos_tags,
-            essay_set._cleaned_essays, essay_set._tokens
+            essay_set._cleaned_essays
         )
         # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
         # grammatical errors. Though this was not what the original author used, it is clearly what his code
         # implies, as if this is intended to be a true "grammar errors per character", we should have that
@@ -154,7 +155,7 @@ class FeatureExtractor(object):
         # SEE COMMENT AROUND LINE 85
         good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags,
-            essay_set._cleaned_essays, essay_set._tokens)
+            essay_set._cleaned_essays)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]

         length_array = numpy.array((
@@ -204,7 +205,7 @@ class FeatureExtractor(object):
         prompt_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             prompt_overlap.append(len([i for i in j if i in prompt_toks]))
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
@@ -212,7 +213,7 @@ class FeatureExtractor(object):
         expand_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             expand_overlap.append(len([i for i in j if i in expand_syns]))
             expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -221,7 +222,7 @@ class FeatureExtractor(object):
         return prompt_arr.copy()

-    def _get_grammar_errors(self, pos, essays, tokens):
+    def _get_grammar_errors(self, pos, essays):
         """
         Internal function to get the number of grammar errors in given text
@@ -251,7 +252,7 @@ class FeatureExtractor(object):
             start, end = bad_pos_tuples[m]
             for j in xrange(m + 1, len(bad_pos_tuples)):
                 lstart, lend = bad_pos_tuples[j]
-                if lstart >= start and lstart <= end:
+                if start <= lstart <= end:
                     bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                     to_delete.append(j)
@@ -268,7 +269,8 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions

-    def _get_good_pos_ngrams(self):
+    @staticmethod
+    def _get_good_pos_ngrams():
         """
         Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
         Returns the list and caches the file
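The if start <= lstart <= end: form introduced in _get_grammar_errors is a chained comparison; it evaluates exactly like lstart >= start and lstart <= end but reads as the interval test it is. A standalone sketch of the same overlap-merging idea applied to (start, end) tuples (illustrative, not the commit's code):

    # Merge spans whose start falls inside the previous span, using a chained
    # comparison for the overlap test (same idea as the bad_pos_tuples loop).
    def merge_overlaps(spans):
        merged = []
        for start, end in sorted(spans):
            if merged and merged[-1][0] <= start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)   # extend the previous span
            else:
                merged.append([start, end])
        return merged

    assert merge_overlaps([(1, 4), (3, 6), (8, 9)]) == [[1, 6], [8, 9]]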
ease/grade.py

@@ -8,11 +8,12 @@ import logging
 import sys

 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 from errors import *
@@ -45,7 +46,6 @@ def grade(grader_data, submission):
     # Instantiates the Essay set which will carry our essay while it is being classified and graded.
     grader_set = EssaySet(essay_type="test")
-    feedback = {}

     # Retrieves the model and extractor we will be using
     model, extractor = _get_classifier_and_extractor(grader_data)
ease/tests/test_model_accuracy.py

 import unittest
 import os
-from ease import create, grade
 import random
 import logging
 import json
+from ease import create, grade

 log = logging.getLogger(__name__)

 ROOT_PATH = os.path.abspath(__file__)
@@ -14,8 +16,10 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 50
 QUICK_TEST_LIMIT = 5

+# noinspection PyClassHasNoInit
 class DataLoader():
-    def load_text_files(self, pathname):
+    @staticmethod
+    def load_text_files(pathname):
         filenames = os.listdir(pathname)
         text = []
         for filename in filenames:
@@ -23,7 +27,8 @@ class DataLoader():
             text.append(data[:CHARACTER_LIMIT])
         return text

-    def load_json_file(self, filename):
+    @staticmethod
+    def load_json_file(filename):
         datafile = open(os.path.join(filename))
         data = json.load(datafile)
         return data
@@ -34,38 +39,42 @@ class DataLoader():
        """
        pass

 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
         directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        # Sort so neg is first
         directories.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
         scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos
         return scores, text

 class JSONLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
         files = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if os.path.isfile(os.path.join(self.pathname, f)) if f.endswith(".json")]
         files.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(files) == 0:
             return [], []
@@ -76,19 +85,19 @@ class JSONLoader(DataLoader):
         all_scores = []
         all_text = []
         for i in xrange(0, len(data)):
             scores = [d['score'] for d in data[i]]
             text = [d['text'] for d in data[i]]
             if isinstance(scores[0], list):
                 new_text = []
                 new_scores = []
-                for i in xrange(0, len(scores)):
-                    text = scores[i]
-                    s = scores[i]
-                    for j in s:
+                for j in xrange(0, len(scores)):
+                    text = scores[j]
+                    s = scores[j]
+                    for k in s:
                         new_text.append(text)
-                        new_scores.append(j)
+                        new_scores.append(k)
                 text = new_text
                 scores = new_scores
@@ -97,12 +106,13 @@ class JSONLoader(DataLoader):
         return all_scores, all_text

 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text, list):
             self.create_model_generic = False
         else:
@@ -112,7 +122,9 @@ class ModelCreator():
         if not self.create_model_generic:
             return create.create(self.text, self.scores, "")
         else:
             return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)

 class Grader():
     def __init__(self, model_data):
@@ -122,7 +134,9 @@ class Grader():
         if isinstance(submission, basestring):
             return grade.grade(self.model_data, submission)
         else:
             return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))

 class GenericTest(object):
     loader = DataLoader
@@ -137,11 +151,11 @@ class GenericTest(object):
         return scores, text

     def generic_setup(self, scores, text):
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
         shuffled_text = []
         indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -159,45 +173,46 @@ class GenericTest(object):
         grader = Grader(results)
         results = grader.grade(self.text[0])
         assert results['success'] == True

     def scoring_accuracy(self):
         random.seed(1)
         model_creator = ModelCreator(self.scores, self.text)
         results = model_creator.create_model()
         assert results['success'] == True
         cv_kappa = results['cv_kappa']
         cv_mae = results['cv_mean_absolute_error']
         assert cv_kappa >= self.expected_kappa_min
         assert cv_mae <= self.expected_mae_max

     def generic_model_creation_and_grading(self):
         log.info(self.scores)
         log.info(self.text)
         score_subset = [random.randint(0, 100) for i in xrange(0, min([QUICK_TEST_LIMIT, len(self.scores)]))]
         text_subset = self.text[:QUICK_TEST_LIMIT]
         text_subset = {
             'textual_values': [[t] for t in text_subset],
             'numeric_values': [[1] for i in xrange(0, len(text_subset))]
         }
         model_creator = ModelCreator(score_subset, text_subset)
         results = model_creator.create_model()
         assert results['success'] == True
         grader = Grader(results)
         test_text = {
             'textual_values': [self.text[0]],
             'numeric_values': [1]
         }
         results = grader.grade(test_text)
         assert results['success'] == True

 class PolarityTest(unittest.TestCase,GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -214,12 +229,13 @@ class PolarityTest(unittest.TestCase,GenericTest):
     def test_generic_model_creation_and_grading(self):
         self.generic_model_creation_and_grading()

 class JSONTest(GenericTest):
     loader = JSONLoader
     data_path = "data/json_data"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -227,10 +243,11 @@ class JSONTest(GenericTest):
         self.scores, self.text = self.load_data()
         return self.scores, self.text

 def test_loop():
     json_test = JSONTest()
     scores, text = json_test.setUp()
     for i in xrange(0, len(scores)):
         json_test.generic_setup(scores[i], text[i])
         yield json_test.model_creation_and_grading
         yield json_test.scoring_accuracy
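The i to j and j to k renames in JSONLoader.load_data keep the inner loops from reusing the outer loop's index variable. A standalone sketch of why that matters (illustrative, not the commit's code):

    # Reusing a loop variable name clobbers the outer index for the rest of that iteration.
    clobbered = []
    for i in range(2):
        for i in range(3):        # shadows the outer i
            pass
        clobbered.append(i)       # always sees the inner i, i.e. 2
    assert clobbered == [2, 2]

    distinct = []
    for i in range(2):
        for j in range(3):        # distinct names keep both indices intact
            pass
        distinct.append(i)
    assert distinct == [0, 1]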
ease/tests/test_spellcheck.py

 from unittest import TestCase
 from nose.tools import assert_equal
 from mock import patch
 from ease.util_functions import spell_correct
@@ -35,7 +36,6 @@ class SpellCheckUnitTest(TestCase):
     @patch("util_functions.os.popen")
     def test_aspell_not_found(self, popen_mock):
         # Expected behavior when aspell is not installed is to return the original
         # string with no corrections.
         popen_mock.side_effect = OSError
ease/util_functions.py

@@ -23,9 +23,9 @@ log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"

-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
@@ -100,7 +100,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if (len(incorrect[i]) > 10):
+        if len(incorrect[i]) > 10:
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -167,12 +167,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
     """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
-    dict_matrix = dict.fit_transform(essays)
+    dictionary = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dictionary.fit_transform(essays)
     set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if (med_score == 0):
+    if med_score == 0:
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -190,12 +190,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
         fish_vals.append(fish_val)

     cutoff = 1
-    if (len(fish_vals) > max_features_pass_2):
+    if len(fish_vals) > max_features_pass_2:
         cutoff = sorted(fish_vals)[max_features_pass_2]
     good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])

-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
+    get_var = lambda search_list, ind: [search_list[i] for i in ind]
+    vocab = get_var(dictionary.get_feature_names(), good_cols)
     return vocab
@@ -219,14 +219,13 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         chunks.append(range(range_min, range_max))
     preds = []
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
-    chunk_vec = numpy.asarray(range(0, len(chunks)))
     for i in xrange(0, len(chunks)):
         loop_inds = list(
             chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return (all_preds)
+    return all_preds

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
@@ -260,7 +259,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0
-    if (num_ratings > 1):
+    if num_ratings > 1:
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
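In get_vocab, renaming dict to dictionary keeps the builtin dict usable in that scope, and getVar to get_var brings the helper to PEP 8 naming; the helper itself is plain index-based selection. A standalone sketch with illustrative data (not from the commit):

    # Index-based selection, as get_var does: pick the entries at the given positions.
    get_var = lambda search_list, ind: [search_list[i] for i in ind]

    feature_names = ["good", "essay", "word count", "spelling"]
    assert get_var(feature_names, [0, 2]) == ["good", "word count"]

    # Shadowing the builtin would break ordinary dict construction later in the scope:
    # after dict = CountVectorizer(...), a call like dict(a=1) raises TypeError.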