edx / ease · Commits

Commit 31d4363d
Authored Jun 13, 2014 by gradyward
Parent: 9c16fbbe

    Stylistic cleanup

Showing 8 changed files with 107 additions and 83 deletions (+107 -83)
ease/create.py                     +8   -5
ease/errors.py                     +2   -0
ease/essay_set.py                  +11  -10
ease/feature_extractor.py          +17  -15
ease/grade.py                      +2   -2
ease/tests/test_model_accuracy.py  +55  -38
ease/tests/test_spellcheck.py      +1   -1
ease/util_functions.py             +11  -12
ease/create.py  (view file @ 31d4363d)

@@ -3,10 +3,13 @@ Functions that create a machine learning model from training data
 """
 import os
-import sys
 import logging
 import numpy
+import sys
+
+# Constructs a log
+log = logging.getLogger(__name__)

 # Setup base path so that we can import modules who are dependent on it

@@ -15,7 +18,7 @@ sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
 sys.path.append(one_up_path)
-#Import modules that are dependent on the base path
+# Import modules that are dependent on the base path
 import util_functions
 from errors import *
 from datetime import datetime

@@ -179,11 +182,11 @@ def _extract_features_and_generate_model(essay_set):
     # We cannot be sure what kind of errors .fit could throw at us. Memory, Type, Interrupt, etc.
     except Exception as ex:
-        str = ("predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}").format(ex)
-        log.exception(str)
-        raise ClassifierTrainingInternalError(str)
+        msg = ("predict_classifier.fit raised an exception in _extract_features_and_generate_model: {}").format(ex)
+        log.exception(msg)
+        raise ClassifierTrainingInternalError(msg)
     return feat_extractor, predict_classifier, cv_error_results
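The str → msg rename in the last hunk is more than style: binding the message to the name str shadows Python's builtin for the rest of the scope, so any later str(...) call in that scope would fail. A minimal sketch of the hazard, with invented names and messages (not EASE code):

    def broken():
        try:
            raise ValueError("boom")
        except Exception as ex:
            str = "fit failed: {}".format(ex)  # shadows the builtin str()
            return str(42)  # TypeError: 'str' object is not callable

    def fixed():
        try:
            raise ValueError("boom")
        except Exception as ex:
            msg = "fit failed: {}".format(ex)  # builtin left intact
            return msg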
ease/errors.py  (view file @ 31d4363d)

@@ -2,6 +2,7 @@
 Errors for the EASE repository
 """
+
 class EaseError(Exception):
     pass

@@ -45,6 +46,7 @@ class InputError(EaseError):
     """
     The user supplied an argument which was incorrect.
     """
+
     def __init__(self, expr, msg):
         self.expr = expr
         self.msg = msg
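These whitespace-only hunks sit in the file that anchors EASE's error handling: every library error derives from EaseError, so callers can catch the whole family with one except clause. A hedged sketch of that usage (the raise site is invented; the class shapes come from the hunks above):

    class EaseError(Exception):
        pass

    class InputError(EaseError):
        """The user supplied an argument which was incorrect."""
        def __init__(self, expr, msg):
            self.expr = expr
            self.msg = msg

    try:
        raise InputError("essay_score", "expected an integer")
    except EaseError as err:  # one handler covers every EASE-specific error
        print(err.msg)  # "expected an integer"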
ease/essay_set.py  (view file @ 31d4363d)

@@ -3,20 +3,21 @@ Defines an essay set object, which encapsulates essays from training and test se
 Performs spell and grammar checking, tokenization, and stemming.
 """
+import nltk
+import sys
 import random
 import os
 import logging
-from ease.errors import InputError
-import nltk
-import sys
+from errors import *

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"

 log = logging.getLogger(__name__)

@@ -97,9 +98,9 @@ class EssaySet(object):
         try:
             essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
         except UnicodeError as ex:
-            str = "Could not parse essay text into ascii: {}".format(ex)
-            log.exception(str)
-            raise EssaySetRequestError(ex)
+            msg = "Could not parse essay text into ascii: {}".format(ex)
+            log.exception(msg)
+            raise EssaySetRequestError(msg)

         # Validates that score is an integer and essay_text is a string.
         try:

@@ -107,9 +108,9 @@ class EssaySet(object):
             essay_text = str(essay_text)
             essay_generated = int(essay_generated)
         except TypeError:
-            str = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
-            log.exception(str)
-            raise EssaySetRequestError(str)
+            ex = "Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text))
+            log.exception(ex)
+            raise EssaySetRequestError(ex)

         # Validates that essay generated is 0 or 1
         if essay_generated != 0 and essay_generated != 1:
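The decode/encode pair kept as context in the second hunk is the Python 2 idiom for forcing arbitrary input down to ascii: decode as UTF-8 with replacement, then drop anything ascii cannot represent. A small sketch with an invented string:

    # Python 2: UTF-8 bytes for "café crème"; non-ascii characters are
    # silently dropped rather than raising.
    essay_text = 'caf\xc3\xa9 cr\xc3\xa8me'
    cleaned = essay_text.decode('utf-8', 'replace').encode('ascii', 'ignore')
    print(repr(cleaned))  # 'caf crme'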
ease/feature_extractor.py  (view file @ 31d4363d)

@@ -2,24 +2,26 @@
 Extracts features from training set and test set essays
 """
-import numpy
-import nltk
-import sys
-from sklearn.feature_extraction.text import CountVectorizer
 import pickle
 import os
 from itertools import chain
 import operator
 import logging
+import numpy
+import nltk
+import sys
+from sklearn.feature_extraction.text import CountVectorizer
+from errors import *

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions

 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"

 log = logging.getLogger(__name__)
@@ -79,9 +81,8 @@ class FeatureExtractor(object):
             sum([len(essay) for essay in essay_set._cleaned_essays]))
         # Gets the number and positions of grammar errors
-        good_pos_tags, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens)
+        good_pos_tags, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags, essay_set._cleaned_essays)
         # NOTE!!! Here, I changed the definition from utilizing good grammar ratios to using the counts of
         # grammatical errors. Though this was not what the original author used, it is clearly what his code
         # implies, as if this is intended to be a true "grammar errors per character", we should have that
@@ -154,7 +155,7 @@ class FeatureExtractor(object):
         # SEE COMMENT AROUND LINE 85
-        good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags, essay_set._cleaned_essays, essay_set._tokens)
+        good_grammar_ratios, bad_pos_positions = self._get_grammar_errors(essay_set._pos_tags, essay_set._cleaned_essays)
         good_pos_tag_proportion = [len(bad_pos_positions[m]) / float(word_counts[m]) for m in xrange(0, len(essays))]

         length_array = numpy.array((
@@ -204,7 +205,7 @@ class FeatureExtractor(object):
         prompt_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             prompt_overlap.append(len([i for i in j if i in prompt_toks]))
             prompt_overlap_prop.append(prompt_overlap[len(prompt_overlap) - 1] / float(tok_length))
@@ -212,7 +213,7 @@ class FeatureExtractor(object):
         expand_overlap_prop = []
         for j in essay_set._tokens:
             tok_length = len(j)
-            if (tok_length == 0):
+            if tok_length == 0:
                 tok_length = 1
             expand_overlap.append(len([i for i in j if i in expand_syns]))
             expand_overlap_prop.append(expand_overlap[len(expand_overlap) - 1] / float(tok_length))
@@ -221,7 +222,7 @@ class FeatureExtractor(object):
         return prompt_arr.copy()

-    def _get_grammar_errors(self, pos, essays, tokens):
+    def _get_grammar_errors(self, pos, essays):
         """
         Internal function to get the number of grammar errors in given text
@@ -251,7 +252,7 @@ class FeatureExtractor(object):
             start, end = bad_pos_tuples[m]
             for j in xrange(m + 1, len(bad_pos_tuples)):
                 lstart, lend = bad_pos_tuples[j]
-                if lstart >= start and lstart <= end:
+                if start <= lstart <= end:
                     bad_pos_tuples[m][1] = bad_pos_tuples[j][1]
                     to_delete.append(j)
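The chained comparison is equivalent to the old two-clause test; the loop itself merges overlapping bad-POS spans. An illustrative run with made-up tuples:

    # Made-up spans; index 1 starts inside index 0, so the enclosing span
    # is extended and the inner one queued for deletion, as in the loop above.
    bad_pos_tuples = [[2, 5], [4, 9], [12, 14]]
    to_delete = []
    for m in range(len(bad_pos_tuples)):
        start, end = bad_pos_tuples[m]
        for j in range(m + 1, len(bad_pos_tuples)):
            lstart, lend = bad_pos_tuples[j]
            if start <= lstart <= end:  # span j starts inside span m
                bad_pos_tuples[m][1] = lend  # extend span m to cover span j
                to_delete.append(j)
    print(bad_pos_tuples)  # [[2, 9], [4, 9], [12, 14]]
    print(to_delete)       # [1]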
@@ -268,7 +269,8 @@ class FeatureExtractor(object):
             good_grammar_ratios.append(good_grammar_ratio)
         return good_grammar_ratios, bad_pos_positions

-    def _get_good_pos_ngrams(self):
+    @staticmethod
+    def _get_good_pos_ngrams():
        """
        Gets a list of grammatically correct part of speech sequences from an input file called essaycorpus.txt
        Returns the list and caches the file
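Several hunks in this commit (here and in the test DataLoader below) make the same move: a method that never touches self becomes a @staticmethod, callable on the class or an instance alike. A minimal sketch with invented names:

    class Corpus(object):
        @staticmethod
        def tokenize(text):  # no self: depends only on its argument
            return text.split()

    # Both call styles work once the method is static:
    print(Corpus.tokenize("a b c"))    # ['a', 'b', 'c']
    print(Corpus().tokenize("a b c"))  # ['a', 'b', 'c']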
ease/grade.py  (view file @ 31d4363d)

@@ -8,11 +8,12 @@ import logging
 import sys

 # Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
-#Depend on base path to be imported
+# Depend on base path to be imported
 from essay_set import EssaySet
 from errors import *

@@ -45,7 +46,6 @@ def grade(grader_data, submission):
     # Instantiates the Essay set which will carry our essay while it is being classified and graded.
     grader_set = EssaySet(essay_type="test")
-    feedback = {}
     # Retrieves the model and extractor we will be using
     model, extractor = _get_classifier_and_extractor(grader_data)
ease/tests/test_model_accuracy.py  (view file @ 31d4363d)

 import unittest
 import os
-from ease import create, grade
 import random
 import logging
 import json
+from ease import create, grade

 log = logging.getLogger(__name__)

 ROOT_PATH = os.path.abspath(__file__)
@@ -14,8 +16,10 @@ CHARACTER_LIMIT = 1000
 TRAINING_LIMIT = 50
 QUICK_TEST_LIMIT = 5

+# noinspection PyClassHasNoInit
 class DataLoader():
-    def load_text_files(self, pathname):
+    @staticmethod
+    def load_text_files(pathname):
         filenames = os.listdir(pathname)
         text = []
         for filename in filenames:
@@ -23,7 +27,8 @@ class DataLoader():
             text.append(data[:CHARACTER_LIMIT])
         return text

-    def load_json_file(self, filename):
+    @staticmethod
+    def load_json_file(filename):
         datafile = open(os.path.join(filename))
         data = json.load(datafile)
         return data
@@ -34,38 +39,42 @@ class DataLoader():
     """
     pass

 class PolarityLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
         directories = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if not os.path.isfile(os.path.join(self.pathname, f)) and f in ["neg", "pos"]]
-        #Sort so neg is first
+        # Sort so neg is first
         directories.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(directories) != 2:
             raise Exception("Need a pos and a neg directory in {0}".format(self.pathname))
         neg = self.load_text_files(directories[0])
         pos = self.load_text_files(directories[1])
         scores = [0 for i in xrange(0, len(neg))] + [1 for i in xrange(0, len(pos))]
         text = neg + pos
         return scores, text

 class JSONLoader(DataLoader):
     def __init__(self, pathname):
         self.pathname = pathname

     def load_data(self):
         filenames = os.listdir(self.pathname)
         files = [os.path.abspath(os.path.join(self.pathname, f)) for f in filenames if os.path.isfile(os.path.join(self.pathname, f)) if f.endswith(".json")]
         files.sort()
-        #We need to have both a postive and a negative folder to classify
+        # We need to have both a postive and a negative folder to classify
         if len(files) == 0:
             return [], []
@@ -76,19 +85,19 @@ class JSONLoader(DataLoader):
         all_scores = []
         all_text = []
         for i in xrange(0, len(data)):
             scores = [d['score'] for d in data[i]]
             text = [d['text'] for d in data[i]]
             if isinstance(scores[0], list):
                 new_text = []
                 new_scores = []
-                for i in xrange(0, len(scores)):
-                    text = scores[i]
-                    s = scores[i]
-                    for j in s:
+                for j in xrange(0, len(scores)):
+                    text = scores[j]
+                    s = scores[j]
+                    for k in s:
                         new_text.append(text)
-                        new_scores.append(j)
+                        new_scores.append(k)
                 text = new_text
                 scores = new_scores
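Renaming the inner loop variables from i/j to j/k removes a shadowing bug: the old inner for i clobbered the outer index, so code after the inner loop read the wrong element. A made-up demonstration:

    data = [[1, 2], [3, 4], [5, 6]]
    seen = []
    for i in range(len(data)):
        for i in range(2):  # the old code's mistake: reuses the outer name
            pass
        # After the inner loop, i is 1 regardless of the outer iteration,
        # so every pass reads data[1].
        seen.append(data[i])
    print(seen)  # [[3, 4], [3, 4], [3, 4]]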
@@ -97,12 +106,13 @@ class JSONLoader(DataLoader):
         return all_scores, all_text

 class ModelCreator():
     def __init__(self, scores, text):
         self.scores = scores
         self.text = text

-        #Governs which creation function in the ease.create module to use. See module for info.
+        # Governs which creation function in the ease.create module to use. See module for info.
         if isinstance(text, list):
             self.create_model_generic = False
         else:

@@ -112,7 +122,9 @@ class ModelCreator():
         if not self.create_model_generic:
             return create.create(self.text, self.scores, "")
-        else:
-            return create.create_generic(self.text.get('numeric_values', []), self.text.get('textual_values', []), self.scores)
+        return create.create_generic(
+            self.text.get('numeric_values', []),
+            self.text.get('textual_values', []),
+            self.scores)

 class Grader():
     def __init__(self, model_data):
@@ -122,7 +134,9 @@ class Grader():
         if isinstance(submission, basestring):
             return grade.grade(self.model_data, submission)
-        else:
-            return grade.grade_generic(self.model_data, submission.get('numeric_values', []), submission.get('textual_values', []))
+        return grade.grade_generic(
+            self.model_data,
+            submission.get('numeric_values', []),
+            submission.get('textual_values', []))

 class GenericTest(object):
     loader = DataLoader
@@ -137,11 +151,11 @@ class GenericTest(object):
         return scores, text

     def generic_setup(self, scores, text):
-        #Shuffle to mix up the classes, set seed to make it repeatable
+        # Shuffle to mix up the classes, set seed to make it repeatable
         random.seed(1)
         shuffled_scores = []
         shuffled_text = []
         indices = [i for i in xrange(0, len(scores))]
         random.shuffle(indices)
         for i in indices:
             shuffled_scores.append(scores[i])
@@ -159,45 +173,46 @@ class GenericTest(object):
         grader = Grader(results)
         results = grader.grade(self.text[0])
         assert results['success'] == True

     def scoring_accuracy(self):
         random.seed(1)
         model_creator = ModelCreator(self.scores, self.text)
         results = model_creator.create_model()
         assert results['success'] == True
         cv_kappa = results['cv_kappa']
         cv_mae = results['cv_mean_absolute_error']
         assert cv_kappa >= self.expected_kappa_min
         assert cv_mae <= self.expected_mae_max

     def generic_model_creation_and_grading(self):
         log.info(self.scores)
         log.info(self.text)
         score_subset = [random.randint(0, 100) for i in xrange(0, min([QUICK_TEST_LIMIT, len(self.scores)]))]
         text_subset = self.text[:QUICK_TEST_LIMIT]
         text_subset = {
             'textual_values': [[t] for t in text_subset],
             'numeric_values': [[1] for i in xrange(0, len(text_subset))]
         }
         model_creator = ModelCreator(score_subset, text_subset)
         results = model_creator.create_model()
         assert results['success'] == True
         grader = Grader(results)
         test_text = {
             'textual_values': [self.text[0]],
             'numeric_values': [1]
         }
         results = grader.grade(test_text)
         assert results['success'] == True

-class PolarityTest(unittest.TestCase,GenericTest):
+class PolarityTest(unittest.TestCase, GenericTest):
     loader = PolarityLoader
     data_path = "data/polarity"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -214,12 +229,13 @@ class PolarityTest(unittest.TestCase,GenericTest):
     def test_generic_model_creation_and_grading(self):
         self.generic_model_creation_and_grading()

 class JSONTest(GenericTest):
     loader = JSONLoader
     data_path = "data/json_data"

-    #These will increase if we allow more data in.
-    #I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
+    # These will increase if we allow more data in.
+    # I am setting the amount of data low to allow tests to finish quickly (40 training essays, 1000 character max for each)
     expected_kappa_min = -.2
     expected_mae_max = 1
@@ -227,10 +243,11 @@ class JSONTest(GenericTest):
         self.scores, self.text = self.load_data()
         return self.scores, self.text

 def test_loop():
     json_test = JSONTest()
     scores, text = json_test.setUp()
     for i in xrange(0, len(scores)):
         json_test.generic_setup(scores[i], text[i])
         yield json_test.model_creation_and_grading
         yield json_test.scoring_accuracy
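test_loop relies on nose's test-generator protocol (the suite already uses nose; see test_spellcheck.py below): a test function that yields callables produces one test per yielded value. A reduced sketch of the pattern with invented data:

    # Run under nose: each yielded (callable, arg) pair becomes its own test.
    def check_positive(n):
        assert n > 0

    def test_generated():
        for n in [1, 2, 3]:
            yield check_positive, n  # nose invokes check_positive(n)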
ease/tests/test_spellcheck.py  (view file @ 31d4363d)

 from unittest import TestCase
+from nose.tools import assert_equal
 from mock import patch

 from ease.util_functions import spell_correct

@@ -35,7 +36,6 @@ class SpellCheckUnitTest(TestCase):
     @patch("util_functions.os.popen")
     def test_aspell_not_found(self, popen_mock):
         # Expected behavior when aspell is not installed is to return the original
         # string with no corrections.
         popen_mock.side_effect = OSError
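Setting side_effect = OSError makes the patched os.popen raise as soon as it is called, simulating a machine with no aspell binary; the test then expects the input back uncorrected. The same technique in a self-contained sketch (safe_spell_correct is an invented stand-in for the real fallback logic):

    import os
    from mock import patch  # unittest.mock in Python 3

    def safe_spell_correct(text):
        try:
            os.popen("aspell pipe")
        except OSError:
            return text  # aspell missing: return the input unchanged
        return text  # (real correction logic elided)

    with patch("os.popen", side_effect=OSError):
        assert safe_spell_correct("teh cat") == "teh cat"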
ease/util_functions.py  (view file @ 31d4363d)

@@ -23,9 +23,9 @@ log = logging.getLogger(__name__)
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 if not base_path.endswith("/"):
-    base_path = base_path + "/"
+    base_path += "/"

-#Paths to needed data files
+# Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"
@@ -100,7 +100,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
-        if (len(incorrect[i]) > 10):
+        if len(incorrect[i]) > 10:
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):
@@ -167,12 +167,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
     NOTE: GBW didn't mess around with this because it is very easy to mess up, and I didn't want to mess it up.
     """
-    dict = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
-    dict_matrix = dict.fit_transform(essays)
+    dictionary = CountVectorizer(ngram_range=(1, 2), max_features=max_features_pass_1)
+    dict_matrix = dictionary.fit_transform(essays)
     set_score = numpy.asarray(scores, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
-    if (med_score == 0):
+    if med_score == 0:
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1
@@ -190,12 +190,12 @@ def get_vocab(essays, scores, max_features_pass_1=750, max_features_pass_2=200):
         fish_vals.append(fish_val)

     cutoff = 1
-    if (len(fish_vals) > max_features_pass_2):
+    if len(fish_vals) > max_features_pass_2:
         cutoff = sorted(fish_vals)[max_features_pass_2]
     good_cols = numpy.asarray([num for num in range(0, dict_matrix.shape[1]) if fish_vals[num] <= cutoff])

-    getVar = lambda searchList, ind: [searchList[i] for i in ind]
-    vocab = getVar(dict.get_feature_names(), good_cols)
+    get_var = lambda search_list, ind: [search_list[i] for i in ind]
+    vocab = get_var(dictionary.get_feature_names(), good_cols)

     return vocab
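Besides renaming dict (another shadowed builtin) to dictionary, these hunks leave the second-pass selection intact: keep every column whose Fisher value falls at or below a rank-based cutoff. An illustrative run with invented values (note that values tied with the cutoff are kept, so slightly more than max_features_pass_2 columns can survive):

    fish_vals = [0.9, 0.1, 0.5, 0.3, 0.7]  # invented; lower is better here
    max_features_pass_2 = 3
    cutoff = 1
    if len(fish_vals) > max_features_pass_2:
        cutoff = sorted(fish_vals)[max_features_pass_2]  # here 0.7
    good_cols = [num for num in range(0, len(fish_vals)) if fish_vals[num] <= cutoff]
    print(good_cols)  # [1, 2, 3, 4]: every column at or below the cutoff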
@@ -219,14 +219,13 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         chunks.append(range(range_min, range_max))
     preds = []
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
     chunk_vec = numpy.asarray(range(0, len(chunks)))
     for i in xrange(0, len(chunks)):
         loop_inds = list(chain.from_iterable([chunks[int(z)] for z, m in enumerate(range(0, len(chunks))) if int(z) != i]))
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
-    return (all_preds)
+    return all_preds

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5
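gen_cv_preds hand-rolls out-of-fold predictions: for each chunk it fits on the remaining chunks and predicts the held-out one, then chains the per-chunk predictions back together. A compact, runnable sketch of the same idea with an invented toy model (scikit-learn later added cross_val_predict for this, but the manual loop mirrors the code above):

    from itertools import chain

    # Toy "model": predicts the mean of its training targets for every row.
    class MeanModel(object):
        def fit(self, xs, ys):
            self.mean = 1.0 * sum(ys) / len(ys)
            return self

        def predict(self, xs):
            return [self.mean for _ in xs]

    data = [[0], [1], [2], [3], [4], [5]]
    scores = [0, 0, 1, 1, 2, 2]
    chunks = [range(i, i + 2) for i in range(0, 6, 2)]  # three folds of two rows

    clf = MeanModel()
    preds = []
    for i in range(len(chunks)):
        train = list(chain.from_iterable(c for k, c in enumerate(chunks) if k != i))
        fit = clf.fit([data[t] for t in train], [scores[t] for t in train])
        preds.append(fit.predict([data[t] for t in chunks[i]]))
    all_preds = list(chain(*preds))
    print(all_preds)  # [1.5, 1.5, 1.0, 1.0, 0.5, 0.5]: no row predicted by a model that saw it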
@@ -260,7 +259,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0

-    if (num_ratings > 1):
+    if num_ratings > 1:
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]