edx / ease

Commit da78277e, authored Jun 12, 2014 by gradyward

    Cleaned up all of the files stylistically

parent a990b25e

Showing 8 changed files with 183 additions and 158 deletions
ease/create.py               +29  -24
ease/essay_set.py             +8   -7
ease/feature_extractor.py     +0   -0
ease/grade.py                +45  -39
ease/model_creator.py        +38  -32
ease/predictor_extractor.py  +10   -9
ease/predictor_set.py        +19  -19
ease/util_functions.py       +34  -28
ease/create.py

@@ -7,7 +7,7 @@ import sys
 import logging
 import numpy

-#Define base path and add to sys path
+# Define base path and add to sys path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))

@@ -24,6 +24,7 @@ import json
 #Make a log
 log = logging.getLogger(__name__)

 def dump_input_data(text, score):
     try:
         file_path = base_path + "/tests/data/json_data/"

@@ -32,14 +33,15 @@ def dump_input_data(text, score):
         filename = prefix + time_suffix + ".json"
         json_data = []
         for i in xrange(0, len(text)):
             json_data.append({'text': text[i], 'score': score[i]})
         with open(file_path + filename, 'w+') as outfile:
             json.dump(json_data, outfile)
     except:
         error = "Could not dump data to file."
         log.exception(error)

 def create(text, score, prompt_string, dump_data=False):
     """
     Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
     TODO: Remove model path argument, it is needed for now to support legacy code

@@ -53,11 +55,11 @@ def create(text,score,prompt_string, dump_data=False):
     algorithm = select_algorithm(score)
     #Initialize a results dictionary to return
     results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
                'feature_ext': "", 'classifier': "", 'algorithm': algorithm,
                'score': score, 'text': text, 'prompt': prompt_string}

     if len(text) != len(score):
         msg = "Target and text lists must be same length."
         results['errors'].append(msg)
         log.exception(msg)

@@ -72,13 +74,14 @@ def create(text,score,prompt_string, dump_data=False):
         log.exception(msg)

     try:
         #Gets features from the essay set and computes error
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, algorithm=algorithm)
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set,
+                                                                                                      algorithm=algorithm)
         results['cv_kappa'] = cv_error_results['kappa']
         results['cv_mean_absolute_error'] = cv_error_results['mae']
         results['feature_ext'] = feature_ext
         results['classifier'] = classifier
         results['algorithm'] = algorithm
         results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)

@@ -87,7 +90,7 @@ def create(text,score,prompt_string, dump_data=False):
     return results

 def create_generic(numeric_values, textual_values, target, algorithm=util_functions.AlgorithmTypes.regression):
     """
     Creates a model from a generic list numeric values and text values
     numeric_values - A list of lists that are the predictors

@@ -99,10 +102,10 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     algorithm = select_algorithm(target)
     #Initialize a result dictionary to return.
     results = {'errors': [], 'success': False, 'cv_kappa': 0, 'cv_mean_absolute_error': 0,
                'feature_ext': "", 'classifier': "", 'algorithm': algorithm}

     if len(numeric_values) != len(textual_values) or len(numeric_values) != len(target):
         msg = "Target, numeric features, and text features must all be the same length."
         results['errors'].append(msg)
         log.exception(msg)

@@ -120,12 +123,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     try:
         #Extract all features and then train a classifier with the features
-        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
+        feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset,
+                                                                                                                 algorithm)
         results['cv_kappa'] = cv_error_results['kappa']
         results['cv_mean_absolute_error'] = cv_error_results['mae']
         results['feature_ext'] = feature_ext
         results['classifier'] = classifier
         results['success'] = True
     except:
         msg = "feature extraction and model creation failed."
         results['errors'].append(msg)

@@ -133,11 +137,12 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
     return results

 def select_algorithm(score_list):
     #Decide what algorithm to use (regression or classification)
     try:
         #Count the number of unique score points in the score list
         if len(util_functions.f7(list(score_list))) > 5:
             algorithm = util_functions.AlgorithmTypes.regression
         else:
             algorithm = util_functions.AlgorithmTypes.classification
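The select_algorithm hunk above holds the module's only model-selection rule: more than five distinct score points selects regression, otherwise classification. A minimal standalone sketch of that rule (the inline loop stands in for util_functions.f7, which de-duplicates while preserving order):

```python
def select_algorithm_sketch(score_list):
    unique_scores = []  # stand-in for util_functions.f7 (order-preserving dedup)
    for s in score_list:
        if s not in unique_scores:
            unique_scores.append(s)
    # More than 5 distinct score points: treat the target as continuous
    # (regression); otherwise treat it as a small label set (classification).
    return "regression" if len(unique_scores) > 5 else "classification"

print(select_algorithm_sketch([0, 1, 2, 3]))   # classification
print(select_algorithm_sketch(range(10)))      # regression
```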
ease/essay_set.py

@@ -27,7 +27,7 @@ class EssaySet(object):
         """
         Initialize variables and check essay set type
         """
         if (essaytype != "train" and essaytype != "test"):
             essaytype = "train"

         self._type = essaytype

@@ -52,7 +52,7 @@ class EssaySet(object):
         Returns a confirmation that essay was added.
         """
         # Get maximum current essay id, or set to 0 if this is the first essay added
         if (len(self._id) > 0):
             max_id = max(self._id)
         else:
             max_id = 0

@@ -71,9 +71,10 @@ class EssaySet(object):
             essay_text = str(essay_text)
         except:
             # Nothing needed here, will return error in any case.
             log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score), type(essay_text)))

         if isinstance(essay_score, int) and isinstance(essay_text, basestring) \
                 and (essay_generated == 0 or essay_generated == 1):
             self._id.append(max_id + 1)
             self._score.append(essay_score)

@@ -83,7 +84,7 @@ class EssaySet(object):
             except:
                 essay_text = (essay_text.decode('utf-8', 'replace')).encode('ascii', 'ignore')
             cleaned_essay = util_functions.sub_chars(essay_text).lower()
             if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
                 cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
             self._text.append(cleaned_essay)
             # Spell correct text using aspell

@@ -113,7 +114,7 @@ class EssaySet(object):
         prompt_text should be a string.
         Returns the prompt as a confirmation.
         """
         if (isinstance(prompt_text, basestring)):
             self._prompt = util_functions.sub_chars(prompt_text)
             ret = self._prompt
         else:

@@ -134,7 +135,7 @@ class EssaySet(object):
         all_syns = []
         for word in e_toks:
             synonyms = util_functions.get_wordnet_syns(word)
             if (len(synonyms) > max_syns):
                 synonyms = random.sample(synonyms, max_syns)
             all_syns.append(synonyms)
         new_essays = []
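For orientation, the methods touched here are exercised elsewhere in this commit (grade.py calls add_essay(str(submission), 0) and update_prompt). A usage sketch in the same Python 2 style as the module (assumes the ease directory is on sys.path, which the modules themselves arrange via sys.path.append):

```python
from essay_set import EssaySet

es = EssaySet(essaytype="train")  # anything but "train"/"test" falls back to "train"
es.update_prompt("Describe the water cycle.")
# add_essay(text, score): score must be an int, text a string.
es.add_essay("Water evaporates, condenses into clouds, and falls as rain.", 2)
es.add_essay("It rains.", 0)
```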
ease/feature_extractor.py

This diff is collapsed.
ease/grade.py

@@ -8,7 +8,7 @@ import os
 import numpy
 import logging

-#Append sys to base path to import the following modules
+# Append sys to base path to import the following modules
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

@@ -25,7 +25,8 @@ import math
 log = logging.getLogger(__name__)

 def grade(grader_data, submission):
     """
     Grades a specified submission using specified models
     grader_data - A dictionary:

@@ -39,10 +40,10 @@ def grade(grader_data,submission):
     """
     #Initialize result dictionary
     results = {'errors': [], 'tests': [], 'score': 0, 'feedback': "", 'success': False, 'confidence': 0}
     has_error = False

     grader_set = EssaySet(essaytype="test")
     feedback = {}

     model, extractor = get_classifier_and_ext(grader_data)

@@ -53,28 +54,29 @@ def grade(grader_data,submission):
     try:
         #Try to add essay to essay set object
         grader_set.add_essay(str(submission), 0)
         grader_set.update_prompt(str(grader_data['prompt']))
     except Exception:
         error_message = "Essay could not be added to essay set:{0}".format(submission)
         log.exception(error_message)
         results['errors'].append(error_message)
         has_error = True

     #Try to extract features from submission and assign score via the model
     try:
         grader_feats = extractor.gen_feats(grader_set)
         feedback = extractor.gen_feedback(grader_set, grader_feats)[0]
         results['score'] = int(model.predict(grader_feats)[0])
     except Exception:
         error_message = "Could not extract features and score essay."
         log.exception(error_message)
         results['errors'].append(error_message)
         has_error = True

     #Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'], grader_data['score'])
     except Exception:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

@@ -82,11 +84,11 @@ def grade(grader_data,submission):
     if not has_error:
         #If the essay is just a copy of the prompt, return a 0 as the score
-        if('too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']):
+        if 'too_similar_to_prompt' in feedback and feedback['too_similar_to_prompt']:
             results['score'] = 0
             results['correct'] = False

         results['success'] = True

         #Generate short form output--number of problem areas identified in feedback

@@ -94,24 +96,25 @@ def grade(grader_data,submission):
         results['feedback'] = {}
         if 'topicality' in feedback and 'prompt_overlap' in feedback:
             results['feedback'].update({
                 'topicality': feedback['topicality'],
                 'prompt-overlap': feedback['prompt_overlap'],
             })

         results['feedback'].update(
             {
                 'spelling': feedback['spelling'],
                 'grammar': feedback['grammar'],
                 'markup-text': feedback['markup_text'],
             }
         )
     else:
         #If error, success is False.
         results['success'] = False

     return results

 def grade_generic(grader_data, numeric_features, textual_features):
     """
     Grades a set of numeric and textual features using a generic model

@@ -123,38 +126,38 @@ def grade_generic(grader_data, numeric_features, textual_features):
     textual_features - list of textual feature to predict on
     """
     results = {'errors': [], 'tests': [], 'score': 0, 'success': False, 'confidence': 0}
     has_error = False

     #Try to find and load the model file
     grader_set = predictor_set.PredictorSet(essaytype="test")
     model, extractor = get_classifier_and_ext(grader_data)

     #Try to add essays to essay set object
     try:
         grader_set.add_row(numeric_features, textual_features, 0)
     except Exception:
         error_msg = "Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features)
         log.exception(error_msg)
         results['errors'].append(error_msg)
         has_error = True

     #Try to extract features from submission and assign score via the model
     try:
         grader_feats = extractor.gen_feats(grader_set)
         results['score'] = model.predict(grader_feats)[0]
     except Exception:
         error_msg = "Could not extract features and score essay."
         log.exception(error_msg)
         results['errors'].append(error_msg)
         has_error = True

     #Try to determine confidence level
     try:
         results['confidence'] = get_confidence_value(grader_data['algorithm'], model, grader_feats, results['score'])
     except Exception:
         #If there is an error getting confidence, it is not a show-stopper, so just log
         log.exception("Problem generating confidence value")

@@ -164,7 +167,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
     return results

 def get_confidence_value(algorithm, model, grader_feats, score, scores):
     """
     Determines a confidence in a certain score, given proper input parameters
     algorithm- from util_functions.AlgorithmTypes

@@ -172,21 +176,23 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
     grader_feats - a row of features used by the model for classification/regression
     score - The score assigned to the submission by a prior model
     """
     min_score = min(numpy.asarray(scores))
     max_score = max(numpy.asarray(scores))
     if algorithm == util_functions.AlgorithmTypes.classification and hasattr(model, "predict_proba"):
         #If classification, predict with probability, which gives you a matrix of confidences per score point
-        raw_confidence = model.predict_proba(grader_feats)[0,(float(score) - float(min_score))]
+        raw_confidence = model.predict_proba(grader_feats)[0, (float(score) - float(min_score))]
         #TODO: Normalize confidence somehow here
         confidence = raw_confidence
     elif hasattr(model, "predict"):
         raw_confidence = model.predict(grader_feats)[0]
         confidence = max(float(raw_confidence) - math.floor(float(raw_confidence)), math.ceil(float(raw_confidence)) - float(raw_confidence))
     else:
         confidence = 0

     return confidence

 def get_classifier_and_ext(grader_data):
     if 'classifier' in grader_data:
         model = grader_data['classifier']
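In the regression branch of get_confidence_value above, confidence is the distance from the raw prediction to its farther integer neighbor, that is, one minus the distance to the nearest integer. A worked sketch of just that expression:

```python
import math

def regression_confidence(raw_prediction):
    x = float(raw_prediction)
    # 2.9 is 90% of the way to 3 -> 0.9; 2.5 is maximally ambiguous -> 0.5.
    # Edge case: an exactly integral prediction yields 0 under this formula.
    return max(x - math.floor(x), math.ceil(x) - x)

print(regression_confidence(2.9))  # ~0.9
print(regression_confidence(2.5))  # 0.5
```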
ease/model_creator.py

@@ -1,4 +1,4 @@
-#Provides interface functions to create and save models
+# Provides interface functions to create and save models
 import numpy
 import re

@@ -19,7 +19,8 @@ import feature_extractor
 import logging
 import predictor_extractor

 log = logging.getLogger()

 def read_in_test_data(filename):
     """

@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
     prompt_string = open(filename).read()
     return prompt_string

 def read_in_test_data_twocolumn(filename, sep=","):
     """
     Reads in a two column version of the test data.
     Filename must point to a delimited file.

@@ -86,21 +88,22 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
     return x

 def get_cv_error(clf, feats, scores):
     """
     Gets cross validated error for a given classifier, set of features, and scores
     clf - classifier
     feats - features to feed into the classified and cross validate over
     scores - scores associated with the features -- feature row 1 associates with score 1, etc.
     """
     results = {'success': False, 'kappa': 0, 'mae': 0}
     try:
         cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
         err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
         kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
         results['mae'] = err
         results['kappa'] = kappa
         results['success'] = True
     except ValueError as ex:
         # If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
         msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)

@@ -110,6 +113,7 @@ def get_cv_error(clf,feats,scores):
     return results

 def get_algorithms(algorithm):
     """
     Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.

@@ -117,14 +121,14 @@ def get_algorithms(algorithm):
     """
     if algorithm == util_functions.AlgorithmTypes.classification:
         clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                           max_depth=4, random_state=1, min_samples_leaf=3)
         clf2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
                                                            max_depth=4, random_state=1, min_samples_leaf=3)
     else:
         clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                          max_depth=4, random_state=1, min_samples_leaf=3)
         clf2 = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
                                                           max_depth=4, random_state=1, min_samples_leaf=3)
     return clf, clf2

@@ -134,7 +138,7 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     predictor_set - a PredictorSet object that has been initialized with data
     type - one of util_functions.AlgorithmType
     """
     if (algorithm not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
         algorithm = util_functions.AlgorithmTypes.regression

     f = predictor_extractor.PredictorExtractor()

@@ -142,8 +146,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     train_feats = f.gen_feats(predictor_set)

     clf, clf2 = get_algorithms(algorithm)
     cv_error_results = get_cv_error(clf2, train_feats, predictor_set._target)

     try:
         set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)

@@ -151,8 +155,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
         set_score = predictor_set._target
         set_score[0] = 1
         set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results

@@ -172,25 +176,26 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
     train_feats = f.gen_feats(essays)

     set_score = numpy.asarray(essays._score, dtype=numpy.int)
     if len(util_functions.f7(list(set_score))) > 5:
         algorithm = util_functions.AlgorithmTypes.regression
     else:
         algorithm = util_functions.AlgorithmTypes.classification

     clf, clf2 = get_algorithms(algorithm)
     cv_error_results = get_cv_error(clf2, train_feats, essays._score)

     try:
         clf.fit(train_feats, set_score)
     except ValueError:
         log.exception("Not enough classes (0,1,etc) in sample.")
         set_score[0] = 1
         set_score[1] = 0
         clf.fit(train_feats, set_score)

     return f, clf, cv_error_results

 def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
     """
     Writes out a model to a file.

@@ -199,16 +204,17 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
     classifier is a trained classifier
     model_path is the path of write out the model file to
     """
     model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
     pickle.dump(model_file, file=open(model_path, "w"))

 def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
     """
     Function that creates essay set, extracts features, and writes out model
     See above functions for argument descriptions
     """
     essay_set = create_essay_set(text, score, prompt)
     feature_ext, clf = extract_features_and_generate_model(essay_set, additional_array)
     dump_model_to_file(prompt, feature_ext, clf, model_path)
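A hedged note on get_algorithms: the learn_rate keyword belongs to the old scikit-learn release this code targets; current releases spell it learning_rate. A sketch of the equivalent construction on a modern scikit-learn, assuming the remaining parameters carry over unchanged:

```python
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05,
                                 max_depth=4, random_state=1, min_samples_leaf=3)
reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                max_depth=4, random_state=1, min_samples_leaf=3)
```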
ease/predictor_extractor.py

@@ -16,17 +16,18 @@ import logging
 import math
 from feature_extractor import FeatureExtractor

-#Append to path and then import things that depend on path
+# Append to path and then import things that depend on path
 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)
 from essay_set import EssaySet
 import util_functions

 if not base_path.endswith("/"):
     base_path = base_path + "/"

 log = logging.getLogger(__name__)

 class PredictorExtractor(object):
     def __init__(self):
         self._extractors = []

@@ -48,13 +49,13 @@ class PredictorExtractor(object):
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

         div_length = len(p_set._essay_sets)
         if div_length == 0:
             div_length = 1

         #Ensures that even with a large amount of input textual features, training time stays reasonable
         max_feats2 = int(math.floor(200 / div_length))
         for i in xrange(0, len(p_set._essay_sets)):
             self._extractors.append(FeatureExtractor())
             self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)

         self._initialized = True

@@ -66,13 +67,13 @@ class PredictorExtractor(object):
         Generates features based on an iput p_set
         p_set - PredictorSet
         """
         if self._initialized != True:
             error_message = "Dictionaries have not been initialized."
             log.exception(error_message)
             raise util_functions.InputError(p_set, error_message)

         textual_features = []
         for i in xrange(0, len(p_set._essay_sets)):
             textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))

         textual_matrix = numpy.concatenate(textual_features, axis=1)
ease/predictor_set.py

@@ -11,26 +11,27 @@ sys.path.append(base_path)
 import util_functions

 if not base_path.endswith("/"):
     base_path = base_path + "/"

 log = logging.getLogger(__name__)

 class PredictorSet(object):
     def __init__(self, essaytype="train"):
         """
         Initialize variables and check essay set type
         """
         if (essaytype != "train" and essaytype != "test"):
             essaytype = "train"

         self._type = essaytype
         self._target = []
         self._textual_features = []
         self._numeric_features = []
         self._essay_sets = []

     def add_row(self, numeric_features, textual_features, target):
-        #Basic input checking
+        # Basic input checking
         if not isinstance(target, (int, long, float)):
             error_message = "Target is not a numeric value."
             log.exception(error_message)

@@ -47,16 +48,16 @@ class PredictorSet(object):
             raise util_functions.InputError(textual_features, error_message)

         #Do some length checking for parameters
         if len(self._numeric_features) > 0:
             numeric_length = len(self._numeric_features[-1])
             current_numeric_length = len(numeric_features)
             if numeric_length != current_numeric_length:
                 error_message = "Numeric features are an improper length."
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

         if len(self._textual_features) > 0:
             textual_length = len(self._textual_features[-1])
             current_textual_length = len(textual_features)
             if textual_length != current_textual_length:
                 error_message = "Textual features are an improper length."

@@ -65,7 +66,7 @@ class PredictorSet(object):
         #Now check to see if text features and numeric features are individually correct
         for i in xrange(0, len(numeric_features)):
             try:
                 numeric_features[i] = float(numeric_features[i])
             except:

@@ -73,8 +74,7 @@ class PredictorSet(object):
                 log.exception(error_message)
                 raise util_functions.InputError(numeric_features, error_message)

         for i in xrange(0, len(textual_features)):
             try:
                 textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
             except:

@@ -83,8 +83,8 @@ class PredictorSet(object):
                 raise util_functions.InputError(textual_features, error_message)

         #Create essay sets for textual features if needed
         if len(self._textual_features) == 0:
             for i in xrange(0, len(textual_features)):
                 self._essay_sets.append(essay_set.EssaySet(essaytype=self._type))

         #Add numeric and textual features

@@ -95,6 +95,6 @@ class PredictorSet(object):
         self._target.append(target)

         #Add textual features to essay sets
         for i in xrange(0, len(textual_features)):
             self._essay_sets[i].add_essay(textual_features[i], target)
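The validation in add_row above implies rows of parallel numeric and textual features plus a numeric target, with row lengths consistent from one call to the next. A usage sketch in the style grade_generic uses (assumes the ease directory is on sys.path, as the modules arrange themselves):

```python
import predictor_set

ps = predictor_set.PredictorSet(essaytype="train")
# Each row: numeric predictors, parallel free-text fields, and a numeric target.
# Lengths must match those of previously added rows.
ps.add_row([3.0, 12.5], ["first text field", "second text field"], 1)
ps.add_row([1.0, 40.2], ["another text", "more text"], 0)
```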
ease/util_functions.py

@@ -1,4 +1,4 @@
-#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
+# Collection of misc functions needed to support essay_set.py and feature_extractor.py.
 #Requires aspell to be installed and added to the path
 from fisher import pvalue

@@ -15,17 +15,18 @@ import logging
 import sys
 import tempfile

 log = logging.getLogger(__name__)

 base_path = os.path.dirname(__file__)
 sys.path.append(base_path)

 if not base_path.endswith("/"):
     base_path = base_path + "/"

 #Paths to needed data files
 ESSAY_CORPUS_PATH = base_path + "data/essaycorpus.txt"
 ESSAY_COR_TOKENS_PATH = base_path + "data/essay_cor_tokens.p"

 class AlgorithmTypes(object):
     """
     Defines what types of algorithm can be used

@@ -33,20 +34,22 @@ class AlgorithmTypes(object):
     regression = "regression"
     classification = "classifiction"

 def create_model_path(model_path):
     """
     Creates a path to model files
     model_path - string
     """
     if not model_path.startswith("/") and not model_path.startswith("models/"):
         model_path = "/" + model_path
     if not model_path.startswith("models"):
         model_path = "models" + model_path
     if not model_path.endswith(".p"):
         model_path += ".p"

     return model_path

 def sub_chars(string):
     """
     Strips illegal characters from a string. Used to sanitize input essays.

@@ -66,7 +69,7 @@ def sub_chars(string):
     #Replace text. Ordering is very important!
     nstring = re.sub(sub_pat, " ", string)
     nstring = re.sub(char_pat, " .", nstring)
     nstring = re.sub(com_pat, " ,", nstring)
     nstring = re.sub(ques_pat, " ?", nstring)
     nstring = re.sub(excl_pat, " !", nstring)

@@ -101,7 +104,7 @@ def spell_correct(string):
     except Exception:
         log.exception("aspell process failed; could not spell check")
         # Return original string if aspell fails
         return string, 0, string
     finally:
         f.close()

@@ -109,7 +112,7 @@ def spell_correct(string):
     incorrect_words = list()
     correct_spelling = list()
     for i in range(1, len(incorrect)):
         if (len(incorrect[i]) > 10):
             #Reformat aspell output to make sense
             match = re.search(":", incorrect[i])
             if hasattr(match, "start"):

@@ -128,16 +131,16 @@ def spell_correct(string):
     #Create markup based on spelling errors
     newstring = string
     markup_string = string
     already_subbed = []
     for i in range(0, len(incorrect_words)):
         sub_pat = r"\b" + incorrect_words[i] + r"\b"
         sub_comp = re.compile(sub_pat)
         newstring = re.sub(sub_comp, correct_spelling[i], newstring)
         if incorrect_words[i] not in already_subbed:
             markup_string = re.sub(sub_comp, '<bs>' + incorrect_words[i] + "</bs>", markup_string)
             already_subbed.append(incorrect_words[i])

     return newstring, len(incorrect_words), markup_string

 def ngrams(tokens, min_n, max_n):

@@ -162,6 +165,7 @@ def f7(seq):
     """
     seen = set()
     seen_add = seen.add
+    #TODO Potential Improvment Here
     return [x for x in seq if x not in seen and not seen_add(x)]
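The f7 body is the classic order-preserving de-duplication idiom: set.add returns None, so `not seen_add(x)` records x as a side effect while keeping the filter expression true. For example:

```python
seq = [3, 1, 3, 2, 1]
seen = set()
seen_add = seen.add
print([x for x in seq if x not in seen and not seen_add(x)])  # [3, 1, 2]
```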
@@ -200,12 +204,12 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
     max_feats2 is the maximum number of features to consider in the second (final) pass
     Returns a list of words that constitute the significant vocabulary
     """
     dict = CountVectorizer(ngram_range=(1, 2), max_features=max_feats)
     dict_mat = dict.fit_transform(text)
     set_score = numpy.asarray(score, dtype=numpy.int)
     med_score = numpy.median(set_score)
     new_score = set_score
     if (med_score == 0):
         med_score = 1
     new_score[set_score < med_score] = 0
     new_score[set_score >= med_score] = 1

@@ -223,7 +227,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
         fish_vals.append(fish_val)

     cutoff = 1
     if (len(fish_vals) > max_feats2):
         cutoff = sorted(fish_vals)[max_feats2]
     good_cols = numpy.asarray([num for num in range(0, dict_mat.shape[1]) if fish_vals[num] <= cutoff])

@@ -253,12 +257,12 @@ def edit_distance(s1, s2):
             else:
                 cost = 1
             d[(i, j)] = min(
                 d[(i - 1, j)] + 1,  # deletion
                 d[(i, j - 1)] + 1,  # insertion
                 d[(i - 1, j - 1)] + cost,  # substitution
             )
             if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                 d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost)  # transposition

     return d[lenstr1 - 1, lenstr2 - 1]
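The loop body in the edit_distance hunk is the Damerau-Levenshtein recurrence (edit distance with adjacent transpositions). With cost 0 when the compared characters match and 1 otherwise, it computes

```latex
d_{i,j} = \min\bigl(d_{i-1,j} + 1,\; d_{i,j-1} + 1,\; d_{i-1,j-1} + \mathrm{cost}\bigr)
```

and, when $s_1[i] = s_2[j-1]$ and $s_1[i-1] = s_2[j]$, additionally

```latex
d_{i,j} = \min\bigl(d_{i,j},\; d_{i-2,j-2} + \mathrm{cost}\bigr)
```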
@@ -299,7 +303,7 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
         sim_fit = clf.fit(arr[loop_inds], set_score[loop_inds])
         preds.append(list(sim_fit.predict(arr[chunks[i]])))
     all_preds = list(chain(*preds))
     return(all_preds)

 def gen_model(clf, arr, sel_score):

@@ -312,7 +316,7 @@ def gen_model(clf, arr, sel_score):
     """
     set_score = numpy.asarray(sel_score, dtype=numpy.int)
     sim_fit = clf.fit(arr, set_score)
     return(sim_fit)

 def gen_preds(clf, arr):

@@ -322,7 +326,7 @@ def gen_preds(clf, arr):
     arr is a data array identical in dimension to the array clf was trained on
     Returns the array of predictions.
     """
     if (hasattr(clf, "predict_proba")):
         ret = clf.predict(arr)
         # pred_score=preds.argmax(1)+min(x._score)
     else:

@@ -340,8 +344,10 @@ def calc_list_average(l):
         total += value
     return total / len(l)

 stdev = lambda d: (sum((x - 1. * sum(d) / len(d)) ** 2 for x in d) / (1. * (len(d) - 1))) ** .5

 def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
     """
     Calculates kappa correlation between rater_a and rater_b.
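Written out, the stdev lambda above is the sample standard deviation with Bessel's correction:

```latex
s(d) = \sqrt{\frac{\sum_{x \in d} (x - \bar{x})^2}{|d| - 1}},
\qquad \bar{x} = \frac{1}{|d|} \sum_{x \in d} x
```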
@@ -352,7 +358,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     max_rating is an optional argument describing the maximum rating possible on the data set
     Returns a float corresponding to the kappa correlation
     """
     assert(len(rater_a) == len(rater_b))
     rater_a = [int(a) for a in rater_a]
     rater_b = [int(b) for b in rater_b]
     if min_rating is None:

@@ -360,7 +366,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     if max_rating is None:
         max_rating = max(rater_a + rater_b)
     conf_mat = confusion_matrix(rater_a, rater_b,
                                 min_rating, max_rating)
     num_ratings = len(conf_mat)
     num_scored_items = float(len(rater_a))

@@ -370,7 +376,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
     numerator = 0.0
     denominator = 0.0
     if (num_ratings > 1):
         for i in range(num_ratings):
             for j in range(num_ratings):
                 expected_count = (hist_rater_a[i] * hist_rater_b[j]
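For reference, the numerator and denominator assembled in this hunk implement the standard quadratic-weighted kappa: with $O$ the normalized confusion matrix of the two raters, $E$ the normalized outer product of their rating histograms, and $R$ rating levels,

```latex
\kappa = 1 - \frac{\sum_{i,j} w_{i,j}\, O_{i,j}}{\sum_{i,j} w_{i,j}\, E_{i,j}},
\qquad w_{i,j} = \frac{(i - j)^2}{(R - 1)^2}
```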
@@ -390,7 +396,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
     A confusion matrix shows how often 2 values agree and disagree
     See quadratic_weighted_kappa for argument descriptions
     """
     assert(len(rater_a) == len(rater_b))
     rater_a = [int(a) for a in rater_a]
     rater_b = [int(b) for b in rater_b]
     min_rating = int(min_rating)

@@ -450,7 +456,7 @@ def get_separator_words(toks1):
     Returns a list of separator words
     """
     tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
     if (os.path.isfile(ESSAY_COR_TOKENS_PATH)):
         toks2 = pickle.load(open(ESSAY_COR_TOKENS_PATH, 'rb'))
     else:
         essay_corpus = open(ESSAY_CORPUS_PATH).read()

@@ -460,12 +466,12 @@ def get_separator_words(toks1):
     sep_words = []
     for word in tab_toks1.keys():
         tok1_present = tab_toks1[word]
         if (tok1_present > 2):
             tok1_total = tab_toks1._N
             tok2_present = toks2[word]
             tok2_total = toks2._N
             fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
             if (fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2):
                 sep_words.append(word)
     sep_words = [w for w in sep_words if not w in nltk.corpus.stopwords.words("english") and len(w) > 5]

     return sep_words