Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
da78277e
Commit
da78277e
authored
Jun 12, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleand up all of the files sytlistically
parent
a990b25e
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
183 additions
and
158 deletions
+183
-158
ease/create.py
+29
-24
ease/essay_set.py
+8
-7
ease/feature_extractor.py
+0
-0
ease/grade.py
+45
-39
ease/model_creator.py
+38
-32
ease/predictor_extractor.py
+10
-9
ease/predictor_set.py
+19
-19
ease/util_functions.py
+34
-28
No files found.
ease/create.py
View file @
da78277e
...
...
@@ -7,7 +7,7 @@ import sys
import
logging
import
numpy
#Define base path and add to sys path
#
Define base path and add to sys path
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
one_up_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'..//'
))
...
...
@@ -24,6 +24,7 @@ import json
#Make a log
log
=
logging
.
getLogger
(
__name__
)
def
dump_input_data
(
text
,
score
):
try
:
file_path
=
base_path
+
"/tests/data/json_data/"
...
...
@@ -32,14 +33,15 @@ def dump_input_data(text, score):
filename
=
prefix
+
time_suffix
+
".json"
json_data
=
[]
for
i
in
xrange
(
0
,
len
(
text
)):
json_data
.
append
({
'text'
:
text
[
i
],
'score'
:
score
[
i
]})
json_data
.
append
({
'text'
:
text
[
i
],
'score'
:
score
[
i
]})
with
open
(
file_path
+
filename
,
'w+'
)
as
outfile
:
json
.
dump
(
json_data
,
outfile
)
except
:
error
=
"Could not dump data to file."
log
.
exception
(
error
)
def
create
(
text
,
score
,
prompt_string
,
dump_data
=
False
):
def
create
(
text
,
score
,
prompt_string
,
dump_data
=
False
):
"""
Creates a machine learning model from input text, associated scores, a prompt, and a path to the model
TODO: Remove model path argument, it is needed for now to support legacy code
...
...
@@ -53,11 +55,11 @@ def create(text,score,prompt_string, dump_data=False):
algorithm
=
select_algorithm
(
score
)
#Initialize a results dictionary to return
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
,
'score'
:
score
,
'text'
:
text
,
'prompt'
:
prompt_string
}
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
,
'score'
:
score
,
'text'
:
text
,
'prompt'
:
prompt_string
}
if
len
(
text
)
!=
len
(
score
):
if
len
(
text
)
!=
len
(
score
):
msg
=
"Target and text lists must be same length."
results
[
'errors'
]
.
append
(
msg
)
log
.
exception
(
msg
)
...
...
@@ -72,13 +74,14 @@ def create(text,score,prompt_string, dump_data=False):
log
.
exception
(
msg
)
try
:
#Gets features from the essay set and computes error
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model
(
e_set
,
algorithm
=
algorithm
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model
(
e_set
,
algorithm
=
algorithm
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
results
[
'algorithm'
]
=
algorithm
results
[
'success'
]
=
True
results
[
'success'
]
=
True
except
:
msg
=
"feature extraction and model creation failed."
results
[
'errors'
]
.
append
(
msg
)
...
...
@@ -87,7 +90,7 @@ def create(text,score,prompt_string, dump_data=False):
return
results
def
create_generic
(
numeric_values
,
textual_values
,
target
,
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
):
def
create_generic
(
numeric_values
,
textual_values
,
target
,
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
):
"""
Creates a model from a generic list numeric values and text values
numeric_values - A list of lists that are the predictors
...
...
@@ -99,10 +102,10 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
algorithm
=
select_algorithm
(
target
)
#Initialize a result dictionary to return.
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
}
results
=
{
'errors'
:
[],
'success'
:
False
,
'cv_kappa'
:
0
,
'cv_mean_absolute_error'
:
0
,
'feature_ext'
:
""
,
'classifier'
:
""
,
'algorithm'
:
algorithm
}
if
len
(
numeric_values
)
!=
len
(
textual_values
)
or
len
(
numeric_values
)
!=
len
(
target
):
if
len
(
numeric_values
)
!=
len
(
textual_values
)
or
len
(
numeric_values
)
!=
len
(
target
):
msg
=
"Target, numeric features, and text features must all be the same length."
results
[
'errors'
]
.
append
(
msg
)
log
.
exception
(
msg
)
...
...
@@ -120,12 +123,13 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
try
:
#Extract all features and then train a classifier with the features
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model_predictors
(
pset
,
algorithm
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
results
[
'success'
]
=
True
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model_predictors
(
pset
,
algorithm
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
results
[
'cv_mean_absolute_error'
]
=
cv_error_results
[
'mae'
]
results
[
'feature_ext'
]
=
feature_ext
results
[
'classifier'
]
=
classifier
results
[
'success'
]
=
True
except
:
msg
=
"feature extraction and model creation failed."
results
[
'errors'
]
.
append
(
msg
)
...
...
@@ -133,11 +137,12 @@ def create_generic(numeric_values, textual_values, target, algorithm = util_func
return
results
def
select_algorithm
(
score_list
):
#Decide what algorithm to use (regression or classification)
try
:
#Count the number of unique score points in the score list
if
len
(
util_functions
.
f7
(
list
(
score_list
)))
>
5
:
if
len
(
util_functions
.
f7
(
list
(
score_list
)))
>
5
:
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
else
:
algorithm
=
util_functions
.
AlgorithmTypes
.
classification
...
...
ease/essay_set.py
View file @
da78277e
...
...
@@ -27,7 +27,7 @@ class EssaySet(object):
"""
Initialize variables and check essay set type
"""
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
essaytype
=
"train"
self
.
_type
=
essaytype
...
...
@@ -52,7 +52,7 @@ class EssaySet(object):
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if
(
len
(
self
.
_id
)
>
0
):
if
(
len
(
self
.
_id
)
>
0
):
max_id
=
max
(
self
.
_id
)
else
:
max_id
=
0
...
...
@@ -71,9 +71,10 @@ class EssaySet(object):
essay_text
=
str
(
essay_text
)
except
:
# Nothing needed here, will return error in any case.
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
log
.
exception
(
"Invalid type for essay score : {0} or essay text : {1}"
.
format
(
type
(
essay_score
),
type
(
essay_text
)))
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
if
isinstance
(
essay_score
,
int
)
and
isinstance
(
essay_text
,
basestring
)
\
and
(
essay_generated
==
0
or
essay_generated
==
1
):
self
.
_id
.
append
(
max_id
+
1
)
self
.
_score
.
append
(
essay_score
)
...
...
@@ -83,7 +84,7 @@ class EssaySet(object):
except
:
essay_text
=
(
essay_text
.
decode
(
'utf-8'
,
'replace'
))
.
encode
(
'ascii'
,
'ignore'
)
cleaned_essay
=
util_functions
.
sub_chars
(
essay_text
)
.
lower
()
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
if
(
len
(
cleaned_essay
)
>
MAXIMUM_ESSAY_LENGTH
):
cleaned_essay
=
cleaned_essay
[
0
:
MAXIMUM_ESSAY_LENGTH
]
self
.
_text
.
append
(
cleaned_essay
)
# Spell correct text using aspell
...
...
@@ -113,7 +114,7 @@ class EssaySet(object):
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if
(
isinstance
(
prompt_text
,
basestring
)):
if
(
isinstance
(
prompt_text
,
basestring
)):
self
.
_prompt
=
util_functions
.
sub_chars
(
prompt_text
)
ret
=
self
.
_prompt
else
:
...
...
@@ -134,7 +135,7 @@ class EssaySet(object):
all_syns
=
[]
for
word
in
e_toks
:
synonyms
=
util_functions
.
get_wordnet_syns
(
word
)
if
(
len
(
synonyms
)
>
max_syns
):
if
(
len
(
synonyms
)
>
max_syns
):
synonyms
=
random
.
sample
(
synonyms
,
max_syns
)
all_syns
.
append
(
synonyms
)
new_essays
=
[]
...
...
ease/feature_extractor.py
View file @
da78277e
This diff is collapsed.
Click to expand it.
ease/grade.py
View file @
da78277e
...
...
@@ -8,7 +8,7 @@ import os
import
numpy
import
logging
#Append sys to base path to import the following modules
#
Append sys to base path to import the following modules
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
...
...
@@ -25,7 +25,8 @@ import math
log
=
logging
.
getLogger
(
__name__
)
def
grade
(
grader_data
,
submission
):
def
grade
(
grader_data
,
submission
):
"""
Grades a specified submission using specified models
grader_data - A dictionary:
...
...
@@ -39,10 +40,10 @@ def grade(grader_data,submission):
"""
#Initialize result dictionary
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'feedback'
:
""
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
grader_set
=
EssaySet
(
essaytype
=
"test"
)
grader_set
=
EssaySet
(
essaytype
=
"test"
)
feedback
=
{}
model
,
extractor
=
get_classifier_and_ext
(
grader_data
)
...
...
@@ -53,28 +54,29 @@ def grade(grader_data,submission):
try
:
#Try to add essay to essay set object
grader_set
.
add_essay
(
str
(
submission
),
0
)
grader_set
.
add_essay
(
str
(
submission
),
0
)
grader_set
.
update_prompt
(
str
(
grader_data
[
'prompt'
]))
except
Exception
:
error_message
=
"Essay could not be added to essay set:{0}"
.
format
(
submission
)
log
.
exception
(
error_message
)
results
[
'errors'
]
.
append
(
error_message
)
has_error
=
True
has_error
=
True
#Try to extract features from submission and assign score via the model
try
:
grader_feats
=
extractor
.
gen_feats
(
grader_set
)
feedback
=
extractor
.
gen_feedback
(
grader_set
,
grader_feats
)[
0
]
results
[
'score'
]
=
int
(
model
.
predict
(
grader_feats
)[
0
])
grader_feats
=
extractor
.
gen_feats
(
grader_set
)
feedback
=
extractor
.
gen_feedback
(
grader_set
,
grader_feats
)[
0
]
results
[
'score'
]
=
int
(
model
.
predict
(
grader_feats
)[
0
])
except
Exception
:
error_message
=
"Could not extract features and score essay."
log
.
exception
(
error_message
)
results
[
'errors'
]
.
append
(
error_message
)
has_error
=
True
has_error
=
True
#Try to determine confidence level
try
:
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
model
,
grader_feats
,
results
[
'score'
],
grader_data
[
'score'
])
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
model
,
grader_feats
,
results
[
'score'
],
grader_data
[
'score'
])
except
Exception
:
#If there is an error getting confidence, it is not a show-stopper, so just log
log
.
exception
(
"Problem generating confidence value"
)
...
...
@@ -82,11 +84,11 @@ def grade(grader_data,submission):
if
not
has_error
:
#If the essay is just a copy of the prompt, return a 0 as the score
if
(
'too_similar_to_prompt'
in
feedback
and
feedback
[
'too_similar_to_prompt'
])
:
results
[
'score'
]
=
0
results
[
'correct'
]
=
False
if
'too_similar_to_prompt'
in
feedback
and
feedback
[
'too_similar_to_prompt'
]
:
results
[
'score'
]
=
0
results
[
'correct'
]
=
False
results
[
'success'
]
=
True
results
[
'success'
]
=
True
#Generate short form output--number of problem areas identified in feedback
...
...
@@ -94,24 +96,25 @@ def grade(grader_data,submission):
results
[
'feedback'
]
=
{}
if
'topicality'
in
feedback
and
'prompt_overlap'
in
feedback
:
results
[
'feedback'
]
.
update
({
'topicality'
:
feedback
[
'topicality'
],
'prompt-overlap'
:
feedback
[
'prompt_overlap'
],
})
'topicality'
:
feedback
[
'topicality'
],
'prompt-overlap'
:
feedback
[
'prompt_overlap'
],
})
results
[
'feedback'
]
.
update
(
{
'spelling'
:
feedback
[
'spelling'
],
'grammar'
:
feedback
[
'grammar'
],
'markup-text'
:
feedback
[
'markup_text'
],
}
'spelling'
:
feedback
[
'spelling'
],
'grammar'
:
feedback
[
'grammar'
],
'markup-text'
:
feedback
[
'markup_text'
],
}
)
else
:
#If error, success is False.
results
[
'success'
]
=
False
results
[
'success'
]
=
False
return
results
def
grade_generic
(
grader_data
,
numeric_features
,
textual_features
):
"""
Grades a set of numeric and textual features using a generic model
...
...
@@ -123,38 +126,38 @@ def grade_generic(grader_data, numeric_features, textual_features):
textual_features - list of textual feature to predict on
"""
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'success'
:
False
,
'confidence'
:
0
}
results
=
{
'errors'
:
[],
'tests'
:
[],
'score'
:
0
,
'success'
:
False
,
'confidence'
:
0
}
has_error
=
False
has_error
=
False
#Try to find and load the model file
grader_set
=
predictor_set
.
PredictorSet
(
essaytype
=
"test"
)
grader_set
=
predictor_set
.
PredictorSet
(
essaytype
=
"test"
)
model
,
extractor
=
get_classifier_and_ext
(
grader_data
)
#Try to add essays to essay set object
try
:
grader_set
.
add_row
(
numeric_features
,
textual_features
,
0
)
grader_set
.
add_row
(
numeric_features
,
textual_features
,
0
)
except
Exception
:
error_msg
=
"Row could not be added to predictor set:{0} {1}"
.
format
(
numeric_features
,
textual_features
)
log
.
exception
(
error_msg
)
results
[
'errors'
]
.
append
(
error_msg
)
has_error
=
True
has_error
=
True
#Try to extract features from submission and assign score via the model
try
:
grader_feats
=
extractor
.
gen_feats
(
grader_set
)
results
[
'score'
]
=
model
.
predict
(
grader_feats
)[
0
]
grader_feats
=
extractor
.
gen_feats
(
grader_set
)
results
[
'score'
]
=
model
.
predict
(
grader_feats
)[
0
]
except
Exception
:
error_msg
=
"Could not extract features and score essay."
log
.
exception
(
error_msg
)
results
[
'errors'
]
.
append
(
error_msg
)
has_error
=
True
has_error
=
True
#Try to determine confidence level
try
:
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
model
,
grader_feats
,
results
[
'score'
])
results
[
'confidence'
]
=
get_confidence_value
(
grader_data
[
'algorithm'
],
model
,
grader_feats
,
results
[
'score'
])
except
Exception
:
#If there is an error getting confidence, it is not a show-stopper, so just log
log
.
exception
(
"Problem generating confidence value"
)
...
...
@@ -164,7 +167,8 @@ def grade_generic(grader_data, numeric_features, textual_features):
return
results
def
get_confidence_value
(
algorithm
,
model
,
grader_feats
,
score
,
scores
):
def
get_confidence_value
(
algorithm
,
model
,
grader_feats
,
score
,
scores
):
"""
Determines a confidence in a certain score, given proper input parameters
algorithm- from util_functions.AlgorithmTypes
...
...
@@ -172,21 +176,23 @@ def get_confidence_value(algorithm,model,grader_feats,score, scores):
grader_feats - a row of features used by the model for classification/regression
score - The score assigned to the submission by a prior model
"""
min_score
=
min
(
numpy
.
asarray
(
scores
))
max_score
=
max
(
numpy
.
asarray
(
scores
))
min_score
=
min
(
numpy
.
asarray
(
scores
))
max_score
=
max
(
numpy
.
asarray
(
scores
))
if
algorithm
==
util_functions
.
AlgorithmTypes
.
classification
and
hasattr
(
model
,
"predict_proba"
):
#If classification, predict with probability, which gives you a matrix of confidences per score point
raw_confidence
=
model
.
predict_proba
(
grader_feats
)[
0
,(
float
(
score
)
-
float
(
min_score
))]
raw_confidence
=
model
.
predict_proba
(
grader_feats
)[
0
,
(
float
(
score
)
-
float
(
min_score
))]
#TODO: Normalize confidence somehow here
confidence
=
raw_confidence
confidence
=
raw_confidence
elif
hasattr
(
model
,
"predict"
):
raw_confidence
=
model
.
predict
(
grader_feats
)[
0
]
confidence
=
max
(
float
(
raw_confidence
)
-
math
.
floor
(
float
(
raw_confidence
)),
math
.
ceil
(
float
(
raw_confidence
))
-
float
(
raw_confidence
))
confidence
=
max
(
float
(
raw_confidence
)
-
math
.
floor
(
float
(
raw_confidence
)),
math
.
ceil
(
float
(
raw_confidence
))
-
float
(
raw_confidence
))
else
:
confidence
=
0
return
confidence
def
get_classifier_and_ext
(
grader_data
):
if
'classifier'
in
grader_data
:
model
=
grader_data
[
'classifier'
]
...
...
ease/model_creator.py
View file @
da78277e
#Provides interface functions to create and save models
#
Provides interface functions to create and save models
import
numpy
import
re
...
...
@@ -19,7 +19,8 @@ import feature_extractor
import
logging
import
predictor_extractor
log
=
logging
.
getLogger
()
log
=
logging
.
getLogger
()
def
read_in_test_data
(
filename
):
"""
...
...
@@ -49,7 +50,8 @@ def read_in_test_prompt(filename):
prompt_string
=
open
(
filename
)
.
read
()
return
prompt_string
def
read_in_test_data_twocolumn
(
filename
,
sep
=
","
):
def
read_in_test_data_twocolumn
(
filename
,
sep
=
","
):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
...
...
@@ -86,21 +88,22 @@ def create_essay_set(text, score, prompt_string, generate_additional=True):
return
x
def
get_cv_error
(
clf
,
feats
,
scores
):
def
get_cv_error
(
clf
,
feats
,
scores
):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
results
=
{
'success'
:
False
,
'kappa'
:
0
,
'mae'
:
0
}
try
:
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
err
=
numpy
.
mean
(
numpy
.
abs
(
numpy
.
array
(
cv_preds
)
-
scores
))
kappa
=
util_functions
.
quadratic_weighted_kappa
(
list
(
cv_preds
),
scores
)
results
[
'mae'
]
=
err
results
[
'kappa'
]
=
kappa
results
[
'success'
]
=
True
cv_preds
=
util_functions
.
gen_cv_preds
(
clf
,
feats
,
scores
)
err
=
numpy
.
mean
(
numpy
.
abs
(
numpy
.
array
(
cv_preds
)
-
scores
))
kappa
=
util_functions
.
quadratic_weighted_kappa
(
list
(
cv_preds
),
scores
)
results
[
'mae'
]
=
err
results
[
'kappa'
]
=
kappa
results
[
'success'
]
=
True
except
ValueError
as
ex
:
# If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
msg
=
u"Not enough classes (0,1,etc) in each cross validation fold: {ex}"
.
format
(
ex
=
ex
)
...
...
@@ -110,6 +113,7 @@ def get_cv_error(clf,feats,scores):
return
results
def
get_algorithms
(
algorithm
):
"""
Gets two classifiers for each type of algorithm, and returns them. First for predicting, second for cv error.
...
...
@@ -117,14 +121,14 @@ def get_algorithms(algorithm):
"""
if
algorithm
==
util_functions
.
AlgorithmTypes
.
classification
:
clf
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingClassifier
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
else
:
clf
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
clf2
=
sklearn
.
ensemble
.
GradientBoostingRegressor
(
n_estimators
=
100
,
learn_rate
=.
05
,
max_depth
=
4
,
random_state
=
1
,
min_samples_leaf
=
3
)
return
clf
,
clf2
...
...
@@ -134,7 +138,7 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmType
"""
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
if
(
algorithm
not
in
[
util_functions
.
AlgorithmTypes
.
regression
,
util_functions
.
AlgorithmTypes
.
classification
]):
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
f
=
predictor_extractor
.
PredictorExtractor
()
...
...
@@ -142,8 +146,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
train_feats
=
f
.
gen_feats
(
predictor_set
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
predictor_set
.
_target
)
try
:
set_score
=
numpy
.
asarray
(
predictor_set
.
_target
,
dtype
=
numpy
.
int
)
...
...
@@ -151,8 +155,8 @@ def extract_features_and_generate_model_predictors(predictor_set, algorithm=util
except
ValueError
:
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
set_score
=
predictor_set
.
_target
set_score
[
0
]
=
1
set_score
[
1
]
=
0
set_score
[
0
]
=
1
set_score
[
1
]
=
0
clf
.
fit
(
train_feats
,
set_score
)
return
f
,
clf
,
cv_error_results
...
...
@@ -172,25 +176,26 @@ def extract_features_and_generate_model(essays, algorithm=util_functions.Algorit
train_feats
=
f
.
gen_feats
(
essays
)
set_score
=
numpy
.
asarray
(
essays
.
_score
,
dtype
=
numpy
.
int
)
if
len
(
util_functions
.
f7
(
list
(
set_score
)))
>
5
:
if
len
(
util_functions
.
f7
(
list
(
set_score
)))
>
5
:
algorithm
=
util_functions
.
AlgorithmTypes
.
regression
else
:
algorithm
=
util_functions
.
AlgorithmTypes
.
classification
clf
,
clf2
=
get_algorithms
(
algorithm
)
clf
,
clf2
=
get_algorithms
(
algorithm
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
essays
.
_score
)
cv_error_results
=
get_cv_error
(
clf2
,
train_feats
,
essays
.
_score
)
try
:
clf
.
fit
(
train_feats
,
set_score
)
except
ValueError
:
log
.
exception
(
"Not enough classes (0,1,etc) in sample."
)
set_score
[
0
]
=
1
set_score
[
1
]
=
0
set_score
[
0
]
=
1
set_score
[
1
]
=
0
clf
.
fit
(
train_feats
,
set_score
)
return
f
,
clf
,
cv_error_results
def
dump_model_to_file
(
prompt_string
,
feature_ext
,
classifier
,
text
,
score
,
model_path
):
"""
Writes out a model to a file.
...
...
@@ -199,16 +204,17 @@ def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, mode
classifier is a trained classifier
model_path is the path of write out the model file to
"""
model_file
=
{
'prompt'
:
prompt_string
,
'extractor'
:
feature_ext
,
'model'
:
classifier
,
'text'
:
text
,
'score'
:
score
}
model_file
=
{
'prompt'
:
prompt_string
,
'extractor'
:
feature_ext
,
'model'
:
classifier
,
'text'
:
text
,
'score'
:
score
}
pickle
.
dump
(
model_file
,
file
=
open
(
model_path
,
"w"
))
def
create_essay_set_and_dump_model
(
text
,
score
,
prompt
,
model_path
,
additional_array
=
None
):
def
create_essay_set_and_dump_model
(
text
,
score
,
prompt
,
model_path
,
additional_array
=
None
):
"""
Function that creates essay set, extracts features, and writes out model
See above functions for argument descriptions
"""
essay_set
=
create_essay_set
(
text
,
score
,
prompt
)
feature_ext
,
clf
=
extract_features_and_generate_model
(
essay_set
,
additional_array
)
dump_model_to_file
(
prompt
,
feature_ext
,
clf
,
model_path
)
essay_set
=
create_essay_set
(
text
,
score
,
prompt
)
feature_ext
,
clf
=
extract_features_and_generate_model
(
essay_set
,
additional_array
)
dump_model_to_file
(
prompt
,
feature_ext
,
clf
,
model_path
)
ease/predictor_extractor.py
View file @
da78277e
...
...
@@ -16,17 +16,18 @@ import logging
import
math
from
feature_extractor
import
FeatureExtractor
#Append to path and then import things that depend on path
#
Append to path and then import things that depend on path
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
from
essay_set
import
EssaySet
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
class
PredictorExtractor
(
object
):
def
__init__
(
self
):
self
.
_extractors
=
[]
...
...
@@ -48,13 +49,13 @@ class PredictorExtractor(object):
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
div_length
=
len
(
p_set
.
_essay_sets
)
if
div_length
==
0
:
div_length
=
1
div_length
=
len
(
p_set
.
_essay_sets
)
if
div_length
==
0
:
div_length
=
1
#Ensures that even with a large amount of input textual features, training time stays reasonable
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
max_feats2
=
int
(
math
.
floor
(
200
/
div_length
))
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
self
.
_extractors
.
append
(
FeatureExtractor
())
self
.
_extractors
[
i
]
.
initialize_dictionaries
(
p_set
.
_essay_sets
[
i
],
max_feats2
=
max_feats2
)
self
.
_initialized
=
True
...
...
@@ -66,13 +67,13 @@ class PredictorExtractor(object):
Generates features based on an iput p_set
p_set - PredictorSet
"""
if
self
.
_initialized
!=
True
:
if
self
.
_initialized
!=
True
:
error_message
=
"Dictionaries have not been initialized."
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
p_set
,
error_message
)
textual_features
=
[]
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
for
i
in
xrange
(
0
,
len
(
p_set
.
_essay_sets
)):
textual_features
.
append
(
self
.
_extractors
[
i
]
.
gen_feats
(
p_set
.
_essay_sets
[
i
]))
textual_matrix
=
numpy
.
concatenate
(
textual_features
,
axis
=
1
)
...
...
ease/predictor_set.py
View file @
da78277e
...
...
@@ -11,26 +11,27 @@ sys.path.append(base_path)
import
util_functions
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
class
PredictorSet
(
object
):
def
__init__
(
self
,
essaytype
=
"train"
):
def
__init__
(
self
,
essaytype
=
"train"
):
"""
Initialize variables and check essay set type
"""
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
if
(
essaytype
!=
"train"
and
essaytype
!=
"test"
):
essaytype
=
"train"
self
.
_type
=
essaytype
self
.
_target
=
[]
self
.
_textual_features
=
[]
self
.
_numeric_features
=
[]
self
.
_essay_sets
=
[]
self
.
_target
=
[]
self
.
_textual_features
=
[]
self
.
_numeric_features
=
[]
self
.
_essay_sets
=
[]
def
add_row
(
self
,
numeric_features
,
textual_features
,
target
):
#Basic input checking
#
Basic input checking
if
not
isinstance
(
target
,
(
int
,
long
,
float
)):
error_message
=
"Target is not a numeric value."
log
.
exception
(
error_message
)
...
...
@@ -47,16 +48,16 @@ class PredictorSet(object):
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
#Do some length checking for parameters
if
len
(
self
.
_numeric_features
)
>
0
:
numeric_length
=
len
(
self
.
_numeric_features
[
-
1
])
if
len
(
self
.
_numeric_features
)
>
0
:
numeric_length
=
len
(
self
.
_numeric_features
[
-
1
])
current_numeric_length
=
len
(
numeric_features
)
if
numeric_length
!=
current_numeric_length
:
error_message
=
"Numeric features are an improper length."
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
if
len
(
self
.
_textual_features
)
>
0
:
textual_length
=
len
(
self
.
_textual_features
[
-
1
])
if
len
(
self
.
_textual_features
)
>
0
:
textual_length
=
len
(
self
.
_textual_features
[
-
1
])
current_textual_length
=
len
(
textual_features
)
if
textual_length
!=
current_textual_length
:
error_message
=
"Textual features are an improper length."
...
...
@@ -65,7 +66,7 @@ class PredictorSet(object):
#Now check to see if text features and numeric features are individually correct
for
i
in
xrange
(
0
,
len
(
numeric_features
)):
for
i
in
xrange
(
0
,
len
(
numeric_features
)):
try
:
numeric_features
[
i
]
=
float
(
numeric_features
[
i
])
except
:
...
...
@@ -73,8 +74,7 @@ class PredictorSet(object):
log
.
exception
(
error_message
)
raise
util_functions
.
InputError
(
numeric_features
,
error_message
)
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
try
:
textual_features
[
i
]
=
str
(
textual_features
[
i
]
.
encode
(
'ascii'
,
'ignore'
))
except
:
...
...
@@ -83,8 +83,8 @@ class PredictorSet(object):
raise
util_functions
.
InputError
(
textual_features
,
error_message
)
#Create essay sets for textual features if needed
if
len
(
self
.
_textual_features
)
==
0
:
for
i
in
xrange
(
0
,
len
(
textual_features
)):
if
len
(
self
.
_textual_features
)
==
0
:
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
.
append
(
essay_set
.
EssaySet
(
essaytype
=
self
.
_type
))
#Add numeric and textual features
...
...
@@ -95,6 +95,6 @@ class PredictorSet(object):
self
.
_target
.
append
(
target
)
#Add textual features to essay sets
for
i
in
xrange
(
0
,
len
(
textual_features
)):
for
i
in
xrange
(
0
,
len
(
textual_features
)):
self
.
_essay_sets
[
i
]
.
add_essay
(
textual_features
[
i
],
target
)
ease/util_functions.py
View file @
da78277e
#Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#
Collection of misc functions needed to support essay_set.py and feature_extractor.py.
#Requires aspell to be installed and added to the path
from
fisher
import
pvalue
...
...
@@ -15,17 +15,18 @@ import logging
import
sys
import
tempfile
log
=
logging
.
getLogger
(
__name__
)
log
=
logging
.
getLogger
(
__name__
)
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
if
not
base_path
.
endswith
(
"/"
):
base_path
=
base_path
+
"/"
base_path
=
base_path
+
"/"
#Paths to needed data files
ESSAY_CORPUS_PATH
=
base_path
+
"data/essaycorpus.txt"
ESSAY_COR_TOKENS_PATH
=
base_path
+
"data/essay_cor_tokens.p"
class
AlgorithmTypes
(
object
):
"""
Defines what types of algorithm can be used
...
...
@@ -33,20 +34,22 @@ class AlgorithmTypes(object):
regression
=
"regression"
classification
=
"classifiction"
def
create_model_path
(
model_path
):
"""
Creates a path to model files
model_path - string
"""
if
not
model_path
.
startswith
(
"/"
)
and
not
model_path
.
startswith
(
"models/"
):
model_path
=
"/"
+
model_path
model_path
=
"/"
+
model_path
if
not
model_path
.
startswith
(
"models"
):
model_path
=
"models"
+
model_path
if
not
model_path
.
endswith
(
".p"
):
model_path
+=
".p"
model_path
+=
".p"
return
model_path
def
sub_chars
(
string
):
"""
Strips illegal characters from a string. Used to sanitize input essays.
...
...
@@ -66,7 +69,7 @@ def sub_chars(string):
#Replace text. Ordering is very important!
nstring
=
re
.
sub
(
sub_pat
,
" "
,
string
)
nstring
=
re
.
sub
(
char_pat
,
" ."
,
nstring
)
nstring
=
re
.
sub
(
char_pat
,
" ."
,
nstring
)
nstring
=
re
.
sub
(
com_pat
,
" ,"
,
nstring
)
nstring
=
re
.
sub
(
ques_pat
,
" ?"
,
nstring
)
nstring
=
re
.
sub
(
excl_pat
,
" !"
,
nstring
)
...
...
@@ -101,7 +104,7 @@ def spell_correct(string):
except
Exception
:
log
.
exception
(
"aspell process failed; could not spell check"
)
# Return original string if aspell fails
return
string
,
0
,
string
return
string
,
0
,
string
finally
:
f
.
close
()
...
...
@@ -109,7 +112,7 @@ def spell_correct(string):
incorrect_words
=
list
()
correct_spelling
=
list
()
for
i
in
range
(
1
,
len
(
incorrect
)):
if
(
len
(
incorrect
[
i
])
>
10
):
if
(
len
(
incorrect
[
i
])
>
10
):
#Reformat aspell output to make sense
match
=
re
.
search
(
":"
,
incorrect
[
i
])
if
hasattr
(
match
,
"start"
):
...
...
@@ -128,16 +131,16 @@ def spell_correct(string):
#Create markup based on spelling errors
newstring
=
string
markup_string
=
string
already_subbed
=
[]
already_subbed
=
[]
for
i
in
range
(
0
,
len
(
incorrect_words
)):
sub_pat
=
r"\b"
+
incorrect_words
[
i
]
+
r"\b"
sub_comp
=
re
.
compile
(
sub_pat
)
newstring
=
re
.
sub
(
sub_comp
,
correct_spelling
[
i
],
newstring
)
if
incorrect_words
[
i
]
not
in
already_subbed
:
markup_string
=
re
.
sub
(
sub_comp
,
'<bs>'
+
incorrect_words
[
i
]
+
"</bs>"
,
markup_string
)
markup_string
=
re
.
sub
(
sub_comp
,
'<bs>'
+
incorrect_words
[
i
]
+
"</bs>"
,
markup_string
)
already_subbed
.
append
(
incorrect_words
[
i
])
return
newstring
,
len
(
incorrect_words
),
markup_string
return
newstring
,
len
(
incorrect_words
),
markup_string
def
ngrams
(
tokens
,
min_n
,
max_n
):
...
...
@@ -162,6 +165,7 @@ def f7(seq):
"""
seen
=
set
()
seen_add
=
seen
.
add
#TODO Potential Improvment Here
return
[
x
for
x
in
seq
if
x
not
in
seen
and
not
seen_add
(
x
)]
...
...
@@ -200,12 +204,12 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
max_feats2 is the maximum number of features to consider in the second (final) pass
Returns a list of words that constitute the significant vocabulary
"""
dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
max_features
=
max_feats
)
dict
=
CountVectorizer
(
ngram_range
=
(
1
,
2
),
max_features
=
max_feats
)
dict_mat
=
dict
.
fit_transform
(
text
)
set_score
=
numpy
.
asarray
(
score
,
dtype
=
numpy
.
int
)
med_score
=
numpy
.
median
(
set_score
)
new_score
=
set_score
if
(
med_score
==
0
):
if
(
med_score
==
0
):
med_score
=
1
new_score
[
set_score
<
med_score
]
=
0
new_score
[
set_score
>=
med_score
]
=
1
...
...
@@ -223,7 +227,7 @@ def get_vocab(text, score, max_feats=750, max_feats2=200):
fish_vals
.
append
(
fish_val
)
cutoff
=
1
if
(
len
(
fish_vals
)
>
max_feats2
):
if
(
len
(
fish_vals
)
>
max_feats2
):
cutoff
=
sorted
(
fish_vals
)[
max_feats2
]
good_cols
=
numpy
.
asarray
([
num
for
num
in
range
(
0
,
dict_mat
.
shape
[
1
])
if
fish_vals
[
num
]
<=
cutoff
])
...
...
@@ -253,12 +257,12 @@ def edit_distance(s1, s2):
else
:
cost
=
1
d
[(
i
,
j
)]
=
min
(
d
[(
i
-
1
,
j
)]
+
1
,
# deletion
d
[(
i
,
j
-
1
)]
+
1
,
# insertion
d
[(
i
-
1
,
j
-
1
)]
+
cost
,
# substitution
d
[(
i
-
1
,
j
)]
+
1
,
# deletion
d
[(
i
,
j
-
1
)]
+
1
,
# insertion
d
[(
i
-
1
,
j
-
1
)]
+
cost
,
# substitution
)
if
i
and
j
and
s1
[
i
]
==
s2
[
j
-
1
]
and
s1
[
i
-
1
]
==
s2
[
j
]:
d
[(
i
,
j
)]
=
min
(
d
[(
i
,
j
)],
d
[
i
-
2
,
j
-
2
]
+
cost
)
# transposition
d
[(
i
,
j
)]
=
min
(
d
[(
i
,
j
)],
d
[
i
-
2
,
j
-
2
]
+
cost
)
# transposition
return
d
[
lenstr1
-
1
,
lenstr2
-
1
]
...
...
@@ -299,7 +303,7 @@ def gen_cv_preds(clf, arr, sel_score, num_chunks=3):
sim_fit
=
clf
.
fit
(
arr
[
loop_inds
],
set_score
[
loop_inds
])
preds
.
append
(
list
(
sim_fit
.
predict
(
arr
[
chunks
[
i
]])))
all_preds
=
list
(
chain
(
*
preds
))
return
(
all_preds
)
return
(
all_preds
)
def
gen_model
(
clf
,
arr
,
sel_score
):
...
...
@@ -312,7 +316,7 @@ def gen_model(clf, arr, sel_score):
"""
set_score
=
numpy
.
asarray
(
sel_score
,
dtype
=
numpy
.
int
)
sim_fit
=
clf
.
fit
(
arr
,
set_score
)
return
(
sim_fit
)
return
(
sim_fit
)
def
gen_preds
(
clf
,
arr
):
...
...
@@ -322,7 +326,7 @@ def gen_preds(clf, arr):
arr is a data array identical in dimension to the array clf was trained on
Returns the array of predictions.
"""
if
(
hasattr
(
clf
,
"predict_proba"
)):
if
(
hasattr
(
clf
,
"predict_proba"
)):
ret
=
clf
.
predict
(
arr
)
# pred_score=preds.argmax(1)+min(x._score)
else
:
...
...
@@ -340,8 +344,10 @@ def calc_list_average(l):
total
+=
value
return
total
/
len
(
l
)
stdev
=
lambda
d
:
(
sum
((
x
-
1.
*
sum
(
d
)
/
len
(
d
))
**
2
for
x
in
d
)
/
(
1.
*
(
len
(
d
)
-
1
)))
**
.
5
def
quadratic_weighted_kappa
(
rater_a
,
rater_b
,
min_rating
=
None
,
max_rating
=
None
):
"""
Calculates kappa correlation between rater_a and rater_b.
...
...
@@ -352,7 +358,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
max_rating is an optional argument describing the maximum rating possible on the data set
Returns a float corresponding to the kappa correlation
"""
assert
(
len
(
rater_a
)
==
len
(
rater_b
))
assert
(
len
(
rater_a
)
==
len
(
rater_b
))
rater_a
=
[
int
(
a
)
for
a
in
rater_a
]
rater_b
=
[
int
(
b
)
for
b
in
rater_b
]
if
min_rating
is
None
:
...
...
@@ -360,7 +366,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
if
max_rating
is
None
:
max_rating
=
max
(
rater_a
+
rater_b
)
conf_mat
=
confusion_matrix
(
rater_a
,
rater_b
,
min_rating
,
max_rating
)
min_rating
,
max_rating
)
num_ratings
=
len
(
conf_mat
)
num_scored_items
=
float
(
len
(
rater_a
))
...
...
@@ -370,7 +376,7 @@ def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None)
numerator
=
0.0
denominator
=
0.0
if
(
num_ratings
>
1
):
if
(
num_ratings
>
1
):
for
i
in
range
(
num_ratings
):
for
j
in
range
(
num_ratings
):
expected_count
=
(
hist_rater_a
[
i
]
*
hist_rater_b
[
j
]
...
...
@@ -390,7 +396,7 @@ def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
A confusion matrix shows how often 2 values agree and disagree
See quadratic_weighted_kappa for argument descriptions
"""
assert
(
len
(
rater_a
)
==
len
(
rater_b
))
assert
(
len
(
rater_a
)
==
len
(
rater_b
))
rater_a
=
[
int
(
a
)
for
a
in
rater_a
]
rater_b
=
[
int
(
b
)
for
b
in
rater_b
]
min_rating
=
int
(
min_rating
)
...
...
@@ -450,7 +456,7 @@ def get_separator_words(toks1):
Returns a list of separator words
"""
tab_toks1
=
nltk
.
FreqDist
(
word
.
lower
()
for
word
in
toks1
)
if
(
os
.
path
.
isfile
(
ESSAY_COR_TOKENS_PATH
)):
if
(
os
.
path
.
isfile
(
ESSAY_COR_TOKENS_PATH
)):
toks2
=
pickle
.
load
(
open
(
ESSAY_COR_TOKENS_PATH
,
'rb'
))
else
:
essay_corpus
=
open
(
ESSAY_CORPUS_PATH
)
.
read
()
...
...
@@ -460,12 +466,12 @@ def get_separator_words(toks1):
sep_words
=
[]
for
word
in
tab_toks1
.
keys
():
tok1_present
=
tab_toks1
[
word
]
if
(
tok1_present
>
2
):
if
(
tok1_present
>
2
):
tok1_total
=
tab_toks1
.
_N
tok2_present
=
toks2
[
word
]
tok2_total
=
toks2
.
_N
fish_val
=
pvalue
(
tok1_present
,
tok2_present
,
tok1_total
,
tok2_total
)
.
two_tail
if
(
fish_val
<
.
001
and
tok1_present
/
float
(
tok1_total
)
>
(
tok2_present
/
float
(
tok2_total
))
*
2
):
if
(
fish_val
<
.
001
and
tok1_present
/
float
(
tok1_total
)
>
(
tok2_present
/
float
(
tok2_total
))
*
2
):
sep_words
.
append
(
word
)
sep_words
=
[
w
for
w
in
sep_words
if
not
w
in
nltk
.
corpus
.
stopwords
.
words
(
"english"
)
and
len
(
w
)
>
5
]
return
sep_words
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment