Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
ease
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
ease
Commits
049f0856
Commit
049f0856
authored
Jun 13, 2014
by
gradyward
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Refactored to delete the model_creator.py file.
Added all use cases to the create.py file.
parent
9a92afda
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
130 additions
and
124 deletions
+130
-124
ease/create.py
+130
-5
ease/model_creator.py
+0
-119
No files found.
ease/create.py
View file @
049f0856
...
...
@@ -8,13 +8,15 @@ import logging
import
numpy
# Define base path and add to sys path
from
ease
import
feature_extractor
from
ease.essay_set
import
EssaySet
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
one_up_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'..//'
))
sys
.
path
.
append
(
one_up_path
)
#Import modules that are dependent on the base path
import
model_creator
import
util_functions
from
errors
import
*
from
datetime
import
datetime
...
...
@@ -83,7 +85,7 @@ def create(examples, scores, prompt_string, dump_data=False):
# Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
try
:
essay_set
=
model_creator
.
create_essay_set
(
examples
,
scores
,
prompt_string
)
essay_set
=
create_essay_set
(
examples
,
scores
,
prompt_string
)
except
(
ExampleCreationRequestError
,
ExampleCreationInternalError
)
as
ex
:
msg
=
"essay set creation failed due to an error in the create_essay_set method. {}"
.
format
(
ex
)
results
[
'errors'
]
.
append
(
msg
)
...
...
@@ -92,7 +94,7 @@ def create(examples, scores, prompt_string, dump_data=False):
# Gets the features and classifiers from the essay set and computes the error
try
:
feature_ext
,
classifier
,
cv_error_results
=
model_creator
.
extract_features_and_generate_model
(
feature_ext
,
classifier
,
cv_error_results
=
extract_features_and_generate_model
(
essay_set
)
results
[
'cv_kappa'
]
=
cv_error_results
[
'kappa'
]
...
...
@@ -128,4 +130,127 @@ def select_algorithm(score_list):
if
len
(
set
(
score_list
))
>
5
:
return
util_functions
.
AlgorithmTypes
.
regression
else
:
return
util_functions
.
AlgorithmTypes
.
classification
\ No newline at end of file
return
util_functions
.
AlgorithmTypes
.
classification
def create_essay_set(text, score, prompt_string, generate_additional=True):
    """
    Creates an essay set from given data.

    Args:
        text: A list of strings corresponding to essay text.
        score: A list of scores where score[n] corresponds to text[n].
        prompt_string: A string containing the essay prompt.
        generate_additional: Whether to generate additional essays at the minimum score point or not.
            Any truthy value enables generation.

    Returns:
        The EssaySet holding every essay that was added, with its prompt updated to prompt_string.
    """
    essay_set = EssaySet()

    # The minimum score does not change while looping, so compute it once instead of rescanning the
    # whole score list for every essay. The length guard keeps an empty score list from raising here.
    min_score = min(score) if len(score) > 0 else None

    for i, essay_text in enumerate(text):
        essay_set.add_essay(essay_text, score[i])
        if generate_additional and score[i] == min_score:
            # Seed the generator with the last entry of _cleaned_spelled_essays
            # (presumably the essay that add_essay just stored -- confirm against EssaySet).
            essay_set.generate_additional_essays(essay_set._cleaned_spelled_essays[-1], score[i])

    essay_set.update_prompt(prompt_string)
    return essay_set
def extract_features_and_generate_model(essay_set):
    """
    Feed in an essay set to get feature vector and classifier

    Args:
        essay_set (EssaySet): The essay set to construct the feature extractor and model off of

    Returns:
        A tuple with the following elements in the following order:
        - The Trained Feature extractor
        - The Trained Classifier
        - Any Cross Validation results
    """
    feat_extractor = feature_extractor.FeatureExtractor(essay_set)
    features = feat_extractor.generate_features(essay_set)

    # The builtin int is what the numpy.int alias pointed at; recent NumPy releases removed the alias.
    set_scores = numpy.asarray(essay_set._scores, dtype=int)

    # select_algorithm is defined in this module. Inside this file the name `create` is the create()
    # function rather than a module, so `create.select_algorithm` would raise AttributeError.
    algorithm = select_algorithm(set_scores)

    predict_classifier, cv_error_classifier = get_algorithms(algorithm)

    # Cross validation is given the raw score list; the integer array is used for the final fit.
    cv_error_results = get_cv_error(cv_error_classifier, features, essay_set._scores)

    try:
        predict_classifier.fit(features, set_scores)
    except Exception:
        log.exception("Not enough classes (0,1,etc) in sample.")
        # Overwrite the first two labels so the sample holds two distinct classes, then retry the fit.
        # NOTE(review): this alters the training labels and needs at least two essays -- confirm intent.
        set_scores[0] = 1
        set_scores[1] = 0
        predict_classifier.fit(features, set_scores)

    return feat_extractor, predict_classifier, cv_error_results
def get_algorithms(algorithm):
    """
    Builds the pair of estimators used for one algorithm type.

    Args:
        algorithm: One of the Algorithm types defined in util_functions.AlgorithmTypes

    Returns:
        A tuple of the form (classifier, classifier), where
        the first estimator is used for predicting scores and
        the second is used for calculating cv error.
        Both are separate instances built with identical settings.
    """
    # Classification gets the boosting classifier; every other algorithm type gets the regressor.
    if algorithm == util_functions.AlgorithmTypes.classification:
        estimator_class = sklearn.ensemble.GradientBoostingClassifier
    else:
        estimator_class = sklearn.ensemble.GradientBoostingRegressor

    # Settings shared by the prediction estimator and the cross validation estimator.
    settings = dict(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)

    prediction_estimator = estimator_class(**settings)
    cv_estimator = estimator_class(**settings)
    return prediction_estimator, cv_estimator
def get_cv_error(classifier, features, scores):
    """
    Gets cross validated error for a given classifier, set of features, and scores

    Failures are logged rather than raised; the caller can tell them apart through the 'success' key.

    Args:
        classifier: The classifier to be used for CV
        features: The features to feed into the classifier and to cross validate over.
            Stored as a list of lists. Each row in the outer list associates with a single essay
        scores: the scores associated with each of the features. Feature row 1 associates with score 1, etc.

    Returns:
        (dict) with the following keys:
            'mae': Mean Absolute Error (measures the average deviation between AI grade and Human Grade)
            'kappa': Quadratic weighted kappa (measures the similarity between graders (AI and Human))
            'success': Whether or not the calculation was successful.
        When the calculation fails, 'mae' and 'kappa' keep their initial value of 0.
    """
    results = {'success': False, 'kappa': 0, 'mae': 0}
    try:
        cv_preds = util_functions.gen_cv_preds(classifier, features, scores)
        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
        results['mae'] = err
        results['kappa'] = kappa
        results['success'] = True
    except ValueError as ex:
        # If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
        # TODO Figure out why this error would occur in the first place.
        msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
        log.debug(msg)
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and SystemExit still propagate.
        log.exception("Error getting cv error estimates.")

    return results
\ No newline at end of file
ease/model_creator.py
deleted
100644 → 0
View file @
9a92afda
# Provides interface functions to create and save models
import
numpy
import
re
import
nltk
import
sys
from
sklearn.feature_extraction.text
import
CountVectorizer
import
pickle
import
os
import
sklearn.ensemble
from
itertools
import
chain
base_path
=
os
.
path
.
dirname
(
__file__
)
sys
.
path
.
append
(
base_path
)
from
essay_set
import
EssaySet
import
util_functions
import
feature_extractor
import
logging
import
create
log
=
logging
.
getLogger
()
def create_essay_set(text, score, prompt_string, generate_additional=True):
    """
    Creates an essay set from given data.

    Args:
        text: A list of strings corresponding to essay text.
        score: A list of scores where score[n] corresponds to text[n].
        prompt_string: A string containing the essay prompt.
        generate_additional: Whether to generate additional essays at the minimum score point or not.
            Any truthy value enables generation.

    Returns:
        The EssaySet holding every essay that was added, with its prompt updated to prompt_string.
    """
    essay_set = EssaySet()

    # Hoisted out of the loop: the lowest score is constant, so there is no need to rescan the score
    # list for every essay. The length guard keeps an empty score list from raising here.
    lowest_score = min(score) if len(score) > 0 else None

    for i, essay_text in enumerate(text):
        essay_set.add_essay(essay_text, score[i])
        if generate_additional and score[i] == lowest_score:
            # Seed the generator with the last entry of _cleaned_spelled_essays
            # (presumably the essay that add_essay just stored -- confirm against EssaySet).
            essay_set.generate_additional_essays(essay_set._cleaned_spelled_essays[-1], score[i])

    essay_set.update_prompt(prompt_string)
    return essay_set
def get_cv_error(clf, feats, scores):
    """
    Gets cross validated error for a given classifier, set of features, and scores

    Failures are logged rather than raised; the caller can tell them apart through the 'success' key.

    Args:
        clf: classifier
        feats: features to feed into the classifier and cross validate over
        scores: scores associated with the features -- feature row 1 associates with score 1, etc.

    Returns:
        (dict) with the keys 'mae', 'kappa' and 'success'. 'success' is True only when both metrics
        were computed; otherwise 'mae' and 'kappa' keep their initial value of 0.
    """
    results = {'success': False, 'kappa': 0, 'mae': 0}
    try:
        cv_preds = util_functions.gen_cv_preds(clf, feats, scores)
        err = numpy.mean(numpy.abs(numpy.array(cv_preds) - scores))
        kappa = util_functions.quadratic_weighted_kappa(list(cv_preds), scores)
        results['mae'] = err
        results['kappa'] = kappa
        results['success'] = True
    except ValueError as ex:
        # If this is hit, everything is fine.  It is hard to explain why the error occurs, but it isn't a big deal.
        msg = u"Not enough classes (0,1,etc) in each cross validation fold: {ex}".format(ex=ex)
        log.debug(msg)
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and SystemExit still propagate.
        log.exception("Error getting cv error estimates.")

    return results
def get_algorithms(algorithm):
    """
    Gets two estimators for the given type of algorithm, and returns them.

    Args:
        algorithm: one of util_functions.AlgorithmTypes

    Returns:
        A tuple of two separately constructed estimators with identical settings:
        the first for predicting, the second for cv error.
    """
    # Settings shared by both estimators of the pair.
    settings = dict(n_estimators=100, learn_rate=.05, max_depth=4, random_state=1, min_samples_leaf=3)

    # Only the classification type uses the classifier; anything else falls back to the regressor.
    use_classifier = algorithm == util_functions.AlgorithmTypes.classification
    if use_classifier:
        build = sklearn.ensemble.GradientBoostingClassifier
    else:
        build = sklearn.ensemble.GradientBoostingRegressor

    return build(**settings), build(**settings)
def extract_features_and_generate_model(essay_set):
    """
    Feed in an essay set to get feature vector and classifier

    Args:
        essay_set (EssaySet): The essay set to construct the feature extractor and model off of

    Returns:
        A tuple with the following elements in the following order:
        - The Trained Feature extractor
        - The Trained Classifier
        - Any Cross Validation results
    """
    feat_extractor = feature_extractor.FeatureExtractor(essay_set)
    features = feat_extractor.generate_features(essay_set)

    # The builtin int is what the numpy.int alias pointed at; recent NumPy releases removed the alias.
    set_score = numpy.asarray(essay_set._score, dtype=int)

    algorithm = create.select_algorithm(set_score)
    predict_classifier, cv_error_classifier = get_algorithms(algorithm)

    # Cross validation is given the raw score list; the integer array is used for the final fit.
    cv_error_results = get_cv_error(cv_error_classifier, features, essay_set._score)

    try:
        predict_classifier.fit(features, set_score)
    except Exception:
        log.exception("Not enough classes (0,1,etc) in sample.")
        # Overwrite the first two labels so the sample holds two distinct classes, then retry the fit.
        # NOTE(review): this alters the training labels and needs at least two essays -- confirm intent.
        set_score[0] = 1
        set_score[1] = 0
        predict_classifier.fit(features, set_score)

    return feat_extractor, predict_classifier, cv_error_results
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment