Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
edx-ora2
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
edx-ora2
Commits
0749ce1d
Commit
0749ce1d
authored
May 21, 2014
by
Will Daly
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added fake AI Algorithm; add EASE wrapper for AI training
parent
6883bfc0
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
377 additions
and
5 deletions
+377
-5
Vagrantfile
+1
-1
apps/openassessment/assessment/tasks.py
+6
-0
apps/openassessment/assessment/test/test_ai_algorithm.py
+133
-0
apps/openassessment/assessment/worker/algorithm.py
+232
-4
settings/dev.py
+5
-0
No files found.
Vagrantfile
View file @
0749ce1d
...
@@ -69,7 +69,7 @@ LOGIN_SCRIPT
...
@@ -69,7 +69,7 @@ LOGIN_SCRIPT
echo "Downloading NLTK corpus..."
echo "Downloading NLTK corpus..."
mkdir -p /home/vagrant/data
mkdir -p /home/vagrant/data
curl -o /home/vagrant/data/nltk.tmp.tar.tz http://edx-static.s3.amazonaws.com/nltk/nltk-data-20131113.tar.gz
curl -o /home/vagrant/data/nltk.tmp.tar.tz http://edx-static.s3.amazonaws.com/nltk/nltk-data-20131113.tar.gz
tar zxf /home/vagrant/data/nltk.tmp.tar.tz
cd /home/vagrant/data &&
tar zxf /home/vagrant/data/nltk.tmp.tar.tz
echo "Install edx-ora2..."
echo "Install edx-ora2..."
cd /home/vagrant/edx-ora2 && ./scripts/install.sh
cd /home/vagrant/edx-ora2 && ./scripts/install.sh
...
...
apps/openassessment/assessment/tasks.py
0 → 100644
View file @
0749ce1d
"""
Celery looks for tasks in this module,
so import the tasks we want the workers to implement.
"""
# pylint:disable=W0611
from
.worker.training
import
train_classifiers
apps/openassessment/assessment/test/test_ai_algorithm.py
0 → 100644
View file @
0749ce1d
# coding=utf-8
"""
Tests for AI algorithm implementations.
"""
import
unittest
import
mock
from
openassessment.test_utils
import
CacheResetTest
from
openassessment.assessment.worker.algorithm
import
(
AIAlgorithm
,
FakeAIAlgorithm
,
EaseAIAlgorithm
,
TrainingError
,
InvalidClassifier
)
# Training fixtures: each example pairs an essay text with its score.
# The essay texts contain non-ASCII characters, and the final example
# is an empty essay.
EXAMPLES = [
    AIAlgorithm.ExampleEssay(
        u"Mine's a tale that can't be told, my ƒяєє∂σм I hold dear.", 2
    ),
    AIAlgorithm.ExampleEssay(
        u"How years ago in days of old, when 𝒎𝒂𝒈𝒊𝒄 filled th air.", 1
    ),
    AIAlgorithm.ExampleEssay(
        u"Ṫ'ẅäṡ in the darkest depths of Ṁöṛḋöṛ, I met a girl so fair.", 1
    ),
    AIAlgorithm.ExampleEssay(
        u"But goレレuᄊ, and the evil one crept up and slipped away with her", 0
    ),
    AIAlgorithm.ExampleEssay(u"", 4)
]

# Essays handed to the trained classifiers for scoring; the last one is empty.
INPUT_ESSAYS = [
    u"Good times, 𝑩𝒂𝒅 𝑻𝒊𝒎𝒆𝒔, you know I had my share",
    u"When my woman left home for a 𝒃𝒓𝒐𝒘𝒏 𝒆𝒚𝒆𝒅 𝒎𝒂𝒏",
    u"Well, I still don't seem to 𝒄𝒂𝒓𝒆",
    u""
]
class AIAlgorithmTest(CacheResetTest):
    """
    Shared scaffolding for the AI algorithm test cases.

    Subclasses point `ALGORITHM_CLASS` at the implementation under test;
    `setUp` instantiates it as `self.algorithm` before each test.
    """
    ALGORITHM_CLASS = None

    def setUp(self):
        algorithm_cls = self.ALGORITHM_CLASS
        self.algorithm = algorithm_cls()  # pylint:disable=E1102

    def _scores(self, classifier, input_essays):
        """
        Score each essay in turn with the algorithm under test.

        Args:
            classifier: The trained classifier, as returned by
                the algorithm's `train_classifier()`.
            input_essays (list of unicode): The essays to score.

        Returns:
            list of int: One score per essay, in input order.

        """
        collected = []
        for essay in input_essays:
            collected.append(self.algorithm.score(essay, classifier))
        return collected
class FakeAIAlgorithmTest(AIAlgorithmTest):
    """
    Exercise the fake AI algorithm implementation.
    """
    ALGORITHM_CLASS = FakeAIAlgorithm

    def test_train_and_score(self):
        # Train on the shared fixtures, then score the shared input essays
        trained = self.algorithm.train_classifier(EXAMPLES)
        actual = self._scores(trained, INPUT_ESSAYS)
        self.assertEqual(actual, [2, 0, 0, 0])

    def test_score_classifier_missing_key(self):
        # A classifier without a 'scores' entry must be rejected
        self.assertRaises(
            InvalidClassifier,
            self.algorithm.score, u"Test input", {}
        )

    def test_score_classifier_no_scores(self):
        # A classifier whose 'scores' list is empty must be rejected as well
        empty_classifier = {'scores': []}
        self.assertRaises(
            InvalidClassifier,
            self.algorithm.score, u"Test input", empty_classifier
        )
# Probe for the optional EASE dependency; the tests that need it are
# skipped when the import fails.
try:
    import ease  # pylint: disable=F0401,W0611
except ImportError:
    EASE_INSTALLED = False
else:
    EASE_INSTALLED = True
@unittest.skipUnless(EASE_INSTALLED, "EASE library required")
class EaseAIAlgorithmTest(AIAlgorithmTest):
    """
    Exercise the wrapper around the EASE AI library.
    """
    ALGORITHM_CLASS = EaseAIAlgorithm

    def test_train_and_score(self):
        trained = self.algorithm.train_classifier(EXAMPLES)
        first_pass = self._scores(trained, INPUT_ESSAYS)

        # Every score must be one of the labels seen during training
        allowed = {example.score for example in EXAMPLES}
        for value in first_pass:
            self.assertIn(value, allowed)

        # Scoring the same essays a second time must give the same answers
        second_pass = self._scores(trained, INPUT_ESSAYS)
        self.assertEqual(first_pass, second_pass)

    def test_all_examples_have_same_score(self):
        same_score_examples = [
            AIAlgorithm.ExampleEssay(u"Test ëṡṡäÿ", 1),
            AIAlgorithm.ExampleEssay(u"Another test ëṡṡäÿ", 1),
        ]

        # Nothing to assert: training and scoring simply must not raise
        trained = self.algorithm.train_classifier(same_score_examples)
        self._scores(trained, INPUT_ESSAYS)

    def test_no_examples(self):
        # Training without any examples is reported as a TrainingError
        self.assertRaises(
            TrainingError,
            self.algorithm.train_classifier, []
        )

    @mock.patch('openassessment.assessment.worker.algorithm.pickle')
    def test_pickle_serialize_error(self, mock_pickle):
        # Make serialization blow up and check it surfaces as a TrainingError
        mock_pickle.dumps.side_effect = Exception("Test error!")
        self.assertRaises(
            TrainingError,
            self.algorithm.train_classifier, EXAMPLES
        )

    @mock.patch('openassessment.assessment.worker.algorithm.pickle')
    def test_pickle_deserialize_error(self, mock_pickle):
        # Training succeeds, but loading the classifier back fails
        mock_pickle.loads.side_effect = Exception("Test error!")
        trained = self.algorithm.train_classifier(EXAMPLES)
        self.assertRaises(
            InvalidClassifier,
            self.algorithm.score, u"Test ëṡṡäÿ", trained
        )

    def test_serialized_classifier_not_a_dict(self):
        # Anything other than a dictionary is not a valid classifier
        self.assertRaises(
            InvalidClassifier,
            self.algorithm.score, u"Test ëṡṡäÿ", "not a dict"
        )
apps/openassessment/assessment/worker/algorithm.py
View file @
0749ce1d
...
@@ -4,6 +4,8 @@ Define the ML algorithms used to train text classifiers.
...
@@ -4,6 +4,8 @@ Define the ML algorithms used to train text classifiers.
from
abc
import
ABCMeta
,
abstractmethod
from
abc
import
ABCMeta
,
abstractmethod
from
collections
import
namedtuple
from
collections
import
namedtuple
import
importlib
import
importlib
import
traceback
import
pickle
from
django.conf
import
settings
from
django.conf
import
settings
...
@@ -20,7 +22,7 @@ class UnknownAlgorithm(AIAlgorithmError):
...
@@ -20,7 +22,7 @@ class UnknownAlgorithm(AIAlgorithmError):
Algorithm ID not found in the configuration.
Algorithm ID not found in the configuration.
"""
"""
def
__init__
(
self
,
algorithm_id
):
def
__init__
(
self
,
algorithm_id
):
msg
=
u"Could not find algorithm
\"
u
{}
\"
in the configuration."
.
format
(
algorithm_id
)
msg
=
u"Could not find algorithm
\"
{}
\"
in the configuration."
.
format
(
algorithm_id
)
super
(
UnknownAlgorithm
,
self
)
.
__init__
(
msg
)
super
(
UnknownAlgorithm
,
self
)
.
__init__
(
msg
)
...
@@ -56,7 +58,6 @@ class InvalidClassifier(ScoreError):
...
@@ -56,7 +58,6 @@ class InvalidClassifier(ScoreError):
pass
pass
class
AIAlgorithm
(
object
):
class
AIAlgorithm
(
object
):
"""
"""
Abstract base class for a supervised ML text classification algorithm.
Abstract base class for a supervised ML text classification algorithm.
...
@@ -79,8 +80,7 @@ class AIAlgorithm(object):
...
@@ -79,8 +80,7 @@ class AIAlgorithm(object):
examples (list of AIAlgorithm.ExampleEssay): Example essays and scores.
examples (list of AIAlgorithm.ExampleEssay): Example essays and scores.
Returns:
Returns:
JSON-serializable: The trained classifier. This MUST be JSON-serializable;
JSON-serializable: The trained classifier. This MUST be JSON-serializable.
if any of the classifier data is binary, it should be base-64 encoded.
Raises:
Raises:
TrainingError: The classifier could not be trained successfully.
TrainingError: The classifier could not be trained successfully.
...
@@ -133,3 +133,231 @@ class AIAlgorithm(object):
...
@@ -133,3 +133,231 @@ class AIAlgorithm(object):
return
algorithm_cls
()
return
algorithm_cls
()
except
(
ImportError
,
AttributeError
):
except
(
ImportError
,
AttributeError
):
raise
AlgorithmLoadError
(
algorithm_id
,
cls_path
)
raise
AlgorithmLoadError
(
algorithm_id
,
cls_path
)
class FakeAIAlgorithm(AIAlgorithm):
    """
    Fake AI algorithm implementation that assigns scores deterministically.

    The "classifier" is simply the sorted list of score labels seen during
    training; an essay's score is picked from that list based on the
    essay's length.  We use this for testing the pipeline independently
    of EASE.
    """

    def train_classifier(self, examples):
        """
        Store the possible score labels, which will allow
        us to deterministically choose scores for other essays.

        Args:
            examples (list of AIAlgorithm.ExampleEssay): Example essays and scores.

        Returns:
            dict: A dictionary with a single key, 'scores', holding the
                unique score labels from `examples` in ascending order.

        """
        unique_sorted_scores = sorted(set(example.score for example in examples))
        return {'scores': unique_sorted_scores}

    def score(self, text, classifier):
        """
        Choose a score for the essay deterministically based on its length.

        Args:
            text (unicode): The essay text to score.
            classifier (dict): The classifier created by `train_classifier()`.

        Returns:
            The score label at index `len(text) % number_of_labels`.

        Raises:
            InvalidClassifier: The classifier is not a dictionary, or it
                has no score labels.

        """
        # Reject non-dict classifiers explicitly.  Without this check, values
        # such as None or a string containing "scores" would surface as a
        # TypeError instead of the InvalidClassifier that callers handle.
        if not isinstance(classifier, dict):
            raise InvalidClassifier("Classifier must be a dictionary.")

        labels = classifier.get('scores')
        if not labels:
            raise InvalidClassifier("Classifier must provide score labels")

        score_index = len(text) % len(labels)
        return labels[score_index]
class EaseAIAlgorithm(AIAlgorithm):
    """
    Wrapper for the EASE library.
    See https://github.com/edx/ease for more information.

    Since EASE has many system dependencies, we don't include it explicitly
    in edx-ora2 requirements.  When testing locally, we use the fake
    algorithm implementation instead.
    """

    def train_classifier(self, examples):
        """
        Train a text classifier using the EASE library.
        The classifier is serialized as a dictionary with keys:
            * 'feature_extractor': The pickled feature extractor (transforms text into a numeric feature vector).
            * 'score_classifier': The pickled classifier (uses the feature vector to assign scores to essays).

        Because we are using `pickle`, the serialized classifiers are unfortunately
        tied to the particular version of ease/scikit-learn/numpy/scipy/nltk that we
        have installed at the time of training.

        Args:
            examples (list of AIAlgorithm.ExampleEssay): Example essays and scores.

        Returns:
            dict: The serializable classifier.

        Raises:
            TrainingError: The classifier could not be trained successfully.

        """
        feature_ext, classifier = self._train_classifiers(examples)
        return self._serialize_classifiers(feature_ext, classifier)

    def score(self, text, classifier):
        """
        Score essays using EASE.

        Args:
            text (unicode): The essay text to score.
            classifier (dict): The serialized classifiers created during training.

        Returns:
            int

        Raises:
            InvalidClassifier: The serialized classifiers could not be loaded.
            ScoreError: EASE could not be imported, failed while grading,
                or did not report a score.

        """
        # EASE is imported lazily because it is an optional dependency.
        try:
            from ease.grade import grade  # pylint:disable=F0401
        except ImportError:
            msg = u"Could not import EASE to grade essays."
            raise ScoreError(msg)

        feature_extractor, score_classifier = self._deserialize_classifiers(classifier)

        grader_input = {
            'model': score_classifier,
            'extractor': feature_extractor,
            'prompt': ''
        }

        # EASE apparently can't handle non-ASCII unicode in the submission text
        # (although, oddly, training runs without error)
        # So we need to sanitize the input.
        sanitized_text = text.encode('ascii', 'ignore')

        # Wrap any failure inside EASE in a ScoreError.  We catch `Exception`
        # rather than using a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            results = grade(grader_input, sanitized_text)
        except Exception:  # pylint:disable=W0703
            msg = (
                u"An unexpected error occurred while using "
                u"EASE to score an essay: {traceback}"
            ).format(traceback=traceback.format_exc())
            raise ScoreError(msg)

        if not results.get('success', False):
            msg = (
                u"Errors occurred while scoring an essay "
                u"using EASE: {errors}"
            ).format(errors=results.get('errors', []))
            raise ScoreError(msg)

        score = results.get('score')
        if score is None:
            msg = u"Error retrieving the score from EASE"
            raise ScoreError(msg)

        return score

    def _train_classifiers(self, examples):
        """
        Use EASE to train classifiers.

        Args:
            examples (list of AIAlgorithm.ExampleEssay): Example essays and scores.

        Returns:
            tuple of `feature_extractor` (an `ease.feature_extractor.FeatureExtractor` object)
            and `classifier` (a `sklearn.ensemble.GradientBoostingClassifier` object).

        Raises:
            TrainingError: Could not load EASE or could not complete training.

        """
        # EASE is imported lazily because it is an optional dependency.
        try:
            from ease.create import create  # pylint: disable=F0401
        except ImportError:
            msg = u"Could not import EASE to perform training."
            raise TrainingError(msg)

        input_essays = [example.text for example in examples]
        input_scores = [example.score for example in examples]

        # As in `score()`, catch `Exception` rather than everything so that
        # KeyboardInterrupt and SystemExit are not converted to TrainingError.
        try:
            # Train the classifiers
            # The third argument is the essay prompt, which EASE uses
            # to check if an input essay is too similar to the prompt.
            # Since we're not using this feature, we pass in an empty string.
            results = create(input_essays, input_scores, "")
        except Exception:  # pylint:disable=W0703
            msg = (
                u"An unexpected error occurred while using "
                u"EASE to train classifiers: {traceback}"
            ).format(traceback=traceback.format_exc())
            raise TrainingError(msg)

        if not results.get('success', False):
            msg = (
                u"Errors occurred while training classifiers "
                u"using EASE: {errors}"
            ).format(errors=results.get('errors', []))
            raise TrainingError(msg)

        return results.get('feature_ext'), results.get('classifier')

    def _serialize_classifiers(self, feature_ext, classifier):
        """
        Serialize the classifier objects.

        Args:
            feature_ext (ease.feature_extractor.FeatureExtractor)
            classifier (sklearn.ensemble.GradientBoostingClassifier)

        Returns:
            dict containing the pickled classifiers, under the keys
            'feature_extractor' and 'score_classifier'.

        Raises:
            TrainingError: Could not serialize the classifiers.

        """
        try:
            return {
                'feature_extractor': pickle.dumps(feature_ext),
                'score_classifier': pickle.dumps(classifier),
            }
        except Exception as ex:  # pylint:disable=W0703
            msg = (
                u"An error occurred while serializing the classifiers "
                u"created by EASE: {ex}"
            ).format(ex=ex)
            raise TrainingError(msg)

    def _deserialize_classifiers(self, classifier_data):
        """
        Deserialize the classifier objects.

        Args:
            classifier_data (dict): The serialized classifiers.

        Returns:
            tuple of `(feature_extractor, score_classifier)`

        Raises:
            InvalidClassifier: `classifier_data` is not a dictionary, or
                one of the pickled objects could not be loaded.

        """
        if not isinstance(classifier_data, dict):
            raise InvalidClassifier("Classifier must be a dictionary.")

        # SECURITY NOTE: `pickle.loads` can execute arbitrary code embedded in
        # its input.  The classifier data passed in here must only ever come
        # from a trusted source (i.e. what `_serialize_classifiers` produced).
        try:
            feature_extractor = pickle.loads(classifier_data.get('feature_extractor'))
        except Exception as ex:  # pylint:disable=W0703
            msg = (
                u"An error occurred while deserializing the "
                u"EASE feature extractor: {ex}"
            ).format(ex=ex)
            raise InvalidClassifier(msg)

        try:
            score_classifier = pickle.loads(classifier_data.get('score_classifier'))
        except Exception as ex:  # pylint:disable=W0703
            msg = (
                u"An error occurred while deserializing the "
                u"EASE score classifier: {ex}"
            ).format(ex=ex)
            raise InvalidClassifier(msg)

        return feature_extractor, score_classifier
settings/dev.py
View file @
0749ce1d
...
@@ -100,3 +100,8 @@ LOGGING = {
...
@@ -100,3 +100,8 @@ LOGGING = {
# Store uploaded files in a dev-specific directory
# Store uploaded files in a dev-specific directory
MEDIA_ROOT
=
os
.
path
.
join
(
BASE_DIR
,
'storage/dev'
)
MEDIA_ROOT
=
os
.
path
.
join
(
BASE_DIR
,
'storage/dev'
)
# AI algorithm configuration:
# algorithm ID -> dotted path of the class that implements it.
ORA2_AI_ALGORITHMS = {
    'fake': 'openassessment.assessment.worker.algorithm.FakeAIAlgorithm',
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment