Commit 5295df1d by Will Daly

Optimize database queries for student training

parent d71c867f
...@@ -316,8 +316,8 @@ def get_training_example(submission_uuid, rubric, examples): ...@@ -316,8 +316,8 @@ def get_training_example(submission_uuid, rubric, examples):
# Pick a training example that the student has not yet completed # Pick a training example that the student has not yet completed
# If the student already started a training example, then return that instead. # If the student already started a training example, then return that instead.
item = workflow.next_incomplete_item(examples) next_example = workflow.next_training_example(examples)
return None if item is None else serialize_training_example(item.training_example) return None if next_example is None else serialize_training_example(next_example)
except (InvalidRubric, InvalidTrainingExample) as ex: except (InvalidRubric, InvalidTrainingExample) as ex:
logger.exception( logger.exception(
"Could not deserialize training examples for submission UUID {}".format(submission_uuid) "Could not deserialize training examples for submission UUID {}".format(submission_uuid)
......
""" """
Django models specific to the student training assessment type. Django models specific to the student training assessment type.
""" """
from django.db import models, transaction from django.db import models
from django.utils import timezone from django.utils import timezone
from submissions import api as sub_api from submissions import api as sub_api
from .training import TrainingExample from .training import TrainingExample
...@@ -60,44 +60,6 @@ class StudentTrainingWorkflow(models.Model): ...@@ -60,44 +60,6 @@ class StudentTrainingWorkflow(models.Model):
course_id=student_item['course_id'] course_id=student_item['course_id']
) )
@transaction.commit_on_success
def create_workflow_item(self, training_example):
    """
    Create a workflow item for a training example
    and add it to the workflow.

    Args:
        training_example (TrainingExample): The training example model
            associated with the next workflow item.

    Returns:
        StudentTrainingWorkflowItem

    """
    # Items are numbered sequentially, starting at 1, in the order
    # they were added to this workflow.
    order_num = self.items.count() + 1  # pylint:disable=E1101
    item = StudentTrainingWorkflowItem.objects.create(
        workflow=self,
        order_num=order_num,
        training_example=training_example
    )
    self.items.add(item)  # pylint:disable=E1101
    self.save()
    return item
@property
def status(self):
    """
    The student's status within the workflow (num steps completed / num steps available).

    Returns:
        tuple of `(num_completed, num_total)`, both integers

    """
    # Materialize the items once so that the completed count and the
    # total are both computed from the same set of rows.
    items = list(self.items.all())  # pylint:disable=E1101
    num_complete = sum(1 for item in items if item.is_complete)
    return num_complete, len(items)
@property @property
def num_completed(self): def num_completed(self):
""" """
...@@ -110,26 +72,36 @@ class StudentTrainingWorkflow(models.Model): ...@@ -110,26 +72,36 @@ class StudentTrainingWorkflow(models.Model):
""" """
return self.items.filter(completed_at__isnull=False).count() # pylint:disable=E1101 return self.items.filter(completed_at__isnull=False).count() # pylint:disable=E1101
def next_incomplete_item(self, examples): def next_training_example(self, examples):
""" """
Find the next incomplete item in the workflow. Return the next training example for the student to assess.
If the student is already working on an example, return that.
Otherwise, choose an example the student hasn't seen
from the list of available examples.
Args: Args:
examples (list of TrainingExample): Training examples to choose from. examples (list of TrainingExample): Training examples to choose from.
Returns: Returns:
StudentTrainingWorkflowItem or None TrainingExample or None
""" """
# Fetch all the items for this workflow from the database
# Since Django's `select_related` does not follow reverse keys
# we perform the filter ourselves.
items = StudentTrainingWorkflowItem.objects.select_related(
'training_example'
).filter(workflow=self)
# If we're already working on an item, then return that item # If we're already working on an item, then return that item
current_item = self.current_item incomplete_items = [item for item in items if not item.is_complete]
if current_item is not None: if len(incomplete_items) > 0:
return current_item return incomplete_items[0].training_example
# Otherwise, pick an item that we have not completed # Otherwise, pick an item that we have not completed
# from the list of examples. # from the list of examples.
completed_examples = [ completed_examples = [
item.training_example for item in self.items.all() # pylint:disable=E1101 item.training_example for item in items
] ]
available_examples = [ available_examples = [
available for available in examples available for available in examples
...@@ -142,7 +114,14 @@ class StudentTrainingWorkflow(models.Model): ...@@ -142,7 +114,14 @@ class StudentTrainingWorkflow(models.Model):
# Otherwise, create a new workflow item for the example # Otherwise, create a new workflow item for the example
# and add it to the workflow # and add it to the workflow
else: else:
return self.create_workflow_item(available_examples[0]) order_num = len(items) + 1
next_example = available_examples[0]
StudentTrainingWorkflowItem.objects.create(
workflow=self,
order_num=order_num,
training_example=next_example
)
return next_example
@property @property
def current_item(self): def current_item(self):
...@@ -154,14 +133,13 @@ class StudentTrainingWorkflow(models.Model): ...@@ -154,14 +133,13 @@ class StudentTrainingWorkflow(models.Model):
StudentTrainingWorkflowItem or None StudentTrainingWorkflowItem or None
""" """
next_incomplete = self.items.filter( # pylint:disable=E1101 next_incomplete = self.items.select_related(
'training_example'
).filter( # pylint:disable=E1101
completed_at__isnull=True completed_at__isnull=True
).order_by('order_num')[:1] ).order_by('order_num')[:1]
if len(next_incomplete) > 0: return None if len(next_incomplete) == 0 else next_incomplete[0]
return next_incomplete[0]
else:
return None
class StudentTrainingWorkflowItem(models.Model): class StudentTrainingWorkflowItem(models.Model):
......
...@@ -3,6 +3,7 @@ Django models for training (both student and AI). ...@@ -3,6 +3,7 @@ Django models for training (both student and AI).
""" """
import json import json
from hashlib import sha1 from hashlib import sha1
from django.core.cache import cache
from django.db import models from django.db import models
from .base import Rubric, CriterionOption from .base import Rubric, CriterionOption
...@@ -22,29 +23,34 @@ class TrainingExample(models.Model): ...@@ -22,29 +23,34 @@ class TrainingExample(models.Model):
# SHA1 hash # SHA1 hash
content_hash = models.CharField(max_length=40, unique=True, db_index=True) content_hash = models.CharField(max_length=40, unique=True, db_index=True)
# Version for models serialized to the cache
# Increment this number whenever you update this model!
CACHE_KEY_VERSION = 1
class Meta: class Meta:
app_label = "assessment" app_label = "assessment"
@classmethod @classmethod
def create_example(cls, answer, options_ids, rubric): def create_example(cls, answer, options_selected, rubric):
""" """
Create a new training example. Create a new training example.
Args: Args:
answer (JSON-serializable): The answer associated with the training example. answer (JSON-serializable): The answer associated with the training example.
option_ids (iterable of int): Selected option IDs for the training example. options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
rubric (Rubric): The rubric associated with the training example. rubric (Rubric): The rubric associated with the training example.
Returns: Returns:
TrainingExample TrainingExample
""" """
content_hash = cls.calculate_hash(answer, options_ids, rubric) content_hash = cls.calculate_hash(answer, options_selected, rubric)
example = TrainingExample.objects.create( example = TrainingExample.objects.create(
content_hash=content_hash, content_hash=content_hash,
raw_answer=json.dumps(answer), raw_answer=json.dumps(answer),
rubric=rubric rubric=rubric
) )
options_ids = rubric.options_ids(options_selected)
for option in CriterionOption.objects.filter(pk__in=list(options_ids)): for option in CriterionOption.objects.filter(pk__in=list(options_ids)):
example.options_selected.add(option) example.options_selected.add(option)
...@@ -71,19 +77,50 @@ class TrainingExample(models.Model): ...@@ -71,19 +77,50 @@ class TrainingExample(models.Model):
dict: maps criterion names to selected option names dict: maps criterion names to selected option names
""" """
return { # Since training examples are immutable, we can safely cache this
option.criterion.name: option.name cache_key = self.cache_key_serialized(attribute="options_selected_dict")
for option in self.options_selected.all() # pylint:disable=E1101 options_selected = cache.get(cache_key)
} if options_selected is None:
options_selected = {
option.criterion.name: option.name
for option in self.options_selected.all() # pylint:disable=E1101
}
cache.set(cache_key, options_selected)
return options_selected
def cache_key_serialized(self, attribute=None):
    """
    Create a cache key based on the content hash
    for serialized versions of this model.

    The key embeds `CACHE_KEY_VERSION`, so changing that version
    yields different keys from those produced before the change.

    Kwargs:
        attribute: The name of the attribute being serialized.
            If not specified, assume that we are serializing the entire model.

    Returns:
        str: The cache key

    """
    if attribute is None:
        # Key for the serialized form of the whole model
        return u"TrainingExample.json.v{version}.{content_hash}".format(
            version=self.CACHE_KEY_VERSION,
            content_hash=self.content_hash
        )

    # Key for the serialized form of a single attribute
    return u"TrainingExample.{attribute}.json.v{version}.{content_hash}".format(
        version=self.CACHE_KEY_VERSION,
        content_hash=self.content_hash,
        attribute=attribute
    )
@staticmethod @staticmethod
def calculate_hash(answer, option_ids, rubric): def calculate_hash(answer, options_selected, rubric):
""" """
Calculate a hash for the contents of training example. Calculate a hash for the contents of training example.
Args: Args:
answer (JSON-serializable): The answer associated with the training example. answer (JSON-serializable): The answer associated with the training example.
option_ids (iterable of int): Selected option IDs for the training example. options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
rubric (Rubric): The rubric associated with the training example. rubric (Rubric): The rubric associated with the training example.
Returns: Returns:
...@@ -92,10 +129,28 @@ class TrainingExample(models.Model): ...@@ -92,10 +129,28 @@ class TrainingExample(models.Model):
""" """
contents = json.dumps({ contents = json.dumps({
'answer': answer, 'answer': answer,
'option_ids': list(option_ids), 'options_selected': options_selected,
'rubric': rubric.id 'rubric': rubric.id
}) })
return sha1(contents).hexdigest() return sha1(contents).hexdigest()
class Meta: @classmethod
app_label = "assessment" def cache_key(cls, answer, options_selected, rubric):
"""
Calculate a cache key based on the content hash.
Args:
answer (JSON-serializable): The answer associated with the training example.
options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
rubric (Rubric): The rubric associated with the training example.
Returns:
tuple of `(cache_key, content_hash)`: the cache key (a unicode string) and the hex-digest content hash embedded in it
"""
content_hash = cls.calculate_hash(answer, options_selected, rubric)
cache_key = u"TrainingExample.model.v{version}.{content_hash}".format(
version=cls.CACHE_KEY_VERSION,
content_hash=content_hash
)
return cache_key, content_hash
""" """
Serializers for the training assessment type. Serializers for the training assessment type.
""" """
import json from django.core.cache import cache
from django.db import transaction, IntegrityError from django.db import transaction, IntegrityError
from openassessment.assessment.models import TrainingExample from openassessment.assessment.models import TrainingExample
from .base import rubric_from_dict, RubricSerializer from .base import rubric_from_dict, RubricSerializer
...@@ -53,11 +53,17 @@ def serialize_training_example(example): ...@@ -53,11 +53,17 @@ def serialize_training_example(example):
dict dict
""" """
return { # Since training examples are immutable, we can safely cache them
'answer': example.answer, cache_key = example.cache_key_serialized()
'options_selected': example.options_selected_dict, example_dict = cache.get(cache_key)
'rubric': RubricSerializer.serialized_from_cache(example.rubric), if example_dict is None:
} example_dict = {
'answer': example.answer,
'options_selected': example.options_selected_dict,
'rubric': RubricSerializer.serialized_from_cache(example.rubric),
}
cache.set(cache_key, example_dict)
return example_dict
@transaction.commit_on_success @transaction.commit_on_success
...@@ -144,24 +150,31 @@ def deserialize_training_examples(examples, rubric_dict): ...@@ -144,24 +150,31 @@ def deserialize_training_examples(examples, rubric_dict):
# Parse each example # Parse each example
created_examples = [] created_examples = []
for example_dict in examples: for example_dict in examples:
is_valid, errors = validate_training_example_format(example_dict)
if not is_valid:
raise InvalidTrainingExample("; ".join(errors))
options_ids = rubric.options_ids(example_dict['options_selected']) # Try to retrieve the example from the cache
cache_key, content_hash = TrainingExample.cache_key(example_dict['answer'], example_dict['options_selected'], rubric)
example = cache.get(cache_key)
# Calculate the content hash to look up the example # If we couldn't retrieve the example from the cache, create it
content_hash = TrainingExample.calculate_hash(example_dict['answer'], options_ids, rubric) if example is None:
# Validate the training example
is_valid, errors = validate_training_example_format(example_dict)
if not is_valid:
raise InvalidTrainingExample("; ".join(errors))
try: # Get or create the training example
example = TrainingExample.objects.get(content_hash=content_hash)
except TrainingExample.DoesNotExist:
try: try:
example = TrainingExample.create_example(
example_dict['answer'], options_ids, rubric
)
except IntegrityError:
example = TrainingExample.objects.get(content_hash=content_hash) example = TrainingExample.objects.get(content_hash=content_hash)
except TrainingExample.DoesNotExist:
try:
example = TrainingExample.create_example(
example_dict['answer'], example_dict['options_selected'], rubric
)
except IntegrityError:
example = TrainingExample.objects.get(content_hash=content_hash)
# Add the example to the cache
cache.set(cache_key, example)
created_examples.append(example) created_examples.append(example)
......
...@@ -159,6 +159,60 @@ class StudentTrainingAssessmentTest(CacheResetTest): ...@@ -159,6 +159,60 @@ class StudentTrainingAssessmentTest(CacheResetTest):
next_retrieved = training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES) next_retrieved = training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
self.assertEqual(retrieved, next_retrieved) self.assertEqual(retrieved, next_retrieved)
def test_get_training_example_num_queries(self):
    """Check the number of queries used to retrieve training examples."""
    # Run through the training example once using a different submission
    # Training examples and rubrics will be cached and shared for other
    # students working on the same problem.
    self._warm_cache(self.RUBRIC, self.EXAMPLES)

    # First training example
    # This will need to create the student training workflow and the first item
    # NOTE: we *could* cache the rubric model to reduce the number of queries here,
    # but we're selecting it by content hash, which is indexed and should be plenty fast.
    with self.assertNumQueries(6):
        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)

    # Without assessing the first training example, try to retrieve a training example.
    # This should return the same example as before, so we won't need to create
    # any workflows or workflow items.
    with self.assertNumQueries(3):
        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)

    # Assess the current training example
    training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])

    # Retrieve the next training example, which requires us to create
    # a new workflow item (but not a new workflow).
    with self.assertNumQueries(4):
        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
def test_submitter_is_finished_num_queries(self):
    """Check the number of queries used to test whether training is finished."""
    # Complete the first training example
    training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
    training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])

    # Check whether we've completed the requirements
    requirements = {'num_required': 2}
    with self.assertNumQueries(2):
        training_api.submitter_is_finished(self.submission_uuid, requirements)
def test_get_num_completed_num_queries(self):
    """Check the number of queries used to count completed training examples."""
    # Complete the first training example
    training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
    training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])

    # Check the number completed
    with self.assertNumQueries(2):
        training_api.get_num_completed(self.submission_uuid)
def test_assess_training_example_num_queries(self):
    """Check the number of queries used to assess a training example."""
    # Populate the cache with training examples and rubrics
    self._warm_cache(self.RUBRIC, self.EXAMPLES)
    training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
    with self.assertNumQueries(4):
        training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])
@ddt.file_data('data/validate_training_examples.json') @ddt.file_data('data/validate_training_examples.json')
def test_validate_training_examples(self, data): def test_validate_training_examples(self, data):
errors = training_api.validate_training_examples( errors = training_api.validate_training_examples(
...@@ -319,3 +373,22 @@ class StudentTrainingAssessmentTest(CacheResetTest): ...@@ -319,3 +373,22 @@ class StudentTrainingAssessmentTest(CacheResetTest):
example = training_api.get_training_example(submission_uuid, input_rubric, input_examples) example = training_api.get_training_example(submission_uuid, input_rubric, input_examples)
expected_example = self._expected_example(input_examples[order_num], input_rubric) expected_example = self._expected_example(input_examples[order_num], input_rubric)
self.assertItemsEqual(example, expected_example) self.assertItemsEqual(example, expected_example)
def _warm_cache(self, rubric, examples):
    """
    Create a submission and complete student training.
    This will populate the cache with training examples and rubrics,
    which are immutable and shared for all students training on a particular problem.

    Args:
        rubric (dict): Serialized rubric model.
        examples (list of dict): Serialized training examples

    Returns:
        None

    """
    # Use a separate submission so the submission under test starts
    # with no workflow of its own.
    pre_submission = sub_api.create_submission(self.STUDENT_ITEM, self.ANSWER)
    for example in examples:
        # Retrieve each example and then assess it with its own selected options.
        training_api.get_training_example(pre_submission['uuid'], rubric, examples)
        training_api.assess_training_example(pre_submission['uuid'], example['options_selected'])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment