Optimize database queries for student training

5295df1d · Will Daly · d71c867f · 5295df1d · 5295df1d · 5295df1d
Commit 5295df1d authored May 15, 2014 by Will Daly
5 changed files
--- a/apps/openassessment/assessment/api/student_training.py
+++ b/apps/openassessment/assessment/api/student_training.py
@@ -316,8 +316,8 @@ def get_training_example(submission_uuid, rubric, examples):

        # Pick a training example that the student has not yet completed
        # If the student already started a training example, then return that instead.
-        item = workflow.next_incomplete_item(examples)
-        return None if item is None else serialize_training_example(item.training_example)
+        next_example = workflow.next_training_example(examples)
+        return None if next_example is None else serialize_training_example(next_example)
    except (InvalidRubric, InvalidTrainingExample) as ex:
        logger.exception(
            "Could not deserialize training examples for submission UUID {}".format(submission_uuid)

--- a/apps/openassessment/assessment/models/student_training.py
+++ b/apps/openassessment/assessment/models/student_training.py
 """
 Django models specific to the student training assessment type.
 """
-from django.db import models, transaction
+from django.db import models
 from django.utils import timezone
 from submissions import api as sub_api
 from .training import TrainingExample
@@ -60,44 +60,6 @@ class StudentTrainingWorkflow(models.Model):
            course_id=student_item['course_id']
        )

-    @transaction.commit_on_success
-    def create_workflow_item(self, training_example):
-        """
-        Create a workflow item for a training example
-        and add it to the workflow.
-
-        Args:
-            training_example (TrainingExample): The training example model
-                associated with the next workflow item.
-
-        Returns:
-            StudentTrainingWorkflowItem
-
-        """
-        order_num = self.items.count() + 1  # pylint:disable=E1101
-        item = StudentTrainingWorkflowItem.objects.create(
-            workflow=self,
-            order_num=order_num,
-            training_example=training_example
-        )
-        self.items.add(item)    # pylint:disable=E1101
-        self.save()
-        return item
-
-    @property
-    def status(self):
-        """
-        The student's status within the workflow (num steps completed / num steps available).
-
-        Returns:
-            tuple of `(num_completed, num_total)`, both integers
-
-        """
-        items = self.items.all()    # pylint:disable=E1101
-        num_complete = sum([1 if item.is_complete else 0 for item in items])
-        num_total = len(items)
-        return num_complete, num_total
-
    @property
    def num_completed(self):
        """
@@ -110,26 +72,36 @@ class StudentTrainingWorkflow(models.Model):
        """
        return self.items.filter(completed_at__isnull=False).count()  # pylint:disable=E1101

-    def next_incomplete_item(self, examples):
+    def next_training_example(self, examples):
        """
-        Find the next incomplete item in the workflow.
+        Return the next training example for the student to assess.
+        If the student is already working on an example, return that.
+        Otherwise, choose an example the student hasn't seen
+        from the list of available examples.

        Args:
            examples (list of TrainingExample): Training examples to choose from.

        Returns:
-            StudentTrainingWorkflowItem or None
+            TrainingExample or None

        """
+        # Fetch all the items for this workflow from the database.
+        # Since Django's `select_related` does not follow reverse keys,
+        # we perform the filter ourselves.
+        items = StudentTrainingWorkflowItem.objects.select_related(
+            'training_example'
+        ).filter(workflow=self)
+
        # If we're already working on an item, then return that item
-        current_item = self.current_item
-        if current_item is not None:
-            return current_item
+        incomplete_items = [item for item in items if not item.is_complete]
+        if len(incomplete_items) > 0:
+            return incomplete_items[0].training_example

        # Otherwise, pick an item that we have not completed
        # from the list of examples.
        completed_examples = [
-            item.training_example for item in self.items.all()  # pylint:disable=E1101
+            item.training_example for item in items
        ]
        available_examples = [
            available for available in examples
@@ -142,7 +114,14 @@ class StudentTrainingWorkflow(models.Model):
        # Otherwise, create a new workflow item for the example
        # and add it to the workflow
        else:
-            return self.create_workflow_item(available_examples[0])
+            order_num = len(items) + 1
+            next_example = available_examples[0]
+            StudentTrainingWorkflowItem.objects.create(
+                workflow=self,
+                order_num=order_num,
+                training_example=next_example
+            )
+            return next_example

    @property
    def current_item(self):
@@ -154,14 +133,13 @@ class StudentTrainingWorkflow(models.Model):
            StudentTrainingWorkflowItem or None

        """
-        next_incomplete = self.items.filter(  # pylint:disable=E1101
+        next_incomplete = self.items.select_related(
+            'training_example'
+        ).filter(  # pylint:disable=E1101
            completed_at__isnull=True
        ).order_by('order_num')[:1]

-        if len(next_incomplete) > 0:
-            return next_incomplete[0]
-        else:
-            return None
+        return None if len(next_incomplete) == 0 else next_incomplete[0]


 class StudentTrainingWorkflowItem(models.Model):

--- a/apps/openassessment/assessment/models/training.py
+++ b/apps/openassessment/assessment/models/training.py
@@ -3,6 +3,7 @@ Django models for training (both student and AI).
 """
 import json
 from hashlib import sha1
+from django.core.cache import cache
 from django.db import models
 from .base import Rubric, CriterionOption

@@ -22,29 +23,34 @@ class TrainingExample(models.Model):
    # SHA1 hash
    content_hash = models.CharField(max_length=40, unique=True, db_index=True)

+    # Version for models serialized to the cache
+    # Increment this number whenever you update this model!
+    CACHE_KEY_VERSION = 1
+
    class Meta:
        app_label = "assessment"

    @classmethod
-    def create_example(cls, answer, options_ids, rubric):
+    def create_example(cls, answer, options_selected, rubric):
        """
        Create a new training example.

        Args:
            answer (JSON-serializable): The answer associated with the training example.
-            option_ids (iterable of int): Selected option IDs for the training example.
+            options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
            rubric (Rubric): The rubric associated with the training example.

        Returns:
            TrainingExample

        """
-        content_hash = cls.calculate_hash(answer, options_ids, rubric)
+        content_hash = cls.calculate_hash(answer, options_selected, rubric)
        example = TrainingExample.objects.create(
            content_hash=content_hash,
            raw_answer=json.dumps(answer),
            rubric=rubric
        )
+        options_ids = rubric.options_ids(options_selected)

        for option in CriterionOption.objects.filter(pk__in=list(options_ids)):
            example.options_selected.add(option)
@@ -71,19 +77,50 @@ class TrainingExample(models.Model):
            dict: maps criterion names to selected option names

        """
-        return {
-            option.criterion.name: option.name
-            for option in self.options_selected.all()  # pylint:disable=E1101
-        }
+        # Since training examples are immutable, we can safely cache this
+        cache_key = self.cache_key_serialized(attribute="options_selected_dict")
+        options_selected = cache.get(cache_key)
+        if options_selected is None:
+            options_selected = {
+                option.criterion.name: option.name
+                for option in self.options_selected.all()  # pylint:disable=E1101
+            }
+            cache.set(cache_key, options_selected)
+        return options_selected
+
+    def cache_key_serialized(self, attribute=None):
+        """
+        Create a cache key based on the content hash
+        for serialized versions of this model.
+
+        Kwargs:
+            attribute: The name of the attribute being serialized.
+                If not specified, assume that we are serializing the entire model.
+
+        Returns:
+            str: The cache key
+
+        """
+        if attribute is None:
+            key_template = u"TrainingExample.json.v{version}.{content_hash}"
+        else:
+            key_template = u"TrainingExample.{attribute}.json.v{version}.{content_hash}"
+
+        cache_key = key_template.format(
+            version=self.CACHE_KEY_VERSION,
+            content_hash=self.content_hash,
+            attribute=attribute
+        )
+        return cache_key

    @staticmethod
-    def calculate_hash(answer, option_ids, rubric):
+    def calculate_hash(answer, options_selected, rubric):
        """
        Calculate a hash for the contents of training example.

        Args:
            answer (JSON-serializable): The answer associated with the training example.
-            option_ids (iterable of int): Selected option IDs for the training example.
+            options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
            rubric (Rubric): The rubric associated with the training example.

        Returns:
@@ -92,10 +129,28 @@ class TrainingExample(models.Model):
        """
        contents = json.dumps({
            'answer': answer,
-            'option_ids': list(option_ids),
+            'options_selected': options_selected,
            'rubric': rubric.id
        })
        return sha1(contents).hexdigest()

-    class Meta:
-        app_label = "assessment"
+    @classmethod
+    def cache_key(cls, answer, options_selected, rubric):
+        """
+        Calculate a cache key based on the content hash.
+
+        Args:
+            answer (JSON-serializable): The answer associated with the training example.
+            options_selected (dict): The options selected from the rubric (mapping of criterion names to option names)
+            rubric (Rubric): The rubric associated with the training example.
+
+        Returns:
+            tuple of `(cache_key, content_hash)`: a unicode cache key and the hex-encoded SHA1 content hash
+
+        """
+        content_hash = cls.calculate_hash(answer, options_selected, rubric)
+        cache_key = u"TrainingExample.model.v{version}.{content_hash}".format(
+            version=cls.CACHE_KEY_VERSION,
+            content_hash=content_hash
+        )
+        return cache_key, content_hash
--- a/apps/openassessment/assessment/serializers/training.py
+++ b/apps/openassessment/assessment/serializers/training.py
 """
 Serializers for the training assessment type.
 """
-import json
+from django.core.cache import cache
 from django.db import transaction, IntegrityError
 from openassessment.assessment.models import TrainingExample
 from .base import rubric_from_dict, RubricSerializer
@@ -53,11 +53,17 @@ def serialize_training_example(example):
        dict

    """
-    return {
-        'answer': example.answer,
-        'options_selected': example.options_selected_dict,
-        'rubric': RubricSerializer.serialized_from_cache(example.rubric),
-    }
+    # Since training examples are immutable, we can safely cache them
+    cache_key = example.cache_key_serialized()
+    example_dict = cache.get(cache_key)
+    if example_dict is None:
+        example_dict = {
+            'answer': example.answer,
+            'options_selected': example.options_selected_dict,
+            'rubric': RubricSerializer.serialized_from_cache(example.rubric),
+        }
+        cache.set(cache_key, example_dict)
+    return example_dict


 @transaction.commit_on_success
@@ -144,24 +150,31 @@ def deserialize_training_examples(examples, rubric_dict):
    # Parse each example
    created_examples = []
    for example_dict in examples:
-        is_valid, errors = validate_training_example_format(example_dict)
-        if not is_valid:
-            raise InvalidTrainingExample("; ".join(errors))

-        options_ids = rubric.options_ids(example_dict['options_selected'])
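+        # Try to retrieve the example from the cache. NOTE(review): this reads example_dict keys before the format validation below -- confirm callers pre-validate.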
+        cache_key, content_hash = TrainingExample.cache_key(example_dict['answer'], example_dict['options_selected'], rubric)
+        example = cache.get(cache_key)

-        # Calculate the content hash to look up the example
-        content_hash = TrainingExample.calculate_hash(example_dict['answer'], options_ids, rubric)
+        # If we couldn't retrieve the example from the cache, create it
+        if example is None:
+            # Validate the training example
+            is_valid, errors = validate_training_example_format(example_dict)
+            if not is_valid:
+                raise InvalidTrainingExample("; ".join(errors))

-        try:
-            example = TrainingExample.objects.get(content_hash=content_hash)
-        except TrainingExample.DoesNotExist:
+            # Get or create the training example
            try:
-                example = TrainingExample.create_example(
-                    example_dict['answer'], options_ids, rubric
-                )
-            except IntegrityError:
                example = TrainingExample.objects.get(content_hash=content_hash)
+            except TrainingExample.DoesNotExist:
+                try:
+                    example = TrainingExample.create_example(
+                        example_dict['answer'], example_dict['options_selected'], rubric
+                    )
+                except IntegrityError:
+                    example = TrainingExample.objects.get(content_hash=content_hash)
+
+            # Add the example to the cache
+            cache.set(cache_key, example)

        created_examples.append(example)


--- a/apps/openassessment/assessment/test/test_student_training.py
+++ b/apps/openassessment/assessment/test/test_student_training.py
@@ -159,6 +159,60 @@ class StudentTrainingAssessmentTest(CacheResetTest):
        next_retrieved = training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
        self.assertEqual(retrieved, next_retrieved)

+    def test_get_training_example_num_queries(self):
+
+        # Run through the training examples once using a different submission.
+        # Training examples and rubrics will be cached and shared with other
+        # students working on the same problem.
+        self._warm_cache(self.RUBRIC, self.EXAMPLES)
+
+        # First training example
+        # This will need to create the student training workflow and the first item
+        # NOTE: we *could* cache the rubric model to reduce the number of queries here,
+        # but we're selecting it by content hash, which is indexed and should be plenty fast.
+        with self.assertNumQueries(6):
+            training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+
+        # Without assessing the first training example, try to retrieve a training example.
+        # This should return the same example as before, so we won't need to create
+        # any workflows or workflow items.
+        with self.assertNumQueries(3):
+            training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+
+        # Assess the current training example
+        training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])
+
+        # Retrieve the next training example, which requires us to create
+        # a new workflow item (but not a new workflow).
+        with self.assertNumQueries(4):
+            training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+
+    def test_submitter_is_finished_num_queries(self):
+        # Complete the first training example
+        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+        training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])
+
+        # Check whether we've completed the requirements
+        requirements = {'num_required': 2}
+        with self.assertNumQueries(2):
+            training_api.submitter_is_finished(self.submission_uuid, requirements)
+
+    def test_get_num_completed_num_queries(self):
+        # Complete the first training example
+        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+        training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])
+
+        # Check the number completed
+        with self.assertNumQueries(2):
+            training_api.get_num_completed(self.submission_uuid)
+
+    def test_assess_training_example_num_queries(self):
+        # Populate the cache with training examples and rubrics
+        self._warm_cache(self.RUBRIC, self.EXAMPLES)
+        training_api.get_training_example(self.submission_uuid, self.RUBRIC, self.EXAMPLES)
+        with self.assertNumQueries(4):
+            training_api.assess_training_example(self.submission_uuid, self.EXAMPLES[0]['options_selected'])
+
    @ddt.file_data('data/validate_training_examples.json')
    def test_validate_training_examples(self, data):
        errors = training_api.validate_training_examples(
@@ -319,3 +373,22 @@ class StudentTrainingAssessmentTest(CacheResetTest):
        example = training_api.get_training_example(submission_uuid, input_rubric, input_examples)
        expected_example = self._expected_example(input_examples[order_num], input_rubric)
        self.assertItemsEqual(example, expected_example)
+
+    def _warm_cache(self, rubric, examples):
+        """
+        Create a submission and complete student training.
+        This will populate the cache with training examples and rubrics,
+        which are immutable and shared for all students training on a particular problem.
+
+        Args:
+            rubric (dict): Serialized rubric model.
+            examples (list of dict): Serialized training examples
+
+        Returns:
+            None
+
+        """
+        pre_submission = sub_api.create_submission(self.STUDENT_ITEM, self.ANSWER)
+        for example in examples:
+            training_api.get_training_example(pre_submission['uuid'], rubric, examples)
+            training_api.assess_training_example(pre_submission['uuid'], example['options_selected'])