Commit 4b6cff6c by David Ormsbee

Merge pull request #182 from edx/ormsbee/kill_queries

Reducing query count
parents c1dbf128 428824c3
......@@ -15,6 +15,7 @@ from copy import deepcopy
 from hashlib import sha1
 import json
 
+from django.core.cache import cache
 from django.db import models
 from django.utils.timezone import now
 from django.utils.translation import ugettext as _
......@@ -105,17 +106,35 @@ class Rubric(models.Model):
            InvalidOptionSelection: the selected options do not match the rubric.
        """
-        # Select all criteria and options for this rubric
-        # We use `select_related()` to minimize the number of database queries
-        rubric_options = CriterionOption.objects.filter(criterion__rubric=self).select_related()
+        # Cache based on the content_hash, not the id. It's slightly safer, and
+        # we don't have to worry about invalidation of the cache while running
+        # tests.
+        rubric_criteria_dict_cache_key = (
+            "assessment.rubric_criteria_dict.{}".format(self.content_hash)
+        )
+
        # Create a dict of dicts that maps:
        # criterion names --> option names --> option ids
-        rubric_criteria_dict = defaultdict(dict)
+        #
+        # If we've already generated one of these for this rubric, grab it from
+        # the cache instead of hitting the database again.
+        rubric_criteria_dict = cache.get(rubric_criteria_dict_cache_key)
+
+        if not rubric_criteria_dict:
+            rubric_criteria_dict = defaultdict(dict)
+
+            # Select all criteria and options for this rubric
+            # We use `select_related()` to minimize the number of database queries
+            rubric_options = CriterionOption.objects.filter(
+                criterion__rubric=self
+            ).select_related()
 
-        # Construct dictionaries for each option in the rubric
-        for option in rubric_options:
-            rubric_criteria_dict[option.criterion.name][option.name] = option.id
+            # Construct dictionaries for each option in the rubric
+            for option in rubric_options:
+                rubric_criteria_dict[option.criterion.name][option.name] = option.id
+
+            # Save it in our cache
+            cache.set(rubric_criteria_dict_cache_key, rubric_criteria_dict)
 
        # Validate: are options selected for each criterion in the rubric?
        if len(options_selected) != len(rubric_criteria_dict):
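
The change above is a cache-aside read keyed on `content_hash` rather than the row id: identical rubric content shares one cache entry, and because the key is derived from the content itself, a changed rubric gets a new key and there is nothing to invalidate. A minimal standalone sketch of the same pattern (the function name and key prefix are illustrative, not from this codebase):

    import json
    from hashlib import sha1

    from django.core.cache import cache

    def cached_by_content(content, compute, prefix="demo.by_content"):
        """Cache-aside keyed on a hash of the content itself."""
        content_hash = sha1(
            json.dumps(content, sort_keys=True).encode("utf-8")
        ).hexdigest()
        key = "{}.{}".format(prefix, content_hash)
        result = cache.get(key)
        if result is None:
            result = compute(content)  # the expensive DB/CPU work runs once
            cache.set(key, result)
        return result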
......@@ -329,11 +348,26 @@ class Assessment(models.Model):
                "bar": [6, 7, 8]
            }
        """
+        assessments = list(assessments)  # Force us to read it all
+        if not assessments:
+            return []
+
+        # Generate a cache key that represents all the assessments we're being
+        # asked to grab scores from (comma separated list of assessment IDs)
+        cache_key = "assessments.scores_by_criterion.{}".format(
+            ",".join(str(assessment.id) for assessment in assessments)
+        )
+
+        scores = cache.get(cache_key)
+        if scores:
+            return scores
+
        scores = defaultdict(list)
        for assessment in assessments:
-            for part in assessment.parts.all():
+            for part in assessment.parts.all().select_related("option__criterion"):
                criterion_name = part.option.criterion.name
                scores[criterion_name].append(part.option.points)
+
+        cache.set(cache_key, scores)
+
        return scores
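
The cache key above encodes the exact set of assessment IDs, so each distinct combination gets its own entry; this is safe as long as a scored assessment's parts never change after creation. A toy illustration of the key format (the IDs are made up):

    assessment_ids = [12, 45, 46]
    cache_key = "assessments.scores_by_criterion.{}".format(
        ",".join(str(_id) for _id in assessment_ids)
    )
    # cache_key == "assessments.scores_by_criterion.12,45,46"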
......@@ -362,6 +396,14 @@ class AssessmentPart(models.Model):
     def points_possible(self):
         return self.option.criterion.points_possible
 
+    @classmethod
+    def add_to_assessment(cls, assessment, option_ids):
+        """Creates AssessmentParts and adds them to `assessment`."""
+        cls.objects.bulk_create([
+            cls(assessment=assessment, option_id=option_id)
+            for option_id in option_ids
+        ])
+
 
 class AssessmentFeedback(models.Model):
     """A response to a submission's feedback, judging accuracy or helpfulness."""
......
......@@ -14,12 +14,14 @@ from django.db import DatabaseError
 from django.db.models import Q
 
 from openassessment.assessment.models import (
-    Assessment, InvalidOptionSelection, PeerWorkflow, PeerWorkflowItem,
-    AssessmentFeedback
+    Assessment, AssessmentFeedback, AssessmentPart,
+    InvalidOptionSelection, PeerWorkflow, PeerWorkflowItem,
 )
 from openassessment.assessment.serializers import (
-    AssessmentSerializer, rubric_from_dict, AssessmentFeedbackSerializer,
-    full_assessment_dict)
+    AssessmentSerializer, AssessmentFeedbackSerializer, RubricSerializer,
+    full_assessment_dict, rubric_from_dict, serialize_assessments,
+)
 from submissions import api as sub_api
 from submissions.api import get_submission_and_student
 from submissions.models import Submission, StudentItem
 from submissions.serializers import SubmissionSerializer, StudentItemSerializer
......@@ -78,7 +80,7 @@ def is_complete(submission_uuid, requirements):
        workflow = PeerWorkflow.objects.get(submission_uuid=submission_uuid)
    except PeerWorkflow.DoesNotExist:
        return False
-    return _check_student_done_grading(workflow, requirements["must_grade"])
+    return _num_peers_graded(workflow) >= requirements["must_grade"]
 
 
def get_score(submission_uuid, requirements):
......@@ -182,7 +184,6 @@ def create_assessment(
            "submission_uuid": submission.uuid,
            "score_type": PEER_TYPE,
            "feedback": feedback,
-            "parts": [{"option": option_id} for option_id in option_ids]
        }
 
        if scored_at is not None:
......@@ -192,8 +193,14 @@ def create_assessment(
        if not peer_serializer.is_valid():
            raise PeerAssessmentRequestError(peer_serializer.errors)
 
        assessment = peer_serializer.save()
+
+        # We do this to do an end-run around django-rest-framework serializer
+        # validation, which would otherwise require two DB queries per
+        # option to do validation. We already validated these options above.
+        AssessmentPart.add_to_assessment(assessment, option_ids)
+
        student_item = submission.student_item
        student_item_dict = StudentItemSerializer(student_item).data
......@@ -223,7 +230,7 @@ def create_assessment(
        # Close the active assessment
        _close_active_assessment(scorer_workflow, submission_uuid, assessment)
 
-        return peer_serializer.data
+        return full_assessment_dict(assessment)
    except DatabaseError:
        error_message = _(
            u"An error occurred while creating assessment {} for submission: "
......@@ -250,12 +257,20 @@ def get_rubric_max_scores(submission_uuid):
        the submission, or its associated rubric.
    """
    try:
-        assessments = Assessment.objects.filter(submission_uuid=submission_uuid).order_by("-scored_at", "-id")
-        if assessments:
-            return {
-                criterion.name: criterion.points_possible
-                for criterion in assessments[0].rubric.criteria.all()
-            }
+        assessments = list(
+            Assessment.objects.filter(
+                submission_uuid=submission_uuid
+            ).order_by("-scored_at", "-id").select_related("rubric")[:1]
+        )
+        if not assessments:
+            return None
+
+        assessment = assessments[0]
+        rubric_dict = RubricSerializer.serialized_from_cache(assessment.rubric)
+
+        return {
+            criterion["name"]: criterion["points_possible"]
+            for criterion in rubric_dict["criteria"]
+        }
    except Submission.DoesNotExist:
        return None
    except DatabaseError:
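
Two details above do the heavy lifting: slicing with `[:1]` compiles to `LIMIT 1` in the SQL, and `select_related("rubric")` pulls the rubric in with a JOIN, so wrapping it all in `list()` executes exactly one query. The old version cost one query for the assessment plus a lazy one the first time `assessments[0].rubric` was touched:

    # One query total: SELECT ... FROM assessment JOIN rubric ...
    #                  ORDER BY scored_at DESC, id DESC LIMIT 1
    assessments = list(
        Assessment.objects.filter(submission_uuid=submission_uuid)
        .order_by("-scored_at", "-id")
        .select_related("rubric")[:1]
    )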
......@@ -341,11 +356,11 @@ def has_finished_required_evaluating(student_item_dict, required_assessments):
    """
    workflow = _get_latest_workflow(student_item_dict)
    done = False
-    count = 0
+    peers_graded = 0
    if workflow:
-        done = _check_student_done_grading(workflow, required_assessments)
-        count = workflow.items.all().exclude(assessment=-1).count()
-    return done, count
+        peers_graded = _num_peers_graded(workflow)
+        done = (peers_graded >= required_assessments)
+    return done, peers_graded
 
 
def get_assessments(submission_uuid, scored_only=True, limit=None):
......@@ -398,13 +413,13 @@ def get_assessments(submission_uuid, scored_only=True, limit=None):
        if scored_only:
            assessments = PeerWorkflowItem.get_scored_assessments(
                submission_uuid
-            )
+            )[:limit]
        else:
            assessments = Assessment.objects.filter(
                submission_uuid=submission_uuid,
                score_type=PEER_TYPE
-            )
-        return [full_assessment_dict(assessment) for assessment in assessments[:limit]]
+            )[:limit]
+        return serialize_assessments(assessments)
    except DatabaseError:
        error_message = _(
            u"Error getting assessments for submission {}".format(submission_uuid)
......@@ -486,10 +501,10 @@ def get_submission_to_assess(
        submission_uuid = _get_submission_for_over_grading(workflow)
    if submission_uuid:
        try:
-            submission = Submission.objects.get(uuid=submission_uuid)
+            submission_data = sub_api.get_submission(submission_uuid)
            _create_peer_workflow_item(workflow, submission_uuid)
-            return SubmissionSerializer(submission).data
-        except Submission.DoesNotExist:
+            return submission_data
+        except sub_api.SubmissionDoesNotExist:
            error_message = _(
                u"Could not find a submission with the uuid {} for student {} "
                u"in the peer workflow."
......@@ -890,16 +905,14 @@ def _close_active_assessment(workflow, submission_uuid, assessment):
        raise PeerAssessmentWorkflowError(error_message)
 
 
-def _check_student_done_grading(workflow, must_grade):
-    """Checks if the student has graded enough peers.
-
-    Determines if the student has graded enough peers.
+def _num_peers_graded(workflow):
+    """Returns the number of peers the student owning the workflow has graded.
 
    Args:
        workflow (PeerWorkflow): The workflow associated with the current
            student.
-        must_grade (int): The number of submissions the student has to peer
-            assess before they are finished.
 
    Returns:
-        True if the student is done peer assessing, False if not.
+        int: The number of peers the student has graded.
......@@ -912,10 +925,10 @@ def _check_student_done_grading(workflow, must_grade):
        >>>     student_id="Bob",
        >>> )
        >>> workflow = _get_latest_workflow(student_item_dict)
-        >>> _check_student_done_grading(workflow, 3)
-        True
+        >>> _num_peers_graded(workflow)
+        3
 
    """
-    return workflow.items.all().exclude(assessment=-1).count() >= must_grade
+    return workflow.items.all().exclude(assessment=-1).count()
 
 
def get_assessment_feedback(submission_uuid):
......
"""
Public interface for self-assessment.
"""
from django.core.cache import cache
from django.utils.translation import ugettext as _
from submissions.api import (
get_submission_and_student, get_submission,
SubmissionNotFoundError, SubmissionRequestError
)
from openassessment.assessment.serializers import (
rubric_from_dict, AssessmentSerializer, full_assessment_dict, InvalidRubric
AssessmentSerializer, InvalidRubric, RubricSerializer,
full_assessment_dict, rubric_from_dict, serialize_assessments
)
from openassessment.assessment.models import (
Assessment, AssessmentPart, InvalidOptionSelection
)
from openassessment.assessment.models import Assessment, InvalidOptionSelection
# Assessments are tagged as "self-evaluation"
......@@ -74,7 +78,6 @@ def create_assessment(submission_uuid, user_id, options_selected, rubric_dict, s
        "submission_uuid": submission_uuid,
        "score_type": SELF_TYPE,
        "feedback": u"",
-        "parts": [{"option": option_id} for option_id in option_ids],
    }
 
    if scored_at is not None:
......@@ -86,10 +89,15 @@ def create_assessment(submission_uuid, user_id, options_selected, rubric_dict, s
        msg = _("Could not create self assessment: {errors}").format(errors=serializer.errors)
        raise SelfAssessmentRequestError(msg)
 
-    serializer.save()
+    assessment = serializer.save()
+
+    # We do this to do an end-run around django-rest-framework serializer
+    # validation, which would otherwise require two DB queries per
+    # option to do validation. We already validated these options above.
+    AssessmentPart.add_to_assessment(assessment, option_ids)
 
    # Return the serialized assessment
-    return serializer.data
+    return full_assessment_dict(assessment)
 
 
def get_assessment(submission_uuid):
......@@ -112,14 +120,11 @@ def get_assessment(submission_uuid):
    # but not at the database level. Someone could take advantage of the race condition
    # between checking the number of self-assessments and creating a new self-assessment.
    # To be safe, we retrieve just the most recent submission.
-    assessments = Assessment.objects.filter(
+    serialized_assessments = serialize_assessments(Assessment.objects.filter(
        score_type=SELF_TYPE, submission_uuid=submission_uuid
-    ).order_by('-scored_at')
+    ).order_by('-scored_at')[:1])
 
-    if assessments.exists():
-        assessment_dict = full_assessment_dict(assessments[0])
-        return assessment_dict
-    return None
+    return serialized_assessments[0] if serialized_assessments else None
 
 
def is_complete(submission_uuid):
......
......@@ -4,7 +4,9 @@ Serializers are created to ensure models do not have to be accessed outside the
 scope of the Tim APIs.
 """
 from copy import deepcopy
+import logging
 
+from django.core.cache import cache
 from django.utils.translation import ugettext as _
 from rest_framework import serializers
 from openassessment.assessment.models import (
......@@ -12,6 +14,9 @@ from openassessment.assessment.models import (
     PeerWorkflowItem, PeerWorkflow)
 
+
+logger = logging.getLogger(__name__)
+
 
 class InvalidRubric(Exception):
     """This can be raised during the deserialization process."""
     def __init__(self, errors):
......@@ -66,10 +71,11 @@ class CriterionOptionSerializer(NestedModelSerializer):
 class CriterionSerializer(NestedModelSerializer):
     """Serializer for :class:`Criterion`"""
     options = CriterionOptionSerializer(required=True, many=True)
+    points_possible = serializers.Field(source='points_possible')
 
     class Meta:
         model = Criterion
-        fields = ('order_num', 'name', 'prompt', 'options')
+        fields = ('order_num', 'name', 'prompt', 'options', 'points_possible')
 
     def validate_options(self, attrs, source):
         """Make sure we have at least one CriterionOption in a Criterion."""
......@@ -97,6 +103,49 @@ class RubricSerializer(NestedModelSerializer):
             raise serializers.ValidationError("Must have at least one criterion")
         return attrs
 
+    @classmethod
+    def serialized_from_cache(cls, rubric, local_cache=None):
+        """For a given `Rubric` model object, return a serialized version.
+
+        This method will attempt to use the cache if possible, first looking at
+        the `local_cache` dict you can pass in, and then looking at whatever
+        Django cache is configured.
+
+        Args:
+            rubric (Rubric): The Rubric model to get the serialized form of.
+            local_cache (dict): Mapping of `rubric.content_hash` to serialized
+                rubric dictionary. We include this so that we can call this
+                method in a loop.
+
+        Returns:
+            dict: `Rubric` fields as a dictionary, with `criteria` and `options`
+                relations followed.
+        """
+        # Optional local cache you can send in (for when you're calling this
+        # in a loop).
+        local_cache = local_cache or {}
+
+        # Check our in-memory cache...
+        if rubric.content_hash in local_cache:
+            return local_cache[rubric.content_hash]
+
+        # Check the external cache (e.g. memcached)
+        rubric_dict_cache_key = (
+            "RubricSerializer.serialized_from_cache.{}"
+            .format(rubric.content_hash)
+        )
+        rubric_dict = cache.get(rubric_dict_cache_key)
+        if rubric_dict:
+            local_cache[rubric.content_hash] = rubric_dict
+            return rubric_dict
+
+        # Grab it from the database
+        rubric_dict = RubricSerializer(rubric).data
+        cache.set(rubric_dict_cache_key, rubric_dict)
+        local_cache[rubric.content_hash] = rubric_dict
+
+        return rubric_dict
+
 
 class AssessmentPartSerializer(serializers.ModelSerializer):
     """Serializer for :class:`AssessmentPart`."""
......@@ -107,11 +156,7 @@ class AssessmentPartSerializer(serializers.ModelSerializer):
 
 class AssessmentSerializer(serializers.ModelSerializer):
-    """Serializer for :class:`Assessment`."""
-    parts = AssessmentPartSerializer(required=True, many=True)
-
-    points_earned = serializers.Field(source='points_earned')
-    points_possible = serializers.Field(source='points_possible')
+    """Simplified serializer for :class:`Assessment` that's lighter on the DB."""
 
     class Meta:
         model = Assessment
......@@ -122,20 +167,32 @@ class AssessmentSerializer(serializers.ModelSerializer):
             'scorer_id',
             'score_type',
             'feedback',
-
-            # Foreign Key
-            'parts',
-
-            # Computed, not part of the model
-            'points_earned',
-            'points_possible',
         )
 
 
+def serialize_assessments(assessments_qset):
+    assessments = list(assessments_qset.select_related("rubric"))
+    rubric_cache = {}
+
+    return [
+        full_assessment_dict(
+            assessment,
+            RubricSerializer.serialized_from_cache(
+                assessment.rubric, rubric_cache
+            )
+        )
+        for assessment in assessments
+    ]
 
 
-def full_assessment_dict(assessment):
+def full_assessment_dict(assessment, rubric_dict=None):
     """
-    Return a dict representation of the Assessment model,
-    including nested assessment parts.
+    Return a dict representation of the Assessment model, including nested
+    assessment parts. We do some of the serialization ourselves here instead
+    of relying on the Django REST Framework serializers. This is for performance
+    reasons -- we have a cached rubric easily available, and we don't want to
+    follow all the DB relations from assessment -> assessment part -> option ->
+    criterion.
 
     Args:
         assessment (Assessment): The Assessment model to serialize
......@@ -143,18 +200,45 @@ def full_assessment_dict(assessment):
     Returns:
         dict with keys 'rubric' (serialized Rubric model) and 'parts' (serialized assessment parts)
     """
+    assessment_cache_key = "assessment.full_assessment_dict.{}.{}.{}".format(
+        assessment.id, assessment.submission_uuid, assessment.scored_at.isoformat()
+    )
+    assessment_dict = cache.get(assessment_cache_key)
+    if assessment_dict:
+        return assessment_dict
+
     assessment_dict = AssessmentSerializer(assessment).data
-    rubric_dict = RubricSerializer(assessment.rubric).data
+    if not rubric_dict:
+        rubric_dict = RubricSerializer.serialized_from_cache(assessment.rubric)
     assessment_dict["rubric"] = rubric_dict
 
+    # This part looks a little goofy, but it's in the name of saving dozens of
+    # SQL lookups. The rubric_dict has the entire serialized output of the
+    # `Rubric`, its child `Criterion` and grandchild `CriterionOption`. This
+    # includes calculated things like `points_possible` which aren't actually in
+    # the DB model. Instead of invoking the serializers for `Criterion` and
+    # `CriterionOption` again, we simply index into the places we expect them to
+    # be from the big, saved `Rubric` serialization.
     parts = []
-    for part in assessment.parts.all():
-        part_dict = AssessmentPartSerializer(part).data
-        options_dict = CriterionOptionSerializer(part.option).data
-        criterion_dict = CriterionSerializer(part.option.criterion).data
+    for part in assessment.parts.all().select_related("option__criterion"):
+        criterion_dict = rubric_dict["criteria"][part.option.criterion.order_num]
+        options_dict = criterion_dict["options"][part.option.order_num]
         options_dict["criterion"] = criterion_dict
-        part_dict["option"] = options_dict
-        parts.append(part_dict)
+        parts.append({
+            "option": options_dict
+        })
 
+    # Now manually build up the dynamically calculated values on the
+    # `Assessment` so we can again avoid DB calls.
     assessment_dict["parts"] = parts
+    assessment_dict["points_earned"] = sum(
+        part_dict["option"]["points"] for part_dict in parts
+    )
+    assessment_dict["points_possible"] = rubric_dict["points_possible"]
+
+    cache.set(assessment_cache_key, assessment_dict)
+
     return assessment_dict
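
The indexing trick assumes the serializers emit `criteria` and `options` sorted by `order_num` (presumably enforced by the models' default ordering), so a part can find its serialized option without any SQL. Assuming a serialized rubric shaped roughly like this (illustrative data, not from the repo):

    rubric_dict = {
        "points_possible": 10,
        "criteria": [
            {"name": "clarity",  "points_possible": 5,   # order_num == 0
             "options": [{"name": "poor", "points": 0},  # order_num == 0
                         {"name": "good", "points": 5}]},
            {"name": "accuracy", "points_possible": 5,   # order_num == 1
             "options": [{"name": "poor", "points": 0},
                         {"name": "good", "points": 5}]},
        ],
    }

    # A part pointing at criterion 1, option 0 resolves in pure Python:
    option_dict = rubric_dict["criteria"][1]["options"][0]
    assert option_dict["points"] == 0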
......
......@@ -381,14 +381,11 @@ def get_scores(course_id, student_id):
 
 def get_latest_score_for_submission(submission_uuid):
     try:
-        submission = Submission.objects.get(uuid=submission_uuid)
-        score = Score.objects.filter(submission=submission).order_by("-id")[0]
+        score = Score.objects.filter(
+            submission__uuid=submission_uuid
+        ).order_by("-id").select_related("submission")[0]
     except IndexError:
         return None
-    except Submission.DoesNotExist:
-        raise SubmissionNotFoundError(
-            u"No submission matching uuid {}".format(submission_uuid)
-        )
 
     return ScoreSerializer(score).data
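
Besides saving a query, this changes the failure mode: a nonexistent submission no longer raises `SubmissionNotFoundError`; it simply matches no `Score` rows, so the `IndexError` branch returns `None`. Callers that depended on the exception now need to treat `None` as "not found":

    # Hypothetical caller: the uuid does not exist, or exists with no score yet.
    latest = get_latest_score_for_submission("no-such-uuid")
    if latest is None:
        pass  # both cases now look the same to the caller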
......
......@@ -18,3 +18,6 @@ sphinxcontrib-napoleon==0.2.3
 
 # runserver_plus
 Werkzeug==0.9.4
+
+# caching
+python-memcached==1.53
......@@ -21,3 +21,11 @@ MIDDLEWARE_CLASSES += (
 DEBUG_TOOLBAR_PATCH_SETTINGS = False
 INTERNAL_IPS = ('127.0.0.1',)
 
+CACHES = {
+    'default': {
+        'BACKEND': 'django.core.cache.backends.memcached.MemcachedCache',
+        'LOCATION': '127.0.0.1:11211',
+        'TIMEOUT': 60 * 60 * 8
+    }
+}
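
With this dev-settings block, a memcached daemon on the default local port backs every `cache.get`/`cache.set` call introduced above, with entries expiring after eight hours (`60 * 60 * 8` seconds); `python-memcached` from the requirements change is the client library that Django's `MemcachedCache` backend drives. A quick wiring check from `./manage.py shell` (the key name is arbitrary):

    >>> from django.core.cache import cache
    >>> cache.set("smoke-test", 42)
    >>> cache.get("smoke-test")
    42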