Refactor get_all_submissions implementation and add a test.

b32d34f7 · Sven Marnach · 29a96a4b · b32d34f7 · b32d34f7
Commit b32d34f7 authored May 16, 2015 by Sven Marnach
Hide whitespace changes
Inline Side-by-side

Showing with 56 additions and 35 deletions

submissions/api.py
+34 -35

submissions/tests/test_api.py
+22 -0

No files found.
--- a/submissions/api.py
+++ b/submissions/api.py
@@ -3,10 +3,11 @@ Public interface for the submissions app.

 """
 import copy
+import itertools
 import logging
+import operator
 import json

-from collections import namedtuple
 from django.conf import settings
 from django.core.cache import cache
 from django.db import IntegrityError, DatabaseError
@@ -90,15 +91,6 @@ class SubmissionRequestError(SubmissionError):
        )


-# For API stability and low memory usage, large result sets are returned using namedtuples.
-# They take up only as much memory as tuples, but fields are accessible by name,
-# so we can change the fields in the future if needed
-LatestSubmission = namedtuple(
-    "LatestSubmission",
-    ("student_id", "attempt_number", "submitted_at", "created_at", "answer"),
-)
-
-
 def create_submission(student_item_dict, answer, submitted_at=None, attempt_number=None):
    """Creates a submission for assessment.

@@ -384,37 +376,44 @@ def get_submissions(student_item_dict, limit=None):


 def get_all_submissions(course_id, item_id, item_type, read_replica=True):
-    """
-    For the given item, get the most recent submission for every student who has submitted.
+    """For the given item, get the most recent submission for every student who has submitted.

    This may return a very large result set! It is implemented as a generator for efficiency.
-    It yields namedtuples with the following fields:
-        student_id
-        attempt_number
-        submitted_at
-        created_at
-        answer
-
-    Cannot fail unless there's a database error, but may return an empty list.
+
+    Args:
+        course_id, item_id, item_type (string): The values of the respective student_item fields
+            to filter the submissions by.
+        read_replica (bool): If true, attempt to use the read replica database.
+            If no read replica is available, use the default database.
+
+    Yields:
+        Dicts representing the submissions with the following fields:
+            student_item
+            student_id
+            attempt_number
+            submitted_at
+            created_at
+            answer
+
+    Raises:
+        Cannot fail unless there's a database error, but may return an empty iterable.
    """
-    query = Submission.objects.filter(
+    submission_qs = Submission.objects
+    if read_replica:
+        submission_qs = _use_read_replica(submission_qs)
+    query = submission_qs.select_related('student_item').filter(
        student_item__course_id=course_id,
        student_item__item_id=item_id,
        student_item__item_type=item_type,
-    ).values_list(
-        'student_item__student_id', 'attempt_number', 'submitted_at', 'created_at', 'raw_answer'
-    ).order_by('student_item__student_id', '-created_at')
-
-    last_student_id = None
-    for row in query.all():
-        # We cannot use SELECT DISTINCT ON because it's PostgreSQL only, so unfortunately
-        # our results may contain every entry of each student, not just the most recent.
-        if row[0] == last_student_id:
-            continue  # Skip this row; it's an old submission for a student we've already included
-        last_student_id = row[0]
-        # Parse 'raw_answer' as JSON:
-        answer = json.loads(row[4])
-        yield LatestSubmission(row[0], row[1], row[2], row[3], answer)
+    ).order_by('student_item__student_id', '-created_at').iterator()
+
+    # We cannot use SELECT DISTINCT ON because it's PostgreSQL only, so unfortunately
+    # our results will contain every entry of each student, not just the most recent.
+    for unused_student_id, row_iter in itertools.groupby(query, operator.attrgetter('student_item.student_id')):
+        submission = next(row_iter)
+        data = SubmissionSerializer(submission).data
+        data['student_id'] = submission.student_item.student_id
+        yield data


 def get_top_submissions(course_id, item_id, item_type, number_of_top_scores, use_cache=True, read_replica=True):

--- a/submissions/tests/test_api.py
+++ b/submissions/tests/test_api.py
@@ -74,6 +74,28 @@ class TestSubmissionsApi(TestCase):
        self._assert_submission(submissions[1], ANSWER_ONE, student_item.pk, 1)
        self._assert_submission(submissions[0], ANSWER_TWO, student_item.pk, 2)

+    def test_get_all_submissions(self):
+        api.create_submission(SECOND_STUDENT_ITEM, ANSWER_TWO)
+        api.create_submission(STUDENT_ITEM, ANSWER_ONE)
+        api.create_submission(STUDENT_ITEM, ANSWER_TWO)
+        api.create_submission(SECOND_STUDENT_ITEM, ANSWER_ONE)
+        with self.assertNumQueries(1):
+            submissions = list(api.get_all_submissions(
+                STUDENT_ITEM['course_id'],
+                STUDENT_ITEM['item_id'],
+                STUDENT_ITEM['item_type'],
+                read_replica=False,
+            ))
+
+        student_item = self._get_student_item(STUDENT_ITEM)
+        second_student_item = self._get_student_item(SECOND_STUDENT_ITEM)
+        # The result is assumed to be sorted by student_id, which is not part of the specification
+        # of get_all_submissions(), but it is what it currently does.
+        self._assert_submission(submissions[0], ANSWER_ONE, second_student_item.pk, 2)
+        self.assertEqual(submissions[0]['student_id'], SECOND_STUDENT_ITEM['student_id'])
+        self._assert_submission(submissions[1], ANSWER_TWO, student_item.pk, 2)
+        self.assertEqual(submissions[1]['student_id'], STUDENT_ITEM['student_id'])
+
    def test_get_submission(self):
        # Test base case that we can create a submission and get it back
        sub_dict1 = api.create_submission(STUDENT_ITEM, ANSWER_ONE)