Commit ef070668 by Clinton Blackburn

Optimized Courses Problems Endpoint

Using raw SQL to retrieve and aggregate the data. This significantly decreases response time. A `created` field was also added so that users know when the data was last updated.
parent 6b88b6c4
......@@ -82,3 +82,13 @@ def consolidate_answers(problem):
consolidated_answers.append(consolidated_answer)
return consolidated_answers
def dictfetchall(cursor):
    """Return all rows from a database cursor as a list of dicts.

    Each row is mapped to a dict keyed by the column names taken from
    ``cursor.description``. Returns an empty list if the cursor yields
    no rows.
    """
    # Build the column-name list once instead of per row: cursor.description
    # is loop-invariant, so rebuilding it inside the comprehension was
    # redundant O(columns) work for every fetched row.
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]
......@@ -48,6 +48,7 @@ class ProblemSerializer(serializers.Serializer):
total_submissions = serializers.IntegerField(default=0)
correct_submissions = serializers.IntegerField(default=0)
part_ids = serializers.CharField()
created = serializers.DateTimeField(format=settings.DATETIME_FORMAT)
class ProblemResponseAnswerDistributionSerializer(ModelSerializerWithCreatedField):
......
......@@ -600,29 +600,34 @@ class CourseProblemsListViewTests(DemoCourseMixin, TestCaseWithAuthentication):
# This data should never be returned by the tests below because the course_id doesn't match.
G(models.ProblemResponseAnswerDistribution)
# This test assumes the view is using Python's groupby for grouping. Create multiple objects here to test the
# grouping. Add a model with a different module_id to break up the natural order and ensure the view properly
# sorts the objects before grouping.
# Create multiple objects here to test the grouping. Add a model with a different module_id to break up the
# natural order and ensure the view properly sorts the objects before grouping.
module_id = 'i4x://test/problem/1'
alt_module_id = 'i4x://test/problem/2'
created = datetime.datetime.utcnow()
alt_created = created + datetime.timedelta(seconds=2)
o1 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=module_id, correct=True,
count=100)
count=100, created=created)
o2 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=alt_module_id,
correct=True, count=100)
correct=True, count=100, created=created)
o3 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=module_id, correct=False,
count=200)
count=200, created=alt_created)
expected = [
{
'module_id': module_id,
'total_submissions': 300,
'correct_submissions': 100,
'part_ids': [o1.part_id, o3.part_id]
'part_ids': [o1.part_id, o3.part_id],
'created': alt_created.strftime(settings.DATETIME_FORMAT)
},
{
'module_id': alt_module_id,
'total_submissions': 100,
'correct_submissions': 100,
'part_ids': [o2.part_id]
'part_ids': [o2.part_id],
'created': created.strftime(settings.DATETIME_FORMAT)
}
]
......
......@@ -4,6 +4,7 @@ import warnings
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db import connections
from django.db.models import Max
from django.http import Http404
from django.utils.timezone import make_aware, utc
......@@ -11,6 +12,7 @@ from rest_framework import generics
from opaque_keys.edx.keys import CourseKey
from analytics_data_api.constants import enrollment_modes
from analytics_data_api.utils import dictfetchall
from analytics_data_api.v0 import models, serializers
......@@ -610,6 +612,7 @@ class CourseEnrollmentByLocationView(BaseCourseEnrollmentView):
return returned_items
# pylint: disable=abstract-method
class ProblemsListView(BaseCourseView):
"""
Get the problems.
......@@ -627,36 +630,36 @@ class ProblemsListView(BaseCourseView):
* correct_submissions: Total number of *correct* submissions.
* part_ids: List of problem part IDs
"""
model = models.ProblemResponseAnswerDistribution
serializer_class = serializers.ProblemSerializer
def apply_date_filtering(self, queryset):
# Date filtering is not possible for this data.
return queryset
allow_empty = False
def get_queryset(self):
queryset = super(ProblemsListView, self).get_queryset()
queryset = queryset.order_by('module_id', 'part_id')
sql = """
SELECT
module_id,
SUM(count) AS total_submissions,
SUM(CASE WHEN correct=1 THEN count ELSE 0 END) AS correct_submissions,
GROUP_CONCAT(DISTINCT part_id) AS part_ids,
MAX(created) AS created
FROM answer_distribution
WHERE course_id = %s
GROUP BY module_id;
"""
with connections[settings.ANALYTICS_DATABASE].cursor() as cursor:
cursor.execute(sql, [self.course_id])
rows = dictfetchall(cursor)
data = []
for row in rows:
# Convert the comma-separated list into an array of strings.
row['part_ids'] = row['part_ids'].split(',')
for problem_id, distribution in groupby(queryset, lambda x: x.module_id):
total = 0
correct = 0
part_ids = set() # Use a set to remove duplicate values.
for answer in distribution:
part_ids.add(answer.part_id)
count = answer.count
total += count
if answer.correct:
correct += count
data.append({
'module_id': problem_id,
'total_submissions': total,
'correct_submissions': correct,
'part_ids': sorted(part_ids)
})
return data
# Convert the aggregated decimal fields to integers
row['total_submissions'] = int(row['total_submissions'])
row['correct_submissions'] = int(row['correct_submissions'])
# Rather than write custom SQL for the SQLite backend, simply parse the timestamp.
created = row['created']
if not isinstance(created, datetime.datetime):
row['created'] = datetime.datetime.strptime(created, '%Y-%m-%d %H:%M:%S.%f')
return rows
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment