Commit ef070668 by Clinton Blackburn

Optimized Courses Problems Endpoint

Using raw SQL to retrieve and aggregate the data. This significantly decreases response time. A `created` field was also added so that users know when the data was last updated.
parent 6b88b6c4
......@@ -82,3 +82,13 @@ def consolidate_answers(problem):
consolidated_answers.append(consolidated_answer)
return consolidated_answers
def dictfetchall(cursor):
    """Return all rows from a database cursor as a list of dicts.

    Each row is mapped to a dict keyed by the column names taken from
    ``cursor.description``. Returns an empty list if the cursor yields
    no rows.
    """
    # Build the column-name list once instead of per row: cursor.description
    # is loop-invariant, so rebuilding it inside the comprehension was
    # redundant O(columns) work for every fetched row.
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]
......@@ -48,6 +48,7 @@ class ProblemSerializer(serializers.Serializer):
total_submissions = serializers.IntegerField(default=0)
correct_submissions = serializers.IntegerField(default=0)
part_ids = serializers.CharField()
created = serializers.DateTimeField(format=settings.DATETIME_FORMAT)
class ProblemResponseAnswerDistributionSerializer(ModelSerializerWithCreatedField):
......
......@@ -600,29 +600,34 @@ class CourseProblemsListViewTests(DemoCourseMixin, TestCaseWithAuthentication):
# This data should never be returned by the tests below because the course_id doesn't match.
G(models.ProblemResponseAnswerDistribution)
# This test assumes the view is using Python's groupby for grouping. Create multiple objects here to test the
# grouping. Add a model with a different module_id to break up the natural order and ensure the view properly
# sorts the objects before grouping.
# Create multiple objects here to test the grouping. Add a model with a different module_id to break up the
# natural order and ensure the view properly sorts the objects before grouping.
module_id = 'i4x://test/problem/1'
alt_module_id = 'i4x://test/problem/2'
created = datetime.datetime.utcnow()
alt_created = created + datetime.timedelta(seconds=2)
o1 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=module_id, correct=True,
count=100)
count=100, created=created)
o2 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=alt_module_id,
correct=True, count=100)
correct=True, count=100, created=created)
o3 = G(models.ProblemResponseAnswerDistribution, course_id=self.course_id, module_id=module_id, correct=False,
count=200)
count=200, created=alt_created)
expected = [
{
'module_id': module_id,
'total_submissions': 300,
'correct_submissions': 100,
'part_ids': [o1.part_id, o3.part_id]
'part_ids': [o1.part_id, o3.part_id],
'created': alt_created.strftime(settings.DATETIME_FORMAT)
},
{
'module_id': alt_module_id,
'total_submissions': 100,
'correct_submissions': 100,
'part_ids': [o2.part_id]
'part_ids': [o2.part_id],
'created': created.strftime(settings.DATETIME_FORMAT)
}
]
......
......@@ -4,6 +4,7 @@ import warnings
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db import connections
from django.db.models import Max
from django.http import Http404
from django.utils.timezone import make_aware, utc
......@@ -11,6 +12,7 @@ from rest_framework import generics
from opaque_keys.edx.keys import CourseKey
from analytics_data_api.constants import enrollment_modes
from analytics_data_api.utils import dictfetchall
from analytics_data_api.v0 import models, serializers
......@@ -610,6 +612,7 @@ class CourseEnrollmentByLocationView(BaseCourseEnrollmentView):
return returned_items
# pylint: disable=abstract-method
class ProblemsListView(BaseCourseView):
"""
Get the problems.
......@@ -627,36 +630,36 @@ class ProblemsListView(BaseCourseView):
* correct_submissions: Total number of *correct* submissions.
* part_ids: List of problem part IDs
"""
model = models.ProblemResponseAnswerDistribution
serializer_class = serializers.ProblemSerializer
def apply_date_filtering(self, queryset):
# Date filtering is not possible for this data.
return queryset
allow_empty = False
def get_queryset(self):
queryset = super(ProblemsListView, self).get_queryset()
queryset = queryset.order_by('module_id', 'part_id')
sql = """
SELECT
module_id,
SUM(count) AS total_submissions,
SUM(CASE WHEN correct=1 THEN count ELSE 0 END) AS correct_submissions,
GROUP_CONCAT(DISTINCT part_id) AS part_ids,
MAX(created) AS created
FROM answer_distribution
WHERE course_id = %s
GROUP BY module_id;
"""
with connections[settings.ANALYTICS_DATABASE].cursor() as cursor:
cursor.execute(sql, [self.course_id])
rows = dictfetchall(cursor)
data = []
for row in rows:
# Convert the comma-separated list into an array of strings.
row['part_ids'] = row['part_ids'].split(',')
for problem_id, distribution in groupby(queryset, lambda x: x.module_id):
total = 0
correct = 0
part_ids = set() # Use a set to remove duplicate values.
for answer in distribution:
part_ids.add(answer.part_id)
count = answer.count
total += count
if answer.correct:
correct += count
data.append({
'module_id': problem_id,
'total_submissions': total,
'correct_submissions': correct,
'part_ids': sorted(part_ids)
})
return data
# Convert the aggregated decimal fields to integers
row['total_submissions'] = int(row['total_submissions'])
row['correct_submissions'] = int(row['correct_submissions'])
# Rather than write custom SQL for the SQLite backend, simply parse the timestamp.
created = row['created']
if not isinstance(created, datetime.datetime):
row['created'] = datetime.datetime.strptime(created, '%Y-%m-%d %H:%M:%S.%f')
return rows
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment