Commit 0565fbbf by David Ormsbee

Fix (re-implement) answer distribution report generation.

This restores functionality that has been broken since the introduction of
XModuleDescriptor/XModule proxying (part of the XBlock transition). It generates
a CSV of all answers for all content of type "problem" in a given course, with a
row per (problem part, answer). The format is:

url_name, display name, answer id, answer, count

Example values:
  url_name = "7f1b1523a55848cd9f5c93eb8cbabcf7"
  display name = "Problem 1: Something Hard"
  answer id = i4x-JediAcdmy-LTSB304-problem-7f1b1523a55848cd9f5c93eb8cbabcf7_2_1
  answer = "Use the Force"
  count = 1138
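
For illustration, here is roughly how one entry of the distribution dict
expands into those CSV rows (a sketch, not code from this commit; the `dist`
literal and writer setup are hypothetical, but the row-building mirrors the
legacy dashboard view in the diff below):

    import csv
    import StringIO

    dist = {
        ("7f1b1523a55848cd9f5c93eb8cbabcf7",
         "Problem 1: Something Hard",
         "i4x-JediAcdmy-LTSB304-problem-7f1b1523a55848cd9f5c93eb8cbabcf7_2_1"): {
            u"Use the Force": 1138,
        },
    }

    buf = StringIO.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["url_name", "display name", "answer id", "answer", "count"])
    for (url_name, display_name, answer_id), answers in sorted(dist.items()):
        for answer, count in answers.items():
            # Python 2's csv module wants byte strings, so encode unicode
            # values and pass already-encoded str values through untouched.
            writer.writerow([
                s if isinstance(s, str) else unicode(s).encode("utf-8")
                for s in (url_name, display_name, answer_id, answer, count)
            ])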

Since it only grabs things of type "problem", it will not return results for
things like self/peer-assessments. Any Loncapa problem type will show up
(multiple choice, text input, numeric, etc.).

Instead of crawling the course tree and instantiating the appropriate CapaModule
objects to grab state, this version draws directly from StudentModule. This lets
us skip a lot of processing and most importantly lets us generate the answer
distribution without causing side-effects (since XBlocks auto-save state). It
also lets us take advantage of a read-replica database if one is available, to
minimize locking concerns.
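
The query underneath is small. The following sketch mirrors the classmethod
this commit adds to StudentModule (see the models.py hunk below); the
standalone function name here is illustrative:

    from django.conf import settings

    def submitted_problems(course_id):
        # module_type='problem' plus a non-null grade means "actually
        # submitted" -- empty state rows created by viewing the progress
        # page have no grade and are filtered out here.
        queryset = StudentModule.objects.filter(
            course_id=course_id,
            module_type='problem',
            grade__isnull=False,
        )
        # Prefer a read replica when one is configured, so report
        # generation never contends with writes on the primary.
        if "read_replica" in settings.DATABASES:
            return queryset.using("read_replica")
        return queryset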

There are minor changes to the legacy dashboard around CSV charset encoding
and a change to OptionResponseXMLFactory to make it more unicode-friendly.
Answer distribution output is now also sorted, to group together answers for
the same content piece.
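
The charset issue, for the curious: in Python 2, calling unicode() on a byte
string that is already UTF-8 encoded implicitly decodes it with the ASCII
codec and raises UnicodeDecodeError. A minimal reproduction (not from the
commit):

    # -*- coding: utf-8 -*-
    s = u'ⓤⓝⓘⓒⓞⓓⓔ'.encode('utf-8')  # already UTF-8 bytes

    try:
        unicode(s).encode('utf-8')      # implicit ASCII decode -> boom
    except UnicodeDecodeError:
        pass

    # The dashboard fix: pass byte strings through untouched and only
    # convert genuinely non-str values (numbers, unicode, None).
    encoded = s if isinstance(s, str) else unicode(s).encode('utf-8')
    assert encoded == s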

Note that this does not introduce celery into the process. Answer distributions
are still only available for small courses.

This was originally created to fix [LMS-922], but it also addresses [LMS-811] and
possibly other areas in the legacy dashboard where CSV downloads break due to
character encoding issues.
parent 608aaf31
@@ -5,6 +5,8 @@ These are notable changes in edx-platform. This is a rolling list of changes,
 in roughly chronological order, most recent first. Add your entries at or near
 the top. Include a label indicating the component affected.
 
+LMS: Fix answer distribution download for small courses. LMS-922, LMS-811
+
 Blades: Add template for the zooming image in studio. BLD-206.
 Blades: Update behavior of start/end time fields. BLD-506.
...
@@ -640,8 +640,8 @@ class OptionResponseXMLFactory(ResponseXMLFactory):
         # Set the "options" attribute
         # Format: "('first', 'second', 'third')"
-        options_attr_string = ",".join(["'%s'" % str(o) for o in options_list])
-        options_attr_string = "(%s)" % options_attr_string
+        options_attr_string = u",".join([u"'{}'".format(o) for o in options_list])
+        options_attr_string = u"({})".format(options_attr_string)
         optioninput_element.set('options', options_attr_string)
 
         # Set the "correct" attribute
...
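
Why that factory change matters: "'%s'" % str(o) implicitly encodes each
option with the ASCII codec, so any non-ASCII option raises
UnicodeEncodeError, while the u"'{}'".format(o) version stays in unicode end
to end. A standalone check (not part of the test suite):

    # -*- coding: utf-8 -*-
    options_list = ['Correct', 'Incorrect', u'ⓤⓝⓘⓒⓞⓓⓔ']

    try:
        ",".join(["'%s'" % str(o) for o in options_list])  # old code path
    except UnicodeEncodeError:
        pass  # str() can't ASCII-encode the circled letters

    # New code path: unicode all the way through.
    options_attr_string = u"({})".format(
        u",".join(u"'{}'".format(o) for o in options_list)
    )
    assert options_attr_string == u"('Correct','Incorrect','ⓤⓝⓘⓒⓞⓓⓔ')"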
 # Compute grades using real division, with no integer truncation
 from __future__ import division
 from collections import defaultdict
+import json
 import random
 import logging
@@ -17,24 +18,15 @@ from courseware import courses
 from courseware.model_data import FieldDataCache
 from xblock.fields import Scope
 from xmodule import graders
-from xmodule.capa_module import CapaModule
 from xmodule.graders import Score
+from xmodule.modulestore.django import modulestore
+from xmodule.modulestore.exceptions import ItemNotFoundError
 from .models import StudentModule
 from .module_render import get_module, get_module_for_descriptor
 
 log = logging.getLogger("edx.courseware")
 
 
-def yield_module_descendents(module):
-    stack = module.get_display_items()
-    stack.reverse()
-
-    while len(stack) > 0:
-        next_module = stack.pop()
-        stack.extend(next_module.get_display_items())
-        yield next_module
-
-
 def yield_dynamic_descriptor_descendents(descriptor, module_creator):
     """
     This returns all of the descendants of a descriptor. If the descriptor
@@ -58,74 +50,101 @@ def yield_dynamic_descriptor_descendents(descriptor, module_creator):
         yield next_descriptor
 
 
-def yield_problems(request, course, student):
-    """
-    Return an iterator over capa_modules that this student has
-    potentially answered. (all that student has answered will definitely be in
-    the list, but there may be others as well).
-    """
-    grading_context = course.grading_context
-
-    descriptor_locations = (descriptor.location.url() for descriptor in grading_context['all_descriptors'])
-    existing_student_modules = set(StudentModule.objects.filter(
-        module_state_key__in=descriptor_locations
-    ).values_list('module_state_key', flat=True))
-
-    sections_to_list = []
-    for _, sections in grading_context['graded_sections'].iteritems():
-        for section in sections:
-
-            section_descriptor = section['section_descriptor']
-
-            # If the student hasn't seen a single problem in the section, skip it.
-            for moduledescriptor in section['xmoduledescriptors']:
-                if moduledescriptor.location.url() in existing_student_modules:
-                    sections_to_list.append(section_descriptor)
-                    break
-
-    field_data_cache = FieldDataCache(sections_to_list, course.id, student)
-    for section_descriptor in sections_to_list:
-        section_module = get_module(student, request,
-                                    section_descriptor.location, field_data_cache,
-                                    course.id)
-        if section_module is None:
-            # student doesn't have access to this module, or something else
-            # went wrong.
-            # log.debug("couldn't get module for student {0} for section location {1}"
-            #           .format(student.username, section_descriptor.location))
-            continue
-
-        for problem in yield_module_descendents(section_module):
-            if isinstance(problem, CapaModule):
-                yield problem
-
-
-def answer_distributions(request, course):
-    """
-    Given a course_descriptor, compute frequencies of answers for each problem:
-    Format is:
-    dict: (problem url_name, problem display_name, problem_id) -> (dict : answer -> count)
-
-    TODO (vshnayder): this is currently doing a full linear pass through all
-    students and all problems. This will be just a little slow.
-    """
-    counts = defaultdict(lambda: defaultdict(int))
-
-    enrolled_students = User.objects.filter(courseenrollment__course_id=course.id)
-
-    for student in enrolled_students:
-        for capa_module in yield_problems(request, course, student):
-            for problem_id in capa_module.lcp.student_answers:
-                # Answer can be a list or some other unhashable element. Convert to string.
-                answer = str(capa_module.lcp.student_answers[problem_id])
-                key = (capa_module.url_name, capa_module.display_name_with_default, problem_id)
-                counts[key][answer] += 1
-
-    return counts
+def answer_distributions(course_id):
+    """
+    Given a course_id, return answer distributions in the form of a dictionary
+    mapping:
+
+        (problem url_name, problem display_name, problem_id) -> {dict: answer -> count}
+
+    Answer distributions are found by iterating through all StudentModule
+    entries for a given course with type="problem" and a grade that is not null.
+    This means that we only count LoncapaProblems that people have submitted.
+    Other types of items like ORA or sequences will not be collected. Empty
+    Loncapa problem state that gets created from running the progress page is
+    also not counted.
+
+    This method accesses the StudentModule table directly instead of using the
+    CapaModule abstraction. The main reason for this is so that we can generate
+    the report without any side-effects -- we don't have to worry about answer
+    distribution potentially causing re-evaluation of the student answer. This
+    also allows us to use the read-replica database, which reduces risk of bad
+    locking behavior. And quite frankly, it makes this a lot less confusing.
+
+    Also, we're pulling all available records from the database for this course
+    rather than crawling through a student's course-tree -- the latter could
+    potentially cause us trouble with A/B testing. The distribution report may
+    not be aware of problems that are not visible to the user being used to
+    generate the report.
+
+    This method will try to use a read-replica database if one is available.
+    """
+    # dict: { module.module_state_key : (url_name, display_name) }
+    state_keys_to_problem_info = {}  # For caching, used by url_and_display_name
+
+    def url_and_display_name(module_state_key):
+        """
+        For a given module_state_key, return the problem's url and display_name.
+        Handle modulestore access and caching. This method ignores permissions.
+
+        May throw an ItemNotFoundError if there is no content that corresponds
+        to this module_state_key.
+        """
+        problem_store = modulestore()
+        if module_state_key not in state_keys_to_problem_info:
+            problems = problem_store.get_items(module_state_key, course_id=course_id, depth=1)
+            if not problems:
+                # Likely means that the problem was deleted from the course
+                # after the student had answered. We log this suspicion where
+                # this exception is caught.
+                raise ItemNotFoundError(
+                    "Answer Distribution: Module {} not found for course {}"
+                    .format(module_state_key, course_id)
+                )
+            problem = problems[0]
+            problem_info = (problem.url_name, problem.display_name_with_default)
+            state_keys_to_problem_info[module_state_key] = problem_info
+
+        return state_keys_to_problem_info[module_state_key]
+
+    # Iterate through all problems submitted for this course in no particular
+    # order, and build up our answer_counts dict that we will eventually return
+    answer_counts = defaultdict(lambda: defaultdict(int))
+    for module in StudentModule.all_submitted_problems_read_only(course_id):
+        try:
+            state_dict = json.loads(module.state) if module.state else {}
+            raw_answers = state_dict.get("student_answers", {})
+        except ValueError:
+            log.error(
+                "Answer Distribution: Could not parse module state for " +
+                "StudentModule id={}, course={}".format(module.id, course_id)
+            )
+            continue
+
+        # Each problem part has an ID that is derived from the
+        # module.module_state_key (with some suffix appended)
+        for problem_part_id, raw_answer in raw_answers.items():
+            # Convert whatever raw answers we have (numbers, unicode, None,
+            # etc.) to be unicode values. Note that if we get a string, it's
+            # always unicode and not str -- state comes from the json decoder,
+            # and that always returns unicode for strings.
+            answer = unicode(raw_answer)
+            try:
+                url, display_name = url_and_display_name(module.module_state_key)
+            except ItemNotFoundError:
+                msg = "Answer Distribution: Item {} referenced in StudentModule {} " + \
+                      "for user {} in course {} not found. " + \
+                      "This can happen if a student answered a question that " + \
+                      "was later deleted from the course. This answer will be " + \
+                      "omitted from the answer distribution CSV."
+                log.warning(
+                    msg.format(module.module_state_key, module.id, module.student_id, course_id)
+                )
+                continue
+
+            answer_counts[(url, display_name, problem_part_id)][answer] += 1
+
+    return answer_counts
 @transaction.commit_manually
 def grade(student, request, course, keep_raw_scores=False):
...
@@ -13,6 +13,7 @@ ASSUMPTIONS: modules have unique IDs, even across different module_types
 """
 from django.contrib.auth.models import User
+from django.conf import settings
 from django.db import models
 from django.db.models.signals import post_save
 from django.dispatch import receiver
@@ -57,6 +58,23 @@ class StudentModule(models.Model):
     created = models.DateTimeField(auto_now_add=True, db_index=True)
     modified = models.DateTimeField(auto_now=True, db_index=True)
 
+    @classmethod
+    def all_submitted_problems_read_only(cls, course_id):
+        """
+        Return all model instances that correspond to problems that have been
+        submitted for a given course. So module_type='problem' and a non-null
+        grade. Use a read replica if one exists for this environment.
+        """
+        queryset = cls.objects.filter(
+            course_id=course_id,
+            module_type='problem',
+            grade__isnull=False
+        )
+        if "read_replica" in settings.DATABASES:
+            return queryset.using("read_replica")
+        else:
+            return queryset
+
     def __repr__(self):
         return 'StudentModule<%r>' % ({
             'course_id': self.course_id,
...
"""Integration tests for submitting problem responses and getting grades.""" # -*- coding: utf-8 -*-
"""
# text processing dependancies Integration tests for submitting problem responses and getting grades.
"""
# text processing dependencies
import json import json
from textwrap import dedent from textwrap import dedent
...@@ -12,6 +14,7 @@ from django.test.utils import override_settings ...@@ -12,6 +14,7 @@ from django.test.utils import override_settings
# Need access to internal func to put users in the right group # Need access to internal func to put users in the right group
from courseware import grades from courseware import grades
from courseware.model_data import FieldDataCache from courseware.model_data import FieldDataCache
from courseware.models import StudentModule
from xmodule.modulestore.django import modulestore, editable_modulestore from xmodule.modulestore.django import modulestore, editable_modulestore
...@@ -22,6 +25,7 @@ from capa.tests.response_xml_factory import OptionResponseXMLFactory, CustomResp ...@@ -22,6 +25,7 @@ from capa.tests.response_xml_factory import OptionResponseXMLFactory, CustomResp
from courseware.tests.helpers import LoginEnrollmentTestCase from courseware.tests.helpers import LoginEnrollmentTestCase
from courseware.tests.modulestore_config import TEST_DATA_MIXED_MODULESTORE from courseware.tests.modulestore_config import TEST_DATA_MIXED_MODULESTORE
from lms.lib.xblock.runtime import quote_slashes from lms.lib.xblock.runtime import quote_slashes
from student.tests.factories import UserFactory
@override_settings(MODULESTORE=TEST_DATA_MIXED_MODULESTORE) @override_settings(MODULESTORE=TEST_DATA_MIXED_MODULESTORE)
...@@ -134,7 +138,7 @@ class TestSubmittingProblems(ModuleStoreTestCase, LoginEnrollmentTestCase): ...@@ -134,7 +138,7 @@ class TestSubmittingProblems(ModuleStoreTestCase, LoginEnrollmentTestCase):
question_text='The correct answer is Correct', question_text='The correct answer is Correct',
num_inputs=num_inputs, num_inputs=num_inputs,
weight=num_inputs, weight=num_inputs,
options=['Correct', 'Incorrect'], options=['Correct', 'Incorrect', u'ⓤⓝⓘⓒⓞⓓⓔ'],
correct_option='Correct' correct_option='Correct'
) )
@@ -793,3 +797,156 @@ class TestPythonGradedResponse(TestSubmittingProblems):
         name = 'computed_answer'
         self.computed_answer_setup(name)
         self._check_ireset(name)
+
+
+class TestAnswerDistributions(TestSubmittingProblems):
+    """Check that we can pull answer distributions for problems."""
+
+    def setUp(self):
+        """Set up a simple course with three problems."""
+        super(TestAnswerDistributions, self).setUp()
+
+        self.homework = self.add_graded_section_to_course('homework')
+        self.add_dropdown_to_section(self.homework.location, 'p1', 1)
+        self.add_dropdown_to_section(self.homework.location, 'p2', 1)
+        self.add_dropdown_to_section(self.homework.location, 'p3', 1)
+        self.refresh_course()
+
+    def test_empty(self):
+        # Just make sure we can process this without errors.
+        empty_distribution = grades.answer_distributions(self.course.id)
+        self.assertFalse(empty_distribution)  # should be empty
+
+    def test_one_student(self):
+        # Basic test to make sure we have simple behavior right for a student
+
+        # Throw in a non-ASCII answer
+        self.submit_question_answer('p1', {'2_1': u'ⓤⓝⓘⓒⓞⓓⓔ'})
+        self.submit_question_answer('p2', {'2_1': 'Correct'})
+
+        distributions = grades.answer_distributions(self.course.id)
+        self.assertEqual(
+            distributions,
+            {
+                ('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
+                    u'ⓤⓝⓘⓒⓞⓓⓔ': 1
+                },
+                ('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
+                    'Correct': 1
+                }
+            }
+        )
+
+    def test_multiple_students(self):
+        # Our test class is based around making requests for a particular user,
+        # so we're going to cheat by creating another user and copying and
+        # modifying StudentModule entries to make them from other users. It's
+        # a little hacky, but it seemed the simpler way to do this.
+        self.submit_question_answer('p1', {'2_1': u'Correct'})
+        self.submit_question_answer('p2', {'2_1': u'Incorrect'})
+        self.submit_question_answer('p3', {'2_1': u'Correct'})
+
+        # Make the above submissions owned by user2
+        user2 = UserFactory.create()
+        problems = StudentModule.objects.filter(
+            course_id=self.course.id,
+            student_id=self.student_user.id
+        )
+        for problem in problems:
+            problem.student_id = user2.id
+            problem.save()
+
+        # Now make more submissions by our original user
+        self.submit_question_answer('p1', {'2_1': u'Correct'})
+        self.submit_question_answer('p2', {'2_1': u'Correct'})
+
+        self.assertEqual(
+            grades.answer_distributions(self.course.id),
+            {
+                ('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
+                    'Correct': 2
+                },
+                ('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
+                    'Correct': 1,
+                    'Incorrect': 1
+                },
+                ('p3', 'p3', 'i4x-MITx-100-problem-p3_2_1'): {
+                    'Correct': 1
+                }
+            }
+        )
+
+    def test_other_data_types(self):
+        # We'll submit one problem, and then muck with the student_answers
+        # dict inside its state to try different data types (str, int, float,
+        # none)
+        self.submit_question_answer('p1', {'2_1': u'Correct'})
+
+        # Now fetch the state entry for that problem.
+        student_module = StudentModule.objects.get(
+            course_id=self.course.id,
+            student_id=self.student_user.id
+        )
+        for val in ('Correct', True, False, 0, 0.0, 1, 1.0, None):
+            state = json.loads(student_module.state)
+            state["student_answers"]['i4x-MITx-100-problem-p1_2_1'] = val
+            student_module.state = json.dumps(state)
+            student_module.save()
+
+            self.assertEqual(
+                grades.answer_distributions(self.course.id),
+                {
+                    ('p1', 'p1', 'i4x-MITx-100-problem-p1_2_1'): {
+                        str(val): 1
+                    },
+                }
+            )
+
+    def test_missing_content(self):
+        # If there's a StudentModule entry for content that no longer exists,
+        # we just quietly ignore it (because we can't display a meaningful url
+        # or name for it).
+        self.submit_question_answer('p1', {'2_1': 'Incorrect'})
+
+        # Now fetch the state entry for that problem and alter it so it points
+        # to a non-existent problem.
+        student_module = StudentModule.objects.get(
+            course_id=self.course.id,
+            student_id=self.student_user.id
+        )
+        student_module.module_state_key += "_fake"
+        student_module.save()
+
+        # It should be empty (ignored)
+        empty_distribution = grades.answer_distributions(self.course.id)
+        self.assertFalse(empty_distribution)  # should be empty
+
+    def test_broken_state(self):
+        # Missing or broken state for a problem should be skipped without
+        # causing the whole answer_distribution call to explode.
+
+        # Submit p1
+        self.submit_question_answer('p1', {'2_1': u'Correct'})
+
+        # Now fetch the StudentModule entry for p1 so we can corrupt its state
+        prb1 = StudentModule.objects.get(
+            course_id=self.course.id,
+            student_id=self.student_user.id
+        )
+
+        # Submit p2
+        self.submit_question_answer('p2', {'2_1': u'Incorrect'})
+
+        for new_p1_state in ('{"student_answers": {}}', "invalid json!", None):
+            prb1.state = new_p1_state
+            prb1.save()
+
+            # p1 won't show up, but p2 should still work
+            self.assertEqual(
+                grades.answer_distributions(self.course.id),
+                {
+                    ('p2', 'p2', 'i4x-MITx-100-problem-p2_2_1'): {
+                        'Incorrect': 1
+                    },
+                }
+            )
...
@@ -140,7 +140,14 @@ def instructor_dashboard(request, course_id):
             writer.writerow(encoded_row)
         for datarow in datatable['data']:
            # 's' here may be an integer, float (eg score) or string (eg student name)
-            encoded_row = [unicode(s).encode('utf-8') for s in datarow]
+            encoded_row = [
+                # If s is already a UTF-8 string, trying to make a unicode
+                # object out of it will fail unless we pass in an encoding to
+                # the constructor. But we can't do that across the board,
+                # because s is often a numeric type. So just do this.
+                s if isinstance(s, str) else unicode(s).encode('utf-8')
+                for s in datarow
+            ]
             writer.writerow(encoded_row)
         return response
@@ -1492,14 +1499,16 @@ def get_answers_distribution(request, course_id):
     """
     course = get_course_with_access(request.user, course_id, 'staff')
 
-    dist = grades.answer_distributions(request, course)
+    dist = grades.answer_distributions(course.id)
 
     d = {}
     d['header'] = ['url_name', 'display name', 'answer id', 'answer', 'count']
-    d['data'] = [[url_name, display_name, answer_id, a, answers[a]]
-                 for (url_name, display_name, answer_id), answers in dist.items()
-                 for a in answers]
+    d['data'] = [
+        [url_name, display_name, answer_id, a, answers[a]]
+        for (url_name, display_name, answer_id), answers in sorted(dist.items())
+        for a in answers
+    ]
 
     return d
...