Script to remove course_ids with \n from CSM

Some StudentModule entries were written with newlines at the end of the course_id. This script finds those entries within a given time range, archive them, and merges records if necessary (capa problems).

Script to remove course_ids with \n from CSM
Some StudentModule entries were written with newlines at the end of the course_id. This script finds those entries within a given time range, archive them, and merges records if necessary (capa problems).
75f9a522 · David Ormsbee · Nimisha Asthagiri · b525fe0e · 75f9a522
Commit 75f9a522 authored Aug 26, 2016 by David Ormsbee Committed by Nimisha Asthagiri Sep 01, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 365 additions and 0 deletions

lms/djangoapps/courseware/management/commands/fix_student_module_newlines.py
+365 -0

No files found.
--- a/lms/djangoapps/courseware/management/commands/fix_student_module_newlines.py
+++ b/lms/djangoapps/courseware/management/commands/fix_student_module_newlines.py
+# pragma: no cover
+"""
+Fix StudentModule entries that have Course IDs with trailing newlines.
+
+Due to a bug, many rows in courseware_studentmodule were written with a
+course_id that had a trailing newline. This command tries to fix that, and to
+merge that data with data that might have been written to the correct course_id.
+"""
+from collections import namedtuple
+from optparse import make_option
+from textwrap import dedent
+import json
+import logging
+
+from django.db import DatabaseError
+from django.db import transaction
+from django.core.management.base import BaseCommand, CommandError
+
+from courseware.models import StudentModule
+from util.query import use_read_replica_if_available
+
+log = logging.getLogger("fix_student_module_newlines")
+
+FixResult = namedtuple('FixResult', 'record_trimmed data_copied record_archived capa_merge error')
+
+
+class Command(BaseCommand):
+    """Fix StudentModule entries that have Course IDs with trailing newlines."""
+    args = "<start_date> <end_date>"
+    help = dedent(__doc__).strip()
+    option_list = BaseCommand.option_list + (
+        make_option('--dry_run',
+                    action='store_true',
+                    default=False,
+                    help="Run read queries and say what we're going to do, but don't actually do it."),
+    )
+
+    def handle(self, *args, **options):
+        """Fix newline courses in CSM!"""
+        if len(args) != 2:
+            raise CommandError('Must specify start and end dates: e.g. "2016-08-23 16:43:00" "2016-08-24 22:00:00"')
+
+        start, end = args
+        dry_run = options['dry_run']
+
+        log.info(
+            "Starting fix_student_module_newlines in %s mode!",
+            "dry_run" if dry_run else "real"
+        )
+
+        rows_to_fix = use_read_replica_if_available(
+            # pylint: disable=no-member
+            StudentModule.objects.raw(
+                "select * from courseware_studentmodule where modified between %s and %s and course_id like %s",
+                (start, end, '%\n')
+            )
+        )
+
+        results = [self.fix_row(row, dry_run=dry_run) for row in rows_to_fix]
+        log.info(
+            "Finished fix_student_module_newlines in %s mode!",
+            "dry_run" if dry_run else "real"
+        )
+        log.info("Stats: %s rows detected", len(results))
+        if results:
+            # Add up all the columns
+            aggregated_result = FixResult(*[sum(col) for col in zip(*results)])
+            log.info("Results: %s", aggregated_result)
+
+    def fix_row(self, read_only_newline_course_row, dry_run=False):
+        """
+        Fix a StudentModule with a trailing newline in the course_id.
+
+        Returns a count of (row modified, )
+
+        At the end of this method call, the record should no longer have a
+        trailing newline for the course_id. There are three possible outcomes:
+
+        1. There was never a conflicting entry:
+            -> We just update this row.
+        2. Conflict and the other row (with correct course_id) wins:
+            -> We archive this row.
+        2. Conflict and this row wins:
+            -> We copy the data to the conflicting row (the one that has a
+               correct course_id), and archive this row.
+
+        Even though all the StudentModule entries coming in here have trailing
+        newlines in the course_id, the deserialization logic will obscure that
+        (it gets parsed out when read from the database). We will also
+        automatically strip the newlines when writing back to the database, so
+        we have to be very careful about violating unique constraints by doing
+        unintended updates. That's why we only do an update to
+        newline_course_row if no correct_course_row exists.
+        """
+        # pylint: disable=too-many-statements
+        # We got the StudentModule from the read replica, so we have to fetch it
+        # again from our writeable database before making changes
+        log.info(
+            "Fixing row %s, course %s, student %s, block %s",
+            read_only_newline_course_row.id,
+            read_only_newline_course_row.course_id,
+            read_only_newline_course_row.student_id,
+            read_only_newline_course_row.module_state_key,
+        )
+        try:
+            newline_course_row = StudentModule.objects.get(id=read_only_newline_course_row.id)
+        except StudentModule.DoesNotExist:
+            # We're not going to be able to make any corrective writes, so just fail fast
+            log.exception("Could not find editable CSM row %s", read_only_newline_course_row.id)
+            return FixResult(record_trimmed=0, data_copied=0, record_archived=0, capa_merge=0, error=1)
+
+        # Find the StudentModule entry with the correct course_id.
+        try:
+            correct_course_row = StudentModule.objects.get(
+                student_id=newline_course_row.student_id,
+                course_id=newline_course_row.course_id,
+                module_state_key=newline_course_row.module_state_key,
+            )
+            assert correct_course_row.pk != newline_course_row.pk
+        except StudentModule.DoesNotExist:
+            correct_course_row = None
+
+        # If only an entry with the newline course exists, just change the course_id and save.
+        if correct_course_row is None:
+            log.info(
+                "No conflict: Removing trailing newline from course_id in CSM row %s - (%s) %s",
+                newline_course_row.id,
+                newline_course_row.module_type,
+                newline_course_row.module_state_key,
+            )
+            if dry_run:
+                return FixResult(record_trimmed=1, data_copied=0, record_archived=0, capa_merge=0, error=0)
+
+            try:
+                with transaction.atomic():
+                    newline_course_row.save()
+                return FixResult(record_trimmed=1, data_copied=0, record_archived=0, capa_merge=0, error=0)
+            except DatabaseError:
+                log.exception(
+                    "Could not remove newline and update CSM row %s", newline_course_row.id
+                )
+                return FixResult(record_trimmed=0, data_copied=0, record_archived=0, capa_merge=0, error=1)
+
+        # If we're here, then both versions of the row exist. We're going to
+        # pick a winner. This is handled differently for capa (where we have to
+        # merge entries) and everything else.
+        #
+        # Handle CAPA record merging!
+        if newline_course_row.module_type == 'problem':
+            return self.handle_capa_merge(correct_course_row, newline_course_row, dry_run)
+
+        row_to_keep = self.row_to_keep(correct_course_row, newline_course_row)
+
+        # To minimize the chances of conflicts and preserve history, if we
+        # decided to keep the data from the newline_course_row, we copy that
+        # data into correct_course_row, and then archive the newline_course_row
+        # as a cleanup step.
+        if row_to_keep.id == newline_course_row.id:
+            log.info(
+                "Conflict: Choosing data from newline course, copying data from CSM %s to %s - (%s) %s",
+                newline_course_row.id,
+                correct_course_row.id,
+                newline_course_row.module_type,
+                newline_course_row.module_state_key
+            )
+            correct_course_row.state = newline_course_row.state
+            correct_course_row.grade = newline_course_row.grade
+            correct_course_row.max_grade = newline_course_row.max_grade
+
+            if dry_run:
+                return FixResult(record_trimmed=0, data_copied=1, record_archived=1, capa_merge=0, error=0)
+
+            try:
+                with transaction.atomic():
+                    correct_course_row.save()
+                    self.cleanup_row(newline_course_row)
+                return FixResult(record_trimmed=0, data_copied=1, record_archived=1, capa_merge=0, error=0)
+            except DatabaseError:
+                log.exception(
+                    "Failed while trying save CSM row %s and archiving row %s",
+                    correct_course_row.id,
+                    newline_course_row.id
+                )
+                return FixResult(record_trimmed=0, data_copied=0, record_archived=0, capa_merge=0, error=1)
+
+        # If we've reached this point, we just want to keep correct_course_row
+        # and delete the newline_course_row entry.
+        log.info(
+            "Conflict: Choosing data from record with correct course_id (%s) "
+            "and archiving row with newline in course_id (%s) - (%s) %s",
+            correct_course_row.id,
+            newline_course_row.id,
+            newline_course_row.module_type,
+            newline_course_row.module_state_key
+        )
+
+        if dry_run:
+            return FixResult(record_trimmed=0, data_copied=0, record_archived=1, capa_merge=0, error=0)
+
+        try:
+            with transaction.atomic():
+                self.cleanup_row(newline_course_row)
+            return FixResult(record_trimmed=0, data_copied=0, record_archived=1, capa_merge=0, error=0)
+        except DatabaseError:
+            log.exception("Could not archive CSM row %s", newline_course_row.id)
+            return FixResult(record_trimmed=0, data_copied=0, record_archived=0, capa_merge=0, error=1)
+
+    def cleanup_row(self, model):
+        """Rename the row for archiving purposes"""
+        new_run = model.course_id.run.strip() + "_FIX_FOR_ECOM-5345"
+        model.course_id = model.course_id.replace(run=new_run)
+        model.save()
+
+    def handle_capa_merge(self, correct_course_row, newline_course_row, dry_run):
+        """
+        Merge capa state and grades, if possible.
+        """
+        log.info(
+            "-> Merging capa grade and state information for CSM rows %s (newline) and %s (correct course_id)",
+            newline_course_row.id,
+            correct_course_row.id
+        )
+        state, grade, max_grade = self.capa_state_and_grade(
+            correct_course_row, newline_course_row
+        )
+        # The derived grade should match at least one of the two records
+        nl_grade_tuple = (newline_course_row.grade, newline_course_row.max_grade)
+        cc_grade_tuple = (correct_course_row.grade, correct_course_row.max_grade)
+        safe_to_change_grade = (grade, max_grade) in [nl_grade_tuple, cc_grade_tuple]
+        if not safe_to_change_grade:
+            log.error(
+                "-> Derived grade %s does not match grades from either %s %s or "
+                "%s %s -- Archiving newline entry %s and leaving %s alone",
+                (grade, max_grade),
+                newline_course_row.id,
+                nl_grade_tuple,
+                correct_course_row.id,
+                cc_grade_tuple,
+                newline_course_row.id,
+                correct_course_row.id,
+            )
+
+        safe_to_change_result = FixResult(record_trimmed=0, data_copied=0, record_archived=1, capa_merge=1, error=0)
+        not_safe_to_change_result = FixResult(record_trimmed=0, data_copied=0, record_archived=1, capa_merge=0, error=1)
+
+        if dry_run:
+            return safe_to_change_result if safe_to_change_grade else not_safe_to_change_result
+
+        try:
+            with transaction.atomic():
+                if safe_to_change_grade:
+                    correct_course_row.state = state
+                    correct_course_row.grade = grade
+                    correct_course_row.max_grade = max_grade
+                    correct_course_row.save()
+                self.cleanup_row(newline_course_row)
+            return safe_to_change_result if safe_to_change_grade else not_safe_to_change_result
+        except DatabaseError:
+            log.exception("Failed while trying to merge capa CSM row %s (newline) and %s (correct course_id)")
+            return FixResult(record_trimmed=0, data_copied=0, record_archived=0, capa_merge=0, error=1)
+
+    def capa_state_and_grade(self, correct_course_row, newline_course_row):
+        """
+        Return a tuple that is (state, grade, max_grade) to preserve.
+
+        THIS SHOULD ONLY BE CALLED FOR CAPA PROBLEMS.
+
+        Updates to scores could have gotten crossed with updates to state, so
+        that the states and scores do not match up to each other (e.g. you might
+        have correct_course_row with the proper state, but the score
+        corresponding to that state might live in newline_course_row). So we're
+        going to re-derive the scores from the state for capa problems.
+        """
+        ccr_grade, ccr_max_grade = self.grade_for_state(correct_course_row.state)
+        if (ccr_grade, ccr_max_grade) != (correct_course_row.grade, correct_course_row.max_grade):
+            log.info(
+                "-> Correct course row %s has grade %s but should have grade %s",
+                correct_course_row.id,
+                (correct_course_row.grade, correct_course_row.max_grade),
+                (ccr_grade, ccr_max_grade)
+            )
+
+        ncr_grade, ncr_max_grade = self.grade_for_state(newline_course_row.state)
+        if (ncr_grade, ncr_max_grade) != (newline_course_row.grade, newline_course_row.max_grade):
+            log.info(
+                "-> Newline course row %s has grade %s but should have grade %s",
+                newline_course_row.id,
+                (newline_course_row.grade, newline_course_row.max_grade),
+                (ncr_grade, ncr_max_grade)
+            )
+
+        if ncr_grade > ccr_grade:
+            log.info(
+                "-> Newline course row %s has higher derived grade for state.",
+                newline_course_row.id
+            )
+            return newline_course_row.state, ncr_grade, ncr_max_grade
+
+        log.info(
+            "-> Correct course row %s has higher or equal derived grade for state.",
+            correct_course_row.id
+        )
+        return correct_course_row.state, ccr_grade, ccr_max_grade
+
+    def grade_for_state(self, state):
+        """Given unparsed state, return the (grade, max_grade) we should have."""
+        parsed_state = json.loads(state)
+        correct_map = parsed_state.get("correct_map")
+        if not correct_map:
+            input_state = parsed_state.get('input_state')
+            if input_state is not None:
+                return 0.0, float(len(input_state))
+            return None, None
+
+        def item_score(item):
+            """Return score for an individual correct_map entry."""
+            # Partial credit overrides all, even if correctness is 'incorrect'.
+            # We're inconsistent on whether we say the correctness is
+            # 'incorrect' or 'partially-correct' in this situation.
+            partial_points = item.get('npoints')
+            if partial_points is not None:
+                return partial_points
+
+            # Otherwise, we're either right or wrong.
+            correctness = item.get('correctness')
+            if correctness in ['correct', 'partially-correct']:
+                return 1.0
+
+            return 0.0
+
+        grade = sum(item_score(item) for item in correct_map.values())
+        max_grade = float(len(correct_map))
+
+        return grade, max_grade
+
+    def row_to_keep(self, correct_course_row, newline_course_row):
+        """Determine which row's data we want to keep."""
+        # Rule 1: Take the higher grade.
+        if newline_course_row.grade > correct_course_row.grade:
+            log.info(
+                "-> Newline course record grade %s is higher than correct course record grade %s",
+                newline_course_row.grade,
+                correct_course_row.grade
+            )
+            return newline_course_row
+        elif correct_course_row.grade > newline_course_row.grade:
+            log.info(
+                "-> Correct course record grade %s is higher than newline course record grade %s",
+                correct_course_row.grade,
+                newline_course_row.grade
+            )
+            return correct_course_row
+
+        # Rule 2: Take the newline course record if they've interacted with it
+        #         more recently.
+        if newline_course_row.modified > correct_course_row.modified:
+            log.info(
+                "-> Newline course record modified %s is later than correct course record modified %s",
+                newline_course_row.modified,
+                correct_course_row.modified
+            )
+            return newline_course_row
+
+        # In all other cases, take the correct_course_row
+        return correct_course_row