Commit 07aad296 by Ned Batchelder

The clean_history management command to remove excess courseware_studentmodulehistory records.

parent 8300bb5e
"""A command to clean the StudentModuleHistory table.
When we added XBlock storage, each field modification wrote a new history row
to the db. Now that we have bulk saves to avoid that database hammering, we
need to clean out the unnecessary rows from the database.
This command that does that.
"""
import datetime
import json
from optparse import make_option
import traceback
from django.core.management.base import NoArgsCommand
from django.db import connection
class Command(NoArgsCommand):
"""The actual clean_history command to clean history rows."""
help = "Deletes unneeded rows from the StudentModuleHistory table."
option_list = NoArgsCommand.option_list + (
make_option(
'--dry-run',
action='store_true',
default=False,
help="Don't change the database, just show what would be done.",
),
)
def handle_noargs(self, **options):
smhc = StudentModuleHistoryCleaner(
dry_run=options["dry_run"],
)
smhc.main()
class StudentModuleHistoryCleaner(object):
"""Logic to clean rows from the StudentModuleHistory table."""
DELETE_GAP_SECS = 0.5 # Rows this close can be discarded.
STATE_FILE = "clean_history.json"
BATCH_SIZE = 100
def __init__(self, dry_run=False):
self.dry_run = dry_run
self.next_student_module_id = 0
self.last_student_module_id = 0
def main(self, batch_size=None):
"""Invoked from the management command to do all the work."""
batch_size = batch_size or self.BATCH_SIZE
self.last_student_module_id = self.get_last_student_module_id()
self.load_state()
while self.next_student_module_id <= self.last_student_module_id:
for smid in self.module_ids_to_check(batch_size):
try:
self.clean_one_student_module(smid)
except Exception: # pylint: disable=W0703
trace = traceback.format_exc()
self.say("Couldn't clean student_module_id {}:\n{}".format(smid, trace))
self.commit()
self.save_state()
def say(self, message):
"""
Display a message to the user.
The message will have a trailing newline added to it.
"""
print message
def commit(self):
"""
Commit the transaction.
"""
self.say("Committing")
connection.commit()
def load_state(self):
"""
Load the latest state from disk.
"""
try:
state_file = open(self.STATE_FILE)
except IOError:
self.say("No stored state")
self.next_student_module_id = 0
else:
with state_file:
state = json.load(state_file)
self.say(
"Loaded stored state: {}".format(
json.dumps(state, sort_keys=True)
)
)
self.next_student_module_id = state['next_student_module_id']
def save_state(self):
"""
Save the state to disk.
"""
state = {
'next_student_module_id': self.next_student_module_id,
}
with open(self.STATE_FILE, "w") as state_file:
json.dump(state, state_file)
self.say("Saved state: {}".format(json.dumps(state, sort_keys=True)))
def get_last_student_module_id(self):
"""
Return the id of the last student_module.
"""
cursor = connection.cursor()
cursor.execute("""
SELECT max(student_module_id) FROM courseware_studentmodulehistory
""")
last = cursor.fetchone()[0]
self.say("Last student_module_id is {}".format(last))
return last
def module_ids_to_check(self, batch_size):
"""Produce a sequence of student module ids to check.
`batch_size` is how many module ids to produce, max.
The sequence starts with `next_student_module_id`, and goes up to
and including `last_student_module_id`.
`next_student_module_id` is updated as each id is yielded.
"""
start = self.next_student_module_id
for smid in range(start, start+batch_size):
if smid > self.last_student_module_id:
break
yield smid
self.next_student_module_id = smid+1
def get_history_for_student_modules(self, student_module_id):
"""
Get the history rows for a student module.
```student_module_id```: the id of the student module we're
interested in.
Return a list: [(id, created), ...], all the rows of history.
"""
cursor = connection.cursor()
cursor.execute("""
SELECT id, created FROM courseware_studentmodulehistory
WHERE student_module_id = %s
ORDER BY created
""",
[student_module_id]
)
history = cursor.fetchall()
return history
def delete_history(self, ids_to_delete):
"""
Delete history rows.
```ids_to_delete```: a non-empty list (or set...) of history row ids to delete.
"""
assert ids_to_delete
cursor = connection.cursor()
cursor.execute("""
DELETE FROM courseware_studentmodulehistory
WHERE id IN ({ids})
""".format(ids=",".join(str(i) for i in ids_to_delete))
)
def clean_one_student_module(self, student_module_id):
"""Clean one StudentModule's-worth of history.
`student_module_id`: the id of the StudentModule to process.
"""
delete_gap = datetime.timedelta(seconds=self.DELETE_GAP_SECS)
history = self.get_history_for_student_modules(student_module_id)
if not history:
self.say("No history for student_module_id {}".format(student_module_id))
return
ids_to_delete = []
next_created = None
for history_id, created in reversed(history):
if next_created is not None:
# Compare this timestamp with the next one.
if (next_created - created) < delete_gap:
# This row is followed closely by another, we can discard
# this one.
ids_to_delete.append(history_id)
next_created = created
verb = "Would have deleted" if self.dry_run else "Deleting"
self.say("{verb} {to_delete} rows of {total} for student_module_id {id}".format(
verb=verb,
to_delete=len(ids_to_delete),
total=len(history),
id=student_module_id,
))
if ids_to_delete and not self.dry_run:
self.delete_history(ids_to_delete)
"""Test the clean_history management command."""
import fnmatch
from mock import Mock
import os.path
import textwrap
import dateutil.parser
from django.test import TestCase
from django.db import connection
from courseware.management.commands.clean_history import StudentModuleHistoryCleaner
# In lots of places in this file, smhc == StudentModuleHistoryCleaner
def parse_date(sdate):
"""Parse a string date into a datetime."""
parsed = dateutil.parser.parse(sdate)
parsed = parsed.replace(tzinfo=dateutil.tz.gettz('UTC'))
return parsed
class SmhcSayStubbed(StudentModuleHistoryCleaner):
"""StudentModuleHistoryCleaner, but with .say() stubbed for testing."""
def __init__(self, **kwargs):
super(SmhcSayStubbed, self).__init__(**kwargs)
self.said_lines = []
def say(self, msg):
self.said_lines.append(msg)
class SmhcDbMocked(SmhcSayStubbed):
"""StudentModuleHistoryCleaner, but with db access mocked."""
def __init__(self, **kwargs):
super(SmhcDbMocked, self).__init__(**kwargs)
self.get_history_for_student_modules = Mock()
self.delete_history = Mock()
def set_rows(self, rows):
"""Set the mocked history rows."""
rows = [(row_id, parse_date(created)) for row_id, created in rows]
self.get_history_for_student_modules.return_value = rows
class HistoryCleanerTest(TestCase):
"""Base class for all history cleaner tests."""
maxDiff = None
def setUp(self):
super(HistoryCleanerTest, self).setUp()
self.addCleanup(self.clean_up_state_file)
def write_state_file(self, state):
"""Write the string `state` into the state file read by StudentModuleHistoryCleaner."""
with open(StudentModuleHistoryCleaner.STATE_FILE, "w") as state_file:
state_file.write(state)
def read_state_file(self):
"""Return the string contents of the state file read by StudentModuleHistoryCleaner."""
with open(StudentModuleHistoryCleaner.STATE_FILE) as state_file:
return state_file.read()
def clean_up_state_file(self):
"""Remove any state file lying around."""
if os.path.exists(StudentModuleHistoryCleaner.STATE_FILE):
os.remove(StudentModuleHistoryCleaner.STATE_FILE)
def assert_said(self, smhc, *msgs):
"""Fail if the `smhc` didn't say `msgs`.
The messages passed here are `fnmatch`-style patterns: "*" means anything.
"""
for said, pattern in zip(smhc.said_lines, msgs):
if not fnmatch.fnmatch(said, pattern):
fmt = textwrap.dedent("""\
Messages:
{msgs}
don't match patterns:
{patterns}
Failed at {said!r} and {pattern!r}
""")
msg = fmt.format(
msgs="\n".join(smhc.said_lines),
patterns="\n".join(msgs),
said=said,
pattern=pattern
)
self.fail(msg)
def parse_rows(self, rows):
"""Parse convenient rows into real data."""
rows = [
(row_id, parse_date(created), student_module_id)
for row_id, created, student_module_id in rows
]
return rows
def write_history(self, rows):
"""Write history rows to the db.
Each row should be (id, created, student_module_id).
"""
cursor = connection.cursor()
cursor.executemany("""
INSERT INTO courseware_studentmodulehistory
(id, created, student_module_id)
VALUES (%s, %s, %s)
""",
self.parse_rows(rows),
)
def read_history(self):
"""Read the history from the db, and return it as a list of tuples.
Returns [(id, created, student_module_id), ...]
"""
cursor = connection.cursor()
cursor.execute("""
SELECT id, created, student_module_id FROM courseware_studentmodulehistory
""")
return cursor.fetchall()
def assert_history(self, rows):
"""Assert that the history rows are the same as `rows`."""
self.assertEqual(self.parse_rows(rows), self.read_history())
class HistoryCleanerNoDbTest(HistoryCleanerTest):
"""Tests of StudentModuleHistoryCleaner with db access mocked."""
def test_empty(self):
smhc = SmhcDbMocked()
smhc.set_rows([])
smhc.clean_one_student_module(1)
self.assert_said(smhc, "No history for student_module_id 1")
# Nothing to delete, so delete_history wasn't called.
self.assertFalse(smhc.delete_history.called)
def test_one_row(self):
smhc = SmhcDbMocked()
smhc.set_rows([
(1, "2013-07-13 12:11:10.987"),
])
smhc.clean_one_student_module(1)
self.assert_said(smhc, "Deleting 0 rows of 1 for student_module_id 1")
# Nothing to delete, so delete_history wasn't called.
self.assertFalse(smhc.delete_history.called)
def test_one_row_dry_run(self):
smhc = SmhcDbMocked(dry_run=True)
smhc.set_rows([
(1, "2013-07-13 12:11:10.987"),
])
smhc.clean_one_student_module(1)
self.assert_said(smhc, "Would have deleted 0 rows of 1 for student_module_id 1")
# Nothing to delete, so delete_history wasn't called.
self.assertFalse(smhc.delete_history.called)
def test_two_rows_close(self):
smhc = SmhcDbMocked()
smhc.set_rows([
(7, "2013-07-13 12:34:56.789"),
(9, "2013-07-13 12:34:56.987"),
])
smhc.clean_one_student_module(1)
self.assert_said(smhc, "Deleting 1 rows of 2 for student_module_id 1")
smhc.delete_history.assert_called_once_with([7])
def test_two_rows_far(self):
smhc = SmhcDbMocked()
smhc.set_rows([
(7, "2013-07-13 12:34:56.789"),
(9, "2013-07-13 12:34:57.890"),
])
smhc.clean_one_student_module(1)
self.assert_said(smhc, "Deleting 0 rows of 2 for student_module_id 1")
self.assertFalse(smhc.delete_history.called)
def test_a_bunch_of_rows(self):
smhc = SmhcDbMocked()
smhc.set_rows([
( 4, "2013-07-13 16:30:00.000"), # keep
( 8, "2013-07-13 16:30:01.100"),
(15, "2013-07-13 16:30:01.200"),
(16, "2013-07-13 16:30:01.300"), # keep
(23, "2013-07-13 16:30:02.400"),
(42, "2013-07-13 16:30:02.500"),
(98, "2013-07-13 16:30:02.600"), # keep
(99, "2013-07-13 16:30:59.000"), # keep
])
smhc.clean_one_student_module(17)
self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 17")
smhc.delete_history.assert_called_once_with([42, 23, 15, 8])
class HistoryCleanerWitDbTest(HistoryCleanerTest):
"""Tests of StudentModuleHistoryCleaner with a real db."""
def test_no_history(self):
# Cleaning a student_module_id with no history leaves the db unchanged.
smhc = SmhcSayStubbed()
self.write_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
( 8, "2013-07-13 16:30:01.100", 11),
(15, "2013-07-13 16:30:01.200", 11),
(16, "2013-07-13 16:30:01.300", 11), # keep
(23, "2013-07-13 16:30:02.400", 11),
(42, "2013-07-13 16:30:02.500", 11),
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
smhc.clean_one_student_module(22)
self.assert_said(smhc, "No history for student_module_id 22")
self.assert_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
( 8, "2013-07-13 16:30:01.100", 11),
(15, "2013-07-13 16:30:01.200", 11),
(16, "2013-07-13 16:30:01.300", 11), # keep
(23, "2013-07-13 16:30:02.400", 11),
(42, "2013-07-13 16:30:02.500", 11),
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
def test_a_bunch_of_rows(self):
# Cleaning a student_module_id with 8 records, 4 to delete.
smhc = SmhcSayStubbed()
self.write_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
( 8, "2013-07-13 16:30:01.100", 11),
(15, "2013-07-13 16:30:01.200", 11),
(16, "2013-07-13 16:30:01.300", 11), # keep
(17, "2013-07-13 16:30:01.310", 22), # other student_module_id!
(23, "2013-07-13 16:30:02.400", 11),
(42, "2013-07-13 16:30:02.500", 11),
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
smhc.clean_one_student_module(11)
self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 11")
self.assert_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
(16, "2013-07-13 16:30:01.300", 11), # keep
(17, "2013-07-13 16:30:01.310", 22), # other student_module_id!
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
def test_a_bunch_of_rows_dry_run(self):
# Cleaning a student_module_id with 8 records, 4 to delete,
# but don't really do it.
smhc = SmhcSayStubbed(dry_run=True)
self.write_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
( 8, "2013-07-13 16:30:01.100", 11),
(15, "2013-07-13 16:30:01.200", 11),
(16, "2013-07-13 16:30:01.300", 11), # keep
(23, "2013-07-13 16:30:02.400", 11),
(42, "2013-07-13 16:30:02.500", 11),
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
smhc.clean_one_student_module(11)
self.assert_said(smhc, "Would have deleted 4 rows of 8 for student_module_id 11")
self.assert_history([
( 4, "2013-07-13 16:30:00.000", 11), # keep
( 8, "2013-07-13 16:30:01.100", 11),
(15, "2013-07-13 16:30:01.200", 11),
(16, "2013-07-13 16:30:01.300", 11), # keep
(23, "2013-07-13 16:30:02.400", 11),
(42, "2013-07-13 16:30:02.500", 11),
(98, "2013-07-13 16:30:02.600", 11), # keep
(99, "2013-07-13 16:30:59.000", 11), # keep
])
def test_a_bunch_of_rows_in_jumbled_order(self):
# Cleaning a student_module_id with 8 records, 4 to delete.
smhc = SmhcSayStubbed()
self.write_history([
(23, "2013-07-13 16:30:01.100", 11),
(24, "2013-07-13 16:30:01.300", 11), # keep
(27, "2013-07-13 16:30:02.500", 11),
(30, "2013-07-13 16:30:01.350", 22), # other student_module_id!
(32, "2013-07-13 16:30:59.000", 11), # keep
(50, "2013-07-13 16:30:02.400", 11),
(51, "2013-07-13 16:30:02.600", 11), # keep
(56, "2013-07-13 16:30:00.000", 11), # keep
(57, "2013-07-13 16:30:01.200", 11),
])
smhc.clean_one_student_module(11)
self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 11")
self.assert_history([
(24, "2013-07-13 16:30:01.300", 11), # keep
(30, "2013-07-13 16:30:01.350", 22), # other student_module_id!
(32, "2013-07-13 16:30:59.000", 11), # keep
(51, "2013-07-13 16:30:02.600", 11), # keep
(56, "2013-07-13 16:30:00.000", 11), # keep
])
def test_get_last_student_module(self):
# Can we find the last student_module_id properly?
smhc = SmhcSayStubbed()
self.write_history([
(23, "2013-07-13 16:30:01.100", 11),
(24, "2013-07-13 16:30:01.300", 44),
(27, "2013-07-13 16:30:02.500", 11),
(30, "2013-07-13 16:30:01.350", 22),
(32, "2013-07-13 16:30:59.000", 11),
(51, "2013-07-13 16:30:02.600", 33),
(56, "2013-07-13 16:30:00.000", 11),
])
last = smhc.get_last_student_module_id()
self.assertEqual(last, 44)
self.assert_said(smhc, "Last student_module_id is 44")
def test_load_state_with_no_stored_state(self):
smhc = SmhcSayStubbed()
self.assertFalse(os.path.exists(smhc.STATE_FILE))
smhc.load_state()
self.assertEqual(smhc.next_student_module_id, 0)
self.assert_said(smhc, "No stored state")
def test_load_stored_state(self):
self.write_state_file('{"next_student_module_id": 23}')
smhc = SmhcSayStubbed()
smhc.load_state()
self.assertEqual(smhc.next_student_module_id, 23)
self.assert_said(smhc, 'Loaded stored state: {"next_student_module_id": 23}')
def test_save_state(self):
smhc = SmhcSayStubbed()
smhc.next_student_module_id = 47
smhc.save_state()
state = self.read_state_file()
self.assertEqual(state, '{"next_student_module_id": 47}')
class SmhcForTestingMain(SmhcSayStubbed):
"""A StudentModuleHistoryCleaner with a few function stubbed for testing main."""
def __init__(self, *args, **kwargs):
self.exception_smids = kwargs.pop('exception_smids', ())
super(SmhcForTestingMain, self).__init__(*args, **kwargs)
def clean_one_student_module(self, smid):
self.say("(not really cleaning {})".format(smid))
if smid in self.exception_smids:
raise Exception("Something went wrong!")
def commit(self):
self.say("(not really committing)")
class HistoryCleanerMainTest(HistoryCleanerTest):
"""Tests of StudentModuleHistoryCleaner.main(), using SmhcForTestingMain."""
def test_only_one_record(self):
smhc = SmhcForTestingMain()
self.write_history([
(1, "2013-07-15 11:47:00.000", 1),
])
smhc.main()
self.assert_said(smhc,
'Last student_module_id is 1',
'No stored state',
'(not really cleaning 0)',
'(not really cleaning 1)',
'(not really committing)',
'Saved state: {"next_student_module_id": 2}',
)
def test_already_processed_some(self):
smhc = SmhcForTestingMain()
self.write_state_file('{"next_student_module_id": 25}')
self.write_history([
(1, "2013-07-15 15:04:00.000", 23),
(2, "2013-07-15 15:04:11.000", 23),
(3, "2013-07-15 15:04:01.000", 24),
(4, "2013-07-15 15:04:00.000", 25),
(5, "2013-07-15 15:04:00.000", 26),
])
smhc.main()
self.assert_said(smhc,
'Last student_module_id is 26',
'Loaded stored state: {"next_student_module_id": 25}',
'(not really cleaning 25)',
'(not really cleaning 26)',
'(not really committing)',
'Saved state: {"next_student_module_id": 27}'
)
def test_working_in_batches(self):
smhc = SmhcForTestingMain()
self.write_state_file('{"next_student_module_id": 25}')
self.write_history([
(3, "2013-07-15 15:04:01.000", 24),
(4, "2013-07-15 15:04:00.000", 25),
(5, "2013-07-15 15:04:00.000", 26),
(6, "2013-07-15 15:04:00.000", 27),
(7, "2013-07-15 15:04:00.000", 28),
(8, "2013-07-15 15:04:00.000", 29),
])
smhc.main(batch_size=3)
self.assert_said(smhc,
'Last student_module_id is 29',
'Loaded stored state: {"next_student_module_id": 25}',
'(not really cleaning 25)',
'(not really cleaning 26)',
'(not really cleaning 27)',
'(not really committing)',
'Saved state: {"next_student_module_id": 28}',
'(not really cleaning 28)',
'(not really cleaning 29)',
'(not really committing)',
'Saved state: {"next_student_module_id": 30}',
)
def test_something_failing_while_cleaning(self):
smhc = SmhcForTestingMain(exception_smids=[26])
self.write_state_file('{"next_student_module_id": 25}')
self.write_history([
(3, "2013-07-15 15:04:01.000", 24),
(4, "2013-07-15 15:04:00.000", 25),
(5, "2013-07-15 15:04:00.000", 26),
(6, "2013-07-15 15:04:00.000", 27),
(7, "2013-07-15 15:04:00.000", 28),
(8, "2013-07-15 15:04:00.000", 29),
])
smhc.main(batch_size=3)
self.assert_said(smhc,
'Last student_module_id is 29',
'Loaded stored state: {"next_student_module_id": 25}',
'(not really cleaning 25)',
'(not really cleaning 26)',
"Couldn't clean student_module_id 26:\nTraceback*Exception: Something went wrong!\n",
'(not really cleaning 27)',
'(not really committing)',
'Saved state: {"next_student_module_id": 28}',
'(not really cleaning 28)',
'(not really cleaning 29)',
'(not really committing)',
'Saved state: {"next_student_module_id": 30}',
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment