Commit 90940d48 by Daniel Friedman

Batch the bulk_creates in bulk_create_notes

TNL-2428
parent f048f5ed
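For context before the diff: the change threads a new `--batch_size` option through the `bulk_create_notes` management command so notes are inserted in bounded chunks rather than one giant `bulk_create`. A hedged sketch of driving the command programmatically (not part of the commit; assumes the command file is installed under an app's `management/commands/`):

```python
# Hypothetical usage sketch, not from the commit.
from django.core.management import call_command

# Create 10000 notes total, inserted 500 rows per INSERT statement so no
# single query exceeds the production maximum SQL query size.
call_command('bulk_create_notes', '10000', batch_size=500)
```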
+import itertools
 import json
 from optparse import make_option
 import os
@@ -33,18 +34,30 @@ class Command(BaseCommand):
             default=['edX/DemoX/Demo_Course'],
             help='comma-separated list of course_ids for which notes should be randomly attributed'
         ),
+        make_option(
+            '--batch_size',
+            action='store',
+            type='int',
+            default=1000,
+            help='number of notes that should be bulk inserted at a time - useful for getting around the maximum SQL '
+                 'query size'
+        )
     )
 
     help = 'Add N random notes to the database'
 
     def handle(self, *args, **options):
         if len(args) != 1:
-            raise CommandError("generate_random_notes takes the following arguments: " + self.args)
+            raise CommandError("bulk_create_notes takes the following arguments: " + self.args)
 
         total_notes = int(args[0])
         notes_per_user = options['per_user']
         course_ids = options['course_ids']
-        Note.objects.bulk_create(note_iter(total_notes, notes_per_user, course_ids))
+        batch_size = options['batch_size']
+        # In production, there is a max SQL query size. Batch the bulk inserts
+        # such that we don't exceed this limit.
+        for notes_chunk in grouper_it(note_iter(total_notes, notes_per_user, course_ids), batch_size):
+            Note.objects.bulk_create(notes_chunk)
 
 def note_iter(total_notes, notes_per_user, course_ids):
     """
@@ -89,3 +102,18 @@ def note_iter(total_notes, notes_per_user, course_ids):
         ranges=json.dumps([{"start": "/div[1]/p[1]", "end": "/div[1]/p[1]", "startOffset": 0, "endOffset": 6}]),
         tags=json.dumps(weighted_get_words([(1, 40), (2, 30), (5, 15), (10, 10), (15, 5)]))
     )
+
+
+def grouper_it(iterable, batch_size):
+    """
+    Return an iterator of iterators. Each child iterator yields the
+    next `batch_size`-many elements from `iterable`.
+    """
+    iterator = iter(iterable)
+    while True:
+        chunk_it = itertools.islice(iterator, batch_size)
+        try:
+            first_el = next(chunk_it)
+        except StopIteration:
+            break
+        yield itertools.chain((first_el,), chunk_it)
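Since the diff shows `grouper_it` only in isolation, here is a minimal standalone sketch of its behavior (sample data invented for illustration). The key detail is that `islice` is applied to the single shared `iterator`, so each chunk resumes where the previous one stopped; slicing the raw iterable instead would restart from the beginning on list-like inputs and loop forever.

```python
import itertools


def grouper_it(iterable, batch_size):
    """Yield lazy chunks of up to `batch_size` elements from `iterable`."""
    iterator = iter(iterable)
    while True:
        # Slice the shared iterator so each chunk picks up where the last ended.
        chunk_it = itertools.islice(iterator, batch_size)
        try:
            # Pull one element eagerly: an empty slice means the source is exhausted.
            first_el = next(chunk_it)
        except StopIteration:
            break
        yield itertools.chain((first_el,), chunk_it)


# Splits a 10-element generator into lazy batches of 4, 4, and 2.
for chunk in grouper_it((n for n in range(10)), 4):
    print(list(chunk))
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8, 9]
```

Because each chunk is itself a lazy iterator, the command never materializes all `total_notes` `Note` objects at once; each `bulk_create` call consumes one chunk of at most `batch_size` rows, keeping every INSERT under the max query size the in-code comment mentions.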