Commit ac7879fd by David Ormsbee Committed by Nimisha Asthagiri

Make BlockStructure caching more space-efficient (grouping, zlib).

Before this commit, BlockStructures were being cached with one
entry for the top level data (structure, course-wide transform
data), and an entry for each block's data. There can be thousands
of blocks in a course. While per-block entries make it cheap to fetch
a small subset of blocks, splitting the block information across many
entries drastically increases the overall size of the data stored for
generating course-level views, because compression is far less
effective on many small entries than on one combined entry.

Making this change yields about a 7X decrease in serialized data
size.
parent 2e0a6619
"""
...
"""
from django.core.cache import get_cache
from django.core.cache import cache
from openedx.core.lib.block_cache.block_cache import get_blocks, clear_block_cache
from xmodule.modulestore.django import modulestore
......@@ -23,14 +23,9 @@ LMS_COURSE_TRANSFORMERS = [
]
_COURSE_BLOCKS_CACHE = None
def _get_cache():
    """
    Return the cache into which block structures are stored.

    Function exists for mocking/testing, or if we want a custom cache.
    """
    # NOTE(review): this text is a diff with +/- markers stripped; the old
    # memoized get_cache('lms.course_blocks') lookup was removed by the
    # commit, leaving the default django cache.
    return cache
def get_course_blocks(
......
......@@ -5,6 +5,7 @@ from collections import defaultdict
from graph_traversals import traverse_topologically, traverse_post_order
from logging import getLogger
from openedx.core.lib.cache_utils import zpickle, zunpickle
from transformer import BlockStructureTransformers
......@@ -226,15 +227,21 @@ class BlockStructureFactory(object):
@classmethod
def serialize_to_cache(cls, block_structure, cache):
    """
    Serialize the given block structure into a single compressed, pickled
    cache entry (grouping the relations, transformer data, and per-block
    data together so compression is effective).

    Arguments:
        block_structure (BlockStructureBlockData) - The block structure
            whose data is to be cached.
        cache (django.core.cache.BaseCache) - The cache into which the
            serialized data is stored, keyed by the root block's usage key.
    """
    data_to_cache = (
        block_structure._block_relations,
        block_structure._transformer_data,
        block_structure._block_data_map,
    )
    zp_data_to_cache = zpickle(data_to_cache)
    cache.set(
        cls._encode_root_cache_key(block_structure.root_block_key),
        zp_data_to_cache,
    )
    # Lazy %-style args so the message is only formatted when DEBUG is on.
    logger.debug(
        "Wrote BlockStructure %s to cache, size: %s",
        block_structure.root_block_key,
        len(zp_data_to_cache),
    )
@classmethod
def create_from_cache(cls, root_block_key, cache):
    """
    Deserialize and return the block structure for the given root block,
    if it is found in the given cache and its collected transformer data
    is still at the currently registered transformer versions.

    Arguments:
        root_block_key - The usage key of the root block of the structure.
        cache (django.core.cache.BaseCache) - The cache from which the
            serialized data is read.

    Returns:
        BlockStructure, if the block structure is in the cache, and
        NoneType otherwise.
    """
    zp_data_from_cache = cache.get(cls._encode_root_cache_key(root_block_key))
    if not zp_data_from_cache:
        return None

    logger.debug(
        "Read BlockStructure %s from cache, size: %s",
        root_block_key,
        len(zp_data_from_cache),
    )

    # NOTE(review): unpickling cache contents — safe only as long as the
    # cache is trusted (written exclusively by serialize_to_cache).
    block_relations, transformer_data, block_data_map = zunpickle(zp_data_from_cache)
    block_structure = BlockStructureBlockData(root_block_key)
    block_structure._block_relations = block_relations
    block_structure._transformer_data = transformer_data
    block_structure._block_data_map = block_data_map

    # Reject cached data collected by outdated transformer versions, so
    # callers regenerate rather than use stale transformer output.
    transformer_issues = {}
    for transformer in BlockStructureTransformers.get_registered_transformers():
        cached_transformer_version = block_structure.get_transformer_data_version(transformer)
        if transformer.VERSION != cached_transformer_version:
            transformer_issues[transformer.name()] = "version: {}, cached: {}".format(
                transformer.VERSION,
                cached_transformer_version,
            )

    if transformer_issues:
        # BUG FIX: previously written as logger.info("...").format(...),
        # which called .format on logger.info's return value (None) and
        # raised AttributeError whenever a version mismatch occurred.
        logger.info(
            "Collected data for the following transformers have issues:\n%s.",
            '\n'.join(
                t_name + ": " + t_value
                for t_name, t_value in transformer_issues.iteritems()
            ),
        )
        return None

    return block_structure
@classmethod
def remove_from_cache(cls, root_block_key, cache):
......
"""
Utilities related to caching.
"""
import cPickle as pickle
import functools
import zlib
from xblock.core import XBlock
......@@ -47,3 +48,13 @@ def hashvalue(arg):
return unicode(arg.location)
else:
return unicode(arg)
def zpickle(data):
    """
    Serialize *data* with pickle (highest protocol) and return the
    zlib-compressed byte string.
    """
    pickled_bytes = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
    return zlib.compress(pickled_bytes)
def zunpickle(zdata):
    """
    Decompress *zdata* with zlib and return the unpickled data structure.

    NOTE: unpickling executes arbitrary code; only call this on trusted,
    internally produced data (e.g. values written by zpickle).
    """
    decompressed = zlib.decompress(zdata)
    return pickle.loads(decompressed)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment