Commit ac7879fd by David Ormsbee Committed by Nimisha Asthagiri

Make BlockStructure caching more space-efficient (grouping, zlib).

Before this commit, BlockStructures were being cached with one
entry for the top level data (structure, course-wide transform
data), and an entry for each block's data. There can be thousands
of blocks in a course. While per-block entries make it cheap to fetch
a small subset of blocks, splitting the block information across many
entries drastically increases the overall size of the data stored for
generating course-level views, because compression is far less
effective on many small entries than on one combined entry.

Making this change yields about a 7X decrease in serialized data
size.
parent 2e0a6619
"""
...
"""
from django.core.cache import get_cache
from django.core.cache import cache
from openedx.core.lib.block_cache.block_cache import get_blocks, clear_block_cache
from xmodule.modulestore.django import modulestore
......@@ -23,14 +23,9 @@ LMS_COURSE_TRANSFORMERS = [
]
_COURSE_BLOCKS_CACHE = None
def _get_cache():
    """
    Return the cache into which block structures are stored.

    Function exists for mocking/testing, or if we want a custom cache.
    """
    # NOTE(review): this text is a diff with +/- markers stripped; the old
    # memoized get_cache('lms.course_blocks') lookup was removed by the
    # commit, leaving the default django cache.
    return cache
def get_course_blocks(
......
......@@ -5,6 +5,7 @@ from collections import defaultdict
from graph_traversals import traverse_topologically, traverse_post_order
from logging import getLogger
from openedx.core.lib.cache_utils import zpickle, zunpickle
from transformer import BlockStructureTransformers
......@@ -226,15 +227,21 @@ class BlockStructureFactory(object):
@classmethod
def serialize_to_cache(cls, block_structure, cache):
    """
    Serialize the given block structure into a single compressed, pickled
    cache entry (grouping the relations, transformer data, and per-block
    data together so compression is effective).

    Arguments:
        block_structure (BlockStructureBlockData) - The block structure
            whose data is to be cached.
        cache (django.core.cache.BaseCache) - The cache into which the
            serialized data is stored, keyed by the root block's usage key.
    """
    data_to_cache = (
        block_structure._block_relations,
        block_structure._transformer_data,
        block_structure._block_data_map,
    )
    zp_data_to_cache = zpickle(data_to_cache)
    cache.set(
        cls._encode_root_cache_key(block_structure.root_block_key),
        zp_data_to_cache,
    )
    # Lazy %-style args so the message is only formatted when DEBUG is on.
    logger.debug(
        "Wrote BlockStructure %s to cache, size: %s",
        block_structure.root_block_key,
        len(zp_data_to_cache),
    )
@classmethod
def create_from_cache(cls, root_block_key, cache):
    """
    Deserialize and return the block structure for the given root block,
    if it is found in the given cache and its collected transformer data
    is still at the currently registered transformer versions.

    Arguments:
        root_block_key - The usage key of the root block of the structure.
        cache (django.core.cache.BaseCache) - The cache from which the
            serialized data is read.

    Returns:
        BlockStructure, if the block structure is in the cache, and
        NoneType otherwise.
    """
    zp_data_from_cache = cache.get(cls._encode_root_cache_key(root_block_key))
    if not zp_data_from_cache:
        return None

    logger.debug(
        "Read BlockStructure %s from cache, size: %s",
        root_block_key,
        len(zp_data_from_cache),
    )

    # NOTE(review): unpickling cache contents — safe only as long as the
    # cache is trusted (written exclusively by serialize_to_cache).
    block_relations, transformer_data, block_data_map = zunpickle(zp_data_from_cache)
    block_structure = BlockStructureBlockData(root_block_key)
    block_structure._block_relations = block_relations
    block_structure._transformer_data = transformer_data
    block_structure._block_data_map = block_data_map

    # Reject cached data collected by outdated transformer versions, so
    # callers regenerate rather than use stale transformer output.
    transformer_issues = {}
    for transformer in BlockStructureTransformers.get_registered_transformers():
        cached_transformer_version = block_structure.get_transformer_data_version(transformer)
        if transformer.VERSION != cached_transformer_version:
            transformer_issues[transformer.name()] = "version: {}, cached: {}".format(
                transformer.VERSION,
                cached_transformer_version,
            )

    if transformer_issues:
        # BUG FIX: previously written as logger.info("...").format(...),
        # which called .format on logger.info's return value (None) and
        # raised AttributeError whenever a version mismatch occurred.
        logger.info(
            "Collected data for the following transformers have issues:\n%s.",
            '\n'.join(
                t_name + ": " + t_value
                for t_name, t_value in transformer_issues.iteritems()
            ),
        )
        return None

    return block_structure
@classmethod
def remove_from_cache(cls, root_block_key, cache):
......
"""
Utilities related to caching.
"""
import cPickle as pickle
import functools
import zlib
from xblock.core import XBlock
......@@ -47,3 +48,13 @@ def hashvalue(arg):
return unicode(arg.location)
else:
return unicode(arg)
def zpickle(data):
    """
    Serialize *data* with pickle (highest protocol) and return the
    zlib-compressed byte string.
    """
    pickled_bytes = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
    return zlib.compress(pickled_bytes)
def zunpickle(zdata):
    """
    Decompress *zdata* with zlib and return the unpickled data structure.

    NOTE: unpickling executes arbitrary code; only call this on trusted,
    internally produced data (e.g. values written by zpickle).
    """
    decompressed = zlib.decompress(zdata)
    return pickle.loads(decompressed)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment