Commit 10bd4ef6 by Adam Palay Committed by Adam Palay

dump item information to neo4j

clear request cache to avoid memory leak

update to use https

delete courses in the same transaction as writing them

python3 compatibility
parent c7df4ed0
"""
This file contains a management command for exporting the modulestore to
neo4j, a graph database.
"""
from __future__ import unicode_literals
import logging
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.utils import six
from py2neo import Graph, Node, Relationship, authenticate
from py2neo.compat import integer, string, unicode as neo4j_unicode
from request_cache.middleware import RequestCache
from xmodule.modulestore.django import modulestore
log = logging.getLogger(__name__)
# When testing locally, neo4j's bolt logger was noisy, so we'll only have it
# emit logs if there's an error.
bolt_log = logging.getLogger('neo4j.bolt') # pylint: disable=invalid-name
bolt_log.setLevel(logging.ERROR)
ITERABLE_NEO4J_TYPES = (tuple, list, set, frozenset)
PRIMITIVE_NEO4J_TYPES = (integer, string, neo4j_unicode, float, bool)
class ModuleStoreSerializer(object):
"""
Class with functionality to serialize a modulestore into subgraphs,
one graph per course.
"""
def __init__(self):
self.all_courses = modulestore().get_course_summaries()
@staticmethod
def serialize_item(item):
"""
Args:
item: an XBlock
Returns:
fields: a dictionary of an XBlock's field names and values
label: the name of the XBlock's type (i.e. 'course'
or 'problem')
"""
# convert all fields to a dict and filter out parent and children field
fields = dict(
(field, field_value.read_from(item))
for (field, field_value) in six.iteritems(item.fields)
if field not in ['parent', 'children']
)
course_key = item.scope_ids.usage_id.course_key
# set or reset some defaults
fields['edited_on'] = six.text_type(getattr(item, 'edited_on', ''))
fields['display_name'] = item.display_name_with_default
fields['org'] = course_key.org
fields['course'] = course_key.course
fields['run'] = course_key.run
fields['course_key'] = six.text_type(course_key)
label = item.scope_ids.block_type
# prune some fields
if label == 'course':
if 'checklists' in fields:
del fields['checklists']
return fields, label
def serialize_course(self, course_id):
"""
Args:
course_id: CourseKey of the course we want to serialize
Returns:
nodes: a list of py2neo Node objects
relationships: a list of py2neo Relationships objects
Serializes a course into Nodes and Relationships
"""
# create a location to node mapping we'll need later for
# writing relationships
location_to_node = {}
items = modulestore().get_items(course_id)
# create nodes
nodes = []
for item in items:
fields, label = self.serialize_item(item)
for field_name, value in six.iteritems(fields):
fields[field_name] = self.coerce_types(value)
node = Node(label, 'item', **fields)
nodes.append(node)
location_to_node[item.location] = node
# create relationships
relationships = []
for item in items:
for child_loc in item.get_children():
parent_node = location_to_node.get(item.location)
child_node = location_to_node.get(child_loc.location)
if parent_node is not None and child_node is not None:
relationship = Relationship(parent_node, "PARENT_OF", child_node)
relationships.append(relationship)
return nodes, relationships
@staticmethod
def coerce_types(value):
"""
Args:
value: the value of an xblock's field
Returns: either the value, a text version of the value, or, if the
value is iterable, the value with each element being converted to text
"""
coerced_value = value
if isinstance(value, ITERABLE_NEO4J_TYPES):
coerced_value = []
for element in value:
coerced_value.append(six.text_type(element))
# convert coerced_value back to its original type
coerced_value = type(value)(coerced_value)
# if it's not one of the types that neo4j accepts,
# just convert it to text
elif not isinstance(value, PRIMITIVE_NEO4J_TYPES):
coerced_value = six.text_type(value)
return coerced_value
class Command(BaseCommand):
"""
Command to dump modulestore data to neo4j
"""
@staticmethod
def add_to_transaction(neo4j_entities, transaction):
"""
Args:
neo4j_entities: a list of Nodes or Relationships
transaction: a neo4j transaction
"""
for entity in neo4j_entities:
transaction.create(entity)
def handle(self, *args, **options): # pylint: disable=unused-argument
"""
Iterates through each course, serializes them into graphs, and saves
those graphs to neo4j.
"""
# first, make sure that there's a valid neo4j configuration
if settings.NEO4J_CONFIG is None:
raise CommandError(
"No neo4j configuration (NEO4J_CONFIG) defined in lms.auth.json."
)
auth_params = ["{host}:{https_port}", "{user}", "{password}"]
authenticate(*[param.format(**settings.NEO4J_CONFIG) for param in auth_params])
graph = Graph(**settings.NEO4J_CONFIG)
mss = ModuleStoreSerializer()
total_number_of_courses = len(mss.all_courses)
for index, course in enumerate(mss.all_courses):
# first, clear the request cache to prevent memory leaks
RequestCache.clear_request_cache()
log.info(
"Now exporting %s to neo4j: course %d of %d total courses",
course.id,
index + 1,
total_number_of_courses
)
nodes, relationships = mss.serialize_course(course.id)
log.info(
"%d nodes and %d relationships in %s",
len(nodes),
len(relationships),
course.id
)
transaction = graph.begin()
try:
# first, delete existing course
transaction.run(
"MATCH (n:item) WHERE n.course_key='{}' DETACH DELETE n".format(
six.text_type(course.id)
)
)
# now, re-add it
self.add_to_transaction(nodes, transaction)
self.add_to_transaction(relationships, transaction)
transaction.commit()
except Exception: # pylint: disable=broad-except
log.exception(
"Error trying to dump course %s to neo4j, rolling back",
six.text_type(course.id)
)
transaction.rollback()
# coding=utf-8
"""
Tests for the dump_to_neo4j management command.
"""
from __future__ import unicode_literals
import ddt
import mock
from courseware.management.commands.dump_to_neo4j import (
ModuleStoreSerializer,
ITERABLE_NEO4J_TYPES,
)
from django.core.management import call_command
from django.core.management.base import CommandError
from django.utils import six
from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase
from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory
class TestDumpToNeo4jCommandBase(SharedModuleStoreTestCase):
"""
Base class for the test suites in this file. Sets up a couple courses.
"""
@classmethod
def setUpClass(cls):
super(TestDumpToNeo4jCommandBase, cls).setUpClass()
cls.course = CourseFactory.create()
cls.chapter = ItemFactory.create(parent=cls.course, category='chapter')
cls.sequential = ItemFactory.create(parent=cls.chapter, category='sequential')
cls.vertical = ItemFactory.create(parent=cls.sequential, category='vertical')
cls.html = ItemFactory.create(parent=cls.vertical, category='html')
cls.problem = ItemFactory.create(parent=cls.vertical, category='problem')
cls.video = ItemFactory.create(parent=cls.vertical, category='video')
cls.video2 = ItemFactory.create(parent=cls.vertical, category='video')
cls.course2 = CourseFactory.create()
class TestDumpToNeo4jCommand(TestDumpToNeo4jCommandBase):
"""
Tests for the dump to neo4j management command
"""
@mock.patch('courseware.management.commands.dump_to_neo4j.Graph')
def test_dump_to_neo4j(self, mock_graph_class):
"""
Tests the dump_to_neo4j management command works against a mock
py2neo Graph
"""
mock_graph = mock_graph_class.return_value
mock_transaction = mock.Mock()
mock_graph.begin.return_value = mock_transaction
call_command('dump_to_neo4j')
self.assertEqual(mock_graph.begin.call_count, 2)
self.assertEqual(mock_transaction.commit.call_count, 2)
self.assertEqual(mock_transaction.rollback.call_count, 0)
# 7 nodes + 9 relationships from the first course
# 2 nodes and no relationships from the second
self.assertEqual(mock_transaction.create.call_count, 18)
self.assertEqual(mock_transaction.run.call_count, 2)
@mock.patch('courseware.management.commands.dump_to_neo4j.Graph')
def test_dump_to_neo4j_rollback(self, mock_graph_class):
"""
Tests that the management command handles the case where there's
an exception trying to write to the neo4j database.
"""
mock_graph = mock_graph_class.return_value
mock_transaction = mock.Mock()
mock_graph.begin.return_value = mock_transaction
mock_transaction.run.side_effect = ValueError('Something went wrong!')
call_command('dump_to_neo4j')
self.assertEqual(mock_graph.begin.call_count, 2)
self.assertEqual(mock_transaction.commit.call_count, 0)
self.assertEqual(mock_transaction.rollback.call_count, 2)
@mock.patch('django.conf.settings.NEO4J_CONFIG', None)
def test_dump_to_neo4j_no_config(self):
"""
Tests that the command errors out if there isn't a configuration
file found
"""
with self.assertRaises(CommandError):
call_command('dump_to_neo4j')
@ddt.ddt
class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase):
"""
Tests for the ModuleStoreSerializer
"""
def setUp(self):
super(TestModuleStoreSerializer, self).setUp()
self.modulestore_serializer = ModuleStoreSerializer()
def test_serialize_item(self):
"""
Tests the serialize_item method.
"""
fields, label = self.modulestore_serializer.serialize_item(self.course)
self.assertEqual(label, "course")
self.assertIn("edited_on", fields.keys())
self.assertIn("display_name", fields.keys())
self.assertIn("org", fields.keys())
self.assertIn("course", fields.keys())
self.assertIn("run", fields.keys())
self.assertIn("course_key", fields.keys())
self.assertNotIn("checklist", fields.keys())
def test_serialize_course(self):
"""
Tests the serialize_course method.
"""
nodes, relationships = self.modulestore_serializer.serialize_course(
self.course.id
)
self.assertEqual(len(nodes), 9)
self.assertEqual(len(relationships), 7)
@ddt.data(*ITERABLE_NEO4J_TYPES)
def test_coerce_types_iterable(self, iterable_type):
"""
Tests the coerce_types helper method for iterable types
"""
example_iterable = iterable_type([object, object, object])
# each element in the iterable is not unicode:
self.assertFalse(any(isinstance(tab, six.text_type) for tab in example_iterable))
# but after they are coerced, they are:
coerced = self.modulestore_serializer.coerce_types(example_iterable)
self.assertTrue(all(isinstance(tab, six.text_type) for tab in coerced))
# finally, make sure we haven't changed the type:
self.assertEqual(type(coerced), iterable_type)
@ddt.data(
(1, 1),
(object, "<type 'object'>"),
(1.5, 1.5),
("úñîçø∂é", "úñîçø∂é"),
(b"plain string", b"plain string"),
(True, True),
(None, "None"),
)
@ddt.unpack
def test_coerce_types_base(self, original_value, coerced_expected):
"""
Tests the coerce_types helper for the neo4j base types
"""
coerced_value = self.modulestore_serializer.coerce_types(original_value)
self.assertEqual(coerced_value, coerced_expected)
......@@ -863,3 +863,6 @@ API_ACCESS_FROM_EMAIL = ENV_TOKENS.get('API_ACCESS_FROM_EMAIL')
APP_UPGRADE_CACHE_TIMEOUT = ENV_TOKENS.get('APP_UPGRADE_CACHE_TIMEOUT', APP_UPGRADE_CACHE_TIMEOUT)
AFFILIATE_COOKIE_NAME = ENV_TOKENS.get('AFFILIATE_COOKIE_NAME', AFFILIATE_COOKIE_NAME)
############## Settings for Neo4j ############################
NEO4J_CONFIG = AUTH_TOKENS.get('NEO4J_CONFIG')
......@@ -2951,3 +2951,8 @@ AFFILIATE_COOKIE_NAME = 'affiliate_id'
# The cache is cleared when Redirect models are saved/deleted
REDIRECT_CACHE_TIMEOUT = None # The length of time we cache Redirect model data
REDIRECT_CACHE_KEY_PREFIX = 'redirects'
############## Settings for Neo4j ############################
# This should be set in configuration
NEO4J_CONFIG = None
......@@ -590,3 +590,13 @@ COURSE_CATALOG_API_URL = 'https://catalog.example.com/api/v1'
COMPREHENSIVE_THEME_DIRS = [REPO_ROOT / "themes", REPO_ROOT / "common/test"]
LMS_ROOT_URL = "http://localhost:8000"
# Test configuration for neo4j
NEO4J_CONFIG = {
'bolt': True,
'password': 'password',
'user': 'neo4j',
'https_port': 7473,
'host': 'localhost',
'secure': True,
}
......@@ -186,3 +186,7 @@ sailthru-client==2.2.3
# Release utils for the edx release pipeline
edx-django-release-util==0.1.0
# Used to communicate with Neo4j, which is used internally for
# modulestore inspection
py2neo==3.1.2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment