Commit 4f6905ed by E. Kolpakov

Added library search indexer and tests

Search indexer is not bound to any library modifications, so tests fail - this is expected.
parent d34c6a07
......@@ -10,10 +10,8 @@ from eventtracking import tracker
from xmodule.modulestore import ModuleStoreEnum
from search.search_engine_base import SearchEngine
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
# Use default index and document names for now
INDEX_NAME = "courseware_index"
DOCUMENT_TYPE = "courseware_content"
# REINDEX_AGE is the default amount of time that we look back for changes
# that might have happened. If we are provided with a time at which the
......@@ -40,18 +38,56 @@ class SearchIndexingError(Exception):
self.error_list = error_list
class CoursewareSearchIndexer(object):
class SearchIndexBase(object):
"""
Class to perform indexing for courseware search from different modulestores
"""
INDEX_NAME = None
DOCUMENT_TYPE = None
INDEX_EVENT = {
'name': None,
'category': None
}
@classmethod
def _fetch_top_level(self, modulestore, structure_key):
""" Fetch the item from the modulestore location """
raise NotImplementedError("Should be overridden in child classes")
@classmethod
def _get_location_info(self, structure_key):
""" Builds location info dictionary """
raise NotImplementedError("Should be overridden in child classes")
@classmethod
def index_course(cls, modulestore, course_key, triggered_at=None, reindex_age=REINDEX_AGE):
def _id_modifier(self, usage_id):
""" Modifies usage_id to submit to index """
return usage_id
@classmethod
def remove_deleted_items(cls, searcher, structure_key, exclude_items):
"""
remove any item that is present in the search index that is not present in updated list of indexed items
as we find items we can shorten the set of items to keep
"""
response = searcher.search(
doc_type=cls.DOCUMENT_TYPE,
field_dictionary=cls._get_location_info(structure_key),
exclude_ids=exclude_items
)
result_ids = [result["data"]["id"] for result in response["results"]]
for result_id in result_ids:
searcher.remove(cls.DOCUMENT_TYPE, result_id)
@classmethod
def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE):
"""
Process course for indexing
Arguments:
course_key (CourseKey) - course identifier
structure_key (CourseKey|LibraryKey) - course or library identifier
triggered_at (datetime) - provides time at which indexing was triggered;
useful for index updates - only things changed recently from that date
......@@ -64,13 +100,11 @@ class CoursewareSearchIndexer(object):
Number of items that have been added to the index
"""
error_list = []
searcher = SearchEngine.get_search_engine(INDEX_NAME)
searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
if not searcher:
return
location_info = {
"course": unicode(course_key),
}
location_info = cls._get_location_info(structure_key)
# Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `index_item`
indexed_count = {
......@@ -101,7 +135,7 @@ class CoursewareSearchIndexer(object):
if not item_index_dictionary and not item.has_children:
return
item_id = unicode(item.scope_ids.usage_id)
item_id = unicode(cls._id_modifier(item.scope_ids.usage_id))
indexed_items.add(item_id)
if item.has_children:
# determine if it's okay to skip adding the children herein based upon how recently any may have changed
......@@ -122,38 +156,24 @@ class CoursewareSearchIndexer(object):
if item.start:
item_index['start_date'] = item.start
searcher.index(DOCUMENT_TYPE, item_index)
searcher.index(cls.DOCUMENT_TYPE, item_index)
indexed_count["count"] += 1
except Exception as err: # pylint: disable=broad-except
# broad exception so that index operation does not fail on one item of many
log.warning('Could not index item: %s - %r', item.location, err)
error_list.append(_('Could not index item: {}').format(item.location))
def remove_deleted_items():
"""
remove any item that is present in the search index that is not present in updated list of indexed items
as we find items we can shorten the set of items to keep
"""
response = searcher.search(
doc_type=DOCUMENT_TYPE,
field_dictionary={"course": unicode(course_key)},
exclude_ids=indexed_items
)
result_ids = [result["data"]["id"] for result in response["results"]]
for result_id in result_ids:
searcher.remove(DOCUMENT_TYPE, result_id)
try:
with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
course = modulestore.get_course(course_key, depth=None)
for item in course.get_children():
structure = cls._fetch_top_level(modulestore, structure_key)
for item in structure.get_children():
index_item(item)
remove_deleted_items()
cls.remove_deleted_items(searcher, structure_key, indexed_items)
except Exception as err: # pylint: disable=broad-except
# broad exception so that index operation does not prevent the rest of the application from working
log.exception(
"Indexing error encountered, courseware index may be out of date %s - %r",
course_key,
structure_key,
err
)
error_list.append(_('General indexing error occurred'))
......@@ -164,31 +184,93 @@ class CoursewareSearchIndexer(object):
return indexed_count["count"]
@classmethod
def do_course_reindex(cls, modulestore, course_key):
def _do_reindex(cls, modulestore, structure_key):
"""
(Re)index all content within the given course, tracking the fact that a full reindex has taking place
(Re)index all content within the given structure (course or library),
tracking the fact that a full reindex has taken place
"""
indexed_count = cls.index_course(modulestore, course_key)
indexed_count = cls.index(modulestore, structure_key)
if indexed_count:
cls._track_index_request('edx.course.index.reindexed', indexed_count)
cls._track_index_request(cls.INDEX_EVENT['name'], cls.INDEX_EVENT['category'], indexed_count)
return indexed_count
@classmethod
def _track_index_request(cls, event_name, indexed_count):
def _track_index_request(cls, event_name, category, indexed_count):
"""Track content index requests.
Arguments:
event_name (str): Name of the event to be logged.
category (str): cat3gory of indexed items
indexed_count (int): number of indexed items
Returns:
None
"""
data = {
"indexed_count": indexed_count,
'category': 'courseware_index',
'category': category,
}
tracker.emit(
event_name,
data
)
class CoursewareSearchIndexer(SearchIndexBase):
INDEX_NAME = "courseware_index"
DOCUMENT_TYPE = "courseware_content"
INDEX_EVENT = {
'name': 'edx.course.index.reindexed',
'category': 'courseware_index'
}
@classmethod
def _fetch_top_level(self, modulestore, structure_key):
""" Fetch the item from the modulestore location """
return modulestore.get_course(structure_key, depth=None)
@classmethod
def _get_location_info(self, structure_key):
""" Builds location info dictionary """
return {"course": unicode(structure_key)}
@classmethod
def do_course_reindex(cls, modulestore, course_key):
"""
(Re)index all content within the given course, tracking the fact that a full reindex has taken place
"""
return cls._do_reindex(modulestore, course_key)
class LibrarySearchIndexer(SearchIndexBase):
INDEX_NAME = "library_index"
DOCUMENT_TYPE = "library_content"
INDEX_EVENT = {
'name': 'edx.library.index.reindexed',
'category': 'library_index'
}
@classmethod
def _fetch_top_level(self, modulestore, structure_key):
""" Fetch the item from the modulestore location """
return modulestore.get_library(structure_key, depth=None)
@classmethod
def _get_location_info(self, structure_key):
""" Builds location info dictionary """
return {"library": unicode(structure_key.replace(version_guid=None, branch=None))}
@classmethod
def _id_modifier(self, usage_id):
""" Modifies usage_id to submit to index """
return usage_id.replace(library_key=(usage_id.library_key.replace(version_guid=None, branch=None)))
@classmethod
def do_library_reindex(cls, modulestore, library_key):
"""
(Re)index all content within the given library, tracking the fact that a full reindex has taken place
"""
return cls._do_reindex(modulestore, library_key)
\ No newline at end of file
......@@ -2,6 +2,7 @@
Testing indexing of the courseware as it is changed
"""
import ddt
from lazy.lazy import lazy
import time
from datetime import datetime
from mock import patch
......@@ -15,15 +16,18 @@ from xmodule.modulestore.edit_info import EditInfoMixin
from xmodule.modulestore.inheritance import InheritanceMixin
from xmodule.modulestore.mixed import MixedModuleStore
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory
from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory, LibraryFactory
from xmodule.modulestore.tests.mongo_connection import MONGO_PORT_NUM, MONGO_HOST
from xmodule.modulestore.tests.test_cross_modulestore_import_export import MongoContentstoreBuilder
from xmodule.modulestore.tests.utils import create_modulestore_instance, LocationMixin, MixedSplitTestCase
from xmodule.tests import DATA_DIR
from xmodule.x_module import XModuleMixin
from search.search_engine_base import SearchEngine
from contentstore.courseware_index import CoursewareSearchIndexer, INDEX_NAME, DOCUMENT_TYPE, SearchIndexingError
from contentstore.courseware_index import (
CoursewareSearchIndexer, LibrarySearchIndexer, SearchIndexingError, get_indexer_for_location
)
from contentstore.signals import listen_for_course_publish
......@@ -123,9 +127,21 @@ class MixedWithOptionsTestCase(MixedSplitTestCase):
""" base version of setup_course_base is a no-op """
pass
@lazy
def searcher(self):
return self.get_search_engine()
def _get_default_search(self):
""" Returns field_dictionary for default search """
return {}
def get_search_engine(self):
""" Centralized call to getting the search engine for the test """
return SearchEngine.get_search_engine(INDEX_NAME)
return SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
def search(self, field_dictionary=None):
fields = field_dictionary if field_dictionary else self._get_default_search()
return self.searcher.search(field_dictionary=fields)
def _perform_test_using_store(self, store_type, test_to_perform):
""" Helper method to run a test function that uses a specific store """
......@@ -217,38 +233,39 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
def index_recent_changes(self, store, since_time):
""" index course using recent changes """
trigger_time = datetime.now(UTC)
return CoursewareSearchIndexer.index_course(
return CoursewareSearchIndexer.index(
store,
self.course.id,
triggered_at=trigger_time,
reindex_age=(trigger_time - since_time)
)
def _get_default_search(self):
return {"course": unicode(self.course.id)}
def _test_indexing_course(self, store):
""" indexing course tests """
searcher = self.get_search_engine()
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 0)
# Only published modules should be in the index
added_to_index = self.reindex_course(store)
self.assertEqual(added_to_index, 3)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 3)
# Publish the vertical as is, and any unpublished children should now be available
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
def _test_not_indexing_unpublished_content(self, store):
""" add a new one, only appers in index once added """
searcher = self.get_search_engine()
# Publish the vertical to start with
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# Now add a new unit to the existing vertical
......@@ -260,44 +277,42 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
modulestore=store,
)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# Now publish it and we should find it
# Publish the vertical as is, and everything should be available
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 5)
def _test_deleting_item(self, store):
""" test deleting an item """
searcher = self.get_search_engine()
# Publish the vertical to start with
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# just a delete should not change anything
self.delete_item(store, self.html_unit.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# but after publishing, we should no longer find the html_unit
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 3)
def _test_not_indexable(self, store):
""" test not indexable items """
searcher = self.get_search_engine()
# Publish the vertical to start with
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# Add a non-indexable item
......@@ -309,25 +324,24 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
modulestore=store,
)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
# even after publishing, we should not find the non-indexable item
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
def _test_start_date_propagation(self, store):
""" make sure that the start date is applied at the right level """
searcher = self.get_search_engine()
early_date = self.course.start
later_date = self.vertical.start
# Publish the vertical
self.publish_item(store, self.vertical.location)
self.reindex_course(store)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = self.search()
self.assertEqual(response["total"], 4)
results = response["results"]
......@@ -438,13 +452,13 @@ class TestLargeCourseDeletions(MixedWithOptionsTestCase):
def _clean_course_id(self):
""" Clean all documents from the index that have a specific course provided """
if self.course_id:
searcher = self.get_search_engine()
response = searcher.search(field_dictionary={"course": self.course_id})
response = self.searcher.search(field_dictionary={"course": self.course_id})
while response["total"] > 0:
for item in response["results"]:
searcher.remove(DOCUMENT_TYPE, item["data"]["id"])
searcher.remove(DOCUMENT_TYPE, item["data"]["id"])
response = searcher.search(field_dictionary={"course": self.course_id})
self.searcher.remove(TestCoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"])
self.searcher.remove(TestCoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"])
response = self.searcher.search(field_dictionary={"course": self.course_id})
self.course_id = None
def setUp(self):
......@@ -457,8 +471,8 @@ class TestLargeCourseDeletions(MixedWithOptionsTestCase):
def assert_search_count(self, expected_count):
""" Check that the search within this course will yield the expected number of results """
searcher = self.get_search_engine()
response = searcher.search(field_dictionary={"course": self.course_id})
response = self.searcher.search(field_dictionary={"course": self.course_id})
self.assertEqual(response["total"], expected_count)
def _do_test_large_course_deletion(self, store, load_factor):
......@@ -553,7 +567,7 @@ class TestTaskExecution(ModuleStoreTestCase):
def test_task_indexing_course(self):
""" Making sure that the receiver correctly fires off the task when invoked by signal """
searcher = SearchEngine.get_search_engine(INDEX_NAME)
searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
self.assertEqual(response["total"], 0)
......@@ -563,3 +577,173 @@ class TestTaskExecution(ModuleStoreTestCase):
# Note that this test will only succeed if celery is working in inline mode
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
self.assertEqual(response["total"], 3)
@ddt.ddt
class TestLibrarySearchIndexer(MixedWithOptionsTestCase):
""" Tests the operation of the CoursewareSearchIndexer """
def setUp(self):
super(TestLibrarySearchIndexer, self).setUp()
self.library = None
self.html_unit1 = None
self.html_unit2 = None
def setup_course_base(self, store):
"""
Set up the for the course outline tests.
"""
self.library = LibraryFactory.create(modulestore=store)
self.html_unit1 = ItemFactory.create(
parent_location=self.library.location,
category="html",
display_name="Html Content",
modulestore=store,
publish_item=False,
)
self.html_unit2 = ItemFactory.create(
parent_location=self.library.location,
category="html",
display_name="Html Content 2",
modulestore=store,
publish_item=False,
)
def _get_default_search(self):
""" Returns field_dictionary for default search """
return {"library": unicode(self.library.location.replace(version_guid=None, branch=None))}
def reindex_library(self, store):
""" kick off complete reindex of the course """
return LibrarySearchIndexer.do_library_reindex(store, self.library.location.library_key)
def _get_contents(self, response):
""" Extracts contents from search response """
return [item['contents'] for item in response['results']]
def index_recent_changes(self, store, since_time):
""" index course using recent changes """
trigger_time = datetime.now(UTC)
return LibrarySearchIndexer.index(
store,
self.library.id,
triggered_at=trigger_time,
reindex_age=(trigger_time - since_time)
)
def _test_indexing_library(self, store):
""" indexing course tests """
response = self.search()
self.assertEqual(response["total"], 2)
added_to_index = self.reindex_library(store)
self.assertEqual(added_to_index, 2)
response = self.search()
self.assertEqual(response["total"], 2)
def _test_creating_item(self, store):
""" test updating an item """
response = self.search()
self.assertEqual(response["total"], 2)
# updating a library item causes immediate reindexing
data = "Some data"
ItemFactory.create(
parent_location=self.library.location,
category="html",
display_name="Html Content 3",
data=data,
modulestore=store,
publish_item=False,
)
response = self.search()
self.assertEqual(response["total"], 3)
html_contents = [cont['html_content'] for cont in self._get_contents(response)]
self.assertIn(data, html_contents)
def _test_updating_item(self, store):
""" test updating an item """
response = self.search()
self.assertEqual(response["total"], 2)
# updating a library item causes immediate reindexing
new_data = "I'm new data"
self.html_unit1.data = new_data
self.reindex_library(store)
response = self.search()
self.assertEqual(response["total"], 2)
html_contents = [cont['html_content'] for cont in self._get_contents(response)]
self.assertIn(new_data, html_contents)
def _test_deleting_item(self, store):
""" test deleting an item """
response = self.search()
self.assertEqual(response["total"], 2)
# deleting a library item causes immediate reindexing
self.delete_item(store, self.html_unit1.location)
self.reindex_library(store)
response = self.search()
self.assertEqual(response["total"], 1)
def _test_not_indexable(self, store):
""" test not indexable items """
response = self.search()
self.assertEqual(response["total"], 2)
# Add a non-indexable item
ItemFactory.create(
parent_location=self.library.location,
category="problem",
display_name="Some other content",
publish_item=False,
modulestore=store,
)
self.reindex_library(store)
response = self.search()
self.assertEqual(response["total"], 2)
@patch('django.conf.settings.SEARCH_ENGINE', None)
def _test_search_disabled(self, store):
""" if search setting has it as off, confirm that nothing is indexed """
indexed_count = self.reindex_library(store)
self.assertFalse(indexed_count)
@patch('django.conf.settings.SEARCH_ENGINE', 'search.tests.utils.ErroringIndexEngine')
def _test_exception(self, store):
""" Test that exception within indexing yields a SearchIndexingError """
with self.assertRaises(SearchIndexingError):
self.reindex_library(store)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_indexing_library(self, store_type):
self._perform_test_using_store(store_type, self._test_indexing_library)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_updating_item(self, store_type):
self._perform_test_using_store(store_type, self._test_updating_item)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_creating_item(self, store_type):
self._perform_test_using_store(store_type, self._test_creating_item)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_deleting_item(self, store_type):
self._perform_test_using_store(store_type, self._test_deleting_item)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_not_indexable(self, store_type):
self._perform_test_using_store(store_type, self._test_not_indexable)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_search_disabled(self, store_type):
self._perform_test_using_store(store_type, self._test_search_disabled)
@ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split)
def test_exception(self, store_type):
self._perform_test_using_store(store_type, self._test_exception)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment