Commit c776272a by Martyn James

Merge pull request #7654 from edx/mjames/SOL-280

Index course information alongside courseware
parents c006befd b9f1c5db
......@@ -3,14 +3,19 @@ from __future__ import absolute_import
from abc import ABCMeta, abstractmethod
from datetime import timedelta
import logging
import re
from six import add_metaclass
from django.conf import settings
from django.utils.translation import ugettext as _
from contentstore.utils import course_image_url
from course_modes.models import CourseMode
from eventtracking import tracker
from search.search_engine_base import SearchEngine
from xmodule.annotator_mixin import html_to_text
from xmodule.modulestore import ModuleStoreEnum
from xmodule.library_tools import normalize_key_for_search
from search.search_engine_base import SearchEngine
# REINDEX_AGE is the default amount of time that we look back for changes
# that might have happened. If we are provided with a time at which the
......@@ -22,6 +27,25 @@ REINDEX_AGE = timedelta(0, 60) # 60 seconds
log = logging.getLogger('edx.modulestore')
def strip_html_content_to_text(html_content):
""" Gets only the textual part for html content - useful for building text to be searched """
# Removing HTML-encoded non-breaking space characters
text_content = re.sub(r"(\s| |//)+", " ", html_to_text(html_content))
# Removing HTML CDATA
text_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", text_content)
# Removing HTML comments
text_content = re.sub(r"<!--.*-->", "", text_content)
return text_content
def indexing_is_enabled():
"""
Checks to see if the indexing feature is enabled
"""
return settings.FEATURES.get('ENABLE_COURSEWARE_INDEX', False)
class SearchIndexingError(Exception):
""" Indicates some error(s) occured during indexing """
......@@ -94,6 +118,8 @@ class SearchIndexerBase(object):
Process course for indexing
Arguments:
modulestore - modulestore object to use for operations
structure_key (CourseKey|LibraryKey) - course or library identifier
triggered_at (datetime) - provides time at which indexing was triggered;
......@@ -174,6 +200,11 @@ class SearchIndexerBase(object):
try:
with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
structure = cls._fetch_top_level(modulestore, structure_key)
# First perform any additional indexing from the structure object
cls.supplemental_index_information(modulestore, structure)
# Now index the content
for item in structure.get_children():
index_item(item)
cls.remove_deleted_items(searcher, structure_key, indexed_items)
......@@ -224,6 +255,21 @@ class SearchIndexerBase(object):
data
)
@classmethod
def supplemental_index_information(cls, modulestore, structure):
"""
Perform any supplemental indexing given that the structure object has
already been loaded. Base implementation performs no operation.
Arguments:
modulestore - modulestore object used during the indexing operation
structure - structure object loaded during the indexing job
Returns:
None
"""
pass
class CoursewareSearchIndexer(SearchIndexerBase):
"""
......@@ -260,6 +306,13 @@ class CoursewareSearchIndexer(SearchIndexerBase):
"""
return cls._do_reindex(modulestore, course_key)
@classmethod
def supplemental_index_information(cls, modulestore, structure):
"""
Perform additional indexing from loaded structure object
"""
CourseAboutSearchIndexer.index_about_information(modulestore, structure)
class LibrarySearchIndexer(SearchIndexerBase):
"""
......@@ -300,3 +353,176 @@ class LibrarySearchIndexer(SearchIndexerBase):
(Re)index all content within the given library, tracking the fact that a full reindex has taken place
"""
return cls._do_reindex(modulestore, library_key)
class AboutInfo(object):
""" About info structure to contain
1) Property name to use
2) Where to add in the index (using flags above)
3) Where to source the properties value
"""
# Bitwise Flags for where to index the information
#
# ANALYSE - states that the property text contains content that we wish to be able to find matched within
# e.g. "joe" should yield a result for "I'd like to drink a cup of joe"
#
# PROPERTY - states that the property text should be a property of the indexed document, to be returned with the
# results: search matches will only be made on exact string matches
# e.g. "joe" will only match on "joe"
#
# We are using bitwise flags because one may want to add the property to EITHER or BOTH parts of the index
# e.g. university name is desired to be analysed, so that a search on "Oxford" will match
# property values "University of Oxford" and "Oxford Brookes University",
# but it is also a useful property, because within a (future) filtered search a user
# may have chosen to filter courses from "University of Oxford"
#
# see https://wiki.python.org/moin/BitwiseOperators for information about bitwise shift operator used below
#
ANALYSE = 1 << 0 # Add the information to the analysed content of the index
PROPERTY = 1 << 1 # Add the information as a property of the object being indexed (not analysed)
def __init__(self, property_name, index_flags, source_from):
self.property_name = property_name
self.index_flags = index_flags
self.source_from = source_from
def get_value(self, **kwargs):
""" get the value for this piece of information, using the correct source """
return self.source_from(self, **kwargs)
def from_about_dictionary(self, **kwargs):
""" gets the value from the kwargs provided 'about_dictionary' """
about_dictionary = kwargs.get('about_dictionary', None)
if not about_dictionary:
raise ValueError("Context dictionary does not contain expected argument 'about_dictionary'")
return about_dictionary.get(self.property_name, None)
def from_course_property(self, **kwargs):
""" gets the value from the kwargs provided 'course' """
course = kwargs.get('course', None)
if not course:
raise ValueError("Context dictionary does not contain expected argument 'course'")
return getattr(course, self.property_name, None)
def from_course_mode(self, **kwargs):
""" fetches the available course modes from the CourseMode model """
course = kwargs.get('course', None)
if not course:
raise ValueError("Context dictionary does not contain expected argument 'course'")
return [mode.slug for mode in CourseMode.modes_for_course(course.id)]
# Source location options - either from the course or the about info
FROM_ABOUT_INFO = from_about_dictionary
FROM_COURSE_PROPERTY = from_course_property
FROM_COURSE_MODE = from_course_mode
class CourseAboutSearchIndexer(object):
"""
Class to perform indexing of about information from course object
"""
DISCOVERY_DOCUMENT_TYPE = "course_info"
INDEX_NAME = CoursewareSearchIndexer.INDEX_NAME
# List of properties to add to the index - each item in the list is an instance of AboutInfo object
ABOUT_INFORMATION_TO_INCLUDE = [
AboutInfo("advertised_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("announcement", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("effort", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("display_name", AboutInfo.ANALYSE, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("overview", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("title", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("university", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("number", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("short_description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("key_dates", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("video", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("course_staff_short", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("course_staff_extended", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("requirements", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("syllabus", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("textbook", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("faq", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("more_info", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("ocw_links", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
AboutInfo("enrollment_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("enrollment_end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("org", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
AboutInfo("modes", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_MODE),
]
@classmethod
def index_about_information(cls, modulestore, course):
"""
Add the given course to the course discovery index
Arguments:
modulestore - modulestore object to use for operations
course - course object from which to take properties, locate about information
"""
searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
if not searcher:
return
course_id = unicode(course.id)
course_info = {
'id': course_id,
'course': course_id,
'content': {},
'image_url': course_image_url(course),
}
# load data for all of the 'about' modules for this course into a dictionary
about_dictionary = {
item.location.name: item.data
for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
}
about_context = {
"course": course,
"about_dictionary": about_dictionary,
}
for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
# Broad exception handler so that a single bad property does not scupper the collection of others
try:
section_content = about_information.get_value(**about_context)
except: # pylint: disable=bare-except
section_content = None
log.warning(
"Course discovery could not collect property %s for course %s",
about_information.property_name,
course_id,
exc_info=True,
)
if section_content:
if about_information.index_flags & AboutInfo.ANALYSE:
analyse_content = section_content
if isinstance(section_content, basestring):
analyse_content = strip_html_content_to_text(section_content)
course_info['content'][about_information.property_name] = analyse_content
if about_information.index_flags & AboutInfo.PROPERTY:
course_info[about_information.property_name] = section_content
# Broad exception handler to protect around and report problems with indexing
try:
searcher.index(cls.DISCOVERY_DOCUMENT_TYPE, course_info)
except: # pylint: disable=bare-except
log.exception(
"Course discovery indexing error encountered, course discovery index may be out of date %s",
course_id,
)
raise
log.debug(
"Successfully added %s course to the course discovery index",
course_id
)
......@@ -10,10 +10,12 @@ from pytz import UTC
from uuid import uuid4
from unittest import skip
from course_modes.models import CourseMode
from xmodule.library_tools import normalize_key_for_search
from xmodule.modulestore import ModuleStoreEnum
from xmodule.modulestore.django import SignalHandler
from xmodule.modulestore.edit_info import EditInfoMixin
from xmodule.modulestore.exceptions import ItemNotFoundError
from xmodule.modulestore.inheritance import InheritanceMixin
from xmodule.modulestore.mixed import MixedModuleStore
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
......@@ -26,7 +28,12 @@ from xmodule.x_module import XModuleMixin
from search.search_engine_base import SearchEngine
from contentstore.courseware_index import CoursewareSearchIndexer, LibrarySearchIndexer, SearchIndexingError
from contentstore.courseware_index import (
CoursewareSearchIndexer,
LibrarySearchIndexer,
SearchIndexingError,
CourseAboutSearchIndexer,
)
from contentstore.signals import listen_for_course_publish, listen_for_library_update
......@@ -120,6 +127,7 @@ class MixedWithOptionsTestCase(MixedSplitTestCase):
}
INDEX_NAME = None
DOCUMENT_TYPE = None
def setUp(self):
super(MixedWithOptionsTestCase, self).setUp()
......@@ -140,7 +148,7 @@ class MixedWithOptionsTestCase(MixedSplitTestCase):
def search(self, field_dictionary=None):
""" Performs index search according to passed parameters """
fields = field_dictionary if field_dictionary else self._get_default_search()
return self.searcher.search(field_dictionary=fields)
return self.searcher.search(field_dictionary=fields, doc_type=self.DOCUMENT_TYPE)
def _perform_test_using_store(self, store_type, test_to_perform):
""" Helper method to run a test function that uses a specific store """
......@@ -172,6 +180,26 @@ class MixedWithOptionsTestCase(MixedSplitTestCase):
with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred):
store.update_item(item, ModuleStoreEnum.UserID.test)
def update_about_item(self, store, about_key, data):
"""
Update the about item with the new data blob. If data is None, then
delete the about item.
"""
temploc = self.course.id.make_usage_key('about', about_key)
if data is None:
try:
self.delete_item(store, temploc)
# Ignore an attempt to delete an item that doesn't exist
except ValueError:
pass
else:
try:
about_item = store.get_item(temploc)
except ItemNotFoundError:
about_item = store.create_xblock(self.course.runtime, self.course.id, 'about', about_key)
about_item.data = data
store.update_item(about_item, ModuleStoreEnum.UserID.test, allow_not_found=True)
@ddt.ddt
class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
......@@ -228,6 +256,7 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
)
INDEX_NAME = CoursewareSearchIndexer.INDEX_NAME
DOCUMENT_TYPE = CoursewareSearchIndexer.DOCUMENT_TYPE
def reindex_course(self, store):
""" kick off complete reindex of the course """
......@@ -407,6 +436,55 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
indexed_count = self.reindex_course(store)
self.assertEqual(indexed_count, 7)
def _test_course_about_property_index(self, store):
""" Test that informational properties in the course object end up in the course_info index """
display_name = "Help, I need somebody!"
self.course.display_name = display_name
self.update_item(store, self.course)
self.reindex_course(store)
response = self.searcher.search(
doc_type=CourseAboutSearchIndexer.DISCOVERY_DOCUMENT_TYPE,
field_dictionary={"course": unicode(self.course.id)}
)
self.assertEqual(response["total"], 1)
self.assertEqual(response["results"][0]["data"]["content"]["display_name"], display_name)
def _test_course_about_store_index(self, store):
""" Test that informational properties in the about store end up in the course_info index """
short_description = "Not just anybody"
self.update_about_item(store, "short_description", short_description)
self.reindex_course(store)
response = self.searcher.search(
doc_type=CourseAboutSearchIndexer.DISCOVERY_DOCUMENT_TYPE,
field_dictionary={"course": unicode(self.course.id)}
)
self.assertEqual(response["total"], 1)
self.assertEqual(response["results"][0]["data"]["content"]["short_description"], short_description)
def _test_course_about_mode_index(self, store):
""" Test that informational properties in the course modes store end up in the course_info index """
honour_mode = CourseMode(
course_id=unicode(self.course.id),
mode_slug=CourseMode.HONOR,
mode_display_name=CourseMode.HONOR
)
honour_mode.save()
verified_mode = CourseMode(
course_id=unicode(self.course.id),
mode_slug=CourseMode.VERIFIED,
mode_display_name=CourseMode.VERIFIED
)
verified_mode.save()
self.reindex_course(store)
response = self.searcher.search(
doc_type=CourseAboutSearchIndexer.DISCOVERY_DOCUMENT_TYPE,
field_dictionary={"course": unicode(self.course.id)}
)
self.assertEqual(response["total"], 1)
self.assertIn(CourseMode.HONOR, response["results"][0]["data"]["modes"])
self.assertIn(CourseMode.VERIFIED, response["results"][0]["data"]["modes"])
@patch('django.conf.settings.SEARCH_ENGINE', 'search.tests.utils.ErroringIndexEngine')
def _test_exception(self, store):
""" Test that exception within indexing yields a SearchIndexingError """
......@@ -446,6 +524,18 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase):
def test_exception(self, store_type):
self._perform_test_using_store(store_type, self._test_exception)
@ddt.data(*WORKS_WITH_STORES)
def test_course_about_property_index(self, store_type):
self._perform_test_using_store(store_type, self._test_course_about_property_index)
@ddt.data(*WORKS_WITH_STORES)
def test_course_about_store_index(self, store_type):
self._perform_test_using_store(store_type, self._test_course_about_store_index)
@ddt.data(*WORKS_WITH_STORES)
def test_course_about_mode_index(self, store_type):
self._perform_test_using_store(store_type, self._test_course_about_mode_index)
@patch('django.conf.settings.SEARCH_ENGINE', 'search.tests.utils.ForceRefreshElasticSearchEngine')
@ddt.ddt
......@@ -462,7 +552,6 @@ class TestLargeCourseDeletions(MixedWithOptionsTestCase):
while response["total"] > 0:
for item in response["results"]:
self.searcher.remove(CoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"])
self.searcher.remove(CoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"])
response = self.searcher.search(field_dictionary={"course": self.course_id})
self.course_id = None
......@@ -590,13 +679,19 @@ class TestTaskExecution(ModuleStoreTestCase):
def test_task_indexing_course(self):
""" Making sure that the receiver correctly fires off the task when invoked by signal """
searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = searcher.search(
doc_type=CoursewareSearchIndexer.DOCUMENT_TYPE,
field_dictionary={"course": unicode(self.course.id)}
)
self.assertEqual(response["total"], 0)
listen_for_course_publish(self, self.course.id)
# Note that this test will only succeed if celery is working in inline mode
response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
response = searcher.search(
doc_type=CoursewareSearchIndexer.DOCUMENT_TYPE,
field_dictionary={"course": unicode(self.course.id)}
)
self.assertEqual(response["total"], 3)
def test_task_library_update(self):
......@@ -650,6 +745,7 @@ class TestLibrarySearchIndexer(MixedWithOptionsTestCase):
)
INDEX_NAME = LibrarySearchIndexer.INDEX_NAME
DOCUMENT_TYPE = LibrarySearchIndexer.DOCUMENT_TYPE
def _get_default_search(self):
""" Returns field_dictionary for default search """
......
"""
Content library unit tests that require the CMS runtime.
"""
from django.test.utils import override_settings
from contentstore.tests.utils import AjaxEnabledTestClient, parse_json
from contentstore.utils import reverse_url, reverse_usage_url, reverse_library_url
from contentstore.views.item import _duplicate_item
......@@ -730,6 +731,7 @@ class TestLibraryAccess(SignalDisconnectTestMixin, LibraryTestCase):
@ddt.ddt
@override_settings(SEARCH_ENGINE=None)
class TestOverrides(LibraryTestCase):
"""
Test that overriding block Scope.settings fields from a library in a specific course works
......
......@@ -43,7 +43,7 @@ git+https://github.com/pmitros/pyfs.git@96e1922348bfe6d99201b9512a9ed946c87b7e0b
-e git+https://github.com/edx/edx-val.git@d6087908aa3dd05ceaa7f56a21284f86c53cb3f0#egg=edx-val
-e git+https://github.com/pmitros/RecommenderXBlock.git@9b07e807c89ba5761827d0387177f71aa57ef056#egg=recommender-xblock
-e git+https://github.com/edx/edx-milestones.git@547f2250ee49e73ce8d7ff4e78ecf1b049892510#egg=edx-milestones
-e git+https://github.com/edx/edx-search.git@6ce8b25a2539370de32dc7cb643ae27c9b8f798d#egg=edx-search
-e git+https://github.com/edx/edx-search.git@9d566b88fd80cb0b60c052eee2bee30eb9f35b9c#egg=edx-search
git+https://github.com/edx/edx-lint.git@8bf82a32ecb8598c415413df66f5232ab8d974e9#egg=edx_lint==0.2.1
-e git+https://github.com/edx/xblock-utils.git@581ed636c862b286002bb9a3724cc883570eb54c#egg=xblock-utils
-e git+https://github.com/edx-solutions/xblock-google-drive.git@138e6fa0bf3a2013e904a085b9fed77dab7f3f21#egg=xblock-google-drive
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment