courseware_index.py 27 KB
Newer Older
1 2
""" Code to allow module store to interface with courseware index """
from __future__ import absolute_import
3

4
import logging
5
import re
6 7
from abc import ABCMeta, abstractmethod
from datetime import timedelta
8 9

from django.conf import settings
10
from django.core.urlresolvers import resolve
11 12 13 14
from django.utils.translation import ugettext as _
from django.utils.translation import ugettext_lazy
from search.search_engine_base import SearchEngine
from six import add_metaclass
15

16
from contentstore.course_group_config import GroupConfiguration
17
from course_modes.models import CourseMode
18
from eventtracking import tracker
19
from openedx.core.lib.courses import course_image_url
20
from xmodule.annotator_mixin import html_to_text
21
from xmodule.library_tools import normalize_key_for_search
22
from xmodule.modulestore import ModuleStoreEnum
23 24 25 26 27 28 29 30 31 32 33

# REINDEX_AGE is the default look-back window used for incremental index
# updates. When the caller supplies the time at which indexing was triggered,
# only items whose subtree was edited within this window before the trigger
# are re-indexed; everything older is assumed to be in the index already.
REINDEX_AGE = timedelta(seconds=60)

# Module logger; indexing problems are reported under the modulestore logger name
log = logging.getLogger('edx.modulestore')


34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
def strip_html_content_to_text(html_content):
    """
    Gets only the textual part for html content - useful for building text to be searched

    Arguments:
    html_content - HTML markup to reduce to text; it is passed through `html_to_text` first

    Returns:
    The text with whitespace runs collapsed and CDATA sections / HTML comments removed
    """
    # Collapse runs of whitespace, HTML-encoded non-breaking spaces ("&nbsp;")
    # and "//" into a single space. Doing this first also folds newlines, so
    # the patterns below never need to match across line breaks.
    text_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))
    # Removing HTML CDATA - non-greedy so that text lying between two separate
    # CDATA sections is preserved
    text_content = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text_content)
    # Removing HTML comments - non-greedy for the same reason
    text_content = re.sub(r"<!--.*?-->", "", text_content)

    return text_content


def indexing_is_enabled():
    """
    Report whether the courseware indexing feature is switched on.

    Reads the 'ENABLE_COURSEWARE_INDEX' entry of settings.FEATURES and
    falls back to False when the entry is absent.
    """
    feature_flags = settings.FEATURES
    return feature_flags.get('ENABLE_COURSEWARE_INDEX', False)


53 54 55 56 57 58 59 60
class SearchIndexingError(Exception):
    """
    Raised when one or more errors occurred during indexing.

    The individual problems are carried on the `error_list` attribute.
    """

    def __init__(self, message, error_list):
        # Keep the per-item problems alongside the summary message
        self.error_list = error_list
        super(SearchIndexingError, self).__init__(message)


61
@add_metaclass(ABCMeta)
class SearchIndexerBase(object):
    """
    Base class to perform indexing for courseware or library search from different modulestores

    Subclasses set the class-level configuration below and implement the
    abstract hooks (`normalize_structure_key`, `_fetch_top_level` and
    `_get_location_info`); `index` drives the actual indexing run.
    """
    # Redundant with the `add_metaclass` decorator above (and ignored by
    # Python 3); retained so the class definition is unchanged under Python 2.
    __metaclass__ = ABCMeta

    # Name of the search index handed to SearchEngine.get_search_engine
    INDEX_NAME = None
    # Document type used when searching, indexing and removing items
    DOCUMENT_TYPE = None
    # Key looked up in settings.FEATURES by `indexing_is_enabled`
    ENABLE_INDEXING_KEY = None

    # Event name and category passed to the tracker after a reindex
    INDEX_EVENT = {
        'name': None,
        'category': None
    }

    @classmethod
    def indexing_is_enabled(cls):
        """
        Checks to see if the indexing feature is enabled
        """
        return settings.FEATURES.get(cls.ENABLE_INDEXING_KEY, False)

    @classmethod
    @abstractmethod
    def normalize_structure_key(cls, structure_key):
        """ Normalizes structure key for use in indexing """

    @classmethod
    @abstractmethod
    def _fetch_top_level(cls, modulestore, structure_key):
        """ Fetch the item from the modulestore location """

    @classmethod
    @abstractmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Builds location info dictionary """

    @classmethod
    def _id_modifier(cls, usage_id):
        """ Modifies usage_id to submit to index (base implementation returns it unchanged) """
        return usage_id

    @classmethod
    def remove_deleted_items(cls, searcher, structure_key, exclude_items):
        """
        Remove any item that is present in the search index but is not present
        in the updated collection of indexed items.

        Arguments:
        searcher - search engine object used to query and remove documents

        structure_key - normalized structure key; used to build the location filter

        exclude_items - iterable of item ids that must remain in the index
        """
        response = searcher.search(
            doc_type=cls.DOCUMENT_TYPE,
            field_dictionary=cls._get_location_info(structure_key),
            exclude_dictionary={"id": list(exclude_items)}
        )
        # Everything the search returned is outside the keep-set, so remove it
        result_ids = [result["data"]["id"] for result in response["results"]]
        searcher.remove(cls.DOCUMENT_TYPE, result_ids)

    @classmethod
    def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE):
        """
        Process course for indexing

        Arguments:
        modulestore - modulestore object to use for operations

        structure_key (CourseKey|LibraryKey) - course or library identifier

        triggered_at (datetime) - provides time at which indexing was triggered;
            useful for index updates - only things changed recently from that date
            (within `reindex_age`) will have their index updated, others skip
            updating their index but are still walked through in order to identify
            which items may need to be removed from the index
            If None, then a full reindex takes place

        reindex_age (timedelta) - how far back from `triggered_at` a subtree edit
            may lie for its items still to be re-indexed; defaults to REINDEX_AGE

        Returns:
        Number of items that have been added to the index, or None when no
        search engine is available for INDEX_NAME

        Raises:
        SearchIndexingError - when any item could not be indexed or a general
            error occurred; the messages are carried on its `error_list`
        """
        error_list = []
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        structure_key = cls.normalize_structure_key(structure_key)
        location_info = cls._get_location_info(structure_key)

        # Wrap counter in dictionary - the nested function `prepare_item_index`
        # cannot rebind a plain integer in this scope (no `nonlocal` in Python 2),
        # but it can mutate a dictionary entry
        indexed_count = {
            "count": 0
        }

        # indexed_items is the set of ids of all the items that we wish to remain
        # in the index, whether or not we are planning to actually update their
        # index. This is used in order to build a query to remove those items not
        # in this set - those are ready to be destroyed
        indexed_items = set()

        # items_index is a list of all the items index dictionaries.
        # it is used to collect all indexes and index them using bulk API,
        # instead of per item index API call.
        items_index = []

        def get_item_location(item):
            """
            Gets the version agnostic item location
            """
            return item.location.version_agnostic().replace(branch=None)

        def prepare_item_index(item, skip_index=False, groups_usage_info=None):
            """
            Add this item to the items_index and indexed_items list

            Arguments:
            item - item to add to index, its children will be processed recursively

            skip_index - simply walk the children in the tree, the content change is
                older than the REINDEX_AGE window and would have been already indexed.
                This should really only be passed from the recursive child calls when
                this method has determined that it is safe to do so

            groups_usage_info - dictionary mapping item location strings to the
                list of content groups using them; None is treated as empty

            Returns:
            item_content_groups - content groups assigned to indexed item
            """
            is_indexable = hasattr(item, "index_dictionary")
            item_index_dictionary = item.index_dictionary() if is_indexable else None
            # if it's not indexable and it does not have children, then ignore
            if not item_index_dictionary and not item.has_children:
                return

            # The base `fetch_group_usage` returns None; normalise to a dictionary
            # so that the split_test handling below can record group usage without
            # failing, and so the recorded usage is handed on to the children.
            if groups_usage_info is None:
                groups_usage_info = {}

            item_content_groups = None

            if item.category == "split_test":
                split_partition = item.get_selected_partition()
                for split_test_child in item.get_children():
                    if split_partition:
                        for group in split_partition.groups:
                            group_id = unicode(group.id)
                            child_location = item.group_id_to_child.get(group_id, None)
                            if child_location == split_test_child.location:
                                # The matching child and each of its components
                                # belong to this group only
                                groups_usage_info.update({
                                    unicode(get_item_location(split_test_child)): [group_id],
                                })
                                for component in split_test_child.get_children():
                                    groups_usage_info.update({
                                        unicode(get_item_location(component)): [group_id]
                                    })

            if groups_usage_info:
                item_location = get_item_location(item)
                item_content_groups = groups_usage_info.get(unicode(item_location), None)

            item_id = unicode(cls._id_modifier(item.scope_ids.usage_id))
            indexed_items.add(item_id)
            if item.has_children:
                # determine if it's okay to skip adding the children herein based upon how recently any may have changed
                skip_child_index = skip_index or \
                    (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age)
                children_groups_usage = []
                for child_item in item.get_children():
                    if modulestore.has_published_version(child_item):
                        children_groups_usage.append(
                            prepare_item_index(
                                child_item,
                                skip_index=skip_child_index,
                                groups_usage_info=groups_usage_info
                            )
                        )
                # A child that reported no content groups clears the groups
                # recorded for this item
                if None in children_groups_usage:
                    item_content_groups = None

            if skip_index or not item_index_dictionary:
                return

            item_index = {}
            # if it has something to add to the index, then add it
            try:
                item_index.update(location_info)
                item_index.update(item_index_dictionary)
                item_index['id'] = item_id
                if item.start:
                    item_index['start_date'] = item.start
                item_index['content_groups'] = item_content_groups if item_content_groups else None
                item_index.update(cls.supplemental_fields(item))
                items_index.append(item_index)
                indexed_count["count"] += 1
                return item_content_groups
            except Exception as err:  # pylint: disable=broad-except
                # broad exception so that index operation does not fail on one item of many
                log.warning('Could not index item: %s - %r', item.location, err)
                error_list.append(_('Could not index item: {}').format(item.location))

        try:
            with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
                structure = cls._fetch_top_level(modulestore, structure_key)
                groups_usage_info = cls.fetch_group_usage(modulestore, structure)

                # First perform any additional indexing from the structure object
                cls.supplemental_index_information(modulestore, structure)

                # Now index the content
                for item in structure.get_children():
                    prepare_item_index(item, groups_usage_info=groups_usage_info)
                searcher.index(cls.DOCUMENT_TYPE, items_index)
                cls.remove_deleted_items(searcher, structure_key, indexed_items)
        except Exception as err:  # pylint: disable=broad-except
            # broad exception so that index operation does not prevent the rest of the application from working
            log.exception(
                "Indexing error encountered, courseware index may be out of date %s - %r",
                structure_key,
                err
            )
            error_list.append(_('General indexing error occurred'))

        if error_list:
            raise SearchIndexingError('Error(s) present during indexing', error_list)

        return indexed_count["count"]

    @classmethod
    def _do_reindex(cls, modulestore, structure_key):
        """
        (Re)index all content within the given structure (course or library),
        tracking the fact that a full reindex has taken place

        Returns:
        Whatever `index` returned; the tracking event is only emitted when
        that value is truthy
        """
        indexed_count = cls.index(modulestore, structure_key)
        if indexed_count:
            cls._track_index_request(cls.INDEX_EVENT['name'], cls.INDEX_EVENT['category'], indexed_count)
        return indexed_count

    @classmethod
    def _track_index_request(cls, event_name, category, indexed_count):
        """Track content index requests.

        Arguments:
            event_name (str):  Name of the event to be logged.
            category (str): category of indexed items
            indexed_count (int): number of indexed items
        Returns:
            None

        """
        data = {
            "indexed_count": indexed_count,
            'category': category,
        }

        tracker.emit(
            event_name,
            data
        )

    @classmethod
    def fetch_group_usage(cls, modulestore, structure):  # pylint: disable=unused-argument
        """
        Base implementation of fetch group usage on course/library.
        Returns None, meaning that no group usage information is available.
        """
        return None

    @classmethod
    def supplemental_index_information(cls, modulestore, structure):
        """
        Perform any supplemental indexing given that the structure object has
        already been loaded. Base implementation performs no operation.

        Arguments:
            modulestore - modulestore object used during the indexing operation
            structure - structure object loaded during the indexing job

        Returns:
            None
        """
        pass

    @classmethod
    def supplemental_fields(cls, item):  # pylint: disable=unused-argument
        """
        Any supplemental fields that get added to the index for the specified
        item. Base implementation returns an empty dictionary
        """
        return {}

341

E. Kolpakov committed
342
class CoursewareSearchIndexer(SearchIndexerBase):
    """
    Indexer for courseware search: indexes the published content of a course
    """
    INDEX_NAME = "courseware_index"
    DOCUMENT_TYPE = "courseware_content"
    ENABLE_INDEXING_KEY = 'ENABLE_COURSEWARE_INDEX'

    INDEX_EVENT = {
        'name': 'edx.course.index.reindexed',
        'category': 'courseware_index'
    }

    # Placeholder used in the location path for modules without a display name
    UNNAMED_MODULE_NAME = ugettext_lazy("(Unnamed)")

    @classmethod
    def normalize_structure_key(cls, structure_key):
        """ Course keys are used for indexing exactly as given """
        return structure_key

    @classmethod
    def _fetch_top_level(cls, modulestore, structure_key):
        """ Load the course for the given key from the modulestore """
        return modulestore.get_course(structure_key, depth=None)

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Build the dictionary locating a document by course id and org """
        course_id = unicode(normalized_structure_key)
        return {"course": course_id, "org": normalized_structure_key.org}

    @classmethod
    def do_course_reindex(cls, modulestore, course_key):
        """
        Run a full (re)index of the given course, recording that it took place
        """
        return cls._do_reindex(modulestore, course_key)

    @classmethod
    def fetch_group_usage(cls, modulestore, structure):
        """
        Build a dictionary mapping usage key strings to the list of group
        names that use them, gathered from both the partitions usage info and
        the content groups items usage info of GroupConfiguration
        """
        usage_pairs = GroupConfiguration.get_partitions_usage_info(modulestore, structure).items()
        usage_pairs.extend(
            GroupConfiguration.get_content_groups_items_usage_info(modulestore, structure).items()
        )

        usage_by_key = {}
        for group_name, modules in usage_pairs:
            for module in modules:
                # The usage key is carried in the keyword arguments of the resolved module URL
                _view, _args, url_kwargs = resolve(module['url'])
                usage_key_string = unicode(url_kwargs['usage_key_string'])
                usage_by_key.setdefault(usage_key_string, []).append(group_name)
        return usage_by_key

    @classmethod
    def supplemental_index_information(cls, modulestore, structure):
        """
        Also index the course about information from the loaded course
        """
        CourseAboutSearchIndexer.index_about_information(modulestore, structure)

    @classmethod
    def supplemental_fields(cls, item):
        """
        Add the course name and location path for the item

        Display names are collected from the item up through its parents.
        The outermost name is the course name, and the following three names
        form the navigable path within the edx application; deeper levels are
        left out because a full path to deep children would be confusing.
        """
        names = []
        ancestor = item
        while ancestor is not None:
            names.append(ancestor.display_name or unicode(cls.UNNAMED_MODULE_NAME))
            ancestor = ancestor.get_parent()
        # Collected innermost-first; flip so that the course comes first
        names.reverse()
        return {
            "course_name": names[0],
            "location": names[1:4]
        }

431

E. Kolpakov committed
432
class LibrarySearchIndexer(SearchIndexerBase):
    """
    Indexer for library search: indexes the content of a library
    """
    INDEX_NAME = "library_index"
    DOCUMENT_TYPE = "library_content"
    ENABLE_INDEXING_KEY = 'ENABLE_LIBRARY_INDEX'

    INDEX_EVENT = {
        'name': 'edx.library.index.reindexed',
        'category': 'library_index'
    }

    @classmethod
    def normalize_structure_key(cls, structure_key):
        """ Normalize the library key via `normalize_key_for_search` """
        return normalize_key_for_search(structure_key)

    @classmethod
    def _fetch_top_level(cls, modulestore, structure_key):
        """ Load the library for the given key from the modulestore """
        return modulestore.get_library(structure_key, depth=None)

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Build the dictionary locating a document by its library key """
        return {"library": unicode(normalized_structure_key)}

    @classmethod
    def _id_modifier(cls, usage_id):
        """ Strip version and branch from the usage id's library key before it is indexed """
        unversioned_library_key = usage_id.library_key.replace(version_guid=None, branch=None)
        return usage_id.replace(library_key=unversioned_library_key)

    @classmethod
    def do_library_reindex(cls, modulestore, library_key):
        """
        Run a full (re)index of the given library, recording that it took place
        """
        return cls._do_reindex(modulestore, library_key)
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572


class AboutInfo(object):
    """ About info structure to contain
       1) Property name to use
       2) Where to add in the index (using the ANALYSE / PROPERTY flags below)
       3) Where to source the properties value
    """
    # Bitwise Flags for where to index the information
    #
    # ANALYSE - states that the property text contains content that we wish to be able to find matched within
    #   e.g. "joe" should yield a result for "I'd like to drink a cup of joe"
    #
    # PROPERTY - states that the property text should be a property of the indexed document, to be returned with the
    # results: search matches will only be made on exact string matches
    #   e.g. "joe" will only match on "joe"
    #
    # We are using bitwise flags because one may want to add the property to EITHER or BOTH parts of the index
    #   e.g. university name is desired to be analysed, so that a search on "Oxford" will match
    #   property values "University of Oxford" and "Oxford Brookes University",
    #   but it is also a useful property, because within a (future) filtered search a user
    #   may have chosen to filter courses from "University of Oxford"
    #
    # see https://wiki.python.org/moin/BitwiseOperators for information about bitwise shift operator used below
    #
    ANALYSE = 1 << 0  # Add the information to the analysed content of the index
    PROPERTY = 1 << 1  # Add the information as a property of the object being indexed (not analysed)

    def __init__(self, property_name, index_flags, source_from):
        """
        Arguments:
        property_name - name of the property, used both to look the value up and as the key in the index
        index_flags - combination of the ANALYSE / PROPERTY flags
        source_from - one of the FROM_* source functions defined on this class
        """
        self.property_name = property_name
        self.index_flags = index_flags
        self.source_from = source_from

    def get_value(self, **kwargs):
        """ get the value for this piece of information, using the correct source """
        # source_from is stored as a plain attribute, so self is passed explicitly
        return self.source_from(self, **kwargs)

    def from_about_dictionary(self, **kwargs):
        """
        gets the value from the kwargs provided 'about_dictionary'

        Returns None when the dictionary has no entry for this property.
        Raises ValueError when 'about_dictionary' was not provided at all.
        """
        about_dictionary = kwargs.get('about_dictionary', None)
        # Only a missing dictionary is an error; an empty one just has no value
        # for this property
        if about_dictionary is None:
            raise ValueError("Context dictionary does not contain expected argument 'about_dictionary'")

        return about_dictionary.get(self.property_name, None)

    def from_course_property(self, **kwargs):
        """
        gets the value from the kwargs provided 'course'

        Returns None when the course has no attribute named after this property.
        Raises ValueError when 'course' was not provided.
        """
        course = kwargs.get('course', None)
        if course is None:
            raise ValueError("Context dictionary does not contain expected argument 'course'")

        return getattr(course, self.property_name, None)

    def from_course_mode(self, **kwargs):
        """
        fetches the available course modes from the CourseMode model

        Returns the list of mode slugs for the provided 'course'.
        Raises ValueError when 'course' was not provided.
        """
        course = kwargs.get('course', None)
        if course is None:
            raise ValueError("Context dictionary does not contain expected argument 'course'")

        return [mode.slug for mode in CourseMode.modes_for_course(course.id)]

    # Source location options - either from the course or the about info
    FROM_ABOUT_INFO = from_about_dictionary
    FROM_COURSE_PROPERTY = from_course_property
    FROM_COURSE_MODE = from_course_mode


class CourseAboutSearchIndexer(object):
    """
    Class to perform indexing of about information from course object
    """
    DISCOVERY_DOCUMENT_TYPE = "course_info"
    INDEX_NAME = CoursewareSearchIndexer.INDEX_NAME

    # List of properties to add to the index - each item in the list is an instance of AboutInfo object
    ABOUT_INFORMATION_TO_INCLUDE = [
        AboutInfo("advertised_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("announcement", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("effort", AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("display_name", AboutInfo.ANALYSE, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("overview", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("title", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("university", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("number", AboutInfo.ANALYSE | AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("short_description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("description", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("key_dates", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("video", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_short", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("course_staff_extended", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("requirements", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("syllabus", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("textbook", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("faq", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("more_info", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("ocw_links", AboutInfo.ANALYSE, AboutInfo.FROM_ABOUT_INFO),
        AboutInfo("enrollment_start", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("enrollment_end", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("org", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
        AboutInfo("modes", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_MODE),
        AboutInfo("language", AboutInfo.PROPERTY, AboutInfo.FROM_COURSE_PROPERTY),
    ]

    @classmethod
    def index_about_information(cls, modulestore, course):
        """
        Add the given course to the course discovery index

        Arguments:
        modulestore - modulestore object to use for operations

        course - course object from which to take properties, locate about information

        Returns:
        None; nothing is indexed when no search engine is available for INDEX_NAME

        Raises:
        Any exception raised by the search engine while indexing is logged and re-raised
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        course_id = unicode(course.id)
        course_info = {
            'id': course_id,
            'course': course_id,
            'content': {},
            'image_url': course_image_url(course),
        }

        # load data for all of the 'about' modules for this course into a dictionary
        about_dictionary = {
            item.location.name: item.data
            for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
        }

        about_context = {
            "course": course,
            "about_dictionary": about_dictionary,
        }

        for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
            # Broad exception handler so that a single bad property does not scupper the collection of others;
            # limited to Exception so that KeyboardInterrupt / SystemExit are not swallowed
            try:
                section_content = about_information.get_value(**about_context)
            except Exception:  # pylint: disable=broad-except
                section_content = None
                log.warning(
                    "Course discovery could not collect property %s for course %s",
                    about_information.property_name,
                    course_id,
                    exc_info=True,
                )

            if section_content:
                if about_information.index_flags & AboutInfo.ANALYSE:
                    analyse_content = section_content
                    # Only string content is reduced to text; other values are stored as they are
                    if isinstance(section_content, basestring):
                        analyse_content = strip_html_content_to_text(section_content)
                    course_info['content'][about_information.property_name] = analyse_content
                if about_information.index_flags & AboutInfo.PROPERTY:
                    course_info[about_information.property_name] = section_content

        # Broad exception handler to protect around and report problems with indexing
        try:
            searcher.index(cls.DISCOVERY_DOCUMENT_TYPE, [course_info])
        except Exception:  # pylint: disable=broad-except
            log.exception(
                "Course discovery indexing error encountered, course discovery index may be out of date %s",
                course_id,
            )
            raise

        log.debug(
            "Successfully added %s course to the course discovery index",
            course_id
        )

    @classmethod
    def _get_location_info(cls, normalized_structure_key):
        """ Builds location info dictionary """
        return {"course": unicode(normalized_structure_key), "org": normalized_structure_key.org}

    @classmethod
    def remove_deleted_items(cls, structure_key):
        """
        Remove the documents found for the given course from the course about search index

        Arguments:
        structure_key - course key; used to build the location filter for the search

        Returns:
        None; nothing is removed when no search engine is available for INDEX_NAME
        """
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        if not searcher:
            return

        response = searcher.search(
            doc_type=cls.DISCOVERY_DOCUMENT_TYPE,
            field_dictionary=cls._get_location_info(structure_key)
        )
        result_ids = [result["data"]["id"] for result in response["results"]]
        searcher.remove(cls.DISCOVERY_DOCUMENT_TYPE, result_ids)