add aggregation_key to indices

ECOM-6815 Adds a new field, aggregation_key, to CourseIndex, CourseRunIndex and ProgramIndex. This will make it possible to compute accurate counts for distinct records across content types, which will be necessary when we begin filtering duplicate courses from search results on the marketing site.

add aggregation_key to indices
ECOM-6815 Adds a new field, aggregation_key, to CourseIndex, CourseRunIndex and ProgramIndex. This will make it possible to compute accurate counts for distinct records across content types, which will be necessary when we begin filtering duplicate courses from search results on the marketing site.
f41b4df0 · Anthony Mangano · ed75b9b5 · f41b4df0 · f41b4df0 · f41b4df0
Commit f41b4df0 authored Feb 28, 2017 by Anthony Mangano
Showing with 34 additions and 1 deletion

course_discovery/apps/course_metadata/search_indexes.py
+14 -1

course_discovery/apps/edx_haystack_extensions/backends.py
+9 -0

course_discovery/apps/edx_haystack_extensions/tests/test_backends.py
+11 -0

No files found.
--- a/course_discovery/apps/course_metadata/search_indexes.py
+++ b/course_discovery/apps/course_metadata/search_indexes.py
@@ -47,8 +47,11 @@ class OrganizationsMixin:
 class BaseIndex(indexes.SearchIndex):
    model = None
-    text = indexes.CharField(document=True, use_template=True)
+    # A key that can be used to group related documents together to enable the computation of distinct facet and hit
+    # counts.
+    aggregation_key = indexes.CharField()
    content_type = indexes.CharField(faceted=True)
+    text = indexes.CharField(document=True, use_template=True)
    def prepare_content_type(self, obj):  # pylint: disable=unused-argument
        return self.model.__name__.lower()
@@ -110,6 +113,9 @@ class CourseIndex(BaseCourseIndex, indexes.Indexable):
    prerequisites = indexes.MultiValueField(faceted=True)
+    def prepare_aggregation_key(self, obj):
+        return 'course:{}'.format(obj.key)
    def prepare_course_runs(self, obj):
        return [course_run.key for course_run in obj.course_runs.all()]
@@ -153,6 +159,10 @@ class CourseRunIndex(BaseCourseIndex, indexes.Indexable):
    has_enrollable_paid_seats = indexes.BooleanField(null=False)
    paid_seat_enrollment_end = indexes.DateTimeField(null=True)
+    def prepare_aggregation_key(self, obj):
+        # Aggregate CourseRuns by Course key since that is how we plan to dedup CourseRuns on the marketing site.
+        return 'courserun:{}'.format(obj.course.key)
    def prepare_has_enrollable_paid_seats(self, obj):
        return obj.has_enrollable_paid_seats()
@@ -226,6 +236,9 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
    seat_types = indexes.MultiValueField(model_attr='seat_types', null=True, faceted=True)
    published = indexes.BooleanField(null=False, faceted=True)
+    def prepare_aggregation_key(self, obj):
+        return 'program:{}'.format(obj.uuid)
    def prepare_published(self, obj):
        return obj.status == ProgramStatus.Active

--- a/course_discovery/apps/edx_haystack_extensions/backends.py
+++ b/course_discovery/apps/edx_haystack_extensions/backends.py
@@ -92,6 +92,14 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
    def build_schema(self, fields):
        content_field_name, mapping = super().build_schema(fields)
+        # The aggregation_key is intended to be used for computing distinct record counts. We do not want it to be
+        # analyzed because doing so would result in more values being counted, as each key would be broken down
+        # into substrings by the analyzer.
+        if mapping.get('aggregation_key'):
+            mapping['aggregation_key']['index'] = 'not_analyzed'
+            del mapping['aggregation_key']['analyzer']
        # Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality
        snowball_with_synonyms = 'snowball_with_synonyms'
        for field, value in mapping.items():
@@ -107,6 +115,7 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
                               index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
        self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete',
                               index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
        return (content_field_name, mapping)

--- a/course_discovery/apps/edx_haystack_extensions/tests/test_backends.py
+++ b/course_discovery/apps/edx_haystack_extensions/tests/test_backends.py
+import haystack
 from django.test import TestCase
 from course_discovery.apps.edx_haystack_extensions.backends import EdxElasticsearchSearchBackend
@@ -10,3 +11,13 @@ class EdxElasticsearchSearchBackendTests(NonClearingSearchBackendMixinTestMixin,
                                         TestCase):
    """ Tests for EdxElasticsearchSearchBackend.  """
    backend_class = EdxElasticsearchSearchBackend
+    def test_build_schema_handles_aggregation_key(self):
+        """Verify that build_schema marks the aggregation_key field as not_analyzed."""
+        backend = self.get_backend()
+        index = haystack.connections[backend.connection_alias].get_unified_index()
+        fields = index.all_searchfields()
+        mapping = backend.build_schema(fields)[1]
+        assert mapping.get('aggregation_key')
+        assert mapping['aggregation_key']['index'] == 'not_analyzed'
+        assert 'analyzer' not in mapping['aggregation_key']