Commit f41b4df0 by Anthony Mangano

add aggregation_key to indices

ECOM-6815

Adds a new field, aggregation_key, to CourseIndex,
CourseRunIndex and ProgramIndex. This will make it possible to compute
accurate counts for distinct records across content types, which will
be necessary when we begin filtering duplicate courses from search
results on the marketing site.
parent ed75b9b5
......@@ -47,8 +47,11 @@ class OrganizationsMixin:
class BaseIndex(indexes.SearchIndex):
model = None
text = indexes.CharField(document=True, use_template=True)
# A key that can be used to group related documents together to enable the computation of distinct facet and hit
# counts.
aggregation_key = indexes.CharField()
content_type = indexes.CharField(faceted=True)
text = indexes.CharField(document=True, use_template=True)
def prepare_content_type(self, obj):  # pylint: disable=unused-argument
    """Return the lowercased class name of ``self.model`` as the content type.

    ``obj`` is accepted but not consulted.
    """
    model_name = self.model.__name__
    return model_name.lower()
......@@ -110,6 +113,9 @@ class CourseIndex(BaseCourseIndex, indexes.Indexable):
prerequisites = indexes.MultiValueField(faceted=True)
def prepare_aggregation_key(self, obj):
    """Build the aggregation key for a course: ``course:`` followed by ``obj.key``."""
    key_template = 'course:{}'
    return key_template.format(obj.key)
def prepare_course_runs(self, obj):
    """Return a list with the ``key`` of each item yielded by ``obj.course_runs.all()``."""
    run_keys = []
    for run in obj.course_runs.all():
        run_keys.append(run.key)
    return run_keys
......@@ -153,6 +159,10 @@ class CourseRunIndex(BaseCourseIndex, indexes.Indexable):
has_enrollable_paid_seats = indexes.BooleanField(null=False)
paid_seat_enrollment_end = indexes.DateTimeField(null=True)
def prepare_aggregation_key(self, obj):
    """Build the aggregation key for a course run: ``courserun:`` plus the parent course's key."""
    # The parent Course key (rather than the run's own key) is used on purpose:
    # CourseRuns are meant to be de-duplicated per Course on the marketing site.
    parent_course = obj.course
    return 'courserun:{}'.format(parent_course.key)
def prepare_has_enrollable_paid_seats(self, obj):
    """Return the result of calling ``obj.has_enrollable_paid_seats()``."""
    has_paid_seats = obj.has_enrollable_paid_seats()
    return has_paid_seats
......@@ -226,6 +236,9 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
seat_types = indexes.MultiValueField(model_attr='seat_types', null=True, faceted=True)
published = indexes.BooleanField(null=False, faceted=True)
def prepare_aggregation_key(self, obj):
    """Build the aggregation key for a program: ``program:`` followed by ``obj.uuid``."""
    key_template = 'program:{}'
    return key_template.format(obj.uuid)
def prepare_published(self, obj):
    """Return the result of comparing ``obj.status`` with ``ProgramStatus.Active``."""
    is_active = obj.status == ProgramStatus.Active
    return is_active
......
......@@ -92,6 +92,14 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def build_schema(self, fields):
content_field_name, mapping = super().build_schema(fields)
# The aggregation_key is intended to be used for computing distinct record counts. We do not want it to be
# analyzed because doing so would result in more values being counted, as each key would be broken down
# into substrings by the analyzer.
if mapping.get('aggregation_key'):
mapping['aggregation_key']['index'] = 'not_analyzed'
del mapping['aggregation_key']['analyzer']
# Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality
snowball_with_synonyms = 'snowball_with_synonyms'
for field, value in mapping.items():
......@@ -107,6 +115,7 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
return (content_field_name, mapping)
......
import haystack
from django.test import TestCase
from course_discovery.apps.edx_haystack_extensions.backends import EdxElasticsearchSearchBackend
......@@ -10,3 +11,13 @@ class EdxElasticsearchSearchBackendTests(NonClearingSearchBackendMixinTestMixin,
TestCase):
""" Tests for EdxElasticsearchSearchBackend. """
backend_class = EdxElasticsearchSearchBackend
def test_build_schema_handles_aggregation_key(self):
    """Verify that build_schema marks the aggregation_key field as not_analyzed."""
    backend = self.get_backend()
    unified_index = haystack.connections[backend.connection_alias].get_unified_index()
    search_fields = unified_index.all_searchfields()

    # build_schema's result is indexed at position 1 to obtain the field mapping.
    schema = backend.build_schema(search_fields)
    mapping = schema[1]

    # The field must be present, flagged as not_analyzed, and carry no analyzer entry.
    assert mapping.get('aggregation_key')
    key_mapping = mapping['aggregation_key']
    assert key_mapping['index'] == 'not_analyzed'
    assert 'analyzer' not in key_mapping
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment