Commit f41b4df0 by Anthony Mangano

add aggregation_key to indices

ECOM-6815

Adds a new field, aggregation_key, to CourseIndex,
CourseRunIndex and ProgramIndex. This will make it possible to compute
accurate counts for distinct records across content types, which will
be necessary when we begin filtering duplicate courses from search
results on the marketing site.
parent ed75b9b5
...@@ -47,8 +47,11 @@ class OrganizationsMixin: ...@@ -47,8 +47,11 @@ class OrganizationsMixin:
class BaseIndex(indexes.SearchIndex): class BaseIndex(indexes.SearchIndex):
model = None model = None
text = indexes.CharField(document=True, use_template=True) # A key that can be used to group related documents together to enable the computation of distinct facet and hit
# counts.
aggregation_key = indexes.CharField()
content_type = indexes.CharField(faceted=True) content_type = indexes.CharField(faceted=True)
text = indexes.CharField(document=True, use_template=True)
def prepare_content_type(self, obj): # pylint: disable=unused-argument def prepare_content_type(self, obj): # pylint: disable=unused-argument
return self.model.__name__.lower() return self.model.__name__.lower()
...@@ -110,6 +113,9 @@ class CourseIndex(BaseCourseIndex, indexes.Indexable): ...@@ -110,6 +113,9 @@ class CourseIndex(BaseCourseIndex, indexes.Indexable):
prerequisites = indexes.MultiValueField(faceted=True) prerequisites = indexes.MultiValueField(faceted=True)
def prepare_aggregation_key(self, obj):
return 'course:{}'.format(obj.key)
def prepare_course_runs(self, obj): def prepare_course_runs(self, obj):
return [course_run.key for course_run in obj.course_runs.all()] return [course_run.key for course_run in obj.course_runs.all()]
...@@ -153,6 +159,10 @@ class CourseRunIndex(BaseCourseIndex, indexes.Indexable): ...@@ -153,6 +159,10 @@ class CourseRunIndex(BaseCourseIndex, indexes.Indexable):
has_enrollable_paid_seats = indexes.BooleanField(null=False) has_enrollable_paid_seats = indexes.BooleanField(null=False)
paid_seat_enrollment_end = indexes.DateTimeField(null=True) paid_seat_enrollment_end = indexes.DateTimeField(null=True)
def prepare_aggregation_key(self, obj):
# Aggregate CourseRuns by Course key since that is how we plan to dedup CourseRuns on the marketing site.
return 'courserun:{}'.format(obj.course.key)
def prepare_has_enrollable_paid_seats(self, obj): def prepare_has_enrollable_paid_seats(self, obj):
return obj.has_enrollable_paid_seats() return obj.has_enrollable_paid_seats()
...@@ -226,6 +236,9 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin): ...@@ -226,6 +236,9 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
seat_types = indexes.MultiValueField(model_attr='seat_types', null=True, faceted=True) seat_types = indexes.MultiValueField(model_attr='seat_types', null=True, faceted=True)
published = indexes.BooleanField(null=False, faceted=True) published = indexes.BooleanField(null=False, faceted=True)
def prepare_aggregation_key(self, obj):
return 'program:{}'.format(obj.uuid)
def prepare_published(self, obj): def prepare_published(self, obj):
return obj.status == ProgramStatus.Active return obj.status == ProgramStatus.Active
......
...@@ -92,6 +92,14 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend): ...@@ -92,6 +92,14 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def build_schema(self, fields): def build_schema(self, fields):
content_field_name, mapping = super().build_schema(fields) content_field_name, mapping = super().build_schema(fields)
# The aggregation_key is intended to be used for computing distinct record counts. We do not want it to be
# analyzed because doing so would result in more values being counted, as each key would be broken down
# into substrings by the analyzer.
if mapping.get('aggregation_key'):
mapping['aggregation_key']['index'] = 'not_analyzed'
del mapping['aggregation_key']['analyzer']
# Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality # Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality
snowball_with_synonyms = 'snowball_with_synonyms' snowball_with_synonyms = 'snowball_with_synonyms'
for field, value in mapping.items(): for field, value in mapping.items():
...@@ -107,6 +115,7 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend): ...@@ -107,6 +115,7 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms) index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete', self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms) index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
return (content_field_name, mapping) return (content_field_name, mapping)
......
import haystack
from django.test import TestCase from django.test import TestCase
from course_discovery.apps.edx_haystack_extensions.backends import EdxElasticsearchSearchBackend from course_discovery.apps.edx_haystack_extensions.backends import EdxElasticsearchSearchBackend
...@@ -10,3 +11,13 @@ class EdxElasticsearchSearchBackendTests(NonClearingSearchBackendMixinTestMixin, ...@@ -10,3 +11,13 @@ class EdxElasticsearchSearchBackendTests(NonClearingSearchBackendMixinTestMixin,
TestCase): TestCase):
""" Tests for EdxElasticsearchSearchBackend. """ """ Tests for EdxElasticsearchSearchBackend. """
backend_class = EdxElasticsearchSearchBackend backend_class = EdxElasticsearchSearchBackend
def test_build_schema_handles_aggregation_key(self):
"""Verify that build_schema marks the aggregation_key field as not_analyzed."""
backend = self.get_backend()
index = haystack.connections[backend.connection_alias].get_unified_index()
fields = index.all_searchfields()
mapping = backend.build_schema(fields)[1]
assert mapping.get('aggregation_key')
assert mapping['aggregation_key']['index'] == 'not_analyzed'
assert 'analyzer' not in mapping['aggregation_key']
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment