add ability to compute distinct hit and facet counts

ECOM-6815

add ability to compute distinct hit and facet counts
ECOM-6815
d72f5077 · Anthony Mangano · 5f25faf1 · d72f5077 · d72f5077 · d72f5077
Commit d72f5077 authored Feb 28, 2017 by Anthony Mangano
7 changed files
--- a/course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
+++ b/course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
--- a/course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
+++ b/course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
--- a/course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
+++ b/course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
+from haystack.query import SearchQuerySet
+from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
+class DistinctCountsSearchQuerySet(SearchQuerySet):
+    """Custom SearchQuerySet class that can compute and cache distinct hit and facet counts for a query."""
+    @staticmethod
+    def from_queryset(queryset):
+        """ Builds a DistinctCountsSearchQuerySet from an existing SearchQuerySet."""
+        return queryset._clone(klass=DistinctCountsSearchQuerySet)  # pylint: disable=protected-access
+    def __init__(self, **kwargs):
+        """
+        Initialize a new instance of the DistinctCountsSearchQuerySet.
+        Overrides SearchQuerySet.__init__ from:
+        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/query.py#L24
+        """
+        super(DistinctCountsSearchQuerySet, self).__init__(**kwargs)
+        self._distinct_result_count = None
+    def with_distinct_counts(self, aggregation_key):
+        """
+        Adds distinct_count aggregations to the Query.
+        Arguments:
+            aggregation_key (str): The field that should be used to group records when computing distinct counts.
+                It should be a field that is NOT analyzed by the index (like one of the faceted _exact fields).
+                Using a field that is analyzed will result in inaccurate counts, as analyzed fields are broken down by
+                the search backend and will result in records being grouped by substrings of the aggregation_key field.
+        """
+        clone = self._clone()
+        clone.query = clone.query._clone(DistinctCountsSearchQuery)  # pylint: disable=protected-access
+        clone.query.aggregation_key = aggregation_key
+        clone.query.validate()
+        return clone
+    def distinct_count(self):
+        """
+        Return the distinct hit count.
+        Note: This will raise an error if the SearchQuerySet has not been configured to compute distinct counts. It
+        will also force the query to run if it hasn't already.
+        """
+        if not isinstance(self.query, DistinctCountsSearchQuery):
+            raise RuntimeError('This SearchQuerySet has not been configured to compute distinct counts.')
+        if self._distinct_result_count is None:
+            self._distinct_result_count = self.query.get_distinct_count()
+        return self._distinct_result_count
--- a/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
+++ b/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
--- a/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
+++ b/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
--- a/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
+++ b/course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
+import datetime
+import pytest
+from django.test import TestCase
+from haystack.query import SearchQuerySet
+from course_discovery.apps.core.tests.mixins import ElasticsearchTestMixin
+from course_discovery.apps.course_metadata.models import CourseRun
+from course_discovery.apps.course_metadata.tests.factories import CourseFactory, CourseRunFactory
+from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
+from course_discovery.apps.edx_haystack_extensions.distinct_counts.query import DistinctCountsSearchQuerySet
+class DistinctCountsSearchQuerySetTests(ElasticsearchTestMixin, TestCase):
+    def test_from_queryset(self):
+        """ Verify that a DistinctCountsSearchQuerySet can be built from an existing SearchQuerySet."""
+        course_1 = CourseFactory()
+        CourseRunFactory(title='foo', course=course_1)
+        CourseRunFactory(title='foo', course=course_1)
+        course_2 = CourseFactory()
+        CourseRunFactory(title='foo', course=course_2)
+        CourseRunFactory(title='bar', course=course_2)
+        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)
+        expected = sorted([run.key for run in queryset])
+        actual = sorted([run.key for run in dc_queryset])
+        assert expected == actual
+    def test_with_distinct_counts(self):
+        """
+        Verify that the query object is converted to a DistinctCountsSearchQuery and the aggregation_key is
+        configured properly.
+        """
+        queryset = SearchQuerySet()
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
+        assert isinstance(dc_queryset.query, DistinctCountsSearchQuery)
+        assert dc_queryset.query.aggregation_key == 'aggregation_key'
+    def test_with_distinct_counts_raises_when_queryset_includes_unsupported_options(self):
+        """
+        Verify that an error is raised if the original queryset includes options that are not supported by our
+        custom Query class.
+        """
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())
+        with pytest.raises(RuntimeError) as err:
+            now = datetime.datetime.now()
+            ten_days = datetime.timedelta(days=10)
+            start = now - ten_days
+            end = now + ten_days
+            dc_queryset.date_facet('start', start, end, 'day').with_distinct_counts('aggregation_key')
+        assert str(err.value) == 'DistinctCountsSearchQuery does not support date facets.'
+        with pytest.raises(RuntimeError) as err:
+            dc_queryset.facet('pacing_type', order='term').with_distinct_counts('aggregation_key')
+        assert 'DistinctCountsSearchQuery only supports a limited set of field facet options.' in str(err.value)
+    def test_distinct_count_returns_cached_distinct_count(self):
+        """ Verify that distinct_count returns the cached distinct_result_count when present."""
+        queryset = SearchQuerySet()
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
+        dc_queryset._distinct_result_count = 123  # pylint: disable=protected-access
+        assert dc_queryset.distinct_count() == 123
+    def test_distinct_count_runs_query_when_cache_is_empty(self):
+        """ Verify that distinct_count runs the query, caches, and returns the distinct_count when cache is empty."""
+        course_1 = CourseFactory()
+        CourseRunFactory(title='foo', course=course_1)
+        CourseRunFactory(title='foo', course=course_1)
+        course_2 = CourseFactory()
+        CourseRunFactory(title='foo', course=course_2)
+        CourseRunFactory(title='bar', course=course_2)
+        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
+        assert dc_queryset._distinct_result_count is None  # pylint: disable=protected-access
+        assert dc_queryset.distinct_count() == 2
+        assert dc_queryset._distinct_result_count == 2  # pylint: disable=protected-access
+    def test_distinct_count_raises_when_not_properly_configured(self):
+        """
+        Verify that distinct_count raises when called without configuring the SearchQuerySet to compute distinct
+        counts.
+        """
+        queryset = SearchQuerySet()
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)
+        with pytest.raises(RuntimeError) as err:
+            dc_queryset.distinct_count()
+        assert str(err.value) == 'This SearchQuerySet has not been configured to compute distinct counts.'
+    def test_facet_counts_includes_distinct_counts(self):
+        """ Verify that facet_counts include distinct counts. """
+        course = CourseFactory()
+        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
+        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
+        CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course)
+        # Make sure to add both a field facet and a query facet so that we can be sure that both work.
+        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
+        queryset = queryset.facet('pacing_type').query_facet('hidden', 'hidden:true')
+        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
+        facet_counts = dc_queryset.facet_counts()
+        # Field facets are expected to be formatted as a list of three-tuples (field_value, count, distinct_count)
+        for val, count, distinct_count in facet_counts['fields']['pacing_type']:
+            assert val in {'self_paced', 'instructor_paced'}
+            if val == 'self_paced':
+                assert count == 2
+                assert distinct_count == 1
+            elif val == 'instructor_paced':
+                assert count == 1
+                assert distinct_count == 1
+        # Query facets are expected to be formatted as a dictionary mapping facet_names to two-tuples (count,
+        # distinct_count)
+        hidden_count, hidden_distinct_count = facet_counts['queries']['hidden']
+        assert hidden_count == 2
+        assert hidden_distinct_count == 1
--- a/course_discovery/settings/base.py
+++ b/course_discovery/settings/base.py
@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
 # See  https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
 SEARCH_FACET_LIMIT = 10000
+# Precision settings for the elasticsearch cardinality aggregations used to compute distinct hit and facet counts.
+# The elasticsearch cardinality aggregation is not guaranteed to produce accurate results. Accuracy is configurable via
+# an optional precision_threshold setting. Cardinality aggregations for queries that produce fewer results than the
+# precision threshold can be expected to be pretty accurate. Cardinality aggregations for queries that produce more
+# results than the precision_threshold will be less accurate. Setting a higher value for precision_threshold requires
+# a memory tradeoff of roughly precision_threshold * 8 bytes. See the elasticsearch docs for more details:
+# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html
+#
+# We use a higher value for hit precision than for facet precision for two reasons:
+#   1.) The hit count is more visible to users than the facet counts.
+#   2.) The performance penalty for having a higher hit precision is less than the penalty for a higher facet
+#       precision, since the hit count only requires a single aggregation.
+DISTINCT_COUNTS_HIT_PRECISION = 1500
+DISTINCT_COUNTS_FACET_PRECISION = 250
 DEFAULT_PARTNER_ID = None
 # See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id