Commit d72f5077 by Anthony Mangano

add ability to compute distinct hit and facet counts

ECOM-6815
parent 5f25faf1
from haystack.query import SearchQuerySet
from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
class DistinctCountsSearchQuerySet(SearchQuerySet):
    """
    SearchQuerySet subclass whose query can be configured to compute distinct counts.

    The distinct hit count is cached on the queryset once it has been computed.
    """

    @staticmethod
    def from_queryset(queryset):
        """Return a DistinctCountsSearchQuerySet cloned from an existing SearchQuerySet."""
        return queryset._clone(klass=DistinctCountsSearchQuerySet)  # pylint: disable=protected-access

    def __init__(self, **kwargs):
        """
        Set up the queryset and start with an empty distinct hit count cache.

        Overrides SearchQuerySet.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/query.py#L24
        """
        super(DistinctCountsSearchQuerySet, self).__init__(**kwargs)
        # Stays None until distinct_count() stores the computed value.
        self._distinct_result_count = None

    def with_distinct_counts(self, aggregation_key):
        """
        Return a clone of this queryset whose query computes distinct counts.

        Arguments:
            aggregation_key (str): The field used to group records when computing distinct counts.
                It should be a field that is NOT analyzed by the index (like one of the faceted _exact
                fields). Analyzed fields are broken down by the search backend, so records would be
                grouped by substrings of the field and the resulting counts would be inaccurate.
        """
        distinct_queryset = self._clone()
        # Swap the cloned query for a DistinctCountsSearchQuery, then configure and validate it.
        distinct_queryset.query = distinct_queryset.query._clone(  # pylint: disable=protected-access
            DistinctCountsSearchQuery
        )
        distinct_query = distinct_queryset.query
        distinct_query.aggregation_key = aggregation_key
        distinct_query.validate()
        return distinct_queryset

    def distinct_count(self):
        """
        Return the distinct hit count, computing and caching it on first use.

        Note: Raises a RuntimeError if the SearchQuerySet has not been configured to compute distinct
        counts. When nothing is cached yet, the count is requested from the query.
        """
        if not isinstance(self.query, DistinctCountsSearchQuery):
            raise RuntimeError('This SearchQuerySet has not been configured to compute distinct counts.')

        cached_count = self._distinct_result_count
        if cached_count is None:
            cached_count = self.query.get_distinct_count()
            self._distinct_result_count = cached_count
        return cached_count
import datetime
import pytest
from django.test import TestCase
from haystack.query import SearchQuerySet
from course_discovery.apps.core.tests.mixins import ElasticsearchTestMixin
from course_discovery.apps.course_metadata.models import CourseRun
from course_discovery.apps.course_metadata.tests.factories import CourseFactory, CourseRunFactory
from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
from course_discovery.apps.edx_haystack_extensions.distinct_counts.query import DistinctCountsSearchQuerySet
class DistinctCountsSearchQuerySetTests(ElasticsearchTestMixin, TestCase):
    """Tests for DistinctCountsSearchQuerySet."""

    def _create_foo_course_runs(self):
        """Create two courses that together hold three course runs titled 'foo' and one titled 'bar'."""
        course_1 = CourseFactory()
        CourseRunFactory(title='foo', course=course_1)
        CourseRunFactory(title='foo', course=course_1)
        course_2 = CourseFactory()
        CourseRunFactory(title='foo', course=course_2)
        CourseRunFactory(title='bar', course=course_2)

    def test_from_queryset(self):
        """ Verify that a DistinctCountsSearchQuerySet can be built from an existing SearchQuerySet."""
        self._create_foo_course_runs()

        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)

        # Both querysets must yield the same course runs.
        expected = sorted(run.key for run in queryset)
        actual = sorted(run.key for run in dc_queryset)
        assert expected == actual

    def test_with_distinct_counts(self):
        """
        Verify that the query object is converted to a DistinctCountsSearchQuery and the aggregation_key is
        configured properly.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')

        assert isinstance(dc_queryset.query, DistinctCountsSearchQuery)
        assert dc_queryset.query.aggregation_key == 'aggregation_key'

    def test_with_distinct_counts_raises_when_queryset_includes_unsupported_options(self):
        """
        Verify that an error is raised if the original queryset includes options that are not supported by our
        custom Query class.
        """
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())

        # Build the date range outside the raises block so only the call under test can satisfy it.
        now = datetime.datetime.now()
        ten_days = datetime.timedelta(days=10)
        start = now - ten_days
        end = now + ten_days

        with pytest.raises(RuntimeError) as err:
            dc_queryset.date_facet('start', start, end, 'day').with_distinct_counts('aggregation_key')
        # Assertions on the captured error must come after the block; inside it they would never run.
        assert str(err.value) == 'DistinctCountsSearchQuery does not support date facets.'

        with pytest.raises(RuntimeError) as err:
            dc_queryset.facet('pacing_type', order='term').with_distinct_counts('aggregation_key')
        assert 'DistinctCountsSearchQuery only supports a limited set of field facet options.' in str(err.value)

    def test_distinct_count_returns_cached_distinct_count(self):
        """ Verify that distinct_count returns the cached distinct_result_count when present."""
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')

        dc_queryset._distinct_result_count = 123  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 123

    def test_distinct_count_runs_query_when_cache_is_empty(self):
        """ Verify that distinct_count runs the query, caches, and returns the distinct_count when cache is empty."""
        self._create_foo_course_runs()

        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')

        assert dc_queryset._distinct_result_count is None  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 2
        assert dc_queryset._distinct_result_count == 2  # pylint: disable=protected-access

    def test_distinct_count_raises_when_not_properly_configured(self):
        """
        Verify that distinct_count raises when called without configuring the SearchQuerySet to compute distinct
        counts.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)

        with pytest.raises(RuntimeError) as err:
            dc_queryset.distinct_count()
        assert str(err.value) == 'This SearchQuerySet has not been configured to compute distinct counts.'

    def test_facet_counts_includes_distinct_counts(self):
        """ Verify that facet_counts include distinct counts. """
        course = CourseFactory()
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course)

        # Make sure to add both a field facet and a query facet so that we can be sure that both work.
        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        queryset = queryset.facet('pacing_type').query_facet('hidden', 'hidden:true')
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        facet_counts = dc_queryset.facet_counts()

        # Field facets are expected to be formatted as a list of three-tuples (field_value, count, distinct_count).
        # Compare the complete mapping rather than asserting inside a loop, so that an empty or partial facet
        # list fails the test instead of passing vacuously.
        pacing_type_counts = {
            val: (count, distinct_count)
            for val, count, distinct_count in facet_counts['fields']['pacing_type']
        }
        assert pacing_type_counts == {
            'self_paced': (2, 1),
            'instructor_paced': (1, 1),
        }

        # Query facets are expected to be formatted as a dictionary mapping facet_names to two-tuples (count,
        # distinct_count)
        hidden_count, hidden_distinct_count = facet_counts['queries']['hidden']
        assert hidden_count == 2
        assert hidden_distinct_count == 1
...@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3 ...@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control # See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
SEARCH_FACET_LIMIT = 10000 SEARCH_FACET_LIMIT = 10000
# Precision settings for the elasticsearch cardinality aggregations used to compute distinct hit and facet counts.
# The elasticsearch cardinality aggregation is not guaranteed to produce accurate results. Accuracy is configurable via
# an optional precision_threshold setting. Cardinality aggregations for queries that produce fewer results than the
# precision_threshold can be expected to be pretty accurate. Cardinality aggregations for queries that produce more
# results than the precision_threshold will be less accurate. Setting a higher value for precision_threshold requires
# a memory tradeoff of roughly precision_threshold * 8 bytes. See the elasticsearch docs for more details:
# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html
#
# We use a higher value for hit precision than for facet precision for two reasons:
# 1.) The hit count is more visible to users than the facet counts.
# 2.) The performance penalty for having a higher hit precision is less than the penalty for a higher facet
# precision, since the hit count only requires a single aggregation.
DISTINCT_COUNTS_HIT_PRECISION = 1500
DISTINCT_COUNTS_FACET_PRECISION = 250
DEFAULT_PARTNER_ID = None DEFAULT_PARTNER_ID = None
# See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id # See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment