Commit d72f5077 by Anthony Mangano

add ability to compute distinct hit and facet counts

ECOM-6815
parent 5f25faf1
import elasticsearch
from django.conf import settings
from haystack.backends.elasticsearch_backend import ElasticsearchSearchQuery
from haystack.models import SearchResult
class DistinctCountsSearchQuery(ElasticsearchSearchQuery):
    """ Custom Haystack Query class that computes and caches distinct hit and facet counts for a query."""

    def __init__(self, **kwargs):
        """
        Create and return a new instance of DistinctCountsSearchQuery.

        Overrides BaseSearchQuery.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L443
        """
        super(DistinctCountsSearchQuery, self).__init__(**kwargs)
        # The field used to group records when computing distinct counts. Must be set before the
        # query runs; validate() enforces this.
        self.aggregation_key = None
        # Cached distinct hit count, populated the first time the query is run.
        self._distinct_hit_count = None

    def _clone(self, **kwargs):
        """
        Create and return a new DistinctCountsSearchQuery with fields set to match those on the original object.

        Overrides BaseSearchQuery._clone from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L981
        """
        clone = super(DistinctCountsSearchQuery, self)._clone(**kwargs)
        # Only copy the custom fields when cloning to this class; a clone created with a different
        # klass kwarg would not expect them.
        if isinstance(clone, DistinctCountsSearchQuery):
            clone.aggregation_key = self.aggregation_key
            clone._distinct_hit_count = self._distinct_hit_count  # pylint: disable=protected-access
        return clone

    def get_distinct_count(self):
        """
        Return the distinct hit count for this query. Calling this method will cause the query to execute if
        it hasn't already been run.
        """
        if self._distinct_hit_count is None:
            self.run()
        return self._distinct_hit_count

    def run(self, spelling_query=None, **kwargs):
        """
        Run the query and cache the results.

        Overrides and re-implements ElasticsearchSearchQuery.run from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/elasticsearch_backend.py#L941
        """
        # Make sure that the Query is valid before running it.
        self.validate()

        final_query = self.build_query()
        search_kwargs = self.build_params(spelling_query)
        if kwargs:
            search_kwargs.update(kwargs)

        # Use the DistinctCountsElasticsearchBackendWrapper to execute the query so that distinct hit and query
        # counts may be computed.
        backend = DistinctCountsElasticsearchBackendWrapper(self.backend, self.aggregation_key)
        results = backend.search(final_query, **search_kwargs)

        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)
        self._distinct_hit_count = results.get('distinct_hits', 0)
        self._facet_counts = self.post_process_facets(results)
        self._spelling_suggestion = results.get('spelling_suggestion', None)

    def validate(self):
        """ Verify that all Query options are valid and supported by this custom Query class."""
        if self._more_like_this:
            raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

        if self._raw_query:
            raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

        if self.date_facets:
            raise RuntimeError('DistinctCountsSearchQuery does not support date facets.')

        if self.facets:
            for field, options in self.facets.items():
                self._validate_field_facet_options(field, options)

        if self.aggregation_key is None:
            raise RuntimeError('aggregation_key is required.')

    def _validate_field_facet_options(self, field, options):
        """ Verify that the provided field facet options are valid and can be converted to an aggregation."""
        supported_options = DistinctCountsElasticsearchBackendWrapper.SUPPORTED_FIELD_FACET_OPTIONS
        for option in options:
            if option not in supported_options:
                # Note: the trailing space after the first sentence is required so that the two
                # concatenated literals do not run together ("...options.Field:") in the message.
                msg = (
                    'DistinctCountsSearchQuery only supports a limited set of field facet options. '
                    'Field: {field}, Supported Options: ({supported}), Provided Options: ({provided})'
                ).format(field=field, supported=','.join(supported_options), provided=','.join(options.keys()))
                raise RuntimeError(msg)

    def more_like_this(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support more_like_this queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

    def run_mlt(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support more_like_this queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

    def raw_search(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support raw queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

    def run_raw(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support raw queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

    def add_date_facet(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support date facets."""
        raise RuntimeError('DistinctCountsSearchQuery does not support date facets.')

    def add_field_facet(self, field, **options):
        """
        Add a field facet to the Query. Raise an error if any unsupported options are provided.

        Overrides BaseSearchQuery.add_field_facet from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L897
        """
        self._validate_field_facet_options(field, options)
        return super(DistinctCountsSearchQuery, self).add_field_facet(field, **options)
class DistinctCountsElasticsearchBackendWrapper(object):
    """
    Custom backend-like class that enables the computation of distinct hit and facet counts during search queries.

    This class is not meant to be a true ElasticsearchSearchBackend. It is meant to wrap an existing
    ElasticsearchSearchBackend instance and expose a very limited subset of backend functionality.
    """

    # The options that are supported for building field facet aggregations.
    SUPPORTED_FIELD_FACET_OPTIONS = {'size'}

    # The default size for field facet aggregations. This is the same value used by haystack.
    DEFAULT_FIELD_FACET_SIZE = 100

    def __init__(self, backend, aggregation_key):
        """
        Initialize a new instance of the DistinctCountsElasticsearchBackendWrapper.

        Arguments:
            backend (ElasticsearchSearchBackend)
            aggregation_key (str): The field that should be used to group records when computing distinct counts.
                It should be a field that is NOT analyzed by the index (like one of the faceted _exact fields).
                Using a field that is analyzed will result in inaccurate counts, as analyzed fields are broken down by
                the search backend and will result in records being grouped by substrings of the aggregation_key field.
        """
        self.backend = backend
        self.aggregation_key = aggregation_key
        # Name under which the cardinality aggregation appears in the elasticsearch response, both
        # at the top level and nested inside each facet aggregation.
        self.aggregation_name = 'distinct_{}'.format(aggregation_key)

    def search(self, query_string, **kwargs):
        """
        Run a search query and return the results.

        Re-implements ElasticsearchSearchBackend.search from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/elasticsearch_backend.py#L495
        """
        if len(query_string) == 0:
            return {'results': [], 'hits': 0, 'distinct_hits': 0}

        if not self.backend.setup_complete:
            self.backend.setup()

        search_kwargs = self._build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        # Determine whether any of the sort criteria are geo-distance based so that result
        # processing can handle distances appropriately.
        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)
        geo_sort = '_geo_distance' in order_fields

        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.backend.conn.search(
                body=search_kwargs,
                index=self.backend.index_name,
                doc_type='modelresult',
                _source=True
            )
        except elasticsearch.TransportError as e:
            if not self.backend.silently_fail:
                raise
            self.backend.log.error('Failed to query Elasticsearch using "%s": %s', query_string, e, exc_info=True)
            raw_results = {}

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'),
                                     geo_sort=geo_sort)

    def _build_search_kwargs(self, *args, **kwargs):
        """ Build and return the arguments for the elasticsearch query."""
        # Always include the top-level cardinality aggregation for the overall distinct hit count.
        aggregations = self._build_cardinality_aggregation(precision=settings.DISTINCT_COUNTS_HIT_PRECISION)

        # Facet options are popped from kwargs so that haystack does not also build legacy
        # 'facets' clauses for them; they are converted to aggregations instead.
        if 'facets' in kwargs:
            aggregations.update(self._build_field_facet_aggregations(
                facet_dict=kwargs.pop('facets', {}),
                precision=settings.DISTINCT_COUNTS_FACET_PRECISION
            ))

        if 'query_facets' in kwargs:
            aggregations.update(self._build_query_facet_aggregations(
                facet_list=kwargs.pop('query_facets', []),
                precision=settings.DISTINCT_COUNTS_FACET_PRECISION
            ))

        if 'date_facets' in kwargs:
            raise RuntimeError('DistinctCountsElasticsearchBackendWrapper does not support date facets.')

        search_kwargs = self.backend.build_search_kwargs(*args, **kwargs)
        search_kwargs['aggregations'] = aggregations
        return search_kwargs

    def _build_cardinality_aggregation(self, precision=None):
        """
        Build and return a cardinality aggregation using the configured aggregation_key.

        The elasticsearch cardinality aggregation does not guarantee accurate results. Accuracy
        is configurable via an optional precision_threshold argument. See
        https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html

        Arguments:
            precision (str): a numeric value below which counts computed by the cardinality aggregation can
                be expected to be close to accurate. Setting this value requires a memory tradeoff of
                about (precision * 8) bytes.
        """
        aggregation = {self.aggregation_name: {'cardinality': {'field': self.aggregation_key}}}
        if precision is not None:
            aggregation[self.aggregation_name]['cardinality']['precision_threshold'] = precision
        return aggregation

    def _build_field_facet_aggregations(self, facet_dict, precision=None):
        """ Build and return a dictionary of aggregations for field facets."""
        aggregations = {}
        for facet_fieldname, opts in facet_dict.items():
            for opt in opts:
                if opt not in self.SUPPORTED_FIELD_FACET_OPTIONS:
                    opts_str = ','.join(opts.keys())
                    msg = 'Cannot build aggregation for field facet with unsupported options: {}'.format(opts_str)
                    raise RuntimeError(msg)

            # Each terms aggregation nests a cardinality aggregation so that a distinct count is
            # computed per bucket.
            aggregations[facet_fieldname] = {
                'terms': {'field': facet_fieldname, 'size': opts.get('size', self.DEFAULT_FIELD_FACET_SIZE)},
                'aggregations': self._build_cardinality_aggregation(precision=precision),
            }
        return aggregations

    def _build_query_facet_aggregations(self, facet_list, precision=None):
        """ Build and return a dictionary of aggregations for query facets."""
        aggregations = {}
        for facet_fieldname, value in facet_list:
            # Each filter aggregation nests a cardinality aggregation so that a distinct count is
            # computed for the filtered document set.
            aggregations[facet_fieldname] = {
                'filter': {'query': {'query_string': {'query': value}}},
                'aggregations': self._build_cardinality_aggregation(precision=precision),
            }
        return aggregations

    def _process_results(self, raw_results, **kwargs):
        """ Process the query results into a form that is more easily consumable by the client."""
        results = self.backend._process_results(raw_results, **kwargs)  # pylint: disable=protected-access

        # When the query failed and the backend is configured to fail silently, raw_results is an
        # empty dict (see search()). Fall back to empty/zero values instead of raising a KeyError.
        aggregations = raw_results.get('aggregations', {})

        # Process the distinct hit count
        results['distinct_hits'] = aggregations.get(self.aggregation_name, {}).get('value', 0)

        # Process the remaining aggregations, which should all be for facets.
        facets = {'fields': {}, 'dates': {}, 'queries': {}}
        for name, data in aggregations.items():
            # The distinct hit count for the overall query was already processed.
            if name == self.aggregation_name:
                continue

            # Field facets:
            elif 'buckets' in data:
                buckets = data['buckets']
                facets['fields'][name] = [
                    # Extract the facet name, count, and distinct_count
                    (bucket['key'], bucket['doc_count'], bucket[self.aggregation_name]['value']) for bucket in buckets
                ]

            # Query facets:
            else:
                # Extract the facet name, count, and distinct_count
                facets['queries'][name] = (data['doc_count'], data[self.aggregation_name]['value'])

        results['facets'] = facets
        return results
from haystack.query import SearchQuerySet
from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
class DistinctCountsSearchQuerySet(SearchQuerySet):
    """Custom SearchQuerySet class that can compute and cache distinct hit and facet counts for a query."""

    @staticmethod
    def from_queryset(queryset):
        """ Builds a DistinctCountsSearchQuerySet from an existing SearchQuerySet."""
        return queryset._clone(klass=DistinctCountsSearchQuerySet)  # pylint: disable=protected-access

    def __init__(self, **kwargs):
        """
        Initialize a new instance of the DistinctCountsSearchQuerySet.

        Overrides SearchQuerySet.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/query.py#L24
        """
        super(DistinctCountsSearchQuerySet, self).__init__(**kwargs)
        # Caches the distinct hit count once the underlying query has been executed.
        self._distinct_result_count = None

    def with_distinct_counts(self, aggregation_key):
        """
        Adds distinct_count aggregations to the Query.

        Arguments:
            aggregation_key (str): The field that should be used to group records when computing distinct counts.
                It should be a field that is NOT analyzed by the index (like one of the faceted _exact fields).
                Using a field that is analyzed will result in inaccurate counts, as analyzed fields are broken down by
                the search backend and will result in records being grouped by substrings of the aggregation_key field.
        """
        new_queryset = self._clone()
        # Swap the plain haystack Query out for our distinct-count-aware Query class.
        new_query = new_queryset.query._clone(DistinctCountsSearchQuery)  # pylint: disable=protected-access
        new_query.aggregation_key = aggregation_key
        new_queryset.query = new_query
        # Fail fast if the queryset was built with options the custom Query class does not support.
        new_queryset.query.validate()
        return new_queryset

    def distinct_count(self):
        """
        Return the distinct hit count.

        Note: This will raise an error if the SearchQuerySet has not been configured to compute distinct counts. It
        will also force the query to run if it hasn't already.
        """
        query = self.query
        if not isinstance(query, DistinctCountsSearchQuery):
            raise RuntimeError('This SearchQuerySet has not been configured to compute distinct counts.')

        if self._distinct_result_count is None:
            # Executes the query on first access; subsequent calls hit the cache.
            self._distinct_result_count = query.get_distinct_count()
        return self._distinct_result_count
import datetime
import mock
import pytest
from django.test import TestCase
from haystack.backends import SQ
from haystack.backends.elasticsearch_backend import ElasticsearchSearchQuery
from haystack.query import SearchQuerySet
from course_discovery.apps.core.tests.mixins import ElasticsearchTestMixin
from course_discovery.apps.course_metadata.models import CourseRun
from course_discovery.apps.course_metadata.tests.factories import CourseFactory, CourseRunFactory
from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import (
DistinctCountsElasticsearchBackendWrapper, DistinctCountsSearchQuery
)
# pylint: disable=protected-access
class DistinctCountsSearchQueryTests(ElasticsearchTestMixin, TestCase):
    """ Integration tests for DistinctCountsSearchQuery, run against a live Elasticsearch test index. """

    def test_clone(self):
        """ Verify that clone copies all fields, including the aggregation_key and distinct_hit_count."""
        query = DistinctCountsSearchQuery()
        query.add_field_facet('pacing_type')
        query.aggregation_key = 'aggregation_key'
        query._distinct_hit_count = 123
        clone = query._clone()
        assert query.facets == clone.facets
        assert query.aggregation_key == clone.aggregation_key  # pylint: disable=no-member
        assert query._distinct_hit_count == clone._distinct_hit_count  # pylint: disable=no-member

    def test_clone_with_different_class(self):
        """ Verify that clone does not copy aggregation_key and distinct_hit_count when using different class."""
        query = DistinctCountsSearchQuery()
        query.add_field_facet('pacing_type')
        query.aggregation_key = 'aggregation_key'
        query._distinct_hit_count = 123
        # Cloning to the plain haystack Query class should drop the custom attributes.
        clone = query._clone(klass=ElasticsearchSearchQuery)
        assert isinstance(clone, ElasticsearchSearchQuery)
        assert query.facets == clone.facets
        assert not hasattr(clone, 'aggregation_key')
        assert not hasattr(clone, '_distinct_hit_count')

    def test_get_distinct_count_returns_cached_value(self):
        """ Verify that get_distinct_count returns the distinct_count from the cache when present."""
        query = DistinctCountsSearchQuery()
        query._distinct_hit_count = 123
        assert query.get_distinct_count() == 123

    def test_get_distinct_count_runs_query_when_cache_empty(self):
        """ Verify that get_distinct_count runs the query and caches/returns the distinct_count."""
        # Two matching runs that share one course: the per-course distinct count should be 1.
        # (The test index's 'aggregation_key' field presumably groups runs by course — confirm
        # against the index mapping in ElasticsearchTestMixin.)
        course = CourseFactory()
        CourseRunFactory(title='foo', course=course)
        CourseRunFactory(title='foo', course=course)
        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)
        assert query._distinct_hit_count is None
        assert query.get_distinct_count() == 1
        assert query._distinct_hit_count == 1

    def test_run_executes_the_query_and_caches_the_results(self):
        """ Verify that run executes the query and caches the results."""
        # Three runs match title='foo' and span two courses, so we expect hits == 3 and
        # distinct hits == 2. The 'bar' run should not match at all.
        course_1 = CourseFactory()
        run_1 = CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course_1)
        run_2 = CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course_1)
        course_2 = CourseFactory()
        run_3 = CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course_2)
        CourseRunFactory(title='bar', pacing_type='instructor_paced', hidden=False, course=course_2)
        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)
        query.add_field_facet('pacing_type')
        query.add_query_facet('hidden', 'hidden:true')
        # All caches should start out empty.
        assert query._distinct_hit_count is None
        assert query._hit_count is None
        assert query._results is None
        assert query._facet_counts is None
        query.run()
        expected_results = sorted([run_1.key, run_2.key, run_3.key])
        actual_results = sorted([run.key for run in query._results])
        assert query._distinct_hit_count == 2
        assert query._hit_count == 3
        assert expected_results == actual_results
        # Field facets are three-tuples of (value, count, distinct_count).
        facet_counts = query._facet_counts
        for field_val, count, distinct_count in facet_counts['fields']['pacing_type']:
            assert field_val in {'self_paced', 'instructor_paced'}
            if field_val == 'self_paced':
                assert count == 2 and distinct_count == 1
            elif field_val == 'instructor_paced':
                assert count == 1 and distinct_count == 1
        # Query facets are two-tuples of (count, distinct_count).
        count, distinct_count = facet_counts['queries']['hidden']
        assert count == 2 and distinct_count == 1

    def test_run_handles_pagination(self):
        """ Verify that run supports paginated queries. """
        course_1 = CourseFactory()
        for _ in range(5):
            CourseRunFactory(title='foo', course=course_1)
        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)
        query.run()
        all_results = query._results
        assert len(all_results) == 5
        # Re-run the same query limited to results [1, 3) and verify the slice matches the
        # corresponding slice of the unpaginated result set.
        query._reset()
        query.set_limits(low=1, high=3)
        query.run()
        paginated_results = query._results
        assert len(paginated_results) == 2
        expected = sorted([run.key for run in all_results[1:3]])
        actual = sorted([run.key for run in paginated_results])
        assert expected == actual

    def test_run_raises_when_validation_fails(self):
        """ Verify that run raises an exception when the Query is misconfigured. """
        with mock.patch.object(DistinctCountsSearchQuery, 'validate') as mock_validate:
            mock_validate.side_effect = RuntimeError('validation failed')
            with pytest.raises(RuntimeError) as err:
                DistinctCountsSearchQuery().run()
            assert str(err.value) == 'validation failed'

    def test_validate_raises_when_configured_with_more_like_this_query(self):
        """ Verify that validate raises when Query configured with more_like_this query."""
        query = DistinctCountsSearchQuery()
        query._more_like_this = True
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_validate_raises_when_configured_with_raw_query(self):
        """ Verify that validate raises when Query configured with raw query."""
        # The raw_search method on DistinctCountsSearchQuery raises, so configure a raw query
        # on a normal ElasticsearchSearchQuery and then clone it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.raw_search('title:foo')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support raw queries' in str(err.value)

    def test_validate_raises_when_configured_with_date_facet(self):
        """ Verify that validate raises when Query configured with date facet."""
        now = datetime.datetime.now()
        # The add_date_facet method on DistinctCountsSearchQuery raises, so configure a date facet
        # on a normal ElasticsearchSearchQuery and then clone it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.add_date_facet('start', now - datetime.timedelta(days=10), now + datetime.timedelta(days=10), 'day')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support date facets' in str(err.value)

    def test_validate_raises_when_configured_with_facet_with_unsupported_options(self):
        """ Verify that validate raises when Query configured with facet with unsupported options."""
        # The add_field_facet method on DistinctCountsSearchQuery raises when unsupported options are passed,
        # so configure a field facet with those options on a normal ElasticsearchSearchQuery and then clone
        # it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.add_field_facet('pacing_type', order='term')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'only supports a limited set of field facet options' in str(err.value)

    def test_validate_raises_when_configured_without_aggregation_key(self):
        """ Verify that validate raises when Query configured without aggregation_key."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run()
        assert str(err.value) == 'aggregation_key is required.'

    def test_more_like_this_raises(self):
        """ Verify that more_like_this raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().more_like_this()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_run_mlt_raises(self):
        """ Verify that run_mlt raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run_mlt()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_raw_search_raises(self):
        """ Verify that raw_search raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().raw_search()
        assert 'does not support raw queries' in str(err.value)

    def test_run_raw_raises(self):
        """ Verify that run_raw raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run_raw()
        assert 'does not support raw queries' in str(err.value)

    def test_add_date_facet_raises(self):
        """ Verify that add_date_facet raises an exception. """
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().add_date_facet()
        assert 'does not support date facets' in str(err.value)

    def test_add_field_facet_validates_options(self):
        """ Verify that add_field_facet validates the provided options."""
        query = DistinctCountsSearchQuery()
        with pytest.raises(RuntimeError) as err:
            query.add_field_facet('pacing_type', order='term')
        assert 'only supports a limited set of field facet options' in str(err.value)
        # Supported options pass through; haystack stores the facet under the '_exact' field name.
        query.add_field_facet('pacing_type', size=5)
        assert query.facets['pacing_type_exact']['size'] == 5
class DistinctCountsElasticsearchBackendWrapperTests(ElasticsearchTestMixin, TestCase):
    """ Tests for DistinctCountsElasticsearchBackendWrapper. """

    def test_search_raises_when_called_with_date_facet(self):
        """ Verify that search raises when the query includes a date facet. """
        now = datetime.datetime.now()
        one_day = datetime.timedelta(days=1)
        qs = SearchQuerySet().date_facet('start', now - one_day, now + one_day, 'day')
        query_str = qs.query.build_query()
        search_params = qs.query.build_params()
        wrapper = DistinctCountsElasticsearchBackendWrapper(qs.query.backend, 'aggregation_key')
        with pytest.raises(RuntimeError) as excinfo:
            wrapper.search(query_str, **search_params)
        assert 'does not support date facets' in str(excinfo.value)

    def test_search_raises_when_called_with_unsupported_field_facet_option(self):
        """ Verify that search raises when a field facet uses an unsupported option. """
        qs = SearchQuerySet().facet('pacing_type', order='term')
        query_str = qs.query.build_query()
        search_params = qs.query.build_params()
        wrapper = DistinctCountsElasticsearchBackendWrapper(qs.query.backend, 'aggregation_key')
        with pytest.raises(RuntimeError) as excinfo:
            wrapper.search(query_str, **search_params)
        assert 'field facet with unsupported options' in str(excinfo.value)

    def test_build_search_kwargs_does_not_include_facet_clause(self):
        """ Verify that a facets clause is not included with search kwargs."""
        qs = SearchQuerySet().query_facet('hidden', 'hidden:true').facet('pacing_type')
        query_str = qs.query.build_query()
        search_params = qs.query.build_params()
        wrapper = DistinctCountsElasticsearchBackendWrapper(qs.query.backend, 'aggregation_key')
        built_kwargs = wrapper._build_search_kwargs(query_str, **search_params)
        # Facets must have been converted to aggregations rather than passed through.
        assert 'facets' not in built_kwargs
        assert 'aggregations' in built_kwargs
import datetime
import pytest
from django.test import TestCase
from haystack.query import SearchQuerySet
from course_discovery.apps.core.tests.mixins import ElasticsearchTestMixin
from course_discovery.apps.course_metadata.models import CourseRun
from course_discovery.apps.course_metadata.tests.factories import CourseFactory, CourseRunFactory
from course_discovery.apps.edx_haystack_extensions.distinct_counts.backends import DistinctCountsSearchQuery
from course_discovery.apps.edx_haystack_extensions.distinct_counts.query import DistinctCountsSearchQuerySet
class DistinctCountsSearchQuerySetTests(ElasticsearchTestMixin, TestCase):
    """ Integration tests for DistinctCountsSearchQuerySet, run against a live Elasticsearch test index. """

    def test_from_queryset(self):
        """ Verify that a DistinctCountsSearchQuerySet can be built from an existing SearchQuerySet."""
        course_1 = CourseFactory()
        CourseRunFactory(title='foo', course=course_1)
        CourseRunFactory(title='foo', course=course_1)
        course_2 = CourseFactory()
        CourseRunFactory(title='foo', course=course_2)
        CourseRunFactory(title='bar', course=course_2)
        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)
        # Converting the queryset should not change which records it matches.
        expected = sorted([run.key for run in queryset])
        actual = sorted([run.key for run in dc_queryset])
        assert expected == actual

    def test_with_distinct_counts(self):
        """
        Verify that the query object is converted to a DistinctCountsSearchQuery and the aggregation_key is
        configured properly.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        assert isinstance(dc_queryset.query, DistinctCountsSearchQuery)
        assert dc_queryset.query.aggregation_key == 'aggregation_key'

    def test_with_distinct_counts_raises_when_queryset_includes_unsupported_options(self):
        """
        Verify that an error is raised if the original queryset includes options that are not supported by our
        custom Query class.
        """
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())
        # Date facets are rejected by DistinctCountsSearchQuery.validate().
        with pytest.raises(RuntimeError) as err:
            now = datetime.datetime.now()
            ten_days = datetime.timedelta(days=10)
            start = now - ten_days
            end = now + ten_days
            dc_queryset.date_facet('start', start, end, 'day').with_distinct_counts('aggregation_key')
        assert str(err.value) == 'DistinctCountsSearchQuery does not support date facets.'
        # Unsupported field facet options (e.g. 'order') are also rejected.
        with pytest.raises(RuntimeError) as err:
            dc_queryset.facet('pacing_type', order='term').with_distinct_counts('aggregation_key')
        assert 'DistinctCountsSearchQuery only supports a limited set of field facet options.' in str(err.value)

    def test_distinct_count_returns_cached_distinct_count(self):
        """ Verify that distinct_count returns the cached distinct_result_count when present."""
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        dc_queryset._distinct_result_count = 123  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 123

    def test_distinct_count_runs_query_when_cache_is_empty(self):
        """ Verify that distinct_count runs the query, caches, and returns the distinct_count when cache is empty."""
        # Three matching runs spanning two courses: the distinct (per-course) count should be 2.
        course_1 = CourseFactory()
        CourseRunFactory(title='foo', course=course_1)
        CourseRunFactory(title='foo', course=course_1)
        course_2 = CourseFactory()
        CourseRunFactory(title='foo', course=course_2)
        CourseRunFactory(title='bar', course=course_2)
        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        assert dc_queryset._distinct_result_count is None  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 2
        assert dc_queryset._distinct_result_count == 2  # pylint: disable=protected-access

    def test_distinct_count_raises_when_not_properly_configured(self):
        """
        Verify that distinct_count raises when called without configuring the SearchQuerySet to compute distinct
        counts.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)
        with pytest.raises(RuntimeError) as err:
            dc_queryset.distinct_count()
        assert str(err.value) == 'This SearchQuerySet has not been configured to compute distinct counts.'

    def test_facet_counts_includes_distinct_counts(self):
        """ Verify that facet_counts include distinct counts. """
        course = CourseFactory()
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course)
        # Make sure to add both a field facet and a query facet so that we can be sure that both work.
        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        queryset = queryset.facet('pacing_type').query_facet('hidden', 'hidden:true')
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        facet_counts = dc_queryset.facet_counts()
        # Field facets are expected to be formatted as a list of three-tuples (field_value, count, distinct_count)
        for val, count, distinct_count in facet_counts['fields']['pacing_type']:
            assert val in {'self_paced', 'instructor_paced'}
            if val == 'self_paced':
                assert count == 2
                assert distinct_count == 1
            elif val == 'instructor_paced':
                assert count == 1
                assert distinct_count == 1
        # Query facets are expected to be formatted as a dictionary mapping facet_names to two-tuples (count,
        # distinct_count)
        hidden_count, hidden_distinct_count = facet_counts['queries']['hidden']
        assert hidden_count == 2
        assert hidden_distinct_count == 1
......@@ -430,7 +430,7 @@ HAYSTACK_CONNECTIONS = {
},
}
# We do not use the RealtimeSignalProcessor here to avoid overloading our
# Elasticsearch instance when running the refresh_course_metadata command
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor'
HAYSTACK_INDEX_RETENTION_LIMIT = 3
......@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
SEARCH_FACET_LIMIT = 10000
# Precision settings for the elasticsearch cardinality aggregations used to compute distinct hit and facet counts.
# The elasticsearch cardinality aggregation is not guaranteed to produce accurate results. Accuracy is configurable via
# an optional precision_threshold setting. Cardinality aggregations for queries that produce fewer results than the
# precision threshold can be expected to be pretty accurate. Cardinality aggregations for queries that produce more
# results than the precision_threshold will be less accurate. Setting a higher value for precision_threshold requires
# a memory tradeoff of roughly precision_threshold * 8 bytes. See the elasticsearch docs for more details:
# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html
#
# We use a higher value for hit precision than for facet precision for two reasons:
# 1.) The hit count is more visible to users than the facet counts.
# 2.) The performance penalty for having a higher hit precision is less than the penalty for a higher facet
# precision, since the hit count only requires a single aggregation.
DISTINCT_COUNTS_HIT_PRECISION = 1500
DISTINCT_COUNTS_FACET_PRECISION = 250
DEFAULT_PARTNER_ID = None
# See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment