Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
C
course-discovery
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
course-discovery
Commits
d72f5077
Commit
d72f5077
authored
Feb 28, 2017
by
Anthony Mangano
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add ability to compute distinct hit and facet counts
ECOM-6815
parent
5f25faf1
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
191 additions
and
0 deletions
+191
-0
course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
+0
-0
course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
+0
-0
course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
+51
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
+0
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
+0
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
+125
-0
course_discovery/settings/base.py
+15
-0
No files found.
course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
0 → 100644
View file @
d72f5077
course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
0 → 100644
View file @
d72f5077
This diff is collapsed.
Click to expand it.
course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
0 → 100644
View file @
d72f5077
from
haystack.query
import
SearchQuerySet
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.backends
import
DistinctCountsSearchQuery
class DistinctCountsSearchQuerySet(SearchQuerySet):
    """SearchQuerySet subclass that computes and caches the distinct hit count and distinct facet counts."""

    @staticmethod
    def from_queryset(queryset):
        """Return a DistinctCountsSearchQuerySet cloned from the given SearchQuerySet."""
        target_class = DistinctCountsSearchQuerySet
        return queryset._clone(klass=target_class)  # pylint: disable=protected-access

    def __init__(self, **kwargs):
        """
        Set up the queryset and start with an empty distinct-count cache.

        Overrides SearchQuerySet.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/query.py#L24
        """
        super(DistinctCountsSearchQuerySet, self).__init__(**kwargs)
        # Populated lazily by distinct_count(); None means "not computed yet".
        self._distinct_result_count = None

    def with_distinct_counts(self, aggregation_key):
        """
        Return a clone of this queryset whose query computes distinct counts.

        Arguments:
            aggregation_key (str): Name of the field used to group records when computing distinct counts.
                Use a field that is NOT analyzed by the index (such as one of the faceted _exact fields).
                Analyzed fields are broken down by the search backend, so records would be grouped by
                substrings of the field and the resulting counts would be inaccurate.
        """
        queryset = self._clone()

        # Swap the cloned query for a DistinctCountsSearchQuery copy, then configure and validate that copy.
        distinct_query = queryset.query._clone(DistinctCountsSearchQuery)  # pylint: disable=protected-access
        queryset.query = distinct_query
        distinct_query.aggregation_key = aggregation_key
        distinct_query.validate()

        return queryset

    def distinct_count(self):
        """
        Return the distinct hit count, computing and caching it on first use.

        Note: Raises a RuntimeError if this SearchQuerySet has not been configured to compute distinct counts.
        Calling this will also force the query to run if it hasn't already.
        """
        if not isinstance(self.query, DistinctCountsSearchQuery):
            raise RuntimeError('This SearchQuerySet has not been configured to compute distinct counts.')

        cached_count = self._distinct_result_count
        if cached_count is None:
            cached_count = self.query.get_distinct_count()
            self._distinct_result_count = cached_count

        return cached_count
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
0 → 100644
View file @
d72f5077
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
0 → 100644
View file @
d72f5077
This diff is collapsed.
Click to expand it.
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
0 → 100644
View file @
d72f5077
import
datetime
import
pytest
from
django.test
import
TestCase
from
haystack.query
import
SearchQuerySet
from
course_discovery.apps.core.tests.mixins
import
ElasticsearchTestMixin
from
course_discovery.apps.course_metadata.models
import
CourseRun
from
course_discovery.apps.course_metadata.tests.factories
import
CourseFactory
,
CourseRunFactory
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.backends
import
DistinctCountsSearchQuery
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.query
import
DistinctCountsSearchQuerySet
class DistinctCountsSearchQuerySetTests(ElasticsearchTestMixin, TestCase):
    """Tests for DistinctCountsSearchQuerySet."""

    @staticmethod
    def _create_course_with_runs(*titles):
        """Create one course, then one course run per title (in order) attached to it."""
        course = CourseFactory()
        for title in titles:
            CourseRunFactory(title=title, course=course)
        return course

    def test_from_queryset(self):
        """ Verify that a DistinctCountsSearchQuerySet can be built from an existing SearchQuerySet."""
        self._create_course_with_runs('foo', 'foo')
        self._create_course_with_runs('foo', 'bar')

        base_queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(base_queryset)

        # Both querysets must yield the same course runs, regardless of ordering.
        expected_keys = sorted(run.key for run in base_queryset)
        actual_keys = sorted(run.key for run in distinct_queryset)
        assert expected_keys == actual_keys

    def test_with_distinct_counts(self):
        """
        Verify that the query object is converted to a DistinctCountsSearchQuery and the aggregation_key is
        configured properly.
        """
        base_queryset = SearchQuerySet()
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(base_queryset)
        distinct_queryset = distinct_queryset.with_distinct_counts('aggregation_key')

        query = distinct_queryset.query
        assert isinstance(query, DistinctCountsSearchQuery)
        assert query.aggregation_key == 'aggregation_key'

    def test_with_distinct_counts_raises_when_queryset_includes_unsupported_options(self):
        """
        Verify that an error is raised if the original queryset includes options that are not supported by our
        custom Query class.
        """
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())

        # Date facets are rejected.
        now = datetime.datetime.now()
        ten_days = datetime.timedelta(days=10)
        window_start = now - ten_days
        window_end = now + ten_days
        with pytest.raises(RuntimeError) as date_facet_err:
            date_faceted = distinct_queryset.date_facet('start', window_start, window_end, 'day')
            date_faceted.with_distinct_counts('aggregation_key')
        assert str(date_facet_err.value) == 'DistinctCountsSearchQuery does not support date facets.'

        # Field facets with extra options (here: order) are rejected.
        with pytest.raises(RuntimeError) as field_facet_err:
            field_faceted = distinct_queryset.facet('pacing_type', order='term')
            field_faceted.with_distinct_counts('aggregation_key')
        expected_fragment = 'DistinctCountsSearchQuery only supports a limited set of field facet options.'
        assert expected_fragment in str(field_facet_err.value)

    def test_distinct_count_returns_cached_distinct_count(self):
        """ Verify that distinct_count returns the cached distinct_result_count when present."""
        base_queryset = SearchQuerySet()
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(base_queryset)
        distinct_queryset = distinct_queryset.with_distinct_counts('aggregation_key')

        # Seed the cache directly; distinct_count() should return it as-is.
        distinct_queryset._distinct_result_count = 123  # pylint: disable=protected-access
        assert distinct_queryset.distinct_count() == 123

    def test_distinct_count_runs_query_when_cache_is_empty(self):
        """ Verify that distinct_count runs the query, caches, and returns the distinct_count when cache is empty."""
        self._create_course_with_runs('foo', 'foo')
        self._create_course_with_runs('foo', 'bar')

        base_queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(base_queryset)
        distinct_queryset = distinct_queryset.with_distinct_counts('aggregation_key')

        assert distinct_queryset._distinct_result_count is None  # pylint: disable=protected-access
        assert distinct_queryset.distinct_count() == 2
        assert distinct_queryset._distinct_result_count == 2  # pylint: disable=protected-access

    def test_distinct_count_raises_when_not_properly_configured(self):
        """
        Verify that distinct_count raises when called without configuring the SearchQuerySet to compute distinct
        counts.
        """
        unconfigured_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())

        with pytest.raises(RuntimeError) as err:
            unconfigured_queryset.distinct_count()

        expected_message = 'This SearchQuerySet has not been configured to compute distinct counts.'
        assert str(err.value) == expected_message

    def test_facet_counts_includes_distinct_counts(self):
        """ Verify that facet_counts include distinct counts. """
        course = CourseFactory()
        run_attributes = (
            ('self_paced', True),
            ('self_paced', True),
            ('instructor_paced', False),
        )
        for pacing_type, hidden in run_attributes:
            CourseRunFactory(title='foo', pacing_type=pacing_type, hidden=hidden, course=course)

        # Make sure to add both a field facet and a query facet so that we can be sure that both work.
        base_queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        base_queryset = base_queryset.facet('pacing_type').query_facet('hidden', 'hidden:true')
        distinct_queryset = DistinctCountsSearchQuerySet.from_queryset(base_queryset)
        distinct_queryset = distinct_queryset.with_distinct_counts('aggregation_key')
        facet_counts = distinct_queryset.facet_counts()

        # Field facets are expected to be formatted as a list of three-tuples (field_value, count, distinct_count)
        expected_by_pacing_type = {
            'self_paced': (2, 1),
            'instructor_paced': (1, 1),
        }
        for value, count, distinct_count in facet_counts['fields']['pacing_type']:
            assert value in expected_by_pacing_type
            expected_count, expected_distinct_count = expected_by_pacing_type[value]
            assert count == expected_count
            assert distinct_count == expected_distinct_count

        # Query facets are expected to be formatted as a dictionary mapping facet_names to two-tuples (count,
        # distinct_count)
        hidden_count, hidden_distinct_count = facet_counts['queries']['hidden']
        assert hidden_count == 2
        assert hidden_distinct_count == 1
course_discovery/settings/base.py
View file @
d72f5077
...
@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
...
@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
SEARCH_FACET_LIMIT
=
10000
SEARCH_FACET_LIMIT
=
10000
# Precision settings for the elasticsearch cardinality aggregations used to compute distinct hit and facet counts.
# The elasticsearch cardinality aggregation is not guaranteed to produce accurate results. Accuracy is configurable via
# an optional precision_threshold setting. Cardinality aggregations for queries that produce fewer results than the
# precision threshold can be expected to be pretty accurate. Cardinality aggregations for queries that produce more
# results than the precision_threshold will be less accurate. Setting a higher value for precision_threshold requires
# a memory tradeoff of roughly precision_threshold * 8 bytes. See the elasticsearch docs for more details:
# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html
#
# We use a higher value for hit precision than for facet precision for two reasons:
# 1.) The hit count is more visible to users than the facet counts.
# 2.) The performance penalty for having a higher hit precision is less than the penalty for a higher facet
# precision, since the hit count only requires a single aggregation.
DISTINCT_COUNTS_HIT_PRECISION
=
1500
DISTINCT_COUNTS_FACET_PRECISION
=
250
DEFAULT_PARTNER_ID
=
None
DEFAULT_PARTNER_ID
=
None
# See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id
# See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment