Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
C
course-discovery
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
course-discovery
Commits
d72f5077
Commit
d72f5077
authored
Feb 28, 2017
by
Anthony Mangano
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add ability to compute distinct hit and facet counts
ECOM-6815
parent
5f25faf1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
765 additions
and
1 deletions
+765
-1
course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
+0
-0
course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
+303
-0
course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
+51
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
+0
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
+270
-0
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
+125
-0
course_discovery/settings/base.py
+16
-1
No files found.
course_discovery/apps/edx_haystack_extensions/distinct_counts/__init__.py
0 → 100644
View file @
d72f5077
course_discovery/apps/edx_haystack_extensions/distinct_counts/backends.py
0 → 100644
View file @
d72f5077
import
elasticsearch
from
django.conf
import
settings
from
haystack.backends.elasticsearch_backend
import
ElasticsearchSearchQuery
from
haystack.models
import
SearchResult
class DistinctCountsSearchQuery(ElasticsearchSearchQuery):
    """ Custom Haystack Query class that computes and caches distinct hit and facet counts for a query."""

    def __init__(self, **kwargs):
        """
        Create and return a new instance of DistinctCountsSearchQuery.

        Overrides BaseSearchQuery.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L443
        """
        super(DistinctCountsSearchQuery, self).__init__(**kwargs)
        # Field used to group records when computing distinct counts. Must be set by the
        # caller before the query is run; validate() enforces this.
        self.aggregation_key = None
        # Cached distinct hit count; populated by run().
        self._distinct_hit_count = None

    def _clone(self, **kwargs):
        """
        Create and return a new DistinctCountsSearchQuery with fields set to match those on the original object.

        Overrides BaseSearchQuery._clone from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L981
        """
        clone = super(DistinctCountsSearchQuery, self)._clone(**kwargs)
        # _clone may be asked (via klass) to produce an instance of a different Query class;
        # only copy the custom fields when the clone is actually a DistinctCountsSearchQuery.
        if isinstance(clone, DistinctCountsSearchQuery):
            clone.aggregation_key = self.aggregation_key
            clone._distinct_hit_count = self._distinct_hit_count  # pylint: disable=protected-access
        return clone

    def get_distinct_count(self):
        """
        Return the distinct hit count for this query. Calling this method will cause the query to execute if
        it hasn't already been run.
        """
        if self._distinct_hit_count is None:
            self.run()
        return self._distinct_hit_count

    def run(self, spelling_query=None, **kwargs):
        """
        Run the query and cache the results.

        Overrides and re-implements ElasticsearchSearchQuery.run from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/elasticsearch_backend.py#L941
        """
        # Make sure that the Query is valid before running it.
        self.validate()

        final_query = self.build_query()
        search_kwargs = self.build_params(spelling_query)
        if kwargs:
            search_kwargs.update(kwargs)

        # Use the DistinctCountsElasticsearchBackendWrapper to execute the query so that distinct hit and query
        # counts may be computed.
        backend = DistinctCountsElasticsearchBackendWrapper(self.backend, self.aggregation_key)
        results = backend.search(final_query, **search_kwargs)

        # Cache everything the base class normally caches, plus the distinct hit count.
        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)
        self._distinct_hit_count = results.get('distinct_hits', 0)
        self._facet_counts = self.post_process_facets(results)
        self._spelling_suggestion = results.get('spelling_suggestion', None)

    def validate(self):
        """ Verify that all Query options are valid and supported by this custom Query class.

        Raises:
            RuntimeError: If the query is configured with an unsupported option
                (more_like_this, raw query, date facets, unsupported field facet
                options) or if aggregation_key has not been set.
        """
        if self._more_like_this:
            raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

        if self._raw_query:
            raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

        if self.date_facets:
            raise RuntimeError('DistinctCountsSearchQuery does not support date facets.')

        if self.facets:
            for field, options in self.facets.items():
                self._validate_field_facet_options(field, options)

        if self.aggregation_key is None:
            raise RuntimeError('aggregation_key is required.')

    def _validate_field_facet_options(self, field, options):
        """ Verify that the provided field facet options are valid and can be converted to an aggregation."""
        supported_options = DistinctCountsElasticsearchBackendWrapper.SUPPORTED_FIELD_FACET_OPTIONS
        for option, __ in options.items():
            if option not in supported_options:
                # Fix: the first fragment ends with a trailing space so that the two
                # implicitly-concatenated literals do not render as "...options.Field: ...".
                msg = (
                    'DistinctCountsSearchQuery only supports a limited set of field facet options. '
                    'Field: {field}, Supported Options: ({supported}), Provided Options: ({provided})'
                ).format(
                    field=field,
                    supported=','.join(supported_options),
                    provided=','.join(options.keys())
                )
                raise RuntimeError(msg)

    def more_like_this(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support more_like_this queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

    def run_mlt(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support more_like_this queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support more_like_this queries.')

    def raw_search(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support raw queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

    def run_raw(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support raw queries."""
        raise RuntimeError('DistinctCountsSearchQuery does not support raw queries.')

    def add_date_facet(self, *args, **kwargs):  # pylint: disable=unused-argument
        """ Raise an exception since we do not currently want/need to support date facets."""
        raise RuntimeError('DistinctCountsSearchQuery does not support date facets.')

    def add_field_facet(self, field, **options):
        """
        Add a field facet to the Query. Raise an error if any unsupported options are provided.

        Overrides BaseSearchQuery.add_field_facet from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/__init__.py#L897
        """
        # Validate eagerly so misconfigured facets fail at build time rather than at run().
        self._validate_field_facet_options(field, options)
        return super(DistinctCountsSearchQuery, self).add_field_facet(field, **options)
class DistinctCountsElasticsearchBackendWrapper(object):
    """
    Custom backend-like class that enables the computation of distinct hit and facet counts during search queries.

    This class is not meant to be a true ElasticsearchSearchBackend. It is meant to wrap an existing
    ElasticsearchSearchBackend instance and expose a very limited subset of backend functionality.
    """

    # The options that are supported for building field facet aggregations.
    SUPPORTED_FIELD_FACET_OPTIONS = {'size'}

    # The default size for field facet aggregations. This is the same value used by haystack.
    DEFAULT_FIELD_FACET_SIZE = 100

    def __init__(self, backend, aggregation_key):
        """
        Initialize a new instance of the DistinctCountsElasticsearchBackendWrapper.

        Arguments:
            backend (ElasticsearchSearchBackend)
            aggregation_key (str): The field that should be used to group records when computing distinct counts.
                It should be a field that is NOT analyzed by the index (like one of the faceted _exact fields).
                Using a field that is analyzed will result in inaccurate counts, as analyzed fields are broken down by
                the search backend and will result in records being grouped by substrings of the aggregation_key field.
        """
        self.backend = backend
        self.aggregation_key = aggregation_key
        # Name under which the cardinality aggregation appears in the ES response;
        # _process_results uses it to tell the distinct-hit aggregation apart from facets.
        self.aggregation_name = 'distinct_{}'.format(aggregation_key)

    def search(self, query_string, **kwargs):
        """
        Run a search query and return the results.

        Re-implements ElasticsearchSearchBackend.search from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/backends/elasticsearch_backend.py#L495
        """
        # An empty query string can never match anything; short-circuit without hitting ES.
        if len(query_string) == 0:
            return {'results': [], 'hits': 0, 'distinct_hits': 0}

        if not self.backend.setup_complete:
            self.backend.setup()

        search_kwargs = self._build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        # Collect the fields used for sorting so we can tell whether this is a geo-distance sort.
        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)

        geo_sort = '_geo_distance' in order_fields

        # Translate haystack's start/end offsets into an ES page size.
        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.backend.conn.search(
                body=search_kwargs,
                index=self.backend.index_name,
                doc_type='modelresult',
                _source=True
            )
        except elasticsearch.TransportError as e:
            # Mirror the wrapped backend's silently_fail behavior: either re-raise or
            # log the failure and fall through with empty results.
            if not self.backend.silently_fail:
                raise

            self.backend.log.error('Failed to query Elasticsearch using "%s": %s', query_string, e, exc_info=True)
            raw_results = {}

        return self._process_results(
            raw_results,
            highlight=kwargs.get('highlight'),
            result_class=kwargs.get('result_class', SearchResult),
            distance_point=kwargs.get('distance_point'),
            geo_sort=geo_sort
        )

    def _build_search_kwargs(self, *args, **kwargs):
        """ Build and return the arguments for the elasticsearch query.

        Converts haystack-style facet kwargs into ES aggregations (each with a nested
        cardinality aggregation) and always adds a top-level cardinality aggregation
        for the distinct hit count.
        """
        aggregations = self._build_cardinality_aggregation(precision=settings.DISTINCT_COUNTS_HIT_PRECISION)

        # Note: the facet kwargs are pop()ed off so the wrapped backend does not also
        # build legacy facet clauses for them.
        if 'facets' in kwargs:
            aggregations.update(self._build_field_facet_aggregations(
                facet_dict=kwargs.pop('facets', {}),
                precision=settings.DISTINCT_COUNTS_FACET_PRECISION
            ))

        if 'query_facets' in kwargs:
            aggregations.update(self._build_query_facet_aggregations(
                facet_list=kwargs.pop('query_facets', []),
                precision=settings.DISTINCT_COUNTS_FACET_PRECISION
            ))

        if 'date_facets' in kwargs:
            raise RuntimeError('DistinctCountsElasticsearchBackendWrapper does not support date facets.')

        search_kwargs = self.backend.build_search_kwargs(*args, **kwargs)
        search_kwargs['aggregations'] = aggregations
        return search_kwargs

    def _build_cardinality_aggregation(self, precision=None):
        """
        Build and return a cardinality aggregation using the configured aggregation_key.

        The elasticsearch cardinality aggregation does not guarantee accurate results. Accuracy
        is configurable via an optional precision_threshold argument. See
        https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html

        Arguments:
            precision (int): a numeric value below which counts computed by the cardinality aggregation can
                be expected to be close to accurate. Setting this value requires a memory tradeoff of
                about (precision * 8) bytes.
        """
        aggregation = {self.aggregation_name: {'cardinality': {'field': self.aggregation_key}}}
        if precision is not None:
            aggregation[self.aggregation_name]['cardinality']['precision_threshold'] = precision
        return aggregation

    def _build_field_facet_aggregations(self, facet_dict, precision=None):
        """ Build and return a dictionary of aggregations for field facets.

        Each field facet becomes a terms aggregation with a nested cardinality
        aggregation so that a distinct count is computed per bucket.
        """
        aggregations = {}
        for facet_fieldname, opts in facet_dict.items():
            for opt, __ in opts.items():
                if opt not in self.SUPPORTED_FIELD_FACET_OPTIONS:
                    opts_str = ','.join(opts.keys())
                    msg = 'Cannot build aggregation for field facet with unsupported options: {}'.format(opts_str)
                    raise RuntimeError(msg)

            aggregations[facet_fieldname] = {
                'terms': {'field': facet_fieldname, 'size': opts.get('size', self.DEFAULT_FIELD_FACET_SIZE)},
                'aggregations': self._build_cardinality_aggregation(precision=precision),
            }
        return aggregations

    def _build_query_facet_aggregations(self, facet_list, precision=None):
        """ Build and return a dictionary of aggregations for query facets.

        Each (name, query-string) pair becomes a filter aggregation with a nested
        cardinality aggregation for its distinct count.
        """
        aggregations = {}
        for facet_fieldname, value in facet_list:
            aggregations[facet_fieldname] = {
                'filter': {'query': {'query_string': {'query': value}}},
                'aggregations': self._build_cardinality_aggregation(precision=precision),
            }
        return aggregations

    def _process_results(self, raw_results, **kwargs):
        """ Process the query results into a form that is more easily consumable by the client.

        Returns the wrapped backend's processed results, augmented with a 'distinct_hits'
        count and a 'facets' dict whose entries include per-facet distinct counts.
        """
        results = self.backend._process_results(raw_results, **kwargs)  # pylint: disable=protected-access

        aggregations = raw_results['aggregations']

        # Process the distinct hit count
        results['distinct_hits'] = aggregations[self.aggregation_name]['value']

        # Process the remaining aggregations, which should all be for facets.
        facets = {'fields': {}, 'dates': {}, 'queries': {}}
        for name, data in aggregations.items():
            # The distinct hit count for the overall query was already processed.
            if name == self.aggregation_name:
                continue

            # Field facets: terms aggregations are recognizable by their 'buckets' key.
            elif 'buckets' in data:
                buckets = data['buckets']
                facets['fields'][name] = [
                    # Extract the facet name, count, and distinct_count
                    (bucket['key'], bucket['doc_count'], bucket[self.aggregation_name]['value'])
                    for bucket in buckets
                ]

            # Query facets:
            else:
                # Extract the facet name, count, and distinct_count
                facets['queries'][name] = (data['doc_count'], data[self.aggregation_name]['value'])

        results['facets'] = facets
        return results
course_discovery/apps/edx_haystack_extensions/distinct_counts/query.py
0 → 100644
View file @
d72f5077
from
haystack.query
import
SearchQuerySet
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.backends
import
DistinctCountsSearchQuery
class DistinctCountsSearchQuerySet(SearchQuerySet):
    """Custom SearchQuerySet class that can compute and cache distinct hit and facet counts for a query."""

    @staticmethod
    def from_queryset(queryset):
        """ Builds a DistinctCountsSearchQuerySet from an existing SearchQuerySet."""
        return queryset._clone(klass=DistinctCountsSearchQuerySet)  # pylint: disable=protected-access

    def __init__(self, **kwargs):
        """
        Initialize a new instance of the DistinctCountsSearchQuerySet.

        Overrides SearchQuerySet.__init__ from:
        https://github.com/django-haystack/django-haystack/blob/v2.5.0/haystack/query.py#L24
        """
        super(DistinctCountsSearchQuerySet, self).__init__(**kwargs)
        # Cached distinct result count, filled in lazily by distinct_count().
        self._distinct_result_count = None

    def with_distinct_counts(self, aggregation_key):
        """
        Adds distinct_count aggregations to the Query.

        Arguments:
            aggregation_key (str): The field that should be used to group records when computing distinct counts.
                It should be a field that is NOT analyzed by the index (like one of the faceted _exact fields).
                Using a field that is analyzed will result in inaccurate counts, as analyzed fields are broken down by
                the search backend and will result in records being grouped by substrings of the aggregation_key field.
        """
        new_queryset = self._clone()
        # Swap the underlying query object for a DistinctCountsSearchQuery so that
        # cardinality aggregations are added when the query runs.
        converted_query = new_queryset.query._clone(klass=DistinctCountsSearchQuery)  # pylint: disable=protected-access
        converted_query.aggregation_key = aggregation_key
        # Fail fast if the existing query carries options the custom class cannot support.
        converted_query.validate()
        new_queryset.query = converted_query
        return new_queryset

    def distinct_count(self):
        """
        Return the distinct hit count.

        Note: This will raise an error if the SearchQuerySet has not been configured to compute distinct counts. It
        will also force the query to run if it hasn't already.
        """
        if not isinstance(self.query, DistinctCountsSearchQuery):
            raise RuntimeError('This SearchQuerySet has not been configured to compute distinct counts.')

        if self._distinct_result_count is not None:
            return self._distinct_result_count

        self._distinct_result_count = self.query.get_distinct_count()
        return self._distinct_result_count
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/__init__.py
0 → 100644
View file @
d72f5077
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_backends.py
0 → 100644
View file @
d72f5077
import
datetime
import
mock
import
pytest
from
django.test
import
TestCase
from
haystack.backends
import
SQ
from
haystack.backends.elasticsearch_backend
import
ElasticsearchSearchQuery
from
haystack.query
import
SearchQuerySet
from
course_discovery.apps.core.tests.mixins
import
ElasticsearchTestMixin
from
course_discovery.apps.course_metadata.models
import
CourseRun
from
course_discovery.apps.course_metadata.tests.factories
import
CourseFactory
,
CourseRunFactory
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.backends
import
(
DistinctCountsElasticsearchBackendWrapper
,
DistinctCountsSearchQuery
)
# pylint: disable=protected-access
class DistinctCountsSearchQueryTests(ElasticsearchTestMixin, TestCase):
    """ Tests for DistinctCountsSearchQuery, run against a live test Elasticsearch index. """

    def test_clone(self):
        """ Verify that clone copies all fields, including the aggregation_key and distinct_hit_count."""
        query = DistinctCountsSearchQuery()
        query.add_field_facet('pacing_type')
        query.aggregation_key = 'aggregation_key'
        query._distinct_hit_count = 123

        clone = query._clone()
        assert query.facets == clone.facets
        assert query.aggregation_key == clone.aggregation_key  # pylint: disable=no-member
        assert query._distinct_hit_count == clone._distinct_hit_count  # pylint: disable=no-member

    def test_clone_with_different_class(self):
        """ Verify that clone does not copy aggregation_key and distinct_hit_count when using different class."""
        query = DistinctCountsSearchQuery()
        query.add_field_facet('pacing_type')
        query.aggregation_key = 'aggregation_key'
        query._distinct_hit_count = 123

        clone = query._clone(klass=ElasticsearchSearchQuery)
        assert isinstance(clone, ElasticsearchSearchQuery)
        assert query.facets == clone.facets
        # The custom fields must not leak onto instances of the plain Query class.
        assert not hasattr(clone, 'aggregation_key')
        assert not hasattr(clone, '_distinct_hit_count')

    def test_get_distinct_count_returns_cached_value(self):
        """ Verify that get_distinct_count returns the distinct_count from the cache when present."""
        query = DistinctCountsSearchQuery()
        query._distinct_hit_count = 123
        assert query.get_distinct_count() == 123

    def test_get_distinct_count_runs_query_when_cache_empty(self):
        """ Verify that get_distinct_count runs the query and caches/returns the distinct_count."""
        # Two runs of the same course: 2 hits, but only 1 distinct course.
        course = CourseFactory()
        CourseRunFactory(title='foo', course=course)
        CourseRunFactory(title='foo', course=course)

        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)

        assert query._distinct_hit_count is None
        assert query.get_distinct_count() == 1
        assert query._distinct_hit_count == 1

    def test_run_executes_the_query_and_caches_the_results(self):
        """ Verify that run executes the query and caches the results."""
        # course_1 has two matching runs, course_2 has one matching and one non-matching run,
        # so the query should see 3 hits across 2 distinct courses.
        course_1 = CourseFactory()
        run_1 = CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course_1)
        run_2 = CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course_1)

        course_2 = CourseFactory()
        run_3 = CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course_2)
        CourseRunFactory(title='bar', pacing_type='instructor_paced', hidden=False, course=course_2)

        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)
        query.add_field_facet('pacing_type')
        query.add_query_facet('hidden', 'hidden:true')

        # All caches start out empty...
        assert query._distinct_hit_count is None
        assert query._hit_count is None
        assert query._results is None
        assert query._facet_counts is None

        query.run()

        # ...and are populated by run().
        expected_results = sorted([run_1.key, run_2.key, run_3.key])
        actual_results = sorted([run.key for run in query._results])
        assert query._distinct_hit_count == 2
        assert query._hit_count == 3
        assert expected_results == actual_results

        # Field facets are (value, count, distinct_count) three-tuples.
        facet_counts = query._facet_counts
        for field_val, count, distinct_count in facet_counts['fields']['pacing_type']:
            assert field_val in {'self_paced', 'instructor_paced'}
            if field_val == 'self_paced':
                assert count == 2 and distinct_count == 1
            elif field_val == 'instructor_paced':
                assert count == 1 and distinct_count == 1

        # Query facets are (count, distinct_count) two-tuples.
        count, distinct_count = facet_counts['queries']['hidden']
        assert count == 2 and distinct_count == 1

    def test_run_handles_pagination(self):
        """ Verify that run supports paginated queries. """
        course_1 = CourseFactory()
        for _ in range(5):
            CourseRunFactory(title='foo', course=course_1)

        query = DistinctCountsSearchQuery()
        query.aggregation_key = 'aggregation_key'
        query.add_filter(SQ(title='foo'))
        query.add_model(CourseRun)

        query.run()
        all_results = query._results
        assert len(all_results) == 5

        # Re-run the same query limited to results [1, 3) and verify the page matches
        # the corresponding slice of the full result set.
        query._reset()
        query.set_limits(low=1, high=3)
        query.run()
        paginated_results = query._results
        assert len(paginated_results) == 2

        expected = sorted([run.key for run in all_results[1:3]])
        actual = sorted([run.key for run in paginated_results])
        assert expected == actual

    def test_run_raises_when_validation_fails(self):
        """ Verify that run raises an exception when the Query is misconfigured. """
        with mock.patch.object(DistinctCountsSearchQuery, 'validate') as mock_validate:
            mock_validate.side_effect = RuntimeError('validation failed')
            with pytest.raises(RuntimeError) as err:
                DistinctCountsSearchQuery().run()
            assert str(err.value) == 'validation failed'

    def test_validate_raises_when_configured_with_more_like_this_query(self):
        """ Verify that validate raises when Query configured with more_like_this query."""
        query = DistinctCountsSearchQuery()
        query._more_like_this = True
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_validate_raises_when_configured_with_raw_query(self):
        """ Verify that validate raises when Query configured with raw query."""
        # The raw_search method on DistinctCountsSearchQuery raises, so configure a raw query
        # on a normal ElasticsearchSearchQuery and then clone it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.raw_search('title:foo')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support raw queries' in str(err.value)

    def test_validate_raises_when_configured_with_date_facet(self):
        """ Verify that validate raises when Query configured with date facet."""
        now = datetime.datetime.now()
        # The add_date_facet method on DistinctCountsSearchQuery raises, so configure a date facet
        # on a normal ElasticsearchSearchQuery and then clone it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.add_date_facet('start', now - datetime.timedelta(days=10), now + datetime.timedelta(days=10), 'day')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'does not support date facets' in str(err.value)

    def test_validate_raises_when_configured_with_facet_with_unsupported_options(self):
        """ Verify that validate raises when Query configured with facet with unsupported options."""
        # The add_field_facet method on DistinctCountsSearchQuery raises when unsupported options are passed,
        # so configure a field facet with those options on a normal ElasticsearchSearchQuery and then clone
        # it to a DistinctCountsSearchQuery.
        query = ElasticsearchSearchQuery()
        query.add_field_facet('pacing_type', order='term')
        query = query._clone(klass=DistinctCountsSearchQuery)
        query.aggregation_key = 'aggregation_key'
        with pytest.raises(RuntimeError) as err:
            query.validate()
        assert 'only supports a limited set of field facet options' in str(err.value)

    def test_validate_raises_when_configured_without_aggregation_key(self):
        """ Verify that validate raises when Query configured without aggregation_key."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run()
        assert str(err.value) == 'aggregation_key is required.'

    def test_more_like_this_raises(self):
        """ Verify that more_like_this raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().more_like_this()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_run_mlt_raises(self):
        """ Verify that run_mlt raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run_mlt()
        assert 'does not support more_like_this queries' in str(err.value)

    def test_raw_search_raises(self):
        """ Verify that raw_search raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().raw_search()
        assert 'does not support raw queries' in str(err.value)

    def test_run_raw_raises(self):
        """ Verify that run_raw raises an exception."""
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().run_raw()
        assert 'does not support raw queries' in str(err.value)

    def test_add_date_facet_raises(self):
        """ Verify that add_date_facet raises an exception. """
        with pytest.raises(RuntimeError) as err:
            DistinctCountsSearchQuery().add_date_facet()
        assert 'does not support date facets' in str(err.value)

    def test_add_field_facet_validates_options(self):
        """ Verify that add_field_facet validates the provided options."""
        query = DistinctCountsSearchQuery()
        with pytest.raises(RuntimeError) as err:
            query.add_field_facet('pacing_type', order='term')
        assert 'only supports a limited set of field facet options' in str(err.value)

        # A supported option is accepted and stored on the (haystack-suffixed) facet field.
        query.add_field_facet('pacing_type', size=5)
        assert query.facets['pacing_type_exact']['size'] == 5
class DistinctCountsElasticsearchBackendWrapperTests(ElasticsearchTestMixin, TestCase):
    """ Tests for DistinctCountsElasticsearchBackendWrapper. """

    def test_search_raises_when_called_with_date_facet(self):
        """ Verify that search raises when the query params include a date facet. """
        now = datetime.datetime.now()
        one_day = datetime.timedelta(days=1)
        queryset = SearchQuerySet().date_facet('start', now - one_day, now + one_day, 'day')
        querystring = queryset.query.build_query()
        params = queryset.query.build_params()

        backend = DistinctCountsElasticsearchBackendWrapper(queryset.query.backend, 'aggregation_key')
        with pytest.raises(RuntimeError) as err:
            backend.search(querystring, **params)
        assert 'does not support date facets' in str(err.value)

    def test_search_raises_when_called_with_unsupported_field_facet_option(self):
        """ Verify that search raises when a field facet carries an unsupported option. """
        queryset = SearchQuerySet().facet('pacing_type', order='term')
        querystring = queryset.query.build_query()
        params = queryset.query.build_params()

        backend = DistinctCountsElasticsearchBackendWrapper(queryset.query.backend, 'aggregation_key')
        with pytest.raises(RuntimeError) as err:
            backend.search(querystring, **params)
        assert 'field facet with unsupported options' in str(err.value)

    def test_build_search_kwargs_does_not_include_facet_clause(self):
        """ Verify that a facets clause is not included with search kwargs."""
        # Facet params should be converted to aggregations; the legacy 'facets' clause
        # must not survive into the final search kwargs.
        queryset = SearchQuerySet().query_facet('hidden', 'hidden:true').facet('pacing_type')
        querystring = queryset.query.build_query()
        params = queryset.query.build_params()

        backend = DistinctCountsElasticsearchBackendWrapper(queryset.query.backend, 'aggregation_key')
        search_kwargs = backend._build_search_kwargs(querystring, **params)
        assert 'facets' not in search_kwargs
        assert 'aggregations' in search_kwargs
course_discovery/apps/edx_haystack_extensions/tests/test_distinct_counts/test_query.py
0 → 100644
View file @
d72f5077
import
datetime
import
pytest
from
django.test
import
TestCase
from
haystack.query
import
SearchQuerySet
from
course_discovery.apps.core.tests.mixins
import
ElasticsearchTestMixin
from
course_discovery.apps.course_metadata.models
import
CourseRun
from
course_discovery.apps.course_metadata.tests.factories
import
CourseFactory
,
CourseRunFactory
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.backends
import
DistinctCountsSearchQuery
from
course_discovery.apps.edx_haystack_extensions.distinct_counts.query
import
DistinctCountsSearchQuerySet
class DistinctCountsSearchQuerySetTests(ElasticsearchTestMixin, TestCase):
    """ Tests for DistinctCountsSearchQuerySet. """

    def test_from_queryset(self):
        """ Verify that a DistinctCountsSearchQuerySet can be built from an existing SearchQuerySet."""
        course_1 = CourseFactory()
        CourseRunFactory(title='foo', course=course_1)
        CourseRunFactory(title='foo', course=course_1)

        course_2 = CourseFactory()
        CourseRunFactory(title='foo', course=course_2)
        CourseRunFactory(title='bar', course=course_2)

        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)

        # The converted queryset should return exactly the same results as the original.
        expected = sorted([run.key for run in queryset])
        actual = sorted([run.key for run in dc_queryset])
        assert expected == actual

    def test_with_distinct_counts(self):
        """
        Verify that the query object is converted to a DistinctCountsSearchQuery and the aggregation_key is
        configured properly.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        assert isinstance(dc_queryset.query, DistinctCountsSearchQuery)
        assert dc_queryset.query.aggregation_key == 'aggregation_key'

    def test_with_distinct_counts_raises_when_queryset_includes_unsupported_options(self):
        """
        Verify that an error is raised if the original queryset includes options that are not supported by our
        custom Query class.
        """
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(SearchQuerySet())

        # Date facets are rejected by DistinctCountsSearchQuery.validate().
        with pytest.raises(RuntimeError) as err:
            now = datetime.datetime.now()
            ten_days = datetime.timedelta(days=10)
            start = now - ten_days
            end = now + ten_days
            dc_queryset.date_facet('start', start, end, 'day').with_distinct_counts('aggregation_key')
        assert str(err.value) == 'DistinctCountsSearchQuery does not support date facets.'

        # Unsupported field facet options are also rejected.
        with pytest.raises(RuntimeError) as err:
            dc_queryset.facet('pacing_type', order='term').with_distinct_counts('aggregation_key')
        assert 'DistinctCountsSearchQuery only supports a limited set of field facet options.' in str(err.value)

    def test_distinct_count_returns_cached_distinct_count(self):
        """ Verify that distinct_count returns the cached distinct_result_count when present."""
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')

        dc_queryset._distinct_result_count = 123  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 123

    def test_distinct_count_runs_query_when_cache_is_empty(self):
        """ Verify that distinct_count runs the query, caches, and returns the distinct_count when cache is empty."""
        # Three matching runs spread across two courses -> distinct count of 2.
        course_1 = CourseFactory()
        CourseRunFactory(title='foo', course=course_1)
        CourseRunFactory(title='foo', course=course_1)

        course_2 = CourseFactory()
        CourseRunFactory(title='foo', course=course_2)
        CourseRunFactory(title='bar', course=course_2)

        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')

        assert dc_queryset._distinct_result_count is None  # pylint: disable=protected-access
        assert dc_queryset.distinct_count() == 2
        assert dc_queryset._distinct_result_count == 2  # pylint: disable=protected-access

    def test_distinct_count_raises_when_not_properly_configured(self):
        """
        Verify that distinct_count raises when called without configuring the SearchQuerySet to compute distinct
        counts.
        """
        queryset = SearchQuerySet()
        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset)

        with pytest.raises(RuntimeError) as err:
            dc_queryset.distinct_count()
        assert str(err.value) == 'This SearchQuerySet has not been configured to compute distinct counts.'

    def test_facet_counts_includes_distinct_counts(self):
        """ Verify that facet_counts include distinct counts. """
        course = CourseFactory()
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='self_paced', hidden=True, course=course)
        CourseRunFactory(title='foo', pacing_type='instructor_paced', hidden=False, course=course)

        # Make sure to add both a field facet and a query facet so that we can be sure that both work.
        queryset = SearchQuerySet().filter(title='foo').models(CourseRun)
        queryset = queryset.facet('pacing_type').query_facet('hidden', 'hidden:true')

        dc_queryset = DistinctCountsSearchQuerySet.from_queryset(queryset).with_distinct_counts('aggregation_key')
        facet_counts = dc_queryset.facet_counts()

        # Field facets are expected to be formatted as a list of three-tuples (field_value, count, distinct_count)
        for val, count, distinct_count in facet_counts['fields']['pacing_type']:
            assert val in {'self_paced', 'instructor_paced'}
            if val == 'self_paced':
                assert count == 2
                assert distinct_count == 1
            elif val == 'instructor_paced':
                assert count == 1
                assert distinct_count == 1

        # Query facets are expected to be formatted as a dictionary mapping facet_names to two-tuples (count,
        # distinct_count)
        hidden_count, hidden_distinct_count = facet_counts['queries']['hidden']
        assert hidden_count == 2
        assert hidden_distinct_count == 1
course_discovery/settings/base.py
View file @
d72f5077
...
...
@@ -430,7 +430,7 @@ HAYSTACK_CONNECTIONS = {
},
}
# We do not use the RealtimeSignalProcessor here to avoid overloading our
# We do not use the RealtimeSignalProcessor here to avoid overloading our
# Elasticsearch instance when running the refresh_course_metadata command
HAYSTACK_SIGNAL_PROCESSOR
=
'haystack.signals.BaseSignalProcessor'
HAYSTACK_INDEX_RETENTION_LIMIT
=
3
...
...
@@ -439,6 +439,21 @@ HAYSTACK_INDEX_RETENTION_LIMIT = 3
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
SEARCH_FACET_LIMIT
=
10000
# Precision settings for the elasticsearch cardinality aggregations used to compute distinct hit and facet counts.
# The elasticsearch cardinality aggregation is not guaranteed to produce accurate results. Accuracy is configurable via
# an optional precision_threshold setting. Cardinality aggregations for queries that produce fewer results than the
# precision threshold can be expected to be pretty accurate. Cardinality aggregations for queries that produce more
# results than the precision_threshold will be less accurate. Setting a higher value for precision_threshold requires
# a memory tradeoff of roughly precision_threshold * 8 bytes. See the elasticsearch docs for more details:
# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-aggregations-metrics-cardinality-aggregation.html
#
# We use a higher value for hit precision than for facet precision for two reasons:
# 1.) The hit count is more visible to users than the facet counts.
# 2.) The performance penalty for having a higher hit precision is less than the penalty for a higher facet
# precision, since the hit count only requires a single aggregation.
DISTINCT_COUNTS_HIT_PRECISION
=
1500
DISTINCT_COUNTS_FACET_PRECISION
=
250
DEFAULT_PARTNER_ID
=
None
# See: https://docs.djangoproject.com/en/dev/ref/settings/#site-id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment