Commit dd134c90 by Renzo Lucioni

Tune Elasticsearch boost config

Reward course runs with enrollable, paid seats. Penalize those without them. Slightly increase boost given to MicroMasters and Professional Certificate programs. Tweak self-paced and enrollable boosts to match. Taken together, this modified boost configuration yields search results which tend to promote course runs with enrollable, paid seats above those without them.

LEARNER-1374
parent 87aecfb0
......@@ -36,7 +36,7 @@ class SimpleQuerySearchBackendMixin(object):
'auto_generate_phrase_queries': True,
}
# https://www.elastic.co/guide/en/elasticsearch/reference/1.7/query-dsl-function-score-query.html
# https://www.elastic.co/guide/en/elasticsearch/reference/1.5/query-dsl-function-score-query.html
function_score_config = get_elasticsearch_boost_config()['function_score']
function_score_config['query'] = {
'query_string': simple_query
......
......@@ -20,15 +20,36 @@ def get_elasticsearch_boost_config():
To see how a given hit's score was computed, use the explain parameter:
https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-request-explain.html
"""
elasticsearch_boost_config = {
return {
'function_score': {
'boost_mode': 'sum',
'boost': 1.0,
'score_mode': 'sum',
'functions': [
{'filter': {'term': {'pacing_type_exact': 'self_paced'}}, 'weight': 1.0},
{'filter': {'term': {'type_exact': 'Professional Certificate'}}, 'weight': 1.0},
{'filter': {'term': {'type_exact': 'MicroMasters'}}, 'weight': 1.0},
{
'filter': {
'term': {
'pacing_type_exact': 'self_paced'
}
},
'weight': 5.0
},
{
'filter': {
'term': {
'type_exact': 'MicroMasters'
}
},
'weight': 5.0
},
{
'filter': {
'term': {
'type_exact': 'Professional Certificate'
}
},
'weight': 5.0
},
# Decay function for modifying scores based on the value of the
# start field. The Gaussian function decays slowly, then rapidly,
......@@ -58,74 +79,169 @@ def get_elasticsearch_boost_config():
'weight': 5.0
},
# Boost function for CourseRuns with enrollable paid Seats.
# We want to boost if:
# - The course run has at least one enrollable paid Seat (has_enrollable_paid_seats is True)
# AND one of the following two conditions are true
# - The paid_seat_enrollment_end is unspecified.
# - The paid_seat_enrollment_end is in the future.
# We apply a weight of 1.0 to match the boost given for self paced courses.
# Reward course runs with enrollable, paid seats.
{
'filter': {
'bool': {
'must': [
{'exists': {'field': 'has_enrollable_paid_seats'}},
{'term': {'has_enrollable_paid_seats': True}}
],
'must': {
'term': {
'has_enrollable_paid_seats': True
}
},
'should': [
{'bool': {'must_not': {'exists': {'field': 'paid_seat_enrollment_end'}}}},
{'range': {'paid_seat_enrollment_end': {'gte': 'now'}}}
# A paid seat with a null enrollment end date is
# considered to be available, as if the end date
# were in the future.
{
'bool': {
'must_not': {
'exists': {
'field': 'paid_seat_enrollment_end'
}
}
}
},
{
'range': {
'paid_seat_enrollment_end': {
'gte': 'now'
}
}
}
]
}
},
'weight': 1.0
'weight': 15.0
},
# Boost function for enrollable CourseRuns.
# We want to boost if:
# - enrollment_start and enrollment_end are unspecified
# - enrollment_start is unspecified and enrollment_end is in the future
# - enrollment_end is unspecified and enrollment_start is in the past
# - enrollment_start is in the past and enrollment_end is in the future
# We apply a weight of 1.0 to match the boost given for self paced and enrollable paid courses.
# Penalize course runs without enrollable, paid seats. This penalty
# applies specifically to course runs, so that we don't reduce the
# relevance score of programs.
{
'filter': {
'bool': {
'must': {
'term': {
'content_type_exact': 'courserun'
}
},
'must_not': {
'range': {
'paid_seat_enrollment_end': {
'gte': 'now'
}
}
}
}
},
'weight': -20.0
},
# Give a slight boost to enrollable course runs, regardless of seat
# configuration. Course runs with unexpired, paid seats should be
# rewarded more generously, but when comparing two course runs,
# the one in which the user can enroll should be given preference.
{
'filter': {
'bool': {
'should': [
{'bool': {
{
'bool': {
'must_not': [
{'exists': {'field': 'enrollment_start'}},
{'exists': {'field': 'enrollment_end'}}
{
'exists': {
'field': 'enrollment_start'
}
},
{
'exists': {
'field': 'enrollment_end'
}
}
]
}},
{'bool': {
'must_not': {'exists': {'field': 'enrollment_start'}},
}
},
{
'bool': {
'must': [
{'exists': {'field': 'enrollment_end'}},
{'range': {'enrollment_end': {'gt': 'now'}}}
]
}},
{'bool': {
'must_not': {'exists': {'field': 'enrollment_end'}},
{
'exists': {
'field': 'enrollment_end'
}
},
{
'range': {
'enrollment_end': {
'gt': 'now'
}
}
}
],
'must_not': {
'exists': {
'field': 'enrollment_start'
}
}
}
},
{
'bool': {
'must': [
{'exists': {'field': 'enrollment_start'}},
{'range': {'enrollment_start': {'lte': 'now'}}}
]
}},
{'bool': {
{
'exists': {
'field': 'enrollment_start'
}
},
{
'range': {
'enrollment_start': {
'lte': 'now'
}
}
}
],
'must_not': {
'exists': {
'field': 'enrollment_end'
}
}
}
},
{
'bool': {
'must': [
{'exists': {'field': 'enrollment_start'}},
{'exists': {'field': 'enrollment_end'}},
{'range': {'enrollment_start': {'lte': 'now'}}},
{'range': {'enrollment_end': {'gt': 'now'}}}
{
'exists': {
'field': 'enrollment_start'
}
},
{
'exists': {
'field': 'enrollment_end'
}
},
{
'range': {
'enrollment_start': {
'lte': 'now'
}
}
},
{
'range': {
'enrollment_end': {
'gt': 'now'
}
}
}
]
}}
}
}
]
}
},
'weight': 1.0
'weight': 2.0
}
]
}
}
return elasticsearch_boost_config
......@@ -12,54 +12,68 @@ from course_discovery.apps.course_metadata.tests.factories import CourseRunFacto
@ddt.ddt
class SearchBoostingTests(ElasticsearchTestMixin, TestCase):
class TestSearchBoosting(ElasticsearchTestMixin, TestCase):
def build_normalized_course_run(self, **kwargs):
""" Builds a CourseRun with fields set to normalize boosting behavior."""
"""Builds a CourseRun with fields set to normalize boosting behavior."""
defaults = {
'pacing_type': 'instructor_paced',
'start': datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(weeks=52),
'enrollment_start': datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(weeks=50),
'enrollment_end': None
'enrollment_end': None,
**kwargs
}
defaults.update(kwargs)
return CourseRunFactory(**defaults)
def test_self_paced_boosting(self):
    """Self-paced course runs must outrank instructor-paced ones in search results."""
    # Baseline run: identical except that it is instructor-paced.
    self.build_normalized_course_run(pacing_type='instructor_paced')
    boosted_run = self.build_normalized_course_run(pacing_type='self_paced')

    hits = SearchQuerySet().models(CourseRun).all()
    assert len(hits) == 2

    # Results come back ordered by score, so the boosted run should be first.
    top_hit = hits[0]
    runner_up = hits[1]
    assert top_hit.score > runner_up.score
    assert boosted_run.pacing_type == top_hit.pacing_type
@ddt.data('MicroMasters', 'Professional Certificate')
def test_program_type_boosting(self, program_type):
    """MicroMasters and Professional Certificate programs must outrank XSeries programs."""
    # Baseline program of a type that has no boost function in the config.
    xseries_type = ProgramType.objects.get(name='XSeries')
    ProgramFactory(type=xseries_type)

    boosted_type = ProgramType.objects.get(name=program_type)
    boosted_program = ProgramFactory(type=boosted_type)

    hits = SearchQuerySet().models(Program).all()
    assert len(hits) == 2

    top_hit = hits[0]
    runner_up = hits[1]
    assert top_hit.score > runner_up.score
    assert str(boosted_program.type) == str(top_hit.type)
def test_start_date_boosting(self):
""" Verify upcoming courses are boosted over past courses."""
"""Verify upcoming courses are boosted over past courses."""
now = datetime.datetime.now(pytz.timezone('utc'))
self.build_normalized_course_run(start=now + datetime.timedelta(weeks=10))
test_record = self.build_normalized_course_run(start=now + datetime.timedelta(weeks=1))
search_results = SearchQuerySet().models(CourseRun).all()
self.assertEqual(2, len(search_results))
self.assertGreater(search_results[0].score, search_results[1].score)
self.assertEqual(int(test_record.start.timestamp()), int(search_results[0].start.timestamp())) # pylint: disable=no-member
def test_self_paced_boosting(self):
""" Verify that self paced courses are boosted over instructor led courses."""
self.build_normalized_course_run(pacing_type='instructor_paced')
test_record = self.build_normalized_course_run(pacing_type='self_paced')
search_results = SearchQuerySet().models(CourseRun).all()
self.assertEqual(2, len(search_results))
self.assertGreater(search_results[0].score, search_results[1].score)
self.assertEqual(test_record.pacing_type, search_results[0].pacing_type)
assert len(search_results) == 2
assert search_results[0].score > search_results[1].score
assert int(test_record.start.timestamp()) == int(search_results[0].start.timestamp()) # pylint: disable=no-member
@ddt.data(
# Case 1: Should not get boost if has_enrollable_paid_seats is False, has_enrollable_paid_seats is None or
# paid_seat_enrollment_end is in the past.
# Should not get boost if has_enrollable_paid_seats is False, has_enrollable_paid_seats
# is None, or paid_seat_enrollment_end is in the past.
(False, None, False),
(None, None, False),
(True, datetime.datetime.now(pytz.timezone('utc')) - datetime.timedelta(days=15), False),
# Case 2: Should get boost if has_enrollable_paid_seats is True and paid_seat_enrollment_end is None or
# in the future.
# Should get boost if has_enrollable_paid_seats is True and paid_seat_enrollment_end
# is None or in the future.
(True, None, True),
(True, datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(days=15), True)
(True, datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(days=15), True),
)
@ddt.unpack
def test_enrollable_paid_seat_boosting(self, has_enrollable_paid_seats, paid_seat_enrollment_end, expects_boost):
""" Verify that CourseRuns for which an unenrolled user may enroll and purchase a paid Seat are boosted."""
"""
Verify that CourseRuns for which an unenrolled user may enroll and
purchase a paid Seat are boosted.
"""
# Create a control record (one that should never be boosted).
with patch.object(CourseRun, 'has_enrollable_paid_seats', return_value=False):
......@@ -72,48 +86,65 @@ class SearchBoostingTests(ElasticsearchTestMixin, TestCase):
test_record = self.build_normalized_course_run(title='test2')
search_results = SearchQuerySet().models(CourseRun).all()
self.assertEqual(2, len(search_results))
assert len(search_results) == 2
if expects_boost:
self.assertGreater(search_results[0].score, search_results[1].score)
self.assertEqual(test_record.title, search_results[0].title)
assert search_results[0].score > search_results[1].score
assert test_record.title == search_results[0].title
else:
self.assertEqual(search_results[0].score, search_results[1].score)
assert search_results[0].score == search_results[1].score
@ddt.data('MicroMasters', 'Professional Certificate')
def test_program_type_boosting(self, program_type):
""" Verify MicroMasters and Professional Certificate are boosted over XSeries."""
ProgramFactory(type=ProgramType.objects.get(name='XSeries'))
test_record = ProgramFactory(type=ProgramType.objects.get(name=program_type))
def test_expired_paid_seat_penalized(self):
"""
Verify that a course run with an expired, paid seat is penalized relative
to one with an enrollable, paid seat.
"""
now = datetime.datetime.now(pytz.timezone('utc'))
search_results = SearchQuerySet().models(Program).all()
self.assertEqual(2, len(search_results))
self.assertGreater(search_results[0].score, search_results[1].score)
self.assertEqual(str(test_record.type), str(search_results[0].type))
future = now + datetime.timedelta(days=15)
with patch.object(CourseRun, 'has_enrollable_paid_seats', return_value=True):
with patch.object(CourseRun, 'get_paid_seat_enrollment_end', return_value=future):
promoted_run = self.build_normalized_course_run(title='promoted')
past = now - datetime.timedelta(days=15)
with patch.object(CourseRun, 'has_enrollable_paid_seats', return_value=True):
with patch.object(CourseRun, 'get_paid_seat_enrollment_end', return_value=past):
penalized_run = self.build_normalized_course_run(title='penalized')
search_results = SearchQuerySet().models(CourseRun).all()
assert len(search_results) == 2
assert [promoted_run.title, penalized_run.title] == [hit.title for hit in search_results]
assert search_results[0].score > search_results[1].score
# Verify that this result has a negative score. Course runs with expired,
# paid seats are penalized by having a relatively large value subtracted
# from their relevance score. In this test case, the result should be a
# negative relevance score.
assert 0 > search_results[1].score
@ddt.data(
# Case 1: Should get boost if enrollment_start and enrollment_end unspecified.
# Should get boost if enrollment_start and enrollment_end unspecified.
(None, None, True),
# Case 2: Should get boost if enrollment_start unspecified and enrollment_end in future.
# Should get boost if enrollment_start unspecified and enrollment_end in future.
(None, datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(days=15), True),
# Case 3: Should get boost if enrollment_start in past and enrollment_end unspecified.
# Should get boost if enrollment_start in past and enrollment_end unspecified.
(datetime.datetime.now(pytz.timezone('utc')) - datetime.timedelta(days=15), None, True),
# Case 4: Should get boost if enrollment_start in past and enrollment_end in future.
(datetime.datetime.now(pytz.timezone('utc')) - datetime.timedelta(days=15),
# Should get boost if enrollment_start in past and enrollment_end in future.
(
datetime.datetime.now(pytz.timezone('utc')) - datetime.timedelta(days=15),
datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(days=15),
True),
# Case 5: Should not get boost if enrollment_start in future.
True
),
# Should not get boost if enrollment_start in future.
(datetime.datetime.now(pytz.timezone('utc')) + datetime.timedelta(days=15), None, False),
# Case 5: Should not get boost if enrollment_end in past.
# Should not get boost if enrollment_end in past.
(None, datetime.datetime.now(pytz.timezone('utc')) - datetime.timedelta(days=15), False),
)
@ddt.unpack
def test_enrollable_course_run_boosting(self, enrollment_start, enrollment_end, expects_boost):
""" Verify that enrollable CourseRuns are boosted."""
"""Verify that enrollable CourseRuns are boosted."""
# Create a control record that should never be boosted
self.build_normalized_course_run(title='test1')
......@@ -125,9 +156,10 @@ class SearchBoostingTests(ElasticsearchTestMixin, TestCase):
)
search_results = SearchQuerySet().models(CourseRun).all()
self.assertEqual(2, len(search_results))
assert len(search_results) == 2
if expects_boost:
self.assertGreater(search_results[0].score, search_results[1].score)
self.assertEqual(test_record.title, search_results[0].title)
assert search_results[0].score > search_results[1].score
assert test_record.title == search_results[0].title
else:
self.assertEqual(search_results[0].score, search_results[1].score)
assert search_results[0].score == search_results[1].score
......@@ -7,7 +7,6 @@ LOGGING['handlers']['local'] = {
'class': 'logging.NullHandler',
}
# Determine which requests should render Django Debug Toolbar
INTERNAL_IPS = ('127.0.0.1',)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment