Commit 6332bcbf by Matthew Piatetsky

Use ngrams for autocomplete and set up index settings/mappings

ECOM-4738
parent d9ace3d8
...@@ -298,7 +298,7 @@ class AggregateSearchViewSet(DefaultPartnerMixin, SerializationMixin, LoginMixin ...@@ -298,7 +298,7 @@ class AggregateSearchViewSet(DefaultPartnerMixin, SerializationMixin, LoginMixin
[self.serialize_course_run(course_run), self.serialize_program(program)]) [self.serialize_course_run(course_run), self.serialize_program(program)])
class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestCase): class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, ElasticsearchTestMixin, APITestCase):
path = reverse('api:v1:search-typeahead') path = reverse('api:v1:search-typeahead')
function_score = { function_score = {
'functions': [ 'functions': [
...@@ -307,12 +307,6 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC ...@@ -307,12 +307,6 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
{'linear': {'start': {'origin': 'now', 'scale': '1d', 'decay': 0.95}}, 'weight': 5.0} {'linear': {'start': {'origin': 'now', 'scale': '1d', 'decay': 0.95}}, 'weight': 5.0}
], ],
'boost': 1.0, 'score_mode': 'sum', 'boost_mode': 'sum', 'boost': 1.0, 'score_mode': 'sum', 'boost_mode': 'sum',
'query': {
'query_string': {
'auto_generate_phrase_queries': True, 'analyze_wildcard': True,
'query': '((title:*pytho* OR course_key:*pytho*) AND status:(active))'
}
}
} }
def get_typeahead_response(self, query=None): def get_typeahead_response(self, query=None):
...@@ -323,7 +317,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC ...@@ -323,7 +317,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
url = '{path}?{qs}'.format(path=self.path, qs=qs) url = '{path}?{qs}'.format(path=self.path, qs=qs)
config = ElasticsearchBoostConfig.get_solo() config = ElasticsearchBoostConfig.get_solo()
config.function_score = self.function_score config.function_score.update(self.function_score)
config.save() config.save()
return self.client.get(url) return self.client.get(url)
...@@ -405,7 +399,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC ...@@ -405,7 +399,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_micromasters_boosting(self): def test_micromasters_boosting(self):
""" Verify micromasters are boosted over xseries.""" """ Verify micromasters are boosted over xseries."""
title = "test_micromasters_boosting" title = "micromasters"
ProgramFactory( ProgramFactory(
title=title + "1", title=title + "1",
status=ProgramStatus.Active, status=ProgramStatus.Active,
...@@ -420,7 +414,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC ...@@ -420,7 +414,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_start_date_boosting(self): def test_start_date_boosting(self):
""" Verify upcoming courses are boosted over past courses.""" """ Verify upcoming courses are boosted over past courses."""
title = "test_start_date_boosting" title = "start"
now = datetime.datetime.utcnow() now = datetime.datetime.utcnow()
CourseRunFactory(title=title + "1", start=now - datetime.timedelta(weeks=10)) CourseRunFactory(title=title + "1", start=now - datetime.timedelta(weeks=10))
CourseRunFactory(title=title + "2", start=now + datetime.timedelta(weeks=1)) CourseRunFactory(title=title + "2", start=now + datetime.timedelta(weeks=1))
...@@ -431,7 +425,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC ...@@ -431,7 +425,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_self_paced_boosting(self): def test_self_paced_boosting(self):
""" Verify that self paced courses are boosted over instructor led courses.""" """ Verify that self paced courses are boosted over instructor led courses."""
title = "test_self_paced_boosting" title = "paced"
CourseRunFactory(title=title + "1", pacing_type='instructor_paced') CourseRunFactory(title=title + "1", pacing_type='instructor_paced')
CourseRunFactory(title=title + "2", pacing_type='self_paced') CourseRunFactory(title=title + "2", pacing_type='self_paced')
response = self.get_typeahead_response(title) response = self.get_typeahead_response(title)
......
...@@ -121,23 +121,36 @@ class AggregateSearchViewSet(BaseHaystackViewSet): ...@@ -121,23 +121,36 @@ class AggregateSearchViewSet(BaseHaystackViewSet):
class TypeaheadSearchView(APIView): class TypeaheadSearchView(APIView):
""" """ Typeahead for courses and programs. """
Typeahead for courses and programs.
"""
RESULT_COUNT = 3 RESULT_COUNT = 3
permission_classes = (IsAuthenticated,) permission_classes = (IsAuthenticated,)
def get_results(self, query): def get_results(self, query):
query = '(title:*{query}* OR course_key:*{query}*)'.format(query=query.lower()) course_runs = SearchQuerySet().models(CourseRun).filter(SQ(title_autocomplete=query) | SQ(course_key=query))
course_runs = SearchQuerySet().models(CourseRun).raw_search(query)
course_runs = course_runs.filter(published=True).exclude(hidden=True) course_runs = course_runs.filter(published=True).exclude(hidden=True)
course_runs = course_runs[:self.RESULT_COUNT] course_runs = course_runs[:self.RESULT_COUNT]
programs = SearchQuerySet().models(Program).raw_search(query) programs = SearchQuerySet().models(Program).filter(SQ(title_autocomplete=query))
programs = programs.filter(status=ProgramStatus.Active) programs = programs.filter(status=ProgramStatus.Active)
programs = programs[:self.RESULT_COUNT] programs = programs[:self.RESULT_COUNT]
return course_runs, programs return course_runs, programs
def get(self, request, *args, **kwargs): def get(self, request, *args, **kwargs):
"""
Typeahead uses the ngram_analyzer as the index_analyzer to generate ngrams of the title during indexing.
i.e. Data Science -> da, dat, at, ata, data, etc...
Typeahead uses the lowercase analyzer as the search_analyzer.
The ngram_analyzer uses the lowercase filter as well, which makes typeahead case insensitive.
Available analyzers are defined in index _settings and field level analyzers are defined in the index _mapping.
NGrams are used rather than EdgeNgrams because NGrams allow partial searches across white space:
i.e. data sci - > data science, but not data analysis or scientific method
---
parameters:
- name: q
description: "Search text"
paramType: query
required: true
type: string
"""
query = request.query_params.get('q') query = request.query_params.get('q')
if not query: if not query:
raise ParseError("The 'q' querystring parameter is required for searching.") raise ParseError("The 'q' querystring parameter is required for searching.")
......
import logging import logging
from django.conf import settings from django.conf import settings
from elasticsearch import Elasticsearch from haystack import connections as haystack_connections
from course_discovery.apps.core.utils import ElasticsearchUtils from course_discovery.apps.core.utils import ElasticsearchUtils
...@@ -12,12 +12,19 @@ class ElasticsearchTestMixin(object): ...@@ -12,12 +12,19 @@ class ElasticsearchTestMixin(object):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
super(ElasticsearchTestMixin, cls).setUpClass() super(ElasticsearchTestMixin, cls).setUpClass()
host = settings.HAYSTACK_CONNECTIONS['default']['URL']
cls.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME'] cls.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME']
cls.es = Elasticsearch(host) # Make use of the changes in our custom ES backend
# This is required for typeahead autocomplete to work in the tests
connection = haystack_connections['default']
cls.backend = connection.get_backend()
# Without this line, haystack doesn't fully recreate the connection
# The first test using this backend succeeds, but the following tests
# do not set the Elasticsearch _mapping
def setUp(self): def setUp(self):
super(ElasticsearchTestMixin, self).setUp() super(ElasticsearchTestMixin, self).setUp()
self.backend.setup_complete = False
self.es = self.backend.conn
self.reset_index() self.reset_index()
self.refresh_index() self.refresh_index()
......
import datetime import datetime
import logging import logging
from django.conf import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -16,7 +18,8 @@ class ElasticsearchUtils(object): ...@@ -16,7 +18,8 @@ class ElasticsearchUtils(object):
# Create an index with a unique (timestamped) name # Create an index with a unique (timestamped) name
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp) index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
es.indices.create(index=index) index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index) logger.info('...index [%s] created.', index)
# Point the alias to the new index # Point the alias to the new index
......
...@@ -59,6 +59,7 @@ class BaseIndex(indexes.SearchIndex): ...@@ -59,6 +59,7 @@ class BaseIndex(indexes.SearchIndex):
class BaseCourseIndex(OrganizationsMixin, BaseIndex): class BaseCourseIndex(OrganizationsMixin, BaseIndex):
key = indexes.CharField(model_attr='key', stored=True) key = indexes.CharField(model_attr='key', stored=True)
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST) title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
short_description = indexes.CharField(model_attr='short_description', null=True) short_description = indexes.CharField(model_attr='short_description', null=True)
full_description = indexes.CharField(model_attr='full_description', null=True) full_description = indexes.CharField(model_attr='full_description', null=True)
subjects = indexes.MultiValueField(faceted=True) subjects = indexes.MultiValueField(faceted=True)
...@@ -181,6 +182,7 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin): ...@@ -181,6 +182,7 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
uuid = indexes.CharField(model_attr='uuid') uuid = indexes.CharField(model_attr='uuid')
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST) title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
subtitle = indexes.CharField(model_attr='subtitle') subtitle = indexes.CharField(model_attr='subtitle')
type = indexes.CharField(model_attr='type__name', faceted=True) type = indexes.CharField(model_attr='type__name', faceted=True)
marketing_url = indexes.CharField(null=True) marketing_url = indexes.CharField(null=True)
......
...@@ -74,8 +74,37 @@ class NonClearingSearchBackendMixin(object): ...@@ -74,8 +74,37 @@ class NonClearingSearchBackendMixin(object):
# pylint: disable=abstract-method # pylint: disable=abstract-method
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
    """ Elasticsearch backend that assigns custom field-level analyzers in the index mapping.

    Used to give the ``title_autocomplete`` field separate index/search analyzers
    so the typeahead endpoint can do partial (ngram) matching.
    """

    def specify_analyzers(self, mapping, field, index_analyzer, search_analyzer):
        """ Specify separate index and search analyzers for the given field.

        Args:
            mapping (dict): /_mapping attribute on index (maps analyzers to fields)
            field (str): which field to modify
            index_analyzer (str): name of the index_analyzer (should be defined in the /_settings attribute)
            search_analyzer (str): name of the search_analyzer (should be defined in the /_settings attribute)
        """
        # The generic analyzer is used for both if index_analyzer and search_analyzer are not specified.
        # Drop it (if present) so it does not conflict with the split analyzers below.
        # pop with a default avoids a KeyError when the generated mapping has no 'analyzer' key.
        mapping[field].pop('analyzer', None)
        mapping[field].update({
            'index_analyzer': index_analyzer,
            'search_analyzer': search_analyzer
        })

    def build_schema(self, fields):
        """ Build the haystack schema, then override analyzers for the autocomplete field. """
        content_field_name, mapping = super().build_schema(fields)
        # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer
        # This is necessary to support partial searches/typeahead
        # If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing"
        # and all words containing ing would come back in typeahead.
        self.specify_analyzers(mapping=mapping, field='title_autocomplete',
                               index_analyzer='ngram_analyzer', search_analyzer='lowercase')
        return (content_field_name, mapping)
# pylint: disable=abstract-method
class EdxElasticsearchSearchBackend(SimpleQuerySearchBackendMixin, NonClearingSearchBackendMixin, class EdxElasticsearchSearchBackend(SimpleQuerySearchBackendMixin, NonClearingSearchBackendMixin,
ElasticsearchSearchBackend): ConfigurableElasticBackend):
pass pass
......
import datetime import datetime
import logging import logging
from django.conf import settings
from haystack import connections as haystack_connections from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand from haystack.management.commands.update_index import Command as HaystackCommand
...@@ -84,5 +85,6 @@ class Command(HaystackCommand): ...@@ -84,5 +85,6 @@ class Command(HaystackCommand):
""" """
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S') timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp) index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp)
backend.conn.indices.create(index=index_name) index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
backend.conn.indices.create(index=index_name, body=index_settings)
return index_name return index_name
...@@ -350,6 +350,67 @@ SWAGGER_SETTINGS = { ...@@ -350,6 +350,67 @@ SWAGGER_SETTINGS = {
'permission_denied_handler': 'course_discovery.apps.api.views.api_docs_permission_denied_handler' 'permission_denied_handler': 'course_discovery.apps.api.views.api_docs_permission_denied_handler'
} }
# Elasticsearch uses index settings to specify available analyzers.
# We are adding the lowercase analyzer and tweaking the ngram analyzers here,
# so we need to use these settings rather than the index defaults.
# We are making these changes to enable autocomplete for the typeahead endpoint.
ELASTICSEARCH_INDEX_SETTINGS = {
    'settings': {
        'analysis': {
            # Tokenizers mirror the 2-15 character gram range used by the filters below.
            'tokenizer': {
                'haystack_edgengram_tokenizer': {
                    'type': 'edgeNGram',
                    'side': 'front',
                    'min_gram': 2,
                    'max_gram': 15,
                },
                'haystack_ngram_tokenizer': {
                    'type': 'nGram',
                    'min_gram': 2,
                    'max_gram': 15,
                },
            },
            'analyzer': {
                # Case-insensitive exact matching: the keyword tokenizer keeps the
                # whole input as one token; the lowercase filter normalizes case.
                'lowercase': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': ['lowercase'],
                },
                # Partial-match (typeahead) analyzers: standard tokenization followed
                # by (edge-)ngram expansion and lowercasing.
                'ngram_analyzer': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': ['haystack_ngram', 'lowercase'],
                },
                'edgengram_analyzer': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': ['haystack_edgengram', 'lowercase'],
                },
            },
            'filter': {
                'haystack_edgengram': {
                    'type': 'edgeNGram',
                    'min_gram': 2,
                    'max_gram': 15,
                },
                'haystack_ngram': {
                    'type': 'nGram',
                    'min_gram': 2,
                    'max_gram': 15,
                },
            },
        },
    },
}
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html) # Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200 HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment