Commit 6332bcbf by Matthew Piatetsky

Use ngrams for autocomplete and set up index settings/mappings

ECOM-4738
parent d9ace3d8
@@ -298,7 +298,7 @@ class AggregateSearchViewSet(DefaultPartnerMixin, SerializationMixin, LoginMixin
[self.serialize_course_run(course_run), self.serialize_program(program)])
-class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestCase):
+class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, ElasticsearchTestMixin, APITestCase):
path = reverse('api:v1:search-typeahead')
function_score = {
'functions': [
@@ -307,12 +307,6 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
{'linear': {'start': {'origin': 'now', 'scale': '1d', 'decay': 0.95}}, 'weight': 5.0}
],
'boost': 1.0, 'score_mode': 'sum', 'boost_mode': 'sum',
-'query': {
-    'query_string': {
-        'auto_generate_phrase_queries': True, 'analyze_wildcard': True,
-        'query': '((title:*pytho* OR course_key:*pytho*) AND status:(active))'
-    }
-}
}
def get_typeahead_response(self, query=None):
@@ -323,7 +317,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
url = '{path}?{qs}'.format(path=self.path, qs=qs)
config = ElasticsearchBoostConfig.get_solo()
-config.function_score = self.function_score
+config.function_score.update(self.function_score)
config.save()
return self.client.get(url)
@@ -405,7 +399,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_micromasters_boosting(self):
""" Verify micromasters are boosted over xseries."""
title = "test_micromasters_boosting"
title = "micromasters"
ProgramFactory(
title=title + "1",
status=ProgramStatus.Active,
@@ -420,7 +414,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_start_date_boosting(self):
""" Verify upcoming courses are boosted over past courses."""
title = "test_start_date_boosting"
title = "start"
now = datetime.datetime.utcnow()
CourseRunFactory(title=title + "1", start=now - datetime.timedelta(weeks=10))
CourseRunFactory(title=title + "2", start=now + datetime.timedelta(weeks=1))
@@ -431,7 +425,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_self_paced_boosting(self):
""" Verify that self paced courses are boosted over instructor led courses."""
title = "test_self_paced_boosting"
title = "paced"
CourseRunFactory(title=title + "1", pacing_type='instructor_paced')
CourseRunFactory(title=title + "2", pacing_type='self_paced')
response = self.get_typeahead_response(title)
......
@@ -121,23 +121,36 @@ class AggregateSearchViewSet(BaseHaystackViewSet):
class TypeaheadSearchView(APIView):
"""
Typeahead for courses and programs.
"""
""" Typeahead for courses and programs. """
RESULT_COUNT = 3
permission_classes = (IsAuthenticated,)
def get_results(self, query):
-query = '(title:*{query}* OR course_key:*{query}*)'.format(query=query.lower())
-course_runs = SearchQuerySet().models(CourseRun).raw_search(query)
+course_runs = SearchQuerySet().models(CourseRun).filter(SQ(title_autocomplete=query) | SQ(course_key=query))
course_runs = course_runs.filter(published=True).exclude(hidden=True)
course_runs = course_runs[:self.RESULT_COUNT]
-programs = SearchQuerySet().models(Program).raw_search(query)
+programs = SearchQuerySet().models(Program).filter(SQ(title_autocomplete=query))
programs = programs.filter(status=ProgramStatus.Active)
programs = programs[:self.RESULT_COUNT]
return course_runs, programs
def get(self, request, *args, **kwargs):
"""
+Typeahead uses the ngram_analyzer as the index_analyzer to generate ngrams of the title during indexing,
+i.e. Data Science -> da, dat, at, ata, data, etc...
+Typeahead uses the lowercase analyzer as the search_analyzer.
+The ngram_analyzer uses the lowercase filter as well, which makes typeahead case-insensitive.
+Available analyzers are defined in the index _settings, and field-level analyzers are defined in the index _mapping.
+NGrams are used rather than EdgeNgrams because NGrams allow partial matches across whitespace:
+i.e. data sci -> data science, but not data analysis or scientific method
---
parameters:
- name: q
description: "Search text"
paramType: query
required: true
type: string
"""
query = request.query_params.get('q')
if not query:
raise ParseError("The 'q' querystring parameter is required for searching.")
......
import logging
from django.conf import settings
-from elasticsearch import Elasticsearch
+from haystack import connections as haystack_connections
from course_discovery.apps.core.utils import ElasticsearchUtils
@@ -12,12 +12,19 @@ class ElasticsearchTestMixin(object):
@classmethod
def setUpClass(cls):
super(ElasticsearchTestMixin, cls).setUpClass()
-host = settings.HAYSTACK_CONNECTIONS['default']['URL']
cls.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME']
-cls.es = Elasticsearch(host)
+# Make use of the changes in our custom ES backend
+# This is required for typeahead autocomplete to work in the tests
+connection = haystack_connections['default']
+cls.backend = connection.get_backend()
+# Without this line, haystack doesn't fully recreate the connection
+# The first test using this backend succeeds, but the following tests
+# do not set the Elasticsearch _mapping
def setUp(self):
super(ElasticsearchTestMixin, self).setUp()
+self.backend.setup_complete = False
+self.es = self.backend.conn
self.reset_index()
self.refresh_index()
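In other words, resetting setup_complete before each test forces haystack to re-run the backend's setup(), and thus re-put the _mapping, the next time the connection is used. A minimal sketch of the same idea outside the mixin, assuming a configured 'default' haystack connection:

```python
from haystack import connections as haystack_connections

backend = haystack_connections['default'].get_backend()
backend.setup_complete = False  # pretend this backend has never been used
backend.setup()                 # re-puts the index mapping, including the autocomplete analyzers
```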
......
import datetime
import logging
+from django.conf import settings
logger = logging.getLogger(__name__)
@@ -16,7 +18,8 @@ class ElasticsearchUtils(object):
# Create an index with a unique (timestamped) name
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
-es.indices.create(index=index)
+index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
+es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index)
# Point the alias to the new index
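The alias swap referenced here is unchanged by this commit; for context, it can be expressed with update_aliases roughly as follows (a sketch, not the exact ElasticsearchUtils code; the wildcard removal is an assumption):

```python
# Repoint the alias at the freshly created, timestamped index in one atomic request.
es.indices.update_aliases(body={
    'actions': [
        {'remove': {'alias': alias, 'index': '*'}},  # assumed: strip the alias from any old index
        {'add': {'alias': alias, 'index': index}},
    ]
})
```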
......
@@ -59,6 +59,7 @@ class BaseIndex(indexes.SearchIndex):
class BaseCourseIndex(OrganizationsMixin, BaseIndex):
key = indexes.CharField(model_attr='key', stored=True)
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
+title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
short_description = indexes.CharField(model_attr='short_description', null=True)
full_description = indexes.CharField(model_attr='full_description', null=True)
subjects = indexes.MultiValueField(faceted=True)
@@ -181,6 +182,7 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
uuid = indexes.CharField(model_attr='uuid')
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
+title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
subtitle = indexes.CharField(model_attr='subtitle')
type = indexes.CharField(model_attr='type__name', faceted=True)
marketing_url = indexes.CharField(null=True)
......
@@ -74,8 +74,37 @@ class NonClearingSearchBackendMixin(object):
+# pylint: disable=abstract-method
+class ConfigurableElasticBackend(ElasticsearchSearchBackend):
+    def specify_analyzers(self, mapping, field, index_analyzer, search_analyzer):
+        """ Specify separate index and search analyzers for the given field.
+
+        Args:
+            mapping (dict): /_mapping attribute on index (maps analyzers to fields)
+            field (str): which field to modify
+            index_analyzer (str): name of the index_analyzer (should be defined in the /_settings attribute)
+            search_analyzer (str): name of the search_analyzer (should be defined in the /_settings attribute)
+        """
+        # The same generic analyzer is used for both if index_analyzer and search_analyzer are not specified.
+        mapping[field].pop('analyzer')
+        mapping[field].update({
+            'index_analyzer': index_analyzer,
+            'search_analyzer': search_analyzer
+        })
+
+    def build_schema(self, fields):
+        content_field_name, mapping = super().build_schema(fields)
+        # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer.
+        # This is necessary to support partial searches/typeahead.
+        # If we used the ngram analyzer for both, then 'running' would get split into ngrams like "ing",
+        # and all words containing "ing" would come back in typeahead.
+        self.specify_analyzers(mapping=mapping, field='title_autocomplete',
+                               index_analyzer='ngram_analyzer', search_analyzer='lowercase')
+        return (content_field_name, mapping)


# pylint: disable=abstract-method
class EdxElasticsearchSearchBackend(SimpleQuerySearchBackendMixin, NonClearingSearchBackendMixin,
-                                    ElasticsearchSearchBackend):
+                                    ConfigurableElasticBackend):
    pass
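Concretely, haystack's Elasticsearch backend maps an NgramField to a single 'analyzer' entry; specify_analyzers swaps that for a split index/search pair. A self-contained sketch of the transformation (the field name comes from this commit; the dict values mirror haystack's defaults for ES 1.x):

```python
# Haystack's default mapping entry for an NgramField (ES 1.x schema keys).
mapping = {'title_autocomplete': {'type': 'string', 'analyzer': 'ngram_analyzer'}}

# What specify_analyzers() does to that entry:
entry = mapping['title_autocomplete']
entry.pop('analyzer')
entry.update({'index_analyzer': 'ngram_analyzer', 'search_analyzer': 'lowercase'})

assert mapping['title_autocomplete'] == {
    'type': 'string',
    'index_analyzer': 'ngram_analyzer',  # ngrams generated while indexing titles
    'search_analyzer': 'lowercase',      # queries are only lowercased, never ngrammed
}
```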
......
import datetime
import logging
+from django.conf import settings
from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand
@@ -84,5 +85,6 @@ class Command(HaystackCommand):
"""
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp)
-backend.conn.indices.create(index=index_name)
+index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
+backend.conn.indices.create(index=index_name, body=index_settings)
return index_name
@@ -350,6 +350,67 @@ SWAGGER_SETTINGS = {
'permission_denied_handler': 'course_discovery.apps.api.views.api_docs_permission_denied_handler'
}
+# Elasticsearch uses index settings to specify available analyzers.
+# We are adding the lowercase analyzer and tweaking the ngram analyzers here,
+# so we need to use these settings rather than the index defaults.
+# We are making these changes to enable autocomplete for the typeahead endpoint.
+ELASTICSEARCH_INDEX_SETTINGS = {
+    'settings': {
+        'analysis': {
+            'tokenizer': {
+                'haystack_edgengram_tokenizer': {
+                    'type': 'edgeNGram',
+                    'side': 'front',
+                    'min_gram': 2,
+                    'max_gram': 15
+                },
+                'haystack_ngram_tokenizer': {
+                    'type': 'nGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                }
+            },
+            'analyzer': {
+                'lowercase': {
+                    'type': 'custom',
+                    'tokenizer': 'keyword',
+                    'filter': [
+                        'lowercase'
+                    ]
+                },
+                'ngram_analyzer': {
+                    'type': 'custom',
+                    'filter': [
+                        'haystack_ngram',
+                        'lowercase'
+                    ],
+                    'tokenizer': 'standard'
+                },
+                'edgengram_analyzer': {
+                    'type': 'custom',
+                    'filter': [
+                        'haystack_edgengram',
+                        'lowercase'
+                    ],
+                    'tokenizer': 'standard'
+                }
+            },
+            'filter': {
+                'haystack_edgengram': {
+                    'type': 'edgeNGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                },
+                'haystack_ngram': {
+                    'type': 'nGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                }
+            }
+        }
+    }
+}
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200
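For intuition about the nGram/edgeNGram choice encoded above, the two filters differ only in which substrings they emit. A pure-Python sketch with the same min_gram/max_gram bounds:

```python
def ngrams(word, lo=2, hi=15):
    """ All substrings of length lo..hi, like the 'haystack_ngram' filter. """
    return [word[i:i + n] for n in range(lo, min(hi, len(word)) + 1)
            for i in range(len(word) - n + 1)]

def edge_ngrams(word, lo=2, hi=15):
    """ Prefixes only, like the 'haystack_edgengram' filter (side=front). """
    return [word[:n] for n in range(lo, min(hi, len(word)) + 1)]

print(edge_ngrams('python'))  # ['py', 'pyt', 'pyth', 'pytho', 'python']
print(ngrams('python'))       # adds interior grams: 'yt', 'th', 'ho', 'on', 'tho', ...
```

The interior grams are what let a partial query such as 'pytho' match mid-word, which prefix-only edge ngrams cannot do.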
......