Commit 6332bcbf by Matthew Piatetsky

Use ngrams for autocomplete and set up index settings/mappings

ECOM-4738
parent d9ace3d8
@@ -298,7 +298,7 @@ class AggregateSearchViewSet(DefaultPartnerMixin, SerializationMixin, LoginMixin
[self.serialize_course_run(course_run), self.serialize_program(program)])
-class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestCase):
+class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, ElasticsearchTestMixin, APITestCase):
path = reverse('api:v1:search-typeahead')
function_score = {
'functions': [
@@ -307,12 +307,6 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
{'linear': {'start': {'origin': 'now', 'scale': '1d', 'decay': 0.95}}, 'weight': 5.0}
],
'boost': 1.0, 'score_mode': 'sum', 'boost_mode': 'sum',
-'query': {
-    'query_string': {
-        'auto_generate_phrase_queries': True, 'analyze_wildcard': True,
-        'query': '((title:*pytho* OR course_key:*pytho*) AND status:(active))'
-    }
-}
}
def get_typeahead_response(self, query=None):
@@ -323,7 +317,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
url = '{path}?{qs}'.format(path=self.path, qs=qs)
config = ElasticsearchBoostConfig.get_solo()
-config.function_score = self.function_score
+config.function_score.update(self.function_score)
config.save()
return self.client.get(url)
@@ -405,7 +399,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_micromasters_boosting(self):
""" Verify micromasters are boosted over xseries."""
title = "test_micromasters_boosting"
title = "micromasters"
ProgramFactory(
title=title + "1",
status=ProgramStatus.Active,
@@ -420,7 +414,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_start_date_boosting(self):
""" Verify upcoming courses are boosted over past courses."""
title = "test_start_date_boosting"
title = "start"
now = datetime.datetime.utcnow()
CourseRunFactory(title=title + "1", start=now - datetime.timedelta(weeks=10))
CourseRunFactory(title=title + "2", start=now + datetime.timedelta(weeks=1))
@@ -431,7 +425,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def test_self_paced_boosting(self):
""" Verify that self paced courses are boosted over instructor led courses."""
title = "test_self_paced_boosting"
title = "paced"
CourseRunFactory(title=title + "1", pacing_type='instructor_paced')
CourseRunFactory(title=title + "2", pacing_type='self_paced')
response = self.get_typeahead_response(title)
......
@@ -121,23 +121,36 @@ class AggregateSearchViewSet(BaseHaystackViewSet):
class TypeaheadSearchView(APIView):
"""
Typeahead for courses and programs.
"""
""" Typeahead for courses and programs. """
RESULT_COUNT = 3
permission_classes = (IsAuthenticated,)
def get_results(self, query):
-query = '(title:*{query}* OR course_key:*{query}*)'.format(query=query.lower())
-course_runs = SearchQuerySet().models(CourseRun).raw_search(query)
+course_runs = SearchQuerySet().models(CourseRun).filter(SQ(title_autocomplete=query) | SQ(course_key=query))
course_runs = course_runs.filter(published=True).exclude(hidden=True)
course_runs = course_runs[:self.RESULT_COUNT]
-programs = SearchQuerySet().models(Program).raw_search(query)
+programs = SearchQuerySet().models(Program).filter(SQ(title_autocomplete=query))
programs = programs.filter(status=ProgramStatus.Active)
programs = programs[:self.RESULT_COUNT]
return course_runs, programs
def get(self, request, *args, **kwargs):
"""
+Typeahead uses the ngram_analyzer as the index_analyzer to generate ngrams of the title during indexing,
+i.e. Data Science -> da, dat, at, ata, data, etc...
+Typeahead uses the lowercase analyzer as the search_analyzer.
+The ngram_analyzer uses the lowercase filter as well, which makes typeahead case-insensitive.
+Available analyzers are defined in the index _settings, and field-level analyzers are defined in the index _mapping.
+NGrams are used rather than EdgeNgrams because NGrams allow partial matches across whitespace:
+i.e. data sci -> data science, but not data analysis or scientific method
---
parameters:
- name: q
description: "Search text"
paramType: query
required: true
type: string
"""
query = request.query_params.get('q')
if not query:
raise ParseError("The 'q' querystring parameter is required for searching.")
......
import logging
from django.conf import settings
-from elasticsearch import Elasticsearch
+from haystack import connections as haystack_connections
from course_discovery.apps.core.utils import ElasticsearchUtils
@@ -12,12 +12,19 @@ class ElasticsearchTestMixin(object):
@classmethod
def setUpClass(cls):
super(ElasticsearchTestMixin, cls).setUpClass()
-host = settings.HAYSTACK_CONNECTIONS['default']['URL']
cls.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME']
-cls.es = Elasticsearch(host)
+# Make use of the changes in our custom ES backend
+# This is required for typeahead autocomplete to work in the tests
+connection = haystack_connections['default']
+cls.backend = connection.get_backend()
+# Without this line, haystack doesn't fully recreate the connection
+# The first test using this backend succeeds, but the following tests
+# do not set the Elasticsearch _mapping
def setUp(self):
super(ElasticsearchTestMixin, self).setUp()
+self.backend.setup_complete = False
+self.es = self.backend.conn
self.reset_index()
self.refresh_index()
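In other words, resetting setup_complete before each test forces haystack to re-run the backend's setup(), and thus re-put the _mapping, the next time the connection is used. A minimal sketch of the same idea outside the mixin, assuming a configured 'default' haystack connection:

```python
from haystack import connections as haystack_connections

backend = haystack_connections['default'].get_backend()
backend.setup_complete = False  # pretend this backend has never been used
backend.setup()                 # re-puts the index mapping, including the autocomplete analyzers
```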
......
import datetime
import logging
+from django.conf import settings
logger = logging.getLogger(__name__)
@@ -16,7 +18,8 @@ class ElasticsearchUtils(object):
# Create an index with a unique (timestamped) name
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
-es.indices.create(index=index)
+index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
+es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index)
# Point the alias to the new index
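The alias swap referenced here is unchanged by this commit; for context, it can be expressed with update_aliases roughly as follows (a sketch, not the exact ElasticsearchUtils code; the wildcard removal is an assumption):

```python
# Repoint the alias at the freshly created, timestamped index in one atomic request.
es.indices.update_aliases(body={
    'actions': [
        {'remove': {'alias': alias, 'index': '*'}},  # assumed: strip the alias from any old index
        {'add': {'alias': alias, 'index': index}},
    ]
})
```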
......
@@ -59,6 +59,7 @@ class BaseIndex(indexes.SearchIndex):
class BaseCourseIndex(OrganizationsMixin, BaseIndex):
key = indexes.CharField(model_attr='key', stored=True)
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
+title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
short_description = indexes.CharField(model_attr='short_description', null=True)
full_description = indexes.CharField(model_attr='full_description', null=True)
subjects = indexes.MultiValueField(faceted=True)
@@ -181,6 +182,7 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
uuid = indexes.CharField(model_attr='uuid')
title = indexes.CharField(model_attr='title', boost=TITLE_FIELD_BOOST)
+title_autocomplete = indexes.NgramField(model_attr='title', boost=TITLE_FIELD_BOOST)
subtitle = indexes.CharField(model_attr='subtitle')
type = indexes.CharField(model_attr='type__name', faceted=True)
marketing_url = indexes.CharField(null=True)
......
@@ -74,8 +74,37 @@ class NonClearingSearchBackendMixin(object):
+# pylint: disable=abstract-method
+class ConfigurableElasticBackend(ElasticsearchSearchBackend):
+    def specify_analyzers(self, mapping, field, index_analyzer, search_analyzer):
+        """ Specify separate index and search analyzers for the given field.
+
+        Args:
+            mapping (dict): /_mapping attribute on index (maps analyzers to fields)
+            field (str): which field to modify
+            index_analyzer (str): name of the index_analyzer (should be defined in the /_settings attribute)
+            search_analyzer (str): name of the search_analyzer (should be defined in the /_settings attribute)
+        """
+        # The same generic analyzer is used for both if index_analyzer and search_analyzer are not specified.
+        mapping[field].pop('analyzer')
+        mapping[field].update({
+            'index_analyzer': index_analyzer,
+            'search_analyzer': search_analyzer
+        })
+
+    def build_schema(self, fields):
+        content_field_name, mapping = super().build_schema(fields)
+        # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer.
+        # This is necessary to support partial searches/typeahead.
+        # If we used the ngram analyzer for both, then 'running' would get split into ngrams like "ing",
+        # and all words containing "ing" would come back in typeahead.
+        self.specify_analyzers(mapping=mapping, field='title_autocomplete',
+                               index_analyzer='ngram_analyzer', search_analyzer='lowercase')
+        return (content_field_name, mapping)


# pylint: disable=abstract-method
class EdxElasticsearchSearchBackend(SimpleQuerySearchBackendMixin, NonClearingSearchBackendMixin,
-                                    ElasticsearchSearchBackend):
+                                    ConfigurableElasticBackend):
    pass
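Concretely, haystack's Elasticsearch backend maps an NgramField to a single 'analyzer' entry; specify_analyzers swaps that for a split index/search pair. A self-contained sketch of the transformation (the field name comes from this commit; the dict values mirror haystack's defaults for ES 1.x):

```python
# Haystack's default mapping entry for an NgramField (ES 1.x schema keys).
mapping = {'title_autocomplete': {'type': 'string', 'analyzer': 'ngram_analyzer'}}

# What specify_analyzers() does to that entry:
entry = mapping['title_autocomplete']
entry.pop('analyzer')
entry.update({'index_analyzer': 'ngram_analyzer', 'search_analyzer': 'lowercase'})

assert mapping['title_autocomplete'] == {
    'type': 'string',
    'index_analyzer': 'ngram_analyzer',  # ngrams generated while indexing titles
    'search_analyzer': 'lowercase',      # queries are only lowercased, never ngrammed
}
```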
......
import datetime
import logging
+from django.conf import settings
from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand
@@ -84,5 +85,6 @@ class Command(HaystackCommand):
"""
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp)
-backend.conn.indices.create(index=index_name)
+index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
+backend.conn.indices.create(index=index_name, body=index_settings)
return index_name
@@ -350,6 +350,67 @@ SWAGGER_SETTINGS = {
'permission_denied_handler': 'course_discovery.apps.api.views.api_docs_permission_denied_handler'
}
+# Elasticsearch uses index settings to specify available analyzers.
+# We are adding the lowercase analyzer and tweaking the ngram analyzers here,
+# so we need to use these settings rather than the index defaults.
+# We are making these changes to enable autocomplete for the typeahead endpoint.
+ELASTICSEARCH_INDEX_SETTINGS = {
+    'settings': {
+        'analysis': {
+            'tokenizer': {
+                'haystack_edgengram_tokenizer': {
+                    'type': 'edgeNGram',
+                    'side': 'front',
+                    'min_gram': 2,
+                    'max_gram': 15
+                },
+                'haystack_ngram_tokenizer': {
+                    'type': 'nGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                }
+            },
+            'analyzer': {
+                'lowercase': {
+                    'type': 'custom',
+                    'tokenizer': 'keyword',
+                    'filter': [
+                        'lowercase'
+                    ]
+                },
+                'ngram_analyzer': {
+                    'type': 'custom',
+                    'filter': [
+                        'haystack_ngram',
+                        'lowercase'
+                    ],
+                    'tokenizer': 'standard'
+                },
+                'edgengram_analyzer': {
+                    'type': 'custom',
+                    'filter': [
+                        'haystack_edgengram',
+                        'lowercase'
+                    ],
+                    'tokenizer': 'standard'
+                }
+            },
+            'filter': {
+                'haystack_edgengram': {
+                    'type': 'edgeNGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                },
+                'haystack_ngram': {
+                    'type': 'nGram',
+                    'min_gram': 2,
+                    'max_gram': 15
+                }
+            }
+        }
+    }
+}
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200
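For intuition about the nGram/edgeNGram choice encoded above, the two filters differ only in which substrings they emit. A pure-Python sketch with the same min_gram/max_gram bounds:

```python
def ngrams(word, lo=2, hi=15):
    """ All substrings of length lo..hi, like the 'haystack_ngram' filter. """
    return [word[i:i + n] for n in range(lo, min(hi, len(word)) + 1)
            for i in range(len(word) - n + 1)]

def edge_ngrams(word, lo=2, hi=15):
    """ Prefixes only, like the 'haystack_edgengram' filter (side=front). """
    return [word[:n] for n in range(lo, min(hi, len(word)) + 1)]

print(edge_ngrams('python'))  # ['py', 'pyt', 'pyth', 'pytho', 'python']
print(ngrams('python'))       # adds interior grams: 'yt', 'th', 'ho', 'on', 'tho', ...
```

The interior grams are what let a partial query such as 'pytho' match mid-word, which prefix-only edge ngrams cannot do.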
......