Commit 17f4da7e by Matthew Piatetsky

Enable configuration of synonyms for search and typeahead

ECOM-4959
parent 5d278776
...@@ -3,6 +3,8 @@ import logging ...@@ -3,6 +3,8 @@ import logging
from django.conf import settings from django.conf import settings
from course_discovery.settings.process_synonyms import get_synonyms
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -19,6 +21,7 @@ class ElasticsearchUtils(object): ...@@ -19,6 +21,7 @@ class ElasticsearchUtils(object):
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp) index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
index_settings['settings']['analysis']['filter']['synonym']['synonyms'] = get_synonyms(es)
es.indices.create(index=index, body=index_settings) es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index) logger.info('...index [%s] created.', index)
......
...@@ -93,14 +93,21 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend): ...@@ -93,14 +93,21 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def build_schema(self, fields): def build_schema(self, fields):
content_field_name, mapping = super().build_schema(fields) content_field_name, mapping = super().build_schema(fields)
# Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality
snowball_with_synonyms = 'snowball_with_synonyms'
for field, value in mapping.items():
if value.get('analyzer') == 'snowball':
self.specify_analyzers(mapping=mapping, field=field,
index_analyzer=snowball_with_synonyms,
search_analyzer=snowball_with_synonyms)
# Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer
# This is necessary to support partial searches/typeahead # This is necessary to support partial searches/typeahead
# If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing" # If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing"
# and all words containing ing would come back in typeahead. # and all words containing ing would come back in typeahead.
self.specify_analyzers(mapping=mapping, field='title_autocomplete', self.specify_analyzers(mapping=mapping, field='title_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer='lowercase') index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete', self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer='lowercase') index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
return (content_field_name, mapping) return (content_field_name, mapping)
......
...@@ -5,6 +5,9 @@ from django.conf import settings ...@@ -5,6 +5,9 @@ from django.conf import settings
from haystack import connections as haystack_connections from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand from haystack.management.commands.update_index import Command as HaystackCommand
from course_discovery.settings.process_synonyms import get_synonyms
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -86,5 +89,6 @@ class Command(HaystackCommand): ...@@ -86,5 +89,6 @@ class Command(HaystackCommand):
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S') timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp) index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp)
index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
index_settings['settings']['analysis']['filter']['synonym']['synonyms'] = get_synonyms(backend.conn)
backend.conn.indices.create(index=index_name, body=index_settings) backend.conn.indices.create(index=index_name, body=index_settings)
return index_name return index_name
...@@ -377,42 +377,48 @@ ELASTICSEARCH_INDEX_SETTINGS = { ...@@ -377,42 +377,48 @@ ELASTICSEARCH_INDEX_SETTINGS = {
'type': 'custom', 'type': 'custom',
'tokenizer': 'keyword', 'tokenizer': 'keyword',
'filter': [ 'filter': [
'lowercase' 'lowercase',
'synonym',
] ]
}, },
'ngram_analyzer': { 'snowball_with_synonyms': {
'type':'custom', 'type': 'custom',
'filter': [ 'filter': [
'haystack_ngram', 'standard',
'lowercase' 'lowercase',
'snowball',
'synonym'
], ],
'tokenizer': 'standard' 'tokenizer': 'standard'
}, },
'edgengram_analyzer': { 'ngram_analyzer': {
'type': 'custom', 'type':'custom',
'filter': [ 'filter': [
'haystack_edgengram', 'lowercase',
'lowercase' 'haystack_ngram',
'synonym',
], ],
'tokenizer': 'standard' 'tokenizer': 'keyword'
} }
}, },
'filter': { 'filter': {
'haystack_edgengram': {
'type': 'edgeNGram',
'min_gram': 2,
'max_gram': 15
},
'haystack_ngram': { 'haystack_ngram': {
'type': 'nGram', 'type': 'nGram',
'min_gram': 2, 'min_gram': 2,
'max_gram': 15 'max_gram': 22
},
'synonym' : {
'type': 'synonym',
'ignore_case': 'true',
'synonyms': []
} }
} }
} }
} }
} }
SYNONYMS_MODULE = 'course_discovery.settings.synonyms'
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html) # Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200 HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200
......
from functools import lru_cache
import importlib
from django.conf import settings
def process_synonyms(es, synonyms):
    """Convert synonym groups to their analyzed (stemmed) form.

    Each synonym group is a list of terms, e.g. ['running', 'jogging'].
    Every term is run through the Elasticsearch 'snowball' analyzer so that
    the synonyms match the stemmed tokens stored in the index, and each group
    is collapsed into the comma-separated string format Elasticsearch expects
    for a synonym token filter (e.g. 'run,jog').

    Args:
        es (client): Elasticsearch client used to call the analyze API.
        synonyms (list): list of synonym groups, each a list of strings.

    Returns:
        list: one comma-separated string of analyzed terms per input group.
    """
    processed_synonyms = []
    for group in synonyms:
        analyzed_terms = []
        for term in group:
            # Multi-word terms produce multiple tokens; rejoin them with spaces.
            response = es.indices.analyze(text=term, analyzer='snowball')
            analyzed_terms.append(' '.join(token['token'] for token in response['tokens']))
        processed_synonyms.append(','.join(analyzed_terms))
    return processed_synonyms
def get_synonym_lines_from_file():
    """Load the raw synonym groups from the module named by settings.SYNONYMS_MODULE.

    Returns:
        list: the module's SYNONYMS attribute (a list of synonym groups).
    """
    synonyms_module = importlib.import_module(settings.SYNONYMS_MODULE)
    return synonyms_module.SYNONYMS
@lru_cache()
def get_synonyms(es):
    """Return the configured synonyms, analyzed for use in an ES synonym filter.

    Loads the synonym groups from the configured module and runs them through
    process_synonyms so they match the stemmed tokens in the index.

    NOTE(review): lru_cache keys on the `es` client instance and keeps it alive
    for the life of the process — acceptable for a single long-lived client,
    but confirm callers do not create clients per request.

    Args:
        es (client): Elasticsearch client used to analyze the synonym terms.

    Returns:
        list: comma-separated synonym strings, one per group.
    """
    synonyms = get_synonym_lines_from_file()
    synonyms = process_synonyms(es, synonyms)
    return synonyms
...@@ -9,6 +9,8 @@ HAYSTACK_CONNECTIONS = { ...@@ -9,6 +9,8 @@ HAYSTACK_CONNECTIONS = {
}, },
} }
SYNONYMS_MODULE = 'course_discovery.settings.test_synonyms'
EDX_DRF_EXTENSIONS = { EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://example.com/oauth2/user_info', 'OAUTH2_USER_INFO_URL': 'http://example.com/oauth2/user_info',
} }
......
# Note: Do not use synonyms with punctuation, search and typeahead do not yet fully support punctuation
# Each entry is a group of equivalent terms; groups are analyzed (stemmed) and
# joined into Elasticsearch synonym-filter strings at index-creation time.
SYNONYMS = [
    # Organizations
    ['ACCA', 'ACCAx'],  # fixed: 'ACCA' was listed twice in this group
    ['ACLU', 'American Civil Liberties Union'],
    ['Berkeley', 'UC BerkeleyX', 'UCBerkeleyX'],
    ['Georgia Institute of Technology', 'Georgia Tech', 'GTx'],
    ['Instituto Tecnologico y De Estudios Superiores De Monterrey', 'Monterrey', 'TecdeMonterreyX'],
    ['Microsoft', 'MicrosoftX', 'msft'],
    ['MIT', 'MITx'],
    ['New York Institute of Finance', 'NYIF', 'NYIFx'],
    ['The University of Michigan', 'MichiganX', 'UMichiganX', 'U Michigan'],
    ['The University of Texas System', 'UTx'],
    ['The University of California San Diego', 'UC San DiegoX', 'UCSanDiegoX'],
    ['The Disque Foundation', 'Save A LifeX', 'SaveALifeX'],
    ['University of Pennsylvania', 'PennX', 'UPennX', 'UPenn'],
    ['Universitat Politècnica de València', 'València', 'Valencia'],
    ['Wharton', 'WhartonX'],
    # Common Misspellings
    ['cs50x', 'cs50'],
    ['ilets', 'ielts'],
    ['phyton', 'python'],
    ['toefl', 'tofel', 'toelf'],
    # Subjects
    ['a11y', 'accessibility'],
    ['bi', 'business intelligence'],
    ['bme', 'biomedical engineering'],
    ['computer science', 'cs'],
    ['econ', 'economics'],
    ['ee', 'electrical engineering'],
    ['español', 'espanol', 'spanish'],
    ['français', 'francais', 'french'],
    ['it', 'information technology'],
    ['mis', 'management information systems'],
    ['psych', 'psychology'],
    ['seo', 'search engine optimization'],
    ['ux', 'user experience'],
    # Other Terms
    ['autocad', 'auto cad', 'cad'],
    ['aws', 'amazon web services'],
    ['css', 'cascading style sheets'],
    ['excel', 'microsoft excel', 'msft excel'],
    ['hr', 'human resources'],
    ['HTML5', 'HTML'],
    ['iot', 'internet of things'],
    ['javascript', 'js', 'java script', 'react', 'typescript', 'jquery'],
    ['management', 'mgmt'],
    ['os', 'operating system'],
    ['photo', 'photography'],
    ['vb', 'visual basic'],
    ['vba', 'excel'],
    ['usa', 'united states of america', 'murika'],
]
# Test Synonyms
# Minimal fixture consumed when SYNONYMS_MODULE points at the test settings.
SYNONYMS = [
    ['University', 'UniversityX'],
    ['HTML5', 'HTML'],
    ['running', 'jogging'],
    ['spanish', 'español'],
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment