Commit 17f4da7e by Matthew Piatetsky

Enable configuration of synonyms for search and typeahead

ECOM-4959
parent 5d278776
...@@ -3,6 +3,8 @@ import logging ...@@ -3,6 +3,8 @@ import logging
from django.conf import settings from django.conf import settings
from course_discovery.settings.process_synonyms import get_synonyms
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -19,6 +21,7 @@ class ElasticsearchUtils(object): ...@@ -19,6 +21,7 @@ class ElasticsearchUtils(object):
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp) index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
index_settings['settings']['analysis']['filter']['synonym']['synonyms'] = get_synonyms(es)
es.indices.create(index=index, body=index_settings) es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index) logger.info('...index [%s] created.', index)
......
...@@ -93,14 +93,21 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend): ...@@ -93,14 +93,21 @@ class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def build_schema(self, fields): def build_schema(self, fields):
content_field_name, mapping = super().build_schema(fields) content_field_name, mapping = super().build_schema(fields)
# Fields default to snowball analyzer, this keeps snowball functionality, but adds synonym functionality
snowball_with_synonyms = 'snowball_with_synonyms'
for field, value in mapping.items():
if value.get('analyzer') == 'snowball':
self.specify_analyzers(mapping=mapping, field=field,
index_analyzer=snowball_with_synonyms,
search_analyzer=snowball_with_synonyms)
# Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer
# This is necessary to support partial searches/typeahead # This is necessary to support partial searches/typeahead
# If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing" # If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing"
# and all words containing ing would come back in typeahead. # and all words containing ing would come back in typeahead.
self.specify_analyzers(mapping=mapping, field='title_autocomplete', self.specify_analyzers(mapping=mapping, field='title_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer='lowercase') index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete', self.specify_analyzers(mapping=mapping, field='authoring_organizations_autocomplete',
index_analyzer='ngram_analyzer', search_analyzer='lowercase') index_analyzer='ngram_analyzer', search_analyzer=snowball_with_synonyms)
return (content_field_name, mapping) return (content_field_name, mapping)
......
...@@ -5,6 +5,9 @@ from django.conf import settings ...@@ -5,6 +5,9 @@ from django.conf import settings
from haystack import connections as haystack_connections from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand from haystack.management.commands.update_index import Command as HaystackCommand
from course_discovery.settings.process_synonyms import get_synonyms
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -86,5 +89,6 @@ class Command(HaystackCommand): ...@@ -86,5 +89,6 @@ class Command(HaystackCommand):
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S') timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp) index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=timestamp)
index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
index_settings['settings']['analysis']['filter']['synonym']['synonyms'] = get_synonyms(backend.conn)
backend.conn.indices.create(index=index_name, body=index_settings) backend.conn.indices.create(index=index_name, body=index_settings)
return index_name return index_name
...@@ -377,42 +377,48 @@ ELASTICSEARCH_INDEX_SETTINGS = { ...@@ -377,42 +377,48 @@ ELASTICSEARCH_INDEX_SETTINGS = {
'type': 'custom', 'type': 'custom',
'tokenizer': 'keyword', 'tokenizer': 'keyword',
'filter': [ 'filter': [
'lowercase' 'lowercase',
'synonym',
] ]
}, },
'ngram_analyzer': { 'snowball_with_synonyms': {
'type':'custom', 'type': 'custom',
'filter': [ 'filter': [
'haystack_ngram', 'standard',
'lowercase' 'lowercase',
'snowball',
'synonym'
], ],
'tokenizer': 'standard' 'tokenizer': 'standard'
}, },
'edgengram_analyzer': { 'ngram_analyzer': {
'type': 'custom', 'type':'custom',
'filter': [ 'filter': [
'haystack_edgengram', 'lowercase',
'lowercase' 'haystack_ngram',
'synonym',
], ],
'tokenizer': 'standard' 'tokenizer': 'keyword'
} }
}, },
'filter': { 'filter': {
'haystack_edgengram': {
'type': 'edgeNGram',
'min_gram': 2,
'max_gram': 15
},
'haystack_ngram': { 'haystack_ngram': {
'type': 'nGram', 'type': 'nGram',
'min_gram': 2, 'min_gram': 2,
'max_gram': 15 'max_gram': 22
},
'synonym' : {
'type': 'synonym',
'ignore_case': 'true',
'synonyms': []
} }
} }
} }
} }
} }
SYNONYMS_MODULE = 'course_discovery.settings.synonyms'
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html) # Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200 HAYSTACK_ITERATOR_LOAD_PER_QUERY = 200
......
from functools import lru_cache
import importlib
from django.conf import settings
def process_synonyms(es, synonyms):
    """Convert synonym groups to their analyzed (stemmed) form.

    Each synonym group is a list of terms, e.g. ['running', 'jogging'].
    Every term is run through the Elasticsearch 'snowball' analyzer so that
    the synonyms match the stemmed tokens stored in the index, and each group
    is collapsed into the comma-separated string format Elasticsearch expects
    for a synonym token filter (e.g. 'run,jog').

    Args:
        es (client): Elasticsearch client used to call the analyze API.
        synonyms (list): list of synonym groups, each a list of strings.

    Returns:
        list: one comma-separated string of analyzed terms per input group.
    """
    processed_synonyms = []
    for group in synonyms:
        analyzed_terms = []
        for term in group:
            # Multi-word terms produce multiple tokens; rejoin them with spaces.
            response = es.indices.analyze(text=term, analyzer='snowball')
            analyzed_terms.append(' '.join(token['token'] for token in response['tokens']))
        processed_synonyms.append(','.join(analyzed_terms))
    return processed_synonyms
def get_synonym_lines_from_file():
    """Load the raw synonym groups from the module named by settings.SYNONYMS_MODULE.

    Returns:
        list: the module's SYNONYMS attribute (a list of synonym groups).
    """
    synonyms_module = importlib.import_module(settings.SYNONYMS_MODULE)
    return synonyms_module.SYNONYMS
@lru_cache()
def get_synonyms(es):
    """Return the configured synonyms, analyzed for use in an ES synonym filter.

    Loads the synonym groups from the configured module and runs them through
    process_synonyms so they match the stemmed tokens in the index.

    NOTE(review): lru_cache keys on the `es` client instance and keeps it alive
    for the life of the process — acceptable for a single long-lived client,
    but confirm callers do not create clients per request.

    Args:
        es (client): Elasticsearch client used to analyze the synonym terms.

    Returns:
        list: comma-separated synonym strings, one per group.
    """
    synonyms = get_synonym_lines_from_file()
    synonyms = process_synonyms(es, synonyms)
    return synonyms
...@@ -9,6 +9,8 @@ HAYSTACK_CONNECTIONS = { ...@@ -9,6 +9,8 @@ HAYSTACK_CONNECTIONS = {
}, },
} }
SYNONYMS_MODULE = 'course_discovery.settings.test_synonyms'
EDX_DRF_EXTENSIONS = { EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://example.com/oauth2/user_info', 'OAUTH2_USER_INFO_URL': 'http://example.com/oauth2/user_info',
} }
......
# Note: Do not use synonyms with punctuation, search and typeahead do not yet fully support punctuation
# Each entry is a group of equivalent terms; groups are analyzed (stemmed) and
# joined into Elasticsearch synonym-filter strings at index-creation time.
SYNONYMS = [
    # Organizations
    ['ACCA', 'ACCAx'],  # fixed: 'ACCA' was listed twice in this group
    ['ACLU', 'American Civil Liberties Union'],
    ['Berkeley', 'UC BerkeleyX', 'UCBerkeleyX'],
    ['Georgia Institute of Technology', 'Georgia Tech', 'GTx'],
    ['Instituto Tecnologico y De Estudios Superiores De Monterrey', 'Monterrey', 'TecdeMonterreyX'],
    ['Microsoft', 'MicrosoftX', 'msft'],
    ['MIT', 'MITx'],
    ['New York Institute of Finance', 'NYIF', 'NYIFx'],
    ['The University of Michigan', 'MichiganX', 'UMichiganX', 'U Michigan'],
    ['The University of Texas System', 'UTx'],
    ['The University of California San Diego', 'UC San DiegoX', 'UCSanDiegoX'],
    ['The Disque Foundation', 'Save A LifeX', 'SaveALifeX'],
    ['University of Pennsylvania', 'PennX', 'UPennX', 'UPenn'],
    ['Universitat Politècnica de València', 'València', 'Valencia'],
    ['Wharton', 'WhartonX'],
    # Common Misspellings
    ['cs50x', 'cs50'],
    ['ilets', 'ielts'],
    ['phyton', 'python'],
    ['toefl', 'tofel', 'toelf'],
    # Subjects
    ['a11y', 'accessibility'],
    ['bi', 'business intelligence'],
    ['bme', 'biomedical engineering'],
    ['computer science', 'cs'],
    ['econ', 'economics'],
    ['ee', 'electrical engineering'],
    ['español', 'espanol', 'spanish'],
    ['français', 'francais', 'french'],
    ['it', 'information technology'],
    ['mis', 'management information systems'],
    ['psych', 'psychology'],
    ['seo', 'search engine optimization'],
    ['ux', 'user experience'],
    # Other Terms
    ['autocad', 'auto cad', 'cad'],
    ['aws', 'amazon web services'],
    ['css', 'cascading style sheets'],
    ['excel', 'microsoft excel', 'msft excel'],
    ['hr', 'human resources'],
    ['HTML5', 'HTML'],
    ['iot', 'internet of things'],
    ['javascript', 'js', 'java script', 'react', 'typescript', 'jquery'],
    ['management', 'mgmt'],
    ['os', 'operating system'],
    ['photo', 'photography'],
    ['vb', 'visual basic'],
    ['vba', 'excel'],
    ['usa', 'united states of america', 'murika'],
]
# Test Synonyms
# Minimal fixture consumed when SYNONYMS_MODULE points at the test settings.
SYNONYMS = [
    ['University', 'UniversityX'],
    ['HTML5', 'HTML'],
    ['running', 'jogging'],
    ['spanish', 'español'],
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment