Commit f7349d4f by Mike Dikan Committed by mikedikan

Create remove_unused_indexes management command

ECOM-7129

Create a management command that automatically removes older indexes, cleaning up dangling Elasticsearch indexes.
parent 410c54d2
......@@ -10,21 +10,14 @@ logger = logging.getLogger(__name__)
class ElasticsearchUtils(object):
@classmethod
def create_alias_and_index(cls, es, alias):
def create_alias_and_index(cls, es_connection, alias):
logger.info('Making sure alias [%s] exists...', alias)
if es.indices.exists_alias(name=alias):
if es_connection.indices.exists_alias(name=alias):
# If the alias exists, and points to an open index, we are all set.
logger.info('...alias exists.')
else:
# Create an index with a unique (timestamped) name
timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
index = '{alias}_{timestamp}'.format(alias=alias, timestamp=timestamp)
index_settings = settings.ELASTICSEARCH_INDEX_SETTINGS
index_settings['settings']['analysis']['filter']['synonym']['synonyms'] = get_synonyms(es)
es.indices.create(index=index, body=index_settings)
logger.info('...index [%s] created.', index)
index = cls.create_index(es_connection=es_connection, prefix=alias)
# Point the alias to the new index
body = {
'actions': [
......@@ -32,9 +25,30 @@ class ElasticsearchUtils(object):
{'add': {'alias': alias, 'index': index}},
]
}
es.indices.update_aliases(body)
es_connection.indices.update_aliases(body)
logger.info('...alias updated.')
@classmethod
def create_index(cls, es_connection, prefix):
    """
    Create a new, timestamped index and return its name.

    The name is built as ``<prefix>_<YYYYmmdd_HHMMSS>`` from the current UTC time.

    Args:
        es_connection (Elasticsearch): Elasticsearch connection - the connection object as created in the
            ElasticsearchSearchBackend class - the 'conn' attribute
        prefix (str): Alias for the connection, used as prefix for the index name

    Returns:
        index_name (str): Name of the new index.
    """
    created_at = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=created_at)

    # NOTE: this is the settings object itself, not a copy, so the synonyms assignment
    # below updates settings.ELASTICSEARCH_INDEX_SETTINGS in place.
    body = settings.ELASTICSEARCH_INDEX_SETTINGS
    synonym_filter = body['settings']['analysis']['filter']['synonym']
    synonym_filter['synonyms'] = get_synonyms(es_connection)

    es_connection.indices.create(index=index_name, body=body)
    logger.info('...index [%s] created.', index_name)
    return index_name
def get_all_related_field_names(model):
"""
......
import logging
from django.conf import settings
from django.core.management.base import BaseCommand
from haystack import connections as haystack_connections
logger = logging.getLogger(__name__)
class Command(BaseCommand):
    """
    Management command that deletes the oldest Elasticsearch indexes for each haystack backend.

    Indexes the backend's alias currently points to are never deleted. Of the remaining indexes whose
    name starts with ``<alias>_``, the newest ``settings.HAYSTACK_INDEX_RETENTION_LIMIT`` are kept.
    """
    backends = []
    help = 'This command will purge the oldest indexes, freeing up disk space. This command will never delete the ' \
           'currently used index.'

    def handle(self, **options):
        """
        Prune unused indexes for every selected backend.

        Uses the backends given in the ``using`` option, or every configured haystack connection
        when that option is missing or empty.
        """
        self.backends = options.get('using')
        if not self.backends:
            self.backends = list(haystack_connections.connections_info.keys())

        for backend_name in self.backends:
            connection = haystack_connections[backend_name]
            backend = connection.get_backend()
            indices_client = backend.conn.indices
            current_alias_name = backend.index_name
            self.remove_unused_indexes(indices_client=indices_client, current_alias_name=current_alias_name)

    def remove_unused_indexes(self, indices_client, current_alias_name):
        """
        Removes all but the newest (Elasticsearch) indexes, using the configured value to limit deletions

        Args:
            indices_client (IndicesClient): Elasticsearch Index API client, used to list/delete index
            current_alias_name (str): The name of the configured alias, used for lookup

        Returns:
            None
        """
        sorted_indexes_by_timestamp = self.get_indexes_sorted_by_timestamp(indices_client=indices_client,
                                                                           index_prefix=current_alias_name)
        logger.info('Found %d indexes', len(sorted_indexes_by_timestamp))

        # Remove current index from list so we don't delete it. The filtering must keep the
        # oldest-first ordering: a set difference would scramble it and the slice below could
        # then delete the newest indexes instead of the oldest.
        current_alias = indices_client.get_alias(name=current_alias_name)
        indexes_in_use = set(current_alias.keys())
        removal_candidates = [
            index_name for index_name in sorted_indexes_by_timestamp if index_name not in indexes_in_use
        ]

        num_indices_to_remove = len(removal_candidates) - settings.HAYSTACK_INDEX_RETENTION_LIMIT
        if num_indices_to_remove > 0:
            # Candidates are sorted ascending, so the head of the list holds the oldest indexes.
            indices_to_remove = removal_candidates[:num_indices_to_remove]
            logger.info('Deleting indices %s...', indices_to_remove)
            indices_client.delete(index=','.join(indices_to_remove))
            logger.info('Successfully deleted indices %s.', indices_to_remove)
        else:
            logger.info('No indices to remove.')

    def get_indexes_sorted_by_timestamp(self, indices_client, index_prefix):
        """
        Uses the haystack connection to fetch the (Elasticsearch) indexes, sorted by timestamp

        The result is sorted by index name; this equals timestamp order as long as the names end in a
        fixed-width, zero-padded timestamp suffix.

        Args:
            indices_client (IndicesClient): Elasticsearch Index API client, used to fetch index info
            index_prefix (str): The string prefix for the index, used to match the indexes fetched

        Returns:
            sorted_indexes_by_timestamp (list): The sorted listing of index names
        """
        all_index_settings = indices_client.get_settings()
        name_prefix = index_prefix + '_'
        return sorted(index_name for index_name in all_index_settings.keys() if index_name.startswith(name_prefix))
import datetime
import logging
from django.conf import settings
from haystack import connections as haystack_connections
from haystack.management.commands.update_index import Command as HaystackCommand
from course_discovery.settings.process_synonyms import get_synonyms
from course_discovery.apps.core.utils import ElasticsearchUtils
logger = logging.getLogger(__name__)
......@@ -70,24 +68,6 @@ class Command(HaystackCommand):
index_name(str): Name of the newly-created index.
"""
alias = backend.index_name
index_name = self.create_timestamped_index(backend, alias)
index_name = ElasticsearchUtils.create_index(backend.conn, alias)
backend.index_name = index_name
return alias, index_name
def create_timestamped_index(self, backend, prefix):
    """
    Create an index named ``<prefix>_<YYYYmmdd_HHMMSS>`` (current UTC time) and return the name.

    Args:
        backend (ElasticsearchSearchBackend): Backend through which to connect to Elasticsearch.
        prefix (str): Prefix for the index name

    Returns:
        index_name (str): Name of the new index.
    """
    created_at = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    index_name = '{alias}_{timestamp}'.format(alias=prefix, timestamp=created_at)

    # NOTE: the settings object is updated in place (no copy is taken) when synonyms are assigned.
    body = settings.ELASTICSEARCH_INDEX_SETTINGS
    synonym_filter = body['settings']['analysis']['filter']['synonym']
    synonym_filter['synonyms'] = get_synonyms(backend.conn)

    backend.conn.indices.create(index=index_name, body=body)
    return index_name
from django.conf import settings
from elasticsearch.helpers import bulk
from haystack import connections as haystack_connections
from haystack.backends import BaseSearchBackend
from mock import patch
......@@ -25,6 +26,21 @@ class SearchBackendTestMixin(ElasticsearchTestMixin):
return self.backend.conn.count(index=self.backend.index_name)['count']
class SearchIndexTestMixin(object):
    """ Test mixin that records the default backend's index prefix and deletes matching indexes afterwards. """

    backend = None
    # The backend.index_name is manipulated during operation, so we snapshot prefix during setUp
    index_prefix = None

    def setUp(self):
        """ Grab the default haystack backend and remember its index name as the prefix. """
        super(SearchIndexTestMixin, self).setUp()
        default_backend = haystack_connections['default'].get_backend()
        self.backend = default_backend
        self.index_prefix = default_backend.index_name

    def tearDown(self):
        """ Remove the indexes we created """
        delete_indexes = self.backend.conn.indices.delete
        delete_indexes(index=self.index_prefix + '_*')
        super(SearchIndexTestMixin, self).tearDown()
class SimpleQuerySearchBackendMixinTestMixin(SearchBackendTestMixin):
""" Test class mixin for testing children of SimpleQuerySearchBackendMixin. """
......
import datetime
from django.conf import settings
from django.core.management import call_command
from django.test import TestCase
from freezegun import freeze_time
from course_discovery.apps.core.utils import ElasticsearchUtils
from course_discovery.apps.edx_haystack_extensions.tests.mixins import SearchIndexTestMixin
class RemoveUnusedIndexesTests(SearchIndexTestMixin, TestCase):
    """ Tests for the remove_unused_indexes management command. """
    backend = None

    def test_handle(self):
        """ Verify the command removes all but the newest indexes. """
        # Create initial index with alias
        ElasticsearchUtils.create_alias_and_index(es_connection=self.backend.conn, alias=self.backend.index_name)

        # Index names are stamped with utcnow(), and freezegun interprets naive datetimes as UTC, so the
        # starting point must be UTC as well for the extra indexes to be created AFTER the current index.
        initial_time = datetime.datetime.utcnow()

        # Create one more (non-current) index than the retention limit allows
        for number in range(1, settings.HAYSTACK_INDEX_RETENTION_LIMIT + 2):
            current_time = initial_time + datetime.timedelta(seconds=number)
            # Context manager guarantees time is unfrozen even if index creation raises
            with freeze_time(current_time):
                ElasticsearchUtils.create_index(es_connection=self.backend.conn, prefix=self.backend.index_name)

        # Prune indexes and confirm the right indexes are removed
        call_command('remove_unused_indexes')
        current_alias_name = self.backend.index_name
        indices_client = self.backend.conn.indices
        current_alias = indices_client.get_alias(name=current_alias_name)
        indexes_to_keep = current_alias.keys()

        # check that we keep the current indexes, which we don't want removed
        all_indexes = self.get_current_index_names(indices_client=indices_client, index_prefix=self.backend.index_name)
        assert set(all_indexes).issuperset(set(indexes_to_keep))

        # check that other indexes are removed, excepting those that don't hit the retention limit
        expected_count = settings.HAYSTACK_INDEX_RETENTION_LIMIT + len(indexes_to_keep)
        assert len(all_indexes) == expected_count

        # Attempt to prune indexes again and confirm that no indexes are removed
        call_command('remove_unused_indexes')

        # check that we keep the current indexes, which we don't want removed
        all_indexes = self.get_current_index_names(indices_client=indices_client, index_prefix=self.backend.index_name)
        assert set(all_indexes).issuperset(set(indexes_to_keep))

        # check that index count remains the same as before
        assert len(all_indexes) == expected_count

    @staticmethod
    def get_current_index_names(indices_client, index_prefix):
        """ Return the names of all existing indexes that start with ``<index_prefix>_``. """
        all_index_settings = indices_client.get_settings()
        all_indexes = list(all_index_settings.keys())
        all_current_indexes = [index_name for index_name in all_indexes if index_name.startswith(index_prefix + '_')]
        return all_current_indexes
......@@ -4,8 +4,10 @@ from django.test import TestCase
from elasticsearch import Elasticsearch
from freezegun import freeze_time
from course_discovery.apps.edx_haystack_extensions.tests.mixins import SearchIndexTestMixin
class UpdateIndexTests(TestCase):
class UpdateIndexTests(SearchIndexTestMixin, TestCase):
@freeze_time('2016-06-21')
def test_handle(self):
""" Verify the command creates a timestamped index and repoints the alias. """
......
......@@ -431,6 +431,7 @@ HAYSTACK_CONNECTIONS = {
}
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
HAYSTACK_INDEX_RETENTION_LIMIT = 3
# Elasticsearch search query facet "size" option to increase from the default value of "100"
# See https://www.elastic.co/guide/en/elasticsearch/reference/1.5/search-facets-terms-facet.html#_accuracy_control
......
......@@ -11,3 +11,14 @@ until we can determine what other edX-specific components need to be extracted f
edX developers should add ``'course_discovery.apps.edx_catalog_extensions'`` to the ``INSTALLED_APPS`` setting in a
``private.py`` settings file.
Settings
========
``HAYSTACK_INDEX_RETENTION_LIMIT``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Default: ``3``
This setting sets an upper bound on the number of indexes, not counting the currently used index, that will be
retained after a purge triggered by the ``remove_unused_indexes`` command. This command will never delete the currently used index.
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment