Commit a026fd3b by Feanil Patel

Merge pull request #1796 from edx/feanil/supervisor_backoff

Feanil/supervisor backoff
parents 7e67dd72 1bb5c7e8
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import subprocess import subprocess
import traceback import traceback
import socket import socket
import time
# Services that should be checked for migrations. # Services that should be checked for migrations.
MIGRATION_COMMANDS = { MIGRATION_COMMANDS = {
...@@ -16,6 +17,10 @@ MIGRATION_COMMANDS = { ...@@ -16,6 +17,10 @@ MIGRATION_COMMANDS = {
} }
HIPCHAT_USER = "PreSupervisor" HIPCHAT_USER = "PreSupervisor"
# Max amount of time (in seconds) to wait for tags to be applied.
MAX_BACKOFF = 120
INITIAL_BACKOFF = 1
def services_for_instance(instance_id): def services_for_instance(instance_id):
""" """
Get the list of all services named by the services tag in this Get the list of all services named by the services tag in this
...@@ -106,31 +111,56 @@ if __name__ == '__main__': ...@@ -106,31 +111,56 @@ if __name__ == '__main__':
# Needs to exit with 1 instead of 0 to prevent # Needs to exit with 1 instead of 0 to prevent
# services from starting. # services from starting.
exit(1) exit(1)
time_left = MAX_BACKOFF
try: backoff = INITIAL_BACKOFF
environment, deployment, play = edp_for_instance(instance_id)
prefix = "{environment}-{deployment}-{play}-{instance_id}".format( environment = None
environment=environment, deployment = None
deployment=deployment, play = None
play=play, while time_left > 0:
instance_id=instance_id) try:
except: environment, deployment, play = edp_for_instance(instance_id)
print("Failed to get EDP for {}".format(instance_id)) prefix = "{environment}-{deployment}-{play}-{instance_id}".format(
environment=environment,
deployment=deployment,
play=play,
instance_id=instance_id)
break
except:
print("Failed to get EDP for {}".format(instance_id))
# With the time limit being 2 minutes and the backoff doubling from
# 1 second (1, 2, 4, ... 64), we will try 7 times before giving up.
time.sleep(backoff)
time_left -= backoff
backoff = backoff * 2
if environment is None or deployment is None or play is None:
msg = "Unable to retrieve environment, deployment, or play tag."
print(msg)
if notify:
notify("{} : {}".format(prefix, msg))
exit(0)
#get the hostname of the sandbox #get the hostname of the sandbox
hostname = socket.gethostname() hostname = socket.gethostname()
#get the list of the volumes, that are attached to the instance try:
volumes = ec2.get_all_volumes(filters={'attachment.instance-id': instance_id}) #get the list of the volumes, that are attached to the instance
volumes = ec2.get_all_volumes(filters={'attachment.instance-id': instance_id})
for volume in volumes:
volume.add_tags({"hostname": hostname, for volume in volumes:
"environment": environment, volume.add_tags({"hostname": hostname,
"deployment": deployment, "environment": environment,
"cluster": play, "deployment": deployment,
"instance-id": instance_id, "cluster": play,
"created": volume.create_time }) "instance-id": instance_id,
"created": volume.create_time })
except:
msg = "Failed to tag volumes associated with {}".format(instance_id)
print(msg)
if notify:
notify(msg)
try: try:
for service in services_for_instance(instance_id): for service in services_for_instance(instance_id):
if service in MIGRATION_COMMANDS: if service in MIGRATION_COMMANDS:
...@@ -153,7 +183,7 @@ if __name__ == '__main__': ...@@ -153,7 +183,7 @@ if __name__ == '__main__':
output = subprocess.check_output(cmd, shell=True) output = subprocess.check_output(cmd, shell=True)
if 'Migrating' in output: if 'Migrating' in output:
raise Exception("Migrations have not been run for {}".format(service)) raise Exception("Migrations have not been run for {}".format(service))
# Link to available service. # Link to available service.
available_file = os.path.join(args.available, "{}.conf".format(service)) available_file = os.path.join(args.available, "{}.conf".format(service))
link_location = os.path.join(args.enabled, "{}.conf".format(service)) link_location = os.path.join(args.enabled, "{}.conf".format(service))
...@@ -174,6 +204,7 @@ if __name__ == '__main__': ...@@ -174,6 +204,7 @@ if __name__ == '__main__':
if notify: if notify:
notify(msg) notify(msg)
traceback.print_exc() traceback.print_exc()
raise e
else: else:
msg = "{}: {}".format(prefix, " | ".join(report)) msg = "{}: {}".format(prefix, " | ".join(report))
print(msg) print(msg)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment