Commit a026fd3b by Feanil Patel

Merge pull request #1796 from edx/feanil/supervisor_backoff

Feanil/supervisor backoff
parents 7e67dd72 1bb5c7e8
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import subprocess import subprocess
import traceback import traceback
import socket import socket
import time
# Services that should be checked for migrations. # Services that should be checked for migrations.
MIGRATION_COMMANDS = { MIGRATION_COMMANDS = {
...@@ -16,6 +17,10 @@ MIGRATION_COMMANDS = { ...@@ -16,6 +17,10 @@ MIGRATION_COMMANDS = {
} }
HIPCHAT_USER = "PreSupervisor" HIPCHAT_USER = "PreSupervisor"
# Max amount of time (in seconds) to wait for tags to be applied.
MAX_BACKOFF = 120
INITIAL_BACKOFF = 1
def services_for_instance(instance_id): def services_for_instance(instance_id):
""" """
Get the list of all services named by the services tag in this Get the list of all services named by the services tag in this
...@@ -106,7 +111,13 @@ if __name__ == '__main__': ...@@ -106,7 +111,13 @@ if __name__ == '__main__':
# Needs to exit with 1 instead of 0 to prevent # Needs to exit with 1 instead of 0 to prevent
# services from starting. # services from starting.
exit(1) exit(1)
time_left = MAX_BACKOFF
backoff = INITIAL_BACKOFF
environment = None
deployment = None
play = None
while time_left > 0:
try: try:
environment, deployment, play = edp_for_instance(instance_id) environment, deployment, play = edp_for_instance(instance_id)
prefix = "{environment}-{deployment}-{play}-{instance_id}".format( prefix = "{environment}-{deployment}-{play}-{instance_id}".format(
...@@ -114,12 +125,26 @@ if __name__ == '__main__': ...@@ -114,12 +125,26 @@ if __name__ == '__main__':
deployment=deployment, deployment=deployment,
play=play, play=play,
instance_id=instance_id) instance_id=instance_id)
break
except: except:
print("Failed to get EDP for {}".format(instance_id)) print("Failed to get EDP for {}".format(instance_id))
# With the time limit being 2 minutes and the backoff doubling from 1 second,
# we will try 7 times (sleeping 1+2+4+8+16+32+64 seconds) before giving up.
time.sleep(backoff)
time_left -= backoff
backoff = backoff * 2
if environment is None or deployment is None or play is None:
msg = "Unable to retrieve environment, deployment, or play tag."
print(msg)
if notify:
notify("{} : {}".format(prefix, msg))
exit(0)
#get the hostname of the sandbox #get the hostname of the sandbox
hostname = socket.gethostname() hostname = socket.gethostname()
try:
#get the list of the volumes, that are attached to the instance #get the list of the volumes, that are attached to the instance
volumes = ec2.get_all_volumes(filters={'attachment.instance-id': instance_id}) volumes = ec2.get_all_volumes(filters={'attachment.instance-id': instance_id})
...@@ -130,6 +155,11 @@ if __name__ == '__main__': ...@@ -130,6 +155,11 @@ if __name__ == '__main__':
"cluster": play, "cluster": play,
"instance-id": instance_id, "instance-id": instance_id,
"created": volume.create_time }) "created": volume.create_time })
except:
msg = "Failed to tag volumes associated with {}".format(instance_id)
print(msg)
if notify:
notify(msg)
try: try:
for service in services_for_instance(instance_id): for service in services_for_instance(instance_id):
...@@ -174,6 +204,7 @@ if __name__ == '__main__': ...@@ -174,6 +204,7 @@ if __name__ == '__main__':
if notify: if notify:
notify(msg) notify(msg)
traceback.print_exc() traceback.print_exc()
raise e
else: else:
msg = "{}: {}".format(prefix, " | ".join(report)) msg = "{}: {}".format(prefix, " | ".join(report))
print(msg) print(msg)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment