Commit 2e07567c by Hugh Saunders Committed by James Cammarata

Retry exec command via ssh_retry

This PR adds the option to retry failed ssh executions, if the failure
is caused by ssh itself, not the remote command. This can be helpful if
there are transient network issues. Retries are only implemented in the
openssh connection plugin and are disabled by default. Retries are
enabled by setting ssh_connection > retries to an integer greater
than 0.

Running a long series of playbooks, or a short playbook against a large
cluster may result in transient ssh failures, some examples logged
[here](https://trello.com/c/1yh6csEQ/13-ssh-errors).

Ansible should be able to retry an ssh connection in order to survive
transient failures.

Ansible marks a host as failed the first time it fails to contact it.
parent 684e30a5
...@@ -195,7 +195,9 @@ RETRY_FILES_SAVE_PATH = get_config(p, DEFAULTS, 'retry_files_save_path' ...@@ -195,7 +195,9 @@ RETRY_FILES_SAVE_PATH = get_config(p, DEFAULTS, 'retry_files_save_path'
ANSIBLE_SSH_ARGS = get_config(p, 'ssh_connection', 'ssh_args', 'ANSIBLE_SSH_ARGS', None) ANSIBLE_SSH_ARGS = get_config(p, 'ssh_connection', 'ssh_args', 'ANSIBLE_SSH_ARGS', None)
ANSIBLE_SSH_CONTROL_PATH = get_config(p, 'ssh_connection', 'control_path', 'ANSIBLE_SSH_CONTROL_PATH', "%(directory)s/ansible-ssh-%%h-%%p-%%r") ANSIBLE_SSH_CONTROL_PATH = get_config(p, 'ssh_connection', 'control_path', 'ANSIBLE_SSH_CONTROL_PATH', "%(directory)s/ansible-ssh-%%h-%%p-%%r")
ANSIBLE_SSH_PIPELINING = get_config(p, 'ssh_connection', 'pipelining', 'ANSIBLE_SSH_PIPELINING', False, boolean=True) ANSIBLE_SSH_PIPELINING = get_config(p, 'ssh_connection', 'pipelining', 'ANSIBLE_SSH_PIPELINING', False, boolean=True)
ANSIBLE_SSH_RETRIES = get_config(p, 'ssh_connection', 'retries', 'ANSIBLE_SSH_RETRIES', 0, integer=True)
PARAMIKO_RECORD_HOST_KEYS = get_config(p, 'paramiko_connection', 'record_host_keys', 'ANSIBLE_PARAMIKO_RECORD_HOST_KEYS', True, boolean=True) PARAMIKO_RECORD_HOST_KEYS = get_config(p, 'paramiko_connection', 'record_host_keys', 'ANSIBLE_PARAMIKO_RECORD_HOST_KEYS', True, boolean=True)
# obsolete -- will be formally removed # obsolete -- will be formally removed
ZEROMQ_PORT = get_config(p, 'fireball_connection', 'zeromq_port', 'ANSIBLE_ZEROMQ_PORT', 5099, integer=True) ZEROMQ_PORT = get_config(p, 'fireball_connection', 'zeromq_port', 'ANSIBLE_ZEROMQ_PORT', 5099, integer=True)
ACCELERATE_PORT = get_config(p, 'accelerate', 'accelerate_port', 'ACCELERATE_PORT', 5099, integer=True) ACCELERATE_PORT = get_config(p, 'accelerate', 'accelerate_port', 'ACCELERATE_PORT', 5099, integer=True)
......
...@@ -16,21 +16,22 @@ ...@@ -16,21 +16,22 @@
# along with Ansible. If not, see <http://www.gnu.org/licenses/>. # along with Ansible. If not, see <http://www.gnu.org/licenses/>.
# #
import fcntl
import gettext
import hmac
import os import os
import re
import subprocess
import shlex
import pipes import pipes
import pty
import pwd
import random import random
import re
import select import select
import fcntl import shlex
import hmac import subprocess
import pwd import time
import gettext
import pty
from hashlib import sha1 from hashlib import sha1
import ansible.constants as C import ansible.constants as C
from ansible.callbacks import vvv from ansible.callbacks import vvv, vv
from ansible import errors from ansible import errors
from ansible import utils from ansible import utils
...@@ -256,7 +257,51 @@ class Connection(object): ...@@ -256,7 +257,51 @@ class Connection(object):
vvv("EXEC previous known host file not found for %s" % host) vvv("EXEC previous known host file not found for %s" % host)
return True return True
def exec_command(self, cmd, tmp_path, become_user=None, sudoable=False, executable='/bin/sh', in_data=None): def exec_command(self, *args, **kwargs):
""" Wrapper around _exec_command to retry in the case of an ssh
failure
Will retry if:
* an exception is caught
* ssh returns 255
Will not retry if
* remaining_tries is <2
* retries limit reached
"""
remaining_tries = C.get_config(
C.p, 'ssh_connection', 'retries',
'ANSIBLE_SSH_RETRIES', 3, integer=True) + 1
cmd_summary = "%s %s..." % (args[0], str(kwargs)[:200])
for attempt in xrange(remaining_tries):
pause = 2 ** attempt - 1
if pause > 30:
pause = 30
time.sleep(pause)
try:
return_tuple = self._exec_command(*args, **kwargs)
except Exception as e:
msg = ("ssh_retry: attempt: %d, caught exception(%s) from cmd "
"(%s).") % (attempt, e, cmd_summary)
vv(msg)
if attempt == remaining_tries - 1:
raise e
else:
continue
# 0 = success
# 1-254 = remote command return code
# 255 = failure from the ssh command itself
if return_tuple[0] != 255:
break
else:
msg = ('ssh_retry: attempt: %d, ssh return code is 255. cmd '
'(%s).') % (attempt, cmd_summary)
vv(msg)
return return_tuple
def _exec_command(self, cmd, tmp_path, become_user=None, sudoable=False, executable='/bin/sh', in_data=None):
''' run a command on the remote host ''' ''' run a command on the remote host '''
if sudoable and self.runner.become and self.runner.become_method not in self.become_methods_supported: if sudoable and self.runner.become and self.runner.become_method not in self.become_methods_supported:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment