Commit d8f755c6 by Feanil Patel

Add a NAT monitoring role to jenkins_admin.

parent 5a9076ee
@@ -56,7 +56,7 @@ aws_debian_pkgs:
aws_pip_pkgs:
- https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz
- awscli
- boto==2.29.1
- boto==2.32.0
aws_redhat_pkgs: []
aws_s3cmd_version: s3cmd-1.5.0-beta1
@@ -143,3 +143,18 @@ jenkins_admin_plugins:
jenkins_admin_jobs:
- 'backup-jenkins'
# Supervisor related settings
jenkins_supervisor_user: "{{ jenkins_user }}"
jenkins_supervisor_app_dir: "{{ jenkins_home }}/supervisor"
jenkins_supervisor_cfg_dir: "{{ jenkins_supervisor_app_dir }}/conf.d"
jenkins_supervisor_available_dir: "{{ jenkins_supervisor_app_dir }}/available.d"
jenkins_supervisor_data_dir: "{{ jenkins_home }}/supervisor/data"
jenkins_supervisor_cfg: "{{ jenkins_supervisor_app_dir }}/supervisord.conf"
jenkins_supervisor_log_dir: "{{ COMMON_LOG_DIR }}/supervisor/jenkins"
jenkins_supervisor_venv_dir: "{{ jenkins_home }}/venvs/supervisor"
jenkins_supervisor_venv_bin: "{{ jenkins_supervisor_venv_dir }}/bin"
jenkins_supervisor_ctl: "{{ jenkins_supervisor_venv_bin }}/supervisorctl"
jenkins_supervisor_service_user: "{{ jenkins_user }}"
jenkins_admin_scripts_dir: "{{ jenkins_home }}/scripts"
#!/bin/bash -x
# This script monitors two NAT instances and fails the default route over to
# the backup NAT if the primary becomes unreachable.
set -e
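# The script expects its configuration in the environment (normally set by the
# supervisor program definition rendered from nat-monitor.conf.j2). A rough
# sketch of the variables it relies on, with illustrative values only:
#
#   VPC_NAME=my-vpc                      # CloudFormation stack name to filter on
#   AWS_DEFAULT_REGION=us-east-1         # region used by the awscli calls below
#   AWS_DEFAULT_PROFILE=my-deployment    # awscli profile used for the EC2 calls
#   AWS_MAIL_PROFILE=mail                # awscli profile used for SES
#   NAT_MONITOR_FROM_EMAIL=from@example.com
#   NAT_MONITOR_TO_EMAIL=to@example.com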
# NAT instance variables
PRIMARY_NAT_ID=`aws ec2 describe-route-tables --filters Name=tag:aws:cloudformation:stack-name,Values=$VPC_NAME Name=tag:aws:cloudformation:logical-id,Values=PrivateRouteTable | jq '.RouteTables[].Routes[].InstanceId|strings' -r`
BACKUP_NAT_ID=`aws ec2 describe-instances --filters Name=tag:aws:cloudformation:stack-name,Values=$VPC_NAME Name=tag:aws:cloudformation:logical-id,Values=NATDevice,BackupNATDevice | jq '.Reservations[].Instances[].InstanceId' -r | grep -v $PRIMARY_NAT_ID`
NAT_RT_ID=`aws ec2 describe-route-tables --filters Name=tag:aws:cloudformation:stack-name,Values=$VPC_NAME Name=tag:aws:cloudformation:logical-id,Values=PrivateRouteTable | jq '.RouteTables[].RouteTableId' -r`
# Health Check variables
Num_Pings=3
Ping_Timeout=1
Wait_Between_Pings=2
Wait_for_Instance_Stop=60
Wait_for_Instance_Start=300
send_message() {
message_file=/var/tmp/message-$$.json
message_string=$1
if [ -z "$message_string" ]; then
message_string="Unknown error for $VPC_NAME NAT monitor"
fi
message_body=$2
cat << EOF > $message_file
{"Subject":{"Data":"$message_string"},"Body":{"Text":{"Data": "$message_body"}}}
EOF
echo `date` "-- $message_body"
BASE_PROFILE=$AWS_DEFAULT_PROFILE
export AWS_DEFAULT_PROFILE=$AWS_MAIL_PROFILE
aws ses send-email --from $NAT_MONITOR_FROM_EMAIL --to $NAT_MONITOR_TO_EMAIL --message file://$message_file
export AWS_DEFAULT_PROFILE=$BASE_PROFILE
}
trap send_message ERR SIGHUP SIGINT SIGTERM
# Determine both NAT instances' private IPs so we can ping the primary NAT, take over
# its route, and restart it. Requires EC2 DescribeInstances, ReplaceRoute, and
# Start/StopInstances permissions. The following example EC2 role policy will authorize these commands:
# {
# "Statement": [
# {
# "Action": [
# "ec2:DescribeInstances",
# "ec2:CreateRoute",
# "ec2:ReplaceRoute",
# "ec2:StartInstances",
# "ec2:StopInstances"
# ],
# "Effect": "Allow",
# "Resource": "*"
# }
# ]
# }
# Get the primary NAT instance's IP
PRIMARY_NAT_IP=`aws ec2 describe-instances --instance-ids $PRIMARY_NAT_ID | jq -r ".Reservations[].Instances[].PrivateIpAddress"`
BACKUP_NAT_IP=`aws ec2 describe-instances --instance-ids $BACKUP_NAT_ID | jq -r ".Reservations[].Instances[].PrivateIpAddress"`
echo `date` "-- Running NAT monitor"
while true; do
# Check the health of both instances.
primary_pingresult=`ping -c $Num_Pings -W $Ping_Timeout $PRIMARY_NAT_IP| grep time= | wc -l`
if [ "$primary_pingresult" == "0" ]; then
backup_pingresult=`ping -c $Num_Pings -W $Ping_Timeout $BACKUP_NAT_IP| grep time= | wc -l`
if [ "$backup_pingresult" == "0" ]; then
send_message "Error monitoring NATs for $VPC_NAME." "ERROR -- Both NATs($PRIMARY_NAT_ID and $BACKUP_NAT_ID) were unreachable."
else # Backup NAT is healthy.
# Set HEALTHY variables to unhealthy (0)
ROUTE_HEALTHY=0
NAT_HEALTHY=0
STOPPING_NAT=0
while [ "$NAT_HEALTHY" == "0" ]; do
# Primary NAT instance is unhealthy, loop while we try to fix it
if [ "$ROUTE_HEALTHY" == "0" ]; then
aws ec2 replace-route --route-table-id $NAT_RT_ID --destination-cidr-block 0.0.0.0/0 --instance-id $BACKUP_NAT_ID
send_message "Primary $VPC_NAME NAT failed" "-- NAT($PRIMARY_NAT_ID) heartbeat failed, using $BACKUP_NAT_ID for $NAT_RT_ID default route"
ROUTE_HEALTHY=1
fi
# Check NAT state to see if we should stop it or start it again
NAT_STATE=`aws ec2 describe-instances --instance-ids $PRIMARY_NAT_ID | jq -r ".Reservations[].Instances[].State.Name"`
if [ "$NAT_STATE" == "stopped" ]; then
echo `date` "-- NAT($PRIMARY_NAT_ID) instance stopped, starting it back up"
aws ec2 start-instances --instance-ids $PRIMARY_NAT_ID
sleep $Wait_for_Instance_Start
else
if [ "$STOPPING_NAT" == "0" ]; then
echo `date` "-- NAT($PRIMARY_NAT_ID) instance $NAT_STATE, attempting to stop for reboot"
aws ec2 stop-instances --instance-ids $PRIMARY_NAT_ID
STOPPING_NAT=1
fi
sleep $Wait_for_Instance_Stop
fi
unhealthy_nat_pingresult=`ping -c $Num_Pings -W $Ping_Timeout $PRIMARY_NAT_IP| grep time= | wc -l`
if [ "$unhealthy_nat_pingresult" == "$Num_Pings" ]; then
NAT_HEALTHY=1
fi
done
# The backup NAT was healthy, so we switched to it. It is now the primary.
if [ "$ROUTE_HEALTHY" == "1" ]; then
TEMP_NAT_ID=$PRIMARY_NAT_ID
TEMP_NAT_IP=$PRIMARY_NAT_IP
PRIMARY_NAT_ID=$BACKUP_NAT_ID
PRIMARY_NAT_IP=$BACKUP_NAT_IP
BACKUP_NAT_ID=$TEMP_NAT_ID
BACKUP_NAT_IP=$TEMP_NAT_IP
fi
fi
else
echo `date` "-- PRIMARY NAT ($PRIMARY_NAT_ID $PRIMARY_NAT_IP) reports healthy to pings"
sleep $Wait_Between_Pings
fi
done
---
#
# edX Configuration
#
# github: https://github.com/edx/configuration
# wiki: https://github.com/edx/configuration/wiki
# code style: https://github.com/edx/configuration/wiki/Ansible-Coding-Conventions
# license: https://github.com/edx/configuration/blob/master/LICENSE.TXT
#
#
#
# Handlers for role jenkins_admin
#
# Overview:
# Restarts the supervisor-managed NAT monitor processes when their
# configuration changes.
#
# Have to use the shell module here because Ansible's supervisorctl
# module doesn't support process groups.
- name: restart nat monitor
shell: "{{ jenkins_supervisor_ctl }} -c {{ jenkins_supervisor_cfg }} restart nat_monitor:*"
when: not disable_edx_services
@@ -20,4 +20,15 @@
# }
dependencies:
- common
- aws
- jenkins_master
- role: supervisor
supervisor_app_dir: "{{ jenkins_supervisor_app_dir }}"
supervisor_data_dir: "{{ jenkins_supervisor_data_dir }}"
supervisor_log_dir: "{{ jenkins_supervisor_log_dir }}"
supervisor_venv_dir: "{{ jenkins_supervisor_venv_dir }}"
supervisor_service_user: "{{ jenkins_supervisor_user }}"
supervisor_available_dir: "{{ jenkins_supervisor_available_dir }}"
supervisor_cfg_dir: "{{ jenkins_supervisor_cfg_dir }}"
supervisor_service: "supervisor.jenkins"
supervisor_http_bind_port: '9003'
@@ -37,6 +37,10 @@
apt_repository: repo="{{ item }}" state=present update_cache=yes
with_items: jenkins_admin_debian_repos
- name: create the scripts directory
file: path={{ jenkins_admin_scripts_dir }} state=directory
owner={{ jenkins_user }} group={{ jenkins_group }} mode=755
# We first download the plugins to a temp directory and include
# the version in the file name. That way, if we increment
# the version, the plugin will be updated in Jenkins
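# A rough sketch of what such a download step might look like (the real task is
# outside this hunk; get_url, the /tmp destination, and the item.name /
# item.version / item.url fields are illustrative assumptions only):
#
# - name: download Jenkins plugins to a versioned temp file
#   get_url: >
#     url="{{ item.url }}"
#     dest="/tmp/{{ item.name }}_{{ item.version }}.hpi"
#   with_items: jenkins_admin_plugins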
@@ -71,6 +75,24 @@
owner="{{ jenkins_user }}"
group="{{ jenkins_group }}"
mode="0600"
tags:
- aws-config
- name: create the .aws directory
file: path={{ jenkins_home }}/.aws state=directory
owner={{ jenkins_user }} group={{ jenkins_group }} mode=700
tags:
- aws-config
- name: configure the awscli profiles for jenkins
template: >
src="./{{ jenkins_home }}/aws_config.j2"
dest="{{ jenkins_home }}/.aws/config"
owner="{{ jenkins_user }}"
group="{{ jenkins_group }}"
mode="0600"
tags:
- aws-config
- name: create the ssh directory
file: >
@@ -134,3 +156,5 @@
version={{ item.version }}
user_install=no
with_items: jenkins_admin_gem_pkgs
- include: nat_monitor.yml
---
- fail: msg="NAT_MONITORS is not defined."
when: NAT_MONITORS is not defined
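# NAT_MONITORS is expected to be a list of dicts carrying the keys consumed by
# nat-monitor.conf.j2 (vpc_name, region, deployment). A minimal sketch with
# placeholder values only:
#
# NAT_MONITORS:
#   - vpc_name: "my-vpc"
#     region: "us-east-1"
#     deployment: "edx"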
- name: upload the monitor script
copy:
dest="{{ jenkins_admin_scripts_dir }}/nat-monitor.sh"
src="nat-monitor.sh"
owner="{{ jenkins_user }}"
group="{{ jenkins_group }}"
mode="755"
sudo_user: "{{ jenkins_user }}"
- name: create a supervisor config
template:
src="nat-monitor.conf.j2" dest="{{ jenkins_supervisor_available_dir }}/nat-monitor.conf"
owner="{{ jenkins_user }}"
group="{{ jenkins_group }}"
sudo_user: "{{ jenkins_user }}"
notify: restart nat monitor
- name: enable the supervisor config
file:
src="{{ jenkins_supervisor_available_dir }}/nat-monitor.conf"
dest="{{ jenkins_supervisor_cfg_dir }}/nat-monitor.conf"
state=link
force=yes
mode=0644
sudo_user: "{{ jenkins_user }}"
when: not disable_edx_services
notify: restart nat monitor
- name: update supervisor configuration
shell: "{{ jenkins_supervisor_ctl }} -c {{ jenkins_supervisor_cfg }} update"
register: supervisor_update
changed_when: supervisor_update.stdout is defined and supervisor_update.stdout != ""
when: not disable_edx_services
# Have to use the shell module here because Ansible's supervisorctl
# module doesn't support process groups.
- name: ensure nat monitor is started
shell: "{{ jenkins_supervisor_ctl }} -c {{ jenkins_supervisor_cfg }} start nat_monitor:*"
when: not disable_edx_services
{% for deployment, creds in JENKINS_ADMIN_AWS_CREDENTIALS.iteritems() %}
[profile {{deployment}}]
aws_access_key_id = {{ creds.access_id }}
aws_secret_access_key = {{ creds.secret_key }}
{% endfor %}
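{#
  For reference: with JENKINS_ADMIN_AWS_CREDENTIALS containing a single
  hypothetical "edx" deployment, the loop above renders a config like

  [profile edx]
  aws_access_key_id = AKIA...
  aws_secret_access_key = <secret>

  (values are placeholders, not real credentials).
#}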
{% for m in NAT_MONITORS %}
[program:nat_monitor_{{ m.vpc_name|replace('-','_') }}]
environment=VPC_NAME="{{ m.vpc_name }}",AWS_DEFAULT_REGION="{{ m.region }}",AWS_DEFAULT_PROFILE="{{ m.deployment }}",AWS_MAIL_PROFILE="{{ JENKINS_ADMIN_MAIL_PROFILE }}",NAT_MONITOR_FROM_EMAIL="{{ JENKINS_ADMIN_FROM_EMAIL }}",NAT_MONITOR_TO_EMAIL="{{ JENKINS_ADMIN_TO_EMAIL }}"
user={{ jenkins_supervisor_service_user }}
directory={{ jenkins_admin_scripts_dir }}
stdout_logfile={{ jenkins_supervisor_log_dir }}/%(program_name)s-stdout.log
stderr_logfile={{ jenkins_supervisor_log_dir }}/%(program_name)s-stderr.log
command={{ jenkins_admin_scripts_dir }}/nat-monitor.sh
killasgroup=true
stopasgroup=true
{% endfor %}
[group:nat_monitor]
programs={%- for m in NAT_MONITORS %}nat_monitor_{{ m.vpc_name|replace('-','_') }}{%- if not loop.last %},{%- endif %}{%- endfor %}
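{#
  For reference: with two hypothetical monitors whose vpc_name values are
  "vpc-a" and "vpc-b", the loop above emits [program:nat_monitor_vpc_a] and
  [program:nat_monitor_vpc_b] sections, and the group line renders as

  programs=nat_monitor_vpc_a,nat_monitor_vpc_b
#}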
@@ -4,7 +4,7 @@ jenkins_group: "edx"
jenkins_server_name: "jenkins.testeng.edx.org"
jenkins_port: 8080
jenkins_version: 1.571
jenkins_version: 1.574
jenkins_deb_url: "http://pkg.jenkins-ci.org/debian/binary/jenkins_{{ jenkins_version }}_all.deb"
jenkins_deb: "jenkins_{{ jenkins_version }}_all.deb"
@@ -72,6 +72,15 @@
with_items:
- "{{ supervisor_app_dir }}"
- "{{ supervisor_venv_dir }}"
- name: create service user accessible dirs
file: >
name={{ item }}
state=directory
owner={{ supervisor_user }}
group={{ supervisor_service_user }}
mode="775"
with_items:
- "{{ supervisor_cfg_dir }}"
- "{{ supervisor_available_dir }}"