Merge pull request #1404 from edx/feanil/warn_only_nat_monitor

Make NAT monitor only warn about NAT failures.

Merge pull request #1404 from edx/feanil/warn_only_nat_monitor
Make NAT monitor only warn about NAT failures.
801fe570 · Feanil Patel · df50f173 · b51085e0 · 801fe570
Commit 801fe570 authored Aug 13, 2014 by Feanil Patel
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 43 deletions

playbooks/roles/jenkins_admin/files/nat-monitor.sh
+4 -43

No files found.
--- a/playbooks/roles/jenkins_admin/files/nat-monitor.sh
+++ b/playbooks/roles/jenkins_admin/files/nat-monitor.sh
@@ -5,7 +5,7 @@ set -e
 # Health Check variables
 Num_Pings=3
-Ping_Timeout=1
+Ping_Timeout=2
 Wait_Between_Pings=2
 Wait_for_Instance_Stop=60
 Wait_for_Instance_Start=300
@@ -77,48 +77,9 @@ while [ . ]; do
    if [ "$backup_pingresult" == "0" ]; then
      send_message "Error monitoring NATs for $VPC_NAME."  "ERROR -- Both NATs($PRIMARY_NAT_ID and $BACKUP_NAT_ID) were unreachable."
    else #Backup nat is healthy.
-      # Set HEALTHY variables to unhealthy (0)
+      send_message "Primary $VPC_NAME NAT failed ping" "-- NAT($PRIMARY_NAT_ID) heartbeat failed, consider using $BACKUP_NAT_ID for $NAT_RT_ID default route
-      ROUTE_HEALTHY=0
+Command for re-routing:
-      NAT_HEALTHY=0
+aws ec2 replace-route --route-table-id $NAT_RT_ID --destination-cidr-block 0.0.0.0/0 --instance-id $BACKUP_NAT_ID"
-      STOPPING_NAT=0
-      while [ "$NAT_HEALTHY" == "0" ]; do
-      # Primary NAT instance is unhealthy, loop while we try to fix it
-        if [ "$ROUTE_HEALTHY" == "0" ]; then
-          aws ec2 replace-route --route-table-id $NAT_RT_ID --destination-cidr-block 0.0.0.0/0 --instance-id $BACKUP_NAT_ID
-          send_message " Primary $VPC_NAME NAT failed" "-- NAT($PRIMARY_NAT_ID) heartbeat failed, using $BACKUP_NAT_ID for $NAT_RT_ID default route"
-          ROUTE_HEALTHY=1
-        fi
-        # Check NAT state to see if we should stop it or start it again
-        NAT_STATE=`aws ec2 describe-instances --instance-ids $PRIMARY_NAT_ID | jq -r ".Reservations[].Instances[].State.Name"`
-        if [ "$NAT_STATE" == "stopped" ]; then
-          echo `date` "-- NAT($PRIMARY_NAT_ID) instance stopped, starting it back up"
-          aws ec2 start-instances --instance-ids $PRIMARY_NAT_ID
-          sleep $Wait_for_Instance_Start
-        else
-          if [ "$STOPPING_NAT" == "0" ]; then
-            echo `date` "-- NAT($PRIMARY_NAT_ID) instance $NAT_STATE, attempting to stop for reboot"
-            aws ec2 stop-instances --instance-ids $PRIMARY_NAT_ID
-            STOPPING_NAT=1
-          fi
-          sleep $Wait_for_Instance_Stop
-        fi
-        unhealthy_nat_pingresult=`ping -c $Num_Pings -W $Ping_Timeout $PRIMARY_NAT_IP| grep time= | wc -l`
-        if [ "$unhealthy_nat_pingresult" == "$Num_Pings" ]; then
-          NAT_HEALTHY=1
-        fi
-      done
-      # Backup nat was healthy so we switched to it.  It is now the primary.
-      if [ "$ROUTE_HEALTHY" == "1" ]; then
-        TEMP_NAT_ID=$PRIMARY_NAT_ID
-        TEMP_NAT_IP=$PRIMARY_NAT_IP
-        PRIMARY_NAT_ID=$BACKUP_NAT_ID
-        PRIMARY_NAT_IP=$BACKUP_NAT_IP
-        BACKUP_NAT_ID=$TEMP_NAT_ID
-        BACKUP_NAT_IP=$TEMP_NAT_IP
-      fi
    fi
  else
    echo `date` "-- PRIMARY NAT ($PRIMARY_NAT_ID $PRIMARY_NAT_IP) reports healthy to pings"