#!/usr/bin/env python -u
import sys
from argparse import ArgumentParser
import time
import json
import yaml
import os
import requests
try:
    import boto.ec2
    import boto.sqs
    import boto.vpc
    from boto.exception import NoAuthHandlerFound, EC2ResponseError
    from boto.sqs.message import RawMessage
    from boto.ec2.blockdevicemapping import BlockDeviceType, BlockDeviceMapping
except ImportError:
    print "boto required for script"
    sys.exit(1)

from pprint import pprint

AMI_TIMEOUT = 2700  # time to wait for AMIs to complete (45 minutes)
EC2_RUN_TIMEOUT = 180  # time to wait for ec2 state transition
EC2_STATUS_TIMEOUT = 300  # time to wait for ec2 system status checks
NUM_TASKS = 5  # number of tasks for time summary report
NUM_PLAYBOOKS = 2
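# NUM_PLAYBOOKS corresponds to the two ansible-playbook invocations in the
# user-data script below ($play.yml and stop_all_edx_services.yml);
# poll_sqs_ansible() stops listening once it has seen that many STATS events.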


class Unbuffered:
    """
    For unbuffered output, not
    needed if PYTHONUNBUFFERED is set
    """
    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        return getattr(self.stream, attr)

sys.stdout = Unbuffered(sys.stdout)


def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--noop', action='store_true',
                        help="don't actually run the cmds",
                        default=False)
    parser.add_argument('--secure-vars-file', required=False,
                        metavar="SECURE_VAR_FILE", default=None,
                        help="path to secure-vars from the root of "
                        "the secure repo. By default <deployment>.yml and "
                        "<environment>-<deployment>.yml will be used if they "
                        "exist in <secure-repo>/ansible/vars/. This secure file "
                        "will be used in addition to these if they exist.")
    parser.add_argument('--stack-name',
                        help="defaults to ENVIRONMENT-DEPLOYMENT",
                        metavar="STACK_NAME",
                        required=False)
    parser.add_argument('-p', '--play',
                        help='play name without the yml extension',
                        metavar="PLAY", required=True)
    parser.add_argument('--playbook-dir',
                        help='directory to find playbooks in',
                        default='configuration/playbooks/edx-east',
                        metavar="PLAYBOOKDIR", required=False)
    parser.add_argument('-d', '--deployment', metavar="DEPLOYMENT",
                        required=True)
    parser.add_argument('-e', '--environment', metavar="ENVIRONMENT",
                        required=True)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="turn on verbosity")
    parser.add_argument('--no-cleanup', action='store_true',
                        help="don't cleanup on failures")
    parser.add_argument('--vars', metavar="EXTRA_VAR_FILE",
                        help="path to extra var file", required=False)
    parser.add_argument('--configuration-version', required=False,
                        help="configuration repo gitref",
                        default="master")
    parser.add_argument('--configuration-secure-version', required=False,
                        help="configuration-secure repo gitref",
                        default="master")
    parser.add_argument('--configuration-secure-repo', required=False,
                        default="git@github.com:edx-ops/prod-secure",
                        help="repo to use for the secure files")
    parser.add_argument('--configuration-internal-version', required=False,
                        help="configuration-internal repo gitref",
                        default="master")
    parser.add_argument('--configuration-internal-repo', required=False,
                        default="",
                        help="repo to use for internal (non-secure) configuration data")
    parser.add_argument('--configuration-private-version', required=False,
                        help="configuration-private repo gitref",
                        default="master")
    parser.add_argument('--configuration-private-repo', required=False,
                        default="",
                        help="repo to use for private playbooks")
    parser.add_argument('-c', '--cache-id', required=True,
                        help="unique id to use as part of cache prefix")
    parser.add_argument('-i', '--identity', required=False,
                        help="path to identity file for pulling "
                             "down configuration-secure",
                        default=None)
    parser.add_argument('-r', '--region', required=False,
                        default="us-east-1",
                        help="aws region")
    parser.add_argument('-k', '--keypair', required=False,
                        default="deployment",
                        help="AWS keypair to use for instance")
    parser.add_argument('-t', '--instance-type', required=False,
                        default="m1.large",
                        help="instance type to launch")
    parser.add_argument("--role-name", required=False,
                        default="abbey",
                        help="IAM role name to use (must exist)")
    parser.add_argument("--msg-delay", required=False,
                        default=5,
                        help="How long to delay message display from sqs "
                             "to ensure ordering")
    parser.add_argument("--hipchat-room-id", required=False,
                        default=None,
                        help="The API ID of the Hipchat room to post "
                             "status messages to")
    parser.add_argument("--ansible-hipchat-room-id", required=False,
                        default='Hammer',
                        help="The room used by the abbey instance for "
                             "printing verbose ansible run data.")
    parser.add_argument("--hipchat-api-token", required=False,
                        default=None,
                        help="The API token for Hipchat integration")
    parser.add_argument("--callback-url", required=False,
                        default=None,
                        help="The callback URL to send notifications to")
    parser.add_argument("--root-vol-size", required=False,
                        default=50,
                        help="The size of the root volume to use for the "
                             "abbey instance.")
    parser.add_argument("--datadog-api-key", required=False,
                        default="",
                        help="The datadog api key used for capturing task "
                             "and playbook metrics on the abbey instance.")

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-b', '--base-ami', required=False,
                       help="ami to use as a base ami",
                       default="ami-cd0f5cb6")
    group.add_argument('--blessed', action='store_true',
                       help="Look up blessed ami for env-dep-play.",
                       default=False)

    return parser.parse_args()
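
# Example invocation (illustrative only; the play, environment, deployment and
# cache-id values below are placeholders, not real deployments):
#   python abbey.py -p edxapp -e stage -d edx -c build-0001 --noop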

def get_instance_sec_group(vpc_id):

    grp_details = ec2.get_all_security_groups(
        filters={
            'vpc_id': vpc_id,
            'tag:play': args.play
        }
    )

    if len(grp_details) < 1:
        #
        # try scheme for non-cloudformation builds
        #

        grp_details = ec2.get_all_security_groups(
            filters={
                'tag:play': args.play,
                'tag:environment': args.environment,
                'tag:deployment': args.deployment}
        )

    if len(grp_details) < 1:
        sys.stderr.write("ERROR: Expected at least one security group, got {}\n".format(
            len(grp_details)))

    return grp_details[0].id


def get_blessed_ami():
    images = ec2.get_all_images(
        filters={
            'tag:environment': args.environment,
            'tag:deployment': args.deployment,
            'tag:play': args.play,
            'tag:blessed': True
        }
    )

    if len(images) != 1:
        raise Exception("ERROR: Expected only one blessed ami, got {}\n".format(
            len(images)))

    return images[0].id


def create_instance_args():
    """
    Looks up security group, subnet
    and returns arguments to pass into
    ec2.run_instances() including
    user data
    """

    vpc = boto.vpc.connect_to_region(args.region)
    subnet = vpc.get_all_subnets(
        filters={
            'tag:aws:cloudformation:stack-name': stack_name,
            'tag:play': args.play}
    )

    if len(subnet) < 1:
        #
        # try scheme for non-cloudformation builds
        #

        subnet = vpc.get_all_subnets(
            filters={
                'tag:play': args.play,
                'tag:environment': args.environment,
                'tag:deployment': args.deployment}
        )

    if len(subnet) < 1:
        sys.stderr.write("ERROR: Expected at least one subnet, got {} for {}-{}-{}\n".format(
            len(subnet), args.environment, args.deployment, args.play))
        sys.exit(1)
    subnet_id = subnet[0].id
    vpc_id = subnet[0].vpc_id

    security_group_id = get_instance_sec_group(vpc_id)

    if args.identity:
        config_secure = 'true'
        with open(args.identity) as f:
            identity_contents = f.read()
    else:
        config_secure = 'false'
        identity_contents = "dummy"
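
    # NOTE: user_data below is a bash template rendered with str.format(),
    # which is presumably why it sticks to plain $var expansions; literal
    # ${...} braces would collide with the format placeholders.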

    user_data = """#!/bin/bash
set -x
set -e
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
base_dir="/var/tmp/edx-cfg"
extra_vars="$base_dir/extra-vars-$$.yml"
secure_identity="$base_dir/secure-identity"
git_ssh="$base_dir/git_ssh.sh"
configuration_version="{configuration_version}"
configuration_secure_version="{configuration_secure_version}"
configuration_private_version="{configuration_private_version}"
configuration_internal_version="{configuration_internal_version}"
environment="{environment}"
deployment="{deployment}"
play="{play}"
cluster="{play}"
config_secure={config_secure}
git_repo_name="configuration"
git_repo="https://github.com/edx/$git_repo_name"
git_repo_secure="{configuration_secure_repo}"
git_repo_secure_name=$(basename $git_repo_secure .git)
git_repo_private="{configuration_private_repo}"
git_repo_private_name=$(basename $git_repo_private .git)
git_repo_internal="{configuration_internal_repo}"
git_repo_internal_name=$(basename $git_repo_internal .git)
secure_vars_file={secure_vars_file}
environment_deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{environment}-{deployment}.yml"
deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{deployment}.yml"
environment_deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{environment}-{deployment}.yml"
deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{deployment}.yml"
instance_id=\\
$(curl http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null)
instance_ip=\\
$(curl http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null)
instance_type=\\
$(curl http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null)
playbook_dir="$base_dir/{playbook_dir}"

if $config_secure; then
    git_cmd="env GIT_SSH=$git_ssh git"
else
    git_cmd="git"
fi
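# When config_secure is true, git_cmd routes ssh through the $git_ssh wrapper
# (written further below) so the secure and private repos can be cloned with
# the supplied identity file.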

ANSIBLE_ENABLE_SQS=true
SQS_NAME={queue_name}
SQS_REGION={region}
SQS_MSG_PREFIX="[ $instance_id $instance_ip $environment-$deployment $play ]"
PYTHONUNBUFFERED=1
HIPCHAT_TOKEN={hipchat_token}
HIPCHAT_ROOM={hipchat_room}
HIPCHAT_MSG_PREFIX="$environment-$deployment-$play: "
HIPCHAT_FROM="ansible-$instance_id"
HIPCHAT_MSG_COLOR=$(echo -e "yellow\\ngreen\\npurple\\ngray" | shuf | head -1)
DATADOG_API_KEY={datadog_api_key}
# environment for ansible
export ANSIBLE_ENABLE_SQS SQS_NAME SQS_REGION SQS_MSG_PREFIX PYTHONUNBUFFERED
export HIPCHAT_TOKEN HIPCHAT_ROOM HIPCHAT_MSG_PREFIX HIPCHAT_FROM
export HIPCHAT_MSG_COLOR DATADOG_API_KEY


#################################### Lifted from ansible-bootstrap.sh
if [[ -z "$ANSIBLE_REPO" ]]; then
  ANSIBLE_REPO="https://github.com/edx/ansible.git"
fi

if [[ -z "$ANSIBLE_VERSION" ]]; then
  ANSIBLE_VERSION="master"
fi

if [[ -z "$CONFIGURATION_REPO" ]]; then
  CONFIGURATION_REPO="https://github.com/edx/configuration.git"
fi

if [[ -z "$CONFIGURATION_VERSION" ]]; then
  CONFIGURATION_VERSION="master"
fi

if [[ -z "$UPGRADE_OS" ]]; then
  UPGRADE_OS=false
fi

#
# Bootstrapping constants
#
VIRTUAL_ENV_VERSION="15.0.2"
PIP_VERSION="8.1.2"
SETUPTOOLS_VERSION="24.0.3"
EDX_PPA_KEY_SERVER="keyserver.ubuntu.com"
EDX_PPA_KEY_ID="B41E5E3969464050"

cat << EOF
******************************************************************************

Running the abbey with the following arguments:

ANSIBLE_REPO="$ANSIBLE_REPO"
ANSIBLE_VERSION="$ANSIBLE_VERSION"
CONFIGURATION_REPO="$CONFIGURATION_REPO"
CONFIGURATION_VERSION="$CONFIGURATION_VERSION"

******************************************************************************
EOF


if [[ $(id -u) -ne 0 ]] ;then
    echo "Please run as root";
    exit 1;
fi

if grep -q 'Trusty Tahr' /etc/os-release
then
    SHORT_DIST="trusty"
elif grep -q 'Xenial Xerus' /etc/os-release
then
    SHORT_DIST="xenial"
else
    cat << EOF

    This script is only known to work on Ubuntu Trusty and Xenial,
    exiting.  If you are interested in helping make installation possible
    on other platforms, let us know.

EOF
   exit 1;
fi

EDX_PPA="deb http://ppa.edx.org $SHORT_DIST main"

# Upgrade the OS
apt-get update -y
apt-key update -y

if [ "$UPGRADE_OS" = true ]; then
    echo "Upgrading the OS..."
    apt-get upgrade -y
fi

# Required for add-apt-repository
apt-get install -y software-properties-common python-software-properties

# Add git PPA
add-apt-repository -y ppa:git-core/ppa

# For older distributions we need to install a PPA for Python 2.7.10
if [[ "trusty" = "$SHORT_DIST" ]]; then

    # Add python PPA
    apt-key adv --keyserver "$EDX_PPA_KEY_SERVER" --recv-keys "$EDX_PPA_KEY_ID"
    add-apt-repository -y "$EDX_PPA"
fi

# Install python 2.7 latest, git and other common requirements
# NOTE: This will install the latest version of python 2.7 and
# which may differ from what is pinned in virtualenvironments
apt-get update -y

apt-get install -y python2.7 python2.7-dev python-pip python-apt python-yaml python-jinja2 build-essential sudo git-core libmysqlclient-dev libffi-dev libssl-dev

# Workaround for a 16.04 bug, need to upgrade to latest and then
# potentially downgrade to the preferred version.
# https://github.com/pypa/pip/issues/3862
if [[ "xenial" = "$SHORT_DIST" ]]; then
    pip install --upgrade pip
    pip install --upgrade pip=="$PIP_VERSION"
else
    pip install --upgrade pip=="$PIP_VERSION"
fi

# pip moves to /usr/local/bin when upgraded
hash -r   #pip may have moved from /usr/bin/ to /usr/local/bin/. This clears bash's path cache.
PATH=/usr/local/bin:$PATH
pip install setuptools=="$SETUPTOOLS_VERSION"
pip install virtualenv=="$VIRTUAL_ENV_VERSION"


##################### END Lifted from ansible-bootstrap.sh


# python3 is required for certain other things
# (currently xqwatcher so it can run python2 and 3 grader code,
# but potentially more in the future).
/usr/bin/apt-get install -y python3-pip python3-dev

# this is missing on 14.04 (base package on 12.04)
# we need to do this on any build, since the above apt-get
# only runs on a build from scratch
/usr/bin/apt-get install -y python-httplib2 --force-yes

rm -rf $base_dir
mkdir -p $base_dir
cd $base_dir

cat << EOF > $git_ssh
#!/bin/sh
exec /usr/bin/ssh -o StrictHostKeyChecking=no -i "$secure_identity" "\$@"
EOF

chmod 755 $git_ssh

if $config_secure; then
    cat << EOF > $secure_identity
{identity_contents}
EOF
fi

cat << EOF >> $extra_vars
---
# extra vars passed into
# abbey.py including versions
# of all the repositories
{extra_vars_yml}

# abbey will always run fake migrations
# this is so that the application can come
# up healthy
fake_migrations: true

disable_edx_services: true
COMMON_TAG_EC2_INSTANCE: true

# abbey should never take instances in
# and out of elbs
elb_pre_post: false
EOF

chmod 400 $secure_identity

$git_cmd clone $git_repo $git_repo_name
cd $git_repo_name
$git_cmd checkout $configuration_version
cd $base_dir

if $config_secure; then
    $git_cmd clone $git_repo_secure $git_repo_secure_name
    cd $git_repo_secure_name
    $git_cmd checkout $configuration_secure_version
    cd $base_dir
fi

if [[ ! -z $git_repo_private ]]; then
    $git_cmd clone $git_repo_private $git_repo_private_name
    cd $git_repo_private_name
    $git_cmd checkout $configuration_private_version
    cd $base_dir
fi

if [[ ! -z $git_repo_internal ]]; then
    $git_cmd clone $git_repo_internal $git_repo_internal_name
    cd $git_repo_internal_name
    $git_cmd checkout $configuration_internal_version
    cd $base_dir
fi


cd $base_dir/$git_repo_name
sudo pip install -r pre-requirements.txt
sudo pip install -r requirements.txt

cd $playbook_dir

if [[ -r "$deployment_internal_vars" ]]; then
    extra_args_opts+=" -e@$deployment_internal_vars"
fi

if [[ -r "$environment_deployment_internal_vars" ]]; then
    extra_args_opts+=" -e@$environment_deployment_internal_vars"
fi

if [[ -r "$deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$deployment_secure_vars"
fi

if [[ -r "$environment_deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$environment_deployment_secure_vars"
fi

if $secure_vars_file; then
    extra_args_opts+=" -e@$secure_vars_file"
fi

extra_args_opts+=" -e@$extra_vars"
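
# With multiple -e@ files, ansible gives later files higher precedence, so the
# run-specific extra vars assembled above override the deployment and
# environment defaults loaded first.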

ansible-playbook -vvvv -c local -i "localhost," $play.yml $extra_args_opts
ansible-playbook -vvvv -c local -i "localhost," stop_all_edx_services.yml $extra_args_opts

rm -rf $base_dir

    """.format(
                hipchat_token=args.hipchat_api_token,
                hipchat_room=args.ansible_hipchat_room_id,
                configuration_version=args.configuration_version,
                configuration_secure_version=args.configuration_secure_version,
                configuration_secure_repo=args.configuration_secure_repo,
                configuration_private_version=args.configuration_private_version,
                configuration_private_repo=args.configuration_private_repo,
                configuration_internal_version=args.configuration_internal_version,
                configuration_internal_repo=args.configuration_internal_repo,
                environment=args.environment,
                deployment=args.deployment,
                play=args.play,
                playbook_dir=args.playbook_dir,
                config_secure=config_secure,
                identity_contents=identity_contents,
                queue_name=run_id,
                extra_vars_yml=extra_vars_yml,
                secure_vars_file=secure_vars_file,
                cache_id=args.cache_id,
                datadog_api_key=args.datadog_api_key,
                region=args.region)

    mapping = BlockDeviceMapping()
    root_vol = BlockDeviceType(size=args.root_vol_size,
                               volume_type='gp2')
    mapping['/dev/sda1'] = root_vol

    ec2_args = {
        'security_group_ids': [security_group_id],
        'subnet_id': subnet_id,
        'key_name': args.keypair,
        'image_id': base_ami,
        'instance_type': args.instance_type,
        'instance_profile_name': args.role_name,
        'user_data': user_data,
        'block_device_map': mapping,
    }

    return ec2_args


def poll_sqs_ansible():
    """
    Prints events to the console and
    blocks until a final STATS ansible
    event is read off of SQS.

    SQS does not guarantee FIFO, for that
    reason there is a buffer that will delay
    messages before they are printed to the
    console.

    Returns length of the ansible run.
    """
    oldest_msg_ts = 0
    buf = []
    task_report = []  # list of tasks for reporting
    last_task = None
    completed = 0
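    # Sketch of the message bodies this loop expects (illustrative only, based
    # on the keys read below): each body is a JSON object such as
    #   {"TS": 42.7, "PREFIX": "[ i-0abc ... ]", "TASK": "install packages"}
    # carrying one of START / TASK / OK / FAILURE / STATS per message.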
    while True:
        messages = []
        while True:
            # get all available messages on the queue
            msgs = sqs_queue.get_messages(attributes='All')
            if not msgs:
                break
            messages.extend(msgs)

        for message in messages:
            recv_ts = float(
                message.attributes['ApproximateFirstReceiveTimestamp']) * .001
            sent_ts = float(message.attributes['SentTimestamp']) * .001
            try:
                msg_info = {
                    'msg': json.loads(message.get_body()),
                    'sent_ts': sent_ts,
                    'recv_ts': recv_ts,
                }
                buf.append(msg_info)
            except ValueError as e:
                print "!!! ERROR !!! unable to parse queue message, " \
                      "expecting valid json: {} : {}".format(
                          message.get_body(), e)
            if not oldest_msg_ts or recv_ts < oldest_msg_ts:
                oldest_msg_ts = recv_ts
            sqs_queue.delete_message(message)

        now = int(time.time())
        if buf:
            try:
                if (now - min([msg['recv_ts'] for msg in buf])) > args.msg_delay:
                    # sort by TS instead of recv_ts
                    # because the sqs timestamp is not as
                    # accurate
                    buf.sort(key=lambda k: k['msg']['TS'])
                    to_disp = buf.pop(0)
                    if 'START' in to_disp['msg']:
                        print '\n{:0>2.0f}:{:0>5.2f} {} : Starting "{}"'.format(
                            to_disp['msg']['TS'] / 60,
                            to_disp['msg']['TS'] % 60,
                            to_disp['msg']['PREFIX'],
                            to_disp['msg']['START']),

                    elif 'TASK' in to_disp['msg']:
                        print "\n{:0>2.0f}:{:0>5.2f} {} : {}".format(
                            to_disp['msg']['TS'] / 60,
                            to_disp['msg']['TS'] % 60,
                            to_disp['msg']['PREFIX'],
                            to_disp['msg']['TASK']),
                        last_task = to_disp['msg']['TASK']
                    elif 'OK' in to_disp['msg']:
                        if args.verbose:
                            print "\n"
                            for key, value in to_disp['msg']['OK'].iteritems():
                                print "    {:<15}{}".format(key, value)
                        else:
                            invocation = to_disp['msg']['OK']['invocation']
                            module = invocation['module_name']
                            # 'set_fact' does not provide a changed value.
                            if module == 'set_fact':
                                changed = "OK"
                            elif to_disp['msg']['OK']['changed']:
                                changed = "*OK*"
                            else:
                                changed = "OK"
                            print " {}".format(changed),
                        task_report.append({
                            'TASK': last_task,
                            'INVOCATION': to_disp['msg']['OK']['invocation'],
                            'DELTA': to_disp['msg']['delta'],
                        })
                    elif 'FAILURE' in to_disp['msg']:
                        print " !!!! FAILURE !!!!",
                        for key, value in to_disp['msg']['FAILURE'].iteritems():
                            print "    {:<15}{}".format(key, value)
                        raise Exception("Failed Ansible run")
                    elif 'STATS' in to_disp['msg']:
                        print "\n{:0>2.0f}:{:0>5.2f} {} : COMPLETE".format(
                            to_disp['msg']['TS'] / 60,
                            to_disp['msg']['TS'] % 60,
                            to_disp['msg']['PREFIX'])

                        # Since NUM_PLAYBOOKS ansible playbooks get run,
                        # we see the COMPLETE message that many times;
                        # wait for the last one before we stop listening
                        # for new messages.
                        completed += 1
                        if completed >= NUM_PLAYBOOKS:
                            return (to_disp['msg']['TS'], task_report)
            except KeyError:
                print "Failed to print status from message: {}".format(to_disp)

        if not messages:
            # wait 1 second between sqs polls
            time.sleep(1)


def create_ami(instance_id, name, description):

    params = {'instance_id': instance_id,
              'name': name,
              'description': description,
              'no_reboot': False}
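    # no_reboot=False lets EC2 stop and reboot the instance before the
    # snapshot, trading a brief outage for a consistent root filesystem image.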

    AWS_API_WAIT_TIME = 1
    image_id = ec2.create_image(**params)
    print("Checking if image is ready.")
    for _ in xrange(AMI_TIMEOUT):
        try:
            img = ec2.get_image(image_id)
            if img.state == 'available':
                print("Tagging image.")
                img.add_tag("environment", args.environment)
                time.sleep(AWS_API_WAIT_TIME)
                img.add_tag("deployment", args.deployment)
                time.sleep(AWS_API_WAIT_TIME)
                img.add_tag("cluster", args.play)
                time.sleep(AWS_API_WAIT_TIME)
                img.add_tag("play", args.play)
                time.sleep(AWS_API_WAIT_TIME)
                conf_tag = "{} {}".format("http://github.com/edx/configuration", args.configuration_version)
                img.add_tag("version:configuration", conf_tag)
                time.sleep(AWS_API_WAIT_TIME)
                conf_secure_tag = "{} {}".format(args.configuration_secure_repo, args.configuration_secure_version)
                img.add_tag("version:configuration_secure", conf_secure_tag)
                time.sleep(AWS_API_WAIT_TIME)
                conf_internal_tag = "{} {}".format(args.configuration_internal_repo, args.configuration_internal_version)
                img.add_tag("version:configuration_internal", conf_internal_tag)
                time.sleep(AWS_API_WAIT_TIME)
                img.add_tag("cache_id", args.cache_id)
                time.sleep(AWS_API_WAIT_TIME)

                # Get versions from the instance.
                tags = ec2.get_all_tags(filters={'resource-id': instance_id})
                for tag in tags:
                    if tag.name.startswith('version:'):
                        img.add_tag(tag.name, tag.value)
                        time.sleep(AWS_API_WAIT_TIME)
                break
            else:
                time.sleep(1)
        except EC2ResponseError as e:
            if e.error_code == 'InvalidAMIID.NotFound':
                time.sleep(1)
            else:
                raise Exception("Unexpected error code: {}".format(
                    e.error_code))
            time.sleep(1)
    else:
        raise Exception("Timeout waiting for AMI to finish")

    return image_id


def launch_and_configure(ec2_args):
    """
    Creates an sqs queue, launches an ec2 instance,
    configures it and creates an AMI. Polls
    SQS for updates
    """

    print "{:<40}".format(
        "Creating SQS queue and launching instance for {}:".format(run_id))
    print
    for k, v in ec2_args.iteritems():
        if k != 'user_data':
            print "    {:<25}{}".format(k, v)
    print

    global sqs_queue
    global instance_id
    sqs_queue = sqs.create_queue(run_id)
    sqs_queue.set_message_class(RawMessage)
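    # RawMessage skips boto's default base64 encoding/decoding; the bodies are
    # presumably plain JSON from the ansible SQS callback, since they are fed
    # straight to json.loads() in poll_sqs_ansible().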
    res = ec2.run_instances(**ec2_args)
    inst = res.instances[0]
    instance_id = inst.id

    print "{:<40}".format(
        "Waiting for instance {} to reach running status:".format(instance_id)),
    status_start = time.time()
    for _ in xrange(EC2_RUN_TIMEOUT):
        try:
            res = ec2.get_all_instances(instance_ids=[instance_id])
        except EC2ResponseError as e:
            if e.code == "InvalidInstanceID.NotFound":
                print("Instance not found ({}), will try again.".format(
                    instance_id))
                time.sleep(1)
                continue
            else:
                raise(e)
        if res[0].instances[0].state == 'running':
            status_delta = time.time() - status_start
            run_summary.append(('EC2 Launch', status_delta))
            print "[ OK ] {:0>2.0f}:{:0>2.0f}".format(
                status_delta / 60,
                status_delta % 60)
            break
        else:
            time.sleep(1)
    else:
        raise Exception("Timeout waiting for running status: {} ".format(
            instance_id))

    print "{:<40}".format("Waiting for system status:"),
    system_start = time.time()
    for _ in xrange(EC2_STATUS_TIMEOUT):
        status = ec2.get_all_instance_status(inst.id)
        if status and status[0].system_status.status == u'ok':
            system_delta = time.time() - system_start
            run_summary.append(('EC2 Status Checks', system_delta))
            print "[ OK ] {:0>2.0f}:{:0>2.0f}".format(
                system_delta / 60,
                system_delta % 60)
            break
        else:
            time.sleep(1)
    else:
        raise Exception("Timeout waiting for status checks: {} ".format(
            instance_id))

    print
    print "{:<40}".format(
        "Waiting for user-data, polling sqs for Ansible events:")

    (ansible_delta, task_report) = poll_sqs_ansible()
    run_summary.append(('Ansible run', ansible_delta))
    print
    print "{} longest Ansible tasks (seconds):".format(NUM_TASKS)
    for task in sorted(
            task_report, reverse=True,
            key=lambda k: k['DELTA'])[:NUM_TASKS]:
        print "{:0>3.0f} {}".format(task['DELTA'], task['TASK'])
        print "  - {}".format(task['INVOCATION'])
    print

    print "{:<40}".format("Creating AMI:"),
    ami_start = time.time()
    ami = create_ami(instance_id, run_id, run_id)
    ami_delta = time.time() - ami_start
    print "[ OK ] {:0>2.0f}:{:0>2.0f}".format(
        ami_delta / 60,
        ami_delta % 60)
    run_summary.append(('AMI Build', ami_delta))
    total_time = time.time() - start_time
    all_stages = sum(run[1] for run in run_summary)
    if total_time - all_stages > 0:
        run_summary.append(('Other', total_time - all_stages))
    run_summary.append(('Total', total_time))

    return run_summary, ami


def send_hipchat_message(message):
    print(message)
    if args.callback_url:
        requests.get("{}/{}".format(args.callback_url, message))
    else:
        # If hipchat is configured, send the details to the specified room
        if args.hipchat_api_token and args.hipchat_room_id:
            import hipchat
            try:
                hipchat = hipchat.HipChat(token=args.hipchat_api_token)
                hipchat.message_room(args.hipchat_room_id, 'AbbeyNormal',
                                     message)
            except Exception as e:
                print("Hipchat messaging resulted in an error: %s." % e)

if __name__ == '__main__':

    args = parse_args()

    run_summary = []

    start_time = time.time()

    if args.vars:
        with open(args.vars) as f:
            extra_vars_yml = f.read()
            extra_vars = yaml.load(extra_vars_yml)
    else:
        extra_vars_yml = ""
        extra_vars = {}

    if args.secure_vars_file:
        # explicit path to a single
        # secure var file
        secure_vars_file = args.secure_vars_file
    else:
        secure_vars_file = 'false'

    if args.stack_name:
        stack_name = args.stack_name
    else:
        stack_name = "{}-{}".format(args.environment, args.deployment)

    try:
        ec2 = boto.ec2.connect_to_region(args.region)
    except NoAuthHandlerFound:
        print 'Unable to connect to ec2 in region: {}'.format(args.region)
        sys.exit(1)

    try:
        sqs = boto.sqs.connect_to_region(args.region)
    except NoAuthHandlerFound:
        print 'Unable to connect to sqs in region: {}'.format(args.region)
        sys.exit(1)

    if args.blessed:
        base_ami = get_blessed_ami()
    else:
        base_ami = args.base_ami

    error_in_abbey_run = False
    try:
        sqs_queue = None
        instance_id = None

        run_id = "{}-abbey-{}-{}-{}".format(
            int(time.time() * 100), args.environment, args.deployment, args.play)

        ec2_args = create_instance_args()

        if args.noop:
            print "Would have created sqs_queue with id: {}\nec2_args:".format(
                run_id)
            pprint(ec2_args)
            ami = "ami-00000"
        else:
            run_summary, ami = launch_and_configure(ec2_args)
            print
            print "Summary:\n"

            for run in run_summary:
                print "{:<30} {:0>2.0f}:{:0>5.2f}".format(
                    run[0], run[1] / 60, run[1] % 60)
            print "AMI: {}".format(ami)

            message = 'Finished baking AMI {image_id} for {environment} {deployment} {play}.'.format(
                image_id=ami,
                environment=args.environment,
                deployment=args.deployment,
                play=args.play)

            send_hipchat_message(message)
    except Exception as e:
        message = 'An error occurred building AMI for {environment} ' \
            '{deployment} {play}.  The Exception was {exception}'.format(
                environment=args.environment,
                deployment=args.deployment,
                play=args.play,
                exception=repr(e))
        send_hipchat_message(message)
        error_in_abbey_run = True
    finally:
        print
        if not args.no_cleanup and not args.noop:
            if sqs_queue:
                print "Cleaning up - Removing SQS queue - {}".format(run_id)
                sqs.delete_queue(sqs_queue)
            if instance_id:
                print "Cleaning up - Terminating instance ID - {}".format(
                    instance_id)
                ec2.terminate_instances(instance_ids=[instance_id])
        if error_in_abbey_run:
            exit(1)