Last active
January 24, 2019 21:25
-
-
Save richid/b95582e2ce85fdb456862a97b87b9ba0 to your computer and use it in GitHub Desktop.
Lambda function to gracefully remove an EC2 instance from an ECS cluster based on a Spot Instance interruption notice, all managed by Terraform.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass | |
from typing import Optional | |
import boto3 | |
@dataclass(frozen=True)
class EcsInstanceInfo:
    """Immutable record linking an EC2 instance to its ECS registration."""

    # EC2 instance ID the record was looked up by.
    ec2_id: str
    # ARN of the ECS container instance.
    ecs_instance_arn: str
    # ARN of the ECS cluster the container instance belongs to.
    ecs_cluster_arn: str

    def ecs_cluster_name(self):
        """Return the portion of the cluster ARN after its last '/'.

        If the ARN contains no '/', the whole string is returned.
        """
        _, _, cluster_name = self.ecs_cluster_arn.rpartition('/')
        return cluster_name
# Prefer the named 'local' profile; fall back to boto3's default credential
# resolution when a session cannot be created from that profile.
try:
    session = boto3.Session(profile_name='local')
except Exception:
    # Narrowed from a bare `except:` so that KeyboardInterrupt and SystemExit
    # are no longer swallowed. Presumably the expected failure here is
    # botocore's ProfileNotFound when no 'local' profile exists -- confirm.
    session = boto3.Session()

# Module-level ECS client shared by the functions below.
ecs = session.client('ecs')
def main(event: dict, context: Optional[dict]) -> bool:
    """Handle a Spot Instance interruption event by draining the instance.

    Args:
        event: Event payload; ``event['detail']`` must contain the
            ``instance-id`` and ``instance-action`` keys.
        context: Unused by this function.

    Returns:
        True when the instance was set to DRAINING; False when the action is
        not ``terminate``, the instance is not found in any ECS cluster, or
        the drain request reported failures.
    """
    target_id = event['detail']['instance-id']
    action = event['detail']['instance-action']

    # Only 'terminate' actions (compared case-insensitively) are handled.
    if action.lower() != 'terminate':
        print(f'Unexpected instance action ("{action}"), skipping')
        return False

    print(f'Received spot instance termination notice for instance {target_id}')

    located = get_instance_info(target_id)
    if not located:
        print('Unable to determine ECS cluster, skipping')
        return False

    cluster_name = located.ecs_cluster_name()
    print(f'Instance {located.ec2_id} is part of the {cluster_name} ECS cluster, will drain')
    return drain_ecs_instance(located)
def get_instance_info(ec2_instance_id: str) -> Optional[EcsInstanceInfo]:
    """Find the ECS container instance backed by the given EC2 instance.

    Walks every ECS cluster and its container instances, describing the
    container instances in batches rather than one API call per instance,
    and returns on the first match.

    Args:
        ec2_instance_id: EC2 instance ID to look for.

    Returns:
        An EcsInstanceInfo for the first container instance whose
        ``ec2InstanceId`` equals ``ec2_instance_id``, or None when no
        cluster contains such an instance.
    """
    from itertools import islice

    # DescribeContainerInstances accepts at most 100 container instances
    # per request.
    batch_size = 100

    for cluster_arn in paginate(ecs.list_clusters):
        # iter() guarantees islice consumes the ARNs progressively even if
        # paginate were ever to return a re-iterable sequence.
        instance_arns = iter(paginate(ecs.list_container_instances, cluster=cluster_arn))
        while True:
            batch = list(islice(instance_arns, batch_size))
            if not batch:
                # No (more) container instances in this cluster; never call
                # describe with an empty list.
                break
            described = ecs.describe_container_instances(
                cluster=cluster_arn,
                containerInstances=batch,
            )
            for inst in described.get('containerInstances', []):
                if inst.get('ec2InstanceId') == ec2_instance_id:
                    return EcsInstanceInfo(ec2_instance_id, inst['containerInstanceArn'], cluster_arn)
    return None
def drain_ecs_instance(instance_info: EcsInstanceInfo) -> bool:
    """Request the DRAINING state for one ECS container instance.

    Args:
        instance_info: Cluster ARN, container instance ARN and EC2 ID of
            the instance to drain.

    Returns:
        True when the response's ``failures`` list is empty, False otherwise.
    """
    result = ecs.update_container_instances_state(
        cluster=instance_info.ecs_cluster_arn,
        containerInstances=[instance_info.ecs_instance_arn],
        status='DRAINING',
    )
    failures = result['failures']

    if failures:
        print(f'Error draining {instance_info.ec2_id}: {failures}')
        return False

    print(f'Instance {instance_info.ec2_id} successfully set to DRAINING')
    return True
def paginate(method, **kwargs):
    """Lazily yield every result of a paginated client call.

    Args:
        method: Bound client method; its ``__self__`` supplies the client
            and its ``__name__`` selects the paginator.
        **kwargs: Passed through to the paginator's ``paginate`` call.

    Yields:
        Each item from each iterator returned by ``result_key_iters()``,
        in order.
    """
    owner = method.__self__
    pages = owner.get_paginator(method.__name__).paginate(**kwargs)
    for key_iter in pages.result_key_iters():
        yield from key_iter
if __name__ == '__main__':
    # Local smoke test: feed main() a hand-built Spot interruption event
    # for a hard-coded instance ID.
    instance_id = 'i-0938587f3cc35d73f'
    resource_arn = f'arn:aws:ec2:us-east-2:123456789012:instance/{instance_id}'

    event = {
        "version": "0",
        "id": "12345678-1234-1234-1234-123456789012",
        "detail-type": "EC2 Spot Instance Interruption Warning",
        "source": "aws.ec2",
        "account": "123456789012",
        "time": "yyyy-mm-ddThh:mm:ssZ",
        "region": "us-east-2",
        "resources": [resource_arn],
        "detail": {
            "instance-id": instance_id,
            "instance-action": "terminate",
        },
    }

    main(event, None)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CloudWatch Events rule that matches events from source "aws.ec2" whose
# detail-type is "EC2 Spot Instance Interruption Warning".
resource "aws_cloudwatch_event_rule" "spot_instance_termination_rule" {
  name        = "SpotInstanceTerminationRule"
  description = "Events rule for Spot Instance termination notices"

  event_pattern = <<PATTERN
{
  "detail-type": [
    "EC2 Spot Instance Interruption Warning"
  ],
  "source": [
    "aws.ec2"
  ]
}
PATTERN
}
# Routes events matched by the termination rule to the Lambda function.
resource "aws_cloudwatch_event_target" "spot_instance_termination_rule_target_lambda" {
  target_id = "InvokeLambda"
  rule      = "${aws_cloudwatch_event_rule.spot_instance_termination_rule.name}"
  arn       = "${aws_lambda_function.spot_instance_termination_lambda.arn}"
}
# Second target on the same rule: forwards matched events to the SNS topic.
resource "aws_cloudwatch_event_target" "spot_instance_termination_rule_target_sns" {
  target_id = "SendToSNS"
  rule      = "${aws_cloudwatch_event_rule.spot_instance_termination_rule.name}"
  arn       = "${aws_sns_topic.spot_instance_termination_sns.arn}"
}
# Log group named after the Lambda function (/aws/lambda/<function_name>),
# with log events retained for 60 days.
resource "aws_cloudwatch_log_group" "spot_instance_termination_lambda_log_group" {
  name              = "/aws/lambda/${aws_lambda_function.spot_instance_termination_lambda.function_name}"
  retention_in_days = 60
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Execution role for the termination Lambda. The trust policy lets the
# lambda.amazonaws.com service principal assume it.
resource "aws_iam_role" "spot_instance_termination_lambda_role" {
  name        = "SpotInstanceTerminationLambda"
  description = "Allows the Spot Instance termination Lambda access to EC2 and ECS."

  assume_role_policy = <<POLICY
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "lambda.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
POLICY
}
# Attaches the AWS-managed AWSLambdaBasicExecutionRole policy to the
# Lambda's execution role.
resource "aws_iam_role_policy_attachment" "spot_instance_termination_lambda_basic_attachment" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = "${aws_iam_role.spot_instance_termination_lambda_role.name}"
}
# Inline policy granting the Lambda the ECS permissions its handler uses:
# listing and describing clusters / container instances, and changing a
# container instance's state to DRAINING.
#
# The state-change action is "ecs:UpdateContainerInstancesState", matching
# the handler's update_container_instances_state call. The previous value,
# "ecs:UpdateContainerInstanceStatus", is not an ECS IAM action, so it
# granted nothing for the drain request.
resource "aws_iam_role_policy" "spot_instance_termination_lambda_policy_attach_inline" {
  name = "SpotInstanceTerminationLambda"
  role = "${aws_iam_role.spot_instance_termination_lambda_role.name}"

  policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "ecs:Describe*",
        "ecs:List*",
        "ecs:UpdateContainerInstancesState"
      ],
      "Effect": "Allow",
      "Resource": [
        "*"
      ]
    }
  ]
}
EOF
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Lambda function itself. The handler string resolves to the `main`
# function in _spot_instance_termination_handler.py, which is packaged by
# the archive_file data source referenced below.
#
# NOTE(review): data.aws_kms_key.lambda_kms is referenced here but is not
# declared in the configuration shown; it must be defined elsewhere.
# NOTE(review): confirm "python3.7" is still an accepted Lambda runtime
# before applying; the handler relies on dataclasses (Python >= 3.7).
resource "aws_lambda_function" "spot_instance_termination_lambda" {
  function_name = "SpotInstanceTermination"
  description   = "Handles Spot Instance termination messages by gracefully shutting down the instance."

  role        = "${aws_iam_role.spot_instance_termination_lambda_role.arn}"
  kms_key_arn = "${data.aws_kms_key.lambda_kms.arn}"

  handler     = "_spot_instance_termination_handler.main"
  runtime     = "python3.7"
  timeout     = 30
  memory_size = 128

  filename         = "${data.archive_file.spot_instance_termination_lambda_zip.output_path}"
  source_code_hash = "${data.archive_file.spot_instance_termination_lambda_zip.output_base64sha256}"
}
# Zips the single handler source file into the deployment package that the
# Lambda function resource uploads.
data "archive_file" "spot_instance_termination_lambda_zip" {
  type        = "zip"
  source_file = "_spot_instance_termination_handler.py"
  output_path = "dist/spot_instance_termination_lambda.zip"
}
# Allows CloudWatch Events (events.amazonaws.com) to invoke the Lambda.
#
# source_arn restricts the permission to invocations originating from the
# termination event rule, so it must be the *rule's* ARN. It previously
# referenced the event target's `arn` attribute, which is the ARN of the
# target itself (this Lambda function), so the condition did not identify
# the rule.
resource "aws_lambda_permission" "spot_instance_termination_lambda_allow_cloudwatch" {
  statement_id  = "AllowInvokeFromCloudWatch"
  action        = "lambda:InvokeFunction"
  function_name = "${aws_lambda_function.spot_instance_termination_lambda.function_name}"
  principal     = "events.amazonaws.com"
  source_arn    = "${aws_cloudwatch_event_rule.spot_instance_termination_rule.arn}"
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SNS topic that receives a copy of every matched termination event.
#
# NOTE(review): the topic is encrypted with the AWS-managed key
# "alias/aws/sns". Verify that events.amazonaws.com can actually publish to
# a topic encrypted with that key; if delivery fails, a customer-managed
# key whose key policy grants the service principal access may be needed.
resource "aws_sns_topic" "spot_instance_termination_sns" {
  name              = "SpotInstanceTerminations"
  kms_master_key_id = "alias/aws/sns"
}
# Topic policy allowing the events.amazonaws.com service principal to
# publish (sns:Publish) to the termination topic.
resource "aws_sns_topic_policy" "spot_instance_termination_sns_policy" {
  arn = "${aws_sns_topic.spot_instance_termination_sns.arn}"

  policy = <<POLICY
{
  "Version": "2012-10-17",
  "Id": "snspolicy",
  "Statement": [{
    "Sid": "AllowPublishFromCloudWatchEvents",
    "Effect": "Allow",
    "Principal": {
      "Service": "events.amazonaws.com"
    },
    "Action": "sns:Publish",
    "Resource": "${aws_sns_topic.spot_instance_termination_sns.arn}"
  }]
}
POLICY
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Background
AWS Spot Instances are effectively excess compute capacity that is available to users at large discounts. Users set a bid price for a specific instance type, and as long as there is capacity at that price the instance will stay running. If the market price of the instance type rises above the bid, the instance will be "interrupted" (read: terminated). AWS gives users two minutes' notice to gracefully handle this situation.
Theory of operation
Given that ECS clusters are designed to autoscale and support draining out of the box, handling the dynamic nature of Spot Instances should be straightforward. In order to do this we'll be relying on two AWS services: CloudWatch and Lambda.
CloudWatch
When AWS is going to terminate a Spot Instance, it emits a CloudWatch Event with the necessary details. We then create a CloudWatch Event Rule that listens for these events and invokes a Lambda function when there is a match. The same rule also sends a message to a dedicated SNS Topic. The above configuration does not install any subscribers to this topic, but Terraform makes adding them simple.
Lambda
The Lambda fires and, given the EC2 instance ID, queries a series of ECS API endpoints to determine the ECS cluster and ECS instance ID (EC2 instance ID != ECS instance ID). With this information the ECS instance is then set to the `DRAINING` state and the ECS scheduler will start moving tasks off the instance.