Last active
August 15, 2022 08:38
-
-
Save f9n/a412f7c9e7fdd852d432bc5626b9388a to your computer and use it in GitHub Desktop.
Generic alarms for Aws Cloudwatch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# In 2019 | |
import os | |
import json | |
import sys | |
import time | |
import copy | |
# Add the local "env" directory to sys.path, then import the required third-party packages
file_path = os.path.dirname(__file__) | |
module_path = os.path.join(file_path, "env") | |
sys.path.append(module_path) | |
import yaml | |
import boto3 | |
from botocore.exceptions import ClientError, BotoCoreError | |
def get_configs():
    """Read ``./config.yml`` and return its parsed YAML content.

    The path is relative to the current working directory. A log line is
    printed every time the file is read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    print("[+] Get config")
    with open("./config.yml") as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed


# Configuration loaded once, when the module is imported.
CONFIGS = get_configs()
class Ec2Instance:
    """Small wrapper around a boto3 ``ec2.Instance`` resource.

    Exposes tag lookups and a display name for the instance.
    """

    def __init__(self, region, instance_id):
        """Bind to one EC2 instance.

        Args:
            region: AWS region name handed to ``boto3.resource``.
            instance_id: Id of the EC2 instance to wrap.
        """
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.instance = self.ec2.Instance(instance_id)

    def _tags(self):
        """Return the instance's tag list, or ``[]`` when there is none."""
        # ``tags`` may be None (boto3 does this for an instance without any
        # tags); iterating None would raise TypeError in the callers below.
        return self.instance.tags or []

    def get_name(self):
        """Return a display name that always contains the instance id.

        The ``Name`` tag is used when present; otherwise the instance id
        itself. If the chosen name does not already contain the instance id,
        ``_<instance id>`` is appended.

        Returns:
            The display name as a string.
        """
        print("[+] Ec2Instance.get_name()")
        instance_id = self.instance.id
        instance_name = self.get_tag_value(key_name="Name")
        print(f"[+] InstanceName: {instance_name}, InstanceId: {instance_id}")
        if instance_name is None:
            instance_name = instance_id
        if instance_id not in instance_name:
            instance_name = f"{instance_name}_{instance_id}"
        print(f"[+] InstanceName: {instance_name}")
        return instance_name

    def get_tag_value(self, key_name="Name"):
        """Return the value of the first tag whose key is ``key_name``.

        Args:
            key_name: Tag key to look up.

        Returns:
            The tag value, or ``None`` when no such tag exists (including
            when the instance has no tags at all).
        """
        for tag in self._tags():
            if tag["Key"] == key_name:
                return tag["Value"]
        return None

    def convert_tags_to_dictionary(self):
        """Return the instance tags as a ``{key: value}`` dictionary.

        Returns:
            A dict built from the tag list; empty when the instance has no
            tags. If a key occurs more than once, the last value wins.
        """
        tags_object = {tag["Key"]: tag["Value"] for tag in self._tags()}
        print(f"[+] Instace Tags: {tags_object}")
        return tags_object
class Alarms:
    """Create or delete the configured CloudWatch alarms of one EC2 instance.

    The alarm definitions come from the ``ec2.alarms`` section of the
    configuration: the entry whose ``select_by_tags`` matches the most
    instance tags wins, and each of its metrics is merged over a built-in
    default and the ``ec2.default`` section.
    """

    def __init__(
        self,
        region,
        account_id,
        instance_id,
        sleep_duration=CONFIGS["globals"]["sleep_duration"],
    ):
        """Store the target instance and create nothing yet.

        Args:
            region: AWS region name used for every boto3 client/resource.
            account_id: AWS account id; only used in ``__str__``.
            instance_id: Id of the EC2 instance the alarms belong to.
            sleep_duration: Seconds ``create()`` waits between creating the
                alarms and enabling their actions. Defaults to
                ``globals.sleep_duration`` from the configuration.
        """
        self.region = region
        self.account_id = account_id
        self.instance_id = instance_id
        self.sleep_duration = sleep_duration
        # boto3 handles are built on first use by the properties below.
        self.__ec2instance = None
        self.__cw_client = None
        self.__cw_resource = None

    @property
    def ec2instance(self):
        """The ``Ec2Instance`` wrapper for this instance (created on first use)."""
        if self.__ec2instance is None:
            self.__ec2instance = Ec2Instance(
                region=self.region, instance_id=self.instance_id
            )
        return self.__ec2instance

    @property
    def cw_client(self):
        """The boto3 CloudWatch client for ``self.region`` (created on first use)."""
        if self.__cw_client is None:
            self.__cw_client = boto3.client("cloudwatch", region_name=self.region)
        return self.__cw_client

    @property
    def cw_resource(self):
        """The boto3 CloudWatch resource for ``self.region`` (created on first use)."""
        if self.__cw_resource is None:
            self.__cw_resource = boto3.resource("cloudwatch", region_name=self.region)
        return self.__cw_resource

    def __create_alarm(self, metric):
        """Create (or overwrite) one alarm from a ``put_metric_alarm`` kwargs dict."""
        print(f"[+] Create {metric['AlarmName']} alarm")
        self.cw_client.put_metric_alarm(**metric)

    def __find_metrics_by_instance(self):
        """Pick the alarm configuration that best matches the instance tags.

        Every entry of ``ec2.alarms`` is scored by the number of its
        ``select_by_tags`` pairs that the instance carries with the same
        value. The first entry with the highest non-zero score wins.

        Returns:
            ``(metrics, matched)``: a deep copy of the winning entry's
            ``metrics`` list (``[]`` if nothing matched) and a bool telling
            whether any entry matched at least one tag.
        """
        instance_tags = self.ec2instance.convert_tags_to_dictionary()
        best_metrics = []
        best_score = 0
        # The configuration file is read again here on every call.
        for config in get_configs()["ec2"]["alarms"]:
            print(f"[+] Config: {config}")
            selector = config["select_by_tags"]
            config_metrics = config["metrics"]
            # Direct dict lookup instead of comparing every instance tag
            # against every selector tag; the resulting score is the same.
            score = sum(
                1
                for tag_key, tag_value in selector.items()
                if tag_key in instance_tags and instance_tags[tag_key] == tag_value
            )
            # Strictly greater: on a tie the earlier entry is kept.
            if score > best_score:
                best_score = score
                best_metrics = copy.deepcopy(config_metrics)
        matched = best_score > 0
        print(f"[+] Temp Metrics: {best_metrics}")
        print(f"[+] Matched: {matched}")
        return best_metrics, matched

    def __get_metrics_by_instance(self):
        """Build the full ``put_metric_alarm`` kwargs for every matched metric.

        Each configured metric is layered over a built-in default alarm and
        the ``ec2.default`` configuration section, and its ``AlarmName`` is
        rewritten to ``[<instance name>][<configured name>]``.

        Returns:
            ``(metrics, matched)``: the list of kwargs dicts (empty when
            nothing matched) and the match flag from the tag lookup.
        """
        print("[+] Alarms.__get_metrics_by_instsance")
        instance_name = self.ec2instance.get_name()
        base_metric = dict(
            Namespace="AWS/EC2",
            AlarmName="Shallow",
            AlarmDescription="",
            MetricName="CPUUtilization",
            ComparisonOperator="GreaterThanOrEqualToThreshold",
            Statistic="Average",
            Threshold=50,
            EvaluationPeriods=1,
            Period=5 * 60,
            ActionsEnabled=False,
            OKActions=[],
            AlarmActions=[],
            Dimensions=[{"Name": "InstanceId", "Value": self.ec2instance.instance.id}],
        )
        base_metric.update(CONFIGS["ec2"]["default"])
        temp_metrics, matched = self.__find_metrics_by_instance()
        metrics = []
        if matched:
            for temp_metric in temp_metrics:
                configured_name = temp_metric["AlarmName"]
                print(f"Old -> TempMetricAlarmName: {configured_name}")
                # temp_metric is already a private deep copy, so renaming it
                # in place does not touch the loaded configuration.
                full_name = f"[{instance_name}][{configured_name}]"
                temp_metric["AlarmName"] = full_name
                print(f"New -> TempMetricAlarmName: {full_name}")
                merged = copy.deepcopy(base_metric)
                merged.update(temp_metric)
                metrics.append(merged)
        print(f"[+] Metrics: {metrics}")
        return (metrics, matched)

    def create(self):
        """Create all matched alarms, wait, then enable their actions.

        After the alarms are created the method sleeps ``sleep_duration``
        seconds, enables the actions of each alarm and resets any alarm that
        is in ``ALARM`` state to ``INSUFFICIENT_DATA``.

        Raises:
            SystemExit: If no alarm configuration matches the instance tags.
        """
        print("[+] Create all alarms")
        metrics, matched = self.__get_metrics_by_instance()
        if not matched:
            # The process exits here, so the message must not promise a
            # fallback to default configurations.
            message = (
                f"We can't find the metrics for this '{self.ec2instance.get_name()}' "
                "instance, so no alarms will be created."
            )
            sys.exit(message)
        for metric in metrics:
            self.__create_alarm(metric)
        time.sleep(self.sleep_duration)
        # Enable all notifications
        # Change Alarm State to INSUFFICIENT_DATA
        print("[+] Enable all notifications")
        for metric in metrics:
            alarm = self.cw_resource.Alarm(metric["AlarmName"])
            print(f"[+] Alarm: {alarm}, State: {alarm.state_value}")
            alarm.enable_actions()
            time.sleep(1)
            # NOTE(review): state_value was already read for the log line
            # above and no reload() happens in between — confirm that the
            # pre-enable state is the one meant to be checked here.
            if alarm.state_value == "ALARM":
                alarm.set_state(
                    StateValue="INSUFFICIENT_DATA",
                    StateReason="Set state to INSUFFICIENT_DATA",
                )

    def delete(self):
        """Delete every alarm that ``create()`` would create for the instance.

        Raises:
            SystemExit: If no alarm configuration matches the instance tags.
        """
        print("[+] Delete all alarms")
        metrics, matched = self.__get_metrics_by_instance()
        if not matched:
            message = (
                f"We can't find the metrics for this '{self.ec2instance.get_name()}' instance."
            )
            sys.exit(message)
        all_alarm_names = [metric["AlarmName"] for metric in metrics]
        print(f"[+] Delete All Alarm Names: {all_alarm_names}")
        self.cw_client.delete_alarms(AlarmNames=all_alarm_names)

    def __str__(self):
        """Return a one-line summary with region, account id and instance id."""
        return (
            f"Alarms<Region: {self.region}, AccountId: {self.account_id}, "
            f"InstanceId: {self.instance_id}>"
        )
def handle(event, context):
    """Entry point: react to an EC2 instance state-change event.

    Creates the instance's alarms when the reported state is ``running``,
    deletes them when it is ``terminated`` and only logs for any other state.

    Args:
        event: Event dict; ``region``, ``account``, ``detail.instance-id``
            and ``detail.state`` are read from it.
        context: Unused.

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded body.
    """
    alarms = Alarms(
        region=event["region"],
        account_id=event["account"],
        instance_id=event["detail"]["instance-id"],
    )
    state = event["detail"]["state"]

    debug_enabled = CONFIGS["globals"]["debug"]
    if debug_enabled:
        for line in (
            f"[+] Event: {event}",
            f"[+] State: {state}",
            f"[+] Alarms: {alarms}",
        ):
            print(line)

    if state == "terminated":
        alarms.delete()
    elif state == "running":
        alarms.create()
    else:
        print("[+] Undefined Ec2 Event State")

    response = {"statusCode": 200, "body": json.dumps("Generic Alarms!")}
    return response
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
globals: | |
sleep_duration: 840 | |
debug: True | |
vars: | |
ec2: | |
default: | |
OKActions: ["arn:aws:sns:us-east-1:...:DatabaseAlerts"] | |
AlarmActions: ["arn:aws:sns:us-east-1:...:DatabaseAlerts"] | |
InsufficientDataActions: [] | |
EvaluationPeriods: 1 | |
Period: 300 | |
alarms: | |
- select_by_tags: | |
Name: logstash_test_1 | |
metrics: | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 70 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 40 | |
- select_by_tags: | |
aws:autoscaling:groupName: app_example_prod_1 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 30 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 20 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 20000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 2500000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 14000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 1000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: app_example_prod_2 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 85 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 20 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 110000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 25000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 100000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 20000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: app_example_prod_3 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 85 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 30 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 55000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 35000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: app_example_prod_4 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 70 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 30 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 100000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 25000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 40000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: nodes.k8s-prod-3 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 65 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 350000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 150000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 300000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 120000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: nodes.k8s-beta-5 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 70 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 5 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 5000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 2000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 5000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 1000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: app_example_prod_5 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 55 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 45000000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 15000000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 25000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: master-us-east-1b.masters.k8s-prod-3 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 3.5 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 2.5 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 500000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 400000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 10000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 9000000 | |
- select_by_tags: | |
aws:autoscaling:groupName: master-us-east-1b.masters.k8s-beta-5 | |
metrics: | |
- AlarmName: HighStatusCheckFailed | |
MetricName: StatusCheckFailed | |
Statistic: Maximum | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 1 | |
- AlarmName: HighCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 15 | |
- AlarmName: LowCPUUtilization | |
MetricName: CPUUtilization | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 10 | |
- AlarmName: HighNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 400000 | |
- AlarmName: LowNetworkIn | |
MetricName: NetworkIn | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 300000 | |
- AlarmName: HighNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: GreaterThanOrEqualToThreshold | |
Threshold: 2000000 | |
- AlarmName: LowNetworkOut | |
MetricName: NetworkOut | |
Statistic: Average | |
ComparisonOperator: LessThanOrEqualToThreshold | |
Threshold: 1700000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment