🕌
LambdaでAutoScalingに自動でアラーム設定する
はじめに
AutoScalingにより起動したインスタンスにAlarm設定を自動で行いたかったためLambdaを使って自動で設定してみます。
手順
- Lambda用ロールを作成する
- Lambdaを作成する
- EventBridgeを使い、AutoScaling起動時のイベントで発火させるように設定
Lambda用ロール作成
ポリシーは下記になります。
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": "arn:aws:logs:*:*:*"
},
{
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"cloudwatch:PutMetricAlarm",
"cloudwatch:DeleteAlarms",
"cloudwatch:DescribeAlarms"
],
"Resource": "*"
}
]
}
Lambda作成
Lambdaを作成していきます。
今回はPythonでコード作成したので、ランタイムはPythonにします。
また、実行ロールについては新規作成したものを選択して下さい。
lambda_function.py に下記コードを使用します。
アラームの設定(しきい値・評価期間・ディメンションなど)を変更したい場合は、SetAlarmクラスの各メソッドを変更してください。
on_launched関数、TOPIC_ARN、ALARM_NAME_LIST は環境に合わせて適宜変更する必要があります。
lambda_function.py
import json
import boto3
# SNS topic ARNs that the alarms notify (replace with your own topics).
TOPIC_ARN = [
    'arn:aws:sns:ap-northeast-1:xxxxxxxxx:test_1',
    'arn:aws:sns:ap-northeast-1:xxxxxxxxx:test_2',
]

# boto3 handles shared by every function in this module.
ec2 = boto3.resource('ec2')
cw = boto3.client('cloudwatch')
"""
基本的な監視設定を追加できます。
パラメータを変更すれば、設定値を変更可能です。
"""
class SetAlarm:
    """Create a basic set of CloudWatch alarms for one EC2 instance.

    Each public method registers one alarm through ``cw.put_metric_alarm``
    and sends both ALARM and OK notifications to ``TOPIC_ARN``.  Change the
    values passed to ``_put_alarm`` to tune periods, metrics or dimensions.
    """

    # ID of the instance the alarms are attached to.  Callers set this
    # (on the class or on an instance) before invoking any alarm method.
    instance_id = None

    def __init__(self):
        print("create SetAlarm instance")

    def _put_alarm(self, *, alarm_name, description, namespace, metric_name,
                   threshold, comparison, period, evaluation_periods,
                   unit=None, extra_dimensions=()):
        """Register one metric alarm scoped to ``self.instance_id``.

        Args:
            alarm_name: Name of the alarm to create or overwrite.
            description: Text stored as the alarm description.
            namespace: CloudWatch namespace of the metric.
            metric_name: Name of the metric to watch.
            threshold: Value the metric is compared against.
            comparison: CloudWatch comparison operator name.
            period: Evaluation period length in seconds.
            evaluation_periods: Number of periods evaluated; the same value
                is used for ``DatapointsToAlarm``.
            unit: Metric unit, or ``None`` to omit the ``Unit`` parameter.
            extra_dimensions: ``(name, value)`` pairs appended after the
                ``InstanceId`` dimension.

        Raises:
            ValueError: If ``instance_id`` has not been set.
        """
        # Fail early with a clear message instead of sending a request
        # whose InstanceId dimension is None.
        if self.instance_id is None:
            raise ValueError(
                "SetAlarm.instance_id must be set before creating alarms"
            )

        dimensions = [{"Name": "InstanceId", "Value": self.instance_id}]
        dimensions.extend(
            {"Name": name, "Value": value} for name, value in extra_dimensions
        )

        params = {
            "AlarmName": alarm_name,
            "AlarmDescription": description,
            "Namespace": namespace,
            "Dimensions": dimensions,
            "MetricName": metric_name,
            "Statistic": "Average",
            "Threshold": threshold,
            "ComparisonOperator": comparison,
            "Period": period,
            "EvaluationPeriods": evaluation_periods,
            "DatapointsToAlarm": evaluation_periods,
            "TreatMissingData": "ignore",
            "AlarmActions": TOPIC_ARN,
            "OKActions": TOPIC_ARN,
        }
        # Only send Unit when the alarm definition specifies one.
        if unit is not None:
            params["Unit"] = unit

        cw.put_metric_alarm(**params)
        print(f'ok, "{alarm_name}" setting is complete')

    # Cpu
    def cpu_util_alarm(self, alarm_name, threshold):
        """Alarm when average CPUUtilization stays above ``threshold`` percent."""
        self._put_alarm(
            alarm_name=alarm_name,
            description=f'"CPU使用率が {threshold} % を超えていたら通知"',
            namespace='AWS/EC2',
            metric_name='CPUUtilization',
            threshold=threshold,
            unit='Percent',
            comparison="GreaterThanThreshold",
            period=60,
            evaluation_periods=3,
        )

    # Mem
    def mem_util_alarm(self, alarm_name, threshold):
        """Alarm when average mem_used_percent stays above ``threshold`` percent."""
        self._put_alarm(
            alarm_name=alarm_name,
            description=f'"Memory使用率が {threshold} % を超えていたら通知"',
            namespace='CWAgent',
            metric_name='mem_used_percent',
            threshold=threshold,
            unit='Percent',
            comparison="GreaterThanThreshold",
            period=60,
            evaluation_periods=3,
        )

    # Disk
    def disk_util_alarm(self, alarm_name, threshold):
        """Alarm when average disk_used_percent of "/" stays above ``threshold``."""
        self._put_alarm(
            alarm_name=alarm_name,
            description=f'"Disk使用率が {threshold} % を超えていたら通知"',
            namespace='CWAgent',
            metric_name='disk_used_percent',
            threshold=threshold,
            unit='Percent',
            comparison="GreaterThanThreshold",
            period=300,
            evaluation_periods=3,
            # NOTE(review): device/fstype/path are hard-coded; presumably they
            # must match the dimensions the agent publishes - confirm per AMI.
            extra_dimensions=(
                ("device", "nvme0n1p1"),
                ("fstype", "xfs"),
                ("path", "/"),
            ),
        )

    # StatusCheck
    def status_check_alarm(self, alarm_name, threshold):
        """Alarm when average StatusCheckFailed reaches ``threshold``.

        StatusCheckFailed only reports 0 or 1, so with the threshold of 1
        used by the caller a strict "greater than" comparison could never
        fire.  The comparison is therefore "greater than or equal".
        """
        self._put_alarm(
            alarm_name=alarm_name,
            description=f'"StatusCheckFailedが {threshold} 以上であれば通知"',
            namespace='AWS/EC2',
            metric_name='StatusCheckFailed',
            threshold=threshold,
            unit='Count',
            comparison="GreaterThanOrEqualToThreshold",
            period=60,
            evaluation_periods=1,
        )

    # Process
    ## httpd
    def httpd_process_alarm(self, alarm_name, threshold):
        """Alarm when the httpd process count drops below ``threshold``."""
        self._put_alarm(
            alarm_name=alarm_name,
            # The comparison is LessThanThreshold, so the description says
            # "below" rather than "above".
            description=f'httpd_process_countが {threshold} を下回っていたら通知',
            namespace='CWAgent',
            metric_name='procstat_lookup_pid_count',
            threshold=threshold,
            comparison="LessThanThreshold",
            period=60,
            evaluation_periods=1,
            extra_dimensions=(
                ("exe", "httpd"),
                ("pid_finder", "native"),
            ),
        )
def on_launched(instance_id, ALARM_NAME_LIST):
    """Create the alarms for a newly launched instance.

    Edit the calls below to choose which alarms are created and with which
    thresholds.  Alarm names are taken from ``ALARM_NAME_LIST`` by position:
    0-2 CPU (70/80/90), 3-5 memory (70/80/90), 6 disk, 7 status check,
    8 httpd process.

    Args:
        instance_id: ID of the instance the alarms are attached to.  When
            ``None``, the value already set on ``SetAlarm`` is used.
        ALARM_NAME_LIST: Alarm names, at least nine entries; extras are
            ignored.

    Raises:
        IndexError: If ``ALARM_NAME_LIST`` has fewer than nine entries.
    """
    # Check the list up front so a short list cannot leave a partial set of
    # alarms behind.
    required = 9
    if len(ALARM_NAME_LIST) < required:
        raise IndexError(
            f"ALARM_NAME_LIST needs at least {required} names, "
            f"got {len(ALARM_NAME_LIST)}"
        )

    setalarm = SetAlarm()
    # Bind the alarms to the instance passed in, rather than relying only on
    # the class-level attribute having been set by the caller.
    if instance_id is not None:
        setalarm.instance_id = instance_id

    # cpu
    setalarm.cpu_util_alarm(threshold=70, alarm_name=ALARM_NAME_LIST[0])
    setalarm.cpu_util_alarm(threshold=80, alarm_name=ALARM_NAME_LIST[1])
    setalarm.cpu_util_alarm(threshold=90, alarm_name=ALARM_NAME_LIST[2])
    # mem
    setalarm.mem_util_alarm(threshold=70, alarm_name=ALARM_NAME_LIST[3])
    setalarm.mem_util_alarm(threshold=80, alarm_name=ALARM_NAME_LIST[4])
    setalarm.mem_util_alarm(threshold=90, alarm_name=ALARM_NAME_LIST[5])
    # disk
    setalarm.disk_util_alarm(threshold=90, alarm_name=ALARM_NAME_LIST[6])
    # statuscheck
    setalarm.status_check_alarm(threshold=1, alarm_name=ALARM_NAME_LIST[7])
    # httpd
    setalarm.httpd_process_alarm(threshold=1, alarm_name=ALARM_NAME_LIST[8])
def on_terminated(ALARM_NAME_LIST):
    """Delete the alarms that belong to a terminated instance.

    Every alarm name in ``ALARM_NAME_LIST`` is passed to
    ``cw.delete_alarms`` in a single call, then a completion message is
    printed.
    """
    names_to_remove = ALARM_NAME_LIST
    cw.delete_alarms(AlarmNames=names_to_remove)
    print(f'{ALARM_NAME_LIST} の削除が完了しました。')
# event Main
def lambda_handler(event, context):
    """Create or delete CloudWatch alarms for an Auto Scaling instance event.

    Args:
        event: Event payload; ``event['detail']['EC2InstanceId']`` and
            ``event['detail-type']`` are read.
        context: Lambda context object (unused).

    Returns:
        None.

    Raises:
        KeyError: If the event lacks ``detail.EC2InstanceId`` or
            ``detail-type``.
    """
    instance_id = event['detail']['EC2InstanceId']
    detail_type = event['detail-type']

    is_launch = detail_type == "EC2 Instance Launch Successful"
    is_terminate = detail_type == "EC2 Instance Terminate Successful"
    # Skip the EC2 lookup entirely for events this function does not handle.
    if not (is_launch or is_terminate):
        print("no matched detail_type")
        return

    # Name tag; ``instance.tags`` is None when the instance has no tags.
    instance = ec2.Instance(id=instance_id)
    name_tag = [
        tag['Value'] for tag in (instance.tags or []) if tag['Key'] == 'Name'
    ]
    instance_name = name_tag[0] if name_tag else ''

    # Also set on the class so SetAlarm instances created later see the ID.
    SetAlarm.instance_id = instance_id

    # Alarm names (edit to taste).  The order matters: on_launched picks
    # names by index.  The "Memorytilization" spelling is kept as-is so the
    # names keep matching alarms that were already created.
    prefix = f'[{instance_name}][{instance_id}]'
    ALARM_NAME_LIST = [
        # CPU
        f'{prefix}CPUUtilization-High-70%',
        f'{prefix}CPUUtilization-High-80%',
        f'{prefix}CPUUtilization-High-90%',
        # MEM
        f'{prefix}Memorytilization-High-70%',
        f'{prefix}Memorytilization-High-80%',
        f'{prefix}Memorytilization-High-90%',
        # DISK
        f'{prefix}DiskUtilization-High-90%',
        # STATUS
        f'{prefix}StatusCheckFailed',
        # PROCESS
        f'{prefix}httpd process down',
    ]

    # Instance launched or instance terminated.
    if is_launch:
        print(f'Instance_ID:{instance_id} のアラームを設定します。')
        on_launched(instance_id, ALARM_NAME_LIST)
    else:
        print(f'Instance_ID:{instance_id} のアラームを削除します。')
        on_terminated(ALARM_NAME_LIST)
EventBridge作成
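下のように設定します。イベントソースは aws.autoscaling、detail-type は "EC2 Instance Launch Successful" と "EC2 Instance Terminate Successful" を対象とし、ターゲットには作成したLambda関数を指定します。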
動作確認
Lambdaのテストイベントを使って動作確認します。
detail-typeとEC2InstanceIdを変更します。
削除する場合は、"EC2 Instance Terminate Successful" を使用。
event
{
"version": "0",
"id": "3e3c153a-8339-4e30-8c35-687ebef853fe",
"detail-type": "EC2 Instance Launch Successful",
"source": "aws.autoscaling",
"account": "123456789012",
"time": "2015-11-11T21:31:47Z",
"region": "us-east-1",
"resources": ["arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:eb56d16b-bbf0-401d-b893-d5978ed4a025:autoScalingGroupName/sampleLuanchSucASG", "arn:aws:ec2:us-east-1:123456789012:instance/i-b188560f"],
"detail": {
"StatusCode": "InProgress",
"AutoScalingGroupName": "sampleLuanchSucASG",
"ActivityId": "9cabb81f-42de-417d-8aa7-ce16bf026590",
"Details": {
"Availability Zone": "us-east-1b",
"Subnet ID": "subnet-95bfcebe"
},
"RequestId": "9cabb81f-42de-417d-8aa7-ce16bf026590",
"EndTime": "2015-11-11T21:31:47.208Z",
"EC2InstanceId": "i-b188560f",
"StartTime": "2015-11-11T21:31:13.671Z",
"Cause": "At 2015-11-11T21:31:10Z a user request created an AutoScalingGroup changing the desired capacity from 0 to 1. At 2015-11-11T21:31:11Z an instance was started in response to a difference between desired and actual capacity, increasing the capacity from 0 to 1."
}
}
実行すると設定されていることが確認できました。
参考
参考にさせていただきました。
Discussion