🕌

LambdaでAutoScalingに自動でアラーム設定する

2023/04/27に公開

はじめに

AutoScalingで起動したインスタンスへのアラーム設定を自動化したかったため、Lambdaを使って設定してみます。

手順

  1. Lambda用ロールを作成する
  2. Lambdaを作成する
  3. EventBridgeを使い、AutoScaling起動時のイベントで発火させるように設定

Lambda用ロール作成

ポリシーは下記になります。

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogGroup",
                "logs:CreateLogStream",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeInstances",
                "cloudwatch:PutMetricAlarm",
                "cloudwatch:DeleteAlarms",
                "cloudwatch:DescribeAlarms"
            ],
            "Resource": "*"
        }
    ]
}

Lambda作成

Lambdaを作成していきます。
今回はPythonでコード作成したので、ランタイムはPythonにします。
また、実行ロールについては新規作成したものを選択して下さい。

lambda_function.py に下記コードを使用します。
アラームの設定値を変更したい場合は、SetAlarmクラスの各メソッドを変更してください。
on_launched関数、TOPIC_ARN、ALARM_NAME_LIST は環境に合わせて適宜変更する必要があります。

lambda_function.py
import json
import boto3

# SNS topic ARNs notified by every alarm, on both ALARM and OK transitions.
# The values below are placeholders; replace them with real topic ARNs.
TOPIC_ARN=['arn:aws:sns:ap-northeast-1:xxxxxxxxx:test_1','arn:aws:sns:ap-northeast-1:xxxxxxxxx:test_2']

# boto3 handles, created once at import time and shared by the code below:
# `ec2` is used to look up instance tags, `cw` to create and delete alarms.
ec2 = boto3.resource('ec2')
cw = boto3.client('cloudwatch')

"""
基本的な監視設定を追加できます。
パラメータを変更すれば、設定値を変更可能です。
"""
class SetAlarm:
    """Create the standard CloudWatch alarms for a single EC2 instance.

    Every public method registers one alarm through ``cw.put_metric_alarm``
    and notifies all topics in ``TOPIC_ARN`` on both ALARM and OK
    transitions.  To change an alarm's settings, edit the values the method
    passes to ``_put_alarm``.

    The target instance is read from ``instance_id``.  It may be passed to
    the constructor, assigned on an instance, or assigned on the class
    itself (``SetAlarm.instance_id = ...``).
    """

    # Default target instance for instances that do not set their own.
    instance_id = None

    def __init__(self, instance_id=None):
        """Optionally bind this object to *instance_id*."""
        # Only shadow the class-level value when one is supplied, so code
        # that assigns SetAlarm.instance_id keeps working unchanged.
        if instance_id is not None:
            self.instance_id = instance_id
        print("create SetAlarm instance")

    def _put_alarm(self, alarm_name, description, namespace, metric_name,
                   threshold, *, unit=None, extra_dimensions=None,
                   comparison="GreaterThanThreshold", period=60,
                   evaluation_periods=3):
        """Register one alarm on ``self.instance_id`` and log the result.

        ``extra_dimensions`` is a mapping of dimension name to value that is
        appended after the ``InstanceId`` dimension.  ``Unit`` is only sent
        when *unit* is given.  All evaluated datapoints must breach
        (``DatapointsToAlarm`` equals ``EvaluationPeriods``).
        """
        dimensions = [{"Name": "InstanceId", "Value": self.instance_id}]
        for name, value in (extra_dimensions or {}).items():
            dimensions.append({"Name": name, "Value": value})

        params = {
            "AlarmName": alarm_name,
            "AlarmDescription": description,
            "Namespace": namespace,
            "Dimensions": dimensions,
            "MetricName": metric_name,
            "Statistic": "Average",
            "Threshold": threshold,
            "ComparisonOperator": comparison,
            "Period": period,
            "EvaluationPeriods": evaluation_periods,
            "DatapointsToAlarm": evaluation_periods,
            "TreatMissingData": "ignore",
            "AlarmActions": TOPIC_ARN,
            "OKActions": TOPIC_ARN,
        }
        if unit is not None:
            params["Unit"] = unit

        cw.put_metric_alarm(**params)
        print(f'ok, "{alarm_name}" setting is complete')

    # Cpu
    def cpu_util_alarm(self, alarm_name, threshold):
        """Alarm when average CPUUtilization stays above *threshold* percent
        for three consecutive 60-second periods."""
        self._put_alarm(
            alarm_name,
            f'"CPU使用率が {threshold} % を超えていたら通知"',
            'AWS/EC2',
            'CPUUtilization',
            threshold,
            unit='Percent',
        )

    # Mem
    def mem_util_alarm(self, alarm_name, threshold):
        """Alarm when average mem_used_percent (CWAgent namespace) stays
        above *threshold* percent for three consecutive 60-second periods."""
        self._put_alarm(
            alarm_name,
            f'"Memory使用率が {threshold} % を超えていたら通知"',
            'CWAgent',
            'mem_used_percent',
            threshold,
            unit='Percent',
        )

    # Disk
    def disk_util_alarm(self, alarm_name, threshold, device="nvme0n1p1",
                        fstype="xfs", path="/"):
        """Alarm when average disk_used_percent (CWAgent namespace) stays
        above *threshold* percent for three consecutive 300-second periods.

        *device*, *fstype* and *path* are sent as metric dimensions; the
        defaults are the values this code originally hard-coded.
        """
        self._put_alarm(
            alarm_name,
            f'"Disk使用率が {threshold} % を超えていたら通知"',
            'CWAgent',
            'disk_used_percent',
            threshold,
            unit='Percent',
            extra_dimensions={"device": device, "fstype": fstype, "path": path},
            period=300,
        )

    # StatusCheck
    def status_check_alarm(self, alarm_name, threshold):
        """Alarm when average StatusCheckFailed is above *threshold* in a
        single 60-second period.

        NOTE: the comparison is strictly greater-than and the metric is
        documented as either 0 or 1, so a threshold of 1 can never fire;
        pass 0 to alarm on any failed check.
        """
        self._put_alarm(
            alarm_name,
            f'"StatusCheckFailedが {threshold} を超えていたら通知"',
            'AWS/EC2',
            'StatusCheckFailed',
            threshold,
            unit='Count',
            evaluation_periods=1,
        )

    # Process
    ## httpd
    def httpd_process_alarm(self, alarm_name, threshold):
        """Alarm when the average number of httpd PIDs found by the agent
        drops below *threshold* in a single 60-second period."""
        self._put_alarm(
            alarm_name,
            # The alarm fires on LessThanThreshold, so the description says
            # "falls below" rather than "exceeds".
            f'httpd_process_countが {threshold} を下回っていたら通知',
            'CWAgent',
            'procstat_lookup_pid_count',
            threshold,
            extra_dimensions={"exe": "httpd", "pid_finder": "native"},
            comparison="LessThanThreshold",
            evaluation_periods=1,
        )

def on_launched(instance_id, ALARM_NAME_LIST):
    """Create the alarms for a newly launched instance.

    Alarm names are taken from ALARM_NAME_LIST by position, in this order:
    CPU 70/80/90, memory 70/80/90, disk 90, status check, httpd process.
    Edit the plan below to add or remove alarms or to change thresholds.

    Raises:
        IndexError: if ALARM_NAME_LIST has fewer names than planned alarms.
            This is checked before any alarm is created, so a short list
            cannot leave the instance with only part of its alarms.
    """
    # (SetAlarm method name, threshold), in ALARM_NAME_LIST order.
    alarm_plan = (
        # cpu
        ("cpu_util_alarm", 70),
        ("cpu_util_alarm", 80),
        ("cpu_util_alarm", 90),
        # mem
        ("mem_util_alarm", 70),
        ("mem_util_alarm", 80),
        ("mem_util_alarm", 90),
        # disk
        ("disk_util_alarm", 90),
        # statuscheck: StatusCheckFailed is only ever 0 or 1 and the alarm
        # uses a strict greater-than comparison, so the threshold must be 0
        # for a failed check to trigger it.
        ("status_check_alarm", 0),
        # httpd: fires when the process count drops below 1
        ("httpd_process_alarm", 1),
    )

    if len(ALARM_NAME_LIST) < len(alarm_plan):
        raise IndexError(
            f'ALARM_NAME_LIST has {len(ALARM_NAME_LIST)} names, '
            f'but {len(alarm_plan)} are required'
        )

    setalarm = SetAlarm()
    # Bind the target explicitly instead of relying on class-level state
    # having been set by the caller.
    setalarm.instance_id = instance_id

    for (method_name, threshold), alarm_name in zip(alarm_plan, ALARM_NAME_LIST):
        create_alarm = getattr(setalarm, method_name)
        create_alarm(threshold=threshold, alarm_name=alarm_name)


def on_terminated(ALARM_NAME_LIST):
    """Delete every alarm named in ALARM_NAME_LIST.

    Called when the instance is terminated.  Names are sent in batches
    because CloudWatch accepts at most 100 alarm names per DeleteAlarms
    call; an empty list results in no API call.
    """
    max_names_per_call = 100
    names = list(ALARM_NAME_LIST)
    for start in range(0, len(names), max_names_per_call):
        cw.delete_alarms(AlarmNames=names[start:start + max_names_per_call])
    print(f'{ALARM_NAME_LIST} の削除が完了しました。')
    return

# event Main
def lambda_handler(event, context):
    """Lambda entry point for Auto Scaling launch/terminate events.

    Reads ``detail-type`` and ``detail.EC2InstanceId`` from *event*, builds
    the alarm names from the instance's Name tag and id, then creates the
    alarms on launch or deletes them on termination.  Any other event type
    is logged and ignored.  *context* is unused.
    """
    launch_type = "EC2 Instance Launch Successful"
    terminate_type = "EC2 Instance Terminate Successful"

    detail_type = event['detail-type']
    # Bail out before calling EC2 for events this function does not handle.
    if detail_type not in (launch_type, terminate_type):
        print("no matched detail_type")
        return

    instance_id = event['detail']['EC2InstanceId']

    # name tag
    instance = ec2.Instance(id=instance_id)
    # `tags` is None (not an empty list) for an instance without any tags.
    name_tag = [x['Value'] for x in (instance.tags or []) if x['Key'] == 'Name']
    instance_name = name_tag[0] if name_tag else ''

    # Publish the target on the class for SetAlarm users that read it there.
    SetAlarm.instance_id = instance_id

    # Alarm names; adjust as needed.  The order must match the positions
    # on_launched reads from.  The same names are used for deletion.
    # NOTE(review): "Memorytilization" looks like a misspelling of
    # "MemoryUtilization"; it is kept because renaming would leave alarms
    # created under the old name undeleted on termination.
    prefix = f'[{instance_name}][{instance_id}]'
    ALARM_NAME_LIST = [
        # CPU
        f'{prefix}CPUUtilization-High-70%',
        f'{prefix}CPUUtilization-High-80%',
        f'{prefix}CPUUtilization-High-90%',
        # MEM
        f'{prefix}Memorytilization-High-70%',
        f'{prefix}Memorytilization-High-80%',
        f'{prefix}Memorytilization-High-90%',
        # DISK
        f'{prefix}DiskUtilization-High-90%',
        # STATUS
        f'{prefix}StatusCheckFailed',
        # PROCESS
        f'{prefix}httpd process down',
    ]

    # Instance launched or instance terminated
    if detail_type == launch_type:
        print(f'Instance_ID:{instance_id} のアラームを設定します。')
        on_launched(instance_id, ALARM_NAME_LIST)
    else:
        print(f'Instance_ID:{instance_id} のアラームを削除します。')
        on_terminated(ALARM_NAME_LIST)

EventBridge作成

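下のように設定します。
イベントパターンは、source が "aws.autoscaling"、detail-type が "EC2 Instance Launch Successful" と "EC2 Instance Terminate Successful" のイベントに一致するようにし、ターゲットには作成した Lambda 関数を指定します。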


動作確認

Lambda のテストイベントを使って動作確認します。
detail-typeとEC2InstanceIdを変更します。
削除する場合は、"EC2 Instance Terminate Successful" を使用。

event
{
  "version": "0",
  "id": "3e3c153a-8339-4e30-8c35-687ebef853fe",
  "detail-type": "EC2 Instance Launch Successful",
  "source": "aws.autoscaling",
  "account": "123456789012",
  "time": "2015-11-11T21:31:47Z",
  "region": "us-east-1",
  "resources": ["arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:eb56d16b-bbf0-401d-b893-d5978ed4a025:autoScalingGroupName/sampleLuanchSucASG", "arn:aws:ec2:us-east-1:123456789012:instance/i-b188560f"],
  "detail": {
    "StatusCode": "InProgress",
    "AutoScalingGroupName": "sampleLuanchSucASG",
    "ActivityId": "9cabb81f-42de-417d-8aa7-ce16bf026590",
    "Details": {
      "Availability Zone": "us-east-1b",
      "Subnet ID": "subnet-95bfcebe"
    },
    "RequestId": "9cabb81f-42de-417d-8aa7-ce16bf026590",
    "EndTime": "2015-11-11T21:31:47.208Z",
    "EC2InstanceId": "i-b188560f",
    "StartTime": "2015-11-11T21:31:13.671Z",
    "Cause": "At 2015-11-11T21:31:10Z a user request created an AutoScalingGroup changing the desired capacity from 0 to 1.  At 2015-11-11T21:31:11Z an instance was started in response to a difference between desired and actual capacity, increasing the capacity from 0 to 1."
  }
}

実行すると設定されていることが確認できました。

参考

参考にさせていただきました。

https://dev.classmethod.jp/articles/get-name-tag-from-instance-object-in-boto3/#toc-1
https://www.m3tech.blog/entry/ec2-auto-alarm-lambda

Discussion