🔆

Aurora Serverless v2 と Redshift Serverless の Zero-ETL統合 でコスパ最強のデータ分析基盤

はじめに

データ分析基盤を作りたいと思ったことは誰しもあるかと思います。

AWSではOLTPワークロードにAurora、OLAPワークロードにRedshiftが提供されていましたが、どちらも最低利用料が月額1万円程度かかりました。
しかし近年ではServerless版が発表され、負荷の軽いワークロードでは低価格で利用することが可能となりました。

今回は、Terraformを使ってAurora Serverless v2とRedshift ServerlessのZero-ETL統合を簡単に構築する方法を紹介します。

Terraformコード

全体のTerraformコードです。

Terraform
data "aws_caller_identity" "this" {}

resource "random_id" "this" {
  byte_length = 4
}

#
# Aurora Serverless
#

# Aurora Serverless – PostgreSQL
resource "aws_rds_cluster" "this" {
  cluster_identifier              = "${var.service}-${var.environment}-cluster"
  engine                          = "aurora-postgresql"
  engine_mode                     = "provisioned"
  engine_version                  = "16.6" # ご利用のバージョンに合わせて変更してください
  database_name                   = "${var.service}_${var.environment}_db"
  master_username                 = "${var.service}_${var.environment}_admin"
  master_password                 = var.rds_password
  storage_encrypted               = true
  enabled_cloudwatch_logs_exports = ["postgresql"]
  backup_retention_period         = 7
  final_snapshot_identifier       = "${var.service}-${var.environment}-cluster-${random_id.this.hex}"
  db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.this.name
  db_subnet_group_name            = aws_db_subnet_group.rds.name
  vpc_security_group_ids          = [aws_security_group.rds.id]
  network_type                    = "DUAL"

  serverlessv2_scaling_configuration {
    max_capacity = 1.0
    min_capacity = 0.5
  }

  lifecycle {
    ignore_changes = [
      engine_version
    ]
  }
}

resource "aws_rds_cluster_instance" "this" {
  identifier                   = "${var.service}-${var.environment}-instance"
  cluster_identifier           = aws_rds_cluster.this.id
  instance_class               = "db.serverless"
  engine                       = aws_rds_cluster.this.engine
  engine_version               = aws_rds_cluster.this.engine_version
  db_subnet_group_name         = aws_db_subnet_group.rds.name
  db_parameter_group_name      = aws_db_parameter_group.this.name
  performance_insights_enabled = var.environment == "prod" ? true : false
}

# Aurora Serverless – Security group
resource "aws_security_group" "rds" {
  name   = "${var.service}-${var.environment}-rds-sg"
  vpc_id = var.vpc.id
}

# Aurora Serverless – Parameter group
resource "aws_rds_cluster_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-cluster-parameter"
  family = "aurora-postgresql16" # ご利用のバージョンファミリーに合わせて変更してください

  parameter {
    apply_method = "pending-reboot"
    name         = "rds.logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.enhanced_logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_backup"
    value        = 0
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_globaldb"
    value        = 0
  }
}

resource "aws_db_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-instance-parameter"
  family = "aurora-postgresql16" # ご利用のバージョンファミリーに合わせて変更してください
}

# Aurora Serverless – Subnet group
resource "aws_db_subnet_group" "rds" {
  name        = "${var.service}-${var.environment}-rds-sg"
  description = "${var.service}-${var.environment}-rds-sg"
  subnet_ids  = var.database_subnet_ids
}

#
# Redshift Serverless
#

# RedShift Serverless – Namespace
resource "aws_redshiftserverless_namespace" "this" {
  namespace_name = "${var.service}-${var.environment}-redshift-serverless-namespace"
  db_name        = "${var.service}-${var.environment}-db"
}

# RedShift Serverless – Workgroup
resource "aws_redshiftserverless_workgroup" "this" {
  namespace_name       = aws_redshiftserverless_namespace.this.namespace_name
  workgroup_name       = "${var.service}-${var.environment}-redshift-serverless-workgroup"
  subnet_ids           = aws_db_subnet_group.redshift.subnet_ids
  security_group_ids   = [aws_security_group.redshift.id]
  base_capacity        = 8
  max_capacity         = 256
  enhanced_vpc_routing = true
  publicly_accessible  = false
  port                 = 5439

  config_parameter {
    parameter_key   = "auto_mv"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "datestyle"
    parameter_value = "ISO, MDY"
  }
  config_parameter {
    parameter_key   = "enable_case_sensitive_identifier"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "enable_user_activity_logging"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "max_query_execution_time"
    parameter_value = "14400"
  }
  config_parameter {
    parameter_key   = "query_group"
    parameter_value = "default"
  }
  config_parameter {
    parameter_key   = "require_ssl"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "search_path"
    parameter_value = "$user, public"
  }
  config_parameter {
    parameter_key   = "use_fips_ssl"
    parameter_value = "false"
  }
}

data "aws_iam_policy_document" "this" {
  statement {
    effect = "Allow"
    principals {
      type        = "Service"
      identifiers = ["redshift.amazonaws.com"]
    }
    actions = [
      "redshift:AuthorizeInboundIntegration"
    ]
    condition {
      test     = "StringEquals"
      variable = "aws:SourceArn"
      values   = [aws_rds_cluster.this.arn]
    }
  }
  statement {
    effect = "Allow"
    principals {
      type        = "AWS"
      identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
    }
    actions = [
      "redshift:CreateInboundIntegration",
    ]
  }
}

resource "aws_redshift_resource_policy" "this" {
  resource_arn = aws_redshiftserverless_namespace.this.arn
  policy       = data.aws_iam_policy_document.this.json
}

# RedShift Serverless – Security group
resource "aws_security_group" "redshift" {
  name   = "${var.service}-${var.environment}-redshift-sg"
  vpc_id = var.vpc.id
}

# RedShift Serverless – Subnet group
resource "aws_db_subnet_group" "redshift" {
  name       = "${var.service}-${var.environment}-redshift-subnet-group"
  subnet_ids = var.database_subnet_ids
}

#
# Zero-ETL intergration
#

resource "aws_rds_integration" "this" {
  integration_name = "${var.service}-${var.environment}-zero-etl-integration"
  source_arn       = aws_rds_cluster.this.arn
  target_arn       = aws_redshiftserverless_namespace.this.arn
  data_filter      = "include: ${var.service}_${var.environment}_db.*.*"
}

コード解説

重要な部分のみ解説します。

Aurora パラメータ

Zero-ETL統合には、以下の4つのパラメータ設定が必須です。

resource "aws_rds_cluster_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-cluster-parameter"
  family = "aurora-postgresql16"

  parameter {
    apply_method = "pending-reboot"
    name         = "rds.logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.enhanced_logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_backup"
    value        = 0
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_globaldb"
    value        = 0
  }
}

Redshift パラメータ

RedshiftのWorkgroupでは、enable_case_sensitive_identifier = "true" の設定がZero-ETL統合に必須です。

resource "aws_redshiftserverless_workgroup" "this" {
  # ... (省略) ...

  config_parameter {
    parameter_key   = "auto_mv"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "datestyle"
    parameter_value = "ISO, MDY"
  }
  config_parameter {
    parameter_key   = "enable_case_sensitive_identifier"
    parameter_value = "true" # trueが必須
  }
  # ... (その他のパラメータ) ...
}

Redshift リソースポリシー

Redshiftのリソースポリシーを設定し、Auroraからデータを受け取る権限と、AWSアカウント内のユーザーがETL統合を操作する権限を付与する必要があります。

data "aws_iam_policy_document" "this" {
  statement {
    effect = "Allow"
    principals {
      type        = "Service"
      identifiers = ["redshift.amazonaws.com"]
    }
    actions = [
      "redshift:AuthorizeInboundIntegration"
    ]
    condition {
      test     = "StringEquals"
      variable = "aws:SourceArn"
      values   = [aws_rds_cluster.this.arn]
    }
  }
  statement {
    effect = "Allow"
    principals {
      type        = "AWS"
      identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
    }
    actions = [
      "redshift:CreateInboundIntegration",
    ]
  }
}

resource "aws_redshift_resource_policy" "this" {
  resource_arn = aws_redshiftserverless_namespace.this.arn
  policy       = data.aws_iam_policy_document.this.json
}

Zero-ETL 統合

以上の設定が完了したら、aws_rds_integrationリソースでAuroraとRedshiftを統合します。data_filterで同期するデータベースやテーブルを指定できます。

resource "aws_rds_integration" "this" {
  integration_name = "${var.service}-${var.environment}-zero-etl-integration"
  source_arn       = aws_rds_cluster.this.arn
  target_arn       = aws_redshiftserverless_namespace.this.arn
  data_filter      = "include: ${var.service}_${var.environment}_db.*.*"
}

お疲れ様でした!これでZero-ETL統合のインフラ構築は完了です。

Discussion