Open9

SQL品質

YuichiYuichi

より信頼できるクエリを書くために、SQLでもテストを書く

https://fkubota.hatenablog.com/entry/test_driven_sql
重複確認追記

WITH
users AS (
  SELECT * FROM UNNEST([
    STRUCT<id INT64,name STRING,age INT64,registered_date DATE>
    (1,'aaa',25,DATE('2023-01-01'))
    ,(1,'bbb',30,DATE('2023-01-01'))
    ,(3,'ccc',20,DATE('2023-03-01'))
    ,(4,'ddd',43,DATE('2023-07-01'))
    ,(null,'eee',18,DATE('2023-12-01'))
  ])
)

,test AS (
  SELECT
    -- ageにnullがない
    CASE
      WHEN EXISTS (
        SELECT 1
        FROM users
        WHERE id IS NULL
      ) THEN ERROR('idにnullが存在します')
      ELSE '問題なし'
    END AS test1

    -- idに重複がない  
    ,CASE
      WHEN EXISTS (
        SELECT id,COUNT(id) AS count
        FROM users
        GROUP BY id
        HAVING COUNT(id) > 1
      ) THEN ERROR('idに重複が存在します')
      ELSE '問題なし'
    END AS test2
)

SELECT *
FROM users
CROSS JOIN test
YuichiYuichi

結局中身が見たい

WITH
users AS (
  SELECT * FROM UNNEST([
    STRUCT<id INT64,name STRING,age INT64,registered_date DATE>
    (1,'aaa',25,DATE('2023-01-01'))
    ,(1,'bbb',30,DATE('2023-01-01'))
    ,(3,'ccc',20,DATE('2023-03-01'))
    ,(4,'ddd',43,DATE('2023-07-01'))
    ,(null,'eee',18,DATE('2023-12-01'))
  ])
)

SELECT *
FROM users
where 1 = 1
AND id IN (
    SELECT id
    FROM users
    GROUP BY id
    HAVING COUNT(id) > 1
  )
OR id IS NULL
ORDER BY id
YuichiYuichi

SQLで統計情報取得

  • MySQL
SELECT 
    'id' AS column_name, 
    COUNT(id) AS count_value, 
    COUNT(DISTINCT id) AS unique_count, 
    COUNT(*) - COUNT(id) AS null_value,
    STDDEV(id) AS stddev,
    AVG(id) AS avg_value,
    MIN(id) AS min_value, 
    MAX(id) AS max_value
FROM employees

UNION ALL

SELECT 
    'salary' AS column_name, 
    COUNT(salary) AS count_value, 
    COUNT(DISTINCT salary) AS unique_count, 
    COUNT(*) - COUNT(salary) AS null_value,
    STDDEV(salary) AS stddev,
    AVG(salary) AS avg_value,
    MIN(salary) AS min_value, 
    MAX(salary) AS max_value
FROM employees
  • bigquery
  SELECT 
    'employee_id' AS column_name,
    COUNT(employee_id) AS count,
    COUNT(DISTINCT employee_id) AS unique_count,
    AVG(employee_id) AS mean,
    MIN(employee_id) AS min,
    MAX(employee_id) AS max,
    STDDEV(employee_id) AS stddev,
    SUM(employee_id) AS sum,
    COUNT(*) - COUNT(employee_id) AS null_count,
    PERCENTILE_CONT(employee_id, 0.25) OVER() AS percentile_25,
    PERCENTILE_CONT(employee_id, 0.50) OVER() AS percentile_50,
    PERCENTILE_CONT(employee_id, 0.75) OVER() AS percentile_75
  FROM `your_project.your_dataset.employees`
  
  UNION ALL
  
  SELECT 
    'salary' AS column_name,
    COUNT(salary) AS count,
    COUNT(DISTINCT salary) AS unique_count,
    AVG(salary) AS mean,
    MIN(salary) AS min,
    MAX(salary) AS max,
    STDDEV(salary) AS stddev,
    SUM(salary) AS sum,
    COUNT(*) - COUNT(salary) AS null_count,
    PERCENTILE_CONT(salary, 0.25) OVER() AS percentile_25,
    PERCENTILE_CONT(salary, 0.50) OVER() AS percentile_50,
    PERCENTILE_CONT(salary, 0.75) OVER() AS percentile_75
  FROM `your_project.your_dataset.employees`
YuichiYuichi

数値型カラムの統計値取得

DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;

-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
  SELECT ARRAY_AGG(
    STRUCT(
      column_name,
      'SELECT "' || column_name || '" AS column_name, '
        || 'COUNT(' || column_name || ') AS count, '
        || 'COUNT(DISTINCT ' || column_name || ') AS unique_count, '
        || 'AVG(' || column_name || ' ) AS mean, '
        || 'MIN(' || column_name || ') AS min, '
        || 'MAX(' || column_name || ') AS max, '
        || 'STDDEV(' || column_name || ' ) AS stddev, '
        || 'SUM(' || column_name || ' ) AS sum, '
        || 'COUNT(*) - COUNT(' || column_name || ') AS null_count, '
        || 'FROM `your_project.your_dataset.employees` '
    )
  )
  FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
  WHERE table_name = "employees"
    AND data_type IN ('INT64', 'FLOAT64', 'NUMERIC', 'BIGNUMERIC')
);

-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
  SELECT STRING_AGG(query_string, ' UNION ALL ')
  FROM UNNEST(column_list)
);

EXECUTE IMMEDIATE dynamic_query;
YuichiYuichi

サンプル値取得

DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;

-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
  SELECT ARRAY_AGG(
    STRUCT(
      column_name,
      'SELECT "' || column_name || '" AS column_name, ' ||
      'STRING_AGG(sample_value, ", " ORDER BY sample_value) AS sample_values ' ||
      'FROM ( ' ||
      'SELECT DISTINCT CAST(' || column_name || ' AS STRING) AS sample_value ' ||
      'FROM `your_project.your_dataset.employees` ' ||
      'WHERE RAND() <= 0.01 LIMIT 10' ||
      ') AS sampled_data'
    )
  )
  FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
  WHERE table_name = "employees"
);

-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
  SELECT STRING_AGG(query_string, ' UNION ALL ')
  FROM UNNEST(column_list)
);

EXECUTE IMMEDIATE dynamic_query;

サンプル値取得+α

DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;

-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
  SELECT ARRAY_AGG(
    STRUCT(
      column_name,
      'SELECT "' || column_name || '" AS column_name, ' ||
      'COUNT(' || column_name || ') AS count, ' ||
      'COUNT(DISTINCT ' || column_name || ') AS unique_count, ' ||
      'SUM(CASE WHEN ' || column_name || ' IS NULL THEN 1 ELSE 0 END) AS null_count, ' ||
      '(SELECT STRING_AGG(DISTINCT CAST(' || column_name || ' AS STRING), " | ") ' ||
      ' FROM `your_project.your_dataset.employees` ' ||
      ' WHERE RAND() <= 0.01 AND ' || column_name || ' IS NOT NULL ' ||
      ' LIMIT 10) AS sample_values ' ||
      'FROM `your_project.your_dataset.employees`'
    )
  )
  FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
  WHERE table_name = "employees"
);

-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
  SELECT STRING_AGG(query_string, ' UNION ALL ')
  FROM UNNEST(column_list)
);

EXECUTE IMMEDIATE dynamic_query;
YuichiYuichi

テーブルリスト

-- テーブルリストを格納する配列を宣言
DECLARE table_list ARRAY<STRING>;
DECLARE dynamic_query STRING;

-- テーブルリストを取得
SET table_list = (
  SELECT ARRAY_AGG(CONCAT(catalog_name, '.', schema_name, '.__TABLES__'))
  FROM `PROJECT_ID`.`region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
);

-- 各テーブルのクエリを生成する配列を作成
SET dynamic_query = (
  SELECT STRING_AGG(
    'SELECT ' ||
      'project_id, ' ||
      'dataset_id, ' ||
      'table_id, ' ||
      'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(creation_time), "Asia/Tokyo") AS creation_time, ' ||
      'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(creation_time), DAY) AS days_since_creation, ' ||
      'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(last_modified_time), "Asia/Tokyo") AS last_modified_time, ' ||
      'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(last_modified_time), DAY) AS days_since_last_modification, ' ||
      'row_count, ' ||
      'size_bytes, ' ||
      'size_bytes / 1024 / 1024 / 1024 AS size_gb, ' ||
      'CASE type WHEN 1 THEN "table" WHEN 2 THEN "view" ELSE "" END AS table_type ' ||
    'FROM `' || table || '`'
    , ' UNION ALL ')
  FROM UNNEST(table_list) AS table
);

-- クエリの実行
EXECUTE IMMEDIATE dynamic_query;

カラム数追加

-- テーブルリストを格納する配列を宣言
DECLARE table_list ARRAY<STRING>;
DECLARE dynamic_query STRING;

-- テーブルリストを取得
SET table_list = (
  SELECT ARRAY_AGG(CONCAT(catalog_name, '.', schema_name, '.__TABLES__'))
  FROM `PROJECT_ID`.`region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
);

-- 各テーブルのクエリを生成する配列を作成
SET dynamic_query = (
  SELECT STRING_AGG(
    'SELECT ' ||
      'project_id, ' ||
      'dataset_id, ' ||
      'table_id, ' ||
      'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(creation_time), "Asia/Tokyo") AS creation_time, ' ||
      'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(creation_time), DAY) AS days_since_creation, ' ||
      'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(last_modified_time), "Asia/Tokyo") AS last_modified_time, ' ||
      'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(last_modified_time), DAY) AS days_since_last_modification, ' ||
      'c.column_count, ' ||
      'row_count, ' ||
      'size_bytes, ' ||
      'size_bytes / 1024 / 1024 / 1024 AS size_gb, ' ||
      'CASE type WHEN 1 THEN "table" WHEN 2 THEN "view" ELSE "" END AS table_type ' ||
    'FROM `' || table || '` t ' ||
    'LEFT JOIN ( SELECT table_name,COUNT(*) AS column_count ' ||
      'FROM `' || REPLACE(table, '.__TABLES__', '.INFORMATION_SCHEMA.COLUMNS') || '`' ||
      'GROUP BY table_name) as c ' ||
    'ON t.table_id = c.table_name ' 
    , ' UNION ALL ')
  FROM UNNEST(table_list) AS table
);

-- クエリの実行
EXECUTE IMMEDIATE dynamic_query;
YuichiYuichi

カラムリスト

DECLARE project_id STRING DEFAULT 'your_project_id'; -- プロジェクトIDを指定
DECLARE dataset_list ARRAY<STRING>;
DECLARE dynamic_query STRING; -- 各データセットのカラムリストを取得するクエリを生成

-- データセットリストを取得
SET dataset_list = (
  SELECT ARRAY_AGG(schema_name)
  FROM `region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
  WHERE catalog_name = project_id
);

SET dynamic_query = (
  SELECT STRING_AGG(
    'SELECT ' ||
    'table_catalog AS project_id, ' ||
    'table_schema AS dataset_id, ' ||
    'table_name, ' ||
    'column_name, ' ||
    'data_type ' ||
    'FROM `' || dataset || '.INFORMATION_SCHEMA.COLUMNS`'
  , ' UNION ALL ')
  FROM UNNEST(dataset_list) AS dataset
);

-- クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;
DECLARE project_id STRING DEFAULT 'your_project_id'; -- プロジェクトIDを指定
DECLARE dataset_list ARRAY<STRING>;
DECLARE dynamic_query STRING; -- 各データセットのカラムリストを取得するクエリを生成

-- データセットリストを取得
SET dataset_list = (
  SELECT ARRAY_AGG(schema_name)
  FROM `region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
  WHERE catalog_name = project_id
);

SET dynamic_query = (
  SELECT STRING_AGG(
    'SELECT * ' ||
    'FROM `' || dataset || '.INFORMATION_SCHEMA.COLUMNS`'
  , ' UNION ALL ')
  FROM UNNEST(dataset_list) AS dataset
);

-- クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;
YuichiYuichi

INFORMATION_SCHEMA.COLUMNS 強化版

DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;
DECLARE table_id STRING;
DECLARE table_name_string STRING;

SET table_id = "your_project.your_dataset.employees";  -- ここでデータセット指定
SET table_name_string = SPLIT(table_id, '.')[OFFSET(2)];

SET column_list = (
  SELECT
    ARRAY_AGG(
      STRUCT(
        column_name,
        'SELECT ' ||
        '"' || table_catalog || '" AS table_catalog, ' ||
        '"' || table_schema || '" AS table_schema, ' ||
        '"' || table_name || '" AS table_name, ' ||
        '"' || column_name || '" AS column_name, ' ||
        '"' || ordinal_position || '" AS ordinal_position, ' ||
        '"' || is_nullable || '" AS is_nullable, ' ||
        '"' || data_type || '" AS data_type, ' ||
        'COUNT(' || column_name || ') AS count, ' ||
        'COUNT(DISTINCT ' || column_name || ') AS unique_count, ' ||
        'SUM(CASE WHEN ' || column_name || ' IS NULL THEN 1 ELSE 0 END) AS null_count, ' ||
        CASE
          WHEN data_type IN ("INT64", "FLOAT64", "NUMERIC", "BIGNUMERIC") THEN
            'CAST(MIN(' || column_name || ') AS STRING) AS min, ' ||
            'CAST(MAX(' || column_name || ') AS STRING) AS max, ' ||
            'CAST(AVG(' || column_name || ') AS STRING) AS mean, ' ||
            'CAST(SUM(' || column_name || ') AS STRING) AS sum, ' ||
            'CAST(STDDEV(' || column_name || ') AS STRING) AS stddev, '
          WHEN data_type IN ("DATE", "DATETIME", "TIME", "TIMESTAMP") THEN
            'CAST(MIN(' || column_name || ') AS STRING) AS min, ' ||
            'CAST(MAX(' || column_name || ') AS STRING) AS max, ' ||
            'NULL AS mean, NULL AS sum, NULL AS stddev, '          
          ELSE
            'NULL AS min, NULL AS max, NULL AS mean, NULL AS sum, NULL AS stddev, '
        END ||
        '(SELECT STRING_AGG(DISTINCT CAST(' || column_name || ' AS STRING), " | ") ' ||
        ' FROM `' || table_id || '` ' ||
        ' WHERE RAND() <= 30.0 / (SELECT COUNT(*) FROM `' || table_id || '`)' ||
        ' AND ' || column_name || ' IS NOT NULL ' ||
        ' ) AS sample_values, ' ||
        'FROM `' || table_id || '`'
      )
    )
  FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`  -- ここでデータセット指定
  WHERE table_name = table_name_string
);

-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
  SELECT STRING_AGG(query_string, ' UNION ALL ')
  FROM UNNEST(column_list)
);

EXECUTE IMMEDIATE dynamic_query;
YuichiYuichi

カテゴリ値

ユニーク値で集計可否判断したかったができず、、、

DECLARE column_list ARRAY<STRING>;
DECLARE dynamic_query STRING;
DECLARE table_id STRING;

SET table_id = "your_project.your_dataset.employees";

-- 手動でカラム名を設定
SET column_list = ['column1', 'column2', 'columnN']; -- ここにカラム名を手動で追加

-- 各ユニーク値のカウントを集計するクエリを作成
SET dynamic_query = (
  SELECT STRING_AGG(
    'SELECT "' || column_name || '" AS column_name, CAST(' || column_name || ' AS STRING) AS value, COUNT(*) AS count ' ||
    'FROM `' || table_id || '` ' ||
    'GROUP BY ' || column_name,
    ' UNION ALL '
  )
  FROM UNNEST(column_list) AS column_name
);

-- 最終クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;