Open9
SQL品質
より信頼できるクエリを書くために、SQLでもテストを書く
重複確認追記
WITH
users AS (
SELECT * FROM UNNEST([
STRUCT<id INT64,name STRING,age INT64,registered_date DATE>
(1,'aaa',25,DATE('2023-01-01'))
,(1,'bbb',30,DATE('2023-01-01'))
,(3,'ccc',20,DATE('2023-03-01'))
,(4,'ddd',43,DATE('2023-07-01'))
,(null,'eee',18,DATE('2023-12-01'))
])
)
,test AS (
SELECT
-- ageにnullがない
CASE
WHEN EXISTS (
SELECT 1
FROM users
WHERE id IS NULL
) THEN ERROR('idにnullが存在します')
ELSE '問題なし'
END AS test1
-- idに重複がない
,CASE
WHEN EXISTS (
SELECT id,COUNT(id) AS count
FROM users
GROUP BY id
HAVING COUNT(id) > 1
) THEN ERROR('idに重複が存在します')
ELSE '問題なし'
END AS test2
)
SELECT *
FROM users
CROSS JOIN test
結局中身が見たい
WITH
users AS (
SELECT * FROM UNNEST([
STRUCT<id INT64,name STRING,age INT64,registered_date DATE>
(1,'aaa',25,DATE('2023-01-01'))
,(1,'bbb',30,DATE('2023-01-01'))
,(3,'ccc',20,DATE('2023-03-01'))
,(4,'ddd',43,DATE('2023-07-01'))
,(null,'eee',18,DATE('2023-12-01'))
])
)
SELECT *
FROM users
where 1 = 1
AND id IN (
SELECT id
FROM users
GROUP BY id
HAVING COUNT(id) > 1
)
OR id IS NULL
ORDER BY id
SQLで統計情報取得
- MySQL
SELECT
'id' AS column_name,
COUNT(id) AS count_value,
COUNT(DISTINCT id) AS unique_count,
COUNT(*) - COUNT(id) AS null_value,
STDDEV(id) AS stddev,
AVG(id) AS avg_value,
MIN(id) AS min_value,
MAX(id) AS max_value
FROM employees
UNION ALL
SELECT
'salary' AS column_name,
COUNT(salary) AS count_value,
COUNT(DISTINCT salary) AS unique_count,
COUNT(*) - COUNT(salary) AS null_value,
STDDEV(salary) AS stddev,
AVG(salary) AS avg_value,
MIN(salary) AS min_value,
MAX(salary) AS max_value
FROM employees
- bigquery
SELECT
'employee_id' AS column_name,
COUNT(employee_id) AS count,
COUNT(DISTINCT employee_id) AS unique_count,
AVG(employee_id) AS mean,
MIN(employee_id) AS min,
MAX(employee_id) AS max,
STDDEV(employee_id) AS stddev,
SUM(employee_id) AS sum,
COUNT(*) - COUNT(employee_id) AS null_count,
PERCENTILE_CONT(employee_id, 0.25) OVER() AS percentile_25,
PERCENTILE_CONT(employee_id, 0.50) OVER() AS percentile_50,
PERCENTILE_CONT(employee_id, 0.75) OVER() AS percentile_75
FROM `your_project.your_dataset.employees`
UNION ALL
SELECT
'salary' AS column_name,
COUNT(salary) AS count,
COUNT(DISTINCT salary) AS unique_count,
AVG(salary) AS mean,
MIN(salary) AS min,
MAX(salary) AS max,
STDDEV(salary) AS stddev,
SUM(salary) AS sum,
COUNT(*) - COUNT(salary) AS null_count,
PERCENTILE_CONT(salary, 0.25) OVER() AS percentile_25,
PERCENTILE_CONT(salary, 0.50) OVER() AS percentile_50,
PERCENTILE_CONT(salary, 0.75) OVER() AS percentile_75
FROM `your_project.your_dataset.employees`
数値型カラムの統計値取得
DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;
-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
SELECT ARRAY_AGG(
STRUCT(
column_name,
'SELECT "' || column_name || '" AS column_name, '
|| 'COUNT(' || column_name || ') AS count, '
|| 'COUNT(DISTINCT ' || column_name || ') AS unique_count, '
|| 'AVG(' || column_name || ' ) AS mean, '
|| 'MIN(' || column_name || ') AS min, '
|| 'MAX(' || column_name || ') AS max, '
|| 'STDDEV(' || column_name || ' ) AS stddev, '
|| 'SUM(' || column_name || ' ) AS sum, '
|| 'COUNT(*) - COUNT(' || column_name || ') AS null_count, '
|| 'FROM `your_project.your_dataset.employees` '
)
)
FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = "employees"
AND data_type IN ('INT64', 'FLOAT64', 'NUMERIC', 'BIGNUMERIC')
);
-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
SELECT STRING_AGG(query_string, ' UNION ALL ')
FROM UNNEST(column_list)
);
EXECUTE IMMEDIATE dynamic_query;
サンプル値取得
DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;
-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
SELECT ARRAY_AGG(
STRUCT(
column_name,
'SELECT "' || column_name || '" AS column_name, ' ||
'STRING_AGG(sample_value, ", " ORDER BY sample_value) AS sample_values ' ||
'FROM ( ' ||
'SELECT DISTINCT CAST(' || column_name || ' AS STRING) AS sample_value ' ||
'FROM `your_project.your_dataset.employees` ' ||
'WHERE RAND() <= 0.01 LIMIT 10' ||
') AS sampled_data'
)
)
FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = "employees"
);
-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
SELECT STRING_AGG(query_string, ' UNION ALL ')
FROM UNNEST(column_list)
);
EXECUTE IMMEDIATE dynamic_query;
サンプル値取得+α
DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;
-- 数値型のカラムリストと各カラムごとのクエリを取得
SET column_list = (
SELECT ARRAY_AGG(
STRUCT(
column_name,
'SELECT "' || column_name || '" AS column_name, ' ||
'COUNT(' || column_name || ') AS count, ' ||
'COUNT(DISTINCT ' || column_name || ') AS unique_count, ' ||
'SUM(CASE WHEN ' || column_name || ' IS NULL THEN 1 ELSE 0 END) AS null_count, ' ||
'(SELECT STRING_AGG(DISTINCT CAST(' || column_name || ' AS STRING), " | ") ' ||
' FROM `your_project.your_dataset.employees` ' ||
' WHERE RAND() <= 0.01 AND ' || column_name || ' IS NOT NULL ' ||
' LIMIT 10) AS sample_values ' ||
'FROM `your_project.your_dataset.employees`'
)
)
FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = "employees"
);
-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
SELECT STRING_AGG(query_string, ' UNION ALL ')
FROM UNNEST(column_list)
);
EXECUTE IMMEDIATE dynamic_query;
テーブルリスト
-- テーブルリストを格納する配列を宣言
DECLARE table_list ARRAY<STRING>;
DECLARE dynamic_query STRING;
-- テーブルリストを取得
SET table_list = (
SELECT ARRAY_AGG(CONCAT(catalog_name, '.', schema_name, '.__TABLES__'))
FROM `PROJECT_ID`.`region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
);
-- 各テーブルのクエリを生成する配列を作成
SET dynamic_query = (
SELECT STRING_AGG(
'SELECT ' ||
'project_id, ' ||
'dataset_id, ' ||
'table_id, ' ||
'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(creation_time), "Asia/Tokyo") AS creation_time, ' ||
'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(creation_time), DAY) AS days_since_creation, ' ||
'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(last_modified_time), "Asia/Tokyo") AS last_modified_time, ' ||
'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(last_modified_time), DAY) AS days_since_last_modification, ' ||
'row_count, ' ||
'size_bytes, ' ||
'size_bytes / 1024 / 1024 / 1024 AS size_gb, ' ||
'CASE type WHEN 1 THEN "table" WHEN 2 THEN "view" ELSE "" END AS table_type ' ||
'FROM `' || table || '`'
, ' UNION ALL ')
FROM UNNEST(table_list) AS table
);
-- クエリの実行
EXECUTE IMMEDIATE dynamic_query;
カラム数追加
-- テーブルリストを格納する配列を宣言
DECLARE table_list ARRAY<STRING>;
DECLARE dynamic_query STRING;
-- テーブルリストを取得
SET table_list = (
SELECT ARRAY_AGG(CONCAT(catalog_name, '.', schema_name, '.__TABLES__'))
FROM `PROJECT_ID`.`region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
);
-- 各テーブルのクエリを生成する配列を作成
SET dynamic_query = (
SELECT STRING_AGG(
'SELECT ' ||
'project_id, ' ||
'dataset_id, ' ||
'table_id, ' ||
'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(creation_time), "Asia/Tokyo") AS creation_time, ' ||
'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(creation_time), DAY) AS days_since_creation, ' ||
'FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_MILLIS(last_modified_time), "Asia/Tokyo") AS last_modified_time, ' ||
'TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), TIMESTAMP_MILLIS(last_modified_time), DAY) AS days_since_last_modification, ' ||
'c.column_count, ' ||
'row_count, ' ||
'size_bytes, ' ||
'size_bytes / 1024 / 1024 / 1024 AS size_gb, ' ||
'CASE type WHEN 1 THEN "table" WHEN 2 THEN "view" ELSE "" END AS table_type ' ||
'FROM `' || table || '` t ' ||
'LEFT JOIN ( SELECT table_name,COUNT(*) AS column_count ' ||
'FROM `' || REPLACE(table, '.__TABLES__', '.INFORMATION_SCHEMA.COLUMNS') || '`' ||
'GROUP BY table_name) as c ' ||
'ON t.table_id = c.table_name '
, ' UNION ALL ')
FROM UNNEST(table_list) AS table
);
-- クエリの実行
EXECUTE IMMEDIATE dynamic_query;
カラムリスト
DECLARE project_id STRING DEFAULT 'your_project_id'; -- プロジェクトIDを指定
DECLARE dataset_list ARRAY<STRING>;
DECLARE dynamic_query STRING; -- 各データセットのカラムリストを取得するクエリを生成
-- データセットリストを取得
SET dataset_list = (
SELECT ARRAY_AGG(schema_name)
FROM `region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
WHERE catalog_name = project_id
);
SET dynamic_query = (
SELECT STRING_AGG(
'SELECT ' ||
'table_catalog AS project_id, ' ||
'table_schema AS dataset_id, ' ||
'table_name, ' ||
'column_name, ' ||
'data_type ' ||
'FROM `' || dataset || '.INFORMATION_SCHEMA.COLUMNS`'
, ' UNION ALL ')
FROM UNNEST(dataset_list) AS dataset
);
-- クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;
DECLARE project_id STRING DEFAULT 'your_project_id'; -- プロジェクトIDを指定
DECLARE dataset_list ARRAY<STRING>;
DECLARE dynamic_query STRING; -- 各データセットのカラムリストを取得するクエリを生成
-- データセットリストを取得
SET dataset_list = (
SELECT ARRAY_AGG(schema_name)
FROM `region-asia-northeast1`.INFORMATION_SCHEMA.SCHEMATA
WHERE catalog_name = project_id
);
SET dynamic_query = (
SELECT STRING_AGG(
'SELECT * ' ||
'FROM `' || dataset || '.INFORMATION_SCHEMA.COLUMNS`'
, ' UNION ALL ')
FROM UNNEST(dataset_list) AS dataset
);
-- クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;
INFORMATION_SCHEMA.COLUMNS 強化版
DECLARE column_list ARRAY<STRUCT<column_name STRING, query_string STRING>>;
DECLARE dynamic_query STRING;
DECLARE table_id STRING;
DECLARE table_name_string STRING;
SET table_id = "your_project.your_dataset.employees"; -- ここでデータセット指定
SET table_name_string = SPLIT(table_id, '.')[OFFSET(2)];
SET column_list = (
SELECT
ARRAY_AGG(
STRUCT(
column_name,
'SELECT ' ||
'"' || table_catalog || '" AS table_catalog, ' ||
'"' || table_schema || '" AS table_schema, ' ||
'"' || table_name || '" AS table_name, ' ||
'"' || column_name || '" AS column_name, ' ||
'"' || ordinal_position || '" AS ordinal_position, ' ||
'"' || is_nullable || '" AS is_nullable, ' ||
'"' || data_type || '" AS data_type, ' ||
'COUNT(' || column_name || ') AS count, ' ||
'COUNT(DISTINCT ' || column_name || ') AS unique_count, ' ||
'SUM(CASE WHEN ' || column_name || ' IS NULL THEN 1 ELSE 0 END) AS null_count, ' ||
CASE
WHEN data_type IN ("INT64", "FLOAT64", "NUMERIC", "BIGNUMERIC") THEN
'CAST(MIN(' || column_name || ') AS STRING) AS min, ' ||
'CAST(MAX(' || column_name || ') AS STRING) AS max, ' ||
'CAST(AVG(' || column_name || ') AS STRING) AS mean, ' ||
'CAST(SUM(' || column_name || ') AS STRING) AS sum, ' ||
'CAST(STDDEV(' || column_name || ') AS STRING) AS stddev, '
WHEN data_type IN ("DATE", "DATETIME", "TIME", "TIMESTAMP") THEN
'CAST(MIN(' || column_name || ') AS STRING) AS min, ' ||
'CAST(MAX(' || column_name || ') AS STRING) AS max, ' ||
'NULL AS mean, NULL AS sum, NULL AS stddev, '
ELSE
'NULL AS min, NULL AS max, NULL AS mean, NULL AS sum, NULL AS stddev, '
END ||
'(SELECT STRING_AGG(DISTINCT CAST(' || column_name || ' AS STRING), " | ") ' ||
' FROM `' || table_id || '` ' ||
' WHERE RAND() <= 30.0 / (SELECT COUNT(*) FROM `' || table_id || '`)' ||
' AND ' || column_name || ' IS NOT NULL ' ||
' ) AS sample_values, ' ||
'FROM `' || table_id || '`'
)
)
FROM `your_project.your_dataset.INFORMATION_SCHEMA.COLUMNS` -- ここでデータセット指定
WHERE table_name = table_name_string
);
-- 各カラムのクエリを実行して結果を出力
SET dynamic_query = (
SELECT STRING_AGG(query_string, ' UNION ALL ')
FROM UNNEST(column_list)
);
EXECUTE IMMEDIATE dynamic_query;
カテゴリ値
ユニーク値で集計可否判断したかったができず、、、
DECLARE column_list ARRAY<STRING>;
DECLARE dynamic_query STRING;
DECLARE table_id STRING;
SET table_id = "your_project.your_dataset.employees";
-- 手動でカラム名を設定
SET column_list = ['column1', 'column2', 'columnN']; -- ここにカラム名を手動で追加
-- 各ユニーク値のカウントを集計するクエリを作成
SET dynamic_query = (
SELECT STRING_AGG(
'SELECT "' || column_name || '" AS column_name, CAST(' || column_name || ' AS STRING) AS value, COUNT(*) AS count ' ||
'FROM `' || table_id || '` ' ||
'GROUP BY ' || column_name,
' UNION ALL '
)
FROM UNNEST(column_list) AS column_name
);
-- 最終クエリを実行して結果を表示
EXECUTE IMMEDIATE dynamic_query;