Closed6

DuckDBでR2 Data Catalog & DuckLakeを試す(with Neon)

sugar-catsugar-cat

まずはIcebergを扱えるR2 Data Catalogを使う(DWHとして使う)

https://developers.cloudflare.com/r2/data-catalog/config-examples/duckdb/#example-usage

とりあえずデータ投入

[project]
name = "duckdb-cf-iceberg"
version = "0.1.0"
description = "Iceberg data creation for DuckDB CloudFlare project"
requires-python = ">=3.13"
dependencies = [
    "pyarrow",
    "pyiceberg",
]

[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
uv sync

実行

import pyarrow as pa
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError

# Define catalog connection details (replace variables)
WAREHOUSE = "..."
TOKEN = "..."
CATALOG_URI = "..."

# Connect to R2 Data Catalog
catalog = RestCatalog(
    name="my_catalog",
    warehouse=WAREHOUSE,
    uri=CATALOG_URI,
    token=TOKEN,
)

# Create default namespace
catalog.create_namespace("default")

# Create simple PyArrow table
df = pa.table({
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
})

# Create an Iceberg table
test_table = ("default", "my_table")
table = catalog.create_table(
    test_table,
    schema=df.schema,
)

$ duckdb

DuckDB v1.3.2 (Ossivalis) 0b83e5d2f6
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
D INSTALL iceberg;
D LOAD iceberg;
D CREATE SECRET r2_secret (
      TYPE ICEBERG,
      TOKEN 'xxx'
  );
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘
D SHOW ALL TABLES;
┌──────────┬─────────┬─────────┬──────────────┬──────────────┬───────────┐
│ database │ schema  │  name   │ column_names │ column_types │ temporary │
│ varchar  │ varchar │ varchar │  varchar[]   │  varchar[]   │  boolean  │
├──────────┴─────────┴─────────┴──────────────┴──────────────┴───────────┤
│                                 0 rows                                 │
└────────────────────────────────────────────────────────────────────────┘
D ATTACH 'xxx' AS my_r2_catalog (
      TYPE ICEBERG,
      ENDPOINT 'xxx'
  );
D SHOW ALL TABLES;
┌───────────────┬─────────┬──────────┬───┬───────────────────┬───────────┐
│   database    │ schema  │   name   │ … │   column_types    │ temporary │
│    varchar    │ varchar │ varchar  │   │     varchar[]     │  boolean  │
├───────────────┼─────────┼──────────┼───┼───────────────────┼───────────┤
│ my_r2_catalog │ default │ my_table │ … │ [BIGINT, VARCHAR] │ false     │
├───────────────┴─────────┴──────────┴───┴───────────────────┴───────────┤
│ 1 rows                                             6 columns (5 shown) │
└────────────────────────────────────────────────────────────────────────┘
D SELECT * FROM my_r2_catalog.default.my_table;
┌───────┬─────────┐
│  id   │  name   │
│ int64 │ varchar │
├───────┼─────────┤
│     1 │ Alice   │
│     2 │ Bob     │
│     3 │ Charlie │
└───────┴─────────┘
sugar-catsugar-cat

メタデータレイヤーをNeon
ストレージレイヤーをR2
としてローカルのDuckDB拡張を利用して接続する

INSTALL ducklake;
LOAD ducklake;


CREATE OR REPLACE SECRET r2_secret (
    TYPE R2,
    KEY_ID 'xxx',
    SECRET 'xxx',
    ACCOUNT_ID 'xxx',
    ENDPOINT 'xxx'
);

ATTACH 'ducklake:postgres:dbname=xxx host=xxx.neon.tech user=neondb_owner password=xxx sslmode=require port=5432' 
AS lake (DATA_PATH 'r2://ducklake');

USE lake;

CREATE TABLE IF NOT EXISTS sales_data (
    id INTEGER,
    product VARCHAR,
    amount DECIMAL(10,2),
    sale_date DATE
);

INSERT INTO sales_data VALUES 
    (1, 'Product A', 100.50, '2025-01-01'),
    (2, 'Product B', 200.75, '2025-01-02'),
    (3, 'Product C', 150.25, '2025-01-03');

SELECT * FROM sales_data;
┌───────┬───────────┬───────────────┬────────────┐
│  id   │  product  │    amount     │ sale_date  │
│ int32 │  varchar  │ decimal(10,2) │    date    │
├───────┼───────────┼───────────────┼────────────┤
│     1 │ Product A │        100.50 │ 2025-01-01 │
│     2 │ Product B │        200.75 │ 2025-01-02 │
│     3 │ Product C │        150.25 │ 2025-01-03 │
└───────┴───────────┴───────────────┴────────────┘

SELECT * FROM ducklake_snapshots('lake');
┌─────────────┬────────────────────────────┬────────────────┬────────────────────────────────────┐
│ snapshot_id │       snapshot_time        │ schema_version │              changes               │
│    int64    │  timestamp with time zone  │     int64      │      map(varchar, varchar[])       │
├─────────────┼────────────────────────────┼────────────────┼────────────────────────────────────┤
│           0 │ 2025-07-25 00:53:24.574+09 │              0 │ {schemas_created=[main]}           │
│           1 │ 2025-07-25 00:53:36.446+09 │              1 │ {tables_created=[main.sales_data]} │
│           2 │ 2025-07-25 00:56:55.266+09 │              1 │ {tables_inserted_into=[1]}         │
└─────────────┴────────────────────────────┴────────────────┴────────────────────────────────────┘


sugar-catsugar-cat
  • ducklake_data_file
    • R2上の実データファイルへの参照を管理
=> select * from ducklake_data_file;
 data_file_id | table_id | begin_snapshot | end_snapshot | file_order |                         path                          | path_is_relative | file_format | record_count | file_size_bytes | footer_size | row_id_start | partition_id | encryption_key | partial_file_info | mapping_id 
--------------+----------+----------------+--------------+------------+-------------------------------------------------------+------------------+-------------+--------------+-----------------+-------------+--------------+--------------+----------------+-------------------+------------
            0 |        1 |              2 |              |            | ducklake-01983d26-e398-7c80-89f7-163517ae347f.parquet | t                | parquet     |            3 |             950 |         481 |            0 |              |                |                   |           
(1 row)

  • ducklake_table + ducklake_column
    • テーブルとカラムの定義(スキーマ情報)
=> select * from ducklake_table;
 table_id |              table_uuid              | begin_snapshot | end_snapshot | schema_id | table_name |    path     | path_is_relative 
----------+--------------------------------------+----------------+--------------+-----------+------------+-------------+------------------
        1 | 01983d23-daf5-7f68-97c9-34e99d11cf6f |              1 |              |         0 | sales_data | sales_data/ | t
(1 row)

=> select * from ducklake_column;
 column_id | begin_snapshot | end_snapshot | table_id | column_order | column_name |  column_type  | initial_default | default_value | nulls_allowed | parent_column 
-----------+----------------+--------------+----------+--------------+-------------+---------------+-----------------+---------------+---------------+---------------
         1 |              1 |              |        1 |            1 | id          | int32         |                 |               | t             |              
         2 |              1 |              |        1 |            2 | product     | varchar       |                 |               | t             |              
         3 |              1 |              |        1 |            3 | amount      | decimal(10,2) |                 |               | t             |              
         4 |              1 |              |        1 |            4 | sale_date   | date          |                 |               | t             |              
(4 rows)
  • ducklake_file_column_statistics
    • ファイルごとのカラム統計情報
=> select * from ducklake_file_column_statistics;
 data_file_id | table_id | column_id | column_size_bytes | value_count | null_count | min_value  | max_value  | contains_nan 
--------------+----------+-----------+-------------------+-------------+------------+------------+------------+--------------
            0 |        1 |         1 |                60 |           3 |          0 | 1          | 3          | 
            0 |        1 |         2 |                79 |           3 |          0 | Product A  | Product C  | 
            0 |        1 |         3 |                70 |           3 |          0 | 100.50     | 200.75     | 
            0 |        1 |         4 |                60 |           3 |          0 | 2025-01-01 | 2025-01-03 | 
(4 rows)
sugar-catsugar-cat

R2側
メタデータ側のpathに一致するobjectがストレージにある
ducklake-01983d26-e398-7c80-89f7-163517ae347f.parquet

このスクラップは1ヶ月前にクローズされました