Closed1

DuckDBでJSONのVARCHARとるやつ

黒ヰ樹黒ヰ樹

DuckDBはjqのように使える。

data.json
{
  "id": "https://www.example.com/users/alice/outbox?page=true",
  "partOf": "https://www.example.com/users/alice/outbox",
  "type": "OrderedCollectionPage",
  "totalItems": 1,
  "orderedItems": [
    {
      "id": "https://www.example.com/notes/EXAMPLE/activity",
      "actor": "https://www.example.com/users/alice",
      "type": "Create",
      "published": "2024-10-30T12:00:00.000Z",
      "object": {
        "id": "https://www.example.com/notes/EXAMPLE",
        "type": "Note",
        "attributedTo": "https://www.example.com/users/alice",
        "content": "<p>Example</p>",
        "published": "2024-10-30T12:00:00.000Z",
        "to": [
          "https://www.w3.org/ns/activitystreams#Public"
        ],
        "cc": [
          "https://www.example.com/users/alice/followers"
        ],
        "inReplyTo": null,
        "attachment": [
          {
            "type": "Document",
            "mediaType": "image/png",
            "url": "https://media.example.com/files/00000000-0000-0000-0000-000000000000.png",
            "name": null,
            "sensitive": false
          }
        ],
        "sensitive": false,
        "tag": []
      },
      "to": [
        "https://www.w3.org/ns/activitystreams#Public"
      ],
      "cc": [
        "https://www.example.com/users/alice/followers"
      ]
    }
  ],
  "prev": "https://www.example.com/users/alice/outbox?page=true&since_id=PREV",
  "next": "https://www.example.com/users/alice/outbox?page=true&until_id=NEXT"
}

MisskeyのようなOrderedCollectionPageを例とする。

  • data.jsonはdata.jsonl.gzにminifyしてgzip圧縮
  • @contentは省略
  • idはNOT NULLだが、orderedItems, object, attachmentはNULLの可能性がある
-- $ duckdb -noheader -list -s "SELECT id FROM read_json_auto('data.jsonl.gz');"
-- https://example.com/users/alice/outbox?page=true

SELECT id
FROM read_json_auto('data.jsonl.gz');
-- $ duckdb -noheader -list -s "SELECT count(item) FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz'));"
-- 1

SELECT count(item)
FROM (
    SELECT unnest(orderedItems) AS item
    FROM read_json_auto('data.jsonl.gz')
  );
-- $ duckdb -noheader -list -s "SELECT item->>'id' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz'));"
-- https://www.example.com/notes/EXAMPLE/activity

SELECT item->>'id'
FROM (
    SELECT unnest(orderedItems) AS item
    FROM read_json_auto('data.jsonl.gz')
  );
-- $ duckdb -noheader -list -s "SELECT item->'object'->>'id' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'id' IS NOT NULL;"
-- https://www.example.com/notes/EXAMPLE

SELECT item->'object'->>'id'
FROM (
    SELECT unnest(orderedItems) AS item
    FROM read_json_auto('data.jsonl.gz')
  )
WHERE item->'object'->>'id' IS NOT NULL;
-- $ duckdb -noheader -list -s "SELECT item->'object'->>'content' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'content' IS NOT NULL;"
-- <p>Example</p>

SELECT item->'object'->>'content'
FROM (
    SELECT unnest(orderedItems) AS item
    FROM read_json_auto('data.jsonl.gz')
  )
WHERE item->'object'->>'content' IS NOT NULL;
-- $ duckdb -noheader -list -s "SELECT count(attachment) FROM (SELECT unnest(json_extract(item->'object'->>'attachment', '\$[*]')) AS attachment FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')));"
-- 1

SELECT count(attachment)
FROM (
    SELECT unnest(
        json_extract(item->'object'->>'attachment', '$[*]')
      ) AS attachment
    FROM (
        SELECT unnest(orderedItems) AS item
        FROM read_json_auto('data.jsonl.gz')
      )
  );
-- $ duckdb -noheader -list -s "SELECT attachment->>'url' FROM (SELECT unnest(json_extract(item->'object'->>'attachment', '\$[*]')) AS attachment FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')));"
-- https://media.example.com/files/00000000-0000-0000-0000-000000000000.png

SELECT attachment->>'url'
FROM (
    SELECT unnest(
        json_extract(item->'object'->>'attachment', '$[*]')
      ) AS attachment
    FROM (
        SELECT unnest(orderedItems) AS item
        FROM read_json_auto('data.jsonl.gz')
      )
  );

-- attachmentがarrayではなくobjectの場合(Holloなど)
-- $duckdb -noheader -list -s "SELECT item->'object'->'attachment'->>'url' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'attachment' IS NOT NULL;"
-- https://media.example.com/files/00000000-0000-0000-0000-000000000000.png

SELECT item->'object'->'attachment'->>'url'
FROM (
    SELECT unnest(orderedItems) AS item
    FROM read_json_auto('data.jsonl.gz')
  )
WHERE item->'object'->>'attachment' IS NOT NULL;
このスクラップは20日前にクローズされました