Closed1
DuckDBでJSONのVARCHARとるやつ
DuckDBはjqのように使える。
data.json
{
"id": "https://www.example.com/users/alice/outbox?page=true",
"partOf": "https://www.example.com/users/alice/outbox",
"type": "OrderedCollectionPage",
"totalItems": 1,
"orderedItems": [
{
"id": "https://www.example.com/notes/EXAMPLE/activity",
"actor": "https://www.example.com/users/alice",
"type": "Create",
"published": "2024-10-30T12:00:00.000Z",
"object": {
"id": "https://www.example.com/notes/EXAMPLE",
"type": "Note",
"attributedTo": "https://www.example.com/users/alice",
"content": "<p>Example</p>",
"published": "2024-10-30T12:00:00.000Z",
"to": [
"https://www.w3.org/ns/activitystreams#Public"
],
"cc": [
"https://www.example.com/users/alice/followers"
],
"inReplyTo": null,
"attachment": [
{
"type": "Document",
"mediaType": "image/png",
"url": "https://media.example.com/files/00000000-0000-0000-0000-000000000000.png",
"name": null,
"sensitive": false
}
],
"sensitive": false,
"tag": []
},
"to": [
"https://www.w3.org/ns/activitystreams#Public"
],
"cc": [
"https://www.example.com/users/alice/followers"
]
}
],
"prev": "https://www.example.com/users/alice/outbox?page=true&since_id=PREV",
"next": "https://www.example.com/users/alice/outbox?page=true&until_id=NEXT"
}
MisskeyのようなOrderedCollectionPageを例とする。
- data.jsonはdata.jsonl.gzにminifyしてgzip圧縮
-
@content
は省略 - idはNOT NULLだが、orderedItems, object, attachmentはNULLの可能性がある
-- $ duckdb -noheader -list -s "SELECT id FROM read_json_auto('data.jsonl.gz');"
-- https://www.example.com/users/alice/outbox?page=true
SELECT id
FROM read_json_auto('data.jsonl.gz');
-- $ duckdb -noheader -list -s "SELECT count(item) FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz'));"
-- 1
SELECT count(item)
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
);
-- $ duckdb -noheader -list -s "SELECT item->>'id' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz'));"
-- https://www.example.com/notes/EXAMPLE/activity
SELECT item->>'id'
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
);
-- $ duckdb -noheader -list -s "SELECT item->'object'->>'id' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'id' IS NOT NULL;"
-- https://www.example.com/notes/EXAMPLE
SELECT item->'object'->>'id'
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
)
WHERE item->'object'->>'id' IS NOT NULL;
-- $ duckdb -noheader -list -s "SELECT item->'object'->>'content' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'content' IS NOT NULL;"
-- <p>Example</p>
SELECT item->'object'->>'content'
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
)
WHERE item->'object'->>'content' IS NOT NULL;
-- $ duckdb -noheader -list -s "SELECT count(attachment) FROM (SELECT unnest(json_extract(item->'object'->>'attachment', '\$[*]')) AS attachment FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')));"
-- 1
SELECT count(attachment)
FROM (
SELECT unnest(
json_extract(item->'object'->>'attachment', '$[*]')
) AS attachment
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
)
);
-- $ duckdb -noheader -list -s "SELECT attachment->>'url' FROM (SELECT unnest(json_extract(item->'object'->>'attachment', '\$[*]')) AS attachment FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')));"
-- https://media.example.com/files/00000000-0000-0000-0000-000000000000.png
SELECT attachment->>'url'
FROM (
SELECT unnest(
json_extract(item->'object'->>'attachment', '$[*]')
) AS attachment
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
)
);
-- attachmentがarrayではなくobjectの場合(Holloなど)
-- $duckdb -noheader -list -s "SELECT item->'object'->'attachment'->>'url' FROM (SELECT unnest(orderedItems) AS item FROM read_json_auto('data.jsonl.gz')) WHERE item->'object'->>'attachment' IS NOT NULL;"
-- https://media.example.com/files/00000000-0000-0000-0000-000000000000.png
SELECT item->'object'->'attachment'->>'url'
FROM (
SELECT unnest(orderedItems) AS item
FROM read_json_auto('data.jsonl.gz')
)
WHERE item->'object'->>'attachment' IS NOT NULL;
このスクラップは27日前にクローズされました