Open1

pandas.json_normalizeを使わずにpolarsだけでjson列をparseする方法

pigooosuke

pandasで書くとこうなる

import json

import pandas as pd
import polars as pl


# Sample frame: "details" holds one JSON document per row, and each document
# contains a nested "address" object.
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "details": [
        '{"age": 25, "city": "New York", "address": {"street": "123 Main St", "zip": "10001"}}',
        '{"age": 30, "city": "London", "address": {"street": "456 Elm St", "zip": "SW1A"}}',
        '{"age": 35, "city": "Tokyo", "address": {"street": "789 Oak St", "zip": "100-0001"}}',
    ],
}

df_sample = pl.DataFrame(data)

# Round-trip through pandas: decode every JSON string into a dict, let
# pd.json_normalize flatten the nested keys into dotted column names
# (e.g. "address.zip"), and place the result next to the remaining columns.
df_sample_pd = df_sample.to_pandas()
df_sample_flatten_pd = pd.concat(
    [
        df_sample_pd.drop("details", axis=1),
        pd.json_normalize(df_sample_pd["details"].apply(json.loads)),
    ],
    axis=1,
)
df_sample_flatten = pl.from_pandas(df_sample_flatten_pd)

# Raise the string display width so the values are not truncated.
# NOTE(review): `display` is not defined in this snippet; presumably the
# IPython/Jupyter builtin -- confirm the snippet is meant to run in a notebook.
with pl.Config(fmt_str_lengths=100):
    display(df_sample_flatten)

出力

# df_sample_flatten
name	age	city	address.street	address.zip
str	i64	str	str	str
"Alice"	25	"New York"	"123 Main St"	"10001"
"Bob"	30	"London"	"456 Elm St"	"SW1A"
"Charlie"	35	"Tokyo"	"789 Oak St"	"100-0001"

polarsだけで書く

import polars as pl
import json


def flatten_nested_struct(df: pl.DataFrame, nested_field: str) -> pl.DataFrame:
    """Replace a JSON string column with one column per leaf value.

    Every value of ``nested_field`` is decoded with ``json.loads``. Nested
    objects are expanded recursively, and each leaf column is named with the
    dot-joined path of keys that leads to it (e.g. ``address.zip``).

    Args:
        df: Frame that contains the JSON column.
        nested_field: Name of the column holding the JSON strings.

    Returns:
        The columns of ``df`` without ``nested_field``, followed by the
        flattened leaf columns.
    """

    def collect_leaf_frames(
        struct_series: pl.Series, prefix: str = ""
    ) -> list[pl.DataFrame]:
        """Return one single-column frame per non-struct leaf of ``struct_series``."""
        leaves = []
        # Iterating the unnested frame yields one Series per struct field, so
        # each field can be inspected directly without a second lookup by name.
        for field in struct_series.struct.unnest():
            path = prefix + field.name
            if field.dtype == pl.datatypes.Struct:
                # Descend into the nested object, extending the dotted prefix.
                leaves += collect_leaf_frames(field, prefix=path + ".")
            else:
                leaves.append(field.alias(path).to_frame())
        return leaves

    # NOTE(review): newer polars releases expose this operation as
    # `Series.map_elements` -- confirm against the pinned polars version.
    decoded = df.get_column(nested_field).apply(json.loads)

    # A single horizontal concat replaces the previous pairwise accumulation
    # that started from an empty frame.
    return pl.concat(
        [df.drop(nested_field), *collect_leaf_frames(decoded)],
        how="horizontal",
    )


# One JSON document per person; "address" is itself a nested object.
names = ["Alice", "Bob", "Charlie"]
details_json = [
    '{"age": 25, "city": "New York", "address": {"street": "123 Main St", "zip": "10001"}}',
    '{"age": 30, "city": "London", "address": {"street": "456 Elm St", "zip": "SW1A"}}',
    '{"age": 35, "city": "Tokyo", "address": {"street": "789 Oak St", "zip": "100-0001"}}',
]
data = {"name": names, "details": details_json}

# Build the sample frame and expand the JSON column into flat columns.
df_sample = pl.DataFrame(data)
df_sample_flatten = flatten_nested_struct(df_sample, nested_field="details")

# Raise the string display width so the values are not truncated.
with pl.Config(fmt_str_lengths=100):
    display(df_sample_flatten)

出力

# df_sample_flatten
name	address.street	address.zip	age	city
str	str	str	i64	str
"Alice"	"123 Main St"	"10001"	25	"New York"
"Bob"	"456 Elm St"	"SW1A"	30	"London"
"Charlie"	"789 Oak St"	"100-0001"	35	"Tokyo"