Open1
pandas.json_normalizeを使わずにpolarsだけでjson列をparseする方法
pandasで書くとこうなる
import json

import pandas as pd
import polars as pl
# Sample input: one plain column plus one column of JSON-encoded strings
# whose objects contain a nested "address" object.
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "details": [
        '{"age": 25, "city": "New York", "address": {"street": "123 Main St", "zip": "10001"}}',
        '{"age": 30, "city": "London", "address": {"street": "456 Elm St", "zip": "SW1A"}}',
        '{"age": 35, "city": "Tokyo", "address": {"street": "789 Oak St", "zip": "100-0001"}}',
    ],
}
df_sample = pl.DataFrame(data)

# Round-trip through pandas: decode every JSON string with json.loads
# (requires the `json` import at the top of the snippet), let
# pd.json_normalize flatten the decoded objects, and put the result next to
# the remaining columns.
df_sample_pd = df_sample.to_pandas()
df_sample_flatten_pd = pd.concat(
    [
        df_sample_pd.drop("details", axis=1),
        pd.json_normalize(df_sample_pd["details"].apply(json.loads)),
    ],
    axis=1,
)
df_sample_flatten = pl.from_pandas(df_sample_flatten_pd)

# Widen the string display limit so the values are not truncated.
# `display` is the notebook helper; it is not defined in this snippet.
with pl.Config(fmt_str_lengths=100):
    display(df_sample_flatten)
出力
# df_sample_flatten
name age city address.street address.zip
str i64 str str str
"Alice" 25 "New York" "123 Main St" "10001"
"Bob" 30 "London" "456 Elm St" "SW1A"
"Charlie" 35 "Tokyo" "789 Oak St" "100-0001"
polarsだけで書く(pandasは使わず、JSON文字列のデコードには標準ライブラリのjsonを使う)
import polars as pl
import json
def flatten_nested_struct(df: pl.DataFrame, nested_field: str) -> pl.DataFrame:
    """Expand a column of JSON strings into flat, dot-separated columns.

    Every value of ``nested_field`` is decoded with ``json.loads``. The
    resulting struct column is unnested recursively; fields of nested structs
    are named ``"<parent>.<child>"``.

    Args:
        df: Frame containing the JSON string column.
        nested_field: Name of the column holding the JSON strings.

    Returns:
        A new frame with ``nested_field`` dropped and the flattened columns
        appended to the right of the remaining columns.
    """

    def unnest_struct_fields(struct_series: pl.Series, prefix: str = "") -> list[pl.DataFrame]:
        """Return one single-column frame per leaf field of ``struct_series``."""
        leaf_frames = []
        # Iterating the unnested frame yields one Series per struct field.
        for field in struct_series.struct.unnest():
            if field.dtype == pl.datatypes.Struct:
                # Recurse into the nested struct, extending the name prefix.
                leaf_frames += unnest_struct_fields(field, prefix=f"{prefix}{field.name}.")
            else:
                leaf_frames.append(pl.DataFrame(field, schema=[prefix + field.name]))
        return leaf_frames

    json_column = df.get_column(nested_field)
    # Newer polars renamed Series.apply to Series.map_elements; use whichever
    # this installation provides so the function runs on both.
    apply_elementwise = getattr(json_column, "map_elements", None) or json_column.apply
    decoded = apply_elementwise(json.loads)

    # Concatenate everything in a single call instead of growing a frame one
    # column at a time.
    return pl.concat(
        [df.drop(nested_field), *unnest_struct_fields(decoded)],
        how="horizontal",
    )
# Same sample input as the pandas version: a "details" column of JSON
# strings, each containing a nested "address" object.
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "details": [
        '{"age": 25, "city": "New York", "address": {"street": "123 Main St", "zip": "10001"}}',
        '{"age": 30, "city": "London", "address": {"street": "456 Elm St", "zip": "SW1A"}}',
        '{"age": 35, "city": "Tokyo", "address": {"street": "789 Oak St", "zip": "100-0001"}}',
    ],
}
df_sample = pl.DataFrame(data)

# Flatten the JSON column without going through pandas.
df_sample_flatten = flatten_nested_struct(df_sample, nested_field="details")

# Widen the string display limit so the values are not truncated.
with pl.Config(fmt_str_lengths=100):
    display(df_sample_flatten)
出力
# df_sample_flatten
name address.street address.zip age city
str str str i64 str
"Alice" "123 Main St" "10001" 25 "New York"
"Bob" "456 Elm St" "SW1A" 30 "London"
"Charlie" "789 Oak St" "100-0001" 35 "Tokyo"