💽

データサイエンス100本ノック(構造化データ加工編)(初級)

2023/05/11に公開

[!Important]+ Goals
一般社団法人データサイエンス協会がGitHubに公開した「データサイエンス100のノック(構造化データ加工編)」の問題演習を行うコースです。 このコースでは、行や列の操作、結合、アスペクト変換、四則演算などを学ぶことができます。
ビジネス現場での分析実務では、データのほとんどが構造化データです。構造化データを自由に加工・集計するスキルを磨き、データサイエンスの実践力を高めましょう。

[!abstract]+ Curriculum
1.列に対する操作
2.行に対する操作
3.結合
4.縦横変換
5.四捨五入演算
6.正規化、非正規化
7.ファイル入出力


熱に対する操作

#pd/rename

#pd/rename

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.rename(columns={'sales_ymd':'sales_date'})[["sales_date", "customer_id","product_cd","amount"]].head(10))

行に対する操作

#pd/クエリ

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt[["sales_ymd", "customer_id","product_cd","quantity","amount"]]
        .query("customer_id=='CS018205000001' & (amount >= 1000 | quantity >= 5)"))

結合

#pd/merge

内部結合

#pd/merge/inner

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(pd.merge(df_product
         , df_category[['category_major_cd', 'category_medium_cd','category_small_cd','category_small_name']]
         , how='inner', on=['category_major_cd', 'category_medium_cd','category_small_cd']).head(10))
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(
    pd.merge(df_product, df_category,
	    on = ["category_major_cd", "category_medium_cd", "category_small_cd"], how = 'inner')
	    [df_product.columns.append(pd.Index(['category_small_name']))]
    .head(10))

左側外部結合

#pd/merge/left #pd/sort #pd/query

女性会員別購入金額を算出する

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

df_amount_sum = df_receipt.groupby('customer_id').amount.sum().reset_index()
df_tmp = df_customer.query('gender_cd == "1" and not customer_id.str.startswith("Z")', engine='python')
print( pd.merge(df_tmp['customer_id'], df_amount_sum, how='left', on='customer_id').fillna(0).head(10) )

完全外部結合

#pd/merge/outer #pd/duplicated #pd/sort #pd/query

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

df_sum_1 = df_receipt.groupby("customer_id").amount.sum().reset_index()
df_sum_2 = df_sum_1.query('not customer_id.str.startswith("Z")', engine='python')
df_sum = df_sum_2.sort_values(by="amount",ascending=False).head(20)

df_cnt_1 = df_receipt[~df_receipt.duplicated(subset=["customer_id", "sales_ymd"])]
df_cnt_2 = df_cnt_1.query('not customer_id.str.startswith("Z")', engine='python')
df_cnt_3 = df_cnt_2.groupby("customer_id").sales_ymd.count().reset_index()
df_cnt = df_cnt_3.sort_values(by="sales_ymd", ascending=False).head(20)

# 完全外部結合を行う引数を記入してください
df_merged = pd.merge(df_sum, df_cnt, on="customer_id", how='outer')
print(df_merged)

クロス結合

#pd/コピー

すべての店舗と商品の組み合わせ

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

tmp_df_store = df_store.copy()
tmp_df_product = df_product.copy()
tmp_df_store["key"] = 0
tmp_df_product["key"] = 0

# 店舗(df_store)と商品(df_product)を直積した件数を出力させてください
print(len(pd.merge(tmp_df_store, tmp_df_product, on = "key", how='outer')))

磁気結合による時系列の移し替え

前日比 売上増減

#pd/shift #pd/rename #pd/concat

#pd/shift #pd/rename #pd/concat

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# "df_sales_amount_by_date "に処理後のデータを代入してください
df_sales_amount_by_date = df_receipt[["sales_ymd", "amount"]].groupby("sales_ymd").sum().reset_index()
df_sales_amount_by_date = pd.concat([df_sales_amount_by_date, df_sales_amount_by_date.shift()], axis=1)

# カラム名を指定します。書き換える必要はありません
df_sales_amount_by_date.columns = ['sales_ymd','amount','lag_ymd','lag_amount']
# "diff_amount"カラムに売上金額増減を代入します
df_sales_amount_by_date['diff_amount'] = df_sales_amount_by_date.amount-df_sales_amount_by_date.lag_amount
print(df_sales_amount_by_date.head(10))

3日前の売上まで同時に見る

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


df_sales_amount_by_date = df_receipt[['sales_ymd', 'amount']].groupby('sales_ymd').sum().reset_index()
for i in range(1, 4):
    if i == 1:
        df_lag = pd.concat([df_sales_amount_by_date, df_sales_amount_by_date.shift(i)],axis=1)
    else:
        df_lag = pd.concat([df_lag, df_sales_amount_by_date.shift(i)],axis=1)
# カラム名を指定します。書き換える必要はありません
df_lag.columns = ['sales_ymd', 'amount', 'lag_ymd_1', 'lag_amount_1', 'lag_ymd_2', 'lag_amount_2', 'lag_ymd_3', 'lag_amount_3']
print(df_lag.dropna().sort_values(['sales_ymd']).head(10))

縦横変換

縦から横へ

#pd/pivot

import pandas as pd
import math
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

df_tmp = pd.merge(df_receipt, df_customer, how ='inner', on="customer_id")
df_tmp['era'] = df_tmp['age'].apply(lambda x: math.floor(x / 10) * 10)
df_sales_summary = pd.pivot_table(df_tmp, index='era', columns='gender_cd', values='amount', aggfunc='sum').reset_index()
# カラム名を指定します。書き換える必要はありません
df_sales_summary.columns = ['era', 'male', 'female', 'unknown']
print(df_sales_summary)

横から縦へ

#pd/set_index #pd/stack #pd/replace #pd/rename

import pandas as pd
# "df_sales_summary"をcsvファイルから読み込みます。書き換える必要はありません
df_sales_summary = pd.read_csv("./100knocks-preprocess/df_sales_summary.csv", index_col=0)

print(df_sales_summary.set_index('era').stack().reset_index()
        .replace(regex={'female': '01', 'male': '00', 'unknown': '99'})
        .rename(columns={'level_1' : 'gender_cd', 0:'amount'})
    )


四則演算

四則演算

利益率の計算

#pd/mean/skipna

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


df_tmp = df_product.copy()
# index毎に(単価-原価)/単価で利益率を求める
df_tmp['unit_profit_rate'] = (df_tmp['unit_price'] - df_tmp['unit_cost']) / df_tmp['unit_price']
# 平均を求める
df_mean = df_tmp.unit_profit_rate.mean(skipna=True)
print(df_mean)

利益率30%に合わせた単価リセット:小数点以下切り捨て

#np/フロア

import pandas as pd
import numpy as np
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


df_tmp = df_product.copy()
# 利益率が30%となる価格を"new_price"カラムに代入します
df_tmp['new_price'] = np.floor(df_tmp.unit_cost/0.7)
# "new_price"で利益率を算出し、"new_profit_rate"カラムに代入します
df_tmp['new_profit_rate'] = (df_tmp.new_price - df_tmp.unit_cost) / df_tmp.new_price
# print関数を使用し、df_tmpを10件表示させます
print(df_tmp.head(10))

小数点以下を扱う

利益率30%に合わせた単価をリセット : 丸め直し

#np/ラウンド

パイソン

import pandas as pd
import numpy as np
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


df_tmp = df_product.copy()
# 利益率が30%となる価格を"new_price"カラムに代入します
df_tmp['new_price'] = np.round(df_tmp.unit_cost / 0.7)
# "new_price"で利益率を算出し、"new_profit_rate"カラムに代入します
df_tmp['new_profit_rate'] = ( df_tmp.new_price -  df_tmp.unit_cost )/df_tmp.new_price
# print関数を使用し、df_tmpを10件表示させます
print(df_tmp.head(10))

利益率30%に合わせた単価のリセット : 上げ

#ナノパープル・セイル

df_tmp['new_price'] = np.ceil(df_tmp.unit_cost / 0.7)

集計結果演算

お客様ごとの総購入金額に対する缶詰の割合を求める

import pandas as pd
import numpy as np
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


# 顧客毎に全商品の売上金額合計のデータフレーム1を作成する
df_tmp_1 = pd.merge(df_receipt, df_product, 
                    how='inner', on='product_cd').groupby('customer_id').agg({'amount':'sum'}).reset_index()
# 顧客毎に瓶詰缶詰の売上金額合計のデータフレーム2を作成する
df_tmp_2 = pd.merge(df_receipt, df_product.query('category_major_cd == "07"'), 
                    how='inner', on='product_cd').groupby('customer_id').agg({'amount':'sum'}).reset_index()
# データフレーム1、データフレーム2よりデータフレーム3を作成する
df_tmp_3 = pd.merge(df_tmp_1, df_tmp_2, how='inner', on='customer_id')
# 瓶詰缶詰の売上金額合計の比率を"rate_07"カラムに代入する
df_tmp_3['rate_07'] = df_tmp_3['amount_y'] / df_tmp_3['amount_x']
# print関数を使用し、df_tmp_3を10件表示させます
print(df_tmp_3.head(10))

正規化、非正規化

正規化

第3正規化

第3次正規化
#pd/drop_duplicates

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# "gender_cd"と"gender"の依存関係をdf_genderに代入します
df_gender = df_customer[['gender_cd', 'gender']].drop_duplicates()
# "gender"カラムを削除します
df_customer_s = df_customer.drop('gender',axis=1)
print(df_gender)
print(df_customer_s)

非正規化

カテゴリ番号と名前を全てつなげる

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt = pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# 非正規化を行ってください
df_product_full = pd.merge(df_product, df_category,
                            on = ['category_major_cd','category_medium_cd','category_small_cd'],
                            how='outer')
print(df_product_full)

CSV出力


ファイルの入出力

簡単なので、メモは書かない

Discussion