💽

データサイエンス100本ノック(構造化データ加工編)(中級)

2023/05/26に公開

[!Important]+ Goals
一般社団法人データサイエンス協会が GitHub に公開した「データサイエンス 100 のノック(構造化データ処理編)」の問題演習を行うコースです。
このコースでは、曖昧な条件、ソート、集計、サンプリングなどについて学ぶことができます。
ビジネス現場での分析実務では、データのほとんどが構造化データです。構造化データを自由に加工・集計するスキルを磨き、データサイエンスの実践力を高めましょう。

[!abstract]+ Curriculum
1.曖昧な条件
2.ソート
3.集計
4.サブクエリ
5.サンプリング
6.減算エラー対応
7. name identification
8.データ分割


曖昧な条件

前方条件

問題 10

#pd/query/str/startswithを使用します。

  • df_store から store_cd が "S14" で始まる項目だけ抽出する
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_store.query("store_cd.str.startswith('S14')", engine='python').head(10))

問題 13

正規表現

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.query("status_cd.str.contains('^[A-F]', regex=True)", engine='python').head(10)))

後部条件

問題 11

#pd/query/str/endswith

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.query("customer_id.str.endswith('1')", engine='python').head(10))

問題 14

#pd/query/str/contains #regex #regular_expression

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.query("status_cd.str.contains('[1-9]$', regex=True)", engine='python').head(10)))

部分条件

問題 12

#pd/query/str/containsを使用します。

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_store.query("住所.str.contains')", engine='python'))

問題 15

#pd/query/str/containsを使用します。

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.query("status_cd.str.contains('^[A-F].*[1-9]$', regex=True)", engine='python').head(10)))

#ソート

順番の変更

問題 17

#pd/ソート

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.sort_values('birth_day', ascending=True).head(10))

問題 18

#pd/ソート

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_customer.sort_values('birth_day', ascending=False).head(10))

順位

問題 19

#pd/rank #pd/concat #pd/sort

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# "df_tmp"に処理後のデータを代入してください

df_tmp = pd.concat(
[df_receipt[['customer_id', 'amount']]
, df_receipt["amount"].rank(method='min', ascending=False)
]
, axis=1)
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ['customer_id', 'amount', 'ranking']
print(df_tmp.sort_values('ranking', ascending=True).head(10))

問題 20

#pd/rank #pd/concat #pd/sort

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# "df_tmp"に処理後のデータを代入してください

df_tmp = pd.concat(
[df_receipt[['customer_id', 'amount']]
, df_receipt["amount"].rank(method='first', ascending=False)
]
, axis=1)
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ['customer_id', 'amount', 'ranking']
print(df_tmp.sort_values('ranking', ascending=True).head(10))

集計

##カウント

問題 22

#レン #pd/unique

import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(len(pd.unique(df_receipt.customer_id)))

合計

問題 23

#pd/groupby/agg/sum

  • 店舗別売上高と売上数量合計
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby('store_cd').agg({'amount':'sum', 'quantity':'sum'}).reset_index().head()))

最大/最小

問題 24

#pd/groupby/agg/max

  • 領収書データから顧客 ID 別に最近の売上日を計算する
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby('customer_id').agg({'sales_ymd': max'}).reset_index().head(10))

問題 25

#pd/groupby/agg/min

  • レシートデータから顧客 ID ごとに最も古い売上日を計算する
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby('customer_id').agg({'sales_ymd':'min'}).reset_index().head(10))

問題 26

#pd/groupby/agg

  • 1 回以上訪問した人を探す
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

df_tmp = df_receipt.groupby('customer_id').agg({'sales_ymd':['max','min']}).reset_index()
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ["_".join(pair) for pair in df_tmp.columns]
print(df_tmp.query('sales_ymd_max != sales_ymd_min').head(10))

統計量

問題 27

#pd/groupby/agg/mean #pd/sort

  • 店舗別売上平均
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby("store_cd").agg({"amount": "mean"}).reset_index().sort_values("amount", ascending=False).head()))

問題 28

#pd/groupby/agg/median #pd/sort

  • 店舗別売上の中央値
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby("store_cd").agg({"amount": "median"}).reset_index().sort_values("amount", ascending=False).head()))

問題 29

pd/groupby/apply #lambda #lambda

  • 店舗別よく売れた商品を探す
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(df_receipt.groupby('store_cd').product_cd.apply(lambda x: x.mode()).reset_index()))

問題 30

#pd/groupby/var #std_var

  • 店舗別売上の標準分散を求める
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

プリント(
    df_receipt
    .groupby("store_cd")
    .金額.var(ddof=0)
    .reset_index()
    .sort_values("金額", ascending=False)
    .head()
    )

問題 31

#pd/groupby/std #pd/std_dev

  • 店舗別売上の標準偏差を求める
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

プリント(
    df_receipt
    .groupby("store_cd").amount
    .std(ddof=0)
    .reset_index()
    .sort_values("金額", ascending=False)
    .head()
    )

問題 32

#np/percentile

  • パーセンタイル (パーセンタイル) の求め方
import pandas as pd
import numpy as np
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(
    np.percentile(df_receipt.amount, [25,50,75,100])
    )

問題 33

#pd/query #pd/groupby/agg/mean

  • 売上平均が 330 以上の店舗を抽出
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

print(
    df_receipt.groupby("store_cd")
    .agg({"amount":'mean'})
    .reset_index()
    .query("amount >= 330")
    )

#サブクエリ

検索結果からのサブクエリ

問題 34

#pd/query

  • 会員別売上総額とその平均を求める。
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

プリント(
	  df_receipt
	  .query('not customer_id.str.startswith("Z")', engine='python')
	  .groupby("customer_id")
	  .金額
	  .sum()
	  .平均値()
	  )

条件指定でのサブクエリ

問題 35

#pd/query

  • 平均売上高以上の会員を探す
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

amount_mean = df_receipt.query('not customer_id.str.startswith("Z")', engine='python').groupby("customer_id").amount.sum().mean()
amount_sum = df_receipt.query('not customer_id.str.startswith("Z")', engine='python').groupby("customer_id") .amount.sum().reset_index()
print(amount_sum[amount_sum.amount >= amount_mean].head(10))

#サンプリング

##ランダム

問題 75

#df/sample

  • ランダムサンプリング
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# random_stateを42としてランダムに1%のデータを抽出し, "df_sampleに代入してください"
df_sample = df_customer.sample(frac=0.01, random_state=42)
print(df_sample.head(10))

階層化

問題 76

#サンプル/層別

  • 性別分布に応じた階層抽出後、性別ごとの顧客数を集計。
import pandas as pd
from sklearn.model_selection import train_test_split
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


# sklearn.model_selection.train_test_splitを使用して層化を行ってください
_, sample_df_customer = train_test_split(df_customer, test_size=0.1, stratify =df_customer.gender_cd, random_state=42)
print(sample_df_customer.groupby("gender_cd").agg({'customer_id' : 'count'}))

減算エラー対応

減算エラー対応

問題 84

#pd/merge #fillna

  • 顧客別 2019 年度売上高比率
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# 1.レシート明細データフレーム(df_receipt)からqueryメソッドにて該当の期間のデータを抽出する
df_tmp_1 = df_receipt.query('20190101 <= sales_ymd <= 20191231')
# 2. "1"で抽出したデータを顧客データフレーム(df_customer)に結合する
df_tmp_1 = pd.merge(df_customer['customer_id'], df_tmp_1[['customer_id', 'amount']], how='left', on='customer_id'). \
    groupby('customer_id').sum().reset_index().rename(columns={'amount':'amount_2019'})
# 3. レシート明細データフレーム(df_receipt)を顧客データフレーム(df_customer)に結合する
df_tmp_2 = pd.merge(df_customer['customer_id'], df_receipt[['customer_id', 'amount']], how='left', on='customer_id'). \
    groupby('customer_id').sum().reset_index()
# 4. "2"と"3"で得たデータを内部結合する
df_tmp = pd.merge(df_tmp_1, df_tmp_2, how='inner', on='customer_id')
# 5. "4"の結合時に生じた欠損値を補完する
df_tmp['amount_2019'] = df_tmp['amount_2019'].fillna(0)
df_tmp['amount'] = df_tmp['amount'].fillna(0)
# 6. 2019の売り上げ金額 / 全期間の売上金額を行い割合をデータフレームに追加する
df_tmp['amount_rate'] = df_tmp['amount_2019'] / df_tmp['amount']
# 7. "6"で生じた欠損値を補完する
df_tmp['amount_rate'] = df_tmp['amount_rate'].fillna(0)
# 8. queryメソッドにて条件に基づいて取得する
print( df_tmp.query('amount_rate > 0').head(10) )


名前の識別

完全一致

問題 87

#pd/merge #pd/sort #pd/drop_duplicates

  • 複数登録された顧客データから複数のデータを削除した後、売上高 0 の顧客は ID 番号が一番小さい顧客だけを残します。
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


# 顧客ごとの売上金額合計を算出する
df_tmp = df_receipt.groupby('customer_id').agg({'amount':'sum'}).reset_index()
# 顧客データフレーム(df_customer)に売上金額合計を追加し、売上金額合計、顧客IDでソートする
df_customer_u = pd.merge(df_customer, df_tmp, how='left', on='customer_id').sort_values(['amount', 'customer_id'], ascending=[False, True])
# 同一顧客に対しては売上金額合計が最も高いものを残すように削除する
df_customer_u.drop_duplicates(subset=['customer_name', 'postal_cd'], keep='first', inplace=True)

print('減少数: ', len(df_customer) - len(df_customer_u))

変換データ作成

問題 88

  • 前の問題の df_customer_u をベースにデータフレームに統合 ID を付与した df_customer_n を作成します。
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
df_customer_u = pd.read_csv('./100knocks-preprocess/87.csv', index_col=0)

# 顧客データフレーム(df_customer)と名寄顧客データフレーム(df_customer_u)を内部結合する
df_customer_n = pd.merge(df_customer, df_customer_u[['customer_name', 'postal_cd', 'customer_id']],
                        how='inner', on =['customer_name', 'postal_cd'])
# カラム名を変更する
df_customer_n.rename(columns={'customer_id_x':'customer_id', 'customer_id_y':'integration_id'}, inplace=True)

print('ID数の差', len(df_customer_n['customer_id'].unique()) - len(df_customer_n['integration_id'].unique()))

#データ分割

##レコードデータ

問題 89

  • 売上がある顧客のみを対象として、トレーニングデータと検証データを分離
import pandas as pd
from sklearn.model_selection import train_test_split
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')

# 顧客ごとの売上金額合計を算出します
df_sales= df_receipt.groupby('customer_id').agg({'amount':'sum'}).reset_index()
#  df_salesにある顧客のみを抽出します
df_tmp = pd.merge(df_customer, df_sales['customer_id'], how='inner', on='customer_id')
# 8:2の割合でランダムにデータを分割します
df_train, df_test = train_test_split(df_tmp, test_size=0.2, random_state=71)
print('訓練データ割合: ', len(df_train) / len(df_tmp))
print('検証データ割合: ', len(df_test) / len(df_tmp))

時系列データ

問題 90

  • 月別売上集計後、トレーニングデータを作成
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')


df_tmp = df_receipt[['sales_ymd', 'amount']].copy()
# 西暦と月のみにし、"sales_ym"に代入します
df_tmp['sales_ym'] = df_tmp['sales_ymd'].astype('str').str[0:6]
# 月毎の"amount"を算出します
df_tmp = df_tmp.groupby('sales_ym').agg({'amount':'sum'}).reset_index()

# 「train_size, test_size」はデータの長さ, 「slide_window,start_point」はtrainデータの始まりを決定するのに使用します
def split_data(df, train_size, test_size, slide_window, start_point):
    train_start = start_point * slide_window
    test_start = train_start + train_size
    return df[train_start : test_start], df[test_start : test_start + test_size]

df_train_1, df_test_1 = split_data(df_tmp, train_size=12, test_size=6, slide_window=6, start_point=0)
df_train_2, df_test_2 = split_data(df_tmp, train_size=12, test_size=6, slide_window=6, start_point=1)
df_train_3, df_test_3 = split_data(df_tmp、train_size=12、test_size=6、slide_window=6, start_point=2)
print(df_train_3)

Discussion