🦔
【Python - pandas】ユーザーガイド - パンダまであと10分(1/2)
パンダまであと10分(原題:10 minutes to pandas / 10分でわかるpandas)
後編
【Python - pandas】ユーザーガイド - パンダまであと10分(2/2)
pandasの基本データ構造
- pandasは、データを処理するための2種類のクラス(SeriesとDataFrame)を提供します。
オブジェクトの作成
セクション:データ構造入門
オブジェクトの作成
- Seriesオブジェクトの作成
import numpy as np
import pandas as pd
# A single missing value (np.nan) in the input makes the Series float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(data=values)
print(s)
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
- DataFrameオブジェクトの作成(←NumPy配列)
import numpy as np
import pandas as pd
# Row labels: six consecutive calendar days starting on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
# -> DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
# '2013-01-05', '2013-01-06'],
# dtype='datetime64[ns]', freq='D')

# 6 rows x 4 columns of random numbers, labelled A-D; values differ on every run.
random_values = np.random.randn(6, 4)
df = pd.DataFrame(data=random_values, index=dates, columns=list("ABCD"))
print(df)
A B C D
2013-01-01 -0.662633 -0.769206 -0.763576 0.666317
2013-01-02 0.172708 -0.617286 -0.053372 -1.130917
2013-01-03 -1.005611 -0.964423 -0.162094 -0.071588
2013-01-04 -0.479771 0.540221 0.807865 0.760459
2013-01-05 -0.037409 -1.062844 0.924946 0.979073
2013-01-06 0.869426 0.542442 1.309254 -0.545590
- DataFrameオブジェクトの作成(←辞書)
import numpy as np
import pandas as pd
# Each dict key becomes a column. The scalar entries ("A", "B", "F") are
# repeated for every row of the four-element array-like columns.
column_data = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(column_data)

print(df2)
# Every column of the resulting frame keeps its own dtype.
print("-- dtypes --")
print(df2.dtypes)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
-- dtypes --
A float64
B datetime64[s]
C float32
D int32
E category
F object
dtype: object
データの表示
セクション:必須の基本機能
- DataFrame.head()
- DataFrame.tail()
- DataFrame.index
- DataFrame.columns
- DataFrame.to_numpy()
- DataFrame.describe()
データの表示
使用するデータ
import numpy as np
import pandas as pd
# Sample data shared by the snippets of this section: a 6x4 frame of random
# numbers indexed by six consecutive days (values differ on every run).
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
- フレームの上部と下部
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# Top of the frame; head() without an argument returns the first five rows.
head = df.head()
print("-- head --")
print(head)

# Bottom of the frame: the last three rows.
tail = df.tail(n=3)
print("-- tail --")
print(tail)
-- head --
A B C D
2013-01-01 1.918322 -1.693173 -0.474956 0.178245
2013-01-02 2.647734 -0.230600 1.956934 0.805509
2013-01-03 -0.424932 0.891356 -0.006440 0.333991
2013-01-04 -0.510190 0.619605 1.078304 0.111051
2013-01-05 -0.502453 1.852233 -1.257425 0.073380
-- tail --
A B C D
2013-01-04 -0.510190 0.619605 1.078304 0.111051
2013-01-05 -0.502453 1.852233 -1.257425 0.073380
2013-01-06 0.434086 1.364373 -0.095750 0.499967
- フレームの行ラベルと列ラベル
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# Row labels of the frame (the DatetimeIndex it was built with).
index = df.index
print("-- index --")
print(index)

# Column labels of the frame.
columns = df.columns
print("-- columns --")
print(columns)
-- index --
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
-- columns --
Index(['A', 'B', 'C', 'D'], dtype='object')
- NumPy表現
# Convert the frame (df from this section's data-setup snippet) to a plain NumPy
# array; the printed result carries no row or column labels.
to_numpy = df.to_numpy()
print(to_numpy)
[[ 0.10275781 -0.15240652 -0.49230203 -0.55844575]
[-0.93814807 -0.27737133 -0.04194464 -1.14233342]
[ 0.77402372 -0.96135358 -0.19474949 -0.4155293 ]
[ 0.28781136 -0.85801733 3.38548367 -0.33695306]
[-1.26120989 -0.04527307 0.88362413 -0.43181833]
[-0.06495374 0.41521996 0.0108958 -1.3119894 ]]
- 統計概要
# -- existing code (df comes from this section's data-setup snippet) --
# Per-column summary statistics: count, mean, std, min, 25%/50%/75% and max.
describe = df.describe()
print(describe)
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.199047 -0.257772 0.478266 -0.274356
std 0.349179 1.102231 0.774650 0.907204
min -0.175088 -1.588327 -0.481989 -1.009412
25% -0.060202 -0.849672 -0.156136 -0.851061
50% 0.172844 -0.538776 0.553989 -0.704405
75% 0.333776 0.321754 1.081292 0.106752
max 0.774148 1.463954 1.379389 1.294893
- データの転置とソート
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# Swap rows and columns.
transposition = df.T
print("-- 転置 --")
print(transposition)

# Sort along the column axis by label, in descending order (D, C, B, A).
_sort_index = df.sort_index(axis="columns", ascending=False)
print("-- 軸によるソート --")
print(_sort_index)

# Sort the rows by the values in column "B" (ascending).
_sort_values = df.sort_values(by="B")
print("-- 値によるソート --")
print(_sort_values)
-- 転置 --
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 2.448773 -0.308430 -0.752449 -0.403320 -0.476072 0.008912
B -0.664506 0.196025 0.514682 1.725387 1.094336 0.749363
C 0.388941 0.228956 -0.655113 -1.466240 0.377765 -2.154219
D -2.348640 0.296181 -0.512352 -0.642324 0.581467 0.354751
-- 軸によるソート --
D C B A
2013-01-01 -2.348640 0.388941 -0.664506 2.448773
2013-01-02 0.296181 0.228956 0.196025 -0.308430
2013-01-03 -0.512352 -0.655113 0.514682 -0.752449
2013-01-04 -0.642324 -1.466240 1.725387 -0.403320
2013-01-05 0.581467 0.377765 1.094336 -0.476072
2013-01-06 0.354751 -2.154219 0.749363 0.008912
-- 値によるソート --
A B C D
2013-01-01 2.448773 -0.664506 0.388941 -2.348640
2013-01-02 -0.308430 0.196025 0.228956 0.296181
2013-01-03 -0.752449 0.514682 -0.655113 -0.512352
2013-01-06 0.008912 0.749363 -2.154219 0.354751
2013-01-05 -0.476072 1.094336 0.377765 0.581467
2013-01-04 -0.403320 1.725387 -1.466240 -0.642324
選択
セクション:データのインデックス作成と選択
選択
使用するデータ
import numpy as np
import pandas as pd
# Sample data shared by the selection snippets below: six daily timestamps as
# the row index and four columns of random numbers (values differ on every run).
dates = pd.date_range(start="20130101", periods=6)
column_labels = list("ABCD")
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=column_labels)
- 要素の取得([] によるアクセス)
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# A column label inside [] selects that single column as a Series.
a = df["A"]
print("-- A --")
print(a)

# A slice inside [] selects rows by position: here the first three rows.
slice_0_3 = df[:3]
print("-- 0_3 --")
print(slice_0_3)
-- A --
2013-01-01 -0.230356
2013-01-02 -0.559038
2013-01-03 0.281296
2013-01-04 1.333772
2013-01-05 -1.191256
2013-01-06 1.213864
Freq: D, Name: A, dtype: float64
-- 0_3 --
A B C D
2013-01-01 -0.230356 0.706460 1.189687 -0.075775
2013-01-02 -0.559038 0.269358 -2.012862 1.779300
2013-01-03 0.281296 -0.548101 0.942486 0.286164
- ラベルによる選択
# -- setup omitted: df and dates come from this section's data-setup snippet --
ab_columns = ["A", "B"]

# One row, looked up by its index label; the result is a Series.
dates_0 = df.loc[dates[0]]
print("-- dates_0 --")
print(dates_0)

# Every row, restricted to columns A and B.
df_a_b = df.loc[:, ab_columns]
print("-- df_a_b --")
print(df_a_b)

# Label slices include both endpoints: rows 2013-01-02 through 2013-01-04.
df_a_b_between_02_04 = df.loc["20130102":"20130104", ab_columns]
print("-- df_a_b_between_02_04 --")
print(df_a_b_between_02_04)
-- dates_0 --
A 0.061603
B 0.750192
C -0.098624
D -0.531968
Name: 2013-01-01 00:00:00, dtype: float64
-- df_a_b --
A B
2013-01-01 0.061603 0.750192
2013-01-02 -1.010798 0.023275
2013-01-03 0.612322 -0.319232
2013-01-04 0.550425 -0.447256
2013-01-05 -0.971426 0.827942
2013-01-06 -0.577870 -0.426460
-- df_a_b_between_02_04 --
A B
2013-01-02 -1.010798 0.023275
2013-01-03 0.612322 -0.319232
2013-01-04 0.550425 -0.447256
- ポジションによる選択
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# The row at integer position 3 (the fourth row), returned as a Series.
p_3 = df.iloc[3]
print("-- p_3 --")
print(p_3)

# Explicit lists of positions: rows 1, 2, 4 and columns 0, 2.
row_positions = [1, 2, 4]
column_positions = [0, 2]
p_list = df.iloc[row_positions, column_positions]
print("-- p_list --")
print(p_list)

# Positional slices exclude the end position: rows 1-2, all columns.
slice_row = df.iloc[1:3, :]
print("-- slice_row --")
print(slice_row)

# All rows, columns at positions 1-2.
slice_col = df.iloc[:, 1:3]
print("-- slice_col --")
print(slice_col)
-- p_3 --
A -1.632344
B -1.611617
C -1.182941
D 0.016364
Name: 2013-01-04 00:00:00, dtype: float64
-- p_list --
A C
2013-01-02 0.597187 1.218829
2013-01-03 -1.458470 1.365110
2013-01-05 1.868191 -0.930007
-- slice_row --
A B C D
2013-01-02 0.597187 1.142186 1.218829 1.367564
2013-01-03 -1.458470 -1.680932 1.365110 0.007064
-- slice_col --
B C
2013-01-01 1.292156 -0.715628
2013-01-02 1.142186 1.218829
2013-01-03 -1.680932 1.365110
2013-01-04 -1.611617 -1.182941
2013-01-05 -0.009445 -0.930007
2013-01-06 -1.126733 -0.390663
- ブールインデックス
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# Keep only the rows whose value in column A is positive.
a_is_positive = df["A"] > 0
filter_with_a_bigger_0 = df[a_is_positive]
print("-- filter_with_a_bigger_0 --")
print(filter_with_a_bigger_0)

# Work on a copy so that adding the text column E leaves df untouched.
df2 = df.copy()
df2["E"] = ["one", "two", "three", "four", "five", "six"]
print("-- df2 --")
print(df2)

# isin() marks the rows whose E value is one of the listed strings.
e_is_two_or_four = df2["E"].isin(["two", "four"])
filter_with_e_isin_two_four = df2[e_is_two_or_four]
print("-- filter_with_e_isin_two_four --")
print(filter_with_e_isin_two_four)
-- filter_with_a_bigger_0 --
A B C D
2013-01-02 0.610320 -0.687301 -1.184956 1.02887
2013-01-03 1.516186 -0.449244 -0.566938 -0.14706
-- df2 --
A B C D E
2013-01-01 -0.661098 1.438593 -0.977869 -1.141950 one
2013-01-02 0.610320 -0.687301 -1.184956 1.028870 two
2013-01-03 1.516186 -0.449244 -0.566938 -0.147060 three
2013-01-04 -0.386337 -2.241849 -0.422667 1.703897 four
2013-01-05 -2.068874 0.097219 -0.312625 -0.471222 five
2013-01-06 -0.089850 0.824537 0.610175 -0.565128 six
-- filter_with_e_isin_two_four --
A B C D E
2013-01-02 0.610320 -0.687301 -1.184956 1.028870 two
2013-01-04 -0.386337 -2.241849 -0.422667 1.703897 four
欠損データ
セクション:欠損データの処理
欠損データ
使用するデータ
import numpy as np
import pandas as pd
# Sample data: a 6x4 frame of random numbers indexed by six consecutive days.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list("ABCD"))

# Reindex to the first four days and add a new column E; E has no data yet,
# so it starts out as NaN in every row.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Fill E for the first two days only, leaving NaN in the last two rows.
df1.loc[dates[0]:dates[1], "E"] = 1

# -- operations on the missing data --
# Drop every row that contains at least one NaN.
dropped_nan = df1.dropna(how="any")
print("-- dropped_nan --")
print(dropped_nan)

# Replace every NaN with the value 5.
filled_nan_in_5 = df1.fillna(value=5)
print("-- filled_nan_in_5 --")
print(filled_nan_in_5)

# Boolean frame of the same shape: True where the value is NaN.
is_nan = pd.isna(df1)
print("-- is_nan --")
print(is_nan)
-- dropped_nan --
A B C D E
2013-01-01 0.636323 0.199453 0.266351 -0.377901 1.0
2013-01-02 0.410628 0.206869 -0.141973 -0.757250 1.0
-- filled_nan_in_5 --
A B C D E
2013-01-01 0.636323 0.199453 0.266351 -0.377901 1.0
2013-01-02 0.410628 0.206869 -0.141973 -0.757250 1.0
2013-01-03 1.139067 -0.533380 0.047384 -0.982880 5.0
2013-01-04 0.684012 1.237839 -0.103038 -0.017450 5.0
-- is_nan --
A B C D E
2013-01-01 False False False False False
2013-01-02 False False False False False
2013-01-03 False False False False True
2013-01-04 False False False False True
操作
セクション:柔軟な二項演算
操作
使用するデータ
import numpy as np
import pandas as pd
# Sample data shared by the operation snippets below: six daily timestamps as
# the row index and columns A-D filled with random numbers (new values each run).
dates = pd.date_range(start="20130101", periods=6)
samples = np.random.randn(6, 4)
df = pd.DataFrame(data=samples, index=dates, columns=list("ABCD"))
- 統計
# -- setup omitted: df is the frame built in this section's data-setup snippet --
# Mean of each column (one value per column label).
col_mean = df.mean(axis=0)
print("-- col_mean --")
print(col_mean)

# Mean of each row (one value per date).
row_mean = df.mean(axis=1)
print("-- row_mean --")
print(row_mean)
-- col_mean --
A 0.022821
B -0.232273
C -0.678120
D -0.512279
dtype: float64
-- row_mean --
2013-01-01 -0.338239
2013-01-02 -0.356218
2013-01-03 -0.345457
2013-01-04 0.385384
2013-01-05 -0.930577
2013-01-06 -0.514670
Freq: D, dtype: float64
- ユーザー定義関数
# -- setup omitted: df is the frame built in this section's data-setup snippet --
def _scaled_mean(column):
    """Return the mean of the column multiplied by 5.6."""
    return np.mean(column) * 5.6


def _scale_values(column):
    """Return every value of the column multiplied by 101.2."""
    return column * 101.2


# agg() reduces each column to a single value with the user-defined function.
_agg = df.agg(_scaled_mean)
print("-- _agg --")
print(_agg)

# transform() keeps the frame's shape and applies the function to every value.
_transform = df.transform(_scale_values)
print("-- _transform --")
print(_transform)
-- _agg --
A -1.532046
B 0.662410
C 3.028417
D 3.801749
dtype: float64
-- _transform --
A B C D
2013-01-01 51.797570 157.958617 110.669660 196.351202
2013-01-02 123.783122 -242.567867 108.002598 213.381817
2013-01-03 74.189474 62.864840 40.198602 64.064164
2013-01-04 -111.454909 -20.448038 37.499770 -97.090494
2013-01-05 -8.817108 86.672639 52.686987 8.996838
2013-01-06 -295.615737 27.344004 -20.690736 26.514681
- 値の出現回数(value_counts)
# Ten random integers drawn from 0..6 (the upper bound 7 is excluded).
random_ints = np.random.randint(low=0, high=7, size=10)
s = pd.Series(random_ints)
print("-- s --")
print(s)

# Count how often each distinct value occurs, most frequent first.
s_counts = s.value_counts()
print("-- s_counts --")
print(s_counts)
-- s --
0 3
1 0
2 1
3 3
4 4
5 0
6 0
7 0
8 6
9 5
dtype: int64
-- s_counts --
0 4
3 2
1 1
4 1
6 1
5 1
Name: count, dtype: int64
- 文字列メソッド
# Mixed-case strings plus one missing value.
words = ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
s = pd.Series(words)
print("-- s --")
print(s)

# The .str accessor lower-cases every string; the missing value stays NaN.
s_lower = s.str.lower()
print("-- s_lower --")
print(s_lower)
-- s --
0 A
1 B
2 C
3 Aaba
4 Baca
5 NaN
6 CABA
7 dog
8 cat
dtype: object
-- s_lower --
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
Discussion