🦔

【Python - pandas】ユーザーガイド - パンダまであと10分(1/2)

に公開

パンダまであと10分(原題:10 minutes to pandas)

後編

【Python - pandas】ユーザーガイド - パンダまであと10分(2/2)

pandasの基本データ構造

  • pandasは、データを処理するための2種類のクラスを提供します。
    • Series
      • 任意の型のデータを保持する1次元のラベル付き配列。(整数、文字列、Pythonオブジェクト等)
    • DataFrame
      • 2次元配列や行と列を持つテーブルのようにデータを保持する2次元データ構造。

オブジェクトの作成

セクション:データ構造入門

オブジェクトの作成
  • Seriesオブジェクトの作成
import numpy as np
import pandas as pd

# Build a Series from a plain list; np.nan marks a missing entry.
# No index is passed, so the rows are labelled 0..5.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)

print(s)
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


  • DataFrameオブジェクトの作成(←NumPy配列)
import numpy as np
import pandas as pd

# Six consecutive daily timestamps starting at 2013-01-01, used as row labels.
dates = pd.date_range(start="20130101", periods=6)
# -> DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
#                   '2013-01-05', '2013-01-06'],
#                   dtype='datetime64[ns]', freq='D')

# A 6x4 array of random samples: one row per date, one column per letter A-D.
values = np.random.randn(6, 4)
df = pd.DataFrame(values, index=dates, columns=["A", "B", "C", "D"])

print(df)
                   A         B         C         D
2013-01-01 -0.662633 -0.769206 -0.763576  0.666317
2013-01-02  0.172708 -0.617286 -0.053372 -1.130917
2013-01-03 -1.005611 -0.964423 -0.162094 -0.071588
2013-01-04 -0.479771  0.540221  0.807865  0.760459
2013-01-05 -0.037409 -1.062844  0.924946  0.979073
2013-01-06  0.869426  0.542442  1.309254 -0.545590


  • DataFrameオブジェクトの作成(←辞書)
import numpy as np
import pandas as pd

# Each dict key becomes a column. Scalar entries ("A", "B", "F") are repeated
# for every row, while the array-like entries fix the frame at four rows.
# Every column keeps its own dtype, as the dtypes listing below shows.
columns_by_name = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(columns_by_name)

print(df2, "-- dtypes --", df2.dtypes, sep="\n")
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
-- dtypes --
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

データの表示

セクション:必須の基本機能

データの表示

使用するデータ

import numpy as np
import pandas as pd

# Sample data for this section: six daily dates as the row index and a
# 6x4 block of random values under the columns A-D.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)


  • データフレームの先頭行と末尾行
# -- existing code (`df` comes from the sample data above) --
# head() without an argument returns the first five rows;
# tail(3) returns the last three.
head, tail = df.head(), df.tail(3)

print("-- head --", head, sep="\n")
print("-- tail --", tail, sep="\n")
-- head --
                   A         B         C         D
2013-01-01  1.918322 -1.693173 -0.474956  0.178245
2013-01-02  2.647734 -0.230600  1.956934  0.805509
2013-01-03 -0.424932  0.891356 -0.006440  0.333991
2013-01-04 -0.510190  0.619605  1.078304  0.111051
2013-01-05 -0.502453  1.852233 -1.257425  0.073380
-- tail --
                   A         B         C         D
2013-01-04 -0.510190  0.619605  1.078304  0.111051
2013-01-05 -0.502453  1.852233 -1.257425  0.073380
2013-01-06  0.434086  1.364373 -0.095750  0.499967


  • フレームの行ラベルと列ラベル
# -- existing code (`df` comes from the sample data above) --
# Row labels are exposed as .index and column labels as .columns.
index, columns = df.index, df.columns

print("-- index --", index, sep="\n")
print("-- columns --", columns, sep="\n")
-- index --
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
-- columns --
Index(['A', 'B', 'C', 'D'], dtype='object')


  • NumPy表現
# Convert the frame's values to a NumPy array; the printed result contains
# only the values, without the row or column labels.
to_numpy = df.to_numpy()
print(to_numpy)
[[ 0.10275781 -0.15240652 -0.49230203 -0.55844575]
 [-0.93814807 -0.27737133 -0.04194464 -1.14233342]
 [ 0.77402372 -0.96135358 -0.19474949 -0.4155293 ]
 [ 0.28781136 -0.85801733  3.38548367 -0.33695306]
 [-1.26120989 -0.04527307  0.88362413 -0.43181833]
 [-0.06495374  0.41521996  0.0108958  -1.3119894 ]]


  • 統計概要
# -- existing code --
# describe() produces per-column summary statistics: count, mean, std,
# min, the 25%/50%/75% quantiles, and max.
describe = df.describe()
print(describe)
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.199047 -0.257772  0.478266 -0.274356
std    0.349179  1.102231  0.774650  0.907204
min   -0.175088 -1.588327 -0.481989 -1.009412
25%   -0.060202 -0.849672 -0.156136 -0.851061
50%    0.172844 -0.538776  0.553989 -0.704405
75%    0.333776  0.321754  1.081292  0.106752
max    0.774148  1.463954  1.379389  1.294893


  • データの転置とソート
# -- existing code (`df` comes from the sample data above) --
# Swap rows and columns.
transposition = df.transpose()
# axis=1 sorts by the column labels; descending order yields D, C, B, A.
_sort_index = df.sort_index(axis=1, ascending=False)
# Reorder the rows by the values held in column B (ascending).
_sort_values = df.sort_values("B")

print("-- 転置 --", transposition, sep="\n")
print("-- 軸によるソート --", _sort_index, sep="\n")
print("-- 値によるソート --", _sort_values, sep="\n")
-- 転置 --
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    2.448773   -0.308430   -0.752449   -0.403320   -0.476072    0.008912
B   -0.664506    0.196025    0.514682    1.725387    1.094336    0.749363
C    0.388941    0.228956   -0.655113   -1.466240    0.377765   -2.154219
D   -2.348640    0.296181   -0.512352   -0.642324    0.581467    0.354751
-- 軸によるソート --
                   D         C         B         A
2013-01-01 -2.348640  0.388941 -0.664506  2.448773
2013-01-02  0.296181  0.228956  0.196025 -0.308430
2013-01-03 -0.512352 -0.655113  0.514682 -0.752449
2013-01-04 -0.642324 -1.466240  1.725387 -0.403320
2013-01-05  0.581467  0.377765  1.094336 -0.476072
2013-01-06  0.354751 -2.154219  0.749363  0.008912
-- 値によるソート --
                   A         B         C         D
2013-01-01  2.448773 -0.664506  0.388941 -2.348640
2013-01-02 -0.308430  0.196025  0.228956  0.296181
2013-01-03 -0.752449  0.514682 -0.655113 -0.512352
2013-01-06  0.008912  0.749363 -2.154219  0.354751
2013-01-05 -0.476072  1.094336  0.377765  0.581467
2013-01-04 -0.403320  1.725387 -1.466240 -0.642324

選択

セクション:データのインデックス参照と選択

選択

使用するデータ

import numpy as np
import pandas as pd

# Sample data for the selection examples: six daily dates as the row index
# and a 6x4 block of random values under the columns A-D.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)


  • 要素の取得([] によるアクセス)
# -- existing code (`df` comes from the sample data above) --
# A single column label inside [] returns that column as a Series.
a = df["A"]
# A slice inside [] selects rows instead; 0:3 yields the first three rows.
slice_0_3 = df[0:3]

print("-- A --", a, sep="\n")
print("-- 0_3 --", slice_0_3, sep="\n")
-- A --
2013-01-01   -0.230356
2013-01-02   -0.559038
2013-01-03    0.281296
2013-01-04    1.333772
2013-01-05   -1.191256
2013-01-06    1.213864
Freq: D, Name: A, dtype: float64
-- 0_3 --
                   A         B         C         D
2013-01-01 -0.230356  0.706460  1.189687 -0.075775
2013-01-02 -0.559038  0.269358 -2.012862  1.779300
2013-01-03  0.281296 -0.548101  0.942486  0.286164


  • ラベルによる選択
# -- existing code (`df` and `dates` come from the sample data above) --
# A single row label returns that row as a Series.
dates_0 = df.loc[dates[0]]

ab_columns = ["A", "B"]
# All rows, restricted to columns A and B.
df_a_b = df.loc[:, ab_columns]
# Label slices include both endpoints: rows 2013-01-02 through 2013-01-04.
df_a_b_between_02_04 = df.loc["20130102":"20130104", ab_columns]

print("-- dates_0 --", dates_0, sep="\n")
print("-- df_a_b --", df_a_b, sep="\n")
print("-- df_a_b_between_02_04 --", df_a_b_between_02_04, sep="\n")
-- dates_0 --
A    0.061603
B    0.750192
C   -0.098624
D   -0.531968
Name: 2013-01-01 00:00:00, dtype: float64
-- df_a_b --
                   A         B
2013-01-01  0.061603  0.750192
2013-01-02 -1.010798  0.023275
2013-01-03  0.612322 -0.319232
2013-01-04  0.550425 -0.447256
2013-01-05 -0.971426  0.827942
2013-01-06 -0.577870 -0.426460
-- df_a_b_between_02_04 --
                   A         B
2013-01-02 -1.010798  0.023275
2013-01-03  0.612322 -0.319232
2013-01-04  0.550425 -0.447256


  • 位置による選択
# -- existing code (`df` comes from the sample data above) --
# A single integer position returns that row as a Series.
p_3 = df.iloc[3]
# Lists of positions pick specific rows and columns.
row_positions, column_positions = [1, 2, 4], [0, 2]
p_list = df.iloc[row_positions, column_positions]
# Position slices exclude the end: 1:3 covers positions 1 and 2.
slice_row = df.iloc[1:3, :]
slice_col = df.iloc[:, 1:3]

print("-- p_3 --", p_3, sep="\n")
print("-- p_list --", p_list, sep="\n")
print("-- slice_row --", slice_row, sep="\n")
print("-- slice_col --", slice_col, sep="\n")
-- p_3 --
A   -1.632344
B   -1.611617
C   -1.182941
D    0.016364
Name: 2013-01-04 00:00:00, dtype: float64
-- p_list --
                   A         C
2013-01-02  0.597187  1.218829
2013-01-03 -1.458470  1.365110
2013-01-05  1.868191 -0.930007
-- slice_row --
                   A         B         C         D
2013-01-02  0.597187  1.142186  1.218829  1.367564
2013-01-03 -1.458470 -1.680932  1.365110  0.007064
-- slice_col --
                   B         C
2013-01-01  1.292156 -0.715628
2013-01-02  1.142186  1.218829
2013-01-03 -1.680932  1.365110
2013-01-04 -1.611617 -1.182941
2013-01-05 -0.009445 -0.930007
2013-01-06 -1.126733 -0.390663


  • ブールインデックス
# -- existing code (`df` comes from the sample data above) --
# Work on a copy so that adding column E leaves `df` untouched.
df2 = df.copy()
df2["E"] = ["one", "two", "three", "four", "five", "six"]

# A boolean Series used as an indexer keeps only the rows where it is True.
a_is_positive = df["A"] > 0
filter_with_a_bigger_0 = df[a_is_positive]

# isin() marks the rows whose E value appears in the given list.
e_is_two_or_four = df2["E"].isin(["two", "four"])
filter_with_e_isin_two_four = df2[e_is_two_or_four]


print("-- filter_with_a_bigger_0 --", filter_with_a_bigger_0, sep="\n")
print("-- df2 --", df2, sep="\n")
print("-- filter_with_e_isin_two_four --", filter_with_e_isin_two_four, sep="\n")
-- filter_with_a_bigger_0 --
                   A         B         C        D
2013-01-02  0.610320 -0.687301 -1.184956  1.02887
2013-01-03  1.516186 -0.449244 -0.566938 -0.14706
-- df2 --
                   A         B         C         D      E
2013-01-01 -0.661098  1.438593 -0.977869 -1.141950    one
2013-01-02  0.610320 -0.687301 -1.184956  1.028870    two
2013-01-03  1.516186 -0.449244 -0.566938 -0.147060  three
2013-01-04 -0.386337 -2.241849 -0.422667  1.703897   four
2013-01-05 -2.068874  0.097219 -0.312625 -0.471222   five
2013-01-06 -0.089850  0.824537  0.610175 -0.565128    six
-- filter_with_e_isin_two_four --
                   A         B         C         D     E
2013-01-02  0.610320 -0.687301 -1.184956  1.028870   two
2013-01-04 -0.386337 -2.241849 -0.422667  1.703897  four

欠損データ

セクション:欠損データの処理

欠損データ

使用するデータ

import numpy as np
import pandas as pd

# Base frame: six daily dates as the row index, random values in columns A-D.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Keep only the first four dates and add a column E that does not exist in
# `df`, so it starts out entirely missing (NaN).
df1 = df.reindex(index=dates[0:4], columns=[*df.columns, "E"])

# Label slices include both endpoints, so E is set for the first two dates
# and stays missing for the remaining two.
df1.loc[dates[0]:dates[1], "E"] = 1


# -- existing code (`df1` comes from the sample data above) --
# Drop every row that contains at least one missing value.
dropped_nan = df1.dropna(how="any")
# Replace each missing value with 5.
filled_nan_in_5 = df1.fillna(5)
# Boolean mask that is True wherever a value is missing.
is_nan = df1.isna()

print("-- dropped_nan --", dropped_nan, sep="\n")
print("-- filled_nan_in_5 --", filled_nan_in_5, sep="\n")
print("-- is_nan --", is_nan, sep="\n")
-- dropped_nan --
                   A         B         C         D    E
2013-01-01  0.636323  0.199453  0.266351 -0.377901  1.0
2013-01-02  0.410628  0.206869 -0.141973 -0.757250  1.0
-- filled_nan_in_5 --
                   A         B         C         D    E
2013-01-01  0.636323  0.199453  0.266351 -0.377901  1.0
2013-01-02  0.410628  0.206869 -0.141973 -0.757250  1.0
2013-01-03  1.139067 -0.533380  0.047384 -0.982880  5.0
2013-01-04  0.684012  1.237839 -0.103038 -0.017450  5.0
-- is_nan --
                A      B      C      D      E
2013-01-01  False  False  False  False  False
2013-01-02  False  False  False  False  False
2013-01-03  False  False  False  False   True
2013-01-04  False  False  False  False   True

操作

セクション:柔軟な二項演算

操作

使用するデータ

import numpy as np
import pandas as pd

# Sample data for the operation examples: six daily dates as the row index
# and a 6x4 block of random values under the columns A-D.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)


  • 統計
# -- existing code (`df` comes from the sample data above) --
# axis=0 averages down each column, giving one value per column label.
col_mean = df.mean(axis=0)
# axis=1 averages across each row, giving one value per date.
row_mean = df.mean(axis=1)

print("-- col_mean --", col_mean, sep="\n")
print("-- row_mean --", row_mean, sep="\n")
-- col_mean --
A    0.022821
B   -0.232273
C   -0.678120
D   -0.512279
dtype: float64
-- row_mean --
2013-01-01   -0.338239
2013-01-02   -0.356218
2013-01-03   -0.345457
2013-01-04    0.385384
2013-01-05   -0.930577
2013-01-06   -0.514670
Freq: D, dtype: float64


  • ユーザー定義関数
# -- existing code (`df` comes from the sample data above) --
# agg() reduces each column to a single value: here its mean times 5.6.
_agg = df.agg(lambda column: np.mean(column) * 5.6)
# transform() keeps the frame's shape: here every value is multiplied by 101.2.
_transform = df.transform(lambda column: column * 101.2)

print("-- _agg --", _agg, sep="\n")
print("-- _transform --", _transform, sep="\n")
-- _agg --
A   -1.532046
B    0.662410
C    3.028417
D    3.801749
dtype: float64
-- _transform --
                     A           B           C           D
2013-01-01   51.797570  157.958617  110.669660  196.351202
2013-01-02  123.783122 -242.567867  108.002598  213.381817
2013-01-03   74.189474   62.864840   40.198602   64.064164
2013-01-04 -111.454909  -20.448038   37.499770  -97.090494
2013-01-05   -8.817108   86.672639   52.686987    8.996838
2013-01-06 -295.615737   27.344004  -20.690736   26.514681


  • 値の出現回数(value_counts)
# Ten random integers in the range 0..6 (the upper bound 7 is exclusive).
random_ints = np.random.randint(0, 7, size=10)
s = pd.Series(random_ints)
# Tally how often each distinct value occurs, most frequent first.
s_counts = s.value_counts()

print("-- s --", s, sep="\n")
print("-- s_counts --", s_counts, sep="\n")
-- s --
0    3
1    0
2    1
3    3
4    4
5    0
6    0
7    0
8    6
9    5
dtype: int64
-- s_counts --
0    4
3    2
1    1
4    1
6    1
5    1
Name: count, dtype: int64


  • 文字列メソッド
# Mixed-case strings plus one missing value (np.nan).
words = ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
s = pd.Series(words)
# The .str accessor applies the string method element by element;
# the missing entry stays missing in the result.
s_lower = s.str.lower()

print("-- s --", s, sep="\n")
print("-- s_lower --", s_lower, sep="\n")
-- s --
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object
-- s_lower --
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Discussion