👏

【Python - pandas】ユーザーガイド - パンダまであと10分(2/2)

に公開

パンダまであと10分

前編

【Python - pandas】ユーザーガイド - パンダまであと10分(1/2)

マージ

セクション:マージ、結合、連結、比較

マージ
  • 連結
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [
    df[:3],
    df[3:7],
    df[7:]
]
concatted_pieces = pd.concat(pieces)

print("-- df --")
print(df)
print("-- pieces :3 --")
print(df[:3])
print("-- pieces 3:7 --")
print(df[3:7])
print("-- pieces 7: --")
print(df[7:])
print("-- concatted_pieces --")
print(concatted_pieces)
-- df --
          0         1         2         3
0  0.356967  0.446067 -0.951542  0.045307
1 -0.013618  0.747423 -0.414012  1.099421
2 -0.261020 -1.180841  1.033768  0.355981
3  0.896978 -2.390465 -0.660916 -1.140535
4  2.157401  0.144803  1.709412 -1.290725
5 -0.501878 -0.548913  1.047481  0.066022
6 -0.991660 -1.746597  0.492355  0.080778
7 -0.463837 -0.106896  0.906937  0.733530
8  2.076344  0.820683 -1.356686  1.620598
9 -0.412320 -0.279609  0.191564 -0.771300
-- pieces :3 --
          0         1         2         3
0  0.356967  0.446067 -0.951542  0.045307
1 -0.013618  0.747423 -0.414012  1.099421
2 -0.261020 -1.180841  1.033768  0.355981
-- pieces 3:7 --
          0         1         2         3
3  0.896978 -2.390465 -0.660916 -1.140535
4  2.157401  0.144803  1.709412 -1.290725
5 -0.501878 -0.548913  1.047481  0.066022
6 -0.991660 -1.746597  0.492355  0.080778
-- pieces 7: --
          0         1         2         3
7 -0.463837 -0.106896  0.906937  0.733530
8  2.076344  0.820683 -1.356686  1.620598
9 -0.412320 -0.279609  0.191564 -0.771300
-- concatted_pieces --
          0         1         2         3
0  0.356967  0.446067 -0.951542  0.045307
1 -0.013618  0.747423 -0.414012  1.099421
2 -0.261020 -1.180841  1.033768  0.355981
3  0.896978 -2.390465 -0.660916 -1.140535
4  2.157401  0.144803  1.709412 -1.290725
5 -0.501878 -0.548913  1.047481  0.066022
6 -0.991660 -1.746597  0.492355  0.080778
7 -0.463837 -0.106896  0.906937  0.733530
8  2.076344  0.820683 -1.356686  1.620598
9 -0.412320 -0.279609  0.191564 -0.771300


  • 参加する
left = pd.DataFrame({
    "key":["foo", "foo"], 
    "lval": [1, 2]
})
right = pd.DataFrame({
    "key": ["foo", "foo"],
    "rval": [4, 5]
})
_merge = pd.merge(left, right, on="key")

print("-- left --")
print(left)
print("-- right --")
print(right)
print("-- _merge --")
print(_merge)
-- left --
   key  lval
0  foo     1
1  foo     2
-- right --
   key  rval
0  foo     4
1  foo     5
-- _merge --
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5

グループ化

セクション:グループ化:分割-適用-結合

グループ化

使用するデータ

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
     A      B         C         D
0  foo    one  0.041960  1.062586
1  bar    one  0.839786  1.263725
2  foo    two -0.143314  0.002854
3  bar  three -0.437049  0.589962
4  foo    two -0.249942  0.056087
5  bar    two -0.653513  0.231411
6  foo    one  0.867870  2.160475
7  foo  three -0.214342 -0.502674


  • 列ラベルの集計
# -- existing code --
sum_c_d_grouping_by_a = df.groupby("A")[["C", "D"]].sum()

print("-- sum_c_d_grouping_by_a --")
print(sum_c_d_grouping_by_a)
-- sum_c_d_grouping_by_a --
            C         D
A                      
bar -0.578867  1.482711
foo -0.773802  0.127150


  • 複数ラベルの集計
# -- existing code --
sum_grouping_by_a_b = df.groupby(["A", "B"]).sum()

print("-- sum_grouping_by_a_b --")
print(sum_grouping_by_a_b)
-- sum_grouping_by_a_b --
                  C         D
A   B                        
bar one   -0.734985  0.212720
    three  0.863857  0.051750
    two    0.665720 -2.080695
foo one    2.951221 -0.337802
    three -0.980548 -0.091998
    two    0.026546  1.686402

再形成

セクション:

再形成

使用するデータ(スタック)

import numpy as np
import pandas as pd

arrays = [
   ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(
    arrays,
    names=["first", "second"]
)
df = pd.DataFrame(
    np.random.randn(8, 2),
    index=index,
    columns=["A", "B"]
)
df2 = df[:4]

print("-- index --")
print(index)
print("-- df --")
print(df)
print("-- df2 --")
print(df2)
-- index --
MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])
-- df --
                     A         B
first second                    
bar   one     0.457522  1.331643
      two    -0.701495 -1.696649
baz   one    -0.339136 -0.307470
      two    -0.140431 -2.381103
foo   one     1.420993  1.807942
      two     0.362635  0.481874
qux   one    -0.044897  0.042085
      two     0.929983 -1.466763
-- df2 --
                     A         B
first second                    
bar   one     0.457522  1.331643
      two    -0.701495 -1.696649
baz   one    -0.339136 -0.307470
      two    -0.140431 -2.381103


  • スタック
# -- existing code --
stacked = df2.stack(future_stack=True)
unstacked = stacked.unstack()

print("-- stacked --")
print(stacked)
print("-- unstacked --")
print(unstacked)
-- stacked --
first  second   
bar    one     A    1.203191
               B   -0.307585
       two     A   -0.604120
               B    1.048764
baz    one     A    1.027104
               B    1.808218
       two     A    1.716391
               B    0.359411
dtype: float64
-- unstacked --
                     A         B
first second                    
bar   one     1.203191 -0.307585
      two    -0.604120  1.048764
baz   one     1.027104  1.808218
      two     1.716391  0.359411


  • ピボットテーブル
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
_pivot_table = pd.pivot_table(
    df, 
    values="D", 
    index=["A", "B"], 
    columns=["C"]
)

print("-- df --")
print(df)
print("-- _pivot_table --")
print(_pivot_table)
-- df --
        A  B    C         D         E
0     one  A  foo -2.201357  1.022021
1     one  B  foo -1.935201  0.141118
2     two  C  foo  0.347313  0.202710
3   three  A  bar -0.729666 -1.091237
4     one  B  bar -1.613063 -0.142726
5     one  C  bar  0.569315  0.223944
6     two  A  foo -1.760303 -1.771394
7   three  B  foo  1.179379  0.098726
8     one  C  foo -0.543498  0.093197
9     one  A  bar  0.282582 -2.144519
10    two  B  bar  0.283832 -1.631549
11  three  C  bar -0.478173  0.744416
-- _pivot_table --
C             bar       foo
A     B                    
one   A  0.282582 -2.201357
      B -1.613063 -1.935201
      C  0.569315 -0.543498
three A -0.729666       NaN
      B       NaN  1.179379
      C -0.478173       NaN
two   A       NaN -1.760303
      B  0.283832       NaN
      C       NaN  0.347313

時系列

セクション:時系列/日付機能

時系列

:::message error
バージョン 2.0.0 以降では非推奨です:代わりに frame.T.resample(…) を使用してください。
:::

  • 時系列データの再サンプリング
import numpy as np
import pandas as pd

rng = pd.date_range(
    "1/1/2012",
    periods=100,
    freq="s"
)
ts = pd.Series(
    np.random.randint(0, 500, len(rng)),
    index=rng
)
ts_resample = ts.resample("5Min").sum()

print("-- rng --")
print(rng)
print("-- ts --")
print(ts)
print("-- ts_resample --")
print(ts_resample)
-- rng --
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-01 00:00:32', '2012-01-01 00:00:33',
               '2012-01-01 00:00:34', '2012-01-01 00:00:35',
               '2012-01-01 00:00:36', '2012-01-01 00:00:37',
               '2012-01-01 00:00:38', '2012-01-01 00:00:39',
               '2012-01-01 00:00:40', '2012-01-01 00:00:41',
               '2012-01-01 00:00:42', '2012-01-01 00:00:43',
               '2012-01-01 00:00:44', '2012-01-01 00:00:45',
               '2012-01-01 00:00:46', '2012-01-01 00:00:47',
               '2012-01-01 00:00:48', '2012-01-01 00:00:49',
               '2012-01-01 00:00:50', '2012-01-01 00:00:51',
               '2012-01-01 00:00:52', '2012-01-01 00:00:53',
               '2012-01-01 00:00:54', '2012-01-01 00:00:55',
               '2012-01-01 00:00:56', '2012-01-01 00:00:57',
               '2012-01-01 00:00:58', '2012-01-01 00:00:59',
               '2012-01-01 00:01:00', '2012-01-01 00:01:01',
               '2012-01-01 00:01:02', '2012-01-01 00:01:03',
               '2012-01-01 00:01:04', '2012-01-01 00:01:05',
               '2012-01-01 00:01:06', '2012-01-01 00:01:07',
               '2012-01-01 00:01:08', '2012-01-01 00:01:09',
               '2012-01-01 00:01:10', '2012-01-01 00:01:11',
               '2012-01-01 00:01:12', '2012-01-01 00:01:13',
               '2012-01-01 00:01:14', '2012-01-01 00:01:15',
               '2012-01-01 00:01:16', '2012-01-01 00:01:17',
               '2012-01-01 00:01:18', '2012-01-01 00:01:19',
               '2012-01-01 00:01:20', '2012-01-01 00:01:21',
               '2012-01-01 00:01:22', '2012-01-01 00:01:23',
               '2012-01-01 00:01:24', '2012-01-01 00:01:25',
               '2012-01-01 00:01:26', '2012-01-01 00:01:27',
               '2012-01-01 00:01:28', '2012-01-01 00:01:29',
               '2012-01-01 00:01:30', '2012-01-01 00:01:31',
               '2012-01-01 00:01:32', '2012-01-01 00:01:33',
               '2012-01-01 00:01:34', '2012-01-01 00:01:35',
               '2012-01-01 00:01:36', '2012-01-01 00:01:37',
               '2012-01-01 00:01:38', '2012-01-01 00:01:39'],
              dtype='datetime64[ns]', freq='s')
-- ts --
2012-01-01 00:00:00    315
2012-01-01 00:00:01    296
2012-01-01 00:00:02     25
2012-01-01 00:00:03    309
2012-01-01 00:00:04    410
                      ... 
2012-01-01 00:01:35    228
2012-01-01 00:01:36      8
2012-01-01 00:01:37    107
2012-01-01 00:01:38     22
2012-01-01 00:01:39    155
Freq: s, Length: 100, dtype: int64
-- ts_resample --
2012-01-01    23725
Freq: 5min, dtype: int64


  • 時系列をタイムゾーンにローカライズ
import pandas as pd

rng = pd.date_range(
    "3/6/2012 00:00", 
    periods=5, 
    freq="D"
)
ts = pd.Series(
    np.random.randn(len(rng)), 
    rng
)
ts_utc = ts.tz_localize("UTC")

print("-- rng --")
print(rng)
print("-- ts --")
print(ts)
print("-- ts_utc --")
print(ts_utc)
-- rng --
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')
-- ts --
2012-03-06    0.371665
2012-03-07    0.162164
2012-03-08   -0.083604
2012-03-09   -0.232782
2012-03-10    0.028668
Freq: D, dtype: float64
-- ts_utc --
2012-03-06 00:00:00+00:00    0.371665
2012-03-07 00:00:00+00:00    0.162164
2012-03-08 00:00:00+00:00   -0.083604
2012-03-09 00:00:00+00:00   -0.232782
2012-03-10 00:00:00+00:00    0.028668
Freq: D, dtype: float64


  • タイムゾーンの変換
# -- existing code --
ts_utc_to_us = ts_utc.tz_convert("US/Eastern")

print("-- ts_utc_to_us --")
print(ts_utc_to_us)
-- ts_utc_to_us --
2012-03-05 19:00:00-05:00   -0.546822
2012-03-06 19:00:00-05:00   -0.614345
2012-03-07 19:00:00-05:00   -1.304176
2012-03-08 19:00:00-05:00   -1.018010
2012-03-09 19:00:00-05:00    3.066178
Freq: D, dtype: float64

カテゴリカル

セクション:カテゴリデータ

カテゴリカル
  • カテゴリ型へキャスト
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6], 
    "raw_grade": ["a", "b", "b", "a", "a", "e"]
})
df["grade"] = df["raw_grade"].astype("category")

print("-- df --")
print(df)
print("-- grade --")
print(df["grade"])
-- df --
   id raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         e     e
-- grade --
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']


  • カテゴリ名の変更
# -- existing code --
new_categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.rename_categories(new_categories)

print("-- grade --")
print(df["grade"])
-- grade --
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']


  • 新規カテゴリ名への変更
# -- existing code --
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
)

print("-- grade --")
print(df["grade"])
-- grade --
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']


  • カテゴリ名によるソート
# -- existing code --
sorted_grade = df.sort_values(by="grade")

print("-- sorted_grade --")
print(sorted_grade)
-- sorted_grade --
   id raw_grade      grade
0   1         a  very good
3   4         a  very good
4   5         a  very good
1   2         b       good
2   3         b       good
5   6         e   very bad


  • カテゴリ列の集計
# -- existing code --
group_by_grade = df.groupby("grade", observed=False).size()

print("-- group_by_grade --")
print(group_by_grade)
-- group_by_grade --
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

Discussion