👏
【Python - pandas】ユーザーガイド - パンダまであと10分(2/2)
パンダまであと10分
前編
【Python - pandas】ユーザーガイド - パンダまであと10分(1/2)
マージ
セクション:マージ、結合、連結、比較
マージ
- 連結
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [
df[:3],
df[3:7],
df[7:]
]
concatted_pieces = pd.concat(pieces)
print("-- df --")
print(df)
print("-- pieces :3 --")
print(df[:3])
print("-- pieces 3:7 --")
print(df[3:7])
print("-- pieces 7: --")
print(df[7:])
print("-- concatted_pieces --")
print(concatted_pieces)
-- df --
0 1 2 3
0 0.356967 0.446067 -0.951542 0.045307
1 -0.013618 0.747423 -0.414012 1.099421
2 -0.261020 -1.180841 1.033768 0.355981
3 0.896978 -2.390465 -0.660916 -1.140535
4 2.157401 0.144803 1.709412 -1.290725
5 -0.501878 -0.548913 1.047481 0.066022
6 -0.991660 -1.746597 0.492355 0.080778
7 -0.463837 -0.106896 0.906937 0.733530
8 2.076344 0.820683 -1.356686 1.620598
9 -0.412320 -0.279609 0.191564 -0.771300
-- pieces :3 --
0 1 2 3
0 0.356967 0.446067 -0.951542 0.045307
1 -0.013618 0.747423 -0.414012 1.099421
2 -0.261020 -1.180841 1.033768 0.355981
-- pieces 3:7 --
0 1 2 3
3 0.896978 -2.390465 -0.660916 -1.140535
4 2.157401 0.144803 1.709412 -1.290725
5 -0.501878 -0.548913 1.047481 0.066022
6 -0.991660 -1.746597 0.492355 0.080778
-- pieces 7: --
0 1 2 3
7 -0.463837 -0.106896 0.906937 0.733530
8 2.076344 0.820683 -1.356686 1.620598
9 -0.412320 -0.279609 0.191564 -0.771300
-- concatted_pieces --
0 1 2 3
0 0.356967 0.446067 -0.951542 0.045307
1 -0.013618 0.747423 -0.414012 1.099421
2 -0.261020 -1.180841 1.033768 0.355981
3 0.896978 -2.390465 -0.660916 -1.140535
4 2.157401 0.144803 1.709412 -1.290725
5 -0.501878 -0.548913 1.047481 0.066022
6 -0.991660 -1.746597 0.492355 0.080778
7 -0.463837 -0.106896 0.906937 0.733530
8 2.076344 0.820683 -1.356686 1.620598
9 -0.412320 -0.279609 0.191564 -0.771300
- 参加する
left = pd.DataFrame({
"key":["foo", "foo"],
"lval": [1, 2]
})
right = pd.DataFrame({
"key": ["foo", "foo"],
"rval": [4, 5]
})
_merge = pd.merge(left, right, on="key")
print("-- left --")
print(left)
print("-- right --")
print(right)
print("-- _merge --")
print(_merge)
-- left --
key lval
0 foo 1
1 foo 2
-- right --
key rval
0 foo 4
1 foo 5
-- _merge --
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
グループ化
セクション:グループ化:分割-適用-結合
グループ化
使用するデータ
import numpy as np
import pandas as pd
df = pd.DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.randn(8),
"D": np.random.randn(8),
}
)
A B C D
0 foo one 0.041960 1.062586
1 bar one 0.839786 1.263725
2 foo two -0.143314 0.002854
3 bar three -0.437049 0.589962
4 foo two -0.249942 0.056087
5 bar two -0.653513 0.231411
6 foo one 0.867870 2.160475
7 foo three -0.214342 -0.502674
- 列ラベルの集計
# -- existing code --
sum_c_d_grouping_by_a = df.groupby("A")[["C", "D"]].sum()
print("-- sum_c_d_grouping_by_a --")
print(sum_c_d_grouping_by_a)
-- sum_c_d_grouping_by_a --
C D
A
bar -0.578867 1.482711
foo -0.773802 0.127150
- 複数ラベルの集計
# -- existing code --
sum_grouping_by_a_b = df.groupby(["A", "B"]).sum()
print("-- sum_grouping_by_a_b --")
print(sum_grouping_by_a_b)
-- sum_grouping_by_a_b --
C D
A B
bar one -0.734985 0.212720
three 0.863857 0.051750
two 0.665720 -2.080695
foo one 2.951221 -0.337802
three -0.980548 -0.091998
two 0.026546 1.686402
再形成
セクション:
再形成
使用するデータ(スタック)
import numpy as np
import pandas as pd
arrays = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(
arrays,
names=["first", "second"]
)
df = pd.DataFrame(
np.random.randn(8, 2),
index=index,
columns=["A", "B"]
)
df2 = df[:4]
print("-- index --")
print(index)
print("-- df --")
print(df)
print("-- df2 --")
print(df2)
-- index --
MultiIndex([('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')],
names=['first', 'second'])
-- df --
A B
first second
bar one 0.457522 1.331643
two -0.701495 -1.696649
baz one -0.339136 -0.307470
two -0.140431 -2.381103
foo one 1.420993 1.807942
two 0.362635 0.481874
qux one -0.044897 0.042085
two 0.929983 -1.466763
-- df2 --
A B
first second
bar one 0.457522 1.331643
two -0.701495 -1.696649
baz one -0.339136 -0.307470
two -0.140431 -2.381103
- スタック
# -- existing code --
stacked = df2.stack(future_stack=True)
unstacked = stacked.unstack()
print("-- stacked --")
print(stacked)
print("-- unstacked --")
print(unstacked)
-- stacked --
first second
bar one A 1.203191
B -0.307585
two A -0.604120
B 1.048764
baz one A 1.027104
B 1.808218
two A 1.716391
B 0.359411
dtype: float64
-- unstacked --
A B
first second
bar one 1.203191 -0.307585
two -0.604120 1.048764
baz one 1.027104 1.808218
two 1.716391 0.359411
- ピボットテーブル
import numpy as np
import pandas as pd
df = pd.DataFrame(
{
"A": ["one", "one", "two", "three"] * 3,
"B": ["A", "B", "C"] * 4,
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
"D": np.random.randn(12),
"E": np.random.randn(12),
}
)
_pivot_table = pd.pivot_table(
df,
values="D",
index=["A", "B"],
columns=["C"]
)
print("-- df --")
print(df)
print("-- _pivot_table --")
print(_pivot_table)
-- df --
A B C D E
0 one A foo -2.201357 1.022021
1 one B foo -1.935201 0.141118
2 two C foo 0.347313 0.202710
3 three A bar -0.729666 -1.091237
4 one B bar -1.613063 -0.142726
5 one C bar 0.569315 0.223944
6 two A foo -1.760303 -1.771394
7 three B foo 1.179379 0.098726
8 one C foo -0.543498 0.093197
9 one A bar 0.282582 -2.144519
10 two B bar 0.283832 -1.631549
11 three C bar -0.478173 0.744416
-- _pivot_table --
C bar foo
A B
one A 0.282582 -2.201357
B -1.613063 -1.935201
C 0.569315 -0.543498
three A -0.729666 NaN
B NaN 1.179379
C -0.478173 NaN
two A NaN -1.760303
B 0.283832 NaN
C NaN 0.347313
時系列
セクション:時系列/日付機能
時系列
:::message error
バージョン 2.0.0 以降では非推奨です:代わりに frame.T.resample(…) を使用してください。
:::
- 時系列データの再サンプリング
import numpy as np
import pandas as pd
rng = pd.date_range(
"1/1/2012",
periods=100,
freq="s"
)
ts = pd.Series(
np.random.randint(0, 500, len(rng)),
index=rng
)
ts_resample = ts.resample("5Min").sum()
print("-- rng --")
print(rng)
print("-- ts --")
print(ts)
print("-- ts_resample --")
print(ts_resample)
-- rng --
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
'2012-01-01 00:00:02', '2012-01-01 00:00:03',
'2012-01-01 00:00:04', '2012-01-01 00:00:05',
'2012-01-01 00:00:06', '2012-01-01 00:00:07',
'2012-01-01 00:00:08', '2012-01-01 00:00:09',
'2012-01-01 00:00:10', '2012-01-01 00:00:11',
'2012-01-01 00:00:12', '2012-01-01 00:00:13',
'2012-01-01 00:00:14', '2012-01-01 00:00:15',
'2012-01-01 00:00:16', '2012-01-01 00:00:17',
'2012-01-01 00:00:18', '2012-01-01 00:00:19',
'2012-01-01 00:00:20', '2012-01-01 00:00:21',
'2012-01-01 00:00:22', '2012-01-01 00:00:23',
'2012-01-01 00:00:24', '2012-01-01 00:00:25',
'2012-01-01 00:00:26', '2012-01-01 00:00:27',
'2012-01-01 00:00:28', '2012-01-01 00:00:29',
'2012-01-01 00:00:30', '2012-01-01 00:00:31',
'2012-01-01 00:00:32', '2012-01-01 00:00:33',
'2012-01-01 00:00:34', '2012-01-01 00:00:35',
'2012-01-01 00:00:36', '2012-01-01 00:00:37',
'2012-01-01 00:00:38', '2012-01-01 00:00:39',
'2012-01-01 00:00:40', '2012-01-01 00:00:41',
'2012-01-01 00:00:42', '2012-01-01 00:00:43',
'2012-01-01 00:00:44', '2012-01-01 00:00:45',
'2012-01-01 00:00:46', '2012-01-01 00:00:47',
'2012-01-01 00:00:48', '2012-01-01 00:00:49',
'2012-01-01 00:00:50', '2012-01-01 00:00:51',
'2012-01-01 00:00:52', '2012-01-01 00:00:53',
'2012-01-01 00:00:54', '2012-01-01 00:00:55',
'2012-01-01 00:00:56', '2012-01-01 00:00:57',
'2012-01-01 00:00:58', '2012-01-01 00:00:59',
'2012-01-01 00:01:00', '2012-01-01 00:01:01',
'2012-01-01 00:01:02', '2012-01-01 00:01:03',
'2012-01-01 00:01:04', '2012-01-01 00:01:05',
'2012-01-01 00:01:06', '2012-01-01 00:01:07',
'2012-01-01 00:01:08', '2012-01-01 00:01:09',
'2012-01-01 00:01:10', '2012-01-01 00:01:11',
'2012-01-01 00:01:12', '2012-01-01 00:01:13',
'2012-01-01 00:01:14', '2012-01-01 00:01:15',
'2012-01-01 00:01:16', '2012-01-01 00:01:17',
'2012-01-01 00:01:18', '2012-01-01 00:01:19',
'2012-01-01 00:01:20', '2012-01-01 00:01:21',
'2012-01-01 00:01:22', '2012-01-01 00:01:23',
'2012-01-01 00:01:24', '2012-01-01 00:01:25',
'2012-01-01 00:01:26', '2012-01-01 00:01:27',
'2012-01-01 00:01:28', '2012-01-01 00:01:29',
'2012-01-01 00:01:30', '2012-01-01 00:01:31',
'2012-01-01 00:01:32', '2012-01-01 00:01:33',
'2012-01-01 00:01:34', '2012-01-01 00:01:35',
'2012-01-01 00:01:36', '2012-01-01 00:01:37',
'2012-01-01 00:01:38', '2012-01-01 00:01:39'],
dtype='datetime64[ns]', freq='s')
-- ts --
2012-01-01 00:00:00 315
2012-01-01 00:00:01 296
2012-01-01 00:00:02 25
2012-01-01 00:00:03 309
2012-01-01 00:00:04 410
...
2012-01-01 00:01:35 228
2012-01-01 00:01:36 8
2012-01-01 00:01:37 107
2012-01-01 00:01:38 22
2012-01-01 00:01:39 155
Freq: s, Length: 100, dtype: int64
-- ts_resample --
2012-01-01 23725
Freq: 5min, dtype: int64
- 時系列をタイムゾーンにローカライズ
import pandas as pd
rng = pd.date_range(
"3/6/2012 00:00",
periods=5,
freq="D"
)
ts = pd.Series(
np.random.randn(len(rng)),
rng
)
ts_utc = ts.tz_localize("UTC")
print("-- rng --")
print(rng)
print("-- ts --")
print(ts)
print("-- ts_utc --")
print(ts_utc)
-- rng --
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
'2012-03-10'],
dtype='datetime64[ns]', freq='D')
-- ts --
2012-03-06 0.371665
2012-03-07 0.162164
2012-03-08 -0.083604
2012-03-09 -0.232782
2012-03-10 0.028668
Freq: D, dtype: float64
-- ts_utc --
2012-03-06 00:00:00+00:00 0.371665
2012-03-07 00:00:00+00:00 0.162164
2012-03-08 00:00:00+00:00 -0.083604
2012-03-09 00:00:00+00:00 -0.232782
2012-03-10 00:00:00+00:00 0.028668
Freq: D, dtype: float64
- タイムゾーンの変換
# -- existing code --
ts_utc_to_us = ts_utc.tz_convert("US/Eastern")
print("-- ts_utc_to_us --")
print(ts_utc_to_us)
-- ts_utc_to_us --
2012-03-05 19:00:00-05:00 -0.546822
2012-03-06 19:00:00-05:00 -0.614345
2012-03-07 19:00:00-05:00 -1.304176
2012-03-08 19:00:00-05:00 -1.018010
2012-03-09 19:00:00-05:00 3.066178
Freq: D, dtype: float64
カテゴリカル
セクション:カテゴリデータ
カテゴリカル
- カテゴリ型へキャスト
import numpy as np
import pandas as pd
df = pd.DataFrame({
"id": [1, 2, 3, 4, 5, 6],
"raw_grade": ["a", "b", "b", "a", "a", "e"]
})
df["grade"] = df["raw_grade"].astype("category")
print("-- df --")
print(df)
print("-- grade --")
print(df["grade"])
-- df --
id raw_grade grade
0 1 a a
1 2 b b
2 3 b b
3 4 a a
4 5 a a
5 6 e e
-- grade --
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']
- カテゴリ名の変更
# -- existing code --
new_categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.rename_categories(new_categories)
print("-- grade --")
print(df["grade"])
-- grade --
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']
- 新規カテゴリ名への変更
# -- existing code --
df["grade"] = df["grade"].cat.set_categories(
["very bad", "bad", "medium", "good", "very good"]
)
print("-- grade --")
print(df["grade"])
-- grade --
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']
- カテゴリ名によるソート
# -- existing code --
sorted_grade = df.sort_values(by="grade")
print("-- sorted_grade --")
print(sorted_grade)
-- sorted_grade --
id raw_grade grade
0 1 a very good
3 4 a very good
4 5 a very good
1 2 b good
2 3 b good
5 6 e very bad
- カテゴリ列の集計
# -- existing code --
group_by_grade = df.groupby("grade", observed=False).size()
print("-- group_by_grade --")
print(group_by_grade)
-- group_by_grade --
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
Discussion