pd.to_datetimeでtimestampにした列を期間指定して絞りたいとき

dateに揃えて日付で絞る

import datetime
import pandas as pd

# Parse the raw column into pandas datetime values.
df['DateTime64'] = pd.to_datetime(df['DateTimeObject'])

# Inclusive calendar-date bounds of the period to keep.
start = datetime.date(2020, 7, 1)
end = datetime.date(2021, 6, 30)

# Reduce each timestamp to a plain date once, then test both bounds.
dates = df['DateTime64'].dt.date
in_period = (dates >= start) & (dates <= end)
df_tmp = df[in_period]

antyuntyun 2021/07/31

特定列の値でdataframeを分割

# One group per distinct value of the 'category' column.
grouped_df = df.groupby('category')

# Each iteration yields the group key and the sub-DataFrame for that key.
for key, part in grouped_df:
    print(f'split value: {key}', part, sep='\n')

antyuntyun 2021/07/31

ちゃんとキャストしたいときは欠損値を埋める

カラムに欠損値がある場合、欠損している行を取り除くか埋めてからキャストしないと、ちゃんとキャストされない。
整数と欠損を含む列を文字列として読み込んでも、キャストしても小数点以下を延々と表示し続けたりするので、欠損値処理をさぼらないようにしよう。

antyuntyun 2021/11/02に更新

便利lambda

case when

def func_categorize(x):
    """Map a numeric value to a category label, like SQL ``CASE WHEN``.

    Returns 'A' when x equals 1, 'B' when 2 <= x < 5, and 'C' when x >= 5.
    Any other value (below 1, strictly between 1 and 2, NaN) yields None.
    """
    if x == 1:
        return 'A'
    if 2 <= x < 5:
        return 'B'
    if 5 <= x:
        return 'C'
    # No branch matched: keep the implicit "no category" result explicit.
    return None

# Apply func_categorize to every value in 'count' and store the labels in a new column.
df['count_category'] = df['count'].apply(func_categorize)

antyuntyun 2021/08/15に更新

データ確認テンプレ

# Number of records.
print(f'df len: {len(df)}')

# Coverage of the time column: missing count, then earliest and latest value.
time_column_name = 'time'
time_values = df[time_column_name]
print(f'nat: {time_values.isnull().sum()}')
print(f'min_time: {time_values.min()}')
print(f'max_time: {time_values.max()}')

antyuntyun 2021/08/15

# Count missing values per column.
df.isnull().sum()

antyuntyun 2021/07/31に更新

特定文字列をハッシュ化

import hashlib

# Replace each string in 'secret' with the hex SHA-256 digest of its encoded bytes.
# str.encode() uses UTF-8 by default; every value must be a str (missing values would fail).
df['secret'] = df['secret'].apply(lambda x: hashlib.sha256(x.encode()).hexdigest())

antyuntyun 2021/07/31

グラフ設定

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Enlarge seaborn's fonts, then create one wide figure/axes pair.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_xlabel('xlabel')
ax.set_ylabel('ylabel')


def _with_thousands_separator(value, pos):
    """Render a tick value as an integer with comma thousands separators."""
    return f"{int(value):,}"


# Show y-axis tick labels with thousands separators.
ax.yaxis.set_major_formatter(plt.FuncFormatter(_with_thousands_separator))

antyuntyun 2021/08/22

散布図とヒストグラムを同時に出すjointplot

# Set style and font scale in one call: a separate sns.set(...) issued after
# set_style() re-applies the default style and discards 'whitegrid'.
sns.set(style='whitegrid', font_scale=1.5)
# jointplot is figure-level: it creates its own figure, sized by `height`, so
# no plt.figure() call is needed. x and y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.jointplot(data=df_result, x='x', y='y', kind='reg', height=10)

antyuntyun 2021/08/22

seabornでヒストグラム

antyuntyun 2021/08/22

antyuntyun 2022/04/12

ヒストグラム

sns

# Use a font with Japanese glyphs so CJK labels render.
sns.set(font='Noto Sans CJK JP')
# Create the figure and axes first so every later call targets the same plot.
fig, ax1 = plt.subplots(figsize=(15, 20))
# Draw the histogram on ax1; `label` gives the legend an entry to show.
sns.histplot(data=df, x='column_name', bins=100, color='#123456',
             label='column_name', ax=ax1)
ax1.set_title('graph title')
ax1.set_xlabel('xlabel')
ax1.set_ylabel('ylabel')
ax1.set_xlim(0, 5000)  # visible x-axis range
ax1.legend()  # show the legend
plt.show()  # render last, once the plot is fully configured

plotly

# NOTE(review): assumes `px` is plotly.express, imported elsewhere — confirm.
# matplotlib's plt.figure()/plt.xlim() do not affect a plotly figure, so the
# x-axis range is passed to plotly directly. The figure size can be set with
# the `width`/`height` arguments (in pixels) if needed.
fig = px.histogram(df[df['point'] > 0], x="point", range_x=[0, 5000])
fig.show()

antyuntyun 2021/08/22

集約関数

作成される列名、集計列、集約関数を指定

# Named aggregation: each keyword is an output column name mapped to a
# (source column, aggregation function) pair.
# The group keys were listed as 'columnA' twice; the second key is presumably
# meant to be a different column ('columnB') — adjust to the real column name.
(
    df.groupby(['columnA', 'columnB'], as_index=False)
    .agg(
        count=('columnC', 'count'),
        columnC_mean=('columnC', 'mean'),
        columnC_median=('columnC', 'median'),
        columnC_max=('columnC', 'max'),
        columnC_min=('columnC', 'min'),
        columnC_std=('columnC', 'std'),
    )
)

antyuntyun 2021/11/02に更新

レコードを更新日と特定列で一意にする

create_time列が最新のもののみを残し、colAでdistinct

# Drop rows without colA, order by create_time, then keep only the last
# (latest create_time) row for each colA value.
df_distinct = (
    df.dropna(subset=['colA'])
    .sort_values(by="create_time")
    .drop_duplicates(subset='colA', keep='last')
)

antyuntyun 2021/11/06

def fill_missing_values(df):
    """Fill missing values in int/float columns with that column's mean.

    Args:
        df: DataFrame to fill. It is modified in place.

    Returns:
        The same DataFrame object, with missing numeric values replaced.
    """
    for col in df.select_dtypes(include=["int", "float"]).columns:
        # Assign the result back instead of df[col].fillna(..., inplace=True):
        # the chained in-place form operates on a temporary object and does not
        # update df under pandas Copy-on-Write.
        df[col] = df[col].fillna(df[col].mean())
    return df

antyuntyun 2022/03/03

複数ファイルをマージ

# glob returns paths in arbitrary order; sort so the merged row order is reproducible.
all_files = sorted(glob.glob(f'{basic_path}/*.csv'))
# Read every CSV without rebinding the shared name `df` on each iteration.
list_df = [pd.read_csv(filename) for filename in tqdm(all_files)]

# Stack all frames vertically and renumber the index from 0.
# pd.concat raises ValueError when no CSV file matched.
df_merge = pd.concat(list_df, axis=0, ignore_index=True)
df_merge.head()