ProbSpace pollen counts (PVS 2nd-Code)
ProbSpace Pollen Dispersal Forecast (MODEL_STACK)
Author: saru_da_mon (AI-FOX)
License for this code
This code is released under the MIT License.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Code overview
Note: the mathematical model is described in the comment section of this post.
The code starts here.
Library imports
import numpy as np
import pandas as pd
import math
from datetime import datetime,date,timedelta
from matplotlib import pyplot as plt
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import StandardScaler
from scipy import signal
import os
import random
from keras import backend as K
# Swish activation function
def swish(x):
    return x * K.sigmoid(x)
# Suppress unnecessary warnings
import warnings
warnings.simplefilter('ignore')
# Display settings
pd.set_option("display.max_columns",600)
pd.set_option("display.max_rows",600)
plt.style.use('ggplot')
%matplotlib inline
Settings
# MIX (blend) weights
seed_lgb0 = 747
seed_xgb0 = 818
seed_toy0 = 911
seed_lgb1 = 990
a = 1
b = 1
c = 1
# Outlier clipping thresholds
THR_H = 0.9999
THR_L = 0.0001
YMIN = 3e-2
def ymin_log(x):
return np.log(x+YMIN)
def ymin_exp(x):
return np.exp(x)-YMIN
Fixing the random seeds
#!pip install tensorflow-determinism
def set_seed(seed):
tf.random.set_seed(seed)
# optional
# for numpy.random
np.random.seed(seed)
# for built-in random
random.seed(seed)
# for hash seed
os.environ["PYTHONHASHSEED"] = str(seed)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(0)
Data loading
# Load the data
train = pd.read_csv('./data/train_v2.csv')
test = pd.read_csv('./data/test_v2.csv')
len_train = train.shape[0]
print(train.shape)
print(test.shape)
# Extract year, month, day, and hour from the 'datetime' column
df = pd.concat([train, test]).reset_index(drop=True)
df['time'] = pd.to_datetime(df.datetime.astype(str).str[:-2])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day
df['hour'] = df.datetime.astype(str).str[-2:].astype(int)
df.head(3)
df.columns
# The 'time' column is no longer needed, so keep only the columns below
col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo',
'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo',
'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo',
'windspeed_utsunomiya', 'windspeed_chiba','windspeed_tokyo',
'pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo',
'year', 'month', 'day', 'hour']
df = df[col]
df
# Interpolate values that were recorded as missing
for name in df.columns:
if 'pollen' in name:
df[name] = df[name].apply(lambda x: np.nan if x < -9000 else x)
df[name] = df[name].replace({'欠測':np.nan})
df[name] = df[name].interpolate('ffill', limit_direction='forward')
if df[name].dtype == 'object':
df[name] = df[name].astype('float')
print(name, df[name].dtype)
df_copy = df.copy()
df_copy.shape
Airborne pollen: mathematical model definition
def CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3):
df_pollen = []
for i in range(len(df_1.T)):
if (i-S_3) > 0:
y_sugi = S_0*((i-S_3)/24)**S_1 * np.exp(-S_2*(i-S_3)/24)
if (i-H_3) > 0:
y_hinoki = H_0*((i-H_3)/24)**H_1 * np.exp(-H_2*(i-H_3)/24)
else:
y_hinoki = 0
else:
y_sugi = 0
y_hinoki = 0
y = Y_0 + y_sugi + y_hinoki
df_pollen.append(y)
df_h = np.abs(signal.hilbert(np.array(df_1.T)[:int(59*24)]))
sum_abs = 0
for i in range(len(df_h)):
if df_h[i] > 0:
sum_abs += abs(df_pollen[i]-df_h[i])
return sum_abs #math.sqrt(sum_abs)
def objective(trial):
S_0 = trial.suggest_float('S_0', 0.0, 0.001)
S_1 = trial.suggest_float('S_1', SP_1, SP_1)
S_2 = trial.suggest_float('S_2', SP_2, SP_2)
S_3 = trial.suggest_int('S_3', 180, 300)
H_0 = trial.suggest_float('H_0', S_0*P_b, S_0*P_t)
H_1 = trial.suggest_float('H_1', S_1, S_1)
H_2 = trial.suggest_float('H_2', S_2, S_2)
H_3 = trial.suggest_int('H_3', 840, 1080)
return CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3)
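# Note: the suggest_float calls whose lower and upper bounds are equal (S_1, S_2, H_1, H_2)
# pin those parameters to the preset values, so Optuna effectively searches only S_0, S_3, H_0 and H_3.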
Fitting the mathematical model
region_col = ['utsunomiya','chiba','tokyo']
df_0 = pd.DataFrame()
for name in region_col:
if name == 'utsunomiya':
# Mathematical-model parameters
Y_0 = df[:len_train].pollen_utsunomiya.quantile(0.5) # bias
SP_1 = 5.9
SP_2 = 0.17
P_t = 0.70 # upper bound of hinoki relative to sugi
P_b = 0.50 # lower bound of hinoki relative to sugi
elif name == 'chiba':
# Mathematical-model parameters
Y_0 = df[:len_train].pollen_chiba.quantile(0.5)
SP_1 = 5.9
SP_2 = 0.17
P_t = 0.20
P_b = 0.00
elif name == 'tokyo':
# Mathematical-model parameters
Y_0 = df[:len_train].pollen_tokyo.quantile(0.5)
SP_1 = 5.9
SP_2 = 0.17
P_t = 0.20
P_b = 0.00
for year in [2017,2018,2019,2020]:
df_0 = df[df["year"]==year]
file_name = f"model_{name}_{year}.csv"
df_0[f"pollen_{name}"].to_csv(file_name,index=False)
df_1 = pd.read_csv(file_name)
df_1 = pd.DataFrame(np.array(df_1).reshape(1,df_1.size)).dropna(axis=1)
df_1 = df_1.clip(lower=0)  # clamp negative values to zero
# Run the optimization of the objective function
optuna.logging.disable_default_handler()
study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=51))
study.optimize(objective, n_trials=1000)
S_0 = study.best_params['S_0']
S_1 = study.best_params['S_1']
S_2 = study.best_params['S_2']
S_3 = study.best_params['S_3']
H_0 = study.best_params['H_0']
H_1 = study.best_params['H_1']
H_2 = study.best_params['H_2']
H_3 = study.best_params['H_3']
print('Y_0 = ',f'{Y_0:.7f}')
print('S_0 = ',f'{S_0:.7f}')
print('S_1 = ',f'{S_1:.7f}')
print('S_2 = ',f'{S_2:.7f}')
print('S_3 = ',f'{S_3:.7f}')
print('H_0 = ',f'{H_0:.7f}')
print('H_1 = ',f'{H_1:.7f}')
print('H_2 = ',f'{H_2:.7f}')
print('H_3 = ',f'{H_3:.7f}')
print('CAL = ',CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3))
df_pollen = []
for i in range(len(df_1.T)):
if (i-S_3) > 0:
y_sugi = S_0*((i-S_3)/24)**S_1 * np.exp(-S_2*(i-S_3)/24)
if (i-H_3) > 0:
y_hinoki = H_0*((i-H_3)/24)**H_1 * np.exp(-H_2*(i-H_3)/24)
else:
y_hinoki = 0
else:
y_sugi = 0
y_hinoki = 0
y = Y_0 + y_sugi + y_hinoki
df_pollen.append(y)
df_pollen.append(i-S_3)
df_pollen.append(i-H_3)
df_pollen = np.array(df_pollen).reshape(int(len(df_pollen)/3),3)
(pd.concat([df_1.T, pd.DataFrame(df_pollen)],axis=1)).to_csv(file_name,index=False,header=None)
plt.rcParams["font.size"] = 16
plt.figure(figsize=[18,10])
plt.plot(np.array(df_pollen), marker="+", label='model')
plt.plot(np.array(df_1.T), marker="+", label='pollen')
plt.ticklabel_format(style='plain',axis='y')
plt.title(file_name)
# Axis labels
plt.ylabel('Pollen')
plt.xlabel('hours')
plt.xlim(0,3000)
plt.ylim(0,1600)
# Legend
plt.legend()
plt.show()
Feature generation (1)
# Zero-precipitation counter
def zero_count(df,alpha=0.005):
df_count = []
n_count = 0
for i in range(len(df)):
if df[i] < 0.5:
n_count += 1
else:
n_count = 0
df_count.append(n_count)
df_count = np.tanh(np.array(df_count)*alpha)
return df_count
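As a quick illustration (hypothetical values, not taken from the dataset), zero_count turns a precipitation series into a saturating "hours since the last rain" signal:
# Hypothetical input: three dry hours, one hour of rain, then dry again.
# The counter resets whenever precipitation is 0.5 mm or more, and
# tanh(count * alpha) squashes long dry spells toward 1.
example = pd.Series([0.0, 0.0, 0.0, 1.2, 0.0, 0.0])
print(zero_count(example, alpha=0.005))
# -> approximately [0.005, 0.01, 0.015, 0.0, 0.005, 0.01]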
# Feature generation
for name in region_col:
# Zero-precipitation counter
df[f'zero_precipitation_{name}'] = zero_count(df[f'precipitation_{name}'],0.005)
# Wind-velocity vector
df[f'sin_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.sin(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
df[f'cos_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.cos(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
df[f'v_sin_{name}'] = df[f'windspeed_{name}']*df[f'sin_{name}']
df[f'v_cos_{name}'] = df[f'windspeed_{name}']*df[f'cos_{name}']
# Inner products with fixed azimuth directions
for az in range(8):
    if az != 0 and az != 4:
        df[f'Azimuth_{az}_inner_product_{name}'] = (df[f'v_cos_{name}']*math.cos(np.pi*az/8)+df[f'v_sin_{name}']*math.sin(np.pi*az/8)).interpolate(limit_direction='both')
        for s in range(1, 9):
            df[f'Azimuth_{az}_inner_product_{name}_shift{s}'] = df[f'Azimuth_{az}_inner_product_{name}'].shift(s).interpolate(limit_direction='both')
# Values 1 to 8 hours earlier
for s in range(1, 9):
    for var in ['precipitation', 'temperature', 'v_sin', 'v_cos']:
        df[f'{var}_{name}_shift{s}'] = df[f'{var}_{name}'].shift(s).interpolate(limit_direction='both')
df[f'precipitation_{name}_delta'] = (df[f'precipitation_{name}']-df[f'precipitation_{name}_shift1']).interpolate(limit_direction='both')
df[f'temperature_{name}_delta'] = (df[f'temperature_{name}']-df[f'temperature_{name}_shift1']).interpolate(limit_direction='both')
df[f'v_sin_{name}_delta'] = (df[f'v_sin_{name}']-df[f'v_sin_{name}_shift1']).interpolate(limit_direction='both')
df[f'v_cos_{name}_delta'] = (df[f'v_cos_{name}']-df[f'v_cos_{name}_shift1']).interpolate(limit_direction='both')
# Lags 1 to 8 hours of the one-hour deltas
for s in range(1, 9):
    for var in ['precipitation', 'temperature', 'v_sin', 'v_cos']:
        df[f'{var}_{name}_delta{s}'] = df[f'{var}_{name}_delta'].shift(s).interpolate(limit_direction='both')
# Rolling means over 1, 3, 5 and 10 days
for label, days in [('', 1), ('3', 3), ('5', 5), ('10', 10)]:
    for var in ['precipitation', 'temperature', 'v_sin', 'v_cos']:
        df[f'{var}_{name}_mean{label}'] = df[f'{var}_{name}'].rolling(days*24).mean().interpolate(limit_direction='both')
df[f'precipitation_{name}_diff'] = (df[f'precipitation_{name}']-df[f'precipitation_{name}_mean10']).interpolate(limit_direction='both')
df[f'temperature_{name}_diff'] = (df[f'temperature_{name}']-df[f'temperature_{name}_mean10']).interpolate(limit_direction='both')
df[f'v_sin_{name}_diff'] = (df[f'v_sin_{name}']-df[f'v_sin_{name}_mean10']).interpolate(limit_direction='both')
df[f'v_cos_{name}_diff'] = (df[f'v_cos_{name}']-df[f'v_cos_{name}_mean10']).interpolate(limit_direction='both')
# Lags 1 to 8 hours of the differences from the 10-day mean
for s in range(1, 9):
    for var in ['precipitation', 'temperature', 'v_sin', 'v_cos']:
        df[f'{var}_{name}_diff{s}'] = df[f'{var}_{name}_diff'].shift(s).interpolate(limit_direction='both')
df = df.drop(f'sin_{name}',axis=1)
df = df.drop(f'cos_{name}',axis=1)
df = df.drop(f'windspeed_{name}',axis=1)
df = df.drop(f'winddirection_{name}',axis=1)
Additional features and target extraction (1)
# Additional features and target extraction (1)
for name in region_col:
df_3 = pd.DataFrame()
for year in [2017,2018,2019,2020]:
file_name = f"model_{name}_{year}.csv"
df_3 = pd.concat([df_3,pd.read_csv(file_name,header=None)])
file_name = f"model_{name}.csv"
df_3.to_csv(file_name,index=False)
df_3 = df_3.reset_index()
# Add the model pollen amount and the counts since the dispersal start dates as features
df[f"pollen_model_{name}"] = df_3[1]
df[f"count_sugi_{name}"] = df_3[2]
df[f"count_hinoki_{name}"] = df_3[3]
# Target variable
df_3 = ymin_log(np.log1p(np.log1p(df_3[0]/df_3[1])))
file_name = f"model_{name}_target.csv"
df_3.to_csv(file_name,index=False)
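For reference, a minimal sketch of the target transform and its inverse (pollen and model_value below are hypothetical placeholder values, not variables from this notebook). The target is log(log1p(log1p(pollen/model)) + YMIN), and the prediction cells later invert it with expm1(expm1(ymin_exp(...))) before multiplying back by the model value:
# Forward transform (as used above) and its inverse (as used at prediction time)
pollen, model_value = 120.0, 80.0           # hypothetical observed count and model value
target = ymin_log(np.log1p(np.log1p(pollen / model_value)))
recovered = np.expm1(np.expm1(ymin_exp(target))) * model_value
print(round(recovered, 6))                  # -> 120.0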
Preprocessing (1)
# Split the feature data into train and test
train=df[0:len_train]
test=df[len_train:]
# Drop unneeded columns
d_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo', 'year', 'month', 'day']
month = train[['year','month','day']]
train = train[~((train['month']==2)&(train['day']<11))].reset_index(drop=True)
# display(train)
train_x = train.drop(columns=d_col)
test_x = test.drop(columns=d_col)
# One-hot encode the hour column
train_x['hour'] = train_x['hour'].astype('str')
test_x['hour'] = test_x['hour'].astype('str')
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
print(train_x.shape)
print(test_x.shape)
# Restrict the feature set to each region
train_x_u = train_x
test_x_u = test_x
for col in train_x.columns:
if 'utsunomiya' not in col:
train_x_u = train_x_u.drop(col,axis=1)
test_x_u = test_x_u.drop(col,axis=1)
train_x_c = train_x
test_x_c = test_x
for col in train_x.columns:
if ('chiba' not in col): #and ('tokyo' not in col):
train_x_c = train_x_c.drop(col,axis=1)
test_x_c = test_x_c.drop(col,axis=1)
train_x_t = train_x
test_x_t = test_x
for col in train_x.columns:
if ('tokyo' not in col): # and ('utsunomiya' not in col):
train_x_t = train_x_t.drop(col,axis=1)
test_x_t = test_x_t.drop(col,axis=1)
# Target variables for training
train_y_u = pd.read_csv("model_utsunomiya_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_c = pd.read_csv("model_chiba_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_t = pd.read_csv("model_tokyo_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_u = pd.concat([month,train_y_u],axis=1)
train_y_c = pd.concat([month,train_y_c],axis=1)
train_y_t = pd.concat([month,train_y_t],axis=1)
train_y_u = train_y_u[~((train_y_u['month']==2)&(train_y_u['day']<11))].reset_index(drop=True)
train_y_c = train_y_c[~((train_y_c['month']==2)&(train_y_c['day']<11))].reset_index(drop=True)
train_y_t = train_y_t[~((train_y_t['month']==2)&(train_y_t['day']<11))].reset_index(drop=True)
len_train_2017 = train_y_u[train_y_u['year']==2017].shape[0]
len_train_2018 = train_y_u[train_y_u['year']==2018].shape[0]
len_train_2019 = train_y_u[train_y_u['year']==2019].shape[0]
len_train_2020 = train_y_u[train_y_u['year']==2020].shape[0]
train_y_u = train_y_u.drop(['year','month','day'],axis=1)
train_y_c = train_y_c.drop(['year','month','day'],axis=1)
train_y_t = train_y_t.drop(['year','month','day'],axis=1)
print(len_train_2017)
print(len_train_2018)
print(len_train_2019)
print(len_train_2020)
# Store the mathematical-model values for the test period
test_u = pd.read_csv("model_utsunomiya.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_c = pd.read_csv("model_chiba.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_t = pd.read_csv("model_tokyo.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
# Clip target outliers
threshold_h = train_y_u[0].quantile(THR_H)
threshold_l = train_y_u[0].quantile(THR_L)
train_y_u = train_y_u[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_c[0].quantile(THR_H)
threshold_l = train_y_c[0].quantile(THR_L)
train_y_c = train_y_c[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_t[0].quantile(THR_H)
threshold_l = train_y_t[0].quantile(THR_L)
train_y_t = train_y_t[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
LightGBM cross-validation definition
def kFold_LightGBM(n_splits,train,target,test,seed=23):
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
X = train
y = target
# Store the results of each fold
preds_lgb = []
preds_val = []
val_indx = []
for i, (train_index, valid_index) in enumerate(kf.split(X, y)):
X_train = X.iloc[train_index]
y_train = y.iloc[train_index]
X_valid = X.iloc[valid_index]
y_valid = y.iloc[valid_index]
lgb_train = lgb.Dataset(X_train.values, y_train)
lgb_valid = lgb.Dataset(X_valid.values, y_valid)
# Parameters
lgb_params = {
'task': 'train', # training task
'boosting_type': 'gbdt', # gbdt, goss, or dart
'objective': 'regression', # regression
'metric': 'rmse', # evaluation metric for regression
# 'lambda_l1': 0.10,
'lambda_l2': 0.35,
'num_iterations': 9000,
'max_depth': -1,
'learning_rate': 0.05,
# 'extra_trees': True,
}
model = lgb.train(lgb_params,
lgb_train,
valid_sets=lgb_valid,
num_boost_round=9999,
verbose_eval=200,
early_stopping_rounds=30)
# Evaluate on the validation fold
valid_pred = model.predict(X_valid)
score = MAE(y_valid,valid_pred)
# Predict on the test data
test_pred = model.predict(test, num_iteration=model.best_iteration)
# Store this fold's results
preds_val.append(valid_pred)
val_indx.append(valid_index)
preds_lgb.append(test_pred)
print(f'Fold{i}:', score)
# Plot the feature importance
lgb.plot_importance(model, figsize=(12, 8))
plt.show()
# Show the feature importance table
importance = pd.DataFrame(model.feature_importance(), index=X_train.columns, columns=['importance'])
display(importance.sort_values(by='importance',ascending=False))
val_indx = np.concatenate(val_indx)
preds_val = np.concatenate(preds_val)
order = np.argsort(val_indx)
pred_train = preds_val[order]
return pred_train, preds_lgb
XGBoost cross-validation definition
def kFold_xgboost(n_splits,train,target,test,seed=87):
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
X = train
y = target
# Store the results of each fold
preds_xgb =[]
preds_val = []
val_indx = []
for i, (train_index, valid_index) in enumerate(kf.split(X, y)):
X_train = X.iloc[train_index]
y_train = y.iloc[train_index]
X_valid = X.iloc[valid_index]
y_valid = y.iloc[valid_index]
xgb_train = xgb.DMatrix(X_train.values, label=y_train)
xgb_valid = xgb.DMatrix(X_valid.values, label=y_valid)
xgb_test = xgb.DMatrix(test.values)
# Parameters
xgb_params = {
# regression objective
'objective': 'reg:linear',
'eval_metric': 'rmse',
'max_depth': 16,
# 'alpha': 0.10,
'lambda': 0.35,
'eta':0.1,
}
evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]
model =xgb.train(xgb_params,
xgb_train,
verbose_eval=200, # print evaluation results every 200 iterations
num_boost_round=2000, # maximum number of boosting rounds
early_stopping_rounds=30,
evals=evals )
# Evaluate on the validation fold
valid_pred = model.predict(xgb_valid)
score = MAE(y_valid,valid_pred)
# Predict on the test data
test_pred = model.predict(xgb_test)
# Store this fold's results
preds_val.append(valid_pred)
val_indx.append(valid_index)
preds_xgb.append(test_pred)
print(f'Fold{i}:', score)
val_indx = np.concatenate(val_indx)
preds_val = np.concatenate(preds_val)
order = np.argsort(val_indx)
pred_train = preds_val[order]
return pred_train, preds_xgb
Cross-validation with LightGBM (1)
train_ul,preds_ul = kFold_LightGBM(4, train_x_u, train_y_u, test_x_u,seed=seed_lgb0)
train_cl,preds_cl = kFold_LightGBM(4, train_x_c, train_y_c, test_x_c,seed=seed_lgb0)
train_tl,preds_tl = kFold_LightGBM(4, train_x_t, train_y_t, test_x_t,seed=seed_lgb0)
Cross-validation with XGBoost (1)
train_ux,preds_ux = kFold_xgboost(4, train_x_u, train_y_u, test_x_u,seed=seed_xgb0)
train_cx,preds_cx = kFold_xgboost(4, train_x_c, train_y_c, test_x_c,seed=seed_xgb0)
train_tx,preds_tx = kFold_xgboost(4, train_x_t, train_y_t, test_x_t,seed=seed_xgb0)
Prediction results (1)
submission_ul = pd.DataFrame(preds_ul).mean(axis=0)
submission_cl = pd.DataFrame(preds_cl).mean(axis=0)
submission_tl = pd.DataFrame(preds_tl).mean(axis=0)
submission_ux = pd.DataFrame(preds_ux).mean(axis=0)
submission_cx = pd.DataFrame(preds_cx).mean(axis=0)
submission_tx = pd.DataFrame(preds_tx).mean(axis=0)
Take the ensemble average, then apply the inverse (exponential) transform (1)
submission_u = np.expm1(np.expm1(ymin_exp((submission_ul + submission_ux)/2)))
submission_c = np.expm1(np.expm1(ymin_exp((submission_cl + submission_cx)/2)))
submission_t = np.expm1(np.expm1(ymin_exp((submission_tl + submission_tx)/2)))
Round to the minimum unit and store in the submission file (1)
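# Round predictions to the nearest multiple of m_unit and floor negative values at zero before writing the submission.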
m_unit = 4
submission = pd.read_csv('./data/sample_submission.csv')
submission['pollen_utsunomiya'] = m_unit*(submission_u*test_u/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission['pollen_chiba'] = m_unit*(submission_c*test_c/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission['pollen_tokyo'] = m_unit*(submission_t*test_t/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission
submission.to_csv(f"./submit/submit_emsemble_{seed_lgb0:03}{seed_xgb0:03}.csv", index=False, header=True)
Stacking
# Column-stack the two models' predictions plus their squares and cross term (5 features per sample)
train_us = np.stack([train_ul, train_ux, train_ul**2, train_ul*train_ux, train_ux**2], axis=1)
train_cs = np.stack([train_cl, train_cx, train_cl**2, train_cl*train_cx, train_cx**2], axis=1)
train_ts = np.stack([train_tl, train_tx, train_tl**2, train_tl*train_tx, train_tx**2], axis=1)
test_us = np.stack([submission_ul, submission_ux, submission_ul**2, submission_ul*submission_ux, submission_ux**2], axis=1)
test_cs = np.stack([submission_cl, submission_cx, submission_cl**2, submission_cl*submission_cx, submission_cx**2], axis=1)
test_ts = np.stack([submission_tl, submission_tx, submission_tl**2, submission_tl*submission_tx, submission_tx**2], axis=1)
def build_toy():
model=Sequential()
model.add(Dense(5, input_dim=5, activation='tanh'))
model.add(Dense(8, activation='relu'))
model.add(Dense(16, activation=swish))
model.add(Dropout(0.15))
model.add(Dense(32, activation=swish))
model.add(Dropout(0.25))
model.add(Dense(32, activation=swish))
model.add(Dropout(0.25))
model.add(Dense(16, activation=swish))
model.add(Dropout(0.15))
model.add(Dense(8, activation='relu'))
model.add(Dense(1))
# compile model
model.compile(loss='mean_squared_error',
optimizer =optimizers.Adam(learning_rate=1e-3,amsgrad=False))
return model
model = build_toy()
model.summary()
def kFold_TOY(n_splits,train,target,test,seed=421):
# model = []
preds_nn =[]
preds_val = []
val_indx = []
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
X = train
y = target
for i, (train_index, valid_index) in enumerate(kf.split(X, y)):
X_train = X[train_index]
y_train = y.iloc[train_index]
X_valid = X[valid_index]
y_valid = y.iloc[valid_index]
model = build_toy()
rlr = ReduceLROnPlateau(monitor='val_loss',
factor=0.7,
patience=6,
verbose=0,
min_delta=1e-5,
mode='min')
# ckp = ModelCheckpoint(f'./model/model_{i}.h',
# monitor='val_loss',
# verbose=0,
# save_best_only=True,
# save_weights_only=True,
# mode='min')
es = EarlyStopping(monitor='val_loss',
min_delta=1e-5,
patience=20,
mode='min',
baseline=None,
restore_best_weights=True,
verbose=0)
print(f'Computing fold {i}')
model.fit(X_train, y_train,
validation_data=(X_valid, y_valid),
epochs=150,
batch_size=128,
# callbacks=[rlr, ckp, es],
callbacks=[rlr, es],
verbose=1)
# model.load_weights(f'./model/model_{i}.h')
preds = model.predict(test)
preds_nn.append(preds)
valid_pred = model.predict(X_valid)
preds_val.append(valid_pred)
val_indx.append(valid_index)
preds_nn = np.array(preds_nn).reshape(n_splits,len(test))
val_indx = np.concatenate(val_indx)
preds_val = np.concatenate(preds_val).reshape(len(train))
order = np.argsort(val_indx)
pred_train = preds_val[order]
return pred_train, preds_nn
scaler = StandardScaler()
scaler.fit(train_us)
train_x_ = scaler.transform(train_us)
test_x_ = scaler.transform(test_us)
train_uss, preds_us = kFold_TOY(4, train_x_, train_y_u, test_x_,seed=seed_toy0)
scaler.fit(train_cs)
train_x_ = scaler.transform(train_cs)
test_x_ = scaler.transform(test_cs)
train_css, preds_cs = kFold_TOY(4, train_x_, train_y_c, test_x_,seed=seed_toy0)
scaler.fit(train_ts)
train_x_ = scaler.transform(train_ts)
test_x_ = scaler.transform(test_ts)
train_tss, preds_ts = kFold_TOY(4, train_x_, train_y_t, test_x_,seed=seed_toy0)
submission_us = pd.DataFrame(preds_us).mean(axis=0)
submission_cs = pd.DataFrame(preds_cs).mean(axis=0)
submission_ts = pd.DataFrame(preds_ts).mean(axis=0)
Feature generation (2)
df_copy.shape
set_seed(1)
df = df_copy.copy()
dummy240 = np.zeros(240)
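# dummy240 covers the 240 hours (February 1-10) dropped from each training year above;
# zeros are re-inserted so the stacked predictions realign with the full hourly index of df.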
l1 = len_train_2017
l2 = l1 + len_train_2018
l3 = l2 + len_train_2019
utsunomiya_lgb = np.concatenate([train_ul,submission_ul])
utsunomiya_xgb = np.concatenate([train_ux,submission_ux])
utsunomiya_ss = np.concatenate([train_uss,submission_us])
chiba_lgb = np.concatenate([train_cl,submission_cl])
chiba_xgb = np.concatenate([train_cx,submission_cx])
chiba_ss = np.concatenate([train_css,submission_cs])
tokyo_lgb = np.concatenate([train_tl,submission_tl])
tokyo_xgb = np.concatenate([train_tx,submission_tx])
tokyo_ss = np.concatenate([train_tss,submission_ts])
def realign(arr):
    # Re-insert 240 zero rows per training year so the length matches df
    return np.concatenate([dummy240, arr[0:l1],
                           dummy240, arr[l1:l2],
                           dummy240, arr[l2:l3],
                           dummy240, arr[l3:]])
utsunomiya_lgb = realign(utsunomiya_lgb)
utsunomiya_xgb = realign(utsunomiya_xgb)
utsunomiya_ss = realign(utsunomiya_ss)
chiba_lgb = realign(chiba_lgb)
chiba_xgb = realign(chiba_xgb)
chiba_ss = realign(chiba_ss)
tokyo_lgb = realign(tokyo_lgb)
tokyo_xgb = realign(tokyo_xgb)
tokyo_ss = realign(tokyo_ss)
# Add the blend of the three models' outputs (weighted by a, b and c from the settings cell) as explanatory features
df["stack_utsunomiya"] = a*utsunomiya_lgb + b*utsunomiya_xgb + c*utsunomiya_ss
df["stack_chiba"] = a*chiba_lgb + b*chiba_xgb + c*chiba_ss
df["stack_tokyo"] = a*tokyo_lgb + b*tokyo_xgb + c*tokyo_ss
# Feature generation (uses a different feature set from (1))
for name in region_col:
# Zero-precipitation counter
df[f'zero_precipitation_{name}'] = zero_count(df[f'precipitation_{name}'],0.005)
# Wind-velocity vector from wind direction and speed
df[f'sin_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.sin(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
df[f'cos_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.cos(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
df[f'v_sin_{name}'] = df[f'windspeed_{name}']*df[f'sin_{name}']
df[f'v_cos_{name}'] = df[f'windspeed_{name}']*df[f'cos_{name}']
df = df.drop(f'sin_{name}',axis=1)
df = df.drop(f'cos_{name}',axis=1)
df = df.drop(f'windspeed_{name}',axis=1)
df = df.drop(f'winddirection_{name}',axis=1)
# 36-hour rolling maximum precipitation
df[f'precipitation_{name}_max'] = df[f'precipitation_{name}'].rolling(36).max().interpolate(limit_direction='both')
# Inner products with fixed azimuth directions
for az in range(8):
if az != 0 and az !=4:
df[f'Azimuth_{az}_inner_product_{name}'] = (df[f'v_cos_{name}']*math.cos(np.pi*az/8)+df[f'v_sin_{name}']*math.sin(np.pi*az/8)).interpolate(limit_direction='both')
# Inner and cross products with the wind vector one hour earlier
df[f'inner_product_{name}'] = (df[f'v_cos_{name}'].shift(1)*df[f'v_cos_{name}']+df[f'v_sin_{name}'].shift(1)*df[f'v_sin_{name}']).interpolate(limit_direction='both')
df[f'cross_product_{name}'] = (df[f'v_cos_{name}'].shift(1)*df[f'v_sin_{name}']-df[f'v_sin_{name}'].shift(1)*df[f'v_cos_{name}']).interpolate(limit_direction='both')
# Difference from the value one hour earlier
df[f'precipitation_{name}_delta'] = (df[f'precipitation_{name}']-df[f'precipitation_{name}'].shift(1)).interpolate(limit_direction='both')
df[f'temperature_{name}_delta'] = (df[f'temperature_{name}']-df[f'temperature_{name}'].shift(1)).interpolate(limit_direction='both')
df[f'v_sin_{name}_delta'] = (df[f'v_sin_{name}']-df[f'v_sin_{name}'].shift(1)).interpolate(limit_direction='both')
df[f'v_cos_{name}_delta'] = (df[f'v_cos_{name}']-df[f'v_cos_{name}'].shift(1)).interpolate(limit_direction='both')
df[f'inner_product_{name}_delta'] = (df[f'inner_product_{name}']-df[f'inner_product_{name}'].shift(1)).interpolate(limit_direction='both')
df[f'cross_product_{name}_delta'] = (df[f'cross_product_{name}']-df[f'cross_product_{name}'].shift(1)).interpolate(limit_direction='both')
# Difference from the 10-day moving average
df[f'precipitation_{name}_diff'] = (df[f'precipitation_{name}']-df[f'precipitation_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
df[f'temperature_{name}_diff'] = (df[f'temperature_{name}']-df[f'temperature_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
df[f'v_sin_{name}_diff'] = (df[f'v_sin_{name}']-df[f'v_sin_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
df[f'v_cos_{name}_diff'] = (df[f'v_cos_{name}']-df[f'v_cos_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
df[f'inner_product_{name}_diff'] = (df[f'inner_product_{name}']-df[f'inner_product_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
df[f'cross_product_{name}_diff'] = (df[f'cross_product_{name}']-df[f'cross_product_{name}'].rolling(24*10).mean()).interpolate(limit_direction='both')
# Mean difference from the same hour over each of the previous 10 days (24-hour steps)
df[f'precipitation_{name}_delta_24h'] = np.zeros(len(df))
df[f'temperature_{name}_delta_24h'] = np.zeros(len(df))
df[f'v_sin_{name}_delta_24h'] = np.zeros(len(df))
df[f'v_cos_{name}_delta_24h'] = np.zeros(len(df))
df[f'inner_product_{name}_delta_24h'] = np.zeros(len(df))
df[f'cross_product_{name}_delta_24h'] = np.zeros(len(df))
for i in range(1,11):
df[f'precipitation_{name}_delta_24h'] += (df[f'precipitation_{name}']-df[f'precipitation_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'temperature_{name}_delta_24h'] += (df[f'temperature_{name}']-df[f'temperature_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'v_sin_{name}_delta_24h'] += (df[f'v_sin_{name}']-df[f'v_sin_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'v_cos_{name}_delta_24h'] += (df[f'v_cos_{name}']-df[f'v_cos_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'inner_product_{name}_delta_24h'] += (df[f'inner_product_{name}']-df[f'inner_product_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'cross_product_{name}_delta_24h'] += (df[f'cross_product_{name}']-df[f'cross_product_{name}'].shift(24*i)).interpolate(limit_direction='both')
df[f'precipitation_{name}_delta_24h'] /= 10
df[f'temperature_{name}_delta_24h'] /= 10
df[f'v_sin_{name}_delta_24h'] /= 10
df[f'v_cos_{name}_delta_24h'] /= 10
df[f'inner_product_{name}_delta_24h'] /= 10
df[f'cross_product_{name}_delta_24h'] /= 10
Additional features and target extraction (2)
# Additional features and target extraction (2)
for name in region_col:
df_3 = pd.DataFrame()
for year in [2017,2018,2019,2020]:
file_name = f"model_{name}_{year}.csv"
df_3 = pd.concat([df_3,pd.read_csv(file_name,header=None)])
file_name = f"model_{name}.csv"
df_3.to_csv(file_name,index=False)
print(df_3.shape)
df_3 = df_3.reset_index()
# Add the model pollen amount and the counts since the dispersal start dates as features
df[f"pollen_model_{name}"] = df_3[1]
df[f"count_sugi_{name}"] = df_3[2]
df[f"count_hinoki_{name}"] = df_3[3]
# Target variable
df_3 = ymin_log(np.log1p(np.log1p(df_3[0]/df_3[1])))
file_name = f"model_{name}_target.csv"
df_3.to_csv(file_name,index=False)
Preprocessing (2) (features)
# Split the feature data into train and test
train=df[0:len_train]
test=df[len_train:]
# Drop unneeded columns
d_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo', 'year', 'month', 'day']
month = train[['year','month','day']]
train = train[~((train['month']==2)&(train['day']<11))].reset_index(drop=True)
# display(train)
train_x = train.drop(columns=d_col)
test_x = test.drop(columns=d_col)
# One-hot encode the hour column
train_x['hour'] = train_x['hour'].astype('str')
test_x['hour'] = test_x['hour'].astype('str')
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
print(train_x.shape)
print(test_x.shape)
# Restrict the feature set to each region
train_x_u = train_x
test_x_u = test_x
for col in train_x.columns:
if 'utsunomiya' not in col:
train_x_u = train_x_u.drop(col,axis=1)
test_x_u = test_x_u.drop(col,axis=1)
train_x_c = train_x
test_x_c = test_x
for col in train_x.columns:
if 'chiba' not in col:
train_x_c = train_x_c.drop(col,axis=1)
test_x_c = test_x_c.drop(col,axis=1)
train_x_t = train_x
test_x_t = test_x
for col in train_x.columns:
if 'tokyo' not in col:
train_x_t = train_x_t.drop(col,axis=1)
test_x_t = test_x_t.drop(col,axis=1)
# Target variables for training
train_y_u = pd.read_csv("model_utsunomiya_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_c = pd.read_csv("model_chiba_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_t = pd.read_csv("model_tokyo_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_u = pd.concat([month,train_y_u],axis=1)
train_y_c = pd.concat([month,train_y_c],axis=1)
train_y_t = pd.concat([month,train_y_t],axis=1)
train_y_u = train_y_u[~((train_y_u['month']==2)&(train_y_u['day']<11))].reset_index(drop=True)
train_y_c = train_y_c[~((train_y_c['month']==2)&(train_y_c['day']<11))].reset_index(drop=True)
train_y_t = train_y_t[~((train_y_t['month']==2)&(train_y_t['day']<11))].reset_index(drop=True)
len_train_2017 = train_y_u[train_y_u['year']==2017].shape[0]
len_train_2018 = train_y_u[train_y_u['year']==2018].shape[0]
len_train_2019 = train_y_u[train_y_u['year']==2019].shape[0]
len_train_2020 = train_y_u[train_y_u['year']==2020].shape[0]
train_y_u = train_y_u.drop(['year','month','day'],axis=1)
train_y_c = train_y_c.drop(['year','month','day'],axis=1)
train_y_t = train_y_t.drop(['year','month','day'],axis=1)
print(len_train_2017)
print(len_train_2018)
print(len_train_2019)
print(len_train_2020)
# Store the mathematical-model values for the test period
test_u = pd.read_csv("model_utsunomiya.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_c = pd.read_csv("model_chiba.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_t = pd.read_csv("model_tokyo.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
# Clip target outliers
threshold_h = train_y_u[0].quantile(THR_H)
threshold_l = train_y_u[0].quantile(THR_L)
train_y_u = train_y_u[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_c[0].quantile(THR_H)
threshold_l = train_y_c[0].quantile(THR_L)
train_y_c = train_y_c[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_t[0].quantile(THR_H)
threshold_l = train_y_t[0].quantile(THR_L)
train_y_t = train_y_t[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
Cross-validation with LightGBM (2)
train_ul,preds_ul = kFold_LightGBM(4, train_x_u, train_y_u, test_x_u,seed=seed_lgb1)
train_cl,preds_cl = kFold_LightGBM(4, train_x_c, train_y_c, test_x_c,seed=seed_lgb1)
train_tl,preds_tl = kFold_LightGBM(4, train_x_t, train_y_t, test_x_t,seed=seed_lgb1)
Prediction results (2)
submission_ul = pd.DataFrame(preds_ul).mean(axis=0)
submission_cl = pd.DataFrame(preds_cl).mean(axis=0)
submission_tl = pd.DataFrame(preds_tl).mean(axis=0)
Round to the minimum unit and store in the submission file (2)
submission_u = np.expm1(np.expm1(ymin_exp(submission_ul)))
submission_c = np.expm1(np.expm1(ymin_exp(submission_cl)))
submission_t = np.expm1(np.expm1(ymin_exp(submission_tl)))
m_unit = 4
submission = pd.read_csv('./data/sample_submission.csv')
submission['pollen_utsunomiya'] = m_unit*(submission_u*test_u/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission['pollen_chiba'] = m_unit*(submission_c*test_c/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission['pollen_tokyo'] = m_unit*(submission_t*test_t/m_unit).round(0).astype(int).apply(lambda x: x if x>0 else 0)
submission
submission.to_csv(f"./submit/submit_stck_{seed_lgb0:03}{seed_xgb0:03}{seed_toy0:03}{seed_lgb1:03}{a}{b}{c}.csv", index=False, header=True)
Discussion
The crux of this solution is fitting a mathematical model and setting
(target variable) = (airborne pollen count) / (mathematical model).
The model itself is
(mathematical model) = (annual bias) + (sugi pollen) + (hinoki pollen),
where each pollen term is assumed to follow a gamma-function-shaped curve.
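Concretely, the curve fitted by CALC above is
y(t) = Y_0 + S_0*((t - S_3)/24)^S_1 * exp(-S_2*(t - S_3)/24) + H_0*((t - H_3)/24)^H_1 * exp(-H_2*(t - H_3)/24)
where t is the hour index within the year, and the sugi and hinoki terms are zero before their respective start hours S_3 and H_3.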
Red: mathematical model
Gray: sugi (cedar; first dispersal in early February)
Yellow: hinoki (cypress; first dispersal in mid-March)
The mathematical model could have been refined further, for example by incorporating the burst of pollen that occurs after a prolonged spell of rain.