🐒

ProbSpace pollen counts (PVS 2nd-Code)

2023/01/10 に公開 ・ 約52,500字 ・ 1件のコメント

https://comp.probspace.com/competitions/pollen_counts

ProbSpace 花粉飛散予測_MODEL_STACK

作成者:saru_da_mon(AI-FOX)

本コードの使用権利

MIT License に準拠する。

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

コードの概要

※数理モデルについてはコメント欄
https://zenn.dev/link/comments/08176f35774cfc

ここからコード

ライブラリインポート

import numpy as np
import pandas as pd
import math
from datetime import datetime,date,timedelta
from matplotlib import pyplot as plt
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import StandardScaler
from scipy import signal

import os
import random

from keras import backend as K
# Custom activation function
def swish(x):
    """Swish activation: sigmoid-gated identity, i.e. x * sigmoid(x)."""
    gate = K.sigmoid(x)
    return gate * x
    
# Silence non-essential warnings during the long training runs.
import warnings
warnings.simplefilter('ignore')

# Display settings for interactive inspection of wide frames.
pd.set_option("display.max_columns",600)
pd.set_option("display.max_rows",600)
plt.style.use('ggplot')
# IPython magic: only valid when executed as a notebook cell, not as a plain .py file.
%matplotlib inline

設定

# Ensemble MIX parameters: seeds for each base learner and (presumably)
# the blend weights a/b/c used further down the stack -- verify at use site.
seed_lgb0 = 747
seed_xgb0 = 818
seed_toy0 = 911
seed_lgb1 = 990
a = 1
b = 1
c = 1
# Quantile thresholds for clipping target outliers.
THR_H = 0.9999
THR_L = 0.0001
# Small offset so the log transform is defined at zero.
YMIN = 3e-2

def ymin_log(x):
    """Offset log transform: log(x + YMIN)."""
    return np.log(YMIN + x)

def ymin_exp(x):
    """Inverse of ymin_log: exp(x) - YMIN."""
    return np.exp(x) - YMIN

乱数の固定

#!pip install tensorflow-determinism

def set_seed(seed):
    """Pin every RNG source this script relies on.

    Seeds TensorFlow, NumPy and Python's `random`, sets the hash seed,
    and requests deterministic TF ops via TF_DETERMINISTIC_OPS.
    NOTE(review): assigning PYTHONHASHSEED here only affects child
    processes; the current interpreter's hash seed was fixed at startup.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(0)

データ読込み

# Load competition data (hourly weather + pollen counts for 3 stations).
train = pd.read_csv('./data/train_v2.csv')
test = pd.read_csv('./data/test_v2.csv')
len_train = train.shape[0]
print(train.shape)
print(test.shape)
# Extract year/month/day/hour from the 'datetime' column.
# The raw value's last two digits are the hour; the rest is the date.
df = pd.concat([train, test]).reset_index(drop=True)
df['time'] = pd.to_datetime(df.datetime.astype(str).str[:-2])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day
df['hour'] = df.datetime.astype(str).str[-2:].astype(int)
df.head(3)
df.columns
# Keep only the columns used downstream; this drops 'datetime' and 'time'.
col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo', 
       'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo', 
       'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo', 
       'windspeed_utsunomiya', 'windspeed_chiba','windspeed_tokyo', 
       'pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo',
       'year', 'month', 'day', 'hour']
df = df[col]
df
# Fill in values recorded as missing.
# BUGFIX: the forward-fill line was indented with a TAB in the original,
# which raises TabError in Python 3; re-indented to sit inside the `if`.
for name in df.columns:
    if 'pollen' in name:
        # Map the '欠測' ("missing") marker to NaN first, so the numeric
        # comparison below never sees that string.
        df[name] = df[name].replace({'欠測':np.nan})
        # Sentinel values below -9000 also mean "missing".
        df[name] = df[name].apply(lambda x: np.nan if x < -9000 else x)
        # Forward-fill missing hours.  The original called
        # interpolate('ffill', limit_direction='forward'), i.e. a pad fill;
        # .ffill() is the unambiguous equivalent.
        df[name] = df[name].ffill()
    # Columns that arrived as strings become floats once cleaned.
    if df[name].dtype == 'object':
        df[name] = df[name].astype('float')
        print(name, df[name].dtype)
df_copy = df.copy()
df_copy.shape

飛来花粉量 数理モデル定義

def CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3):
    """Fitting loss of the analytic pollen-release model.

    Builds an hourly curve  Y_0 + sugi_pulse + hinoki_pulse , where each
    pulse has the form  A * ((t - t0)/24)**p * exp(-q * (t - t0)/24)
    (t in hours, t0 = onset S_3 / H_3), and returns the summed absolute
    deviation from the Hilbert envelope of the observed series.

    NOTE(review): reads the module-level single-row DataFrame `df_1`
    (the current region/year pollen series) instead of taking it as a
    parameter -- callers must set df_1 first.
    """
    df_pollen = []
    for i in range(len(df_1.T)):  # i = hour index into the series
        if (i-S_3) > 0:
            # Sugi (cedar) pulse; time is measured in days after onset S_3.
            y_sugi = S_0*((i-S_3)/24)**S_1 * np.exp(-S_2*(i-S_3)/24)
            if (i-H_3) > 0:
                # Hinoki (cypress) pulse only contributes after its onset H_3.
                y_hinoki = H_0*((i-H_3)/24)**H_1 * np.exp(-H_2*(i-H_3)/24)
            else:
                y_hinoki = 0
        else:
            y_sugi = 0
            y_hinoki = 0
        y = Y_0 + y_sugi + y_hinoki
        df_pollen.append(y)

    # Envelope of the first 59 days of observations via the Hilbert transform.
    df_h = np.abs(signal.hilbert(np.array(df_1.T)[:int(59*24)]))
    sum_abs = 0
    # Accumulate absolute model/envelope mismatch where the envelope is positive.
    for i in range(len(df_h)):
        if df_h[i] > 0:
            sum_abs += abs(df_pollen[i]-df_h[i])    
    return sum_abs #math.sqrt(sum_abs)

def objective(trial):
    """Optuna objective: pulse-model loss for the current series.

    S_1/S_2/H_1/H_2 are suggested with low == high, i.e. frozen at the
    module-level constants SP_1/SP_2 while still being recorded in the
    study; only S_0, S_3, H_0 and H_3 are actually searched.  H_0's range
    is tied to S_0 through the P_b/P_t ratio bounds.
    NOTE(review): the order of the suggest_* calls affects the seeded
    sampler's draw sequence -- do not reorder.
    """
    S_0 = trial.suggest_float('S_0', 0.0, 0.001)
    S_1 = trial.suggest_float('S_1', SP_1, SP_1)
    S_2 = trial.suggest_float('S_2', SP_2, SP_2)
    S_3 = trial.suggest_int('S_3', 180, 300)
    H_0 = trial.suggest_float('H_0', S_0*P_b, S_0*P_t)
    H_1 = trial.suggest_float('H_1', S_1, S_1)
    H_2 = trial.suggest_float('H_2', S_2, S_2)
    H_3 = trial.suggest_int('H_3', 840, 1080)
    return CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3)

数理モデルフィッティング

# Fit the analytic pulse model per region and per year with Optuna.
region_col = ['utsunomiya','chiba','tokyo']
df_0 = pd.DataFrame()

for name in region_col:

    # Region-specific model parameters (module-level globals read by objective()).
    if name == 'utsunomiya':
        Y_0 = df[:len_train].pollen_utsunomiya.quantile(0.5) # bias: median observed level
        SP_1 = 5.9   # pulse shape exponent (frozen during the search)
        SP_2 = 0.17  # pulse decay rate (frozen during the search)
        P_t = 0.70  # upper bound of hinoki amplitude relative to sugi
        P_b = 0.50  # lower bound of hinoki amplitude relative to sugi
    elif name == 'chiba':
        Y_0 = df[:len_train].pollen_chiba.quantile(0.5)
        SP_1 = 5.9
        SP_2 = 0.17
        P_t = 0.20
        P_b = 0.00
    elif name == 'tokyo':
        Y_0 = df[:len_train].pollen_tokyo.quantile(0.5)
        SP_1 = 5.9
        SP_2 = 0.17
        P_t = 0.20
        P_b = 0.00

    for year in [2017,2018,2019,2020]:
        # Round-trip one year of one region through a CSV so that df_1
        # becomes a single-row frame of hourly values with NaNs dropped.
        df_0 = df[df["year"]==year]
        file_name = f"model_{name}_{year}.csv"
        df_0[f"pollen_{name}"].to_csv(file_name,index=False)
        df_1 = pd.read_csv(file_name)
        df_1 = pd.DataFrame(np.array(df_1).reshape(1,df_1.size)).dropna(axis=1)
        # BUGFIX: the original line was
        #     df_1[:0] = df_1[:0].apply(lambda : x if x > 0 else 0)
        # -- a SyntaxError (lambda missing its parameter) applied to an
        # empty row slice.  The evident intent is to clamp negative
        # pollen counts to zero.
        df_1 = df_1.clip(lower=0)

        # Minimize the model/observation mismatch (CALC reads df_1 globally).
        optuna.logging.disable_default_handler()
        study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=51))
        study.optimize(objective, n_trials=1000)

        S_0 = study.best_params['S_0']
        S_1 = study.best_params['S_1']
        S_2 = study.best_params['S_2']
        S_3 = study.best_params['S_3']
        H_0 = study.best_params['H_0']
        H_1 = study.best_params['H_1']
        H_2 = study.best_params['H_2']
        H_3 = study.best_params['H_3']

        # Report the fitted parameters and the final loss.
        print('Y_0 = ',f'{Y_0:.7f}')
        print('S_0 = ',f'{S_0:.7f}')
        print('S_1 = ',f'{S_1:.7f}')
        print('S_2 = ',f'{S_2:.7f}')
        print('S_3 = ',f'{S_3:.7f}')
        print('H_0 = ',f'{H_0:.7f}')
        print('H_1 = ',f'{H_1:.7f}')
        print('H_2 = ',f'{H_2:.7f}')
        print('H_3 = ',f'{H_3:.7f}')

        print('CAL = ',CALC(Y_0,S_0,S_1,S_2,S_3,H_0,H_1,H_2,H_3))

        # Rebuild the fitted curve, storing per hour:
        # [model value, hours since sugi onset, hours since hinoki onset].
        df_pollen = []
        for i in range(len(df_1.T)):
            if (i-S_3) > 0:
                y_sugi = S_0*((i-S_3)/24)**S_1 * np.exp(-S_2*(i-S_3)/24)
                if (i-H_3) > 0:
                    y_hinoki = H_0*((i-H_3)/24)**H_1 * np.exp(-H_2*(i-H_3)/24)
                else:
                    y_hinoki = 0
            else:
                y_sugi = 0
                y_hinoki = 0
            y = Y_0 + y_sugi + y_hinoki
            df_pollen.append(y)
            df_pollen.append(i-S_3)
            df_pollen.append(i-H_3)

        df_pollen = np.array(df_pollen).reshape(int(len(df_pollen)/3),3)
        # Overwrite the per-year file: observed series plus the 3 model columns.
        (pd.concat([df_1.T, pd.DataFrame(df_pollen)],axis=1)).to_csv(file_name,index=False,header=None)

        # Visual sanity check: model curve vs. observed pollen.
        plt.rcParams["font.size"] = 16
        plt.figure(figsize=[18,10])
        plt.plot(np.array(df_pollen), marker="+", label='model')
        plt.plot(np.array(df_1.T), marker="+", label='pollen')
        plt.ticklabel_format(style='plain',axis='y')

        plt.title(file_name)

        # Axis labels and limits.
        plt.ylabel('Pollen')
        plt.xlabel('hours')
        plt.xlim(0,3000)
        plt.ylim(0,1600)
        # Legend.
        plt.legend()

        plt.show()

特徴量生成①

# Counter of consecutive (near-)zero-precipitation hours.
def zero_count(df,alpha=0.005):
    """Length of the current dry spell at each step, squashed by tanh.

    Parameters
    ----------
    df : sequence of hourly precipitation values (Series, array or list).
    alpha : scale applied before the tanh squashing.

    Returns
    -------
    np.ndarray with tanh(alpha * consecutive_hours_below_0.5); any value
    >= 0.5 (i.e. recorded rain) resets the counter to zero.
    """
    counts = []
    streak = 0
    # Iterate positionally over the values (robust to non-default indexes,
    # unlike the original df[i] lookup).
    for value in np.asarray(df):
        streak = streak + 1 if value < 0.5 else 0
        counts.append(streak)
    return np.tanh(np.array(counts) * alpha)
# Feature generation (1).
# The original wrote every shift/delta/mean/diff column out by hand
# (~170 near-identical lines); the loops below create the exact same
# columns in the exact same order.
for name in region_col:

    # Consecutive dry-hour counter (tanh-squashed).
    df[f'zero_precipitation_{name}'] = zero_count(df[f'precipitation_{name}'],0.005)

    # Wind vector: direction (1..16 compass points) to sin/cos, scaled by speed.
    # Direction values <= 0.1 are treated as "no wind" and mapped to 0.
    df[f'sin_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.sin(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
    df[f'cos_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.cos(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
    df[f'v_sin_{name}'] = df[f'windspeed_{name}']*df[f'sin_{name}']
    df[f'v_cos_{name}'] = df[f'windspeed_{name}']*df[f'cos_{name}']

    # Inner product of the wind vector with fixed azimuth directions
    # (skipping az 0 and 4), each with 1..8-hour lags.
    for az in range(8):
        if az != 0 and az != 4:
            base = f'Azimuth_{az}_inner_product_{name}'
            df[base] = (df[f'v_cos_{name}']*math.cos(np.pi*az/8)+df[f'v_sin_{name}']*math.sin(np.pi*az/8)).interpolate(limit_direction='both')
            for s in range(1, 9):
                df[f'{base}_shift{s}'] = df[base].shift(s).interpolate(limit_direction='both')

    # The four base weather series every derived feature is built from.
    base_cols = [f'precipitation_{name}', f'temperature_{name}',
                 f'v_sin_{name}', f'v_cos_{name}']

    # 1..8-hour lags.
    for s in range(1, 9):
        for v in base_cols:
            df[f'{v}_shift{s}'] = df[v].shift(s).interpolate(limit_direction='both')

    # One-hour differences, plus their 1..8-hour lags.
    for v in base_cols:
        df[f'{v}_delta'] = (df[v]-df[f'{v}_shift1']).interpolate(limit_direction='both')
    for s in range(1, 9):
        for v in base_cols:
            df[f'{v}_delta{s}'] = df[f'{v}_delta'].shift(s).interpolate(limit_direction='both')

    # Rolling means over 1, 3, 5 and 10 days.
    for window, suffix in [(24, '_mean'), (3*24, '_mean3'), (5*24, '_mean5'), (10*24, '_mean10')]:
        for v in base_cols:
            df[f'{v}{suffix}'] = df[v].rolling(window).mean().interpolate(limit_direction='both')

    # Deviation from the 10-day mean, plus its 1..8-hour lags.
    for v in base_cols:
        df[f'{v}_diff'] = (df[v]-df[f'{v}_mean10']).interpolate(limit_direction='both')
    for s in range(1, 9):
        for v in base_cols:
            df[f'{v}_diff{s}'] = df[f'{v}_diff'].shift(s).interpolate(limit_direction='both')

    # Drop the raw wind columns now that the derived features exist.
    df = df.drop(f'sin_{name}',axis=1)
    df = df.drop(f'cos_{name}',axis=1)
    df = df.drop(f'windspeed_{name}',axis=1)
    df = df.drop(f'winddirection_{name}',axis=1)

追加特徴量と目的変数取り出し①

# Additional features and training targets from the fitted pulse model (1).
for name in region_col:
    df_3 = pd.DataFrame()
    for year in [2017,2018,2019,2020]:
        file_name = f"model_{name}_{year}.csv"
        # Per-year files were written headerless: col 0 = observed pollen,
        # col 1 = model value, cols 2/3 = hours since sugi/hinoki onset.
        df_3 = pd.concat([df_3,pd.read_csv(file_name,header=None)])
    file_name = f"model_{name}.csv"
    # NOTE(review): written with index=False but a header row; readers below
    # re-read it with header=None and skip that row by slicing.
    df_3.to_csv(file_name,index=False)
    df_3 = df_3.reset_index()
    # Add the model pollen level and the onset counters as features.
    df[f"pollen_model_{name}"] = df_3[1]
    df[f"count_sugi_{name}"] = df_3[2]
    df[f"count_hinoki_{name}"] = df_3[3]
    # Regression target: observed/model ratio, doubly log1p-damped, then the
    # offset-log transform ymin_log.
    df_3 = ymin_log(np.log1p(np.log1p(df_3[0]/df_3[1])))
    file_name = f"model_{name}_target.csv"
    df_3.to_csv(file_name,index=False)

前処理①

# Split the engineered features back into train / test by row count.
train=df[0:len_train]
test=df[len_train:]

# Columns excluded from the features (targets and raw date parts).
d_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo', 'year', 'month', 'day']
# Keep the date parts aside: the same Feb 1-10 row filter must be applied
# to the targets later.
month = train[['year','month','day']]
# Drop the first ten days of February (pre-season) from the training rows.
train = train[~((train['month']==2)&(train['day']<11))].reset_index(drop=True)
# display(train)
train_x = train.drop(columns=d_col)
test_x = test.drop(columns=d_col)

# One-hot encode the hour column.
# NOTE(review): train and test are dummied independently -- only safe while
# both contain the same set of hour values.
train_x['hour'] = train_x['hour'].astype('str')
test_x['hour'] = test_x['hour'].astype('str')
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

print(train_x.shape)
print(test_x.shape)

# Restrict each regional model's features to the columns mentioning that
# region.  The original dropped columns one at a time in a loop (one new
# DataFrame per drop); a single drop of the whole list is equivalent and
# far cheaper.  (The original's commented-out variants also kept a second
# region, e.g. chiba+tokyo; add a second keyword test to reproduce that.)
def _region_view(frame, keyword):
    """Drop every column listed in train_x.columns that lacks *keyword*."""
    # Filtering by train_x.columns (not frame's own) mirrors the original,
    # including its KeyError if such a column is missing from frame.
    return frame.drop(columns=[c for c in train_x.columns if keyword not in c])

train_x_u = _region_view(train_x, 'utsunomiya')
test_x_u = _region_view(test_x, 'utsunomiya')

train_x_c = _region_view(train_x, 'chiba')
test_x_c = _region_view(test_x, 'chiba')

train_x_t = _region_view(train_x, 'tokyo')
test_x_t = _region_view(test_x, 'tokyo')

# Training targets (written earlier by the model-feature step).
# The target CSVs were saved with a header row but are re-read with
# header=None, so row 0 holds the header text; [1:len_train+1] skips it
# and keeps exactly the training span.
train_y_u = pd.read_csv("model_utsunomiya_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_c = pd.read_csv("model_chiba_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_t = pd.read_csv("model_tokyo_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
# Attach the date parts so the same Feb 1-10 filter as the features applies.
train_y_u = pd.concat([month,train_y_u],axis=1)
train_y_c = pd.concat([month,train_y_c],axis=1)
train_y_t = pd.concat([month,train_y_t],axis=1)
train_y_u = train_y_u[~((train_y_u['month']==2)&(train_y_u['day']<11))].reset_index(drop=True)
train_y_c = train_y_c[~((train_y_c['month']==2)&(train_y_c['day']<11))].reset_index(drop=True)
train_y_t = train_y_t[~((train_y_t['month']==2)&(train_y_t['day']<11))].reset_index(drop=True)

# Rows per training year (after the February filter).
len_train_2017 = train_y_u[train_y_u['year']==2017].shape[0]
len_train_2018 = train_y_u[train_y_u['year']==2018].shape[0]
len_train_2019 = train_y_u[train_y_u['year']==2019].shape[0]
len_train_2020 = train_y_u[train_y_u['year']==2020].shape[0]

# Drop the helper date columns; only the target values remain.
train_y_u = train_y_u.drop(['year','month','day'],axis=1)
train_y_c = train_y_c.drop(['year','month','day'],axis=1)
train_y_t = train_y_t.drop(['year','month','day'],axis=1)

print(len_train_2017)
print(len_train_2018)
print(len_train_2019)
print(len_train_2020)

# Analytic-model values over the test span (column 1 of model_<region>.csv;
# the +1 offset again skips the header row that was re-read as data).
test_u = pd.read_csv("model_utsunomiya.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_c = pd.read_csv("model_chiba.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_t = pd.read_csv("model_tokyo.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)

# Clip target outliers to the THR_L / THR_H quantiles, per region.
# Each train_y_* collapses from a one-column frame to a Series here.
threshold_h = train_y_u[0].quantile(THR_H)
threshold_l = train_y_u[0].quantile(THR_L)
train_y_u = train_y_u[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_c[0].quantile(THR_H)
threshold_l = train_y_c[0].quantile(THR_L)
train_y_c = train_y_c[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))
threshold_h = train_y_t[0].quantile(THR_H)
threshold_l = train_y_t[0].quantile(THR_L)
train_y_t = train_y_t[0].apply(lambda x: threshold_h if x > threshold_h else (threshold_l if x < threshold_l else x))

LightGBM 交差検証定義

def kFold_LightGBM(n_splits,train,target,test,seed=23):
    """K-fold cross-validation with LightGBM.

    Parameters
    ----------
    n_splits : number of folds.
    train, target : training features / target (pandas objects).
    test : test features, predicted by every fold's model.
    seed : KFold shuffle seed.

    Returns
    -------
    pred_train : out-of-fold predictions re-ordered to `train`'s original
        row order (used downstream as a stacking feature).
    preds_lgb : list with one test-set prediction array per fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    X = train
    y = target
    # Per-fold results.
    preds_lgb = []
    preds_val = []
    val_indx = []
    
    for i, (train_index, valid_index) in enumerate(kf.split(X, y)): 

        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_valid = X.iloc[valid_index]
        y_valid = y.iloc[valid_index]

        lgb_train = lgb.Dataset(X_train.values, y_train)
        lgb_valid = lgb.Dataset(X_valid.values, y_valid)
        
        # Training parameters.
        # NOTE(review): 'num_iterations' in the params takes precedence over
        # num_boost_round=9999 below; early stopping decides the effective
        # round count.  verbose_eval/early_stopping_rounds as train() kwargs
        # require LightGBM < 4.0 -- confirm the installed version.
        lgb_params = {
            'task': 'train',              # training task
            'boosting_type': 'gbdt',      # one of: gbdt, goss, dart
            'objective': 'regression',    # regression
            'metric': 'rmse',             # evaluation metric
#             'lambda_l1': 0.10,
            'lambda_l2': 0.35,
            'num_iterations': 9000,
            'max_depth': -1,
            'learning_rate': 0.05,
#             'extra_trees': True,
        }
        model = lgb.train(lgb_params,
                          lgb_train, 
                          valid_sets=lgb_valid,
                          num_boost_round=9999,
                          verbose_eval=200,
                          early_stopping_rounds=30)
        
        # Fold score on the held-out split (MAE, for reporting only).
        valid_pred = model.predict(X_valid)
        score = MAE(y_valid,valid_pred)

        # Test-set prediction for this fold at the best iteration.
        test_pred = model.predict(test, num_iteration=model.best_iteration)

        # Store fold results.
        preds_val.append(valid_pred)
        val_indx.append(valid_index)
        preds_lgb.append(test_pred)
        print(f'Fold{i}:', score)
        
        # Plot feature importances.
        lgb.plot_importance(model, figsize=(12, 8))
        plt.show()
        # Tabular feature importances (display() is the IPython builtin).
        importance = pd.DataFrame(model.feature_importance(), index=X_train.columns, columns=['importance'])
        display(importance.sort_values(by='importance',ascending=False))
        
    # Re-assemble out-of-fold predictions in the original row order.
    val_indx = np.concatenate(val_indx)
    preds_val = np.concatenate(preds_val)
    order = np.argsort(val_indx)
    pred_train = preds_val[order]
    
    return pred_train, preds_lgb

XGBOOST 交差検証定義

def kFold_xgboost(n_splits,train,target,test,seed=87):
    """Run shuffled KFold cross-validation with XGBoost regressors.

    Parameters
    ----------
    n_splits : int
        Number of CV folds.
    train : pd.DataFrame
        Training features (indexed with .iloc, so a pandas object is expected).
    target : pd.Series
        Training target (already transformed upstream).
    test : pd.DataFrame
        Test features; predicted by every fold's model.
    seed : int
        Random state for the KFold shuffle.

    Returns
    -------
    pred_train : np.ndarray
        Out-of-fold predictions restored to the original row order.
    preds_xgb : list[np.ndarray]
        One test-set prediction array per fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    X = train
    y = target
    # Per-fold result buffers
    preds_xgb =[]
    preds_val = []
    val_indx = []

    for i, (train_index, valid_index) in enumerate(kf.split(X, y)):

        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_valid = X.iloc[valid_index]
        y_valid = y.iloc[valid_index]

        xgb_train = xgb.DMatrix(X_train.values, label=y_train)
        xgb_valid = xgb.DMatrix(X_valid.values, label=y_valid)
        xgb_test = xgb.DMatrix(test.values)

        # Parameters.
        # FIX: 'reg:linear' is a deprecated alias of 'reg:squarederror'
        # (same squared-error objective, but warns on modern XGBoost).
        xgb_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'max_depth': 16,
#             'alpha': 0.10,
            'lambda': 0.35,
            'eta': 0.1,
        }

        # Early stopping monitors the last entry ('eval' = validation fold)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        model = xgb.train(xgb_params,
                          xgb_train,
                          verbose_eval=200,       # log every 200 iterations
                          num_boost_round=2000,   # max iterations
                          early_stopping_rounds=30,
                          evals=evals)

        # FIX: predict with only the trees up to the best early-stopping
        # iteration, mirroring the LightGBM CV helper (which passes
        # num_iteration=model.best_iteration); Booster.predict otherwise
        # uses all trained trees.
        best_range = (0, model.best_iteration + 1)

        # Evaluate on the held-out fold
        valid_pred = model.predict(xgb_valid, iteration_range=best_range)
        score = MAE(y_valid, valid_pred)

        # Predict the test set
        test_pred = model.predict(xgb_test, iteration_range=best_range)

        # Store this fold's results
        preds_val.append(valid_pred)
        val_indx.append(valid_index)
        preds_xgb.append(test_pred)
        print(f'Fold{i}:', score)

    # Re-assemble out-of-fold predictions in the original row order
    val_indx = np.concatenate(val_indx)
    preds_val = np.concatenate(preds_val)
    order = np.argsort(val_indx)
    pred_train = preds_val[order]

    return pred_train, preds_xgb

lgb で交差検証①

# LightGBM CV ① — one run per region (u: Utsunomiya, c: Chiba, t: Tokyo).
# Each call returns out-of-fold train predictions and per-fold test predictions.
train_ul,preds_ul = kFold_LightGBM(4, train_x_u, train_y_u, test_x_u,seed=seed_lgb0)
train_cl,preds_cl = kFold_LightGBM(4, train_x_c, train_y_c, test_x_c,seed=seed_lgb0)
train_tl,preds_tl = kFold_LightGBM(4, train_x_t, train_y_t, test_x_t,seed=seed_lgb0)

xgb で交差検証①

# XGBoost CV ① — same data splits per region as the LightGBM runs,
# but with the XGBoost-specific seed.
train_ux,preds_ux = kFold_xgboost(4, train_x_u, train_y_u, test_x_u,seed=seed_xgb0)
train_cx,preds_cx = kFold_xgboost(4, train_x_c, train_y_c, test_x_c,seed=seed_xgb0)
train_tx,preds_tx = kFold_xgboost(4, train_x_t, train_y_t, test_x_t,seed=seed_xgb0)

予測結果①

# Average the per-fold test predictions (DataFrame rows = folds) into a
# single prediction Series per region and model.
submission_ul = pd.DataFrame(preds_ul).mean(axis=0)
submission_cl = pd.DataFrame(preds_cl).mean(axis=0)
submission_tl = pd.DataFrame(preds_tl).mean(axis=0)
submission_ux = pd.DataFrame(preds_ux).mean(axis=0)
submission_cx = pd.DataFrame(preds_cx).mean(axis=0)
submission_tx = pd.DataFrame(preds_tx).mean(axis=0)

アンサンブル平均を取ってから指数変換①

# Ensemble LightGBM and XGBoost by simple averaging, then invert the
# target transform: ymin_exp (project helper defined elsewhere) followed
# by a double expm1, undoing the double log1p applied when the target
# was built.
submission_u = np.expm1(np.expm1(ymin_exp((submission_ul + submission_ux)/2)))
submission_c = np.expm1(np.expm1(ymin_exp((submission_cl + submission_cx)/2)))
submission_t = np.expm1(np.expm1(ymin_exp((submission_tl + submission_tx)/2)))

最小単位を指定して提出ファイルに格納①

# Scale by the mathematical-model values (test_u/c/t), quantize to the
# reporting unit m_unit and clamp negatives to zero before writing the
# submission file ①.
m_unit = 4
submission = pd.read_csv('./data/sample_submission.csv')
# Round to the nearest multiple of m_unit; .clip(lower=0) replaces the
# original per-element `apply(lambda x: x if x>0 else 0)` — identical
# result, vectorized.
submission['pollen_utsunomiya'] = m_unit*(submission_u*test_u/m_unit).round(0).astype(int).clip(lower=0)
submission['pollen_chiba'] = m_unit*(submission_c*test_c/m_unit).round(0).astype(int).clip(lower=0)
submission['pollen_tokyo'] = m_unit*(submission_t*test_t/m_unit).round(0).astype(int).clip(lower=0)
submission
submission.to_csv(f"./submit/submit_emsemble_{seed_lgb0:03}{seed_xgb0:03}.csv", index=False, header=True)

スタッキング

# Stacking meta-features: for each region build a (N, 5) matrix whose
# columns are [lgb, xgb, lgb^2, lgb*xgb, xgb^2] per sample.
# BUG FIX: the original used np.concatenate([...]).reshape(N, 5), which
# lays the five length-N source arrays end to end and then slices the
# 5N-vector row-wise — scrambling the features instead of placing one
# model output per column. np.stack(..., axis=1) builds the intended
# layout. (It also drops the copy-pasted len(train_ul) used for the
# chiba/tokyo matrices.)
def _meta_features(p1, p2):
    """Return the (len(p1), 5) second-level feature matrix for two model outputs."""
    return np.stack([p1, p2, p1**2, p1*p2, p2**2], axis=1)

train_us = _meta_features(train_ul, train_ux)
train_cs = _meta_features(train_cl, train_cx)
train_ts = _meta_features(train_tl, train_tx)
test_us = _meta_features(submission_ul, submission_ux)
test_cs = _meta_features(submission_cl, submission_cx)
test_ts = _meta_features(submission_tl, submission_tx)
def build_toy():
    """Build the small dense stacking network (5 meta-features -> 1 value).

    Hour-glass architecture (5-8-16-32-32-16-8-1) with dropout around the
    widest layers; compiled with MSE loss and Adam (no AMSGrad).
    """
    net = Sequential([
        Dense(5, input_dim=5, activation='tanh'),
        Dense(8, activation='relu'),
        Dense(16, activation=swish),
        Dropout(0.15),
        Dense(32, activation=swish),
        Dropout(0.25),
        Dense(32, activation=swish),
        Dropout(0.25),
        Dense(16, activation=swish),
        Dropout(0.15),
        Dense(8, activation='relu'),
        Dense(1),
    ])
    net.compile(loss='mean_squared_error',
                optimizer=optimizers.Adam(learning_rate=1e-3, amsgrad=False))
    return net

# Instantiate once outside the CV loop just to print the architecture.
model = build_toy()
model.summary()
def kFold_TOY(n_splits,train,target,test,seed=421):
    """Shuffled KFold CV for the small Keras stacking net (build_toy).

    Parameters
    ----------
    n_splits : int
        Number of folds.
    train : np.ndarray
        Scaled stacking features (indexed positionally, so an array is expected).
    target : pandas object
        Target values (indexed with .iloc).
    test : np.ndarray
        Scaled test features, predicted by every fold's model.
    seed : int
        Random state for the KFold shuffle.

    Returns
    -------
    pred_train : np.ndarray
        Out-of-fold predictions restored to the original row order.
    preds_nn : np.ndarray
        (n_splits, len(test)) per-fold test predictions.
    """
#     model = []
    # Per-fold result buffers
    preds_nn =[]
    preds_val = []
    val_indx = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    X = train
    y = target
    
    for i, (train_index, valid_index) in enumerate(kf.split(X, y)): 

        # X is a numpy array (plain indexing); y is pandas (.iloc)
        X_train = X[train_index]
        y_train = y.iloc[train_index]
        X_valid = X[valid_index]
        y_valid = y.iloc[valid_index]

        # Fresh, untrained network for every fold
        model = build_toy()

        # Shrink the learning rate by 0.7 after 6 stagnant epochs
        rlr = ReduceLROnPlateau(monitor='val_loss',
                                factor=0.7,
                                patience=6,
                                verbose=0,
                                min_delta=1e-5,
                                mode='min')

#         ckp = ModelCheckpoint(f'./model/model_{i}.h',
#                               monitor='val_loss',
#                               verbose=0,
#                               save_best_only=True,
#                               save_weights_only=True,
#                               mode='min')

        # Stop after 20 stagnant epochs and roll back to the best weights
        es = EarlyStopping(monitor='val_loss',
                           min_delta=1e-5,
                           patience=20,
                           mode='min',
                           baseline=None,
                           restore_best_weights=True,
                           verbose=0)

        print(f'fold {i} 巡目 計算中')
        model.fit(X_train, y_train,
                  validation_data=(X_valid, y_valid),
                  epochs=150,
                  batch_size=128,
#                   callbacks=[rlr, ckp, es],
                  callbacks=[rlr, es],
                  verbose=1)

#         model.load_weights(f'./model/model_{i}.h')
        preds = model.predict(test)        
        preds_nn.append(preds)
        
        valid_pred = model.predict(X_valid)
        preds_val.append(valid_pred)
        val_indx.append(valid_index)
        
    # model.predict returns (n, 1) arrays; reshape flattens them and
    # stacks the folds
    preds_nn = np.array(preds_nn).reshape(n_splits,len(test))
    val_indx = np.array(val_indx).reshape(len(train))
    preds_val = np.array(preds_val).reshape(len(train))
    # Restore the original row order of the out-of-fold predictions
    order = np.argsort(val_indx)
    pred_train = preds_val[order]
    
    return pred_train, preds_nn
# Standardize each region's stacking features (fit on that region's
# train matrix, applied to its train/test pair) and run the NN meta-model CV.
scaler = StandardScaler()
scaler.fit(train_us)
train_x_ = scaler.transform(train_us)
test_x_ = scaler.transform(test_us)
train_uss, preds_us = kFold_TOY(4, train_x_, train_y_u, test_x_,seed=seed_toy0)
# The same scaler object is re-fit for each region before transforming
scaler.fit(train_cs)
train_x_ = scaler.transform(train_cs)
test_x_ = scaler.transform(test_cs)
train_css, preds_cs = kFold_TOY(4, train_x_, train_y_c, test_x_,seed=seed_toy0)
scaler.fit(train_ts)
train_x_ = scaler.transform(train_ts)
test_x_ = scaler.transform(test_ts)
train_tss, preds_ts = kFold_TOY(4, train_x_, train_y_t, test_x_,seed=seed_toy0)
# Average the per-fold NN test predictions per region
submission_us = pd.DataFrame(preds_us).mean(axis=0)
submission_cs = pd.DataFrame(preds_cs).mean(axis=0)
submission_ts = pd.DataFrame(preds_ts).mean(axis=0)

特徴量生成②

# Rebuild the feature frame and splice the first-stage predictions back in.
df_copy.shape
set_seed(1)
df = df_copy.copy()
# Each training year had its first 240 hours removed before model ①
# (240 = 10 days x 24 h, matching the month==2 & day<11 filter applied to
# the training rows). Pad every yearly segment with zeros so the stacked
# prediction columns line up with the full hourly frame `df`.
dummy240 = np.zeros(240)
l1 = len_train_2017
l2 = l1 + len_train_2018
l3 = l2 + len_train_2019

def _with_year_padding(pred):
    """Insert a 240-sample zero block in front of each yearly segment of `pred`."""
    return np.concatenate([dummy240, pred[0:l1],
                           dummy240, pred[l1:l2],
                           dummy240, pred[l2:l3],
                           dummy240, pred[l3:]])

# Join out-of-fold train predictions with test predictions, then pad.
utsunomiya_lgb = _with_year_padding(np.concatenate([train_ul, submission_ul]))
utsunomiya_xgb = _with_year_padding(np.concatenate([train_ux, submission_ux]))
utsunomiya_ss = _with_year_padding(np.concatenate([train_uss, submission_us]))
chiba_lgb = _with_year_padding(np.concatenate([train_cl, submission_cl]))
chiba_xgb = _with_year_padding(np.concatenate([train_cx, submission_cx]))
chiba_ss = _with_year_padding(np.concatenate([train_css, submission_cs]))
tokyo_lgb = _with_year_padding(np.concatenate([train_tl, submission_tl]))
tokyo_xgb = _with_year_padding(np.concatenate([train_tx, submission_tx]))
tokyo_ss = _with_year_padding(np.concatenate([train_tss, submission_ts]))
# Weighted blend of the three first-stage models as new features
# (weights a, b, c are defined elsewhere in the notebook).
df["stack_utsunomiya"] = a*utsunomiya_lgb + b*utsunomiya_xgb + c*utsunomiya_ss
df["stack_chiba"] = a*chiba_lgb + b*chiba_xgb + c*chiba_ss
df["stack_tokyo"] = a*tokyo_lgb + b*tokyo_xgb + c*tokyo_ss
# Feature generation ② (a different feature set from the one used in ①)
for name in region_col:

    # Zero-precipitation run counter (zero_count is a project helper)
    df[f'zero_precipitation_{name}'] = zero_count(df[f'precipitation_{name}'],0.005)

    # Wind vector from direction code and speed; a direction value <= 0.1
    # maps to a zero vector.
    df[f'sin_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.sin(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
    df[f'cos_{name}'] = df[f'winddirection_{name}'].apply(lambda x: np.cos(2*np.pi*float(4-x)/16) if x > 0.1 else 0)
    df[f'v_sin_{name}'] = df[f'windspeed_{name}']*df[f'sin_{name}']
    df[f'v_cos_{name}'] = df[f'windspeed_{name}']*df[f'cos_{name}']
    # Intermediate and raw wind columns are no longer needed
    df = df.drop(columns=[f'sin_{name}', f'cos_{name}',
                          f'windspeed_{name}', f'winddirection_{name}'])

    # 36-hour rolling max of precipitation
    # (original comment said "max temperature", but the source column is precipitation)
    df[f'precipitation_{name}_max'] = df[f'precipitation_{name}'].rolling(36).max().interpolate(limit_direction='both')

    # Inner product of the wind vector with 6 of 8 azimuths
    # (az 0 and 4 are skipped — they reduce to the raw v_cos / v_sin columns)
    for az in range(8):
        if az != 0 and az != 4:
            df[f'Azimuth_{az}_inner_product_{name}'] = (df[f'v_cos_{name}']*math.cos(np.pi*az/8)+df[f'v_sin_{name}']*math.sin(np.pi*az/8)).interpolate(limit_direction='both')

    # Inner/cross product with the wind vector one hour earlier
    df[f'inner_product_{name}'] = (df[f'v_cos_{name}'].shift(1)*df[f'v_cos_{name}']+df[f'v_sin_{name}'].shift(1)*df[f'v_sin_{name}']).interpolate(limit_direction='both')
    df[f'cross_product_{name}'] = (df[f'v_cos_{name}'].shift(1)*df[f'v_sin_{name}']-df[f'v_sin_{name}'].shift(1)*df[f'v_cos_{name}']).interpolate(limit_direction='both')

    # Base columns shared by the delta / diff / delta_24h features below.
    # Order matters: it fixes the column creation order in df, identical
    # to the original hand-written sequence.
    bases = [f'precipitation_{name}', f'temperature_{name}',
             f'v_sin_{name}', f'v_cos_{name}',
             f'inner_product_{name}', f'cross_product_{name}']

    # Difference from one hour earlier
    for col in bases:
        df[f'{col}_delta'] = (df[col]-df[col].shift(1)).interpolate(limit_direction='both')

    # Difference from the 10-day moving average
    for col in bases:
        df[f'{col}_diff'] = (df[col]-df[col].rolling(24*10).mean()).interpolate(limit_direction='both')

    # Mean difference from the same hour over each of the previous 10 days
    for col in bases:
        df[f'{col}_delta_24h'] = np.zeros(len(df))
    for i in range(1,11):
        for col in bases:
            df[f'{col}_delta_24h'] += (df[col]-df[col].shift(24*i)).interpolate(limit_direction='both')
    for col in bases:
        df[f'{col}_delta_24h'] /= 10

追加特徴量と目的変数取り出し②

# Extra features and target extraction ② — concatenate the per-year
# mathematical-model CSVs per region. Columns by position (files written
# with header=None): 0 = observed pollen, 1 = modeled pollen,
# 2 = sugi (cedar) count since first-flight date, 3 = hinoki (cypress)
# count since first-flight date (per the original author's comment).
for name in region_col:
    df_3 = pd.DataFrame()
    for year in [2017,2018,2019,2020]:
        file_name = f"model_{name}_{year}.csv"
        df_3 = pd.concat([df_3,pd.read_csv(file_name,header=None)])
    file_name = f"model_{name}.csv"
    # NOTE(review): to_csv writes a header row here; the later
    # read_csv(..., header=None) compensates by slicing rows from index 1.
    # Keep both sides of this convention in sync.
    df_3.to_csv(file_name,index=False)
    print(df_3.shape)
    df_3 = df_3.reset_index()
    # Add modeled pollen amount and the first-flight counters as features
    df[f"pollen_model_{name}"] = df_3[1]
    df[f"count_sugi_{name}"] = df_3[2]
    df[f"count_hinoki_{name}"] = df_3[3]
    # Target: double log1p of (observed / modeled), then ymin_log
    # (project helper defined elsewhere)
    df_3 = ymin_log(np.log1p(np.log1p(df_3[0]/df_3[1])))
    file_name = f"model_{name}_target.csv"
    df_3.to_csv(file_name,index=False)

前処理②(特徴量)

# Split the feature frame back into train / test ranges
train=df[0:len_train]
test=df[len_train:]

# Drop target and calendar columns from the feature set
d_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo', 'year', 'month', 'day']
# Calendar columns are kept aside to filter the targets the same way later
month = train[['year','month','day']]
# Remove Feb 1-10 (240 hours) from every training year
train = train[~((train['month']==2)&(train['day']<11))].reset_index(drop=True)
# display(train)
train_x = train.drop(columns=d_col)
test_x = test.drop(columns=d_col)

# One-hot encode the hour column
# NOTE(review): get_dummies is applied to train and test separately; this
# is only safe while both contain the same set of hour values — confirm.
train_x['hour'] = train_x['hour'].astype('str')
test_x['hour'] = test_x['hour'].astype('str')
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

print(train_x.shape)
print(test_x.shape)

# Restrict the feature set to a single region by keeping only columns
# whose name mentions that region. Selecting the matching columns in one
# step replaces the original drop-one-column-at-a-time loops: same
# columns in the same order, without O(n^2) intermediate frame copies.
# (As in the original, test_x is filtered by train_x's column list, so
# both frames are assumed to share columns.)
def _region_cols(region):
    """Columns of the shared feature set belonging to `region`."""
    return [col for col in train_x.columns if region in col]

train_x_u = train_x[_region_cols('utsunomiya')]
test_x_u = test_x[_region_cols('utsunomiya')]

train_x_c = train_x[_region_cols('chiba')]
test_x_c = test_x[_region_cols('chiba')]

train_x_t = train_x[_region_cols('tokyo')]
test_x_t = test_x[_region_cols('tokyo')]

# Training targets, from the per-region target CSVs written earlier.
# Rows are sliced from 1 because each file carries a header row that
# read_csv(header=None) returns as data row 0.
train_y_u = pd.read_csv("model_utsunomiya_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_c = pd.read_csv("model_chiba_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
train_y_t = pd.read_csv("model_tokyo_target.csv",header=None)[1:len_train+1].reset_index(drop=True).fillna(0)
# Attach calendar columns so the same Feb 1-10 filter used for the
# features can be applied to the targets
train_y_u = pd.concat([month,train_y_u],axis=1)
train_y_c = pd.concat([month,train_y_c],axis=1)
train_y_t = pd.concat([month,train_y_t],axis=1)
train_y_u = train_y_u[~((train_y_u['month']==2)&(train_y_u['day']<11))].reset_index(drop=True)
train_y_c = train_y_c[~((train_y_c['month']==2)&(train_y_c['day']<11))].reset_index(drop=True)
train_y_t = train_y_t[~((train_y_t['month']==2)&(train_y_t['day']<11))].reset_index(drop=True)

# Per-year training-row counts, taken before the calendar columns are
# dropped; used later to re-insert the yearly zero padding.
len_train_2017 = int((train_y_u['year'] == 2017).sum())
len_train_2018 = int((train_y_u['year'] == 2018).sum())
len_train_2019 = int((train_y_u['year'] == 2019).sum())
len_train_2020 = int((train_y_u['year'] == 2020).sum())

# The calendar columns have served their purpose — keep only the target
train_y_u = train_y_u.drop(columns=['year', 'month', 'day'])
train_y_c = train_y_c.drop(columns=['year', 'month', 'day'])
train_y_t = train_y_t.drop(columns=['year', 'month', 'day'])

for n_rows in (len_train_2017, len_train_2018, len_train_2019, len_train_2020):
    print(n_rows)

# Mathematical-model pollen values for the test period (column 1 of each
# model CSV; rows from len_train+1 skip the header row stored in the file
# plus the training rows)
test_u = pd.read_csv("model_utsunomiya.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_c = pd.read_csv("model_chiba.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)
test_t = pd.read_csv("model_tokyo.csv",header=None)[1][len_train+1:].reset_index(drop=True).fillna(0)

# Clip target outliers to the [THR_L, THR_H] quantile range.
# Series.clip replaces the original per-element lambda — identical
# result (including NaN passthrough), vectorized, and the three
# copy-pasted threshold blocks collapse into one helper.
def _clip_to_quantiles(y):
    """Winsorize series `y` at its THR_L / THR_H quantiles."""
    threshold_h = y.quantile(THR_H)
    threshold_l = y.quantile(THR_L)
    return y.clip(lower=threshold_l, upper=threshold_h)

train_y_u = _clip_to_quantiles(train_y_u[0])
train_y_c = _clip_to_quantiles(train_y_c[0])
train_y_t = _clip_to_quantiles(train_y_t[0])

lgb で交差検証②

# LightGBM CV ② on the stacked feature set (one run per region,
# second-stage seed)
train_ul,preds_ul = kFold_LightGBM(4, train_x_u, train_y_u, test_x_u,seed=seed_lgb1)
train_cl,preds_cl = kFold_LightGBM(4, train_x_c, train_y_c, test_x_c,seed=seed_lgb1)
train_tl,preds_tl = kFold_LightGBM(4, train_x_t, train_y_t, test_x_t,seed=seed_lgb1)

予測結果②

# Average the per-fold test predictions into one Series per region
submission_ul = pd.DataFrame(preds_ul).mean(axis=0)
submission_cl = pd.DataFrame(preds_cl).mean(axis=0)
submission_tl = pd.DataFrame(preds_tl).mean(axis=0)

最小単位を指定して提出ファイルに格納②

# Invert the target transform (ymin_exp is the project inverse of
# ymin_log; the double expm1 undoes the double log1p), then scale by the
# mathematical-model values and quantize to the reporting unit.
submission_u = np.expm1(np.expm1(ymin_exp(submission_ul)))
submission_c = np.expm1(np.expm1(ymin_exp(submission_cl)))
submission_t = np.expm1(np.expm1(ymin_exp(submission_tl)))
m_unit = 4
submission = pd.read_csv('./data/sample_submission.csv')
# Round to the nearest multiple of m_unit; .clip(lower=0) replaces the
# original per-element `apply(lambda x: x if x>0 else 0)` — identical
# result, vectorized.
submission['pollen_utsunomiya'] = m_unit*(submission_u*test_u/m_unit).round(0).astype(int).clip(lower=0)
submission['pollen_chiba'] = m_unit*(submission_c*test_c/m_unit).round(0).astype(int).clip(lower=0)
submission['pollen_tokyo'] = m_unit*(submission_t*test_t/m_unit).round(0).astype(int).clip(lower=0)
submission
submission.to_csv(f"./submit/submit_stck_{seed_lgb0:03}{seed_xgb0:03}{seed_toy0:03}{seed_lgb1:03}{a}{b}{c}.csv", index=False, header=True)

Discussion

数理モデルでフィッティングして
(目的変数)=(飛来花粉量)/(数理モデル)
としたところが解法の要諦です。

(数理モデル)=(年間バイアス)+(スギ花粉量)+(ヒノキ花粉量)

としており、それぞれの花粉量はガンマ関数に従うと仮定しています。


赤色・・・数理モデル
灰色・・・スギ(初飛来日は2月上旬)
黄色・・・ヒノキ(初飛来日は3月中旬)

ログインするとコメントできます