😀

PyTorchで途中のlayerの入出力と勾配を保存する

2022/07/07に公開

ディープラーニング

PyTorch

tech

目的：Deep Learningで途中のlayerの入出力を保存する

背景：Deep Learningの性質を調べるときに途中のlayerの入出力(weightではない)を保存したいことがある

前方伝搬時に各layerの入出力のshapeを出力する簡単なサンプル

利用するモデル

Sequential(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=2, bias=True)
)

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 2))


def forward_hook(model, inputs, output):
    """Print the shape of a layer's first input and of its output.

    Registered below as a forward hook, so it runs on every forward pass of
    the hooked layer. ``inputs`` is a tuple because a layer may receive
    several inputs; only the first one is reported.
    """
    first_input_shape = inputs[0].shape
    output_shape = output.shape
    print('input:', first_input_shape, 'output', output_shape)


# Attach the same hook to every direct child of the Sequential container.
for name, layer in model.named_children():
    print(f'hook onto {name}')
    layer.register_forward_hook(forward_hook)

# A single forward pass over a random batch of 5 samples fires each hook once.
x = torch.rand(5, 10)
model(x).mean()

出力

input: torch.Size([5, 10]) output torch.Size([5, 5])
input: torch.Size([5, 5]) output torch.Size([5, 2])

前方伝搬するときにforward_hookが呼ばれ、inputsとoutputにそれぞれ入力と出力のtensorが入る。
※inputsは入力の組のtuple、outputはtensorになる

逆誤差伝搬の場合

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 2))


def backward_hook(module, grad_input, grad_output):
    """Print the shapes of the gradients handed to a backward hook.

    Entries of ``grad_input`` are listed only when the tuple has at least
    two elements, and ``None`` entries are skipped (the gradient with
    respect to the network input is ``None`` for the first layer). The shape
    of ``grad_output[0]`` is always printed.
    """
    indexed_grads = enumerate(grad_input) if len(grad_input) >= 2 else ()
    for idx, grad in indexed_grads:
        if grad is None:
            continue
        print('grad_in', idx, grad.shape)
    print('grad_output:', grad_output[0].shape)


# Attach the hook to every direct child of the Sequential container.
for name, layer in model.named_children():
    print(f'hook onto {name}')
    # The legacy hook is used on purpose here; the alternative would be
    # layer.register_full_backward_hook(backward_hook).
    layer.register_backward_hook(backward_hook)

# One forward + backward pass over a random batch of 3 samples.
x = torch.rand(3, 10)
model(x).mean().backward()

出力

grad_in 0 torch.Size([2])
grad_in 1 torch.Size([3, 5])
grad_in 2 torch.Size([5, 2])
grad_output: torch.Size([3, 2])
grad_in 0 torch.Size([5])
grad_in 2 torch.Size([10, 5])
grad_output: torch.Size([3, 5])

grad_inputとgrad_outputの意味は

x(3, 10)->fc1(10,5)->h1(3, 5)->fc2(5,2)->h2(3, 2)->mean->1

fc1とfc2それぞれの前後での微分値を返している
つまりfc2のgrad_outputはlossを h2(3, 2) で微分した値
同様にfc2のgrad_input[1]はlossを h1(3, 5) で微分した値

今回だとfc2のgrad_input[1]とfc1のgrad_outputは一致する
fc1のgrad_inputはlossを x(3, 10) で微分した値になるはずだが、xは勾配を必要としない(requires_grad=False)ためNoneになる

ノードの出力を用いたlossの微分値をしりたいなら、grad_output[0]だけを監視してればいい

※ register_backward_hookは古いAPIで、本来はregister_full_backward_hookを使うべきだが、ReLU(inplace=True)がある場合に動かない[1][2]ため、今回はあえてregister_backward_hookを使っている

register_backward_hookとregister_full_backward_hookのどちらを使うかで、grad_inputの中身が変わる。

register_full_backward_hookの場合

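[0] is the derivative of loss wrt layer input
[1] is the derivative of loss wrt layer output (before activation)
[2] is the derivative of loss wrt layer weights
※ 注意: PyTorch公式ドキュメントでは、register_full_backward_hookのgrad_inputはモジュールの入力に対する勾配だけを含むtupleとされており、nn.Linearのように入力が1つのlayerでは要素数は1になる。上記の3要素の並びは実際の環境で確認すること。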

register_backward_hookの場合fcとconvで異なる

fcの場合
[0] shape [10] - Bias values.
[1] shape [64, 84] - Data. The first value is the 64 batches, 84 inputs from the previous layer.
[2] shape [84, 10] - Layer weights. Each node in the fully connected layer receives the 84 outputs from the previous layer. There are 10 nodes.

convの場合
[0] shape [64, 16, 16, 16] - This is the input data. 64 batches, 16 feature maps deep, 16 width, 16 height.
[1] shape [32, 16, 3, 3] - This is the kernel weight data. 32 kernels with 16 depth (to match number if input feature maps), and 3x3 height/width.
[2] shape [32] - This is the bias for each kernel

保存するためのクラス

hook関数を使って、ノードの値をprintするだけでなく、保存する必要があるのでそれ用のクラス。
前方伝搬が行われるたびに記録して、平均した値を返す

class SaveActive(object):
    """Record the inputs, outputs and gradients of every leaf layer of a model.

    Constructing the object registers a forward hook and a legacy backward
    hook on each module of ``model`` that has no children. Every forward or
    backward pass appends a detached float32 CPU copy of the tensors seen by
    the hooks to ``fw_input`` / ``fw_output`` / ``bw_input`` / ``bw_output``,
    which are dicts keyed by the module name from ``named_modules()``.

    Used as a context manager, the hooks are removed and the buffers are
    emptied when the ``with`` block exits.

    Args:
        model: The ``torch.nn.Module`` whose leaf layers are observed.
        verbose: When true, each ``get_*_mean_norm`` call also prints the
            number of samples (rows along dim 0) of the last layer it
            averaged. Defaults to ``False`` (silent).
    """

    def __init__(self, model, verbose=False):
        self.verbose = verbose
        self.fw_hook_lst = []
        self.bw_hook_lst = []
        self._attach(model)

    def __enter__(self):
        return self

    def __call__(self, model):
        """Re-target the recorder at ``model``, discarding everything recorded.

        Hooks left from the previously observed model are removed first;
        otherwise they would stay registered and record every pass twice.
        """
        self.remove_hook()
        self._attach(model)

    def __exit__(self, exc_type, exc_value, traceback):
        self.remove_hook()
        self.clear_buffer()

    def _attach(self, model):
        """Start from empty buffers and hook every leaf layer of ``model``."""
        self.model = model
        self.fw_output = {}
        self.fw_input = {}
        self.bw_output = {}
        self.bw_input = {}
        self.clear_buffer()
        self._register_model(model)

    @staticmethod
    def _is_leaf(layer):
        """Return True when ``layer`` has no child modules."""
        return next(layer.children(), None) is None

    @staticmethod
    def _snapshot(tensor):
        """Return a detached float32 CPU copy of ``tensor`` with dim >= 1."""
        copy = tensor.detach().clone().cpu().to(torch.float32)
        if copy.dim() == 0:
            # torch.cat cannot concatenate zero-dimensional tensors.
            copy = copy.unsqueeze(0)
        return copy

    def clear_buffer(self):
        """Reset the four record lists of every leaf layer to empty."""
        for name, layer in self.model.named_modules():
            if self._is_leaf(layer):
                self.fw_input[name] = []
                self.fw_output[name] = []
                self.bw_input[name] = []
                self.bw_output[name] = []

    def _register_model(self, model):
        """Register the forward and backward hooks on each leaf layer."""
        for name, layer in model.named_modules():
            if not self._is_leaf(layer):
                continue
            self.fw_hook_lst.append(
                layer.register_forward_hook(self.fw_save(name)))
            # The legacy backward hook is used instead of
            # register_full_backward_hook because of the in-place issue
            # tracked in https://github.com/pytorch/pytorch/issues/61519
            self.bw_hook_lst.append(
                layer.register_backward_hook(self.bw_save(name)))

    def remove_hook(self):
        """Unregister every hook added by this object and forget the handles."""
        for handle in self.fw_hook_lst + self.bw_hook_lst:
            handle.remove()
        # Clear in place so stale handles are never removed a second time.
        self.fw_hook_lst.clear()
        self.bw_hook_lst.clear()

    def fw_save(self, name):
        """Build the forward hook that records tensors under key ``name``."""
        def forward_hook(model, inputs, output):
            # ``inputs`` is a tuple; only its first element is recorded.
            # Non-tensor inputs/outputs (e.g. tuples, None) are skipped
            # instead of raising inside the forward pass.
            if inputs and torch.is_tensor(inputs[0]):
                self.fw_input[name].append(self._snapshot(inputs[0]))
            if torch.is_tensor(output):
                self.fw_output[name].append(self._snapshot(output))

        return forward_hook

    def bw_save(self, name):
        """Build the backward hook that records gradients under key ``name``."""
        def backward_hook(module, grad_input, grad_output):
            # Only grad_input[1] is recorded, and only when it is present.
            # NOTE(review): which quantity sits at index 1 depends on the
            # layer type with the legacy hook -- confirm for non-Linear layers.
            if len(grad_input) >= 2 and grad_input[1] is not None:
                self.bw_input[name].append(self._snapshot(grad_input[1]))
            if grad_output and grad_output[0] is not None:
                self.bw_output[name].append(self._snapshot(grad_output[0]))

        return backward_hook

    def _mean_norm(self, records, empty_message):
        """Return ``{name: norm of the dim-0 mean}`` for the recorded tensors.

        Each layer's recorded tensors are concatenated along dim 0, averaged
        over dim 0, and reduced with ``norm()``. Layers without records are
        omitted. When nothing at all has been recorded, ``empty_message`` is
        printed and an empty dict is returned.
        """
        if self._is_null(records):
            print(empty_message)
            return {}
        means = {}
        n_data = 0
        for key, tensors in records.items():
            if not tensors:
                continue
            stacked = torch.cat(tensors, dim=0)
            means[key] = stacked.mean(0).norm()
            n_data = len(stacked)
        if self.verbose:
            print(f'mean norm by n_sameples: {n_data}')
        return means

    def get_fw_input_mean_norm(self):
        """Return the per-layer norm of the mean recorded forward input."""
        return self._mean_norm(self.fw_input, 'Error: Please try foward prop')

    def get_fw_output_mean_norm(self):
        """Return the per-layer norm of the mean recorded forward output."""
        return self._mean_norm(self.fw_output, 'Error: Please try foward prop')

    def get_bw_input_mean_norm(self):
        """Return the per-layer norm of the mean recorded ``grad_input[1]``."""
        return self._mean_norm(self.bw_input, 'Error: Please try backward prop')

    def get_bw_output_mean_norm(self):
        """Return the per-layer norm of the mean recorded ``grad_output[0]``."""
        return self._mean_norm(self.bw_output, 'Error: Please try backward prop')

    def _is_null(self, dat):
        """Return True when no list in ``dat`` holds any record."""
        return sum(len(records) for records in dat.values()) == 0

使い方

model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 2))

# Constructing SaveActive registers the hooks; they are removed again when
# the with-block exits, so recording only happens inside it.
with SaveActive(model) as sa:
    x = torch.rand(6, 10)
    loss = model(x).mean()
    loss.backward()
    # Forward input/output statistics first, then the backward ones.
    for report in (
        sa.get_fw_input_mean_norm,
        sa.get_fw_output_mean_norm,
        sa.get_bw_input_mean_norm,
        sa.get_bw_output_mean_norm,
    ):
        print(report())

出力

mean norm by n_sameples: 6
{'0': tensor(1.5072), '1': tensor(0.5847)}
mean norm by n_sameples: 6
{'0': tensor(0.5847), '1': tensor(0.4009)}
mean norm by n_sameples: 6
{'1': tensor(0.1083)}
mean norm by n_sameples: 6
{'0': tensor(0.1083), '1': tensor(0.1179)}

前方伝搬だけすると、fwだけが記録され、逆誤差伝搬までするとbwにも値が入る
すべての入力に対してbwしたときの勾配の平均がほしいなら、batchごとにbackward()を呼ぶ必要がある。
平均に使われたn_sample数が出力されるので、思った通りの動作をしているかはそこでわかる。
今は出力を楽にするためにnormを返しているが、.mean(0).norm()→.mean(0)に修正すればtensor自体がreturnされる。
with構文を用いて、withの中にあるときだけ記録するようにする

つまり

with SaveActive(model) as sa:
    # Two forward-only passes, then one forward pass followed by backward.
    for _ in range(2):
        model(x).mean()
    model(x).mean().backward()

とすると
前方伝搬はサンプル数 18の平均になり(batch size 6 * 3回)
逆誤差伝搬はサンプル数6の平均になる

ResNetに使ってみる

resblockも x+h1 → h2 となり、1つ入力すると1つ出力するので問題ない
ReLUのinplace=Trueにだけ気をつける必要がある