😊

手軽に Vision Transformer で二値分類をする

2023/06/24に公開

基本的にはvit_pytorchのサンプル
https://farml1.com/vit/
https://meditech-ai.com/pytorch-vision-transformer/
を参考にさせてもらってます。

前準備

from __future__ import print_function

import glob
from itertools import chain
import os
import random
import zipfile
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import timm
from linformer import Linformer
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm

from vit_pytorch.efficient import ViT

def seed_everything(seed):
    """Seed every RNG in use (Python, NumPy, PyTorch CPU & CUDA).

    Also forces cuDNN into deterministic mode so repeated runs with the
    same seed produce the same results.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed,
                   torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# (fix) `seed` was referenced here before being defined anywhere above,
# which raises NameError on a fresh run; bind it explicitly. The value
# matches the hyperparameter section further down.
seed = 1
seed_everything(seed)
# (fix) fall back to CPU instead of failing later on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

学習データを用意

課題や業務に合わせて用意する。

この例ではKaggleで公開されている犬と猫のラベル付き教師データ(25000枚)を利用する。
https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition

最後に答え合わせをしたいので今回は画像を1000枚除けておく。

# First run: extract the Kaggle archive and split off a hold-out set.
os.makedirs('data', exist_ok=True)
train_dir = 'data/train'
with zipfile.ZipFile('train.zip') as train_zip:
    train_zip.extractall('data')
train_orig_list = glob.glob(os.path.join(train_dir, '*.jpg'))

# Hold out 1000 images for the final check, as described above.
# (fix) the original had an empty `test_size=` — a SyntaxError.
train_list, test_list = train_test_split(train_orig_list,
                                         test_size=1000,
                                         random_state=seed)
print(f"Train Data: {len(train_list)}")
print(f"Test Data: {len(test_list)}")

# Persist both file lists so later runs reuse the exact same split.
with open("train_list.pickle", mode="wb") as f:
    pickle.dump(train_list, f)
with open("test_list.pickle", mode="wb") as f:
    pickle.dump(test_list, f)

# Subsequent runs: reload the saved training list instead of re-splitting.
with open("train_list.pickle", mode="br") as f:
    train_list = pickle.load(f)

ファインチューニング

この例では教師データを訓練用と検証用に3:1で分けて学習する。
model_image_sizeは使うモデルのサイズに合わせる。

今回は猫犬の二値分類なので、CatsDogsDatasetが犬は1、猫は0を返すようになっている。

# Training configuration
test_size = 0.25               # fraction of training data held out for validation
test_image_size = 256          # resize target before center-cropping at inference
model_image_size = 224         # input resolution expected by vit_base_patch16_224
model_name = 'vit_base_patch16_224'
num_classes = 2                # binary classification: cat (0) vs dog (1)

# Hyperparameters
batch_size = 64
epochs = 20
lr = 1e-6                      # low LR: we are fine-tuning a pretrained model
gamma = 0.7                    # per-step LR decay factor for StepLR
seed = 1

# Split the remaining training files 3:1 into train / validation sets
train_list, valid_list = train_test_split(train_list, 
                                          test_size=test_size,
                                          random_state=seed)
# Training-time preprocessing with light augmentation
# (random crop + horizontal flip)
train_transforms = transforms.Compose(
    [
        transforms.Resize((model_image_size, model_image_size)),
        transforms.RandomResizedCrop(model_image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)
# Validation-time preprocessing: deterministic resize + center crop
val_transforms = transforms.Compose(
    [
        transforms.Resize(test_image_size),
        transforms.CenterCrop(model_image_size),
        transforms.ToTensor(),
    ]
)

class CatsDogsDataset(Dataset):
    """Dataset yielding (transformed image, label) pairs.

    The label is derived from the file name prefix: "dog.123.jpg" -> 1,
    anything else (i.e. "cat.*") -> 0.
    """

    def __init__(self, file_list, transform=None):
        # file_list: paths to .jpg files; transform: torchvision transform or None
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        # len() on a list is O(1); no need to cache it as the original did.
        return len(self.file_list)

    def __getitem__(self, idx):
        img_path = self.file_list[idx]
        # (fix) force RGB: the dogs-vs-cats set contains a few non-RGB
        # JPEGs, which would otherwise produce tensors with != 3 channels.
        img = Image.open(img_path).convert("RGB")
        if self.transform is not None:
            img = self.transform(img)

        # (fix) os.path.basename is portable; the original split on "/"
        # and would mislabel everything on Windows paths.
        label = os.path.basename(img_path).split(".")[0]
        label = 1 if label == "dog" else 0

        return img, label

# Wrap file lists in datasets / loaders; shuffle the training data each epoch.
train_data = CatsDogsDataset(train_list, transform=train_transforms)
valid_data = CatsDogsDataset(valid_list, transform=val_transforms)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=True)

# Pretrained ViT from timm with a fresh 2-class classification head.
model = timm.create_model(model_name, pretrained=True, num_classes=num_classes)
# (fix) use the shared `device` variable; the original hard-coded "cuda:0",
# inconsistent with the rest of the script.
model.to(device)

# loss function
criterion = nn.CrossEntropyLoss()
# optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)
# scheduler: multiply the LR by `gamma` every step
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

for epoch in range(epochs):
    # --- training phase ---
    model.train()  # (fix) re-enable train mode each epoch (eval is set below)
    epoch_loss = 0
    epoch_accuracy = 0

    for data, label in tqdm(train_loader):
        data = data.to(device)
        label = label.to(device)

        output = model(data)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(dim=1) == label).float().mean()
        epoch_accuracy += acc.item() / len(train_loader)
        # (fix) .item() detaches the scalar; accumulating the raw loss
        # tensor (as the original did) keeps every batch's autograd
        # graph alive and steadily leaks GPU memory.
        epoch_loss += loss.item() / len(train_loader)

    # --- validation phase ---
    model.eval()  # (fix) disable dropout etc. during validation
    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in valid_loader:
            data = data.to(device)
            label = label.to(device)

            val_output = model(data)
            val_loss = criterion(val_output, label)

            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc.item() / len(valid_loader)
            epoch_val_loss += val_loss.item() / len(valid_loader)

    # (fix) the scheduler was constructed but never stepped in the original,
    # so `gamma` had no effect; decay the LR once per epoch.
    scheduler.step()

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

# save
# NOTE(review): this pickles the entire model object, which ties the
# checkpoint to the exact timm/torch class definitions installed at save
# time; `model.state_dict()` is the more portable convention, but the
# loading code later in this script expects this format.
torch.save(
    {
        "trained": model,
    },
    "trained_model.pth",
)

答え合わせ(利用)

# Load the full pickled model saved above. torch.load unpickles arbitrary
# objects — only load checkpoints from trusted sources.
model = torch.load('trained_model.pth')['trained']
model.eval()  # switch to evaluation mode (disables dropout etc.)

# 犬猫分類メソッド
# Dog/cat classification helper
def classification(model, file_path, visible=True):
    """Classify a single image file as "cat" or "dog".

    Args:
        model: trained classifier, already on the GPU and in eval mode.
        file_path: path to a JPEG image.
        visible: when True, display the image before classifying.

    Returns:
        "cat" when class index 0 wins, otherwise "dog".
    """
    test_image = Image.open(file_path)
    if visible:
        test_image.show()
    # NOTE(review): relies on the module-level `test_transforms` defined
    # later in the script; it must exist before this is called.
    # (fix) renamed from `input`, which shadowed the builtin.
    batch = test_transforms(test_image).unsqueeze(0)
    batch = batch.cuda()
    # (fix) a single no_grad block suffices; the original nested two.
    with torch.no_grad():
        outputs = model(batch)
    return "cat" if np.argmax(outputs.cpu().numpy()) == 0 else "dog"

# 確認用画像リスト
with open("test_list.pickle", mode="br") as f:
    test_list = pickle.load(f)

test_transforms = transforms.Compose(
    [
        transforms.Resize(test_image_size),
        transforms.CenterCrop(model_image_size),
        transforms.ToTensor(),
    ]
)

# Feed one image to the model and print whether it is a dog or a cat
test_file = test_list[0]
print(test_file, classification(model, test_file))

# Measure loss and accuracy on the held-out images
test_data = CatsDogsDataset(test_list, transform=test_transforms)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
loss_sum = 0
accuracy = 0

# (fix) evaluate under no_grad: the original built an autograd graph for
# every batch, wasting memory for no benefit. .item() likewise keeps the
# accumulators as plain floats instead of live tensors.
with torch.no_grad():
    for data, label in test_loader:
        data = data.to(device)
        label = label.to(device)

        output = model(data)
        loss = criterion(output, label)
        acc = (output.argmax(dim=1) == label).float().mean()
        accuracy += acc.item() / len(test_loader)
        loss_sum += loss.item() / len(test_loader)

print(
    f"loss : {loss_sum:.4f} - acc: {accuracy:.4f}\n"
)

利用しているモデル(Hugging Faceで公開されている事前学習済みViT)の性能が良いのと、教師データの数が多いおかげで99%以上の精度が出る。

Discussion