Easy binary classification with a Vision Transformer
This is basically based on the vit_pytorch sample code and other references.
Preparation
from __future__ import print_function
import glob
from itertools import chain
import os
import random
import zipfile
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import timm
from linformer import Linformer
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
from vit_pytorch.efficient import ViT
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 1  # same value as in the hyperparameter block below
seed_everything(seed)
device = 'cuda'
Preparing the training data
Prepare data that suits your own task or project.
This example uses the labeled dog-vs-cat training data (25,000 images) published on Kaggle.
Since we want to check the answers at the end, 1,000 images are set aside this time.
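If the dataset is not already on hand, one way to fetch it is through the Kaggle CLI. This is only a minimal sketch: it assumes the kaggle package is installed, an API token is configured, you have joined the Dogs vs. Cats competition (slug dogs-vs-cats), and that the downloaded archive still contains train.zip.

import subprocess
import zipfile

# Download the competition archive (creates dogs-vs-cats.zip in the current directory)
subprocess.run(["kaggle", "competitions", "download", "-c", "dogs-vs-cats"], check=True)

# Pull out train.zip (the 25,000 labeled images); the code below extracts it into data/
with zipfile.ZipFile("dogs-vs-cats.zip") as z:
    z.extract("train.zip")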
# First run
os.makedirs('data', exist_ok=True)
train_dir = 'data/train'
with zipfile.ZipFile('train.zip') as train_zip:
    train_zip.extractall('data')
train_orig_list = glob.glob(os.path.join(train_dir, '*.jpg'))
train_list, test_list = train_test_split(train_orig_list,
                                         test_size=1000,  # set aside 1,000 images for the final check
                                         random_state=seed)
print(f"Train Data: {len(train_list)}")
print(f"Test Data: {len(test_list)}")
# Save the file lists so they can be reused later
with open("train_list.pickle", mode="wb") as f:
    pickle.dump(train_list, f)
with open("test_list.pickle", mode="wb") as f:
    pickle.dump(test_list, f)
# From the second run onward
with open("train_list.pickle", mode="rb") as f:
    train_list = pickle.load(f)
Fine-tuning
In this example, the training data is further split 3:1 into training and validation sets for learning.
model_image_size should match the input size of the model being used.
Since this is a dog/cat binary classification task, CatsDogsDataset returns 1 for dogs and 0 for cats.
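If you are unsure what input size a model expects, timm exposes it in the model's default configuration. A minimal sketch, assuming a timm version that exposes default_cfg:

# Look up the expected input size for the chosen model
m = timm.create_model('vit_base_patch16_224', pretrained=False)
print(m.default_cfg['input_size'])  # (3, 224, 224) -> model_image_size = 224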
# Training settings
test_size = 0.25
test_image_size = 256
model_image_size = 224
model_name = 'vit_base_patch16_224'
num_classes = 2
# Hyperparameters
batch_size = 64
epochs = 20
lr = 1e-6
gamma = 0.7
seed = 1
train_list, valid_list = train_test_split(train_list,
                                          test_size=test_size,
                                          random_state=seed)
train_transforms = transforms.Compose(
    [
        transforms.Resize((model_image_size, model_image_size)),
        transforms.RandomResizedCrop(model_image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)
val_transforms = transforms.Compose(
    [
        transforms.Resize(test_image_size),
        transforms.CenterCrop(model_image_size),
        transforms.ToTensor(),
    ]
)
class CatsDogsDataset(Dataset):
    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        img_path = self.file_list[idx]
        img = Image.open(img_path)
        img_transformed = self.transform(img)
        # File names look like "dog.1234.jpg" / "cat.5678.jpg": dog -> 1, cat -> 0
        label = img_path.split("/")[-1].split(".")[0]
        label = 1 if label == "dog" else 0
        return img_transformed, label
train_data = CatsDogsDataset(train_list, transform=train_transforms)
valid_data = CatsDogsDataset(valid_list, transform=val_transforms)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=True)
model = timm.create_model(model_name, pretrained=True, num_classes=num_classes)
model.to(device)
# loss function
criterion = nn.CrossEntropyLoss()
# optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)
# scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0
    for data, label in tqdm(train_loader):
        data = data.to(device)
        label = label.to(device)
        output = model(data)
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = (output.argmax(dim=1) == label).float().mean()
        # use .item() so the computation graph is not retained across the epoch
        epoch_accuracy += acc.item() / len(train_loader)
        epoch_loss += loss.item() / len(train_loader)
    scheduler.step()  # decay the learning rate by gamma each epoch
    model.eval()
    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in valid_loader:
            data = data.to(device)
            label = label.to(device)
            val_output = model(data)
            val_loss = criterion(val_output, label)
            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc.item() / len(valid_loader)
            epoch_val_loss += val_loss.item() / len(valid_loader)
    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} "
        f"- val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )
# save
torch.save(
    {
        "trained": model,
    },
    "trained_model.pth",
)
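Saving the whole model object like this is convenient, but it ties the checkpoint to the exact class definition and library versions. A state_dict-based save is the more portable alternative; a minimal sketch of that option (not what this article uses):

# Alternative: save only the weights and rebuild the model when loading
torch.save(model.state_dict(), "trained_model_state.pth")

model = timm.create_model(model_name, pretrained=False, num_classes=num_classes)
model.load_state_dict(torch.load("trained_model_state.pth", map_location=device))
model.to(device)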
Checking the results (using the model)
model = torch.load('trained_model.pth')['trained']
model.eval()  # put the model in evaluation mode
# Dog/cat classification function
def classification(model, file_path, visible=True):
    with torch.no_grad():
        test_image = Image.open(file_path)
        if visible:
            test_image.show()
        input = test_transforms(test_image).unsqueeze(0)
        input = input.to(device)
        outputs = model(input)
        return "cat" if np.argmax(outputs.cpu().numpy()) == 0 else "dog"
# List of held-out images for the final check
with open("test_list.pickle", mode="rb") as f:
    test_list = pickle.load(f)
test_transforms = transforms.Compose(
    [
        transforms.Resize(test_image_size),
        transforms.CenterCrop(model_image_size),
        transforms.ToTensor(),
    ]
)
# Feed a single image and print whether it is a dog or a cat
test_file = test_list[0]
print(test_file, classification(model, test_file))
# Measure accuracy on the held-out images
test_data = CatsDogsDataset(test_list, transform=test_transforms)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
criterion = nn.CrossEntropyLoss()
loss_sum = 0
accuracy = 0
with torch.no_grad():
    for data, label in test_loader:
        data = data.to(device)
        label = label.to(device)
        output = model(data)
        loss = criterion(output, label)
        acc = (output.argmax(dim=1) == label).float().mean()
        accuracy += acc.item() / len(test_loader)
        loss_sum += loss.item() / len(test_loader)
print(
    f"loss : {loss_sum:.4f} - acc: {accuracy:.4f}\n"
)
Thanks to the quality of the pretrained model (the one from Hugging Face) and the large amount of training data, accuracy comes out above 99%.