Running Stable Diffusion on an M2 MacBook Air
Web-based image generation and editing services come with various inconveniences, so it is handy to have a locally running Stable Diffusion environment.
Lightweight models run even on a MacBook, so in this article I try out a handful of them as a working check.
Environment
- MacBook Air (M2 chip, 16 GB)
- PyTorch 2.3.0
Impressions
Memory usage and execution speed of the models I tested are shown below. The measurements are rough, so treat them only as a ballpark.
| Model | Memory usage | M2 | NVIDIA RTX A5000 |
|---|---|---|---|
| DDPM-cat | ~0.8 GB | ~3 it/s | ~42 it/s |
| Stable Diffusion v1.5 | ~5 GB | ~1 it/s | ~12 it/s |
| Stable Diffusion XL Turbo | ~10 GB | ~0.5 it/s | ~15 it/s |
| Stable Diffusion v2 (inpainting) | ~3 GB | ~1 it/s | ~20 it/s |
From a quick hands-on impression, execution is far slower than an NVIDIA RTX A5000, but for generating a handful of images the M2 chip is practical enough. Memory-wise, the Stable Diffusion v1 and v2 models run comfortably, while the Stable Diffusion XL models cause frequent swapping and feel heavy. You will want to pick a small model.
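If memory becomes a bottleneck, diffusers also provides memory-saving options such as attention slicing, which trades some speed for a lower peak memory footprint. A minimal sketch (not used in the experiments below):

```python
import torch
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5', torch_dtype=torch.float16
).to('mps')

# Compute attention in slices instead of all at once to reduce peak memory usage
pipeline.enable_attention_slicing()
```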
Setup
Installation
This article uses uv. If you do not have it installed, install it with the following command:
curl -LsSf https://astral.sh/uv/install.sh | sh
The Python runtime and the dependency packages can be installed in one go with:
uv sync
By default, a virtual environment is created under .venv/.
Setting up the huggingface CLI
Create a HuggingFace account and issue an access token at https://huggingface.co/settings/tokens. Then use the token to log in with huggingface-cli, which is already installed by uv sync.
huggingface-cli login
To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible):
The token you enter is saved to ~/.cache/huggingface/token.
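If you prefer to log in from a script or notebook rather than interactively, huggingface_hub also exposes a login() function. A minimal sketch (the token value here is a placeholder):

```python
from huggingface_hub import login

# Pass the access token directly instead of typing it at the interactive prompt
login(token='hf_xxxxxxxxxxxxxxxxxxxx')  # placeholder token
```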
Using PyTorch on Apple silicon
Apple silicon GPUs are driven through a technology called Metal Performance Shaders (mps). In PyTorch you can check whether mps is available like this:
>>> import torch
>>> print(torch.backends.mps.is_available())
True
To run computation on the mps device, simply call model.to('mps'), just as you would with cpu or cuda.
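For example, a minimal sketch that falls back to the CPU when mps is not available (the tiny Linear model here is only for illustration):

```python
import torch

# Prefer the Apple GPU, fall back to the CPU otherwise
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

model = torch.nn.Linear(4, 4).to(device)  # move the model to the selected device
x = torch.randn(1, 4, device=device)      # allocate the input on the same device
print(model(x).device)                    # e.g. mps:0
```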
Experiments
DDPM [code]
First, let's try DDPM as a basic diffusion model. The model used is google/ddpm-cat-256.
uv run src/scripts/ddpm.py
DDPMPipeline.from_pretrained('google/ddpm-cat-256') downloads and loads the model. By default, models are downloaded to ~/.cache/huggingface/hub. Trying out lots of models eats up storage quickly, so clean the cache periodically.
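As a rough guide, the cache can be inspected with huggingface-cli scan-cache (and cleaned with huggingface-cli delete-cache), or programmatically; a minimal sketch:

```python
from huggingface_hub import scan_cache_dir

# Scans ~/.cache/huggingface/hub by default
cache_info = scan_cache_dir()
print(f'total: {cache_info.size_on_disk / 1e9:.1f} GB')
for repo in cache_info.repos:
    print(repo.repo_id, repo.size_on_disk_str)
```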
The number of images generated per run is specified by batch_size. On a MacBook, keep it small or it will exhaust memory.
A plain DDPM implementation needs around 1000 inference steps (num_inference_steps). Since this is just a working check, we use 100 steps here.
import os
import torch
from diffusers.pipelines.ddpm.pipeline_ddpm import DDPMPipeline
from src.utils import ExperimentalContext, options
def inference(pipeline, context: ExperimentalContext, batch_size, num_inference_steps=1000):
# Inference
images = pipeline(
batch_size=batch_size,
generator=context.generator,
num_inference_steps=num_inference_steps,
).images
# Save the images
for i, image in enumerate(images):
context.save_image(image, 'uncond', f'i{i}_n{num_inference_steps}')
@options
def main(seed, device):
batch_size = 1
# Load the model
pipeline = DDPMPipeline.from_pretrained('google/ddpm-cat-256', torch_dtype=torch.float16).to(device)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'ddpm_cat'))
inference(pipeline=pipeline, context=context, batch_size=batch_size, num_inference_steps=100)
if __name__ == '__main__':
main()
Contents of src.utils
import os
import random
from functools import wraps
import click
import numpy as np
import torch
def fix_seed(seed=0):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)
class ExperimentalContext:
def __init__(self, seed, device, root_dir='out'):
self.seed = seed
self.device = device
torch.set_default_device(device)
self.generator = torch.Generator(device)
self.generator.manual_seed(seed)
self.root_dir = root_dir
os.makedirs(self.root_dir, exist_ok=True)
fix_seed(seed)
def save_image(self, image, exp_name, image_name):
out_dir = os.path.join(self.root_dir, exp_name)
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, f'seed{self.seed}_{image_name}.png')
image.save(out_path)
print(f'Image has been saved successfully to {out_path}')
def options(func):
@click.command()
@click.option('--seed', type=int, default=42, help='Random seed for reproducibility.')
@click.option('--device', type=str, default='mps', help='Device to run the computation on.')
@wraps(func)
def wrapper(*args, **kwargs):
return func(*args, **kwargs)
return wrapper
Here is the result. With DDPM at 100 steps, this is about what you can expect.
google/ddpm-cat-256 100steps unconditional
Stable Diffusion v1.5 [code]
Next, let's try Stable Diffusion v1.5. Note that Stable Diffusion models easily exceed several GB in size.
uv run src/scripts/sdv1_5_dpmsolver.py
Contents of the patch
# ruff: noqa
# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
from typing import List, Optional, Union
import numpy as np
import torch
from diffusers.configuration_utils import register_to_config
from diffusers.schedulers.scheduling_dpmsolver_multistep import (
DPMSolverMultistepScheduler,
betas_for_alpha_bar,
rescale_zero_terminal_snr,
)
from diffusers.utils.deprecation_utils import deprecate
from diffusers.utils.import_utils import is_scipy_available
class PachedDPMSolverMultistepScheduler(DPMSolverMultistepScheduler):
@register_to_config
def __init__(
self,
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: str = 'linear',
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
solver_order: int = 2,
prediction_type: str = 'epsilon',
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
sample_max_value: float = 1.0,
algorithm_type: str = 'dpmsolver++',
solver_type: str = 'midpoint',
lower_order_final: bool = True,
euler_at_final: bool = False,
use_karras_sigmas: Optional[bool] = False,
use_exponential_sigmas: Optional[bool] = False,
use_beta_sigmas: Optional[bool] = False,
use_lu_lambdas: Optional[bool] = False,
final_sigmas_type: Optional[str] = 'zero', # "zero", "sigma_min"
lambda_min_clipped: float = -float('inf'),
variance_type: Optional[str] = None,
timestep_spacing: str = 'linspace',
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
):
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError('Make sure to install scipy if you want to use beta sigmas.')
if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
raise ValueError(
'Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used.'
)
if algorithm_type in ['dpmsolver', 'sde-dpmsolver']:
deprecation_message = f'algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead'
deprecate('algorithm_types dpmsolver and sde-dpmsolver', '1.0.0', deprecation_message)
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
elif beta_schedule == 'linear':
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == 'scaled_linear':
# this schedule is very specific to the latent diffusion model.
self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
elif beta_schedule == 'squaredcos_cap_v2':
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
raise NotImplementedError(f'{beta_schedule} is not implemented for {self.__class__}')
if rescale_betas_zero_snr:
self.betas = rescale_zero_terminal_snr(self.betas)
self.alphas = 1.0 - self.betas
# PATCHED --------------------------------------------------------
# self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
# ----------------------------------------------------------------
self.alphas_cumprod = torch.cumprod(self.alphas, dim=0).cpu()
# ----------------------------------------------------------------
if rescale_betas_zero_snr:
# Close to 0 without being 0 so first sigma is not inf
# FP16 smallest positive subnormal works well here
self.alphas_cumprod[-1] = 2**-24
# Currently we only support VP-type noise schedule
self.alpha_t = torch.sqrt(self.alphas_cumprod)
self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0
# settings for DPM-Solver
if algorithm_type not in ['dpmsolver', 'dpmsolver++', 'sde-dpmsolver', 'sde-dpmsolver++']:
if algorithm_type == 'deis':
self.register_to_config(algorithm_type='dpmsolver++')
else:
raise NotImplementedError(f'{algorithm_type} is not implemented for {self.__class__}')
if solver_type not in ['midpoint', 'heun']:
if solver_type in ['logrho', 'bh1', 'bh2']:
self.register_to_config(solver_type='midpoint')
else:
raise NotImplementedError(f'{solver_type} is not implemented for {self.__class__}')
if algorithm_type not in ['dpmsolver++', 'sde-dpmsolver++'] and final_sigmas_type == 'zero':
raise ValueError(
f'`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead.'
)
# setable values
self.num_inference_steps = None
timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
self.timesteps = torch.from_numpy(timesteps)
self.model_outputs = [None] * solver_order
self.lower_order_nums = 0
self._step_index = None
self._begin_index = None
self.sigmas = self.sigmas.to('cpu') # to avoid too much CPU/GPU communication
def set_timesteps(
self,
num_inference_steps: int = None,
device: Union[str, torch.device] = None,
timesteps: Optional[List[int]] = None,
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
Args:
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
must be `None`, and `timestep_spacing` attribute will be ignored.
"""
if num_inference_steps is None and timesteps is None:
raise ValueError('Must pass exactly one of `num_inference_steps` or `timesteps`.')
if num_inference_steps is not None and timesteps is not None:
raise ValueError('Can only pass one of `num_inference_steps` or `custom_timesteps`.')
if timesteps is not None and self.config.use_karras_sigmas:
raise ValueError('Cannot use `timesteps` with `config.use_karras_sigmas = True`')
if timesteps is not None and self.config.use_lu_lambdas:
raise ValueError('Cannot use `timesteps` with `config.use_lu_lambdas = True`')
if timesteps is not None and self.config.use_exponential_sigmas:
raise ValueError('Cannot set `timesteps` with `config.use_exponential_sigmas = True`.')
if timesteps is not None and self.config.use_beta_sigmas:
raise ValueError('Cannot set `timesteps` with `config.use_beta_sigmas = True`.')
if timesteps is not None:
timesteps = np.array(timesteps).astype(np.int64)
else:
# Clipping the minimum of all lambda(t) for numerical stability.
# This is critical for cosine (squaredcos_cap_v2) noise schedule.
clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
# PATCHED --------------------------------------------------------
# last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
# ----------------------------------------------------------------
last_timestep = ((self.config.num_train_timesteps - clipped_idx).cpu().numpy()).item()
# ----------------------------------------------------------------
# "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
if self.config.timestep_spacing == 'linspace':
timesteps = (
np.linspace(0, last_timestep - 1, num_inference_steps + 1)
.round()[::-1][:-1]
.copy()
.astype(np.int64)
)
elif self.config.timestep_spacing == 'leading':
step_ratio = last_timestep // (num_inference_steps + 1)
# creates integer timesteps by multiplying by ratio
# casting to int to avoid issues when num_inference_step is power of 3
timesteps = (
(np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
)
timesteps += self.config.steps_offset
elif self.config.timestep_spacing == 'trailing':
step_ratio = self.config.num_train_timesteps / num_inference_steps
# creates integer timesteps by multiplying by ratio
# casting to int to avoid issues when num_inference_step is power of 3
timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
timesteps -= 1
else:
raise ValueError(
f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
)
sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
log_sigmas = np.log(sigmas)
if self.config.use_karras_sigmas:
sigmas = np.flip(sigmas).copy()
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
elif self.config.use_lu_lambdas:
lambdas = np.flip(log_sigmas.copy())
lambdas = self._convert_to_lu(in_lambdas=lambdas, num_inference_steps=num_inference_steps)
sigmas = np.exp(lambdas)
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
elif self.config.use_exponential_sigmas:
sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
elif self.config.use_beta_sigmas:
sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
else:
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
if self.config.final_sigmas_type == 'sigma_min':
sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
elif self.config.final_sigmas_type == 'zero':
sigma_last = 0
else:
raise ValueError(
f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
)
sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
self.sigmas = torch.from_numpy(sigmas)
self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
self.num_inference_steps = len(timesteps)
self.model_outputs = [
None,
] * self.config.solver_order
self.lower_order_nums = 0
# add an index counter for schedulers that allow duplicated timesteps
self._step_index = None
self._begin_index = None
self.sigmas = self.sigmas.to('cpu') # to avoid too much CPU/GPU communication
from diffusers.schedulers import scheduling_dpmsolver_multistep
scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler = PachedDPMSolverMultistepScheduler
Sampling uses DPM-Solver, for which around 25 steps is sufficient.
guidance_scale specifies the strength of classifier-free guidance (roughly speaking, how strongly the prompt is reflected). The appropriate guidance_scale differs from sampler to sampler; if it is set too high, the amount of noise removed per denoising step becomes too large and generation collapses, so it needs tuning.
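For reference, here is a minimal sketch of how classifier-free guidance combines the unconditional and text-conditioned noise predictions inside the pipeline (the function and toy tensors are illustrative, not part of the scripts in this article):

```python
import torch

def classifier_free_guidance(eps_uncond: torch.Tensor, eps_text: torch.Tensor, scale: float) -> torch.Tensor:
    # scale = 1 reproduces the text-conditioned prediction; larger values
    # amplify the difference from the unconditional prediction
    return eps_uncond + scale * (eps_text - eps_uncond)

# Toy tensors standing in for the U-Net's two noise predictions
eps_uncond = torch.zeros(1, 4, 64, 64)
eps_text = torch.ones(1, 4, 64, 64)
print(classifier_free_guidance(eps_uncond, eps_text, scale=2.0).mean())  # tensor(2.)
```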
The prompts describe cute cats. For Stable Diffusion models, len(prompts) corresponds to batch_size.
prompts = [
'a cat, fat, with brown fur, with short legs',
'a cat, fat, with white fur, with short legs',
]
Implementation
import src.patches.scheduling_dpmsolver_multistep # noqa
import os
from typing import List
import torch
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline
from diffusers.schedulers.scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
from src.utils import ExperimentalContext, options
def inference(pipeline, context: ExperimentalContext, prompts: List[str], guidance_scale=0.0, num_inference_steps=50):
negative_prompts = ['low quality, bad quality' for _ in range(len(prompts))]
# Inference
images = pipeline(
prompts,
negative_prompt=negative_prompts,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=context.generator,
).images
# Save the images
for i, image in enumerate(images):
context.save_image(image, prompts[i].replace(' ', '_'), f'n{num_inference_steps}_s{guidance_scale}')
@options
def main(seed, device):
prompts = [
'a cat, fat, with brown fur, with short legs',
'a cat, fat, with white fur, with short legs',
]
# Load the model
pipeline = StableDiffusionPipeline.from_pretrained(
'runwayml/stable-diffusion-v1-5',
torch_dtype=torch.float16,
).to(device)
# Load the scheduler
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdv1_5_dpmsolver'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=25, guidance_scale=0.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdv1_5_dpmsolver'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=25, guidance_scale=2.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdv1_5_dpmsolver'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=25, guidance_scale=4.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdv1_5_dpmsolver'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=25, guidance_scale=8.0)
if __name__ == '__main__':
main()
Here are the generated images. Some of them got flagged as NSFW, but dealing with that is a hassle, so I leave them as they are. guidance_scale is 0.0, 2.0, 4.0, and 8.0 from left to right.
Stable Diffusion XL Turbo [code]
Let's also try Stable Diffusion XL Turbo. The Stable Diffusion XL models are much larger than the v1 family, so depending on your environment the machine may freeze.
uv run src/scripts/sdxl_turbo.py
Stable Diffusion XL Turbo is Stable Diffusion XL distilled via adversarial diffusion distillation, and it can generate high-quality images in just a few steps.
Implementation
import src.patches.scheduling_euler_ancestral_discrete # noqa
import os
from typing import List
import torch
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import StableDiffusionXLPipeline
from src.utils import ExperimentalContext, options
def inference(pipeline, context: ExperimentalContext, prompts: List[str], guidance_scale=0.0, num_inference_steps=1):
negative_prompt = 'low quality, bad quality'
# Inference
images = pipeline(
prompts,
negative_prompt=negative_prompt,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=context.generator,
).images
# Save the images
for i, image in enumerate(images):
context.save_image(image, prompts[i].replace(' ', '_'), f'i{i}_n{num_inference_steps}_s{guidance_scale}')
@options
def main(seed, device):
prompts = [
'a cat, fat, with brown fur, with short legs',
'a cat, fat, with white fur, with short legs',
]
# Load the model
pipeline = StableDiffusionXLPipeline.from_pretrained(
'stabilityai/sdxl-turbo', torch_dtype=torch.float16, variant='fp16'
).to(device)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=1, guidance_scale=0.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=2, guidance_scale=0.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=4, guidance_scale=0.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=8, guidance_scale=0.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=4, guidance_scale=2.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=4, guidance_scale=4.0)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdxl_turbo'))
inference(pipeline=pipeline, context=context, prompts=prompts, num_inference_steps=4, guidance_scale=8.0)
if __name__ == '__main__':
main()
Here are the generated images, using the same prompts as before. With Stable Diffusion XL Turbo, raising guidance_scale quickly collapses generation; setting num_inference_steps to around 2-4 seems best.
num_inference_steps is 4, with guidance_scale 0.0, 2.0, 4.0, and 8.0 from left to right.
num_inference_steps is 1, 2, 4, and 8 from left to right, with guidance_scale 0.0.
Stable Diffusion v2 (inpainting) [code]
Besides image generation, inpainting is another commonly used task. Here we try Stable Diffusion v2 (inpainting) on the sample from the official tutorial.
uv run src/scripts/sdv2_inpaint.py
Implementation
import src.patches.scheduling_euler_ancestral_discrete # noqa
import os
import torch
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
from diffusers.utils.loading_utils import load_image
from diffusers.utils.pil_utils import make_image_grid
from src.utils import ExperimentalContext, options
def inference(pipeline_inpaint, context: ExperimentalContext, prompt: str, guidance_scale=0.0, num_inference_steps=2):
# Load the source image and the mask image
# https://huggingface.co/docs/diffusers/ja/tutorials/autopipeline
img_url = 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png'
mask_url = 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png'
init_image = load_image(img_url).convert('RGB')
mask_image = load_image(mask_url).convert('RGB')
negative_prompt = 'low quality, bad quality'
# Inference
image = pipeline_inpaint(
prompt,
negative_prompt=negative_prompt,
image=init_image,
mask_image=mask_image,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=context.generator,
).images[0]
# Save the images
filename = f'n{num_inference_steps}_s{guidance_scale}'
image_compare = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
context.save_image(image, prompt.replace(' ', '_'), filename)
context.save_image(image_compare, prompt.replace(' ', '_'), f'{filename}_comp')
@options
def main(seed, device):
prompt = 'a bench'
# prompt = 'a cat, sitting on a bench'
# Load the inpainting model
# https://huggingface.co/stabilityai/stable-diffusion-2-inpainting
pipeline_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
'stabilityai/stable-diffusion-2-inpainting', torch_dtype=torch.float16, variant='fp16'
).to(device)
context = ExperimentalContext(seed=seed, device=device, root_dir=os.path.join('out', 'sdv2_inpaint_sample'))
inference(
pipeline_inpaint=pipeline_inpaint, context=context, prompt=prompt, num_inference_steps=50, guidance_scale=3.0
)
if __name__ == '__main__':
main()
Here are the generated images, with guidance_scale set to 3.0 and 50 inference steps. With an appropriately chosen prompt, inpainting seems to work well.
Prompt: a bench
Prompt: a cat, sitting on a bench
That's all. Although not shown in this article, I have experimented with other models as well, so take a look at the repository if you are interested.