
A rough attempt at converting Stable-Diffusion

Clone the CompVis stable-diffusion repository and pin it to a known commit:
git clone https://github.com/CompVis/stable-diffusion.git
cd stable-diffusion
git checkout 69ae4b35e0a0f6ee1af8bb9a5d0016ccb27e36dc
Launch the openvino2tensorflow container with GPU access and X11 forwarding:
xhost +local: && \
  docker run --gpus all -it --rm \
  -v `pwd`:/home/user/workdir \
  -v /tmp/.X11-unix/:/tmp/.X11-unix:rw \
  --device /dev/video0:/dev/video0:mwr \
  --net=host \
  -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
  -e DISPLAY=$DISPLAY \
  --privileged \
  ghcr.io/pinto0309/openvino2tensorflow:latest
Inside the container, install the dependencies and the repository itself:
pip install \
transformers==4.19.2 \
diffusers==0.2.4 \
invisible-watermark==0.1.5 \
omegaconf==2.2.3 \
einops==0.4.1 \
pytorch_lightning==1.7.4 \
taming-transformers-rom1504==0.0.6 \
clip==0.2.0 \
kornia==0.6.7

pip install -e .
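
As a quick sanity check (my addition, not part of the original log), it is worth confirming that the editable install made the ldm package importable before committing the container:

# Sanity check, assuming it is run inside the container after pip install -e .
import torch
import ldm  # package provided by the stable-diffusion repo via pip install -e .
from transformers import CLIPTokenizer  # pulled in by transformers==4.19.2

print(torch.__version__, torch.cuda.is_available())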
Commit the prepared container to an image so this setup can be reused later:
docker ps -a

CONTAINER ID IMAGE                                        COMMAND CREATED       STATUS       PORTS NAMES
97ea63046add ghcr.io/pinto0309/openvino2tensorflow:latest "bash"  2 minutes ago Up 2 minutes       charming_chaum

docker commit charming_chaum pinto0309/stablediffusion_export
docker push pinto0309/stablediffusion_export:latest
Run a first inference test:
python scripts/txt2img.py --prompt "a photograph of an cat into a bottle" --plms

RuntimeError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 0; 7.77 GiB total capacity; 5.62 GiB already allocated; 418.62 MiB free; 5.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
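
The error message itself points at one mitigation: setting max_split_size_mb through PYTORCH_CUDA_ALLOC_CONF to reduce allocator fragmentation. A minimal sketch of that route (the 128 MB value is an untested guess):

# Sketch: the env var must be set before torch initializes CUDA.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"  # untested guess

import torch  # import after setting the variable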
Here, though, the simpler route is to give up on the GPU entirely. Exit the container and relaunch the committed image without --gpus all so everything runs on the CPU:
exit

xhost +local: && \
  docker run -it --rm \
  -v `pwd`:/home/user/workdir \
  -v /tmp/.X11-unix/:/tmp/.X11-unix:rw \
  --device /dev/video0:/dev/video0:mwr \
  --net=host \
  -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
  -e DISPLAY=$DISPLAY \
  --privileged \
  pinto0309/stablediffusion_export:latest
Next, patch the code so nothing is forced onto CUDA.

scripts/txt2img.py (original):
def load_model_from_config(config, ckpt, verbose=False):
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    return model
scripts/txt2img.py (patched: model.cuda() becomes model.cpu()):
def load_model_from_config(config, ckpt, verbose=False):
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cpu()
    model.eval()
    return model
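
To confirm the patch actually took, a hypothetical one-off check that every parameter now lives on the CPU:

# Hypothetical check, run right after model = load_model_from_config(config, ckpt)
devices = {p.device.type for p in model.parameters()}
assert devices == {"cpu"}, f"unexpected devices: {devices}"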
ldm/modules/encoders/modules.py (original):
class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length
        self.freeze()
ldm/modules/encoders/modules.py (patched: device defaults to "cpu"):
class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
    def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length
        self.freeze()
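
With the default switched to "cpu", the text encoder can be smoke-tested in isolation. A sketch (the 1x77x768 output shape assumes the ViT-L/14 text encoder with max_length=77):

# Sketch: run the frozen CLIP text encoder on CPU only.
from ldm.modules.encoders.modules import FrozenCLIPEmbedder

embedder = FrozenCLIPEmbedder(device="cpu")
z = embedder.encode(["a photograph of a cat"])
print(z.shape)  # expected: torch.Size([1, 77, 768])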
ldm/models/diffusion/plms.py (original):
class PLMSSampler(object):
    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
            if attr.device != torch.device("cuda"):
                attr = attr.to(torch.device("cuda"))
        setattr(self, name, attr)
ldm/models/diffusion/plms.py (patched: buffers are no longer moved to CUDA):
class PLMSSampler(object):
    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
            if attr.device != torch.device("cuda"):
                pass
        setattr(self, name, attr)
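
After this patch the device check is dead code: register_buffer reduces to a plain setattr, so each buffer keeps whatever device its tensor already occupies. An equivalent, hypothetical simplification:

# Equivalent simplification (hypothetical): the patched branch does nothing,
# so the buffer simply keeps the incoming tensor's device.
def register_buffer(self, name, attr):
    setattr(self, name, attr)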
  • Inference test, now on CPU:
python scripts/txt2img.py --prompt "a photograph of an cat into a bottle" --plms
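
If this run completes, the images should appear under outputs/txt2img-samples/ (the script's default --outdir); expect CPU-only inference to be far slower than on a GPU.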