🤖
[Docker Environment] Let's Do Video Object Segmentation with SAM2
The Impact of SAM2
This is amazing!
However, the official video demo code is hard to follow, so this article walks through everything you need to get it running.
1. Docker Environment
Dockerfile
ARG PYTORCH="2.3.1"
ARG CUDA="12.1"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ENV DEBIAN_FRONTEND noninteractive
ENV TZ="Asia/Tokyo"
RUN rm -f /etc/apt/sources.list.d/cuda.list \
&& apt-get update && apt-get install -y --no-install-recommends wget \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& rm -f cuda-keyring_1.0-1_all.deb \
&& apt-get purge --autoremove -y wget \
&& rm -rf /var/lib/apt/lists/
RUN apt-get update \
&& apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN pip install ipdb
# Python dependencies for SAM2 (quoted so the shell does not treat ">=" as a redirection)
RUN pip install "numpy>=1.24.4" "tqdm>=4.66.1" "hydra-core>=1.3.2" "iopath>=0.1.10" "pillow>=9.4.0" "matplotlib>=3.9.1" "jupyter>=1.0.0" "opencv-python>=4.7.0"
docker-compose
services:
  work:
    build: .
    runtime: nvidia
    stdin_open: true
    tty: true
    ipc: host
    volumes:
      - $PWD:/work
    working_dir: /work
    environment:
      - PYTHONPATH=/work
      - NVIDIA_VISIBLE_DEVICES=0
    ulimits:
      memlock: -1
      stack: 67108864
    command: /bin/bash -c "python setup.py build_ext --inplace && /bin/bash"
The command on the last line is a bit of an incantation: it builds SAM2's CUDA extension in place when the container starts. For details, see the article linked below ↓
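As a separate quick check that the container itself is healthy, here is a minimal sketch of my own (not part of the SAM2 repo) to run inside the container; it assumes the SAM2 repository is mounted at /work as in the compose file above.
# sanity_check.py -- a minimal sketch (not from the SAM2 repo); run inside the container
import torch

# Confirm the GPU passed through by the compose file is visible.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

# Importing sam2 confirms that PYTHONPATH=/work picks up the repository.
from sam2.build_sam import build_sam2_video_predictor  # noqa: F401
print("sam2 is importable")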
2. Preprocessing the Video
SAM2's video predictor does not take a video file such as an mp4; it needs sequentially numbered JPEG frames stored in a single folder. Use ffmpeg.
ffmpeg -i input.mp4 -vf "scale=iw:ih" -q:v 2 video_dir/%05d.jpg
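If you prefer to stay in Python for this step, the sketch below (my own; input.mp4 and video_dir match the ffmpeg command above) writes the same kind of zero-padded frame sequence with OpenCV.
# extract_frames.py -- an OpenCV alternative to the ffmpeg command above (a sketch, not from the SAM2 repo)
import os
import cv2

os.makedirs("video_dir", exist_ok=True)
cap = cv2.VideoCapture("input.mp4")
frame_idx = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    # Zero-padded file names keep the frames in temporal order when sorted.
    cv2.imwrite(f"video_dir/{frame_idx:05d}.jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
    frame_idx += 1
cap.release()
print(f"wrote {frame_idx} frames to video_dir/")
Note that this numbers frames from 00000 while the ffmpeg command starts at 00001; the predictor indexes frames by their sorted position, so ann_frame_idx in the script later refers to that position either way.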
3. Download the Weights
cd checkpoints
./download_ckpts.sh
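Before moving on, a tiny check of my own (a sketch; the file name matches the checkpoint used in the script below) confirms the download actually landed in checkpoints/.
# check_ckpt.py -- a sketch, run from the repository root
import os

ckpt = "./checkpoints/sam2_hiera_large.pt"
assert os.path.isfile(ckpt), f"checkpoint not found: {ckpt}"
print(ckpt, f"{os.path.getsize(ckpt) / 1e6:.1f} MB")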
4. Run!
Inference is done with the following Python script.
import os
from dataclasses import dataclass
from glob import glob
import cv2
import numpy as np
import torch
from sam2.build_sam import build_sam2_video_predictor
@dataclass
class SAM2Prompt:
    ann_obj_id: int  # id of the object this prompt belongs to
    ann_frame_idx: int  # index of the frame the prompt is placed on
    point_coords: np.ndarray  # (N, 2) array of (x, y) click coordinates
    point_labels: np.ndarray  # (N,) array of labels: 1 = positive click, 0 = negative click
checkpoint = "./checkpoints/sam2_hiera_large.pt"
model_cfg = "sam2_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, checkpoint)
video_dir = "video_dir"
prompts: list[SAM2Prompt] = [
    SAM2Prompt(ann_obj_id=1, ann_frame_idx=10, point_coords=np.array([[200, 300]]), point_labels=np.array([1])),
    # SAM2Prompt(ann_obj_id=2, ann_frame_idx=10, point_coords=np.array([[200, 400]]), point_labels=np.array([1])),
    # ...
]
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
    state = predictor.init_state(video_path=video_dir)

    # register all point prompts before propagating
    for prompt in prompts:
        _frame_idx, _object_ids, _masks = predictor.add_new_points(
            inference_state=state,
            frame_idx=prompt.ann_frame_idx,
            obj_id=prompt.ann_obj_id,
            points=prompt.point_coords,
            labels=prompt.point_labels,
        )

    video_segments = {}  # video_segments contains the per-frame segmentation results
    # propagate forward from the annotated frame to the end of the video
    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(state):
        video_segments[out_frame_idx] = {
            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
            for i, out_obj_id in enumerate(out_obj_ids)
        }
    # propagate backward to cover the frames before the annotated frame
    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(state, reverse=True):
        video_segments[out_frame_idx] = {
            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
            for i, out_obj_id in enumerate(out_obj_ids)
        }
# visualize the segmentation results
COLORS = {
    1: (255, 0, 0),
    2: (0, 255, 0),
    3: (0, 0, 255),
    4: (255, 255, 0),
    5: (255, 0, 255),
}
images = glob("video_dir/*.jpg")
images = sorted(images)
os.makedirs("demo", exist_ok=True)
for frame_id, img_path in enumerate(images):
    img = cv2.imread(img_path)
    demo_img = img.copy()
    for obj_id, mask in video_segments[frame_id].items():
        mask = mask[0]  # (1, H, W) -> (H, W)
        color = COLORS[obj_id]
        # overlay the mask in the object's color
        colored_mask = np.zeros_like(img)
        colored_mask[mask] = np.array(color)
        demo_img = cv2.addWeighted(demo_img, 1, colored_mask, 0.8, 0)
        # draw the bounding box and the object id
        rect = cv2.boundingRect(mask.astype(np.uint8))
        cv2.rectangle(demo_img, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), color, 2)
        cv2.putText(demo_img, f"{obj_id}", (rect[0] - 20, rect[1] - 20), cv2.FONT_HERSHEY_SIMPLEX, 2, color, 2)
    cv2.imwrite(f"demo/{os.path.basename(img_path)}", demo_img)
The part that is easy to miss on a first read is the one below: because the prompt here is placed on frame 10, propagating only forward would leave the earlier frames without masks, so propagate_in_video is called a second time with reverse=True to fill in the frames before the annotated frame.
for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(state, reverse=True):
    video_segments[out_frame_idx] = {
        out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
        for i, out_obj_id in enumerate(out_obj_ids)
    }
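If you also want to keep the raw masks for later processing rather than only baking them into overlay images, a small sketch like the one below (my own addition, assuming the video_segments dict built in the script above) saves them as compressed NumPy archives, one per frame.
# a sketch, appended after the script above; assumes `video_segments` is still in scope
import os
import numpy as np

os.makedirs("masks", exist_ok=True)
for frame_idx, masks_per_obj in video_segments.items():
    # one .npz per frame, with one boolean (H, W) array per object id
    np.savez_compressed(
        f"masks/{frame_idx:05d}.npz",
        **{f"obj_{obj_id}": mask[0] for obj_id, mask in masks_per_obj.items()},
    )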
5. Convert the Results Back into a Video
Stitch the numbered frames back into a video.
ffmpeg -framerate 30 -i demo/%05d.jpg -c:v mpeg4 output.mp4
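If you would rather not shell out to ffmpeg for this step either, a minimal cv2.VideoWriter sketch (my own; it assumes the demo/ frames written above and 30 fps) does the same job.
# frames_to_video.py -- an OpenCV alternative to the ffmpeg command above (a sketch)
from glob import glob
import cv2

frames = sorted(glob("demo/*.jpg"))
first = cv2.imread(frames[0])
h, w = first.shape[:2]
writer = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30, (w, h))
for path in frames:
    writer.write(cv2.imread(path))
writer.release()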
Wrapping Up
Now you're all set. Let's VOS!