
Attempting ONNX export of the Detectron2-based BodyHands model


https://github.com/cvlab-stonybrook/BodyHands

git clone https://github.com/cvlab-stonybrook/BodyHands.git
cd BodyHands
Dockerfile
FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04

ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y \
    ca-certificates \
    python3-dev \
    git \
    wget \
    sudo \
    ninja-build \
    python-is-python3 \
    python3-pip \
    libgl1-mesa-dev \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext-dev \
    nano \
    && sed -i 's/# set linenumbers/set linenumbers/g' /etc/nanorc \
    && apt clean \
    && rm -rf /var/lib/apt/lists/*

# create a non-root user
ENV USERNAME=user
RUN echo "root:root" | chpasswd \
    && adduser --disabled-password --gecos "" "${USERNAME}" \
    && echo "${USERNAME}:${USERNAME}" | chpasswd \
    && echo "%${USERNAME}    ALL=(ALL)   NOPASSWD:    ALL" >> /etc/sudoers.d/${USERNAME} \
    && chmod 0440 /etc/sudoers.d/${USERNAME} \
    && mkdir -p /home/${USERNAME}
USER ${USERNAME}
# RUN echo HOME: ${HOME}
# RUN echo PWD: `pwd`
ENV HOME=/home/${USERNAME}
WORKDIR ${HOME}

ENV PATH="${HOME}/.local/bin:${PATH}"

# install dependencies
# See https://pytorch.org/ for other options if you use a different version of CUDA
RUN pip install --user pip -U
RUN pip install --user tensorboard cmake onnx   # cmake from apt-get is too old
RUN pip install --user torch==1.10 torchvision==0.11.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html
RUN pip install opencv-python==4.1.2.30 scipy scikit-image

RUN pip install --user 'git+https://github.com/facebookresearch/fvcore'
# install detectron2
RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo
# set FORCE_CUDA because during `docker build` cuda is not accessible
ENV FORCE_CUDA="1"
# This will by default build detectron2 for all common cuda architectures and take a lot more time,
# because inside `docker build`, there is no way to tell which architecture will be used.
ARG TORCH_CUDA_ARCH_LIST="Ampere"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"

RUN pip install --user -e detectron2_repo

# Set a fixed model cache directory.
ENV FVCORE_CACHE="/tmp"
WORKDIR ${HOME}/detectron2_repo

RUN pip install --user torch==1.13.1 torchvision==0.14.1 --extra-index-url https://download.pytorch.org/whl/cu116

RUN echo "export QT_X11_NO_MITSHM=1" >> ${HOME}/.bashrc \
    && echo "sudo chmod 777 /dev/video*" >> ${HOME}/.bashrc
docker build -t pinto0309/bodyhands:latest .
xhost +local: && \
docker run -it --rm --gpus all \
-v $PWD:/home/user/detectron2_repo/BodyHands \
-v /tmp/.X11-unix/:/tmp/.X11-unix:rw \
--device /dev/video0:/dev/video0:mwr \
--net=host \
-e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
-e DISPLAY=$DISPLAY \
--privileged \
pinto0309/bodyhands:latest
demo_cam.py
import argparse
import os
import cv2
import torch
from detectron2.data import MetadataCatalog
from detectron2.modeling import build_model
from detectron2.config import get_cfg
import detectron2.data.transforms as T
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import MetadataCatalog
from detectron2.modeling import build_model
from bodyhands import *
from datasets import *
from bodyhands import add_bodyhands_config
from bodyhands import CustomVisualizer
import copy

class CustomPredictor:

    def __init__(self, cfg):
        self.cfg = cfg.clone()
        self.model = build_model(self.cfg)
        self.model.eval()
        self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])

        checkpointer = DetectionCheckpointer(self.model)
        checkpointer.load(cfg.MODEL.WEIGHTS)

        self.input_format = cfg.INPUT.FORMAT
        assert self.input_format in ["RGB", "BGR"], self.input_format

    def __call__(self, original_image):
        with torch.no_grad():
            if self.input_format == "RGB":
                original_image = original_image[:, :, ::-1]
            height, width = original_image.shape[:2]
            image = torch.as_tensor(original_image.astype("float32").transpose(2, 0, 1))
            inputs = {"image": image, "height": height, "width": width}
            predictions = self.model([inputs], height, width)[0]
            return predictions

def prepareModel(cfg_file, weights, thresh):
    cfg = get_cfg()
    add_bodyhands_config(cfg)
    cfg.merge_from_file(cfg_file)
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = thresh
    cfg.MODEL.WEIGHTS = os.path.abspath(weights)
    predictor = CustomPredictor(cfg)
    return predictor

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Arguments for evaluation')
    parser.add_argument(
        '--thresh',
        required=False,
        metavar='threshold for hand detections', \
        help='hand detection score threshold',
        default=0.7,
    )

    args = parser.parse_args()
    out_path = os.path.abspath('./demoOutput/')
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    roi_score_thresh = float(args.thresh)
    model = prepareModel('./configs/BodyHands.yaml', './models/model.pth', roi_score_thresh)


    cap_device = 0
    cap = cv2.VideoCapture(cap_device)
    cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    cap_fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc('m','p','4','v')
    video_writer = cv2.VideoWriter(
        filename='output.mp4',
        fourcc=fourcc,
        fps=15,
        frameSize=(cap_width, cap_height),
    )


    while True:
        ret, im = cap.read()
        if not ret:
            break
        debug_image = copy.deepcopy(im)
        debug_image = cv2.resize(debug_image, (cap_width, cap_height))
        outputs = model(debug_image)

        v = CustomVisualizer(debug_image[:, :, ::-1], MetadataCatalog.get("HandBodyContactHands_sub"), scale=1.0)
        #######################################################################################
        outputs = outputs["instances"].to("cpu")
        classes = outputs.pred_classes
        body_ids = outputs.pred_body_ids
        boxes = outputs.pred_boxes.tensor
        masks = outputs.pred_masks
        hand_indices = classes == 0
        body_indices = classes == 1
        hand_boxes = boxes[hand_indices]
        hand_masks = masks[hand_indices]
        hand_body_ids = body_ids[hand_indices]
        body_boxes = boxes[body_indices]
        body_body_ids = body_ids[body_indices]
        num_hands, num_bodies = hand_boxes.shape[0], body_boxes.shape[0]
        body_masks = []
        for body_no in range(num_bodies):
            box = body_boxes[body_no].view(-1).cpu().numpy()
            xmin, ymin, xmax, ymax = box
            body_poly = [[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin)]]
            body_masks.append(body_poly)
        ########################################################################################
        v = v.modified_draw_instance_predictions(hand_boxes, hand_masks, hand_body_ids, body_boxes, body_masks, body_body_ids)
        out = v.get_image()[:, :, ::-1]
        print(out.shape)

        video_writer.write(out)
        cv2.imshow(f'BodyHands', out)
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break

    if video_writer:
        video_writer.release()
    if cap:
        cap.release()
    cv2.destroyAllWindows()
.devcontainer/devcontainer.json
{
    "name": "Python 3",
    "image": "pinto0309/bodyhands:latest",

    // Configure tool-specific properties.
    "customizations": {
        // Configure properties specific to VS Code.
        "vscode": {
            // Set *default* container specific settings.json values on container create.
            "settings": {
                "python.defaultInterpreterPath": "/usr/local/bin/python",
                "python.linting.enabled": true,
                "python.linting.pylintEnabled": true,
                "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
                "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
                "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
                "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
                "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
                "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
                "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
                "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
                "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
            },

            // Add the IDs of extensions you want installed when the container is created.
            "extensions": [
                "ms-python.python",
                "ms-python.vscode-pylance"
            ]
        }
    },

    "remoteUser": "appuser",
    "features": {
        "github-cli": "latest"
    },

    "runArgs": [
        "--gpus", "all",
        "--shm-size", "64gb",
        "--device", "/dev/video0:/dev/video0:mwr",
        "-v", "${localWorkspaceFolder}:/home/appuser/detectron2_repo/BodyHands",
        "--privileged"
    ]
}

Export logic (the demo code, slightly modified). A rough onnxruntime check of the exported file is sketched after the listing.

demo_cam.py
import argparse
import os
import cv2
import torch
from detectron2.data import MetadataCatalog
from detectron2.modeling import build_model
from detectron2.config import get_cfg
import detectron2.data.transforms as T
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import MetadataCatalog
from detectron2.modeling import build_model
from bodyhands import *
from datasets import *
from bodyhands import add_bodyhands_config
from bodyhands import CustomVisualizer
import copy

class CustomPredictor:

    def __init__(self, cfg):
        self.cfg = cfg.clone()
        self.model = build_model(self.cfg)
        self.model.eval()
        self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])

        checkpointer = DetectionCheckpointer(self.model)
        checkpointer.load(cfg.MODEL.WEIGHTS)
        print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ cfg.MODEL.WEIGHTS:{cfg.MODEL.WEIGHTS}')
        self.input_format = cfg.INPUT.FORMAT
        print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ cfg.INPUT.FORMAT:{cfg.INPUT.FORMAT}')

        # print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ checkpointer.model:{checkpointer.model}')
        # print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ self.model:{self.model}')
        # print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ self.model.__call__:{self.model.__call__}')

        def inference_func(model, image):
            inputs = [{"image": image, "height": image.shape[1], "width": image.shape[2]}]
            # inputs = [{"image": image, "height": H, "width": W}]
            # return model.inference(inputs, height=480, width=640, do_postprocess=True)[0]["instances"]
            # return model.inference(inputs, height=480, width=640, do_postprocess=False)[0]["instances"]
            return model.inference(inputs, height=480, width=640, do_postprocess=False)

        import onnx  # isort:skip
        import io
        from detectron2.export.flatten import TracingAdapter

        H=480
        W=640
        # inputs = {"image": x, "height": H, "width": W}
        # inputs = {"image": x}
        from torchvision.io import read_image
        inputs = read_image(path='teaser.jpeg')
        import torchvision.transforms.functional as F
        inputs = F.resize(img=inputs, size=(H, W))

        f = io.BytesIO()
        adapter_model = TracingAdapter(self.model, inputs, inference_func)
        adapter_model.eval()
        with torch.no_grad():
            try:
                torch.onnx.enable_log()
            except AttributeError:
                # Older ONNX versions do not have this API
                pass
            torch.onnx.export(
                adapter_model,
                adapter_model.flattened_inputs,
                f,
                training=torch.onnx.TrainingMode.EVAL,
                opset_version=11,
                verbose=True,
            )
        onnx_model = onnx.load_from_string(f.getvalue())
        assert onnx_model is not None
        onnx.save(onnx_model, f'bodyhands_{H}x{W}.onnx')
        import sys
        sys.exit(0)

        assert self.input_format in ["RGB", "BGR"], self.input_format

    def __call__(self, original_image):
        with torch.no_grad():
            if self.input_format == "RGB":
                original_image = original_image[:, :, ::-1]
            height, width = original_image.shape[:2]
            image = torch.as_tensor(original_image.astype("float32").transpose(2, 0, 1))
            inputs = {"image": image, "height": height, "width": width}
            predictions = self.model([inputs], height, width)[0]
            return predictions

def prepareModel(cfg_file, weights, thresh):
    cfg = get_cfg()
    add_bodyhands_config(cfg)
    cfg.merge_from_file(cfg_file)
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = thresh
    cfg.MODEL.WEIGHTS = os.path.abspath(weights)
    predictor = CustomPredictor(cfg)
    return predictor

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Arguments for evaluation')
    parser.add_argument(
        '--thresh',
        required=False,
        metavar='threshold for hand detections', \
        help='hand detection score threshold',
        default=0.7,
    )

    args = parser.parse_args()
    out_path = os.path.abspath('./demoOutput/')
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    roi_score_thresh = float(args.thresh)
    model = prepareModel('./configs/BodyHands.yaml', './models/model.pth', roi_score_thresh)


    cap_device = 0
    cap = cv2.VideoCapture(cap_device)
    cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    cap_fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc('m','p','4','v')
    video_writer = cv2.VideoWriter(
        filename='output.mp4',
        fourcc=fourcc,
        fps=15,
        frameSize=(cap_width, cap_height),
    )


    while True:
        ret, im = cap.read()
        if not ret:
            break
        debug_image = copy.deepcopy(im)
        debug_image = cv2.resize(debug_image, (cap_width, cap_height))
        outputs = model(debug_image)

        v = CustomVisualizer(debug_image[:, :, ::-1], MetadataCatalog.get("HandBodyContactHands_sub"), scale=1.0)
        #######################################################################################
        outputs = outputs["instances"].to("cpu")
        classes = outputs.pred_classes
        body_ids = outputs.pred_body_ids
        boxes = outputs.pred_boxes.tensor
        masks = outputs.pred_masks
        hand_indices = classes == 0
        body_indices = classes == 1
        hand_boxes = boxes[hand_indices]
        hand_masks = masks[hand_indices]
        hand_body_ids = body_ids[hand_indices]
        body_boxes = boxes[body_indices]
        body_body_ids = body_ids[body_indices]
        num_hands, num_bodies = hand_boxes.shape[0], body_boxes.shape[0]
        body_masks = []
        for body_no in range(num_bodies):
            box = body_boxes[body_no].view(-1).cpu().numpy()
            xmin, ymin, xmax, ymax = box
            body_poly = [[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin)]]
            body_masks.append(body_poly)
        ########################################################################################
        v = v.modified_draw_instance_predictions(hand_boxes, hand_masks, hand_body_ids, body_boxes, body_masks, body_body_ids)
        out = v.get_image()[:, :, ::-1]
        print(out.shape)

        video_writer.write(out)
        cv2.imshow(f'BodyHands', out)
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break

    if video_writer:
        video_writer.release()
    if cap:
        cap.release()
    cv2.destroyAllWindows()
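
Assuming the export succeeds and bodyhands_480x640.onnx is written, a rough onnxruntime smoke test could look like the following. The number, order, and names of the flattened outputs depend on how TracingAdapter flattens the Instances fields, so the output handling below is only an assumption to be checked against session.get_outputs().

import cv2
import numpy as np
import onnxruntime as ort

# Hypothetical smoke test for the exported model; output names/ordering are assumptions.
session = ort.InferenceSession(
    'bodyhands_480x640.onnx',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
)
input_name = session.get_inputs()[0].name
print([o.name for o in session.get_outputs()])   # inspect what TracingAdapter actually produced

image = cv2.imread('teaser.jpeg')                 # BGR; adjust if cfg.INPUT.FORMAT is RGB
image = cv2.resize(image, (640, 480))
blob = image.transpose(2, 0, 1).astype(np.float32)  # CHW float32, no batch dim (the trace saw a single (3, H, W) image)
outputs = session.run(None, {input_name: blob})
for name, out in zip([o.name for o in session.get_outputs()], outputs):
    print(name, getattr(out, 'shape', None))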

This part needs to be rewritten: scipy's linear_sum_assignment runs on NumPy arrays outside the PyTorch graph, so it cannot be traced into the ONNX export. A possible substitute is sketched after the excerpt below.

    row_ind, col_ind = linear_sum_assignment(-scores_numpy)
    col_ind = torch.from_numpy(col_ind)
    row_ind = torch.from_numpy(row_ind)

    print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ col_ind: {type(col_ind)}')
    print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ num_bodies: {type(num_bodies)}')

    col_ind = (col_ind % (num_bodies+1)) + 1
    # row_ind, col_ind = torch.from_numpy(row_ind).to(device), torch.from_numpy(col_ind).to(device)
    row_ind, col_ind = row_ind.to(device), col_ind.to(device)
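
A minimal sketch of a greedy, pure-PyTorch replacement for linear_sum_assignment (an assumption on my part, not verified against the original Hungarian matching; tracing would also unroll the loop for whatever detection counts are seen at export time, so this alone does not make the graph fully dynamic):

import torch

def greedy_assignment(scores: torch.Tensor):
    # Greedy substitute for scipy.optimize.linear_sum_assignment on -scores (maximization).
    # scores: (num_hands, num_cols) tensor. Returns (row_ind, col_ind) int64 tensors.
    # Picks the globally best remaining (row, column) pair one at a time instead of
    # solving the optimal assignment, but stays entirely inside PyTorch.
    scores = scores.clone()
    num_rows, num_cols = scores.shape
    n = min(num_rows, num_cols)
    row_ind = torch.zeros(n, dtype=torch.int64, device=scores.device)
    col_ind = torch.zeros(n, dtype=torch.int64, device=scores.device)
    very_small = torch.finfo(scores.dtype).min
    for i in range(n):
        flat = torch.argmax(scores)      # best remaining pair in the flattened matrix
        r = flat // num_cols
        c = flat % num_cols
        row_ind[i] = r
        col_ind[i] = c
        scores[r, :] = very_small        # block this row
        scores[:, c] = very_small        # block this column
    return row_ind, col_ind

# Usage inside OverlapEstimationInference, replacing the scipy call:
# row_ind, col_ind = greedy_assignment(scores)
# col_ind = (col_ind % (num_bodies + 1)) + 1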
overlap_estimation.py
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import Linear, ShapeSpec, Conv2d, get_norm, cat
from detectron2.utils.registry import Registry
import numpy as np
import fvcore.nn.weight_init as weight_init
from detectron2.modeling.box_regression import Box2BoxTransform
from scipy.optimize import linear_sum_assignment

ROI_OVERLAP_ESTIMATION_HEAD_REGISTRY = Registry("ROI_OVERLAP_ESTIMATION_HEAD")
ROI_OVERLAP_ESTIMATION_HEAD_REGISTRY.__doc__ = """Registry for Overlap Estimation Module."""

def OverlapEstimationInference(cfg, handbody_components, pred_instances, device):

    num_hands = handbody_components["num_hands"]
    num_bodies = handbody_components["num_bodies"]
    hand_indices = handbody_components["hand_indices"]
    body_indices = handbody_components["body_indices"]
    gt_overlap = (handbody_components["gt_ioa"] > 0).float()

    if num_hands == 0:
        pred_instances[0].pred_body_ids = torch.Tensor([i for i in range(1, num_bodies+1)]).to(device)
        return pred_instances
    if num_bodies == 0:
        pred_instances[0].pred_body_ids = torch.Tensor([num_bodies+1] * num_hands).to(device)
        return pred_instances  # early exit so the matching code below is not run with an empty body set

    pred_body_ids = torch.Tensor([-1.0] * (num_hands+num_bodies)).to(device)
    pred_hand_boxes = handbody_components["hand_boxes"]
    pred_body_boxes = handbody_components["body_boxes"]
    pred_mu = handbody_components["pred_mu"]
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    mu_hand = box2box_transform.get_deltas(
            pred_hand_boxes, pred_mu
        )
    mu_body = [] # A list of length num_hands
    scores_positional_density = []
    for hand_no in range(num_hands):
        hand_boxes_hand_no = pred_hand_boxes[hand_no:hand_no+1]
        new_pred_body_boxes = torch.cat([pred_body_boxes, hand_boxes_hand_no], dim=0)
        hand_boxes_hand_no = hand_boxes_hand_no.repeat(num_bodies+1, 1)
        mu_body_hand_no = box2box_transform.get_deltas(
            hand_boxes_hand_no, new_pred_body_boxes
        ) # (num_bodies+1, 4)
        mu_hand_hand_no = mu_hand[hand_no:hand_no+1].repeat(num_bodies+1, 1)
        # (Num_bodies+1, 4)
        conf_hand_no = torch.exp(
            -2.0 * 1e-1 * torch.sum(torch.abs(mu_hand_hand_no - mu_body_hand_no), dim=1)
        )
        scores_positional_density.append(conf_hand_no.reshape(1, num_bodies+1))
        mu_body.append(mu_body_hand_no)
    scores_positional_density = torch.cat(scores_positional_density, dim=0)
    pred_overlap = handbody_components["pred_overlap"]
    pred_overlap = F.sigmoid(pred_overlap)
    overlap_mask = (pred_overlap > 0.1).float()

    scores = pred_overlap * scores_positional_density * overlap_mask

    scores = torch.cat([scores, scores], dim=1)
    scores_numpy = scores.detach().to("cpu").numpy()

    row_ind, col_ind = linear_sum_assignment(-scores_numpy)
    col_ind = torch.from_numpy(col_ind)
    row_ind = torch.from_numpy(row_ind)

    print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ col_ind: {type(col_ind)}')
    print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ num_bodies: {type(num_bodies)}')

    col_ind = (col_ind % (num_bodies+1)) + 1
    # row_ind, col_ind = torch.from_numpy(row_ind).to(device), torch.from_numpy(col_ind).to(device)
    row_ind, col_ind = row_ind.to(device), col_ind.to(device)

    pred_body_ids_for_bodies = torch.arange(1, num_bodies+1).to(device)
    pred_body_ids_for_hands = torch.FloatTensor([num_bodies+1] * num_hands).to(device)
    pred_body_ids_for_hands[row_ind] = col_ind.float()
    pred_body_ids[hand_indices] = pred_body_ids_for_hands
    pred_body_ids[body_indices] = pred_body_ids_for_bodies.float()
    pred_instances[0].pred_body_ids = pred_body_ids

    return pred_instances

def OverlapEstimationLoss(pred_overlap, ioa_gt, cfg):
    weight = cfg.MODEL.ROI_OVERLAP_ESTIMATION_HEAD.LOSS_WEIGHT
    overlap_gt = (ioa_gt > 0).float()
    loss = weight * F.binary_cross_entropy_with_logits(pred_overlap, overlap_gt, reduction="mean")
    return loss


@ROI_OVERLAP_ESTIMATION_HEAD_REGISTRY.register()
class OverlapEstimationHead(nn.Module):

    def __init__(self, cfg, input_shape: ShapeSpec):

        super(OverlapEstimationHead, self).__init__()

        conv_params = cfg.MODEL.ROI_OVERLAP_ESTIMATION_HEAD.CONV_DIMS
        conv_norm = cfg.MODEL.ROI_OVERLAP_ESTIMATION_HEAD.CONV_NORM
        fc_dims = cfg.MODEL.ROI_OVERLAP_ESTIMATION_HEAD.FC_DIM
        num_fc = len(fc_dims)
        self.cfg = cfg
        self.device = cfg.MODEL.DEVICE
        self._output_size = (2*input_shape.channels, input_shape.height, input_shape.width)
        self.conv_norm_relus = []
        for k, conv_param in enumerate(conv_params):
            conv = Conv2d(
                self._output_size[0],
                conv_param[0],
                kernel_size=conv_param[1],
                padding=conv_param[2],
                bias=not conv_norm,
                norm=get_norm(conv_norm, conv_param[0]),
                activation=F.relu,
            )
            self.add_module("overlap_estimation_conv{}".format(k+1), conv)
            self.conv_norm_relus.append(conv)
            self._output_size = (conv_param[0], self._output_size[1], self._output_size[2])

        for layer in self.conv_norm_relus:
            weight_init.c2_msra_fill(layer)

        self.fcs = []
        for k in range(num_fc):
            fc = Linear(np.prod(self._output_size), fc_dims[k])
            self.add_module("overlap_estimation_fc{}".format(k+1), fc)
            self.fcs.append(fc)
            self._output_size = fc_dims[k]

        for layer in self.fcs:
            weight_init.c2_xavier_fill(layer)

    def forward(self, pred_mu, pred_mu_features, handbody_components, instances):

        if self.training:

            hand_proposal_features = handbody_components["hand_proposal_features"]
            body_proposal_features = handbody_components["body_proposal_features"]
            hand_proposal_boxes = handbody_components["hand_proposal_boxes"]
            body_proposal_boxes = handbody_components["body_proposal_boxes"]
            proposal_body_ids_hands = handbody_components["proposal_body_ids_hands"]
            proposal_body_ids_bodies = handbody_components["proposal_body_ids_bodies"]
            ioa_proposal_boxes = handbody_components["ioa_proposal_boxes"]
            num_hands = hand_proposal_boxes.shape[0]
            num_bodies = body_proposal_features.shape[0]

            if num_hands ==0 or num_bodies == 0:
                return {"loss overlap estimation": torch.sum(body_proposal_boxes) * 0,}

            pred_overlap = []
            for i in range(num_hands):
                h_f = hand_proposal_features[i: i+1]
                new_body_proposal_features = torch.cat([body_proposal_features, h_f], dim=0)
                h_f = hand_proposal_features[i:i+1].repeat(num_bodies+1, 1, 1, 1)
                hb_f = torch.cat([h_f, new_body_proposal_features], dim=1)
                for num in range(len(self.conv_norm_relus)):
                    hb_f = self.conv_norm_relus[num](hb_f)
                hb_f = torch.flatten(hb_f, start_dim=1)
                for num in range(len(self.fcs)-1):
                    hb_f = F.relu(self.fcs[num](hb_f))
                if len(self.fcs) == 1:
                    num = -1
                hb_f = self.fcs[num+1](hb_f)
                hb_f = hb_f.squeeze(1).unsqueeze(0)
                pred_overlap.append(hb_f)
            pred_overlap = torch.cat(pred_overlap, dim=0)
            torch_ones = torch.ones(num_hands, 1).to(ioa_proposal_boxes.device)
            ioa_proposal_boxes = torch.cat([ioa_proposal_boxes, torch_ones], dim=1)
            return {"loss ioa prediction": OverlapEstimationLoss(pred_overlap, ioa_proposal_boxes, self.cfg),}

        else:
            pred_overlap = []
            hand_boxes = handbody_components["hand_boxes"]
            body_boxes = handbody_components["body_boxes"]
            hand_features = handbody_components["hand_features"]
            body_features = handbody_components["body_features"]
            num_hands = hand_boxes.shape[0]
            num_bodies = body_boxes.shape[0]
            for i in range(num_hands):
                h_f = hand_features[i: i+1]
                new_body_features = torch.cat([body_features, h_f], dim=0)
                h_f = hand_features[i:i+1].repeat(num_bodies+1, 1, 1, 1)
                hb_f = torch.cat([h_f, new_body_features], dim=1)
                for num in range(len(self.conv_norm_relus)):
                    hb_f = self.conv_norm_relus[num](hb_f)
                hb_f = torch.flatten(hb_f, start_dim=1)
                for num in range(len(self.fcs)-1):
                    hb_f = F.relu(self.fcs[num](hb_f))
                if len(self.fcs) == 1:
                    num = -1
                hb_f = self.fcs[num+1](hb_f)
                hb_f = hb_f.squeeze(1).unsqueeze(0)
                pred_overlap.append(hb_f)
            if pred_overlap:
                pred_overlap = torch.cat(pred_overlap, dim=0)

            handbody_components["num_hands"] = num_hands
            handbody_components["num_bodies"] = num_bodies
            handbody_components["pred_overlap"] = pred_overlap
            handbody_components["pred_mu"] = pred_mu
            return OverlapEstimationInference(self.cfg, handbody_components, instances, self.device)

def build_overlap_estimation_head(cfg, input_shape):

    name = cfg.MODEL.ROI_OVERLAP_ESTIMATION_HEAD.NAME
    return ROI_OVERLAP_ESTIMATION_HEAD_REGISTRY.get(name)(cfg, input_shape)

This part also needs to be rewritten (return the raw Instances fields instead of the postprocessed results).

            # return results
            return results[0]._fields
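
For reference, results[0]._fields is a plain dict whose values are a detectron2 Boxes object plus score/class/mask/body-id tensors (example values are embedded as a comment in the rcnn.py listing below). A small, hypothetical helper that strips the Boxes wrapper so every traced output is a plain tensor might look like this:

import torch
from detectron2.structures import Boxes

def fields_to_tensors(fields: dict) -> dict:
    # Hypothetical helper: convert an Instances._fields dict into plain tensors.
    # The Boxes wrapper is replaced by its underlying (N, 4) tensor so that every
    # output handed to the tracer is a torch.Tensor.
    out = {}
    for name, value in fields.items():
        out[name] = value.tensor if isinstance(value, Boxes) else value
    return out

# e.g. inside inference(): return fields_to_tensors(results[0]._fields)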
rcnn.py
from detectron2.modeling import GeneralizedRCNN
from ..postprocessing import detector_postprocess
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY

__all__ = ["ModifiedPostProcessingRCNN"]

@META_ARCH_REGISTRY.register()
class ModifiedPostProcessingRCNN(GeneralizedRCNN):

    def __init__(self, cfg):
        super().__init__(cfg)

    def forward(self, batched_inputs, height, width):

        if not self.training:
            return self.inference(batched_inputs, height, width)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
            )
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)

        if self.proposal_generator:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, height, width, features, proposals, gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses
    
    def inference(self, batched_inputs, height, width, detected_instances=None, do_postprocess=True):

        assert not self.training

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        if detected_instances is None:
            if self.proposal_generator:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, height, width, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(height, width, features, detected_instances)

        if do_postprocess:
            return self._postprocess(results, batched_inputs, images.image_sizes)
        else:
            # return results
            return results[0]._fields
            """
            instances[0]._fields
            'pred_boxes':
                Boxes(tensor([[ 579.6041,  302.4180,  701.0220,  373.7803],
                        [ 600.7958,   48.7563, 1083.4999,  696.3106],
                        [ 142.3038,  337.8587,  481.7955,  707.3322],
                        [ 390.4144,  340.0520,  464.8652,  424.0021]], device='cuda:0'))
            'scores':
                tensor([0.9806, 0.9795, 0.9651, 0.9143], device='cuda:0')
            'pred_classes':
                tensor([0, 1, 1, 0], device='cuda:0')
            'pred_masks':
                tensor([[[[0.0701, 0.0951, 0.1169,  ..., 0.0881, 0.0853, 0.0665],
                        [0.1258, 0.1961, 0.2977,  ..., 0.2620, 0.2137, 0.1386],
                        [0.1894, 0.3168, 0.4793,  ..., 0.4686, 0.3339, 0.1788],
                        ...,
                        [0.3384, 0.4685, 0.6113,  ..., 0.5994, 0.5014, 0.2311],
                        [0.2737, 0.3955, 0.5539,  ..., 0.4649, 0.3806, 0.1805],
                        [0.1767, 0.2438, 0.3607,  ..., 0.2455, 0.2246, 0.1151]]],


                        [[[0.6020, 0.7644, 0.8599,  ..., 0.9930, 0.9871, 0.8904],
                        [0.7081, 0.8718, 0.9489,  ..., 0.9989, 0.9977, 0.9610],
                        [0.8359, 0.9588, 0.9896,  ..., 0.9993, 0.9983, 0.9580],
                        ...,
                        [0.7232, 0.9056, 0.9609,  ..., 0.9906, 0.9904, 0.8883],
                        [0.6591, 0.8414, 0.8881,  ..., 0.9833, 0.9800, 0.8654],
                        [0.6129, 0.7868, 0.8474,  ..., 0.9486, 0.9588, 0.8238]]],


                        [[[0.2969, 0.3252, 0.6798,  ..., 0.8717, 0.6705, 0.3716],
                        [0.3674, 0.4133, 0.7467,  ..., 0.9549, 0.8151, 0.4765],
                        [0.4921, 0.5654, 0.8138,  ..., 0.9647, 0.8356, 0.441...
            'pred_body_ids':
                tensor([1., 1., 2., 2.], device='cuda:0')
            """


    def _postprocess(self, instances, batched_inputs, image_sizes):

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            instances, batched_inputs, image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
  • This model must be configured with an input resolution divisible by 32.
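
A quick way to enforce that constraint is to round each dimension up to the nearest multiple of 32 before resizing; a minimal sketch (the 32 here is an assumption taken from the note above):

def round_up_to_multiple(value: int, multiple: int = 32) -> int:
    # Round a dimension up to the nearest multiple of `multiple`.
    return ((value + multiple - 1) // multiple) * multiple

H, W = round_up_to_multiple(480), round_up_to_multiple(640)   # -> 480, 640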

onnxruntime aborts when the input to boxes.max() here becomes a scalar (i.e., when there are zero detections), so change it to use keepdim=True.

File "/home/appuser/.local/lib/python3.8/site-packages/torchvision/ops/boxes.py", line 89


# only on the class idx, and is large enough so that boxes
# from different classes do not overlap
if boxes.numel() == 0:
~~~~~~~~~~~~~~~~~~~~~~
return torch.empty((0,), dtype=torch.int64, device=boxes.device)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
# max_coordinate = boxes.max()
max_coordinate, _  = torch.max(boxes, dim=0, keepdim=True)
offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))

https://medium.com/cars24-data-science-blog/convert-detectron2-model-to-tensorrt-efb3f3cd62b1

Retry. Since Caffe2 is required, a PyTorch version no newer than 1.10.1 has to be installed.

FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04

ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y \
    ca-certificates \
    python3-dev \
    git \
    wget \
    sudo \
    ninja-build \
    python-is-python3 \
    python3-pip \
    libgl1-mesa-dev \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext-dev \
    nano \
    gstreamer1.0-plugins-base \
    gstreamer1.0-plugins-good \
    gstreamer1.0-plugins-bad \
    gstreamer1.0-plugins-ugly \
    gstreamer1.0-libav \
    gstreamer1.0-doc \
    gstreamer1.0-tools \
    gstreamer1.0-x \
    gstreamer1.0-alsa \
    gstreamer1.0-gl \
    gstreamer1.0-gtk3 \
    && sed -i 's/# set linenumbers/set linenumbers/g' /etc/nanorc \
    && apt clean \
    && rm -rf /var/lib/apt/lists/*

# create a non-root user
ENV USERNAME=user
RUN echo "root:root" | chpasswd \
    && adduser --disabled-password --gecos "" "${USERNAME}" \
    && echo "${USERNAME}:${USERNAME}" | chpasswd \
    && echo "%${USERNAME}    ALL=(ALL)   NOPASSWD:    ALL" >> /etc/sudoers.d/${USERNAME} \
    && chmod 0440 /etc/sudoers.d/${USERNAME} \
    && mkdir -p /home/${USERNAME}
USER ${USERNAME}
# RUN echo HOME: ${HOME}
# RUN echo PWD: `pwd`
ENV HOME=/home/${USERNAME}
WORKDIR ${HOME}

ENV PATH="${HOME}/.local/bin:${PATH}"

# install dependencies
# See https://pytorch.org/ for other options if you use a different version of CUDA
RUN pip install --user pip -U
RUN pip install --user tensorboard cmake onnx   # cmake from apt-get is too old
RUN pip install --user torch==1.10 torchvision==0.11.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html
RUN pip install opencv-contrib-python==4.1.2.30 scipy scikit-image
RUN pip install onnx==1.13.1 onnxsim==0.4.17 onnxruntime-gpu==1.13.1 future==0.18.3

RUN pip install --user 'git+https://github.com/facebookresearch/fvcore'
# install detectron2
RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo
# set FORCE_CUDA because during `docker build` cuda is not accessible
ENV FORCE_CUDA="1"
# This will by default build detectron2 for all common cuda architectures and take a lot more time,
# because inside `docker build`, there is no way to tell which architecture will be used.
ARG TORCH_CUDA_ARCH_LIST="Ampere"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"

RUN pip install --user -e detectron2_repo

# Set a fixed model cache directory.
ENV FVCORE_CACHE="/tmp"
WORKDIR ${HOME}/detectron2_repo

RUN echo "export QT_X11_NO_MITSHM=1" >> ${HOME}/.bashrc \
    && echo "sudo chmod 777 /dev/video*" >> ${HOME}/.bashrc
docker build -t pinto0309/bodyhands_onnxexport:latest -f Dockerfile.onnxexport .
xhost +local: && \
docker run -it --rm --gpus all \
-v $PWD:/home/user/detectron2_repo/BodyHands \
-v /tmp/.X11-unix/:/tmp/.X11-unix:rw \
--device /dev/video0:/dev/video0:mwr \
--net=host \
-e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \
-e DISPLAY=$DISPLAY \
--privileged \
pinto0309/bodyhands_onnxexport:latest /bin/bash
#########################################################
Patch the following files so that the BodyHands config and utilities can be imported (the leading numbers are line numbers within each file):

tools/deploy/export_model.py
27 from BodyHands.bodyhands.config.config import add_bodyhands_config
34     add_bodyhands_config(cfg)

/home/user/detectron2_repo/BodyHands/bodyhands/data/dataset_mapper.py
7 from BodyHands.bodyhands.data import detection_utils as utils

/home/user/detectron2_repo/BodyHands/bodyhands/modeling/roi_heads/extract_handbody_components.py
3 from BodyHands.bodyhands.utils.extend_utils_boxes import pairwise_ioa
python tools/deploy/export_model.py \
--config-file BodyHands/configs/BodyHands.yaml \
--output ./ \
--format onnx \
--sample-image BodyHands/480x640.png \
--export-method caffe2_tracing MODEL.DEVICE cuda MODEL.WEIGHTS BodyHands/models/model.pth

No luck.

Traceback (most recent call last):
  File "tools/deploy/export_model.py", line 221, in <module>
    exported_model = export_caffe2_tracing(cfg, torch_model, sample_inputs)
  File "tools/deploy/export_model.py", line 44, in export_caffe2_tracing
    tracer = Caffe2Tracer(cfg, torch_model, inputs)
  File "/home/user/detectron2_repo/detectron2/export/api.py", line 60, in __init__
    C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
KeyError: 'ModifiedPostProcessingRCNN'
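
The caffe2_tracing path only supports the meta-architectures registered in detectron2's META_ARCH_CAFFE2_EXPORT_TYPE_MAP, so the custom ModifiedPostProcessingRCNN is rejected immediately. One possible, untested workaround would be to map the custom class onto the existing GeneralizedRCNN converter before building the tracer; whether the converter then copes with the modified forward/inference signatures (the extra height/width arguments) is another matter.

# Hypothetical workaround, not verified: reuse the GeneralizedRCNN caffe2 converter
# for the custom meta-architecture. The modified forward(batched_inputs, height, width)
# signature may still break the conversion.
from detectron2.export.caffe2_modeling import (
    META_ARCH_CAFFE2_EXPORT_TYPE_MAP,
    Caffe2GeneralizedRCNN,
)

META_ARCH_CAFFE2_EXPORT_TYPE_MAP["ModifiedPostProcessingRCNN"] = Caffe2GeneralizedRCNN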