Open — 2023/10/25にコメント追加 — 1
YOLOフォーマットの頭部検出用アノテーションデータを一括生成するスクリプト

demo_yolov7_onnx.py
#!/usr/bin/env python

import os
import copy
import glob
import cv2
import numpy as np
import onnxruntime
from argparse import ArgumentParser
from typing import Tuple, Optional, List
from tqdm import tqdm


class YOLOv7ONNX(object):
    """Head detector that wraps a YOLOv7 ONNX model via onnxruntime.

    The model is expected to contain its own post-processing and to emit
    two outputs, in this order: ``scores`` (float32[N, 1]) and
    ``batchno_classid_y1x1y2x2`` (int64[N, 6]). Instances are callable:
    ``boxes, scores = detector(bgr_image)``.
    """

    def __init__(
        self,
        model_path: Optional[str] = 'yolov7_tiny_head_0.768_post_480x640.onnx',
        class_score_th: Optional[float] = 0.30,
        providers: Optional[List] = [
            (
                'TensorrtExecutionProvider', {
                    'trt_engine_cache_enable': True,
                    'trt_engine_cache_path': '.',
                    'trt_fp16_enable': True,
                }
            ),
            'CUDAExecutionProvider',
            'CPUExecutionProvider',
        ],
    ):
        """YOLOv7ONNX

        Parameters
        ----------
        model_path: Optional[str]
            ONNX file path for YOLOv7

        class_score_th: Optional[float]
            Score threshold. Detections whose score is not strictly
            greater than this value are discarded. Default: 0.30

        providers: Optional[List]
            Name of onnx execution providers
            Default:
            [
                (
                    'TensorrtExecutionProvider', {
                        'trt_engine_cache_enable': True,
                        'trt_engine_cache_path': '.',
                        'trt_fp16_enable': True,
                    }
                ),
                'CUDAExecutionProvider',
                'CPUExecutionProvider',
            ]
        """
        # Threshold
        self.class_score_th = class_score_th

        # Model loading
        session_option = onnxruntime.SessionOptions()
        session_option.log_severity_level = 3
        self.onnx_session = onnxruntime.InferenceSession(
            model_path,
            sess_options=session_option,
            # The default `providers` list is a single object shared by every
            # call; hand onnxruntime a private copy so it can never be altered
            # behind the back of later instances.
            providers=copy.deepcopy(providers),
        )
        # Providers actually selected by onnxruntime (may be a subset of the
        # requested ones).
        self.providers = self.onnx_session.get_providers()

        model_inputs = self.onnx_session.get_inputs()
        self.input_shapes = [model_input.shape for model_input in model_inputs]
        self.input_names = [model_input.name for model_input in model_inputs]
        self.output_names = [
            model_output.name
            for model_output in self.onnx_session.get_outputs()
        ]


    def __call__(
        self,
        image: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Detect heads in a single image.

        Parameters
        ----------
        image: np.ndarray
            Entire image (HWC, BGR channel order as produced by cv2.imread).
            The array is not modified.

        Returns
        -------
        face_boxes: np.ndarray
            Predicted boxes in pixel coordinates of `image`:
            int64[facecount, 4] laid out as [x1, y1, x2, y2]

        face_scores: np.ndarray
            Predicted box scores: float32[facecount, 1]
        """
        # PreProcess (cv2.resize allocates a new array, so `image` itself is
        # left untouched and no defensive copy is required)
        resized_image = self.__preprocess(
            image,
        )

        # Inference: add the batch dimension and feed every model input
        inference_image = np.asarray([resized_image], dtype=np.float32)
        scores, boxes = self.onnx_session.run(
            self.output_names,
            {input_name: inference_image for input_name in self.input_names},
        )

        # PostProcess
        face_boxes, face_scores = self.__postprocess(
            image=image,
            scores=scores,
            boxes=boxes,
        )

        return face_boxes, face_scores


    def __preprocess(
        self,
        image: np.ndarray,
        swap: Optional[Tuple[int,int,int]] = (2, 0, 1),
    ) -> np.ndarray:
        """Resize, normalize and re-order an image for the model input.

        Parameters
        ----------
        image: np.ndarray
            Entire image

        swap: tuple
            Axis permutation applied after resizing.
            HWC to CHW: (2,0,1)
            CHW to HWC: (1,2,0)
            HWC to HWC: (0,1,2)
            CHW to CHW: (0,1,2)

        Returns
        -------
        resized_image: np.ndarray
            Resized and normalized image (contiguous float32).
        """
        # Model input is NCHW, cv2.resize expects (width, height)
        input_height = int(self.input_shapes[0][2])
        input_width = int(self.input_shapes[0][3])
        resized_image = cv2.resize(image, (input_width, input_height))

        # Normalization to [0, 1] + BGR->RGB
        resized_image = np.divide(resized_image, 255.0)
        resized_image = resized_image[..., ::-1]
        resized_image = resized_image.transpose(swap)
        # The channel flip/transpose produce strided views; make the buffer
        # contiguous before it is handed to onnxruntime.
        resized_image = np.ascontiguousarray(
            resized_image,
            dtype=np.float32,
        )
        return resized_image


    def __postprocess(
        self,
        image: np.ndarray,
        scores: np.ndarray,
        boxes: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Filter detections by score and rescale boxes to the source image.

        Parameters
        ----------
        image: np.ndarray
            Entire image. Only its height and width are used.

        scores: np.ndarray
            float32[N, 1]

        boxes: np.ndarray
            int64[N, 6] laid out as [batchno, classid, y1, x1, y2, x2],
            expressed in model-input pixel coordinates.

        Returns
        -------
        faceboxes: np.ndarray
            Predicted boxes: int64[facecount, 4] as [x1, y1, x2, y2] in
            pixel coordinates of `image`. Shape is (0, 4) when nothing
            passes the threshold.

        facescores: np.ndarray
            Predicted box confs: [facecount, 1]. Shape is (0, 1) when
            nothing passes the threshold.
        """
        image_height, image_width = image.shape[:2]
        input_height = int(self.input_shapes[0][2])
        input_width = int(self.input_shapes[0][3])

        # Head Detector is
        #     N -> Number of boxes detected
        #     batchno -> always 0: BatchNo.0
        #     classid -> always 0: "Head"
        #
        # scores: float32[N,1],
        # batchno_classid_y1x1y2x2: int64[N,6],
        keep_idxs = scores[:, 0] > self.class_score_th
        scores_keep = scores[keep_idxs, :]
        boxes_keep = boxes[keep_idxs, :]

        # Clamp to the model-input frame, then scale to the source image.
        x_min = np.maximum(boxes_keep[:, 3], 0) * image_width / input_width
        y_min = np.maximum(boxes_keep[:, 2], 0) * image_height / input_height
        x_max = (
            np.minimum(boxes_keep[:, 5], input_width)
            * image_width / input_width
        )
        y_max = (
            np.minimum(boxes_keep[:, 4], input_height)
            * image_height / input_height
        )

        # astype truncates toward zero, matching int() on each coordinate.
        faceboxes = np.stack(
            [x_min, y_min, x_max, y_max],
            axis=1,
        ).astype(np.int64)

        return faceboxes, scores_keep


def _draw_detection(
    debug_image: np.ndarray,
    face_box: np.ndarray,
    score: float,
) -> None:
    """Draw one detection (box and score label) onto `debug_image` in place."""
    # Plain Python ints keep the OpenCV drawing calls independent of the
    # NumPy integer type of the detector output.
    x_min, y_min, x_max, y_max = (int(v) for v in face_box)
    label = f'{score:.2f}'
    # Put the label just above the box, or at y=10 when that would leave
    # the top of the image.
    label_origin = (x_min, y_min - 10 if y_min - 10 > 0 else 10)
    # A thick white stroke under a thin green one keeps the overlay readable
    # on both dark and bright backgrounds.
    strokes = (((255, 255, 255), 2), ((0, 255, 0), 1))

    for color, thickness in strokes:
        cv2.rectangle(
            debug_image,
            (x_min, y_min),
            (x_max, y_max),
            color,
            thickness,
        )
    for color, thickness in strokes:
        cv2.putText(
            debug_image,
            label,
            label_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            color,
            thickness,
            cv2.LINE_AA,
        )


def _yolo_label_line(
    face_box: np.ndarray,
    image_width: int,
    image_height: int,
    class_id: int = 0,
    width_scale: float = 1.18,
    height_scale: float = 1.14,
) -> str:
    """Format one [x1, y1, x2, y2] pixel box as a YOLO label line.

    The line is ``"<class_id> <cx> <cy> <w> <h>\\n"`` with all four values
    normalized by the image size. Width and height are enlarged by
    `width_scale` / `height_scale` (presumably so the label covers the
    whole head rather than the tight detection - TODO confirm) and capped
    at 1.0 so the enlarged size never exceeds the normalized range.
    """
    x_min, y_min, x_max, y_max = (int(v) for v in face_box)
    cx = (x_min + x_max) / 2.0 / image_width
    cy = (y_min + y_max) / 2.0 / image_height
    w = min(abs(x_max - x_min) / image_width * width_scale, 1.0)
    h = min(abs(y_max - y_min) / image_height * height_scale, 1.0)
    return f"{class_id} {cx} {cy} {w} {h}\n"


def main():
    """Generate YOLO-format head annotations for every image in a folder.

    For each readable image in ``--image_folder`` the detector is run, a
    ``<stem>.txt`` label file and an untouched copy of the image are written
    to ``--output_folder``, and a preview with the detections drawn on it is
    shown. Pressing ESC in the preview window stops the run.

    Label files are rewritten on every run, so re-running the script does
    not accumulate duplicate annotations.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '-m',
        '--model',
        type=str,
        default='yolov7_tiny_head_0.768_post_480x640.onnx',
    )
    parser.add_argument(
        '-i',
        '--image_folder',
        type=str,
        default='images',
    )
    parser.add_argument(
        '-o',
        '--output_folder',
        type=str,
        default='output',
    )
    args = parser.parse_args()

    model = YOLOv7ONNX(
        model_path=args.model,
    )

    # Honour --image_folder instead of a hard-coded directory.
    img_path_list = sorted(glob.glob(os.path.join(args.image_folder, '*')))

    os.makedirs(args.output_folder, exist_ok=True)

    for img_path in tqdm(img_path_list, dynamic_ncols=True):
        image = cv2.imread(img_path)
        if image is None:
            # Not a decodable image (sub-directory, text file, ...): skip it
            # rather than crash inside the detector.
            continue

        debug_image = image.copy()
        face_boxes, face_scores = model(debug_image)

        image_height = int(image.shape[0])
        image_width = int(image.shape[1])

        basename = os.path.basename(img_path)
        # splitext handles every extension and case; chained str.replace only
        # covered '.jpg' / '.PNG' and could make the label path collide with
        # the copied image.
        stem = os.path.splitext(basename)[0]
        txt_file = os.path.join(args.output_folder, stem + '.txt')

        with open(txt_file, 'w') as out_file:
            for face_box, face_score in zip(face_boxes, face_scores):
                _draw_detection(debug_image, face_box, float(face_score[0]))
                out_file.write(
                    _yolo_label_line(face_box, image_width, image_height)
                )

        # The clean image (without the debug overlay) is stored next to its
        # label file.
        cv2.imwrite(os.path.join(args.output_folder, basename), image)
        cv2.imshow("test", debug_image)
        key = cv2.waitKey(1)
        if key == 27: # ESC
            break

    cv2.destroyAllWindows()

# Run the annotation pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()