🔍
Unity Sentis YOLOv8 Nano で物体検知

2025/02/04に公開
Unity
 Unity Sentis で YOLOv8 Nano を実装！
 はじめに
Unity で軽量な物体検出モデルを動作させたい場合、Sentis を利用すると、GPU/CPU で ONNX モデルを推論できます。本記事では、YOLOv8 Nano を Sentis で実装し、NMS (Non-Maximum Suppression) を適用して最終的な物体検出結果を得る方法を解説します。
iPhone16で動作確認しましたが、メモリも問題なくリアルタイムで物体検知ができました。

 1. モデルのロード
まず、YOLOv8 Nano の ONNX モデルをロードします。
var model1 = ModelLoader.Load(asset);
asset は Unity にインポートした ONNX モデルです。

 2. バウンディングボックス座標の変換
YOLOv8 の出力 (x_center, y_center, width, height) を (x_min, y_min, x_max, y_max) に変換するための行列を用意します。
centersToCorners = new Tensor<float>(new TensorShape(4, 4), new float[]{
    1, 0, 1, 0,
    0, 1, 0, 1,
    -0.5f, 0, 0.5f, 0,
    0, -0.5f, 0, 0.5f
});
この行列を適用することで、バウンディングボックスを適切な形式に変換できます。

 3. モデルの出力を整理
YOLOv8 の出力 [1, 84, 8400] から、座標情報とクラススコアを分離します。
var boxCoords = modelOutput[0, 0..4, ..].Transpose(0, 1);  // shape=(8400,4)
var allScores = modelOutput[0, 4.., ..];  // shape=(80,8400)

boxCoords → 各ボックスの (x_center, y_center, width, height)

allScores → 各ボックスの 80 クラス分のスコア

 4. クラススコアの処理
各ボックスごとに「最も高いスコアを持つクラス」と「そのスコア」を取得します。
var scores = FF.ReduceMax(allScores, 0);  // shape=(8400)
var classIDs = FF.ArgMax(allScores, 0);   // shape=(8400)

scores → 各ボックスの最高スコア

classIDs → 最もスコアが高いクラスの ID

 5. バウンディングボックスの座標変換
var boxCorners = FF.MatMul(boxCoords, Functional.Constant(centersToCorners));
この変換で (x_min, y_min, x_max, y_max) に変換します。

 6. Non-Maximum Suppression (NMS) の適用
var indices = FF.NMS(boxCorners, scores, iouThreshold, scoreThreshold); // shape=(N)

iouThreshold → ボックスがどの程度重なっていたら削除するか（例: 0.5）

scoreThreshold → 低スコアのボックスを削除するための閾値（例: 0.3）
NMS を適用することで、重複するボックスを削除 し、最終的な検出結果を取得できます。

 7. 最終的なボックスとクラス ID を取得
var indices2 = indices.Unsqueeze(-1).BroadcastTo(new int[] { 4 }); // shape=(N,4)
var coords = FF.Gather(boxCoords, 0, indices2); // shape=(N,4)
var labelIDs = FF.Gather(classIDs, 0, indices); // shape=(N)

coords → NMS 後のバウンディングボックス座標

labelIDs → NMS 後のクラス ID

 8. 最適化されたモデルを作成
model1 = graph.Compile(coords, labelIDs);
これにより、NMS などの後処理を含む最適化済みモデル が作成されます。

 9. エンジンを作成し、推論を実行
engine = new Worker(model1, backend);

backend = BackendType.GPUCompute → GPU で推論（可能な場合）

backend = BackendType.CPU → CPU で推論

 10. 実装コード全文
以下に、使用している CameraFeatureExtractor.cs と YoloV8NanoDetector.cs の全コードを掲載します。
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using Unity.Sentis;

/// <summary>
/// Streams a device camera into a <see cref="RawImage"/>, runs the YOLOv8 detector on every
/// new camera frame, and overlays one instance of <see cref="boxPrefab"/> per detection.
/// </summary>
public class CameraFeatureExtractor : MonoBehaviour
{
    public YoloV8NanoDetector yolov8NanoDetector;  // detector component, assigned in the Inspector
    private WebCamTexture cameraTexture;

    public RawImage rawImage;
    public AspectRatioFitter fit;
    public GameObject boxPrefab;

    // Box instances created by this component. They are reused between frames instead of being
    // destroyed and re-instantiated, and only these objects are ever touched, so unrelated
    // children of the RawImage are left alone.
    private readonly List<RectTransform> boxPool = new List<RectTransform>();

    void Start()
    {
        // Camera start-up runs as a coroutine.
        StartCoroutine(InitializeCamera());
    }

    void OnDestroy()
    {
        // Release the camera; otherwise the device keeps capturing after this object is gone.
        if (cameraTexture != null)
        {
            cameraTexture.Stop();
            Destroy(cameraTexture);
            cameraTexture = null;
        }
    }

    /// <summary>
    /// Picks a camera (the first non-front-facing device, otherwise the first device), starts it
    /// at a requested 640x480 and binds it to <see cref="rawImage"/>, stretched over its parent.
    /// </summary>
    IEnumerator InitializeCamera()
    {
        if (rawImage == null || fit == null)
        {
            Debug.LogError("CameraFeatureExtractor: rawImage and fit must be assigned in the Inspector");
            yield break;
        }

        WebCamDevice[] devices = WebCamTexture.devices;
        if (devices.Length == 0)
        {
            Debug.Log("No camera devices found");
            yield break;
        }

        // Prefer a non-front-facing camera; fall back to the first device if there is none.
        string deviceName = devices[0].name;
        for (int i = 0; i < devices.Length; i++)
        {
            if (!devices[i].isFrontFacing)
            {
                deviceName = devices[i].name;
                break;
            }
        }

        cameraTexture = new WebCamTexture(deviceName, 640, 480);
        cameraTexture.Play();
        rawImage.texture = cameraTexture;

        // Stretch the RawImage over its whole parent rect.
        RectTransform rt = rawImage.rectTransform;
        rt.anchorMin = Vector2.zero;
        rt.anchorMax = Vector2.one;
        rt.offsetMin = Vector2.zero;
        rt.offsetMax = Vector2.zero;

        fit.aspectMode = AspectRatioFitter.AspectMode.EnvelopeParent;

        yield return null;
    }

    void Update()
    {
        // Only do work on frames where the camera delivered a new image.
        if (cameraTexture == null || !cameraTexture.didUpdateThisFrame)
        {
            return;
        }

        // Keep the displayed aspect ratio in sync with the camera resolution.
        fit.aspectRatio = (float)cameraTexture.width / (float)cameraTexture.height;

        // Compensate for a vertically mirrored feed.
        float scaleY = cameraTexture.videoVerticallyMirrored ? -1f : 1f;
        RectTransform rawRect = rawImage.rectTransform;
        rawRect.localScale = new Vector3(1f, scaleY, 1f);

        // Counter-rotate by the angle reported by the camera.
        int orient = -cameraTexture.videoRotationAngle;
        rawRect.localEulerAngles = new Vector3(0, 0, orient);

        if (yolov8NanoDetector == null || !yolov8NanoDetector.isActiveAndEnabled)
        {
            return;
        }

        // Run inference; the result is delivered through OnDetectComplete.
        StartCoroutine(yolov8NanoDetector.Detect(
            cameraTexture,
            rawRect,
            OnDetectComplete
        ));
    }

    /// <summary>
    /// Callback for detection results. Positions one pooled box per detection under the RawImage
    /// and hides pooled boxes that are not needed for this frame.
    /// </summary>
    /// <param name="boxes">
    /// Detections whose center and size are applied directly as <c>localPosition</c> and
    /// <c>sizeDelta</c>, so they must already be in the RawImage's local space.
    /// A null list is treated as "no detections".
    /// </param>
    private void OnDetectComplete(List<BoundingBox> boxes)
    {
        if (rawImage == null || boxPrefab == null)
        {
            return;
        }

        RectTransform rawRect = rawImage.rectTransform;
        int detectionCount = boxes != null ? boxes.Count : 0;
        int shown = 0;

        for (; shown < detectionCount; shown++)
        {
            RectTransform boxRT = GetOrCreateBox(shown, rawRect);
            if (boxRT == null)
            {
                break; // prefab is unusable; already reported
            }

            BoundingBox bb = boxes[shown];
            boxRT.gameObject.SetActive(true);

            // Detection Y is negated before being applied as the local Y position.
            boxRT.localPosition = new Vector3(bb.centerX, -bb.centerY);
            boxRT.sizeDelta = new Vector2(bb.width, bb.height);
        }

        // Hide leftovers from frames that had more detections.
        for (int i = shown; i < boxPool.Count; i++)
        {
            if (boxPool[i] != null)
            {
                boxPool[i].gameObject.SetActive(false);
            }
        }
    }

    /// <summary>
    /// Returns the pooled box at <paramref name="index"/>, instantiating <see cref="boxPrefab"/>
    /// under <paramref name="parent"/> when the slot is missing or its object was destroyed.
    /// Returns null if the prefab has no <see cref="RectTransform"/>.
    /// </summary>
    private RectTransform GetOrCreateBox(int index, RectTransform parent)
    {
        if (index < boxPool.Count && boxPool[index] != null)
        {
            return boxPool[index];
        }

        GameObject newBox = Instantiate(boxPrefab, parent);
        RectTransform boxRT = newBox.GetComponent<RectTransform>();
        if (boxRT == null)
        {
            Debug.LogError("CameraFeatureExtractor: boxPrefab must have a RectTransform");
            Destroy(newBox);
            return null;
        }

        if (index < boxPool.Count)
        {
            boxPool[index] = boxRT;
        }
        else
        {
            boxPool.Add(boxRT);
        }

        return boxRT;
    }
}

using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.Sentis;
using FF = Unity.Sentis.Functional;


/// <summary>
/// YOLOv8 Nano detector for Sentis 2.x.
/// On enable it wraps the ONNX model in a functional graph that also selects the best class per
/// candidate box and applies NMS, then compiles it into a <see cref="Worker"/>.
/// <see cref="Detect(Texture, RectTransform, Action{List{BoundingBox}})"/> runs that worker on
/// a texture and reports the surviving boxes through a callback.
/// </summary>
public class YoloV8NanoDetector : MonoBehaviour
{
    [Header("Model / Label Info")]
    [SerializeField]
    private ModelAsset modelAsset;
    public TextAsset labelsAsset; // label list, one label per line

    // Label names indexed by class ID; empty when no labels asset is assigned.
    private string[] labels = Array.Empty<string>();

    [Header("Network Size")]
    public int netWidth = 640;      // network input width
    public int netHeight = 640;     // network input height

    [Header("Thresholds")]
    [Range(0f, 1f)]
    public float minConfidence = 0.25f;  // NOTE: not read by this class; scoreThreshold is what NMS uses
    [Range(0f, 1f)]
    public float nmsIoU = 0.45f;         // NOTE: not read by this class; iouThreshold is what NMS uses
    // Both thresholds below are passed to FF.NMS while the graph is built in OnEnable, so
    // edits made afterwards are only picked up the next time the component is enabled.
    [SerializeField, Range(0, 1)] float scoreThreshold = 0.5f;
    public int maxObjects = 20;          // NOTE: not read by this class; see MaxBoxesPerFrame
    [Range(0, 1)] public float iouThreshold = 0.45f;

    // Upper bound on the number of boxes converted per Detect call.
    private const int MaxBoxesPerFrame = 200;

    // 4x4 matrix that turns (x_center, y_center, w, h) rows into (x_min, y_min, x_max, y_max).
    Tensor<float> centersToCorners;
    const BackendType backend = BackendType.CPU;

    private Worker engine;

    void OnEnable()
    {
        // Load labels; fall back to an empty table so lookups never hit a null array.
        labels = labelsAsset != null
            ? labelsAsset.text.Split(new[] { '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries)
            : Array.Empty<string>();

        if (modelAsset == null)
        {
            Debug.LogError("YoloV8NanoDetector: modelAsset is not assigned");
            return;
        }

        var model1 = ModelLoader.Load(modelAsset);

        centersToCorners = new Tensor<float>(new TensorShape(4, 4),
            new float[]
            {
                1,      0,      1,      0,
                0,      1,      0,      1,
                -0.5f,  0,      0.5f,   0,
                0,      -0.5f,  0,      0.5f
            });

        // Build model + post-processing as a single graph.
        var graph = new FunctionalGraph();
        var input = graph.AddInput(model1, 0);
        var modelOutput = FF.Forward(model1, input)[0];
        var boxCoords = modelOutput[0, 0..4, ..].Transpose(0, 1);          // shape=(8400,4)
        var allScores = modelOutput[0, 4.., ..];                           // shape=(80,8400)
        var scores = FF.ReduceMax(allScores, 0);                           // shape=(8400)
        var classIDs = FF.ArgMax(allScores, 0);                            // shape=(8400)
        var boxCorners = FF.MatMul(boxCoords, FF.Constant(centersToCorners));
        var indices = FF.NMS(boxCorners, scores, iouThreshold, scoreThreshold); // shape=(N)
        var indices2 = indices.Unsqueeze(-1).BroadcastTo(new int[] { 4 }); // shape=(N,4)
        // Gather from boxCoords (not boxCorners): the outputs stay in center/size form.
        var coords = FF.Gather(boxCoords, 0, indices2);                    // shape=(N,4)
        var labelIDs = FF.Gather(classIDs, 0, indices);                    // shape=(N)
        model1 = graph.Compile(coords, labelIDs);

        engine = new Worker(model1, backend);
    }

    void OnDisable()
    {
        // Release everything OnEnable allocated.
        if (engine != null)
        {
            engine.Dispose();
            engine = null;
        }

        if (centersToCorners != null)
        {
            centersToCorners.Dispose();
            centersToCorners = null;
        }
    }

    /// <summary>
    /// Overload kept for callers that have no display rect: boxes are returned in network-input
    /// pixels (netWidth x netHeight) with the origin moved to the center of that area.
    /// </summary>
    public IEnumerator Detect(Texture texture, Action<List<BoundingBox>> onComplete)
    {
        return Detect(texture, null, onComplete);
    }

    /// <summary>
    /// Coroutine that runs detection on <paramref name="texture"/>.
    /// All work, including the callback, happens before the first yield.
    /// </summary>
    /// <param name="texture">Image to run the network on; converted to netWidth x netHeight, 3 channels.</param>
    /// <param name="rawRect">
    /// Rect the boxes will be drawn in. Box values are scaled from network pixels to this rect's
    /// size and offset so the origin is the rect's center. When null, the network size is used.
    /// </param>
    /// <param name="onComplete">Receives the detections; an empty list on failure. Never receives null.</param>
    public IEnumerator Detect(Texture texture, RectTransform rawRect, Action<List<BoundingBox>> onComplete)
    {
        List<BoundingBox> finalList = RunInference(texture, rawRect);

        onComplete?.Invoke(finalList);

        yield return null;
    }

    /// <summary>
    /// Runs the worker once and converts its outputs into <see cref="BoundingBox"/> values.
    /// Returns an empty list when the worker is not available, the texture is null, or the
    /// expected outputs are missing.
    /// </summary>
    private List<BoundingBox> RunInference(Texture texture, RectTransform rawRect)
    {
        var results = new List<BoundingBox>();

        // engine is null when OnEnable failed (already logged there) or the component is disabled.
        if (engine == null || texture == null)
        {
            return results;
        }

        using Tensor inputTensor = TextureConverter.ToTensor(texture, netWidth, netHeight, 3);

        engine.Schedule(inputTensor);

        // Tensors returned by PeekOutput belong to the worker, so they are not disposed here.
        var boxesOut = engine.PeekOutput() as Tensor<float>;
        var labelIDsOut = engine.PeekOutput("output_1") as Tensor<int>;

        if (boxesOut == null || labelIDsOut == null)
        {
            Debug.LogError("YOLO output not found. Check output names (output_0, output_1).");
            return results;
        }

        // The clones are owned by this method, so they are disposed when it returns.
        using var cpuBoxes = boxesOut.ReadbackAndClone();
        using var cpuLabelIDs = labelIDsOut.ReadbackAndClone();

        int boxesFound = cpuBoxes.shape[0];
        float displayWidth = rawRect != null ? rawRect.rect.width : netWidth;
        float displayHeight = rawRect != null ? rawRect.rect.height : netHeight;

        float scaleX = displayWidth / netWidth;
        float scaleY = displayHeight / netHeight;

        int count = Mathf.Min(boxesFound, MaxBoxesPerFrame);
        for (int n = 0; n < count; n++)
        {
            int classID = cpuLabelIDs[n];

            results.Add(new BoundingBox
            {
                centerX = cpuBoxes[n, 0] * scaleX - displayWidth / 2,
                centerY = cpuBoxes[n, 1] * scaleY - displayHeight / 2,
                width = cpuBoxes[n, 2] * scaleX,
                height = cpuBoxes[n, 3] * scaleY,
                // Fall back to the numeric ID when the label table has no entry for it.
                label = classID >= 0 && classID < labels.Length ? labels[classID] : classID.ToString(),
            });
        }

        return results;
    }
}

/// <summary>
/// Plain data holder for a single detection: the box center, the box size and a class label.
/// The coordinate space and units are whatever the code that fills the struct uses.
/// </summary>
public struct BoundingBox
{
    // Box center.
    public float centerX, centerY;

    // Box extent.
    public float width, height;

    // Class label text for this detection.
    public string label;
}

 まとめ
🎯 YOLOv8 Nano を Unity Sentis で実装する流れ
モデルをロード
座標変換の行列を設定
モデルの出力を整理（座標＋スコア）
最も高いクラススコアを取得
バウンディングボックス座標を変換
NMS を適用し、不要なボックスを削除
最終的なバウンディングボックスとクラス ID を取得
最適化されたモデルを作成
推論エンジン（Worker）を作成し、リアルタイム推論を実行
💡 次のステップ
カメラ映像を取り込み、リアルタイム推論を実装する
検出結果を Unity 上で描画する
モデルの軽量化や高速化を試す（量子化など）
🚀 これで、Unity Sentis で YOLOv8 Nano を実装し、物体検出をリアルタイムで動作させる準備が整いました！
📌 参考
Sentis 公式ドキュメント
YOLOv8 GitHub
Hugging Face Discussion
GPTで記事を生成していますので、記載ミスなどありましたらお知らせいただけますと幸いです。