
Running MMPose in C++


Installing MMPose

  • Clone the repository.
    cd ~/Documents
    git clone https://github.com/open-mmlab/mmpose.git
    cd mmpose
    
  • Install the dependencies.
    pip install -r requirements.txt
    
  • Install MMPose.
    pip install -e .
    
  • Download the model. (The mim command comes from OpenMIM; install it with pip install -U openmim if it is not available.)
    mkdir checkpoints
    cd checkpoints
    mim download mmpose --config rtmpose-l_8xb256-420e_coco-256x192 --dest .
    
  • Save an image that contains at least one person as ./demo/resources/demo.png.

Installing MMDeploy

  • Clone the repository.
    cd ~/Documents
    git clone https://github.com/open-mmlab/mmdeploy.git
    cd mmdeploy
    
  • Install the dependencies.
    pip install -r requirements.txt
    
  • Install MMDeploy.
    pip install -e .
    

Installing MMDetection

  • Clone the repository.
    cd ~/Documents
    git clone https://github.com/open-mmlab/mmdetection.git
    cd mmdetection
    
  • Install MMDetection.
    python3 -m pip install -e .
    
  • Download the model.
    mkdir checkpoints
    cd checkpoints
    mim download mmdet --config rtmdet_l_8xb32-300e_coco --dest .
    

Converting to ONNX

Here we convert RTMPose-l and RTMDet to ONNX format.
Several additional packages are required for the conversion; I have forgotten exactly which ones, so check the error messages or ask an AI (at minimum, the onnx and onnxruntime Python packages are likely needed).

  • Convert RTMPose to ONNX format.
    cd ~/Documents/mmdeploy
    python tools/deploy.py \
        configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \
        ~/Documents/mmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py \
        ~/Documents/mmpose/checkpoints/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth \
        ~/Documents/mmpose/demo/resources/demo.png \
        --work-dir mmdeploy_models/rtmpose_coco_onnx
    
  • Convert RTMDet to ONNX format.
    cd ~/Documents/mmdeploy
    python tools/deploy.py \
        configs/mmdet/detection/detection_onnxruntime_dynamic.py \
        ~/Documents/mmdetection/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \
        ~/Documents/mmdetection/checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \
        ~/Documents/mmdetection/demo/demo.jpg \
        --work-dir mmdeploy_models/rtmdet_coco_onnx
    
  • Check the converted ONNX models.
    ls ~/Documents/mmdeploy/mmdeploy_models/rtmpose_coco_onnx/end2end.onnx
    ls ~/Documents/mmdeploy/mmdeploy_models/rtmdet_coco_onnx/end2end.onnx
    
    If both files exist, the conversion succeeded.

Installing onnxruntime for C++

cd ~/Documents
wget https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-linux-x64-gpu-1.22.0.tgz
tar -zxvf onnxruntime-linux-x64-gpu-1.22.0.tgz
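
Before writing the full pipeline, it helps to confirm the input/output tensor names of the exported models; the test code below assumes "input", "dets"/"labels" (RTMDet) and "simcc_x"/"simcc_y" (RTMPose). The following is a minimal sketch of such a check using the onnxruntime C++ API from the archive above (the file name check_io_names.cpp is arbitrary); compile and link it against the same include and lib paths as in the CMakeLists.txt further below.

check_io_names.cpp
#include <iostream>
#include <onnxruntime_cxx_api.h>

int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <model.onnx>" << std::endl;
        return 1;
    }
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "InspectIO");
    Ort::Session session(env, argv[1], Ort::SessionOptions{});
    Ort::AllocatorWithDefaultOptions allocator;

    // Print every input tensor name recorded in the ONNX graph
    for (size_t i = 0; i < session.GetInputCount(); ++i) {
        std::cout << "input[" << i << "]: " << session.GetInputNameAllocated(i, allocator).get() << std::endl;
    }
    // Print every output tensor name
    for (size_t i = 0; i < session.GetOutputCount(); ++i) {
        std::cout << "output[" << i << "]: " << session.GetOutputNameAllocated(i, allocator).get() << std::endl;
    }
    return 0;
}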

Test code

Place the RTMPose model at ~/Documents/mmdeploy/mmdeploy_models/rtmpose_coco_onnx/end2end.onnx and the RTMDet model at ~/Documents/mmdeploy/mmdeploy_models/rtmdet_coco_onnx/end2end.onnx.
Place the test image at ~/Documents/mmpose/demo/resources/demo.png.
Note that main.cpp below hardcodes different paths at the top of main(); adjust them (as absolute paths) to wherever the models and the image actually are.

  • Test code that loads the ONNX models and runs inference in C++.

    main.cpp
    #include <iostream>
    #include <vector>
    #include <string>
    #include <numeric>
    #include <algorithm>
    
    #include <opencv2/opencv.hpp>
    #include <onnxruntime_cxx_api.h>
    #include <chrono>
    #include <omp.h>
    
    // ========== Data structures ==========
    struct KeyPoint {
        float x;
        float y;
        float score;
    };
    
    struct BBox {
        cv::Rect rect;
        float score;
    };
    
    // ========== Pose estimation result for one person ==========
    struct PoseResult {
        BBox bbox;
        std::vector<KeyPoint> keypoints;
    };
    
    // ========== Stage 1: person detector ==========
    std::vector<BBox> run_detector(Ort::Session& session, const cv::Mat& image) {
        // --- 1. Preprocessing ---
        const float INPUT_WIDTH = 640.0;
        const float INPUT_HEIGHT = 640.0;
        
        float scale = std::min(INPUT_WIDTH / static_cast<float>(image.cols), INPUT_HEIGHT / static_cast<float>(image.rows));
        int new_w = static_cast<int>(image.cols * scale);
        int new_h = static_cast<int>(image.rows * scale);
    
        cv::Mat resized_image;
        cv::resize(image, resized_image, cv::Size(new_w, new_h));
    
        cv::Mat letterbox_image = cv::Mat::zeros(cv::Size(INPUT_WIDTH, INPUT_HEIGHT), CV_8UC3);
        resized_image.copyTo(letterbox_image(cv::Rect(0, 0, new_w, new_h)));
    
        cv::Mat blob = cv::dnn::blobFromImage(letterbox_image, 1.0f / 255.0f, cv::Size(INPUT_WIDTH, INPUT_HEIGHT), cv::Scalar(0, 0, 0), true, false, CV_32F); 
    
        // --- 2. Inference ---
        auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        std::vector<int64_t> input_shape = {1, 3, (int64_t)INPUT_HEIGHT, (int64_t)INPUT_WIDTH};
        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, blob.ptr<float>(), blob.total(), input_shape.data(), input_shape.size());
    
        std::vector<const char*> input_names = {"input"};
        std::vector<const char*> output_names = {"dets", "labels"};
    
        auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), 2);
    
        // --- 3. Postprocessing ---
        const float* dets_data = output_tensors[0].GetTensorData<float>();
        const int64_t* labels_data = output_tensors[1].GetTensorData<int64_t>();
    
        auto dets_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
        const int num_dets = static_cast<int>(dets_shape[1]);
    
        std::vector<BBox> final_bboxes;
        for (int i = 0; i < num_dets; ++i) {
            if (labels_data[i] == 0) { 
                float score = dets_data[i * 5 + 4];
                if (score > 0.3) {
                    int x1 = static_cast<int>(dets_data[i * 5 + 0] / scale);
                    int y1 = static_cast<int>(dets_data[i * 5 + 1] / scale);
                    int x2 = static_cast<int>(dets_data[i * 5 + 2] / scale);
                    int y2 = static_cast<int>(dets_data[i * 5 + 3] / scale);
                    final_bboxes.push_back({cv::Rect(x1, y1, x2 - x1, y2 - y1), score});
                }
            }
        }
        return final_bboxes;
    }
    
    // ========== Stage 2: pose estimator (per-person processing) ==========
    // Each detected person is processed individually, so the decoder takes the raw Ort::Value outputs.
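    // SimCC represents each keypoint with two independent 1-D classification vectors
    // (one over x bins, one over y bins); the coordinate is recovered as the argmax bin
    // index divided by the split ratio (bins per input pixel), and the peak bin values
    // are used here as a rough confidence.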
    std::vector<KeyPoint> decode_simcc(const Ort::Value& simcc_x_tensor, const Ort::Value& simcc_y_tensor, const cv::Size& model_input_size) {
        const float* simcc_x_data = simcc_x_tensor.GetTensorData<float>();
        const float* simcc_y_data = simcc_y_tensor.GetTensorData<float>();
    
        auto x_shape = simcc_x_tensor.GetTensorTypeAndShapeInfo().GetShape();
        auto y_shape = simcc_y_tensor.GetTensorTypeAndShapeInfo().GetShape();
        const int num_keypoints = static_cast<int>(x_shape[1]);
        const int x_dim = static_cast<int>(x_shape[2]);
        const int y_dim = static_cast<int>(y_shape[2]);
    
        const float simcc_split_ratio_x = static_cast<float>(x_dim) / model_input_size.width;
        const float simcc_split_ratio_y = static_cast<float>(y_dim) / model_input_size.height;
    
        std::vector<KeyPoint> keypoints;
        keypoints.reserve(num_keypoints);
    
        for (int k = 0; k < num_keypoints; ++k) {
            const float* x_k_data = simcc_x_data + k * x_dim;
            int max_idx_x = std::distance(x_k_data, std::max_element(x_k_data, x_k_data + x_dim));
            float coord_x = static_cast<float>(max_idx_x) / simcc_split_ratio_x;
    
            const float* y_k_data = simcc_y_data + k * y_dim;
            int max_idx_y = std::distance(y_k_data, std::max_element(y_k_data, y_k_data + y_dim));
            float coord_y = static_cast<float>(max_idx_y) / simcc_split_ratio_y;
    
            float final_score = std::max(x_k_data[max_idx_x], y_k_data[max_idx_y]);
            keypoints.push_back({coord_x, coord_y, final_score});
        }
        return keypoints;
    }
    
    // ========== Main: computation and drawing are kept separate ==========
    int main() {
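        // NOTE: "~" is not expanded here (neither cv::imread nor Ort::Session does shell expansion);
        // replace these with absolute paths to wherever the models and the test image were saved.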
        const std::string detector_model_path = "~/Documents/cpp_test/rtmpose_cpp_demo/models/det_end2end_l.onnx";
        const std::string pose_model_path   = "~/Documents/cpp_test/rtmpose_cpp_demo/models/pose_end2end_l.onnx";
        const std::string image_path        = "~/Documents/cpp_test/human-pose.png";
    
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "PipelineDemo");
    
        // --- Initialize the ONNX Runtime sessions ---
        Ort::SessionOptions detector_session_options;
        detector_session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
        Ort::Session detector_session(env, detector_model_path.c_str(), detector_session_options);
    
        Ort::SessionOptions pose_session_options;
        pose_session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
        Ort::Session pose_session(env, pose_model_path.c_str(), pose_session_options);
    
        cv::Mat original_image = cv::imread(image_path);
        if (original_image.empty()) {
            std::cerr << "Could not read the image: " << image_path << std::endl;
            return -1;
        }
    
        using Clock = std::chrono::high_resolution_clock;
        auto start_time = Clock::now();
    
        // --- Stage 1: person detection ---
        std::vector<BBox> bboxes = run_detector(detector_session, original_image);
        std::cout << "Detected " << bboxes.size() << " persons." << std::endl;
        if (bboxes.empty()) return 0;
    
        auto detector_end_time = Clock::now();
        std::chrono::duration<double, std::milli> detector_elapsed_time = detector_end_time - start_time;
        std::cout << "Detector processing time: " << detector_elapsed_time.count() << " ms" << std::endl;
        
        // --- 1. Parallel computation loop ---
        // Prepare a vector to hold one result per detected person
        std::vector<PoseResult> final_results(bboxes.size());
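        // Concurrent Run() calls on a single Ort::Session are documented as thread-safe,
        // so each OpenMP thread can reuse pose_session for its own crop of the image.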
    
        #pragma omp parallel for
        for (int i = 0; i < bboxes.size(); ++i) {
            const auto& bbox_info = bboxes[i];
            
            // --- Preprocessing ---
            cv::Rect padded_bbox = bbox_info.rect;
            padded_bbox.x -= 10;
            padded_bbox.y -= 10;
            padded_bbox.width += 20;
            padded_bbox.height += 20;
            padded_bbox &= cv::Rect(0, 0, original_image.cols, original_image.rows);
            if (padded_bbox.width == 0 || padded_bbox.height == 0) continue;
    
            cv::Mat cropped_image = original_image(padded_bbox);
            
            const cv::Size pose_input_size(192, 256);
            double scale = std::min(static_cast<double>(pose_input_size.width) / cropped_image.cols, static_cast<double>(pose_input_size.height) / cropped_image.rows);
            int new_w = static_cast<int>(cropped_image.cols * scale);
            int new_h = static_cast<int>(cropped_image.rows * scale);
            cv::Mat resized_crop;
            cv::resize(cropped_image, resized_crop, cv::Size(new_w, new_h));
    
            cv::Mat padded_crop = cv::Mat::zeros(pose_input_size, CV_8UC3);
            resized_crop.copyTo(padded_crop(cv::Rect(0, 0, new_w, new_h)));
            cv::Mat pose_blob = cv::dnn::blobFromImage(padded_crop, 1.0f / 255.0f, pose_input_size, cv::Scalar(0,0,0), true, false, CV_32F);
    
            // --- Inference ---
            auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
            std::vector<int64_t> pose_input_shape = {1, 3, pose_input_size.height, pose_input_size.width};
            std::vector<const char*> pose_input_names = {"input"};
            std::vector<const char*> pose_output_names = {"simcc_x", "simcc_y"};
            Ort::Value pose_input_tensor = Ort::Value::CreateTensor<float>(memory_info, pose_blob.ptr<float>(), pose_blob.total(), pose_input_shape.data(), pose_input_shape.size());
            auto pose_output_tensors = pose_session.Run(Ort::RunOptions{nullptr}, pose_input_names.data(), &pose_input_tensor, 1, pose_output_names.data(), 2);
            
            // --- Postprocessing ---
            std::vector<KeyPoint> keypoints_in_padded = decode_simcc(pose_output_tensors[0], pose_output_tensors[1], pose_input_size);
            std::vector<KeyPoint> final_keypoints;
            final_keypoints.reserve(keypoints_in_padded.size());
            for(const auto& kp : keypoints_in_padded) {
                float x_in_crop = kp.x / scale;
                float y_in_crop = kp.y / scale;
                final_keypoints.push_back({
                    x_in_crop + padded_bbox.x,
                    y_in_crop + padded_bbox.y,
                    kp.score
                });
            }
            
            // --- Store the result ---
            final_results[i] = {bbox_info, final_keypoints};
        }
    
        auto end_time = Clock::now();
        std::chrono::duration<double, std::milli> elapsed_time = end_time - start_time;
        std::cout << "Total processing time: " << elapsed_time.count() << " ms" << std::endl;
    
        // --- 2. Serial drawing loop ---
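        // Keypoint index pairs to connect, following the standard COCO 17-keypoint ordering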
        const std::vector<std::pair<int, int>> skeleton = {{15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12}, {5, 6}, {5, 7}, {6, 8}, {7, 9}, {8, 10}, {1, 2}, {0, 1}, {0, 2}, {1, 3}, {2, 4}, {0, 5}, {0, 6}};
        const cv::Scalar bone_color(0, 255, 0); 
        const cv::Scalar joint_color(0, 0, 255);
        const float KEYPOINT_CONFIDENCE_THRESHOLD = 0.45;
    
        for (const auto& result : final_results) {
            if (result.bbox.rect.empty() || result.keypoints.empty()) continue;
    
            // Draw the bounding box
            cv::rectangle(original_image, result.bbox.rect, cv::Scalar(255, 0, 0), 2);
            
            // Draw the joints (points)
            for (const auto& kp : result.keypoints) {
                if (kp.score > KEYPOINT_CONFIDENCE_THRESHOLD) {
                    cv::circle(original_image, cv::Point(static_cast<int>(kp.x), static_cast<int>(kp.y)), 3, joint_color, -1);
                }
            }
            
            // Draw the skeleton (lines)
            for (const auto& bone : skeleton) {
                const KeyPoint& kp1 = result.keypoints[bone.first];
                const KeyPoint& kp2 = result.keypoints[bone.second];
                if (kp1.score > KEYPOINT_CONFIDENCE_THRESHOLD && kp2.score > KEYPOINT_CONFIDENCE_THRESHOLD) {
                    cv::line(original_image, cv::Point(static_cast<int>(kp1.x), static_cast<int>(kp1.y)), cv::Point(static_cast<int>(kp2.x), static_cast<int>(kp2.y)), bone_color, 2);
                }
            }
        }
    
        cv::imshow("Final Pipeline Result", original_image);
        cv::waitKey(0);
        cv::imwrite("pipeline_result_separate.jpg", original_image);
    
        return 0;
    }
    
  • The CMakeLists.txt is as follows; build and run commands are sketched after the listing.

    CMakeLists.txt
    # Minimum CMake version and project name
    cmake_minimum_required(VERSION 3.13)
    project(RTMPoseDemo CXX)
    
    # Use C++17
    set(CMAKE_CXX_STANDARD 17)
    set(CMAKE_CXX_STANDARD_REQUIRED ON)
    
    # Find the required packages (OpenCV and OpenMP)
    find_package(OpenCV REQUIRED)
    find_package(OpenMP REQUIRED)
    
    # Build the executable
    add_executable(rtmpose_demo src/main.cpp)
    
    # Tell the executable where to find the required headers
    # (CMake does not expand "~", so use $ENV{HOME} or an absolute path)
    target_include_directories(rtmpose_demo PRIVATE
        ${OpenCV_INCLUDE_DIRS}
        "$ENV{HOME}/Documents/onnxruntime-linux-x64-gpu-1.22.0/include"
    )
    
    # Link the required libraries
    target_link_libraries(rtmpose_demo PRIVATE
        ${OpenCV_LIBS}
        OpenMP::OpenMP_CXX
        "$ENV{HOME}/Documents/onnxruntime-linux-x64-gpu-1.22.0/lib/libonnxruntime.so"
    )
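
  • Build and run. A minimal sketch, assuming the project root contains the CMakeLists.txt above with main.cpp under src/, and that onnxruntime 1.22.0 was unpacked under ~/Documents as described earlier.
    mkdir build
    cd build
    cmake ..
    make
    # If libonnxruntime.so is not found at run time, add its directory to the loader path:
    export LD_LIBRARY_PATH=~/Documents/onnxruntime-linux-x64-gpu-1.22.0/lib:$LD_LIBRARY_PATH
    ./rtmpose_demo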
    
