void scale_nn_simd(int32_t* src, int src_width, int src_height, int32_t* dst, int dst_width, int dst_height)
{
    __m256 scale_w = _mm256_set1_ps((float)src_width / dst_width);
    __m256 scale_h = _mm256_set1_ps((float)src_height / dst_height);
    for (int y = 0; y < dst_height; y++) {
        __m256i v_x = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        for (int x = 0; x < dst_width; x+=8) {
            __m256 v = _mm256_cvtepi32_ps(v_x);
            __m256 x0f = _mm256_mul_ps(v, scale_w);
            __m256 y0f = _mm256_mul_ps(_mm256_set1_ps((float)y), scale_h);
            __m256i x0 = _mm256_cvtps_epi32(x0f);
            __m256i y0 = _mm256_cvtps_epi32(y0f);

            __m256i src_index = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_set1_epi32(src_width), y0), x0);
            __m256i data = _mm256_i32gather_epi32(src, src_index, 4);
            _mm256_storeu_epi32(dst + dst_width * y + x, data);

            v_x = _mm256_add_epi32(v_x, _mm256_set1_epi32(8));
        }
    }
}

書き換える中で気になった点

AVX2の命令セットではAVX512と同じバイト単位の処理がない？
- AVX512は対応CPU次第だが存在
- 上記のため、元のコードはuint8_tの方が使えている
- SIMDの命令だと_mm512_mask_cvtepi32_storeu_epi8と_mm512_mask_i32gather_epi32の2つ
AVX2の命令セットではAVX512と同じ丸めを指定する処理がない？
- AVX512は丸めを指定できている
- 書き換えたコードは指定できていない
- 上記のため、AVX512とバイナリレベルで出力結果は一致していないと考えられる

さらに高速化

CPUでもスレッド毎の処理が可能なため、雑だがOpenMPのループを適用
自環境では2倍弱改善。内側ループに対しては悪化。

void scale_nn_simd(int32_t* src, int src_width, int src_height, int32_t* dst, int dst_width, int dst_height)
{
    __m256 scale_w = _mm256_set1_ps((float)src_width / dst_width);
    __m256 scale_h = _mm256_set1_ps((float)src_height / dst_height);
#pragma omp parallel for
    for (int y = 0; y < dst_height; y++) {
        for (int x = 0; x < dst_width; x+=8) {
            __m256i v_x = _mm256_set_epi32(x+7, x+6, x+5, x+4, x+3, x+2, x+1, x);
            __m256 v = _mm256_cvtepi32_ps(v_x);
            __m256 x0f = _mm256_mul_ps(v, scale_w);
            __m256 y0f = _mm256_mul_ps(_mm256_set1_ps((float)y), scale_h);
            __m256i x0 = _mm256_cvtps_epi32(x0f);
            __m256i y0 = _mm256_cvtps_epi32(y0f);

            __m256i src_index = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_set1_epi32(src_width), y0), x0);
            __m256i data = _mm256_i32gather_epi32(src, src_index, 4);
            _mm256_storeu_epi32(dst + dst_width * y + x, data);
        }
    }
}

書き換える中で気になった点

2重ループの場合、外側ループでは効果を発揮しにくいと考えていたが違った
- 内側に適用するとスレッド分割数がループ数に対して小さいため、SIMDでのキャッシュヒットが悪くなっている？
- 以下の1重ループと同等の処理になっていると推測

void scale_nn_simd(int32_t* src, int src_width, int src_height, int32_t* dst, int dst_width, int dst_height)
{
    __m256 scale_w = _mm256_set1_ps((float)src_width / dst_width);
    __m256 scale_h = _mm256_set1_ps((float)src_height / dst_height);
#pragma omp parallel for
    for (int i = 0; i < dst_height * dst_width; i += 8) {
        int y = i / dst_width;
        int x = i % dst_width;
        __m256i v_x = _mm256_set_epi32(x + 7, x + 6, x + 5, x + 4, x + 3, x + 2, x + 1, x);
        __m256 v = _mm256_cvtepi32_ps(v_x);
        __m256 x0f = _mm256_mul_ps(v, scale_w);
        __m256 y0f = _mm256_mul_ps(_mm256_set1_ps((float)y), scale_h);
        __m256i x0 = _mm256_cvtps_epi32(x0f);
        __m256i y0 = _mm256_cvtps_epi32(y0f);

        __m256i src_index = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_set1_epi32(src_width), y0), x0);
        __m256i data = _mm256_i32gather_epi32(src, src_index, 4);
        _mm256_storeu_epi32(dst + dst_width * y + x, data);

        // v_x = _mm256_add_epi32(v_x, _mm256_set1_epi32(8));
    }
}

キャッシュヒットを確認するために

WindowsでのCPUアーキテクチャ非依存のキャッシュ測定用のツールが見つからなかった。
このため、perfコマンドを代替手段として使う。上記のコードをgccでビルドが通るように環境を立ち上げる。
ここではWSL2の環境でできるか確認する。
Visual Studioでの環境構築はこちらを参照。
また、gcc向けにはアライメント修飾子を書き換える必要がある。

脚注

リンク先の入力画像は別のものを使う。理由はwiki参照。512x512の画素のものであれば何でも良い。 ↩︎

totem

CUDA勉強メモ

CUDAとドライバのインストール

cuda toolkitをダウンロードし、ドライバとともにインストール
CUDA Toolkit Downloads

CUDAコードでの置き換え

Chat GPT 3.5を使用

__global__ void nearestNeighborResize(const uint8_t* src, uint8_t* dst, int srcWidth, int srcHeight, int dstWidth, int dstHeight) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dstWidth && y < dstHeight) {
        int srcX = static_cast<int>((x * (srcWidth / static_cast<float>(dstWidth)) + 0.5f));
        int srcY = static_cast<int>((y * (srcHeight / static_cast<float>(dstHeight)) + 0.5f));

        int srcIndex = srcY * srcWidth + srcX;
        int dstIndex = y * dstWidth + x;

        dst[dstIndex] = src[srcIndex];
    }
}

totem

SIMD, CUDA比較

画素数が多い場合はCUDAが早い
ただ、出力結果のバイナリ一致が得られていない。かつ、ビルド設定などの最適化はVisual StudioのReleaseビルド初期設定のままとしている

CPU, GPU高速化の勉強用メモ

CUDA参考資料

SIMD参考資料

HPC参考資料

ボトルネック解析参考資料

SIMD勉強メモ

CPUの対応の確認

AVX2コードでの置き換え

書き換える中で気になった点

さらに高速化

書き換える中で気になった点

キャッシュヒットを確認するために

CUDA勉強メモ

CUDAとドライバのインストール

CUDAコードでの置き換え

SIMD, CUDA比較

比較結果