
Running WhisperX on a Serverless GPU (Modal)

Published 2024/09/16

First, create an account here and complete the setup:
https://modal.com/docs/guide#getting-started
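In short, setup comes down to installing the CLI and authenticating (a minimal sketch; follow the guide above for the authoritative steps):

pip install modal
python -m modal setup  # opens a browser window to link your Modal account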

https://github.com/m-bain/whisperX

Write code like the following:

"""
Modal Labのserverless GPUを使ってWhisperXを使った文字起こしを行う
"""
import json

import modal
import pandas as pd

app = modal.App("your-app-name")

@app.function(
    image=modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.11")
    .apt_install("git", "ffmpeg")
    .pip_install("git+https://github.com/m-bain/whisperx.git", "torchaudio"),
    gpu="A10G",
    timeout=1200,
)
def transcribe_whisperx(
    wav_data: bytes,
    language_code: str,
):
    import whisperx

    device = "cuda"

    batch_size = 16  # reduce if low on GPU mem
    compute_type = "float16"  # change to "int8" if low on GPU mem (may reduce accuracy)

    # Write the received bytes to disk so whisperx (via ffmpeg) can read them.
    audio_file_path = "/root/audio.mp3"
    with open(audio_file_path, "wb") as f:
        f.write(wav_data)

    model = whisperx.load_model("large-v3", device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file_path)
    result = model.transcribe(audio, batch_size=batch_size, language=language_code)

    # Align the transcript to obtain word-level timestamps.
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device, return_char_alignments=False
    )

    return result


@app.local_entrypoint()
def main(audio_file_path: str, language_code: str):
    # Read the local audio file and ship its bytes to the remote function.
    with open(audio_file_path, "rb") as f:
        wav_data = f.read()
    result = transcribe_whisperx.remote(
        wav_data,
        language_code,
    )
    # A local entrypoint's return value is discarded, so print the result instead.
    print(result)

Run it with the modal CLI installed during setup:

modal run run_whisperx.py --audio-file-path {audio_file_path} --language-code ja
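The aligned result is a plain dict with a "segments" list, where each segment typically carries start, end, and text fields plus word-level timestamps after alignment. As a rough sketch, a hypothetical helper (save_transcript is not part of the code above) for saving and inspecting the result locally could look like this:

import json

def save_transcript(result: dict, out_path: str = "transcript.json") -> None:
    # Persist the full aligned result (segments + word timings) as JSON.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Print a quick per-segment summary for eyeballing the output.
    for seg in result["segments"]:
        print(f'[{seg["start"]:7.2f}s - {seg["end"]:7.2f}s] {seg["text"]}')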

Here the audio file path can change from run to run, so the example reads the file into raw bytes and passes them to the function, but you can also mount the file itself into the container, as shown below.

https://modal.com/docs/reference/modal.Mount#modalmount

@app.function(
    ...,
    mounts=[modal.Mount.from_local_file("./audio.mp3", remote_path="/root/audio.mp3")],
    gpu="a10g",
    timeout=1200,
)
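With the file mounted, the remote function can skip the bytes round-trip and read the audio straight from remote_path. A minimal sketch (the image variable assumes the image definition from above has been hoisted out of the decorator, and transcribe_mounted is a hypothetical name; the transcription logic is unchanged):

image = (
    modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.11")
    .apt_install("git", "ffmpeg")
    .pip_install("git+https://github.com/m-bain/whisperx.git", "torchaudio")
)

@app.function(
    image=image,
    mounts=[modal.Mount.from_local_file("./audio.mp3", remote_path="/root/audio.mp3")],
    gpu="A10G",
    timeout=1200,
)
def transcribe_mounted(language_code: str):
    import whisperx

    device = "cuda"
    # The file already exists inside the container at the mounted path.
    audio = whisperx.load_audio("/root/audio.mp3")
    model = whisperx.load_model("large-v3", device, compute_type="float16")
    return model.transcribe(audio, batch_size=16, language=language_code)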
