🐨

Unity(Hololens2)でAzureの音声合成が強制終了できない問題を解決

ikebowsan

2024/03/31に公開

発生した問題

Azure AI Speech ServiceのText to Speechを使ってGPTからのレスポンスを音声出力しています。
レスポンスが長い時は途中で音声を止められるようにしているのですが、キャンセルボタンを押しても音声が止まりません。
ずっとしゃべり続けられるとうるさいので何とかしたいです。

現状の実装

引数に渡った文字列（GPTからのレスポンス）を音声出力しています。

AISpeechManager.cs

using System;
using System.Collections;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Collections.Generic;

using UnityEngine;
using UnityEngine.UI;

using Microsoft.MixedReality.Toolkit.UI;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Azure;


/// <summary>
/// Speaks text with Azure AI Speech (Text to Speech).
/// The synthesizer is created without an explicit audio configuration, so the
/// Speech SDK itself plays the synthesized audio.
/// </summary>
public class AISpeechManager : MonoBehaviour
{
    private SpeechConfig speechConfig;
    private SpeechSynthesizer speechSynthesizer;
    private SpeechSynthesisResult speechSynthesisResult;

    public Interactable voiceControlButton;

    private string language = "ja-JP";
    private string prompt = "You are an insanely talented AI assistant named Jarvis. Please respond with content that is full of intelligence.";
    private string voiceName = "ja-JP-KeitaNeural";

    /// <summary>
    /// Builds the speech configuration and the synthesizer once, when the component is loaded.
    /// </summary>
    void Awake()
    {
        speechConfig = SpeechConfig.FromSubscription("<your subscription>", "<your region>");
        speechConfig.SpeechSynthesisLanguage = language;
        speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
        speechSynthesizer = new SpeechSynthesizer(speechConfig);
    }

    /// <summary>
    /// Releases the synthesizer when the component is destroyed.
    /// The method must be spelled exactly <c>OnDestroy</c>; Unity does not call a misspelled message.
    /// </summary>
    void OnDestroy()
    {
        // Awake may not have completed successfully, so guard against a missing synthesizer.
        speechSynthesizer?.Dispose();
#if UNITY_EDITOR
        // In the editor, also leave play mode when this component is destroyed.
        UnityEditor.EditorApplication.isPlaying = false;
#endif
    }

    /// <summary>
    /// Produces the text to speak and speaks it.
    /// </summary>
    public async Task SpeakText()
    {
        ... // (omitted in the article) build `text`, e.g. from the GPT response
        await SpeakResponseTextFromGPT(text);
    }

    /// <summary>
    /// Asks the synthesizer to stop.
    /// Note: this cancels synthesis only; audio that has already been synthesized keeps playing.
    /// </summary>
    public async Task StopSpeaking()
    {
        await speechSynthesizer.StopSpeakingAsync();
    }

    /// <summary>
    /// Wraps <paramref name="text"/> in SSML and speaks it.
    /// </summary>
    /// <param name="text">Plain text to speak (for example a GPT response).</param>
    private async Task SpeakResponseTextFromGPT(String text)
    {
        // The text is embedded in an XML document, so characters such as '&' or '<'
        // must be escaped or the service rejects the SSML.
        var escapedText = System.Security.SecurityElement.Escape(text);

        var ssml = $"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'>" +
                   $"<voice name='{voiceName}'>" +
                   "<mstts:express-as style='documentary-narration'>" +
                   "<prosody rate='15%'>" +
                   escapedText +
                   "</prosody>" +
                   "</mstts:express-as>" +
                   "</voice></speak>";

        speechSynthesisResult = await speechSynthesizer.SpeakSsmlAsync(ssml);
        switch (speechSynthesisResult.Reason)
        {
            case ResultReason.SynthesizingAudioCompleted:
                break;
            case ResultReason.Canceled:
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(speechSynthesisResult);
                Debug.Log($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    Debug.Log($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                }
                break;
            default:
                break;
        }
    }
}

原因

音声出力を強制終了するつもりで呼んでいたStopSpeakingAsyncメソッドですが、キャンセルされるのは音声合成の処理であって、すでに合成済みの音声の出力（再生）までは止めてくれないようです。
そのため、まだ合成が完了していない部分のテキストはキャンセルされて出力されませんが、合成が完了してしまった音声は止めようがありません。。。

関数名的に音声出力も止めてくれると思うじゃん...(小言)

解決策

Azure側では音声合成までにして、音声出力はUnityの「AudioSource」を使って明示的に実行させます。
音声を止めたいときはAudioSourceのStopメソッドを使って止めます。

修正

1. AudioSourceを作成

音声出力するためのオブジェクトを用意します。
左上の「Audio」内から「AudioSource」を作成

2. 処理を改修

new SpeechSynthesizerの第二引数（AudioConfig）はnullにします。nullを渡すとSDK側ではスピーカーへの出力が行われず、合成結果の音声データ（AudioData）だけを受け取れるので、それをAudioSourceで再生します。

AISpeechManager.cs

using System;
using System.Collections;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Collections.Generic;

using UnityEngine;
using UnityEngine.UI;

using Microsoft.MixedReality.Toolkit.UI;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Azure;


/// <summary>
/// Speaks text with Azure AI Speech (Text to Speech) and plays it through a Unity
/// <see cref="AudioSource"/>, so that playback can be stopped at any time.
/// The synthesizer is created with a null audio configuration and is used only to
/// obtain the synthesized audio data; playback is done by <see cref="audioSource"/>.
/// </summary>
public class AISpeechManager : MonoBehaviour
{
    // Sample rate of the clip created from the synthesized data.
    // Must match the output format selected in Awake (Raw16Khz16BitMonoPcm).
    private const int SampleRate = 16000;

    private SpeechConfig speechConfig;
    private SpeechSynthesizer speechSynthesizer;
    private SpeechSynthesisResult speechSynthesisResult;

    public Interactable voiceControlButton;
    public AudioSource audioSource;

    private string language = "ja-JP";
    private string prompt = "You are an insanely talented AI assistant named Jarvis. Please respond with content that is full of intelligence.";
    private string voiceName = "ja-JP-KeitaNeural";

    // The clip created for the most recent utterance; destroyed when replaced so
    // that runtime-created clips do not pile up.
    private AudioClip synthesizedClip;

    /// <summary>
    /// Builds the speech configuration and the synthesizer once, when the component is loaded.
    /// </summary>
    void Awake()
    {
        speechConfig = SpeechConfig.FromSubscription("<your subscription>", "<your region>");
        speechConfig.SpeechSynthesisLanguage = language;
        speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
        // Second argument changed to null: the SDK does not play the audio itself.
        speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
    }

    /// <summary>
    /// Releases the synthesizer and the last generated clip when the component is destroyed.
    /// The method must be spelled exactly <c>OnDestroy</c>; Unity does not call a misspelled message.
    /// </summary>
    void OnDestroy()
    {
        // Awake may not have completed successfully, so guard against a missing synthesizer.
        speechSynthesizer?.Dispose();

        if (synthesizedClip != null)
        {
            Destroy(synthesizedClip);
            synthesizedClip = null;
        }
#if UNITY_EDITOR
        // In the editor, also leave play mode when this component is destroyed.
        UnityEditor.EditorApplication.isPlaying = false;
#endif
    }

    /// <summary>
    /// Produces the text to speak and speaks it.
    /// </summary>
    public async Task SpeakText()
    {
        ... // (omitted in the article) build `text`, e.g. from the GPT response
        await SpeakResponseTextFromGPT(text);
    }

    /// <summary>
    /// Cancels any synthesis in progress and stops playback on the AudioSource.
    /// </summary>
    public async Task StopSpeaking()
    {
        await speechSynthesizer.StopSpeakingAsync();

        // StopSpeakingAsync only cancels synthesis; playback is stopped here.
        if (audioSource != null)
        {
            audioSource.Stop();
        }
    }

    /// <summary>
    /// Wraps <paramref name="text"/> in SSML, synthesizes it and plays the result.
    /// </summary>
    /// <param name="text">Plain text to speak (for example a GPT response).</param>
    private async Task SpeakResponseTextFromGPT(String text)
    {
        // The text is embedded in an XML document, so characters such as '&' or '<'
        // must be escaped or the service rejects the SSML.
        var escapedText = System.Security.SecurityElement.Escape(text);

        var ssml = $"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'>" +
                   $"<voice name='{voiceName}'>" +
                   "<mstts:express-as style='documentary-narration'>" +
                   "<prosody rate='15%'>" +
                   escapedText +
                   "</prosody>" +
                   "</mstts:express-as>" +
                   "</voice></speak>";

        // Await instead of blocking on .Result: blocking here would freeze the calling
        // (normally the main) thread for the whole duration of the synthesis request.
        using (var result = await speechSynthesizer.SpeakSsmlAsync(ssml))
        {
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                PlaySynthesizedAudio(result.AudioData);
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                Debug.Log($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    Debug.Log($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                }
            }
        }
    }

    /// <summary>
    /// Converts raw 16-bit little-endian mono PCM into an AudioClip and plays it on the AudioSource.
    /// </summary>
    /// <param name="pcm">Raw PCM bytes returned by the synthesizer.</param>
    private void PlaySynthesizedAudio(byte[] pcm)
    {
        // Two bytes per sample.
        var sampleCount = pcm == null ? 0 : pcm.Length / 2;

        // Nothing to play, or the AudioSource is not assigned / already destroyed.
        if (sampleCount == 0 || audioSource == null)
        {
            return;
        }

        // Build the AudioClip from the synthesized audio: 16-bit samples -> floats in [-1, 1).
        var samples = new float[sampleCount];
        for (var i = 0; i < sampleCount; ++i)
        {
            samples[i] = (short)(pcm[i * 2 + 1] << 8 | pcm[i * 2]) / 32768.0F;
        }

        var clip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, SampleRate, false);
        clip.SetData(samples, 0);
        audioSource.clip = clip;
        audioSource.Play();

        // The previous clip is no longer referenced by the AudioSource; release it.
        if (synthesizedClip != null)
        {
            Destroy(synthesizedClip);
        }
        synthesizedClip = clip;
    }
}

3. AISpeechManagerに割り当て

これでちゃんと音声をストップできると思います。

ヘッドウォータース

株式会社ヘッドウォータースのテックブログです。生成AI、LLM、Azureのサービスや資格、IoT、XR系などData&AIとApp modernizeに関して幅広く投稿します！