１話者のTTS

ライブラリインストール

pip install google-genai python-dotenv pyaudio

コード全文

import datetime
import os
import wave

from google import genai
from google.genai import types
from dotenv import load_dotenv
# Load settings via python-dotenv before the environment is read below.
load_dotenv()

# API key handed to the Gemini client; None when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Set up the wave file to save the output:
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Path of the WAV file to create.
        pcm: Raw PCM frame bytes to store as the audio data.
        channels: Number of audio channels written to the header.
        rate: Frame rate (samples per second) written to the header.
        sample_width: Sample width in bytes written to the header.
    """
    writer = wave.open(filename, "wb")
    with writer:
        # Set the header in one call; frame count 0 and "NONE" compression
        # are the writer's defaults, and the frame count is fixed up after
        # the data has been written.
        writer.setparams(
            (channels, sample_width, rate, 0, "NONE", "not compressed")
        )
        writer.writeframes(pcm)

# Gemini API client, authenticated with the key read from the environment.
client = genai.Client(api_key=GEMINI_API_KEY)

file_name = "out.wav"

# Start the clock right before the request is built and sent.
start_time = datetime.datetime.now()

# Single-speaker setup: one prebuilt voice for the whole utterance.
voice = types.VoiceConfig(
    prebuilt_voice_config=types.PrebuiltVoiceConfig(
        voice_name='Leda',
    )
)
tts_config = types.GenerateContentConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(voice_config=voice),
)
response = client.models.generate_content(
    # Alternative model: "gemini-2.5-flash-preview-tts"
    model="gemini-2.5-pro-preview-tts",
    contents="こんにちは。本日はどうされましたか？",
    config=tts_config,
)
elapsed = datetime.datetime.now() - start_time
print(f"time: {elapsed}")

# The audio bytes are read from the first part of the first candidate.
first_part = response.candidates[0].content.parts[0]
data = first_part.inline_data.data
wave_file(file_name, data)  # Saves the file to the current directory

30ほどの声のバリエーションがあるらしいです。
voice_nameは以下で確認できます。

"こんにちは。本日はどうされましたか？" という文だと、生成時間はflash=3.5秒程度, pro=5秒程度。

M Sea Bass

プロンプトでかなり調整ができそうです。サンプルのノートブックでは以下のようなプロンプトを利用しています。

contents="""Say "I am a very knowlegeable model, especially when using grounding", wait 5 seconds then say "Don't you think?"."""

待機時間は指示より少し長くなりますが、このような指示でもある程度守ってくれます。

contents="""以下の「」内のテキストを、指示に従って読み上げてください。:
(指示="元気に"): 「こんにちは！今日はいい天気ですね。」
(指示="２秒待機") 
(指示="恐る恐る"): 「そう思いませんか？」
"""

指示がある場合は、proの方が指示通りでかつ自然な音声になる印象。

M Sea Bass

2話者の場合

声の設定を２話者分用意し、プロンプトで各セリフをどちらの話者が話すかを指定します。

# One entry per speaker: each pairs a speaker name with a prebuilt voice.
speaker_voice_configs=[
   # Voice assigned to the speaker named 'Joe'.
   types.SpeakerVoiceConfig(
      speaker='Joe',
      voice_config=types.VoiceConfig(
         prebuilt_voice_config=types.PrebuiltVoiceConfig(
            voice_name='Kore',
         )
      )
   ),
   # Voice assigned to the speaker named 'Jane'.
   types.SpeakerVoiceConfig(
      speaker='Jane',
      voice_config=types.VoiceConfig(
         prebuilt_voice_config=types.PrebuiltVoiceConfig(
            voice_name='Puck',
         )
      )
   ),
]

コード全文

import os
import wave
from google import genai
from google.genai import types

from dotenv import load_dotenv
# Load settings via python-dotenv before the environment is read below.
load_dotenv()

# API key handed to the Gemini client; None when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Set up the wave file to save the output:
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Save raw PCM audio bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Path of the WAV file to create.
        pcm: Raw PCM frame bytes to store as the audio data.
        channels: Number of audio channels written to the header.
        rate: Frame rate (samples per second) written to the header.
        sample_width: Sample width in bytes written to the header.
    """
    wav_params = (channels, sample_width, rate, 0, "NONE", "not compressed")
    out = wave.open(filename, "wb")
    with out:
        # Header parameters must be in place before any frames are written.
        out.setparams(wav_params)
        out.writeframes(pcm)

# Gemini API client, authenticated with the key read from the environment.
client = genai.Client(api_key=GEMINI_API_KEY)

# Each line of the prompt starts with the speaker name ("Joe:" / "Jane:"),
# matching the speaker names configured below.
# An English lead-in line was also tried as the first line of the prompt:
# prompt = """TTS the following conversation between Joe and Jane:
prompt = """Joe: こんにちは。本日はいい天気ですね、Jane。
Jane: どうも。今日は晴れですね。"""

# (speaker name, prebuilt voice name) pairs, in the order they are sent.
speaker_voices = (
    ('Joe', 'Kore'),
    ('Jane', 'Puck'),
)
speaker_configs = [
    types.SpeakerVoiceConfig(
        speaker=speaker_name,
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name=voice_name,
            )
        ),
    )
    for speaker_name, voice_name in speaker_voices
]
multi_speaker = types.MultiSpeakerVoiceConfig(
    speaker_voice_configs=speaker_configs,
)
tts_config = types.GenerateContentConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        multi_speaker_voice_config=multi_speaker,
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=prompt,
    config=tts_config,
)

# The audio bytes are read from the first part of the first candidate.
first_part = response.candidates[0].content.parts[0]
data = first_part.inline_data.data

file_name = 'out_2_speaker.wav'
wave_file(file_name, data)  # Saves the file to the current directory

M Sea Bass

streaming

client.models.generate_content_stream または client.aio.models.generate_content_stream で実行してみました。
しかし、チャンクごとには生成されず、一括で生成されてしまいました。
チャンクごとに生成させる方法ご存じの方いらしたら教えてください。

このスクラップは3ヶ月前にクローズされました