Open11

【Python】最近話題の音声認識ツールキットのVOSKを試してみる

k8shirok8shiro

適当な日本語wavファイルを探してきて読み込ませたら、wave.Error: unknown format: 3 で怒られた(format 3 は IEEE float 形式のWAVで、標準ライブラリの wave では読み込めない)。

k8shirok8shiro

pythonのwaveが読み込める日本語音声ファイルがなかなか見つからない。
https://www3.jvckenwood.com/pro/soft_dl/pa-d_message/aisatu.html
で公開されている音声は使えた。

k8shirok8shiro

こんな感じで動く

    }, {
      "conf" : 0.956470,
      "end" : 1.320000,
      "start" : 1.080000,
      "word" : "ない"
    }, {
      "conf" : 0.956470,
      "end" : 1.650000,
      "start" : 1.320000,
      "word" : "よう"
    }, {
      "conf" : 0.956470,
      "end" : 2.250000,
      "start" : 1.980000,
      "word" : "お気"
    }, {
      "conf" : 1.000000,
      "end" : 2.340000,
      "start" : 2.278694,
      "word" : "を"
    }, {
      "conf" : 0.683808,
      "end" : 2.580000,
      "start" : 2.340000,
      "word" : "付け"
    }, {
      "conf" : 1.000000,
      "end" : 2.790000,
      "start" : 2.610000,
      "word" : "て"
    }, {
      "conf" : 0.439423,
      "end" : 2.910000,
      "start" : 2.820000,
      "word" : "お"
    }, {
      "conf" : 0.297040,
      "end" : 3.330000,
      "start" : 3.030000,
      "word" : "遜"
    }]
}
{
  "partial" : "お 忘れ物 の ない よう お気 を 付け て お 遜",
  "partial_result" : [{
      "conf" : 0.875089,
      "end" : 0.359229,
      "start" : 0.270000,
      "word" : "お"
    }, {
      "conf" : 0.941267,
      "end" : 0.930000,
      "start" : 0.359229,
      "word" : "忘れ物"
    }, {
      "conf" : 1.000000,
      "end" : 1.080000,
      "start" : 0.930000,
      "word" : "の"
    }, {
      "conf" : 0.956470,
      "end" : 1.320000,
      "start" : 1.080000,
      "word" : "ない"
    }, {
      "conf" : 0.956470,
      "end" : 1.650000,
      "start" : 1.320000,
      "word" : "よう"
    }, {
      "conf" : 0.956470,
      "end" : 2.250000,
      "start" : 1.980000,
      "word" : "お気"
    }, {
      "conf" : 1.000000,
      "end" : 2.340000,
      "start" : 2.278694,
      "word" : "を"
    }, {
      "conf" : 0.683808,
      "end" : 2.580000,
      "start" : 2.340000,
      "word" : "付け"
    }, {
      "conf" : 1.000000,
      "end" : 2.790000,
      "start" : 2.610000,
      "word" : "て"
    }, {
      "conf" : 0.439423,
      "end" : 2.910000,
      "start" : 2.820000,
      "word" : "お"
    }, {
      "conf" : 0.297040,
      "end" : 3.330000,
      "start" : 3.030000,
      "word" : "遜"
    }]
}
{
  "result" : [{
      "conf" : 0.867830,
      "end" : 0.359463,
      "start" : 0.270000,
      "word" : "お"
    }, {
      "conf" : 0.914248,
      "end" : 0.930000,
      "start" : 0.359463,
      "word" : "忘れ物"
    }, {
      "conf" : 1.000000,
      "end" : 1.080000,
      "start" : 0.930000,
      "word" : "の"
    }, {
      "conf" : 0.962447,
      "end" : 1.320000,
      "start" : 1.080000,
      "word" : "ない"
    }, {
      "conf" : 0.962447,
      "end" : 1.650000,
      "start" : 1.320000,
      "word" : "よう"
    }, {
      "conf" : 0.962447,
      "end" : 2.250000,
      "start" : 1.980000,
      "word" : "お気"
    }, {
      "conf" : 1.000000,
      "end" : 2.340000,
      "start" : 2.278873,
      "word" : "を"
    }, {
      "conf" : 0.696247,
      "end" : 2.580000,
      "start" : 2.340000,
      "word" : "付け"
    }, {
      "conf" : 1.000000,
      "end" : 2.790000,
      "start" : 2.610000,
      "word" : "て"
    }, {
      "conf" : 0.621879,
      "end" : 2.910000,
      "start" : 2.820000,
      "word" : "お"
    }, {
      "conf" : 0.784194,
      "end" : 3.600000,
      "start" : 3.180000,
      "word" : "ください"
    }, {
      "conf" : 1.000000,
      "end" : 3.960000,
      "start" : 3.630000,
      "word" : "ませ"
    }],
  "text" : "お 忘れ物 の ない よう お気 を 付け て お ください ませ"
}
{
  "partial" : ""
}
{
  "partial" : ""
}
{
  "text" : ""
}
k8shirok8shiro

"Audio file must be WAV format mono PCM."とあるとおりモノラルのPCMしか受け付けないので、2チャンネル(ステレオ)のファイルはそのままだと動かないっぽい。

k8shirok8shiro

拡張子がwavのファイルでも、Pythonのwaveモジュールでは開けないものもあるっぽい?

k8shirok8shiro

SoundFileを使ってchannel 2やwaveでは読み込めないファイルにも対応してみた。

#!/usr/bin/env python3

# waveだけでは読み込めないファイルへの対応

from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import soundfile
import wave

# Directory of the unpacked VOSK model that the recognizer loads.
MODEL_PATH = "/model/vosk-model-small-ja-0.22"
# Intermediate mono 16-bit PCM file; it is overwritten on every run and is
# intentionally left on disk afterwards so it can be inspected.
TMP_WAV_PATH = "tmp.wav"
# Number of audio frames handed to the recognizer per iteration.
FRAMES_PER_CHUNK = 4000


def downmix_to_mono(samples, channels):
    """Return *samples* averaged across channels.

    Args:
        samples: Array as returned by ``SoundFile.read``: one-dimensional
            for mono input, ``(frames, channels)`` for multi-channel input.
        channels: Channel count reported for the source file.

    Returns:
        ``samples`` itself when ``channels`` is 1, otherwise a
        one-dimensional array holding the per-frame mean of all channels.
    """
    if channels == 1:
        return samples
    # Vectorized equivalent of sum(frame) / channels for every frame.
    return samples.mean(axis=1)


def convert_to_mono_pcm16(src_path, dst_path):
    """Decode *src_path* with soundfile and rewrite it as mono 16-bit PCM WAV.

    The source's channel count, container format and subtype, followed by
    the decoded samples, are printed for diagnostics.
    """
    with soundfile.SoundFile(src_path) as sf:
        samples = sf.read(-1)
        print(sf.channels)
        print(sf.format)
        print(sf.subtype)
        print(samples)
        mono = downmix_to_mono(samples, sf.channels)
        samplerate = sf.samplerate
    # Request 16-bit PCM explicitly: the reader below rejects anything else.
    soundfile.write(dst_path, mono, samplerate, subtype="PCM_16")


def transcribe(wav_path, model_path):
    """Run VOSK on *wav_path*, printing partial, intermediate and final results.

    Exits with status 1 if the file is not mono, 16-bit, uncompressed PCM.
    """
    with wave.open(wav_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            sys.exit(1)

        rec = KaldiRecognizer(Model(model_path), wf.getframerate())
        rec.SetWords(True)
        rec.SetPartialWords(True)

        while True:
            chunk = wf.readframes(FRAMES_PER_CHUNK)
            if not chunk:
                break
            if rec.AcceptWaveform(chunk):
                print(rec.Result())
            else:
                print(rec.PartialResult())

        print(rec.FinalResult())


def main():
    """Transcribe the audio file named by the first command-line argument."""
    SetLogLevel(0)

    # Check the directory that is actually loaded, not just its parent.
    if not os.path.exists(MODEL_PATH):
        print(
            "Please download the model from https://alphacephei.com/vosk/models "
            f"and unpack it as '{MODEL_PATH}'."
        )
        sys.exit(1)

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} AUDIO_FILE", file=sys.stderr)
        sys.exit(1)

    convert_to_mono_pcm16(sys.argv[1], TMP_WAV_PATH)
    transcribe(TMP_WAV_PATH, MODEL_PATH)


if __name__ == "__main__":
    main()