👏

flaskを使ってApple M1 で MediaRecorder で取得した音声をWhisper APIで利用する時に非常にハマった

2023/10/01に公開

openinterpreterに実装を頼んだけど、なかなか解決しなかった。音声入力のデバッグが難しかった。seleniumで工夫したらいけるかもしれない。

アップロードされたファイルを受け取ったら、保存する前に seek(0) を実行してストリームの読み取り位置を先頭に戻す

recorded_file.stream.seek(0)

io（io.BytesIO など）を使えば、
いちいちローカルにファイルを保存しなくても良いはずだけれど、
それは後で検証する

from flask import Flask, jsonify, render_template
from flask import request
import base64
import requests
import os
import openai

# Flask application instance that the route below is registered on.
app = Flask(__name__)
# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (os.getenv returns None when the variable is not set).
openai.api_key = os.getenv("OPENAI_API_KEY")

@app.route('/record_and_convert', methods=['POST'])
def record_and_convert():
    """Transcribe an uploaded audio recording with the Whisper API.

    Expects a multipart/form-data POST whose ``audio`` field holds the
    recording. The upload is copied into a per-request temporary ``.webm``
    file, which is handed to ``openai.Audio.transcribe`` and removed again
    when the ``with`` block exits.

    Returns:
        A JSON response of the form ``{"speech": <transcribed text>}``.
    """
    # Imported locally so this block does not depend on the file-level imports.
    import tempfile

    recorded_file = request.files['audio']
    # Rewind the upload stream so the recording is copied from its first byte.
    recorded_file.stream.seek(0)

    # A unique temporary file (instead of a fixed 'file.webm') keeps concurrent
    # requests from overwriting each other's audio, and the context manager
    # guarantees the handle is closed even if the API call raises.
    with tempfile.NamedTemporaryFile(suffix='.webm') as audio_file:
        recorded_file.save(audio_file)
        # save() leaves the position at the end; rewind before the upload reads it.
        audio_file.seek(0)
        print(audio_file)

        speech = openai.Audio.transcribe("whisper-1", audio_file)
    print(speech)

    return jsonify({'speech': speech.text})

# When run as a script, start Flask's built-in server listening on all
# network interfaces (0.0.0.0) with debug mode turned off.
if __name__ == '__main__':
    app.run(host="0.0.0.0", debug=False)
<button class="btn waves-effect waves-light" id="record-button">Start Recording</button>
<button class="btn waves-effect waves-light" disabled="" id="stop-button">Stop Recording and Upload</button>
<textarea id="speech-text"></textarea>
<button class="btn waves-effect waves-light" id="format-button">Format as medical record</button>
<div id="formatted-text"></div>
<script>
  var mediaRecorder;
  var audioChunks = [];

  // Start recording: ask for microphone access, then collect the recorder's
  // audio/webm chunks into the shared audioChunks array.
  document.getElementById('record-button').addEventListener('click', function() {
    navigator.mediaDevices.getUserMedia({ audio: true })
      .then(function(stream) {
        // Drop anything left over from a previous recording (for example a
        // final chunk delivered after the last upload was assembled) so it
        // cannot be prepended to this one.
        audioChunks = [];

        mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });

        mediaRecorder.addEventListener('dataavailable', function(event) {
          audioChunks.push(event.data);
        });

        // timeslice of 1000 ms: a dataavailable event is fired every second
        mediaRecorder.start(1000);

        document.getElementById('stop-button').disabled = false;
      })
      .catch(function(error) {
        // Reached when microphone access is denied/unavailable or when the
        // MediaRecorder constructor rejects the requested mimeType.
        console.error('Error:', error);
      });
  });

  // Stop recording and upload the result to /record_and_convert, then show
  // the returned transcription in the #speech-text textarea.
  document.getElementById('stop-button').addEventListener('click', function() {
    // MediaRecorder.stop() is asynchronous: the last dataavailable event is
    // delivered after stop() returns, followed by the stop event. Assemble
    // and upload the recording in the stop event so the final chunk is
    // included.
    mediaRecorder.addEventListener('stop', function() {
      let recordedBlob = new Blob(audioChunks, { type: 'audio/webm' });
      console.log(recordedBlob);

      let audioFile = new File([recordedBlob], 'recored.webm', { type: "audio/webm" });

      var formData = new FormData();
      formData.append('audio', audioFile);

      console.log(audioFile);

      audioChunks = [];

      // Release the microphone now that the recording is complete.
      mediaRecorder.stream.getTracks().forEach(function(track) {
        track.stop();
      });

      fetch('/record_and_convert', {
        method: 'POST',
        body: formData
      })
      .then(response => {
        console.log('Response:', response);
        return response.json();
      })
      .then(data => {
        console.log('Data:', data);
        document.getElementById('speech-text').value = data.speech;
      })
      .catch(error => {
        console.error('Error:', error);
      });
    }, { once: true });

    mediaRecorder.stop();

    document.getElementById('stop-button').disabled = true;
  });
</script>

https://developer.mozilla.org/ja/docs/Web/API/MediaRecorder/dataavailable_event

Discussion