Automatically convert TTS audio to MP3 on demand (#102814)

* Add ATTR_PREFERRED_FORMAT to TTS for auto-converting audio

* Move conversion into SpeechManager

* Handle None case for expected_extension

* Only use ATTR_AUDIO_OUTPUT

* Prefer MP3 in pipelines

* Automatically convert to mp3 on demand

* Add preferred audio format

* Break out preferred format

* Add ATTR_BLOCKING to allow async fetching

* Make a copy of supported options

* Fix MaryTTS tests

* Update ESPHome to use "wav" instead of "raw"

* Clean up tests, remove blocking

* Clean up rest of TTS tests

* Fix ESPHome tests

* More test coverage
This commit is contained in:
Michael Hansen 2023-11-06 14:26:00 -06:00 committed by GitHub
parent 054089291f
commit ae516ffbb5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 723 additions and 241 deletions

View file

@ -4,7 +4,7 @@ import io
import logging
import wave
from wyoming.audio import AudioChunk, AudioChunkConverter, AudioStop
from wyoming.audio import AudioChunk, AudioStop
from wyoming.client import AsyncTcpClient
from wyoming.tts import Synthesize, SynthesizeVoice
@ -88,12 +88,16 @@ class WyomingTtsProvider(tts.TextToSpeechEntity):
@property
def supported_options(self):
"""Return list of supported options like voice, emotion."""
return [tts.ATTR_AUDIO_OUTPUT, tts.ATTR_VOICE, ATTR_SPEAKER]
return [
tts.ATTR_AUDIO_OUTPUT,
tts.ATTR_VOICE,
ATTR_SPEAKER,
]
@property
def default_options(self):
"""Return a dict include default options."""
return {tts.ATTR_AUDIO_OUTPUT: "wav"}
return {}
@callback
def async_get_supported_voices(self, language: str) -> list[tts.Voice] | None:
@ -143,27 +147,4 @@ class WyomingTtsProvider(tts.TextToSpeechEntity):
except (OSError, WyomingError):
return (None, None)
if options[tts.ATTR_AUDIO_OUTPUT] == "wav":
return ("wav", data)
# Raw output (convert to 16Khz, 16-bit mono)
with io.BytesIO(data) as wav_io:
wav_reader: wave.Wave_read = wave.open(wav_io, "rb")
raw_data = (
AudioChunkConverter(
rate=16000,
width=2,
channels=1,
)
.convert(
AudioChunk(
audio=wav_reader.readframes(wav_reader.getnframes()),
rate=wav_reader.getframerate(),
width=wav_reader.getsampwidth(),
channels=wav_reader.getnchannels(),
)
)
.audio
)
return ("raw", raw_data)
return ("wav", data)