From 3a8a8e881333ac5a289ad596ac60dcb50f2c87f5 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 9 May 2023 12:10:26 -0500 Subject: [PATCH 1/6] Use ffmpeg to convert Piper audio to mp3 --- .../components/wyoming/manifest.json | 1 + homeassistant/components/wyoming/tts.py | 54 +++++++++++++++++-- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/homeassistant/components/wyoming/manifest.json b/homeassistant/components/wyoming/manifest.json index 9ad8092bb8c..47eb54e9ea6 100644 --- a/homeassistant/components/wyoming/manifest.json +++ b/homeassistant/components/wyoming/manifest.json @@ -3,6 +3,7 @@ "name": "Wyoming Protocol", "codeowners": ["@balloob", "@synesthesiam"], "config_flow": true, + "dependencies": ["ffmpeg"], "documentation": "https://www.home-assistant.io/integrations/wyoming", "iot_class": "local_push", "requirements": ["wyoming==0.0.1"] diff --git a/homeassistant/components/wyoming/tts.py b/homeassistant/components/wyoming/tts.py index f2e314dc13e..57122ea8b1a 100644 --- a/homeassistant/components/wyoming/tts.py +++ b/homeassistant/components/wyoming/tts.py @@ -1,4 +1,5 @@ """Support for Wyoming text to speech services.""" +import asyncio from collections import defaultdict import io import logging @@ -8,7 +9,7 @@ from wyoming.audio import AudioChunk, AudioChunkConverter, AudioStop from wyoming.client import AsyncTcpClient from wyoming.tts import Synthesize -from homeassistant.components import tts +from homeassistant.components import ffmpeg, tts from homeassistant.config_entries import ConfigEntry from homeassistant.core import HomeAssistant, callback from homeassistant.helpers.entity_platform import AddEntitiesCallback @@ -18,6 +19,7 @@ from .data import WyomingService from .error import WyomingError _LOGGER = logging.getLogger(__name__) +_DEFAULT_FORMAT = "mp3" async def async_setup_entry( @@ -27,9 +29,10 @@ async def async_setup_entry( ) -> None: """Set up Wyoming speech to text.""" service: WyomingService = hass.data[DOMAIN][config_entry.entry_id] + ffmpeg_manager = ffmpeg.get_ffmpeg_manager(hass) async_add_entities( [ - WyomingTtsProvider(config_entry, service), + WyomingTtsProvider(config_entry, service, ffmpeg_manager), ] ) @@ -41,9 +44,11 @@ class WyomingTtsProvider(tts.TextToSpeechEntity): self, config_entry: ConfigEntry, service: WyomingService, + ffmpeg_manager: ffmpeg.FFmpegManager, ) -> None: """Set up provider.""" self.service = service + self._ffmpeg_manager = ffmpeg_manager self._tts_service = next(tts for tts in service.info.tts if tts.installed) voice_languages: set[str] = set() @@ -87,7 +92,7 @@ class WyomingTtsProvider(tts.TextToSpeechEntity): @property def default_options(self): """Return a dict include default options.""" - return {tts.ATTR_AUDIO_OUTPUT: "wav"} + return {tts.ATTR_AUDIO_OUTPUT: _DEFAULT_FORMAT} @callback def async_get_supported_voices(self, language: str) -> list[tts.Voice] | None: @@ -129,9 +134,20 @@ class WyomingTtsProvider(tts.TextToSpeechEntity): except (OSError, WyomingError): return (None, None) - if (options is None) or (options[tts.ATTR_AUDIO_OUTPUT] == "wav"): + if options is None: + output_format = _DEFAULT_FORMAT + else: + output_format = options.get(tts.ATTR_AUDIO_OUTPUT, _DEFAULT_FORMAT) + + if output_format == "wav": + # Already WAV data return ("wav", data) + if output_format != "raw": + # Convert with ffmpeg + converted_data = await self._convert_audio(data, output_format) + return (output_format, converted_data) + # Raw output (convert to 16Khz, 16-bit mono) with io.BytesIO(data) as wav_io: wav_reader: wave.Wave_read = wave.open(wav_io, "rb") @@ -153,3 +169,33 @@ class WyomingTtsProvider(tts.TextToSpeechEntity): ) return ("raw", raw_data) + + async def _convert_audio(self, wav_data: bytes, output_format: str) -> bytes: + """Convert from WAV to a different format using ffmpeg asynchronously.""" + ffmpeg_input = [ + "-f", + "wav", + "-i", + "pipe:", # input from stdin + ] + ffmpeg_output = [ + "-f", + output_format, + ] + + if output_format == "mp3": + ffmpeg_output.extend(["-q:a", "0"]) # max quality + + ffmpeg_output.append("pipe:") # output to stdout + + ffmpeg_proc = await asyncio.create_subprocess_exec( + self._ffmpeg_manager.binary, + *ffmpeg_input, + *ffmpeg_output, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + + stdout, _stderr = await ffmpeg_proc.communicate(input=wav_data) + return stdout From fa7a5c7bc1b1a31982f66f4ac1a2b6c7f45159ab Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 9 May 2023 12:12:40 -0500 Subject: [PATCH 2/6] Fix test --- tests/components/wyoming/test_tts.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py index f2a10710c26..1b58f93b9f2 100644 --- a/tests/components/wyoming/test_tts.py +++ b/tests/components/wyoming/test_tts.py @@ -1,9 +1,7 @@ """Test tts.""" from __future__ import annotations -import io from unittest.mock import patch -import wave import pytest from wyoming.audio import AudioChunk, AudioStop @@ -56,14 +54,8 @@ async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> tts.generate_media_source_id(hass, "Hello world", "tts.test_tts", "en-US"), ) - assert extension == "wav" + assert extension == "mp3" assert data is not None - with io.BytesIO(data) as wav_io, wave.open(wav_io, "rb") as wav_file: - assert wav_file.getframerate() == 16000 - assert wav_file.getsampwidth() == 2 - assert wav_file.getnchannels() == 1 - assert wav_file.readframes(wav_file.getnframes()) == audio - assert mock_client.written == snapshot From 86a6b941d2254b27708379c853c98406a9340bc6 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 9 May 2023 13:52:09 -0500 Subject: [PATCH 3/6] Test empty options too --- .../wyoming/snapshots/test_tts.ambr | 11 ++++++++++ tests/components/wyoming/test_tts.py | 22 ++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/components/wyoming/snapshots/test_tts.ambr b/tests/components/wyoming/snapshots/test_tts.ambr index eb0b33c3276..e869d01c684 100644 --- a/tests/components/wyoming/snapshots/test_tts.ambr +++ b/tests/components/wyoming/snapshots/test_tts.ambr @@ -10,6 +10,17 @@ }), ]) # --- +# name: test_get_tts_audio.1 + list([ + dict({ + 'data': dict({ + 'text': 'Hello world', + }), + 'payload': None, + 'type': 'synthesize', + }), + ]) +# --- # name: test_get_tts_audio_raw list([ dict({ diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py index 1b58f93b9f2..30bbb11c4d6 100644 --- a/tests/components/wyoming/test_tts.py +++ b/tests/components/wyoming/test_tts.py @@ -54,9 +54,25 @@ async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> tts.generate_media_source_id(hass, "Hello world", "tts.test_tts", "en-US"), ) - assert extension == "mp3" - assert data is not None - assert mock_client.written == snapshot + assert extension == "mp3" + assert data is not None + assert mock_client.written == snapshot + + # Test empty options too + extension, data = await tts.async_get_media_source_audio( + hass, + tts.generate_media_source_id( + hass, + "Hello world", + "tts.test_tts", + "en-US", + options=None, + ), + ) + + assert extension == "mp3" + assert data is not None + assert mock_client.written == snapshot async def test_get_tts_audio_raw( From d57d8e21f66c3b81c09d85659bd44e4058060b7c Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 9 May 2023 14:45:25 -0500 Subject: [PATCH 4/6] Appeasing the codebot --- tests/components/wyoming/test_tts.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py index 30bbb11c4d6..e7daccaa4cf 100644 --- a/tests/components/wyoming/test_tts.py +++ b/tests/components/wyoming/test_tts.py @@ -30,6 +30,7 @@ async def test_support(hass: HomeAssistant, init_wyoming_tts) -> None: assert entity.supported_languages == ["en-US"] assert entity.supported_options == [tts.ATTR_AUDIO_OUTPUT, tts.ATTR_VOICE] + assert entity.default_language == "en-US" voices = entity.async_get_supported_voices("en-US") assert len(voices) == 1 assert voices[0].name == "Test Voice" @@ -58,22 +59,6 @@ async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> assert data is not None assert mock_client.written == snapshot - # Test empty options too - extension, data = await tts.async_get_media_source_audio( - hass, - tts.generate_media_source_id( - hass, - "Hello world", - "tts.test_tts", - "en-US", - options=None, - ), - ) - - assert extension == "mp3" - assert data is not None - assert mock_client.written == snapshot - async def test_get_tts_audio_raw( hass: HomeAssistant, init_wyoming_tts, snapshot From c64ac285e0b94d0b271ec2d00cb78c5e55d004bc Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 10 May 2023 11:49:38 -0500 Subject: [PATCH 5/6] Increase test coverage --- .../wyoming/snapshots/test_tts.ambr | 33 +++++++++++ tests/components/wyoming/test_tts.py | 59 ++++++++++++++++--- 2 files changed, 83 insertions(+), 9 deletions(-) diff --git a/tests/components/wyoming/snapshots/test_tts.ambr b/tests/components/wyoming/snapshots/test_tts.ambr index e869d01c684..f69d9f1940f 100644 --- a/tests/components/wyoming/snapshots/test_tts.ambr +++ b/tests/components/wyoming/snapshots/test_tts.ambr @@ -21,6 +21,28 @@ }), ]) # --- +# name: test_get_tts_audio_format[raw] + list([ + dict({ + 'data': dict({ + 'text': 'Hello world', + }), + 'payload': None, + 'type': 'synthesize', + }), + ]) +# --- +# name: test_get_tts_audio_format[wav] + list([ + dict({ + 'data': dict({ + 'text': 'Hello world', + }), + 'payload': None, + 'type': 'synthesize', + }), + ]) +# --- # name: test_get_tts_audio_raw list([ dict({ @@ -32,3 +54,14 @@ }), ]) # --- +# name: test_no_options + list([ + dict({ + 'data': dict({ + 'text': 'Hello world', + }), + 'payload': None, + 'type': 'synthesize', + }), + ]) +# --- diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py index e7daccaa4cf..085bacf70a7 100644 --- a/tests/components/wyoming/test_tts.py +++ b/tests/components/wyoming/test_tts.py @@ -1,7 +1,9 @@ """Test tts.""" from __future__ import annotations +import io from unittest.mock import patch +import wave import pytest from wyoming.audio import AudioChunk, AudioStop @@ -38,6 +40,33 @@ async def test_support(hass: HomeAssistant, init_wyoming_tts) -> None: assert not entity.async_get_supported_voices("de-DE") +async def test_no_options(hass: HomeAssistant, init_wyoming_tts, snapshot) -> None: + """Test options=None.""" + audio = bytes(100) + audio_events = [ + AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(), + AudioStop().event(), + ] + + state = hass.states.get("tts.test_tts") + assert state is not None + + entity = hass.data[DATA_INSTANCES]["tts"].get_entity("tts.test_tts") + assert entity is not None + + with patch( + "homeassistant.components.wyoming.tts.AsyncTcpClient", + MockAsyncTcpClient(audio_events), + ) as mock_client: + extension, data = await entity.async_get_tts_audio( + "Hello world", "en-US", options=None + ) + + assert extension == "mp3" + assert data is not None + assert mock_client.written == snapshot + + async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> None: """Test get audio.""" audio = bytes(100) @@ -60,10 +89,14 @@ async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> assert mock_client.written == snapshot -async def test_get_tts_audio_raw( - hass: HomeAssistant, init_wyoming_tts, snapshot +@pytest.mark.parametrize( + "audio_format", + [("wav",), ("raw",)], +) +async def test_get_tts_audio_format( + hass: HomeAssistant, init_wyoming_tts, snapshot, audio_format: str ) -> None: - """Test get raw audio.""" + """Test get audio in a specific format.""" audio = bytes(100) audio_events = [ AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(), @@ -81,12 +114,22 @@ async def test_get_tts_audio_raw( "Hello world", "tts.test_tts", "en-US", - options={tts.ATTR_AUDIO_OUTPUT: "raw"}, + options={tts.ATTR_AUDIO_OUTPUT: audio_format}, ), ) - assert extension == "raw" - assert data == audio + assert extension == audio_format + + if audio_format == "raw": + assert data == audio + else: + # Verify WAV audio + with io.BytesIO(data) as wav_io, wave.open(wav_io, "rb") as wav_file: + assert wav_file.getframerate() == 16000 + assert wav_file.getsampwidth() == 2 + assert wav_file.getnchannels() == 1 + assert wav_file.readframes(wav_file.getnframes()) == audio + assert mock_client.written == snapshot @@ -126,7 +169,5 @@ async def test_get_tts_audio_audio_oserror( ): await tts.async_get_media_source_audio( hass, - tts.generate_media_source_id( - hass, "Hello world", "tts.test_tts", hass.config.language - ), + tts.generate_media_source_id(hass, "Hello world", "tts.test_tts", "en-US"), ) From 28a9ceee3e816f00baead8f75002ce0eca3db987 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 10 May 2023 12:13:11 -0500 Subject: [PATCH 6/6] Fix parameterize --- tests/components/wyoming/test_tts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py index 085bacf70a7..1a883709f2a 100644 --- a/tests/components/wyoming/test_tts.py +++ b/tests/components/wyoming/test_tts.py @@ -89,10 +89,7 @@ async def test_get_tts_audio(hass: HomeAssistant, init_wyoming_tts, snapshot) -> assert mock_client.written == snapshot -@pytest.mark.parametrize( - "audio_format", - [("wav",), ("raw",)], -) +@pytest.mark.parametrize("audio_format", ("wav", "raw")) async def test_get_tts_audio_format( hass: HomeAssistant, init_wyoming_tts, snapshot, audio_format: str ) -> None: