From 8acc027f383438003e60d2ca8d7376aca0c71e84 Mon Sep 17 00:00:00 2001 From: Simon <80467011+sorgfresser@users.noreply.github.com> Date: Sun, 8 Sep 2024 13:11:26 +0200 Subject: [PATCH] Add voice settings to ElevenLabs options flow (#123265) Add voice settings to options flow --- .../components/elevenlabs/config_flow.py | 87 ++++++++++- homeassistant/components/elevenlabs/const.py | 11 ++ .../components/elevenlabs/strings.json | 22 ++- homeassistant/components/elevenlabs/tts.py | 43 +++++- .../components/elevenlabs/test_config_flow.py | 59 +++++++- tests/components/elevenlabs/test_tts.py | 138 +++++++++++++++++- 6 files changed, 349 insertions(+), 11 deletions(-) diff --git a/homeassistant/components/elevenlabs/config_flow.py b/homeassistant/components/elevenlabs/config_flow.py index cf04304510a..6eec35d0583 100644 --- a/homeassistant/components/elevenlabs/config_flow.py +++ b/homeassistant/components/elevenlabs/config_flow.py @@ -23,7 +23,23 @@ from homeassistant.helpers.selector import ( SelectSelectorConfig, ) -from .const import CONF_MODEL, CONF_VOICE, DEFAULT_MODEL, DOMAIN +from .const import ( + CONF_CONFIGURE_VOICE, + CONF_MODEL, + CONF_OPTIMIZE_LATENCY, + CONF_SIMILARITY, + CONF_STABILITY, + CONF_STYLE, + CONF_USE_SPEAKER_BOOST, + CONF_VOICE, + DEFAULT_MODEL, + DEFAULT_OPTIMIZE_LATENCY, + DEFAULT_SIMILARITY, + DEFAULT_STABILITY, + DEFAULT_STYLE, + DEFAULT_USE_SPEAKER_BOOST, + DOMAIN, +) USER_STEP_SCHEMA = vol.Schema({vol.Required(CONF_API_KEY): str}) @@ -92,6 +108,8 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry): # id -> name self.voices: dict[str, str] = {} self.models: dict[str, str] = {} + self.model: str | None = None + self.voice: str | None = None async def async_step_init( self, user_input: dict[str, Any] | None = None @@ -103,6 +121,11 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry): assert self.models and self.voices if user_input is not None: + self.model = user_input[CONF_MODEL] + self.voice = user_input[CONF_VOICE] + configure_voice = user_input.pop(CONF_CONFIGURE_VOICE) + if configure_voice: + return await self.async_step_voice_settings() return self.async_create_entry( title="ElevenLabs", data=user_input, @@ -139,7 +162,69 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry): ] ) ), + vol.Required(CONF_CONFIGURE_VOICE, default=False): bool, } ), self.options, ) + + async def async_step_voice_settings( + self, user_input: dict[str, Any] | None = None + ) -> ConfigFlowResult: + """Handle voice settings.""" + assert self.voices and self.models + if user_input is not None: + user_input[CONF_MODEL] = self.model + user_input[CONF_VOICE] = self.voice + return self.async_create_entry( + title="ElevenLabs", + data=user_input, + ) + return self.async_show_form( + step_id="voice_settings", + data_schema=self.elevenlabs_config_options_voice_schema(), + ) + + def elevenlabs_config_options_voice_schema(self) -> vol.Schema: + """Elevenlabs options voice schema.""" + return vol.Schema( + { + vol.Optional( + CONF_STABILITY, + default=self.config_entry.options.get( + CONF_STABILITY, DEFAULT_STABILITY + ), + ): vol.All( + vol.Coerce(float), + vol.Range(min=0, max=1), + ), + vol.Optional( + CONF_SIMILARITY, + default=self.config_entry.options.get( + CONF_SIMILARITY, DEFAULT_SIMILARITY + ), + ): vol.All( + vol.Coerce(float), + vol.Range(min=0, max=1), + ), + vol.Optional( + CONF_OPTIMIZE_LATENCY, + default=self.config_entry.options.get( + CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY + ), + ): vol.All(int, vol.Range(min=0, max=4)), + vol.Optional( + CONF_STYLE, + default=self.config_entry.options.get(CONF_STYLE, DEFAULT_STYLE), + ): vol.All( + vol.Coerce(float), + vol.Range(min=0, max=1), + ), + vol.Optional( + CONF_USE_SPEAKER_BOOST, + default=self.config_entry.options.get( + CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST + ), + ): bool, + } + ) diff --git a/homeassistant/components/elevenlabs/const.py b/homeassistant/components/elevenlabs/const.py index c0fc3c7b1b0..040d38d272c 100644 --- a/homeassistant/components/elevenlabs/const.py +++ b/homeassistant/components/elevenlabs/const.py @@ -2,6 +2,17 @@ CONF_VOICE = "voice" CONF_MODEL = "model" +CONF_CONFIGURE_VOICE = "configure_voice" +CONF_STABILITY = "stability" +CONF_SIMILARITY = "similarity" +CONF_OPTIMIZE_LATENCY = "optimize_streaming_latency" +CONF_STYLE = "style" +CONF_USE_SPEAKER_BOOST = "use_speaker_boost" DOMAIN = "elevenlabs" DEFAULT_MODEL = "eleven_multilingual_v2" +DEFAULT_STABILITY = 0.5 +DEFAULT_SIMILARITY = 0.75 +DEFAULT_OPTIMIZE_LATENCY = 0 +DEFAULT_STYLE = 0 +DEFAULT_USE_SPEAKER_BOOST = True diff --git a/homeassistant/components/elevenlabs/strings.json b/homeassistant/components/elevenlabs/strings.json index 16b40137090..b346f94a963 100644 --- a/homeassistant/components/elevenlabs/strings.json +++ b/homeassistant/components/elevenlabs/strings.json @@ -19,11 +19,29 @@ "init": { "data": { "voice": "Voice", - "model": "Model" + "model": "Model", + "configure_voice": "Configure advanced voice settings" }, "data_description": { "voice": "Voice to use for the TTS.", - "model": "ElevenLabs model to use. Please note that not all models support all languages equally well." + "model": "ElevenLabs model to use. Please note that not all models support all languages equally well.", + "configure_voice": "Configure advanced voice settings. Find more information in the ElevenLabs documentation." + } + }, + "voice_settings": { + "data": { + "stability": "Stability", + "similarity": "Similarity", + "optimize_streaming_latency": "Latency", + "style": "Style", + "use_speaker_boost": "Speaker boost" + }, + "data_description": { + "stability": "Stability of the generated audio. Higher values lead to less emotional audio.", + "similarity": "Similarity of the generated audio to the original voice. Higher values may result in more similar audio, but may also introduce background noise.", + "optimize_streaming_latency": "Optimize the model for streaming. This may reduce the quality of the generated audio.", + "style": "Style of the generated audio. Recommended to keep at 0 for most almost all use cases.", + "use_speaker_boost": "Use speaker boost to increase the similarity of the generated audio to the original voice." } } } diff --git a/homeassistant/components/elevenlabs/tts.py b/homeassistant/components/elevenlabs/tts.py index 35ba6053cd8..e7f35775560 100644 --- a/homeassistant/components/elevenlabs/tts.py +++ b/homeassistant/components/elevenlabs/tts.py @@ -3,11 +3,12 @@ from __future__ import annotations import logging +from types import MappingProxyType from typing import Any from elevenlabs.client import AsyncElevenLabs from elevenlabs.core import ApiError -from elevenlabs.types import Model, Voice as ElevenLabsVoice +from elevenlabs.types import Model, Voice as ElevenLabsVoice, VoiceSettings from homeassistant.components.tts import ( ATTR_VOICE, @@ -21,11 +22,36 @@ from homeassistant.helpers.device_registry import DeviceEntryType, DeviceInfo from homeassistant.helpers.entity_platform import AddEntitiesCallback from . import EleventLabsConfigEntry -from .const import CONF_VOICE, DOMAIN +from .const import ( + CONF_OPTIMIZE_LATENCY, + CONF_SIMILARITY, + CONF_STABILITY, + CONF_STYLE, + CONF_USE_SPEAKER_BOOST, + CONF_VOICE, + DEFAULT_OPTIMIZE_LATENCY, + DEFAULT_SIMILARITY, + DEFAULT_STABILITY, + DEFAULT_STYLE, + DEFAULT_USE_SPEAKER_BOOST, + DOMAIN, +) _LOGGER = logging.getLogger(__name__) +def to_voice_settings(options: MappingProxyType[str, Any]) -> VoiceSettings: + """Return voice settings.""" + return VoiceSettings( + stability=options.get(CONF_STABILITY, DEFAULT_STABILITY), + similarity_boost=options.get(CONF_SIMILARITY, DEFAULT_SIMILARITY), + style=options.get(CONF_STYLE, DEFAULT_STYLE), + use_speaker_boost=options.get( + CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST + ), + ) + + async def async_setup_entry( hass: HomeAssistant, config_entry: EleventLabsConfigEntry, @@ -35,6 +61,7 @@ async def async_setup_entry( client = config_entry.runtime_data.client voices = (await client.voices.get_all()).voices default_voice_id = config_entry.options[CONF_VOICE] + voice_settings = to_voice_settings(config_entry.options) async_add_entities( [ ElevenLabsTTSEntity( @@ -44,6 +71,10 @@ async def async_setup_entry( default_voice_id, config_entry.entry_id, config_entry.title, + voice_settings, + config_entry.options.get( + CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY + ), ) ] ) @@ -62,6 +93,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity): default_voice_id: str, entry_id: str, title: str, + voice_settings: VoiceSettings, + latency: int = 0, ) -> None: """Init ElevenLabs TTS service.""" self._client = client @@ -77,6 +110,10 @@ class ElevenLabsTTSEntity(TextToSpeechEntity): ] if voice_indices: self._voices.insert(0, self._voices.pop(voice_indices[0])) + self._voice_settings = voice_settings + self._latency = latency + + # Entity attributes self._attr_unique_id = entry_id self._attr_name = title self._attr_device_info = DeviceInfo( @@ -105,6 +142,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity): audio = await self._client.generate( text=message, voice=voice_id, + optimize_streaming_latency=self._latency, + voice_settings=self._voice_settings, model=self._model.model_id, ) bytes_combined = b"".join([byte_seg async for byte_seg in audio]) diff --git a/tests/components/elevenlabs/test_config_flow.py b/tests/components/elevenlabs/test_config_flow.py index 853c49d48ff..971fa75939a 100644 --- a/tests/components/elevenlabs/test_config_flow.py +++ b/tests/components/elevenlabs/test_config_flow.py @@ -3,9 +3,20 @@ from unittest.mock import AsyncMock from homeassistant.components.elevenlabs.const import ( + CONF_CONFIGURE_VOICE, CONF_MODEL, + CONF_OPTIMIZE_LATENCY, + CONF_SIMILARITY, + CONF_STABILITY, + CONF_STYLE, + CONF_USE_SPEAKER_BOOST, CONF_VOICE, DEFAULT_MODEL, + DEFAULT_OPTIMIZE_LATENCY, + DEFAULT_SIMILARITY, + DEFAULT_STABILITY, + DEFAULT_STYLE, + DEFAULT_USE_SPEAKER_BOOST, DOMAIN, ) from homeassistant.config_entries import SOURCE_USER @@ -89,6 +100,52 @@ async def test_options_flow_init( ) assert result["type"] is FlowResultType.CREATE_ENTRY - assert mock_entry.options == {CONF_MODEL: "model1", CONF_VOICE: "voice1"} + assert mock_entry.options == { + CONF_MODEL: "model1", + CONF_VOICE: "voice1", + } mock_setup_entry.assert_called_once() + + +async def test_options_flow_voice_settings_default( + hass: HomeAssistant, + mock_setup_entry: AsyncMock, + mock_async_client: AsyncMock, + mock_entry: MockConfigEntry, +) -> None: + """Test options flow voice settings.""" + mock_entry.add_to_hass(hass) + assert await hass.config_entries.async_setup(mock_entry.entry_id) + await hass.async_block_till_done() + + result = await hass.config_entries.options.async_init(mock_entry.entry_id) + assert result["type"] is FlowResultType.FORM + assert result["step_id"] == "init" + + result = await hass.config_entries.options.async_configure( + result["flow_id"], + user_input={ + CONF_MODEL: "model1", + CONF_VOICE: "voice1", + CONF_CONFIGURE_VOICE: True, + }, + ) + + assert result["type"] is FlowResultType.FORM + assert result["step_id"] == "voice_settings" + + result = await hass.config_entries.options.async_configure( + result["flow_id"], + user_input={}, + ) + assert result["type"] is FlowResultType.CREATE_ENTRY + assert mock_entry.options == { + CONF_MODEL: "model1", + CONF_VOICE: "voice1", + CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY, + CONF_SIMILARITY: DEFAULT_SIMILARITY, + CONF_STABILITY: DEFAULT_STABILITY, + CONF_STYLE: DEFAULT_STYLE, + CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST, + } diff --git a/tests/components/elevenlabs/test_tts.py b/tests/components/elevenlabs/test_tts.py index 8b14ab26487..9ed96117daa 100644 --- a/tests/components/elevenlabs/test_tts.py +++ b/tests/components/elevenlabs/test_tts.py @@ -8,11 +8,25 @@ from typing import Any from unittest.mock import AsyncMock, MagicMock, patch from elevenlabs.core import ApiError -from elevenlabs.types import GetVoicesResponse +from elevenlabs.types import GetVoicesResponse, VoiceSettings import pytest from homeassistant.components import tts -from homeassistant.components.elevenlabs.const import CONF_MODEL, CONF_VOICE, DOMAIN +from homeassistant.components.elevenlabs.const import ( + CONF_MODEL, + CONF_OPTIMIZE_LATENCY, + CONF_SIMILARITY, + CONF_STABILITY, + CONF_STYLE, + CONF_USE_SPEAKER_BOOST, + CONF_VOICE, + DEFAULT_OPTIMIZE_LATENCY, + DEFAULT_SIMILARITY, + DEFAULT_STABILITY, + DEFAULT_STYLE, + DEFAULT_USE_SPEAKER_BOOST, + DOMAIN, +) from homeassistant.components.media_player import ( ATTR_MEDIA_CONTENT_ID, DOMAIN as DOMAIN_MP, @@ -53,17 +67,32 @@ async def setup_internal_url(hass: HomeAssistant) -> None: ) +@pytest.fixture +def mock_similarity(): + """Mock similarity.""" + return DEFAULT_SIMILARITY / 2 + + +@pytest.fixture +def mock_latency(): + """Mock latency.""" + return (DEFAULT_OPTIMIZE_LATENCY + 1) % 5 # 0, 1, 2, 3, 4 + + @pytest.fixture(name="setup") async def setup_fixture( hass: HomeAssistant, config_data: dict[str, Any], config_options: dict[str, Any], + config_options_voice: dict[str, Any], request: pytest.FixtureRequest, mock_async_client: AsyncMock, ) -> AsyncMock: """Set up the test environment.""" if request.param == "mock_config_entry_setup": await mock_config_entry_setup(hass, config_data, config_options) + elif request.param == "mock_config_entry_setup_voice": + await mock_config_entry_setup(hass, config_data, config_options_voice) else: raise RuntimeError("Invalid setup fixture") @@ -83,6 +112,18 @@ def config_options_fixture() -> dict[str, Any]: return {} +@pytest.fixture(name="config_options_voice") +def config_options_voice_fixture(mock_similarity, mock_latency) -> dict[str, Any]: + """Return config options.""" + return { + CONF_OPTIMIZE_LATENCY: mock_latency, + CONF_SIMILARITY: mock_similarity, + CONF_STABILITY: DEFAULT_STABILITY, + CONF_STYLE: DEFAULT_STYLE, + CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST, + } + + async def mock_config_entry_setup( hass: HomeAssistant, config_data: dict[str, Any], config_options: dict[str, Any] ) -> None: @@ -146,6 +187,12 @@ async def test_tts_service_speak( """Test tts service.""" tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID]) tts_entity._client.generate.reset_mock() + assert tts_entity._voice_settings == VoiceSettings( + stability=DEFAULT_STABILITY, + similarity_boost=DEFAULT_SIMILARITY, + style=DEFAULT_STYLE, + use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST, + ) await hass.services.async_call( tts.DOMAIN, @@ -161,7 +208,11 @@ async def test_tts_service_speak( ) tts_entity._client.generate.assert_called_once_with( - text="There is a person at the front door.", voice="voice2", model="model1" + text="There is a person at the front door.", + voice="voice2", + model="model1", + voice_settings=tts_entity._voice_settings, + optimize_streaming_latency=tts_entity._latency, ) @@ -219,7 +270,11 @@ async def test_tts_service_speak_lang_config( ) tts_entity._client.generate.assert_called_once_with( - text="There is a person at the front door.", voice="voice1", model="model1" + text="There is a person at the front door.", + voice="voice1", + model="model1", + voice_settings=tts_entity._voice_settings, + optimize_streaming_latency=tts_entity._latency, ) @@ -266,5 +321,78 @@ async def test_tts_service_speak_error( ) tts_entity._client.generate.assert_called_once_with( - text="There is a person at the front door.", voice="voice1", model="model1" + text="There is a person at the front door.", + voice="voice1", + model="model1", + voice_settings=tts_entity._voice_settings, + optimize_streaming_latency=tts_entity._latency, + ) + + +@pytest.mark.parametrize( + "config_data", + [ + {}, + {tts.CONF_LANG: "de"}, + {tts.CONF_LANG: "en"}, + {tts.CONF_LANG: "ja"}, + {tts.CONF_LANG: "es"}, + ], +) +@pytest.mark.parametrize( + ("setup", "tts_service", "service_data"), + [ + ( + "mock_config_entry_setup_voice", + "speak", + { + ATTR_ENTITY_ID: "tts.mock_title", + tts.ATTR_MEDIA_PLAYER_ENTITY_ID: "media_player.something", + tts.ATTR_MESSAGE: "There is a person at the front door.", + tts.ATTR_OPTIONS: {tts.ATTR_VOICE: "voice2"}, + }, + ), + ], + indirect=["setup"], +) +async def test_tts_service_speak_voice_settings( + setup: AsyncMock, + hass: HomeAssistant, + hass_client: ClientSessionGenerator, + calls: list[ServiceCall], + tts_service: str, + service_data: dict[str, Any], + mock_similarity: float, + mock_latency: int, +) -> None: + """Test tts service.""" + tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID]) + tts_entity._client.generate.reset_mock() + assert tts_entity._voice_settings == VoiceSettings( + stability=DEFAULT_STABILITY, + similarity_boost=mock_similarity, + style=DEFAULT_STYLE, + use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST, + ) + assert tts_entity._latency == mock_latency + + await hass.services.async_call( + tts.DOMAIN, + tts_service, + service_data, + blocking=True, + ) + + assert len(calls) == 1 + assert ( + await retrieve_media(hass, hass_client, calls[0].data[ATTR_MEDIA_CONTENT_ID]) + == HTTPStatus.OK + ) + + tts_entity._client.generate.assert_called_once_with( + text="There is a person at the front door.", + voice="voice2", + model="model1", + voice_settings=tts_entity._voice_settings, + optimize_streaming_latency=tts_entity._latency, )