Add voice settings to ElevenLabs options flow (#123265)

Add voice settings to options flow
Simon 2024-09-08 13:11:26 +02:00 committed by GitHub
parent 3139a7e431
commit 8acc027f38
6 changed files with 349 additions and 11 deletions

homeassistant/components/elevenlabs/config_flow.py

@@ -23,7 +23,23 @@ from homeassistant.helpers.selector import (
SelectSelectorConfig,
)
from .const import CONF_MODEL, CONF_VOICE, DEFAULT_MODEL, DOMAIN
from .const import (
CONF_CONFIGURE_VOICE,
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_MODEL,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
USER_STEP_SCHEMA = vol.Schema({vol.Required(CONF_API_KEY): str})
@@ -92,6 +108,8 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
# id -> name
self.voices: dict[str, str] = {}
self.models: dict[str, str] = {}
self.model: str | None = None
self.voice: str | None = None
async def async_step_init(
self, user_input: dict[str, Any] | None = None
@@ -103,6 +121,11 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
assert self.models and self.voices
if user_input is not None:
self.model = user_input[CONF_MODEL]
self.voice = user_input[CONF_VOICE]
configure_voice = user_input.pop(CONF_CONFIGURE_VOICE)
if configure_voice:
return await self.async_step_voice_settings()
return self.async_create_entry(
title="ElevenLabs",
data=user_input,
@@ -139,7 +162,69 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
]
)
),
vol.Required(CONF_CONFIGURE_VOICE, default=False): bool,
}
),
self.options,
)
async def async_step_voice_settings(
self, user_input: dict[str, Any] | None = None
) -> ConfigFlowResult:
"""Handle voice settings."""
assert self.voices and self.models
if user_input is not None:
user_input[CONF_MODEL] = self.model
user_input[CONF_VOICE] = self.voice
return self.async_create_entry(
title="ElevenLabs",
data=user_input,
)
return self.async_show_form(
step_id="voice_settings",
data_schema=self.elevenlabs_config_options_voice_schema(),
)
def elevenlabs_config_options_voice_schema(self) -> vol.Schema:
"""Elevenlabs options voice schema."""
return vol.Schema(
{
vol.Optional(
CONF_STABILITY,
default=self.config_entry.options.get(
CONF_STABILITY, DEFAULT_STABILITY
),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_SIMILARITY,
default=self.config_entry.options.get(
CONF_SIMILARITY, DEFAULT_SIMILARITY
),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_OPTIMIZE_LATENCY,
default=self.config_entry.options.get(
CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
),
): vol.All(int, vol.Range(min=0, max=4)),
vol.Optional(
CONF_STYLE,
default=self.config_entry.options.get(CONF_STYLE, DEFAULT_STYLE),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_USE_SPEAKER_BOOST,
default=self.config_entry.options.get(
CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
),
): bool,
}
)
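The voice-settings schema above relies on voluptuous to coerce and range-check each field and to default missing fields from the stored options. A minimal standalone sketch of that behaviour (the key name and input values here are illustrative, not taken from the integration):

import voluptuous as vol

# Illustrative stand-in for the stability validator defined above
schema = vol.Schema(
    {
        vol.Optional("stability", default=0.5): vol.All(
            vol.Coerce(float), vol.Range(min=0, max=1)
        ),
    }
)

print(schema({}))                    # {'stability': 0.5} -- default applied
print(schema({"stability": "0.8"}))  # {'stability': 0.8} -- string coerced to float
# schema({"stability": 1.5}) raises vol.MultipleInvalid (out of range)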

homeassistant/components/elevenlabs/const.py

@@ -2,6 +2,17 @@
CONF_VOICE = "voice"
CONF_MODEL = "model"
CONF_CONFIGURE_VOICE = "configure_voice"
CONF_STABILITY = "stability"
CONF_SIMILARITY = "similarity"
CONF_OPTIMIZE_LATENCY = "optimize_streaming_latency"
CONF_STYLE = "style"
CONF_USE_SPEAKER_BOOST = "use_speaker_boost"
DOMAIN = "elevenlabs"
DEFAULT_MODEL = "eleven_multilingual_v2"
DEFAULT_STABILITY = 0.5
DEFAULT_SIMILARITY = 0.75
DEFAULT_OPTIMIZE_LATENCY = 0
DEFAULT_STYLE = 0
DEFAULT_USE_SPEAKER_BOOST = True
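Taken together, a config entry that has been through both options steps stores a payload shaped roughly like the sketch below (the voice value is a placeholder; the remaining values are the defaults above):

# Hypothetical options payload after completing the init and voice_settings steps
options = {
    CONF_MODEL: DEFAULT_MODEL,                          # "eleven_multilingual_v2"
    CONF_VOICE: "voice1",                               # placeholder voice id
    CONF_STABILITY: DEFAULT_STABILITY,                  # 0.5
    CONF_SIMILARITY: DEFAULT_SIMILARITY,                # 0.75
    CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY,    # 0
    CONF_STYLE: DEFAULT_STYLE,                          # 0
    CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,  # True
}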

homeassistant/components/elevenlabs/strings.json

@@ -19,11 +19,29 @@
"init": {
"data": {
"voice": "Voice",
"model": "Model"
"model": "Model",
"configure_voice": "Configure advanced voice settings"
},
"data_description": {
"voice": "Voice to use for the TTS.",
"model": "ElevenLabs model to use. Please note that not all models support all languages equally well."
"model": "ElevenLabs model to use. Please note that not all models support all languages equally well.",
"configure_voice": "Configure advanced voice settings. Find more information in the ElevenLabs documentation."
}
},
"voice_settings": {
"data": {
"stability": "Stability",
"similarity": "Similarity",
"optimize_streaming_latency": "Latency",
"style": "Style",
"use_speaker_boost": "Speaker boost"
},
"data_description": {
"stability": "Stability of the generated audio. Higher values lead to less emotional audio.",
"similarity": "Similarity of the generated audio to the original voice. Higher values may result in more similar audio, but may also introduce background noise.",
"optimize_streaming_latency": "Optimize the model for streaming. This may reduce the quality of the generated audio.",
"style": "Style of the generated audio. Recommended to keep at 0 for most almost all use cases.",
"use_speaker_boost": "Use speaker boost to increase the similarity of the generated audio to the original voice."
}
}
}

homeassistant/components/elevenlabs/tts.py

@@ -3,11 +3,12 @@
from __future__ import annotations
import logging
from types import MappingProxyType
from typing import Any
from elevenlabs.client import AsyncElevenLabs
from elevenlabs.core import ApiError
from elevenlabs.types import Model, Voice as ElevenLabsVoice
from elevenlabs.types import Model, Voice as ElevenLabsVoice, VoiceSettings
from homeassistant.components.tts import (
ATTR_VOICE,
@@ -21,11 +22,36 @@ from homeassistant.helpers.device_registry import DeviceEntryType, DeviceInfo
from homeassistant.helpers.entity_platform import AddEntitiesCallback
from . import EleventLabsConfigEntry
from .const import CONF_VOICE, DOMAIN
from .const import (
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
_LOGGER = logging.getLogger(__name__)
def to_voice_settings(options: MappingProxyType[str, Any]) -> VoiceSettings:
"""Return voice settings."""
return VoiceSettings(
stability=options.get(CONF_STABILITY, DEFAULT_STABILITY),
similarity_boost=options.get(CONF_SIMILARITY, DEFAULT_SIMILARITY),
style=options.get(CONF_STYLE, DEFAULT_STYLE),
use_speaker_boost=options.get(
CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
),
)
async def async_setup_entry(
hass: HomeAssistant,
config_entry: EleventLabsConfigEntry,
@@ -35,6 +61,7 @@ async def async_setup_entry(
client = config_entry.runtime_data.client
voices = (await client.voices.get_all()).voices
default_voice_id = config_entry.options[CONF_VOICE]
voice_settings = to_voice_settings(config_entry.options)
async_add_entities(
[
ElevenLabsTTSEntity(
@@ -44,6 +71,10 @@ async def async_setup_entry(
default_voice_id,
config_entry.entry_id,
config_entry.title,
voice_settings,
config_entry.options.get(
CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
),
)
]
)
@@ -62,6 +93,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
default_voice_id: str,
entry_id: str,
title: str,
voice_settings: VoiceSettings,
latency: int = 0,
) -> None:
"""Init ElevenLabs TTS service."""
self._client = client
@@ -77,6 +110,10 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
]
if voice_indices:
self._voices.insert(0, self._voices.pop(voice_indices[0]))
self._voice_settings = voice_settings
self._latency = latency
# Entity attributes
self._attr_unique_id = entry_id
self._attr_name = title
self._attr_device_info = DeviceInfo(
@@ -105,6 +142,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
audio = await self._client.generate(
text=message,
voice=voice_id,
optimize_streaming_latency=self._latency,
voice_settings=self._voice_settings,
model=self._model.model_id,
)
bytes_combined = b"".join([byte_seg async for byte_seg in audio])
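As a usage sketch for the to_voice_settings helper above: with an empty options mapping every field falls back to its default, and a stored option only overrides its own field (the values below are illustrative):

from types import MappingProxyType

# All defaults when no voice options are stored
settings = to_voice_settings(MappingProxyType({}))
# -> VoiceSettings(stability=0.5, similarity_boost=0.75, style=0, use_speaker_boost=True)

# A single stored option overrides only that field
settings = to_voice_settings(MappingProxyType({CONF_STABILITY: 0.9}))
# -> stability=0.9, similarity_boost=0.75, style=0, use_speaker_boost=True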

tests/components/elevenlabs/test_config_flow.py

@@ -3,9 +3,20 @@
from unittest.mock import AsyncMock
from homeassistant.components.elevenlabs.const import (
CONF_CONFIGURE_VOICE,
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_MODEL,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
from homeassistant.config_entries import SOURCE_USER
@@ -89,6 +100,52 @@ async def test_options_flow_init(
)
assert result["type"] is FlowResultType.CREATE_ENTRY
assert mock_entry.options == {CONF_MODEL: "model1", CONF_VOICE: "voice1"}
assert mock_entry.options == {
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
}
mock_setup_entry.assert_called_once()
async def test_options_flow_voice_settings_default(
hass: HomeAssistant,
mock_setup_entry: AsyncMock,
mock_async_client: AsyncMock,
mock_entry: MockConfigEntry,
) -> None:
"""Test options flow voice settings."""
mock_entry.add_to_hass(hass)
assert await hass.config_entries.async_setup(mock_entry.entry_id)
await hass.async_block_till_done()
result = await hass.config_entries.options.async_init(mock_entry.entry_id)
assert result["type"] is FlowResultType.FORM
assert result["step_id"] == "init"
result = await hass.config_entries.options.async_configure(
result["flow_id"],
user_input={
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
CONF_CONFIGURE_VOICE: True,
},
)
assert result["type"] is FlowResultType.FORM
assert result["step_id"] == "voice_settings"
result = await hass.config_entries.options.async_configure(
result["flow_id"],
user_input={},
)
assert result["type"] is FlowResultType.CREATE_ENTRY
assert mock_entry.options == {
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY,
CONF_SIMILARITY: DEFAULT_SIMILARITY,
CONF_STABILITY: DEFAULT_STABILITY,
CONF_STYLE: DEFAULT_STYLE,
CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
}

tests/components/elevenlabs/test_tts.py

@@ -8,11 +8,25 @@ from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
from elevenlabs.core import ApiError
from elevenlabs.types import GetVoicesResponse
from elevenlabs.types import GetVoicesResponse, VoiceSettings
import pytest
from homeassistant.components import tts
from homeassistant.components.elevenlabs.const import CONF_MODEL, CONF_VOICE, DOMAIN
from homeassistant.components.elevenlabs.const import (
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
from homeassistant.components.media_player import (
ATTR_MEDIA_CONTENT_ID,
DOMAIN as DOMAIN_MP,
@@ -53,17 +67,32 @@ async def setup_internal_url(hass: HomeAssistant) -> None:
)
@pytest.fixture
def mock_similarity():
"""Mock similarity."""
return DEFAULT_SIMILARITY / 2
@pytest.fixture
def mock_latency():
"""Mock latency."""
return (DEFAULT_OPTIMIZE_LATENCY + 1) % 5 # 0, 1, 2, 3, 4
@pytest.fixture(name="setup")
async def setup_fixture(
hass: HomeAssistant,
config_data: dict[str, Any],
config_options: dict[str, Any],
config_options_voice: dict[str, Any],
request: pytest.FixtureRequest,
mock_async_client: AsyncMock,
) -> AsyncMock:
"""Set up the test environment."""
if request.param == "mock_config_entry_setup":
await mock_config_entry_setup(hass, config_data, config_options)
elif request.param == "mock_config_entry_setup_voice":
await mock_config_entry_setup(hass, config_data, config_options_voice)
else:
raise RuntimeError("Invalid setup fixture")
@@ -83,6 +112,18 @@ def config_options_fixture() -> dict[str, Any]:
return {}
@pytest.fixture(name="config_options_voice")
def config_options_voice_fixture(mock_similarity, mock_latency) -> dict[str, Any]:
"""Return config options."""
return {
CONF_OPTIMIZE_LATENCY: mock_latency,
CONF_SIMILARITY: mock_similarity,
CONF_STABILITY: DEFAULT_STABILITY,
CONF_STYLE: DEFAULT_STYLE,
CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
}
async def mock_config_entry_setup(
hass: HomeAssistant, config_data: dict[str, Any], config_options: dict[str, Any]
) -> None:
@@ -146,6 +187,12 @@ async def test_tts_service_speak(
"""Test tts service."""
tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
tts_entity._client.generate.reset_mock()
assert tts_entity._voice_settings == VoiceSettings(
stability=DEFAULT_STABILITY,
similarity_boost=DEFAULT_SIMILARITY,
style=DEFAULT_STYLE,
use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
)
await hass.services.async_call(
tts.DOMAIN,
@@ -161,7 +208,11 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice2", model="model1"
text="There is a person at the front door.",
voice="voice2",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@@ -219,7 +270,11 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice1", model="model1"
text="There is a person at the front door.",
voice="voice1",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@@ -266,5 +321,78 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice1", model="model1"
text="There is a person at the front door.",
voice="voice1",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@pytest.mark.parametrize(
"config_data",
[
{},
{tts.CONF_LANG: "de"},
{tts.CONF_LANG: "en"},
{tts.CONF_LANG: "ja"},
{tts.CONF_LANG: "es"},
],
)
@pytest.mark.parametrize(
("setup", "tts_service", "service_data"),
[
(
"mock_config_entry_setup_voice",
"speak",
{
ATTR_ENTITY_ID: "tts.mock_title",
tts.ATTR_MEDIA_PLAYER_ENTITY_ID: "media_player.something",
tts.ATTR_MESSAGE: "There is a person at the front door.",
tts.ATTR_OPTIONS: {tts.ATTR_VOICE: "voice2"},
},
),
],
indirect=["setup"],
)
async def test_tts_service_speak_voice_settings(
setup: AsyncMock,
hass: HomeAssistant,
hass_client: ClientSessionGenerator,
calls: list[ServiceCall],
tts_service: str,
service_data: dict[str, Any],
mock_similarity: float,
mock_latency: int,
) -> None:
"""Test tts service."""
tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
tts_entity._client.generate.reset_mock()
assert tts_entity._voice_settings == VoiceSettings(
stability=DEFAULT_STABILITY,
similarity_boost=mock_similarity,
style=DEFAULT_STYLE,
use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
)
assert tts_entity._latency == mock_latency
await hass.services.async_call(
tts.DOMAIN,
tts_service,
service_data,
blocking=True,
)
assert len(calls) == 1
assert (
await retrieve_media(hass, hass_client, calls[0].data[ATTR_MEDIA_CONTENT_ID])
== HTTPStatus.OK
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.",
voice="voice2",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)