From 8acc027f383438003e60d2ca8d7376aca0c71e84 Mon Sep 17 00:00:00 2001
From: Simon <80467011+sorgfresser@users.noreply.github.com>
Date: Sun, 8 Sep 2024 13:11:26 +0200
Subject: [PATCH] Add voice settings to ElevenLabs options flow (#123265)

Add voice settings to options flow
---
 .../components/elevenlabs/config_flow.py      |  87 ++++++++++-
 homeassistant/components/elevenlabs/const.py  |  11 ++
 .../components/elevenlabs/strings.json        |  22 ++-
 homeassistant/components/elevenlabs/tts.py    |  43 +++++-
 .../components/elevenlabs/test_config_flow.py |  59 +++++++-
 tests/components/elevenlabs/test_tts.py       | 138 +++++++++++++++++-
 6 files changed, 349 insertions(+), 11 deletions(-)

diff --git a/homeassistant/components/elevenlabs/config_flow.py b/homeassistant/components/elevenlabs/config_flow.py
index cf04304510a..6eec35d0583 100644
--- a/homeassistant/components/elevenlabs/config_flow.py
+++ b/homeassistant/components/elevenlabs/config_flow.py
@@ -23,7 +23,23 @@ from homeassistant.helpers.selector import (
     SelectSelectorConfig,
 )
 
-from .const import CONF_MODEL, CONF_VOICE, DEFAULT_MODEL, DOMAIN
+from .const import (
+    CONF_CONFIGURE_VOICE,
+    CONF_MODEL,
+    CONF_OPTIMIZE_LATENCY,
+    CONF_SIMILARITY,
+    CONF_STABILITY,
+    CONF_STYLE,
+    CONF_USE_SPEAKER_BOOST,
+    CONF_VOICE,
+    DEFAULT_MODEL,
+    DEFAULT_OPTIMIZE_LATENCY,
+    DEFAULT_SIMILARITY,
+    DEFAULT_STABILITY,
+    DEFAULT_STYLE,
+    DEFAULT_USE_SPEAKER_BOOST,
+    DOMAIN,
+)
 
 USER_STEP_SCHEMA = vol.Schema({vol.Required(CONF_API_KEY): str})
 
@@ -92,6 +108,8 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
         # id -> name
         self.voices: dict[str, str] = {}
         self.models: dict[str, str] = {}
+        self.model: str | None = None
+        self.voice: str | None = None
 
     async def async_step_init(
         self, user_input: dict[str, Any] | None = None
@@ -103,6 +121,11 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
         assert self.models and self.voices
 
         if user_input is not None:
+            self.model = user_input[CONF_MODEL]
+            self.voice = user_input[CONF_VOICE]
+            configure_voice = user_input.pop(CONF_CONFIGURE_VOICE)
+            if configure_voice:
+                return await self.async_step_voice_settings()
             return self.async_create_entry(
                 title="ElevenLabs",
                 data=user_input,
@@ -139,7 +162,69 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
                             ]
                         )
                     ),
+                    vol.Required(CONF_CONFIGURE_VOICE, default=False): bool,
                 }
             ),
             self.options,
         )
+
+    async def async_step_voice_settings(
+        self, user_input: dict[str, Any] | None = None
+    ) -> ConfigFlowResult:
+        """Handle the voice settings step of the options flow."""
+        assert self.voices and self.models
+        if user_input is not None:
+            user_input[CONF_MODEL] = self.model
+            user_input[CONF_VOICE] = self.voice
+            return self.async_create_entry(
+                title="ElevenLabs",
+                data=user_input,
+            )
+        return self.async_show_form(
+            step_id="voice_settings",
+            data_schema=self.elevenlabs_config_options_voice_schema(),
+        )
+
+    def elevenlabs_config_options_voice_schema(self) -> vol.Schema:
+        """Return the schema for the ElevenLabs voice settings step."""
+        return vol.Schema(
+            {
+                vol.Optional(
+                    CONF_STABILITY,
+                    default=self.config_entry.options.get(
+                        CONF_STABILITY, DEFAULT_STABILITY
+                    ),
+                ): vol.All(
+                    vol.Coerce(float),
+                    vol.Range(min=0, max=1),
+                ),
+                vol.Optional(
+                    CONF_SIMILARITY,
+                    default=self.config_entry.options.get(
+                        CONF_SIMILARITY, DEFAULT_SIMILARITY
+                    ),
+                ): vol.All(
+                    vol.Coerce(float),
+                    vol.Range(min=0, max=1),
+                ),
+                vol.Optional(
+                    CONF_OPTIMIZE_LATENCY,
+                    default=self.config_entry.options.get(
+                        CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
+                    ),
+                ): vol.All(int, vol.Range(min=0, max=4)),
+                vol.Optional(
+                    CONF_STYLE,
+                    default=self.config_entry.options.get(CONF_STYLE, DEFAULT_STYLE),
+                ): vol.All(
+                    vol.Coerce(float),
+                    vol.Range(min=0, max=1),
+                ),
+                vol.Optional(
+                    CONF_USE_SPEAKER_BOOST,
+                    default=self.config_entry.options.get(
+                        CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
+                    ),
+                ): bool,
+            }
+        )
diff --git a/homeassistant/components/elevenlabs/const.py b/homeassistant/components/elevenlabs/const.py
index c0fc3c7b1b0..040d38d272c 100644
--- a/homeassistant/components/elevenlabs/const.py
+++ b/homeassistant/components/elevenlabs/const.py
@@ -2,6 +2,17 @@
 
 CONF_VOICE = "voice"
 CONF_MODEL = "model"
+CONF_CONFIGURE_VOICE = "configure_voice"
+CONF_STABILITY = "stability"
+CONF_SIMILARITY = "similarity"
+CONF_OPTIMIZE_LATENCY = "optimize_streaming_latency"
+CONF_STYLE = "style"
+CONF_USE_SPEAKER_BOOST = "use_speaker_boost"
 DOMAIN = "elevenlabs"
 
 DEFAULT_MODEL = "eleven_multilingual_v2"
+DEFAULT_STABILITY = 0.5
+DEFAULT_SIMILARITY = 0.75
+DEFAULT_OPTIMIZE_LATENCY = 0
+DEFAULT_STYLE = 0
+DEFAULT_USE_SPEAKER_BOOST = True
diff --git a/homeassistant/components/elevenlabs/strings.json b/homeassistant/components/elevenlabs/strings.json
index 16b40137090..b346f94a963 100644
--- a/homeassistant/components/elevenlabs/strings.json
+++ b/homeassistant/components/elevenlabs/strings.json
@@ -19,11 +19,29 @@
       "init": {
         "data": {
           "voice": "Voice",
-          "model": "Model"
+          "model": "Model",
+          "configure_voice": "Configure advanced voice settings"
         },
         "data_description": {
           "voice": "Voice to use for the TTS.",
-          "model": "ElevenLabs model to use. Please note that not all models support all languages equally well."
+          "model": "ElevenLabs model to use. Please note that not all models support all languages equally well.",
+          "configure_voice": "Configure advanced voice settings. Find more information in the ElevenLabs documentation."
+        }
+      },
+      "voice_settings": {
+        "data": {
+          "stability": "Stability",
+          "similarity": "Similarity",
+          "optimize_streaming_latency": "Latency",
+          "style": "Style",
+          "use_speaker_boost": "Speaker boost"
+        },
+        "data_description": {
+          "stability": "Stability of the generated audio. Higher values lead to less emotional audio.",
+          "similarity": "Similarity of the generated audio to the original voice. Higher values may result in more similar audio, but may also introduce background noise.",
+          "optimize_streaming_latency": "Optimize the model for streaming. This may reduce the quality of the generated audio.",
+          "style": "Style of the generated audio. Recommended to keep at 0 for almost all use cases.",
+          "use_speaker_boost": "Use speaker boost to increase the similarity of the generated audio to the original voice."
         }
       }
     }
diff --git a/homeassistant/components/elevenlabs/tts.py b/homeassistant/components/elevenlabs/tts.py
index 35ba6053cd8..e7f35775560 100644
--- a/homeassistant/components/elevenlabs/tts.py
+++ b/homeassistant/components/elevenlabs/tts.py
@@ -3,11 +3,12 @@
 from __future__ import annotations
 
 import logging
+from types import MappingProxyType
 from typing import Any
 
 from elevenlabs.client import AsyncElevenLabs
 from elevenlabs.core import ApiError
-from elevenlabs.types import Model, Voice as ElevenLabsVoice
+from elevenlabs.types import Model, Voice as ElevenLabsVoice, VoiceSettings
 
 from homeassistant.components.tts import (
     ATTR_VOICE,
@@ -21,11 +22,36 @@ from homeassistant.helpers.device_registry import DeviceEntryType, DeviceInfo
 from homeassistant.helpers.entity_platform import AddEntitiesCallback
 
 from . import EleventLabsConfigEntry
-from .const import CONF_VOICE, DOMAIN
+from .const import (
+    CONF_OPTIMIZE_LATENCY,
+    CONF_SIMILARITY,
+    CONF_STABILITY,
+    CONF_STYLE,
+    CONF_USE_SPEAKER_BOOST,
+    CONF_VOICE,
+    DEFAULT_OPTIMIZE_LATENCY,
+    DEFAULT_SIMILARITY,
+    DEFAULT_STABILITY,
+    DEFAULT_STYLE,
+    DEFAULT_USE_SPEAKER_BOOST,
+    DOMAIN,
+)
 
 _LOGGER = logging.getLogger(__name__)
 
 
+def to_voice_settings(options: MappingProxyType[str, Any]) -> VoiceSettings:
+    """Return voice settings built from the given options, using defaults."""
+    return VoiceSettings(
+        stability=options.get(CONF_STABILITY, DEFAULT_STABILITY),
+        similarity_boost=options.get(CONF_SIMILARITY, DEFAULT_SIMILARITY),
+        style=options.get(CONF_STYLE, DEFAULT_STYLE),
+        use_speaker_boost=options.get(
+            CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
+        ),
+    )
+
+
 async def async_setup_entry(
     hass: HomeAssistant,
     config_entry: EleventLabsConfigEntry,
@@ -35,6 +61,7 @@ async def async_setup_entry(
     client = config_entry.runtime_data.client
     voices = (await client.voices.get_all()).voices
     default_voice_id = config_entry.options[CONF_VOICE]
+    voice_settings = to_voice_settings(config_entry.options)
     async_add_entities(
         [
             ElevenLabsTTSEntity(
@@ -44,6 +71,10 @@ async def async_setup_entry(
                 default_voice_id,
                 config_entry.entry_id,
                 config_entry.title,
+                voice_settings,
+                config_entry.options.get(
+                    CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
+                ),
             )
         ]
     )
@@ -62,6 +93,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
         default_voice_id: str,
         entry_id: str,
         title: str,
+        voice_settings: VoiceSettings,
+        latency: int = 0,
     ) -> None:
         """Init ElevenLabs TTS service."""
         self._client = client
@@ -77,6 +110,10 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
         ]
         if voice_indices:
             self._voices.insert(0, self._voices.pop(voice_indices[0]))
+        self._voice_settings = voice_settings
+        self._latency = latency
+
+        # Entity attributes
         self._attr_unique_id = entry_id
         self._attr_name = title
         self._attr_device_info = DeviceInfo(
@@ -105,6 +142,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
             audio = await self._client.generate(
                 text=message,
                 voice=voice_id,
+                optimize_streaming_latency=self._latency,
+                voice_settings=self._voice_settings,
                 model=self._model.model_id,
             )
             bytes_combined = b"".join([byte_seg async for byte_seg in audio])
diff --git a/tests/components/elevenlabs/test_config_flow.py b/tests/components/elevenlabs/test_config_flow.py
index 853c49d48ff..971fa75939a 100644
--- a/tests/components/elevenlabs/test_config_flow.py
+++ b/tests/components/elevenlabs/test_config_flow.py
@@ -3,9 +3,20 @@
 from unittest.mock import AsyncMock
 
 from homeassistant.components.elevenlabs.const import (
+    CONF_CONFIGURE_VOICE,
     CONF_MODEL,
+    CONF_OPTIMIZE_LATENCY,
+    CONF_SIMILARITY,
+    CONF_STABILITY,
+    CONF_STYLE,
+    CONF_USE_SPEAKER_BOOST,
     CONF_VOICE,
     DEFAULT_MODEL,
+    DEFAULT_OPTIMIZE_LATENCY,
+    DEFAULT_SIMILARITY,
+    DEFAULT_STABILITY,
+    DEFAULT_STYLE,
+    DEFAULT_USE_SPEAKER_BOOST,
     DOMAIN,
 )
 from homeassistant.config_entries import SOURCE_USER
@@ -89,6 +100,52 @@ async def test_options_flow_init(
     )
 
     assert result["type"] is FlowResultType.CREATE_ENTRY
-    assert mock_entry.options == {CONF_MODEL: "model1", CONF_VOICE: "voice1"}
+    assert mock_entry.options == {
+        CONF_MODEL: "model1",
+        CONF_VOICE: "voice1",
+    }
 
     mock_setup_entry.assert_called_once()
+
+
+async def test_options_flow_voice_settings_default(
+    hass: HomeAssistant,
+    mock_setup_entry: AsyncMock,
+    mock_async_client: AsyncMock,
+    mock_entry: MockConfigEntry,
+) -> None:
+    """Test the voice settings step stores defaults when nothing is changed."""
+    mock_entry.add_to_hass(hass)
+    assert await hass.config_entries.async_setup(mock_entry.entry_id)
+    await hass.async_block_till_done()
+
+    result = await hass.config_entries.options.async_init(mock_entry.entry_id)
+    assert result["type"] is FlowResultType.FORM
+    assert result["step_id"] == "init"
+
+    result = await hass.config_entries.options.async_configure(
+        result["flow_id"],
+        user_input={
+            CONF_MODEL: "model1",
+            CONF_VOICE: "voice1",
+            CONF_CONFIGURE_VOICE: True,
+        },
+    )
+
+    assert result["type"] is FlowResultType.FORM
+    assert result["step_id"] == "voice_settings"
+
+    result = await hass.config_entries.options.async_configure(
+        result["flow_id"],
+        user_input={},
+    )
+    assert result["type"] is FlowResultType.CREATE_ENTRY
+    assert mock_entry.options == {
+        CONF_MODEL: "model1",
+        CONF_VOICE: "voice1",
+        CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY,
+        CONF_SIMILARITY: DEFAULT_SIMILARITY,
+        CONF_STABILITY: DEFAULT_STABILITY,
+        CONF_STYLE: DEFAULT_STYLE,
+        CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
+    }
diff --git a/tests/components/elevenlabs/test_tts.py b/tests/components/elevenlabs/test_tts.py
index 8b14ab26487..9ed96117daa 100644
--- a/tests/components/elevenlabs/test_tts.py
+++ b/tests/components/elevenlabs/test_tts.py
@@ -8,11 +8,25 @@ from typing import Any
 from unittest.mock import AsyncMock, MagicMock, patch
 
 from elevenlabs.core import ApiError
-from elevenlabs.types import GetVoicesResponse
+from elevenlabs.types import GetVoicesResponse, VoiceSettings
 import pytest
 
 from homeassistant.components import tts
-from homeassistant.components.elevenlabs.const import CONF_MODEL, CONF_VOICE, DOMAIN
+from homeassistant.components.elevenlabs.const import (
+    CONF_MODEL,
+    CONF_OPTIMIZE_LATENCY,
+    CONF_SIMILARITY,
+    CONF_STABILITY,
+    CONF_STYLE,
+    CONF_USE_SPEAKER_BOOST,
+    CONF_VOICE,
+    DEFAULT_OPTIMIZE_LATENCY,
+    DEFAULT_SIMILARITY,
+    DEFAULT_STABILITY,
+    DEFAULT_STYLE,
+    DEFAULT_USE_SPEAKER_BOOST,
+    DOMAIN,
+)
 from homeassistant.components.media_player import (
     ATTR_MEDIA_CONTENT_ID,
     DOMAIN as DOMAIN_MP,
@@ -53,17 +67,32 @@ async def setup_internal_url(hass: HomeAssistant) -> None:
     )
 
 
+@pytest.fixture
+def mock_similarity():
+    """Return a non-default similarity value."""
+    return DEFAULT_SIMILARITY / 2
+
+
+@pytest.fixture
+def mock_latency():
+    """Return a non-default latency value."""
+    return (DEFAULT_OPTIMIZE_LATENCY + 1) % 5  # 0, 1, 2, 3, 4
+
+
 @pytest.fixture(name="setup")
 async def setup_fixture(
     hass: HomeAssistant,
     config_data: dict[str, Any],
     config_options: dict[str, Any],
+    config_options_voice: dict[str, Any],
     request: pytest.FixtureRequest,
     mock_async_client: AsyncMock,
 ) -> AsyncMock:
     """Set up the test environment."""
     if request.param == "mock_config_entry_setup":
         await mock_config_entry_setup(hass, config_data, config_options)
+    elif request.param == "mock_config_entry_setup_voice":
+        await mock_config_entry_setup(hass, config_data, config_options_voice)
     else:
         raise RuntimeError("Invalid setup fixture")
 
@@ -83,6 +112,18 @@ def config_options_fixture() -> dict[str, Any]:
     return {}
 
 
+@pytest.fixture(name="config_options_voice")
+def config_options_voice_fixture(mock_similarity, mock_latency) -> dict[str, Any]:
+    """Return config options with custom voice settings."""
+    return {
+        CONF_OPTIMIZE_LATENCY: mock_latency,
+        CONF_SIMILARITY: mock_similarity,
+        CONF_STABILITY: DEFAULT_STABILITY,
+        CONF_STYLE: DEFAULT_STYLE,
+        CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
+    }
+
+
 async def mock_config_entry_setup(
     hass: HomeAssistant, config_data: dict[str, Any], config_options: dict[str, Any]
 ) -> None:
@@ -146,6 +187,12 @@ async def test_tts_service_speak(
     """Test tts service."""
     tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
     tts_entity._client.generate.reset_mock()
+    assert tts_entity._voice_settings == VoiceSettings(
+        stability=DEFAULT_STABILITY,
+        similarity_boost=DEFAULT_SIMILARITY,
+        style=DEFAULT_STYLE,
+        use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
+    )
 
     await hass.services.async_call(
         tts.DOMAIN,
@@ -161,7 +208,11 @@ async def test_tts_service_speak(
     )
 
     tts_entity._client.generate.assert_called_once_with(
-        text="There is a person at the front door.", voice="voice2", model="model1"
+        text="There is a person at the front door.",
+        voice="voice2",
+        model="model1",
+        voice_settings=tts_entity._voice_settings,
+        optimize_streaming_latency=tts_entity._latency,
     )
 
 
@@ -219,7 +270,11 @@ async def test_tts_service_speak_lang_config(
     )
 
     tts_entity._client.generate.assert_called_once_with(
-        text="There is a person at the front door.", voice="voice1", model="model1"
+        text="There is a person at the front door.",
+        voice="voice1",
+        model="model1",
+        voice_settings=tts_entity._voice_settings,
+        optimize_streaming_latency=tts_entity._latency,
     )
 
 
@@ -266,5 +321,78 @@ async def test_tts_service_speak_error(
     )
 
     tts_entity._client.generate.assert_called_once_with(
-        text="There is a person at the front door.", voice="voice1", model="model1"
+        text="There is a person at the front door.",
+        voice="voice1",
+        model="model1",
+        voice_settings=tts_entity._voice_settings,
+        optimize_streaming_latency=tts_entity._latency,
+    )
+
+
+@pytest.mark.parametrize(
+    "config_data",
+    [
+        {},
+        {tts.CONF_LANG: "de"},
+        {tts.CONF_LANG: "en"},
+        {tts.CONF_LANG: "ja"},
+        {tts.CONF_LANG: "es"},
+    ],
+)
+@pytest.mark.parametrize(
+    ("setup", "tts_service", "service_data"),
+    [
+        (
+            "mock_config_entry_setup_voice",
+            "speak",
+            {
+                ATTR_ENTITY_ID: "tts.mock_title",
+                tts.ATTR_MEDIA_PLAYER_ENTITY_ID: "media_player.something",
+                tts.ATTR_MESSAGE: "There is a person at the front door.",
+                tts.ATTR_OPTIONS: {tts.ATTR_VOICE: "voice2"},
+            },
+        ),
+    ],
+    indirect=["setup"],
+)
+async def test_tts_service_speak_voice_settings(
+    setup: AsyncMock,
+    hass: HomeAssistant,
+    hass_client: ClientSessionGenerator,
+    calls: list[ServiceCall],
+    tts_service: str,
+    service_data: dict[str, Any],
+    mock_similarity: float,
+    mock_latency: int,
+) -> None:
+    """Test tts service passes configured voice settings to the client."""
+    tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
+    tts_entity._client.generate.reset_mock()
+    assert tts_entity._voice_settings == VoiceSettings(
+        stability=DEFAULT_STABILITY,
+        similarity_boost=mock_similarity,
+        style=DEFAULT_STYLE,
+        use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
+    )
+    assert tts_entity._latency == mock_latency
+
+    await hass.services.async_call(
+        tts.DOMAIN,
+        tts_service,
+        service_data,
+        blocking=True,
+    )
+
+    assert len(calls) == 1
+    assert (
+        await retrieve_media(hass, hass_client, calls[0].data[ATTR_MEDIA_CONTENT_ID])
+        == HTTPStatus.OK
+    )
+
+    tts_entity._client.generate.assert_called_once_with(
+        text="There is a person at the front door.",
+        voice="voice2",
+        model="model1",
+        voice_settings=tts_entity._voice_settings,
+        optimize_streaming_latency=tts_entity._latency,
     )