Add voice settings to ElevenLabs options flow (#123265)

Add voice settings to options flow
Simon 2024-09-08 13:11:26 +02:00 committed by GitHub
parent 3139a7e431
commit 8acc027f38
6 changed files with 349 additions and 11 deletions

homeassistant/components/elevenlabs/config_flow.py

@@ -23,7 +23,23 @@ from homeassistant.helpers.selector import (
SelectSelectorConfig,
)
from .const import CONF_MODEL, CONF_VOICE, DEFAULT_MODEL, DOMAIN
from .const import (
CONF_CONFIGURE_VOICE,
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_MODEL,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
USER_STEP_SCHEMA = vol.Schema({vol.Required(CONF_API_KEY): str})
@@ -92,6 +108,8 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
# id -> name
self.voices: dict[str, str] = {}
self.models: dict[str, str] = {}
self.model: str | None = None
self.voice: str | None = None
async def async_step_init(
self, user_input: dict[str, Any] | None = None
@@ -103,6 +121,11 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
assert self.models and self.voices
if user_input is not None:
self.model = user_input[CONF_MODEL]
self.voice = user_input[CONF_VOICE]
configure_voice = user_input.pop(CONF_CONFIGURE_VOICE)
if configure_voice:
return await self.async_step_voice_settings()
return self.async_create_entry(
title="ElevenLabs",
data=user_input,
@@ -139,7 +162,69 @@ class ElevenLabsOptionsFlow(OptionsFlowWithConfigEntry):
]
)
),
vol.Required(CONF_CONFIGURE_VOICE, default=False): bool,
}
),
self.options,
)
async def async_step_voice_settings(
self, user_input: dict[str, Any] | None = None
) -> ConfigFlowResult:
"""Handle voice settings."""
assert self.voices and self.models
if user_input is not None:
user_input[CONF_MODEL] = self.model
user_input[CONF_VOICE] = self.voice
return self.async_create_entry(
title="ElevenLabs",
data=user_input,
)
return self.async_show_form(
step_id="voice_settings",
data_schema=self.elevenlabs_config_options_voice_schema(),
)
def elevenlabs_config_options_voice_schema(self) -> vol.Schema:
"""Elevenlabs options voice schema."""
return vol.Schema(
{
vol.Optional(
CONF_STABILITY,
default=self.config_entry.options.get(
CONF_STABILITY, DEFAULT_STABILITY
),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_SIMILARITY,
default=self.config_entry.options.get(
CONF_SIMILARITY, DEFAULT_SIMILARITY
),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_OPTIMIZE_LATENCY,
default=self.config_entry.options.get(
CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
),
): vol.All(int, vol.Range(min=0, max=4)),
vol.Optional(
CONF_STYLE,
default=self.config_entry.options.get(CONF_STYLE, DEFAULT_STYLE),
): vol.All(
vol.Coerce(float),
vol.Range(min=0, max=1),
),
vol.Optional(
CONF_USE_SPEAKER_BOOST,
default=self.config_entry.options.get(
CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
),
): bool,
}
)
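The voice-settings schema above relies on voluptuous to coerce and range-check each field and to default missing fields from the stored options. A minimal standalone sketch of that behaviour (the key name and input values here are illustrative, not taken from the integration):

import voluptuous as vol

# Illustrative stand-in for the stability validator defined above
schema = vol.Schema(
    {
        vol.Optional("stability", default=0.5): vol.All(
            vol.Coerce(float), vol.Range(min=0, max=1)
        ),
    }
)

print(schema({}))                    # {'stability': 0.5} -- default applied
print(schema({"stability": "0.8"}))  # {'stability': 0.8} -- string coerced to float
# schema({"stability": 1.5}) raises vol.MultipleInvalid (out of range)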

homeassistant/components/elevenlabs/const.py

@@ -2,6 +2,17 @@
CONF_VOICE = "voice"
CONF_MODEL = "model"
CONF_CONFIGURE_VOICE = "configure_voice"
CONF_STABILITY = "stability"
CONF_SIMILARITY = "similarity"
CONF_OPTIMIZE_LATENCY = "optimize_streaming_latency"
CONF_STYLE = "style"
CONF_USE_SPEAKER_BOOST = "use_speaker_boost"
DOMAIN = "elevenlabs"
DEFAULT_MODEL = "eleven_multilingual_v2"
DEFAULT_STABILITY = 0.5
DEFAULT_SIMILARITY = 0.75
DEFAULT_OPTIMIZE_LATENCY = 0
DEFAULT_STYLE = 0
DEFAULT_USE_SPEAKER_BOOST = True
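Taken together, a config entry that has been through both options steps stores a payload shaped roughly like the sketch below (the voice value is a placeholder; the remaining values are the defaults above):

# Hypothetical options payload after completing the init and voice_settings steps
options = {
    CONF_MODEL: DEFAULT_MODEL,                          # "eleven_multilingual_v2"
    CONF_VOICE: "voice1",                               # placeholder voice id
    CONF_STABILITY: DEFAULT_STABILITY,                  # 0.5
    CONF_SIMILARITY: DEFAULT_SIMILARITY,                # 0.75
    CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY,    # 0
    CONF_STYLE: DEFAULT_STYLE,                          # 0
    CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,  # True
}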

homeassistant/components/elevenlabs/strings.json

@@ -19,11 +19,29 @@
"init": {
"data": {
"voice": "Voice",
"model": "Model"
"model": "Model",
"configure_voice": "Configure advanced voice settings"
},
"data_description": {
"voice": "Voice to use for the TTS.",
"model": "ElevenLabs model to use. Please note that not all models support all languages equally well."
"model": "ElevenLabs model to use. Please note that not all models support all languages equally well.",
"configure_voice": "Configure advanced voice settings. Find more information in the ElevenLabs documentation."
}
},
"voice_settings": {
"data": {
"stability": "Stability",
"similarity": "Similarity",
"optimize_streaming_latency": "Latency",
"style": "Style",
"use_speaker_boost": "Speaker boost"
},
"data_description": {
"stability": "Stability of the generated audio. Higher values lead to less emotional audio.",
"similarity": "Similarity of the generated audio to the original voice. Higher values may result in more similar audio, but may also introduce background noise.",
"optimize_streaming_latency": "Optimize the model for streaming. This may reduce the quality of the generated audio.",
"style": "Style of the generated audio. Recommended to keep at 0 for most almost all use cases.",
"use_speaker_boost": "Use speaker boost to increase the similarity of the generated audio to the original voice."
}
}
}

homeassistant/components/elevenlabs/tts.py

@@ -3,11 +3,12 @@
from __future__ import annotations
import logging
from types import MappingProxyType
from typing import Any
from elevenlabs.client import AsyncElevenLabs
from elevenlabs.core import ApiError
from elevenlabs.types import Model, Voice as ElevenLabsVoice
from elevenlabs.types import Model, Voice as ElevenLabsVoice, VoiceSettings
from homeassistant.components.tts import (
ATTR_VOICE,
@@ -21,11 +22,36 @@ from homeassistant.helpers.device_registry import DeviceEntryType, DeviceInfo
from homeassistant.helpers.entity_platform import AddEntitiesCallback
from . import EleventLabsConfigEntry
from .const import CONF_VOICE, DOMAIN
from .const import (
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
_LOGGER = logging.getLogger(__name__)
def to_voice_settings(options: MappingProxyType[str, Any]) -> VoiceSettings:
"""Return voice settings."""
return VoiceSettings(
stability=options.get(CONF_STABILITY, DEFAULT_STABILITY),
similarity_boost=options.get(CONF_SIMILARITY, DEFAULT_SIMILARITY),
style=options.get(CONF_STYLE, DEFAULT_STYLE),
use_speaker_boost=options.get(
CONF_USE_SPEAKER_BOOST, DEFAULT_USE_SPEAKER_BOOST
),
)
async def async_setup_entry(
hass: HomeAssistant,
config_entry: EleventLabsConfigEntry,
@@ -35,6 +61,7 @@ async def async_setup_entry(
client = config_entry.runtime_data.client
voices = (await client.voices.get_all()).voices
default_voice_id = config_entry.options[CONF_VOICE]
voice_settings = to_voice_settings(config_entry.options)
async_add_entities(
[
ElevenLabsTTSEntity(
@@ -44,6 +71,10 @@ async def async_setup_entry(
default_voice_id,
config_entry.entry_id,
config_entry.title,
voice_settings,
config_entry.options.get(
CONF_OPTIMIZE_LATENCY, DEFAULT_OPTIMIZE_LATENCY
),
)
]
)
@@ -62,6 +93,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
default_voice_id: str,
entry_id: str,
title: str,
voice_settings: VoiceSettings,
latency: int = 0,
) -> None:
"""Init ElevenLabs TTS service."""
self._client = client
@@ -77,6 +110,10 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
]
if voice_indices:
self._voices.insert(0, self._voices.pop(voice_indices[0]))
self._voice_settings = voice_settings
self._latency = latency
# Entity attributes
self._attr_unique_id = entry_id
self._attr_name = title
self._attr_device_info = DeviceInfo(
@@ -105,6 +142,8 @@ class ElevenLabsTTSEntity(TextToSpeechEntity):
audio = await self._client.generate(
text=message,
voice=voice_id,
optimize_streaming_latency=self._latency,
voice_settings=self._voice_settings,
model=self._model.model_id,
)
bytes_combined = b"".join([byte_seg async for byte_seg in audio])
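As a usage sketch for the to_voice_settings helper above: with an empty options mapping every field falls back to its default, and a stored option only overrides its own field (the values below are illustrative):

from types import MappingProxyType

# All defaults when no voice options are stored
settings = to_voice_settings(MappingProxyType({}))
# -> VoiceSettings(stability=0.5, similarity_boost=0.75, style=0, use_speaker_boost=True)

# A single stored option overrides only that field
settings = to_voice_settings(MappingProxyType({CONF_STABILITY: 0.9}))
# -> stability=0.9, similarity_boost=0.75, style=0, use_speaker_boost=True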

tests/components/elevenlabs/test_config_flow.py

@@ -3,9 +3,20 @@
from unittest.mock import AsyncMock
from homeassistant.components.elevenlabs.const import (
CONF_CONFIGURE_VOICE,
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_MODEL,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
from homeassistant.config_entries import SOURCE_USER
@@ -89,6 +100,52 @@ async def test_options_flow_init(
)
assert result["type"] is FlowResultType.CREATE_ENTRY
assert mock_entry.options == {CONF_MODEL: "model1", CONF_VOICE: "voice1"}
assert mock_entry.options == {
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
}
mock_setup_entry.assert_called_once()
async def test_options_flow_voice_settings_default(
hass: HomeAssistant,
mock_setup_entry: AsyncMock,
mock_async_client: AsyncMock,
mock_entry: MockConfigEntry,
) -> None:
"""Test options flow voice settings."""
mock_entry.add_to_hass(hass)
assert await hass.config_entries.async_setup(mock_entry.entry_id)
await hass.async_block_till_done()
result = await hass.config_entries.options.async_init(mock_entry.entry_id)
assert result["type"] is FlowResultType.FORM
assert result["step_id"] == "init"
result = await hass.config_entries.options.async_configure(
result["flow_id"],
user_input={
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
CONF_CONFIGURE_VOICE: True,
},
)
assert result["type"] is FlowResultType.FORM
assert result["step_id"] == "voice_settings"
result = await hass.config_entries.options.async_configure(
result["flow_id"],
user_input={},
)
assert result["type"] is FlowResultType.CREATE_ENTRY
assert mock_entry.options == {
CONF_MODEL: "model1",
CONF_VOICE: "voice1",
CONF_OPTIMIZE_LATENCY: DEFAULT_OPTIMIZE_LATENCY,
CONF_SIMILARITY: DEFAULT_SIMILARITY,
CONF_STABILITY: DEFAULT_STABILITY,
CONF_STYLE: DEFAULT_STYLE,
CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
}

tests/components/elevenlabs/test_tts.py

@@ -8,11 +8,25 @@ from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
from elevenlabs.core import ApiError
from elevenlabs.types import GetVoicesResponse
from elevenlabs.types import GetVoicesResponse, VoiceSettings
import pytest
from homeassistant.components import tts
from homeassistant.components.elevenlabs.const import CONF_MODEL, CONF_VOICE, DOMAIN
from homeassistant.components.elevenlabs.const import (
CONF_MODEL,
CONF_OPTIMIZE_LATENCY,
CONF_SIMILARITY,
CONF_STABILITY,
CONF_STYLE,
CONF_USE_SPEAKER_BOOST,
CONF_VOICE,
DEFAULT_OPTIMIZE_LATENCY,
DEFAULT_SIMILARITY,
DEFAULT_STABILITY,
DEFAULT_STYLE,
DEFAULT_USE_SPEAKER_BOOST,
DOMAIN,
)
from homeassistant.components.media_player import (
ATTR_MEDIA_CONTENT_ID,
DOMAIN as DOMAIN_MP,
@@ -53,17 +67,32 @@ async def setup_internal_url(hass: HomeAssistant) -> None:
)
@pytest.fixture
def mock_similarity():
"""Mock similarity."""
return DEFAULT_SIMILARITY / 2
@pytest.fixture
def mock_latency():
"""Mock latency."""
return (DEFAULT_OPTIMIZE_LATENCY + 1) % 5 # 0, 1, 2, 3, 4
@pytest.fixture(name="setup")
async def setup_fixture(
hass: HomeAssistant,
config_data: dict[str, Any],
config_options: dict[str, Any],
config_options_voice: dict[str, Any],
request: pytest.FixtureRequest,
mock_async_client: AsyncMock,
) -> AsyncMock:
"""Set up the test environment."""
if request.param == "mock_config_entry_setup":
await mock_config_entry_setup(hass, config_data, config_options)
elif request.param == "mock_config_entry_setup_voice":
await mock_config_entry_setup(hass, config_data, config_options_voice)
else:
raise RuntimeError("Invalid setup fixture")
@@ -83,6 +112,18 @@ def config_options_fixture() -> dict[str, Any]:
return {}
@pytest.fixture(name="config_options_voice")
def config_options_voice_fixture(mock_similarity, mock_latency) -> dict[str, Any]:
"""Return config options."""
return {
CONF_OPTIMIZE_LATENCY: mock_latency,
CONF_SIMILARITY: mock_similarity,
CONF_STABILITY: DEFAULT_STABILITY,
CONF_STYLE: DEFAULT_STYLE,
CONF_USE_SPEAKER_BOOST: DEFAULT_USE_SPEAKER_BOOST,
}
async def mock_config_entry_setup(
hass: HomeAssistant, config_data: dict[str, Any], config_options: dict[str, Any]
) -> None:
@@ -146,6 +187,12 @@ async def test_tts_service_speak(
"""Test tts service."""
tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
tts_entity._client.generate.reset_mock()
assert tts_entity._voice_settings == VoiceSettings(
stability=DEFAULT_STABILITY,
similarity_boost=DEFAULT_SIMILARITY,
style=DEFAULT_STYLE,
use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
)
await hass.services.async_call(
tts.DOMAIN,
@@ -161,7 +208,11 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice2", model="model1"
text="There is a person at the front door.",
voice="voice2",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@@ -219,7 +270,11 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice1", model="model1"
text="There is a person at the front door.",
voice="voice1",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@@ -266,5 +321,78 @@
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.", voice="voice1", model="model1"
text="There is a person at the front door.",
voice="voice1",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)
@pytest.mark.parametrize(
"config_data",
[
{},
{tts.CONF_LANG: "de"},
{tts.CONF_LANG: "en"},
{tts.CONF_LANG: "ja"},
{tts.CONF_LANG: "es"},
],
)
@pytest.mark.parametrize(
("setup", "tts_service", "service_data"),
[
(
"mock_config_entry_setup_voice",
"speak",
{
ATTR_ENTITY_ID: "tts.mock_title",
tts.ATTR_MEDIA_PLAYER_ENTITY_ID: "media_player.something",
tts.ATTR_MESSAGE: "There is a person at the front door.",
tts.ATTR_OPTIONS: {tts.ATTR_VOICE: "voice2"},
},
),
],
indirect=["setup"],
)
async def test_tts_service_speak_voice_settings(
setup: AsyncMock,
hass: HomeAssistant,
hass_client: ClientSessionGenerator,
calls: list[ServiceCall],
tts_service: str,
service_data: dict[str, Any],
mock_similarity: float,
mock_latency: int,
) -> None:
"""Test tts service."""
tts_entity = hass.data[tts.DOMAIN].get_entity(service_data[ATTR_ENTITY_ID])
tts_entity._client.generate.reset_mock()
assert tts_entity._voice_settings == VoiceSettings(
stability=DEFAULT_STABILITY,
similarity_boost=mock_similarity,
style=DEFAULT_STYLE,
use_speaker_boost=DEFAULT_USE_SPEAKER_BOOST,
)
assert tts_entity._latency == mock_latency
await hass.services.async_call(
tts.DOMAIN,
tts_service,
service_data,
blocking=True,
)
assert len(calls) == 1
assert (
await retrieve_media(hass, hass_client, calls[0].data[ATTR_MEDIA_CONTENT_ID])
== HTTPStatus.OK
)
tts_entity._client.generate.assert_called_once_with(
text="There is a person at the front door.",
voice="voice2",
model="model1",
voice_settings=tts_entity._voice_settings,
optimize_streaming_latency=tts_entity._latency,
)