Fix TTS streaming for VoIP (#104620)

* Use wav instead of raw tts audio in voip

* More tests

* Use mock TTS dir
Michael Hansen 2023-11-29 11:07:22 -06:00 committed by GitHub
parent 47426a3ddc
commit a894146cee
2 changed files with 239 additions and 8 deletions
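
The tests in this diff expect the VoIP TTS path to stream only WAV audio and to raise ValueError when the TTS output is not WAV or is not 16 kHz, 16-bit, mono. A minimal sketch of that validation with the standard wave module is shown below for orientation; the helper name _extract_pcm and its exact placement are assumptions for illustration, not the actual change in homeassistant/components/voip/voip.py.

import io
import wave

def _extract_pcm(extension: str, data: bytes) -> bytes:
    """Return raw 16 kHz, 16-bit mono PCM, or raise ValueError (illustrative sketch)."""
    if extension != "wav":
        # Mirrors test_tts_wrong_extension: anything but WAV is rejected.
        raise ValueError(f"Only WAV audio is supported, got '{extension}'")

    with io.BytesIO(data) as wav_io, wave.open(wav_io, "rb") as wav_file:
        if (
            wav_file.getframerate() != 16000
            or wav_file.getsampwidth() != 2
            or wav_file.getnchannels() != 1
        ):
            # Mirrors test_tts_wrong_wav_format: the RTP stream needs 16 kHz, 16-bit, mono.
            raise ValueError("Expected 16 kHz, 16-bit mono WAV audio")
        return wav_file.readframes(wav_file.getnframes())

In the tests below, _send_tts is wrapped so that pytest.raises(ValueError) passes exactly when this kind of check fires.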


@@ -1,7 +1,9 @@
"""Test VoIP protocol."""
import asyncio
import io
import time
from unittest.mock import AsyncMock, Mock, patch
import wave
import pytest
@@ -14,6 +16,24 @@ _ONE_SECOND = 16000 * 2 # 16Khz 16-bit
_MEDIA_ID = "12345"
@pytest.fixture(autouse=True)
def mock_tts_cache_dir_autouse(mock_tts_cache_dir):
"""Mock the TTS cache dir with empty dir."""
return mock_tts_cache_dir

def _empty_wav() -> bytes:
"""Return bytes of an empty WAV file."""
with io.BytesIO() as wav_io:
wav_file: wave.Wave_write = wave.open(wav_io, "wb")
with wav_file:
wav_file.setframerate(16000)
wav_file.setsampwidth(2)
wav_file.setnchannels(1)
return wav_io.getvalue()
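
# Illustrative only, not part of this diff: reading _empty_wav() back with the
# wave module confirms a valid zero-frame file in the 16 kHz, 16-bit, mono
# format the RTP stream expects.
def _check_empty_wav_format() -> None:
    with io.BytesIO(_empty_wav()) as wav_io, wave.open(wav_io, "rb") as wav_file:
        assert wav_file.getframerate() == 16000
        assert wav_file.getsampwidth() == 2
        assert wav_file.getnchannels() == 1
        assert wav_file.getnframes() == 0
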
async def test_pipeline(
hass: HomeAssistant,
voip_device: VoIPDevice,
@@ -72,8 +92,7 @@ async def test_pipeline(
media_source_id: str,
) -> tuple[str, bytes]:
assert media_source_id == _MEDIA_ID
- return ("mp3", b"")
+ return ("wav", _empty_wav())
with patch(
"homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
@@ -266,7 +285,7 @@ async def test_tts_timeout(
media_source_id: str,
) -> tuple[str, bytes]:
# Should time out immediately
- return ("raw", bytes(0))
+ return ("wav", _empty_wav())
with patch(
"homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
@@ -305,8 +324,197 @@ async def test_tts_timeout(
done.set()
- rtp_protocol._async_send_audio = AsyncMock(side_effect=async_send_audio)
- rtp_protocol._send_tts = AsyncMock(side_effect=send_tts)
+ rtp_protocol._async_send_audio = AsyncMock(side_effect=async_send_audio) # type: ignore[method-assign]
+ rtp_protocol._send_tts = AsyncMock(side_effect=send_tts) # type: ignore[method-assign]
# silence
rtp_protocol.on_chunk(bytes(_ONE_SECOND))
# "speech"
rtp_protocol.on_chunk(bytes([255] * _ONE_SECOND * 2))
# silence (assumes relaxed VAD sensitivity)
rtp_protocol.on_chunk(bytes(_ONE_SECOND * 4))
# Wait for mock pipeline to exhaust the audio stream
async with asyncio.timeout(1):
await done.wait()

async def test_tts_wrong_extension(
hass: HomeAssistant,
voip_device: VoIPDevice,
) -> None:
"""Test that TTS will only stream WAV audio."""
assert await async_setup_component(hass, "voip", {})
def is_speech(self, chunk):
"""Anything non-zero is speech."""
return sum(chunk) > 0
done = asyncio.Event()
async def async_pipeline_from_audio_stream(*args, **kwargs):
stt_stream = kwargs["stt_stream"]
event_callback = kwargs["event_callback"]
async for _chunk in stt_stream:
# Stream will end when VAD detects end of "speech"
pass
# Fake intent result
event_callback(
assist_pipeline.PipelineEvent(
type=assist_pipeline.PipelineEventType.INTENT_END,
data={
"intent_output": {
"conversation_id": "fake-conversation",
}
},
)
)
# Proceed with media output
event_callback(
assist_pipeline.PipelineEvent(
type=assist_pipeline.PipelineEventType.TTS_END,
data={"tts_output": {"media_id": _MEDIA_ID}},
)
)
async def async_get_media_source_audio(
hass: HomeAssistant,
media_source_id: str,
) -> tuple[str, bytes]:
# Should fail because it's not "wav"
return ("mp3", b"")
with patch(
"homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
new=is_speech,
), patch(
"homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
new=async_pipeline_from_audio_stream,
), patch(
"homeassistant.components.voip.voip.tts.async_get_media_source_audio",
new=async_get_media_source_audio,
):
rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
hass,
hass.config.language,
voip_device,
Context(),
opus_payload_type=123,
)
rtp_protocol.transport = Mock()
original_send_tts = rtp_protocol._send_tts
async def send_tts(*args, **kwargs):
# Call original then end test successfully
with pytest.raises(ValueError):
await original_send_tts(*args, **kwargs)
done.set()
rtp_protocol._send_tts = AsyncMock(side_effect=send_tts) # type: ignore[method-assign]
# silence
rtp_protocol.on_chunk(bytes(_ONE_SECOND))
# "speech"
rtp_protocol.on_chunk(bytes([255] * _ONE_SECOND * 2))
# silence (assumes relaxed VAD sensitivity)
rtp_protocol.on_chunk(bytes(_ONE_SECOND * 4))
# Wait for mock pipeline to exhaust the audio stream
async with asyncio.timeout(1):
await done.wait()

async def test_tts_wrong_wav_format(
hass: HomeAssistant,
voip_device: VoIPDevice,
) -> None:
"""Test that TTS will only stream WAV audio with a specific format."""
assert await async_setup_component(hass, "voip", {})
def is_speech(self, chunk):
"""Anything non-zero is speech."""
return sum(chunk) > 0
done = asyncio.Event()
async def async_pipeline_from_audio_stream(*args, **kwargs):
stt_stream = kwargs["stt_stream"]
event_callback = kwargs["event_callback"]
async for _chunk in stt_stream:
# Stream will end when VAD detects end of "speech"
pass
# Fake intent result
event_callback(
assist_pipeline.PipelineEvent(
type=assist_pipeline.PipelineEventType.INTENT_END,
data={
"intent_output": {
"conversation_id": "fake-conversation",
}
},
)
)
# Proceed with media output
event_callback(
assist_pipeline.PipelineEvent(
type=assist_pipeline.PipelineEventType.TTS_END,
data={"tts_output": {"media_id": _MEDIA_ID}},
)
)
async def async_get_media_source_audio(
hass: HomeAssistant,
media_source_id: str,
) -> tuple[str, bytes]:
# Should fail because it's not 16Khz, 16-bit mono
with io.BytesIO() as wav_io:
wav_file: wave.Wave_write = wave.open(wav_io, "wb")
with wav_file:
wav_file.setframerate(22050)
wav_file.setsampwidth(2)
wav_file.setnchannels(2)
return ("wav", wav_io.getvalue())
with patch(
"homeassistant.components.assist_pipeline.vad.WebRtcVad.is_speech",
new=is_speech,
), patch(
"homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
new=async_pipeline_from_audio_stream,
), patch(
"homeassistant.components.voip.voip.tts.async_get_media_source_audio",
new=async_get_media_source_audio,
):
rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
hass,
hass.config.language,
voip_device,
Context(),
opus_payload_type=123,
)
rtp_protocol.transport = Mock()
original_send_tts = rtp_protocol._send_tts
async def send_tts(*args, **kwargs):
# Call original then end test successfully
with pytest.raises(ValueError):
await original_send_tts(*args, **kwargs)
done.set()
rtp_protocol._send_tts = AsyncMock(side_effect=send_tts) # type: ignore[method-assign]
# silence
rtp_protocol.on_chunk(bytes(_ONE_SECOND))