Add pipeline VAD events (#98603)

* Add stt-vad-start and stt-vad-end pipeline events * Update tests
2023-08-17 18:58:58 -05:00 · 2023-08-17 18:58:58 -05:00 · 49d2c60992
commit 49d2c60992
parent c17f08a3f5
3 changed files with 34 additions and 6 deletions
--- a/homeassistant/components/assist_pipeline/pipeline.py
+++ b/homeassistant/components/assist_pipeline/pipeline.py
@ -254,6 +254,8 @@ class PipelineEventType(StrEnum):
    WAKE_WORD_START = "wake_word-start"
    WAKE_WORD_END = "wake_word-end"
    STT_START = "stt-start"
+    STT_VAD_START = "stt-vad-start"
+    STT_VAD_END = "stt-vad-end"
    STT_END = "stt-end"
    INTENT_START = "intent-start"
    INTENT_END = "intent-end"
@ -612,11 +614,31 @@ class PipelineRun:
                stream: AsyncIterable[bytes],
            ) -> AsyncGenerator[bytes, None]:
                """Stop stream when voice command is finished."""
+                sent_vad_start = False
+                timestamp_ms = 0
                async for chunk in stream:
                    if not segmenter.process(chunk):
+                        # Silence detected at the end of voice command
+                        self.process_event(
+                            PipelineEvent(
+                                PipelineEventType.STT_VAD_END,
+                                {"timestamp": timestamp_ms},
+                            )
+                        )
                        break

+                    if segmenter.in_command and (not sent_vad_start):
+                        # Speech detected at start of voice command
+                        self.process_event(
+                            PipelineEvent(
+                                PipelineEventType.STT_VAD_START,
+                                {"timestamp": timestamp_ms},
+                            )
+                        )
+                        sent_vad_start = True
+
                    yield chunk
+                    timestamp_ms += (len(chunk) // 2) // 16  # milliseconds @ 16Khz

            # Transcribe audio stream
            result = await self.stt_provider.async_process_audio_stream(
--- a/tests/components/assist_pipeline/snapshots/test_init.ambr
+++ b/tests/components/assist_pipeline/snapshots/test_init.ambr
@ -311,6 +311,12 @@
      }),
      'type': <PipelineEventType.STT_START: 'stt-start'>,
    }),
+    dict({
+      'data': dict({
+        'timestamp': 0,
+      }),
+      'type': <PipelineEventType.STT_VAD_START: 'stt-vad-start'>,
+    }),
    dict({
      'data': dict({
        'stt_output': dict({
--- a/tests/components/assist_pipeline/test_init.py
+++ b/tests/components/assist_pipeline/test_init.py
@ -40,7 +40,7 @@ async def test_pipeline_from_audio_stream_auto(
    In this test, no pipeline is specified.
    """

-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    async def audio_data():
        yield b"part1"
@ -79,7 +79,7 @@ async def test_pipeline_from_audio_stream_legacy(
    """
    client = await hass_ws_client(hass)

-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    async def audio_data():
        yield b"part1"
@ -139,7 +139,7 @@ async def test_pipeline_from_audio_stream_entity(
    """
    client = await hass_ws_client(hass)

-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    async def audio_data():
        yield b"part1"
@ -199,7 +199,7 @@ async def test_pipeline_from_audio_stream_no_stt(
    """
    client = await hass_ws_client(hass)

-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    async def audio_data():
        yield b"part1"
@ -257,7 +257,7 @@ async def test_pipeline_from_audio_stream_unknown_pipeline(

    In this test, the pipeline does not exist.
    """
-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    async def audio_data():
        yield b"part1"
@ -294,7 +294,7 @@ async def test_pipeline_from_audio_stream_wake_word(
 ) -> None:
    """Test creating a pipeline from an audio stream with wake word."""

-    events = []
+    events: list[assist_pipeline.PipelineEvent] = []

    # [0, 1, ...]
    wake_chunk_1 = bytes(it.islice(it.cycle(range(256)), BYTES_ONE_SECOND))