Add speech-to-text cooldown for local wake word (#108806)

* Deconflict based on wake word * Undo test * Make wake up key a string, rename error * Update snapshot * Change to "wake word phrase" and normalize * Move normalization into the wake provider * Working on describe * Use satellite info to resolve wake word phrase * Add test for wake word phrase * Match phrase with model name in wake word provider * Check model id * Use one constant wake word cooldown * Update homeassistant/components/assist_pipeline/error.py Co-authored-by: Paulus Schoutsen <balloob@gmail.com> * Fix wake word tests --------- Co-authored-by: Paulus Schoutsen <balloob@gmail.com>
2024-02-26 19:35:19 -06:00 · 2024-02-26 19:35:19 -06:00 · f6622ea8e0
commit f6622ea8e0
parent c38e0d22b8
20 changed files with 641 additions and 184 deletions
--- a/homeassistant/components/assist_pipeline/pipeline.py
+++ b/homeassistant/components/assist_pipeline/pipeline.py
@ -55,10 +55,11 @@ from .const import (
    CONF_DEBUG_RECORDING_DIR,
    DATA_CONFIG,
    DATA_LAST_WAKE_UP,
-    DEFAULT_WAKE_WORD_COOLDOWN,
    DOMAIN,
+    WAKE_WORD_COOLDOWN,
 )
 from .error import (
+    DuplicateWakeUpDetectedError,
    IntentRecognitionError,
    PipelineError,
    PipelineNotFound,
@ -453,9 +454,6 @@ class WakeWordSettings:
    audio_seconds_to_buffer: float = 0
    """Seconds of audio to buffer before detection and forward to STT."""

-    cooldown_seconds: float = DEFAULT_WAKE_WORD_COOLDOWN
-    """Seconds after a wake word detection where other detections are ignored."""
-

@dataclass(frozen=True)
 class AudioSettings:
@ -742,16 +740,22 @@ class PipelineRun:
            wake_word_output: dict[str, Any] = {}
        else:
            # Avoid duplicate detections by checking cooldown
-            wake_up_key = f"{self.wake_word_entity_id}.{result.wake_word_id}"
-            last_wake_up = self.hass.data[DATA_LAST_WAKE_UP].get(wake_up_key)
+            last_wake_up = self.hass.data[DATA_LAST_WAKE_UP].get(
+                result.wake_word_phrase
+            )
            if last_wake_up is not None:
                sec_since_last_wake_up = time.monotonic() - last_wake_up
-                if sec_since_last_wake_up < wake_word_settings.cooldown_seconds:
-                    _LOGGER.debug("Duplicate wake word detection occurred")
-                    raise WakeWordDetectionAborted
+                if sec_since_last_wake_up < WAKE_WORD_COOLDOWN:
+                    _LOGGER.debug(
+                        "Duplicate wake word detection occurred for %s",
+                        result.wake_word_phrase,
+                    )
+                    raise DuplicateWakeUpDetectedError(result.wake_word_phrase)

            # Record last wake up time to block duplicate detections
-            self.hass.data[DATA_LAST_WAKE_UP][wake_up_key] = time.monotonic()
+            self.hass.data[DATA_LAST_WAKE_UP][
+                result.wake_word_phrase
+            ] = time.monotonic()

            if result.queued_audio:
                # Add audio that was pending at detection.
@ -1308,6 +1312,9 @@ class PipelineInput:
    stt_stream: AsyncIterable[bytes] | None = None
    """Input audio for stt. Required when start_stage = stt."""

+    wake_word_phrase: str | None = None
+    """Optional key used to de-duplicate wake-ups for local wake word detection."""
+
    intent_input: str | None = None
    """Input for conversation agent. Required when start_stage = intent."""

@ -1352,6 +1359,25 @@ class PipelineInput:
                assert self.stt_metadata is not None
                assert stt_processed_stream is not None

+                if self.wake_word_phrase is not None:
+                    # Avoid duplicate wake-ups by checking cooldown
+                    last_wake_up = self.run.hass.data[DATA_LAST_WAKE_UP].get(
+                        self.wake_word_phrase
+                    )
+                    if last_wake_up is not None:
+                        sec_since_last_wake_up = time.monotonic() - last_wake_up
+                        if sec_since_last_wake_up < WAKE_WORD_COOLDOWN:
+                            _LOGGER.debug(
+                                "Speech-to-text cancelled to avoid duplicate wake-up for %s",
+                                self.wake_word_phrase,
+                            )
+                            raise DuplicateWakeUpDetectedError(self.wake_word_phrase)
+
+                    # Record last wake up time to block duplicate detections
+                    self.run.hass.data[DATA_LAST_WAKE_UP][
+                        self.wake_word_phrase
+                    ] = time.monotonic()
+
                stt_input_stream = stt_processed_stream

                if stt_audio_buffer: