Adjust VAD seconds better for microVAD (#123942)

2024-08-15 04:08:40 -05:00 · 2024-08-15 04:08:40 -05:00 · f2d39feec0
commit f2d39feec0
parent 9b78ae5908
2 changed files with 123 additions and 15 deletions
--- a/homeassistant/components/assist_pipeline/vad.py
+++ b/homeassistant/components/assist_pipeline/vad.py
@ -6,13 +6,11 @@ from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from enum import StrEnum
 import logging
-from typing import Final
+
 from .const import SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
 _LOGGER = logging.getLogger(__name__)
 _SAMPLE_RATE: Final = 16000  # Hz
 _SAMPLE_WIDTH: Final = 2  # bytes
 class VadSensitivity(StrEnum):
    """How quickly the end of a voice command is detected."""
@ -26,12 +24,12 @@ class VadSensitivity(StrEnum):
        """Return seconds of silence for sensitivity level."""
        sensitivity = VadSensitivity(sensitivity)
        if sensitivity == VadSensitivity.RELAXED:
-            return 2.0
+            return 1.25
        if sensitivity == VadSensitivity.AGGRESSIVE:
-            return 0.5
+            return 0.25
-        return 1.0
+        return 0.7
 class AudioBuffer:
@ -80,7 +78,7 @@ class VoiceCommandSegmenter:
    speech_seconds: float = 0.3
    """Seconds of speech before voice command has started."""
-    silence_seconds: float = 1.0
+    silence_seconds: float = 0.7
    """Seconds of silence after voice command has ended."""
    timeout_seconds: float = 15.0
@ -92,6 +90,9 @@ class VoiceCommandSegmenter:
    in_command: bool = False
    """True if inside voice command."""
    timed_out: bool = False
    """True a timeout occurred during voice command."""
    _speech_seconds_left: float = 0.0
    """Seconds left before considering voice command as started."""
@ -121,6 +122,9 @@ class VoiceCommandSegmenter:
        Returns False when command is done.
        """
        if self.timed_out:
            self.timed_out = False
        self._timeout_seconds_left -= chunk_seconds
        if self._timeout_seconds_left <= 0:
            _LOGGER.warning(
@ -128,6 +132,7 @@ class VoiceCommandSegmenter:
                self.timeout_seconds,
            )
            self.reset()
            self.timed_out = True
            return False
        if not self.in_command:
@ -179,7 +184,9 @@ class VoiceCommandSegmenter:
        """
        if vad_samples_per_chunk is None:
            # No chunking
-            chunk_seconds = (len(chunk) // _SAMPLE_WIDTH) / _SAMPLE_RATE
+            chunk_seconds = (
                len(chunk) // (SAMPLE_WIDTH * SAMPLE_CHANNELS)
            ) / SAMPLE_RATE
            is_speech = vad_is_speech(chunk)
            return self.process(chunk_seconds, is_speech)
@ -187,8 +194,8 @@ class VoiceCommandSegmenter:
            raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
        # With chunking
-        seconds_per_chunk = vad_samples_per_chunk / _SAMPLE_RATE
+        seconds_per_chunk = vad_samples_per_chunk / SAMPLE_RATE
-        bytes_per_chunk = vad_samples_per_chunk * _SAMPLE_WIDTH
+        bytes_per_chunk = vad_samples_per_chunk * (SAMPLE_WIDTH * SAMPLE_CHANNELS)
        for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
            is_speech = vad_is_speech(vad_chunk)
            if not self.process(seconds_per_chunk, is_speech):
--- a/tests/components/assist_pipeline/test_vad.py
+++ b/tests/components/assist_pipeline/test_vad.py
@ -17,15 +17,12 @@ def test_silence() -> None:
    # True return value indicates voice command has not finished
    assert segmenter.process(_ONE_SECOND * 3, False)
    assert not segmenter.in_command
 def test_speech() -> None:
    """Test that silence + speech + silence triggers a voice command."""
    def is_speech(chunk):
        """Anything non-zero is speech."""
        return sum(chunk) > 0
    segmenter = VoiceCommandSegmenter()
    # silence
@ -33,10 +30,12 @@ def test_speech() -> None:
    # "speech"
    assert segmenter.process(_ONE_SECOND, True)
    assert segmenter.in_command
    # silence
    # False return value indicates voice command is finished
    assert not segmenter.process(_ONE_SECOND, False)
    assert not segmenter.in_command
 def test_audio_buffer() -> None:
@ -105,3 +104,105 @@ def test_chunk_samples_leftover() -> None:
    assert len(chunks) == 1
    assert leftover_chunk_buffer.bytes() == bytes([5, 6])
 def test_silence_seconds() -> None:
    """Test end of voice command silence seconds.

    The segmenter is built with silence_seconds=1.0. Once a command has
    started, half of that amount of silence must leave the command open and
    the second half must finish it.
    """
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0)
    # silence before any speech: process() returns True (command not finished)
    # and no command has started yet
    assert segmenter.process(_ONE_SECOND, False)
    assert not segmenter.in_command
    # "speech": expected to start the voice command
    # (presumably _ONE_SECOND is one second of audio time -- defined elsewhere)
    assert segmenter.process(_ONE_SECOND, True)
    assert segmenter.in_command
    # not enough silence to end: only half of silence_seconds has been fed
    assert segmenter.process(_ONE_SECOND * 0.5, False)
    assert segmenter.in_command
    # exactly enough silence now: the two halves add up to silence_seconds,
    # so process() returns False and the command is over
    assert not segmenter.process(_ONE_SECOND * 0.5, False)
    assert not segmenter.in_command
 def test_silence_reset() -> None:
    """Test that speech resets end of voice command detection.

    Silence that is interrupted by speech must not count towards ending the
    command: after the interruption a full silence_seconds of new silence is
    expected before process() reports the command as finished.
    """
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)
    # silence before any speech: nothing has started
    assert segmenter.process(_ONE_SECOND, False)
    assert not segmenter.in_command
    # "speech": expected to start the voice command
    assert segmenter.process(_ONE_SECOND, True)
    assert segmenter.in_command
    # not enough silence to end (half of silence_seconds)
    assert segmenter.process(_ONE_SECOND * 0.5, False)
    assert segmenter.in_command
    # speech should reset silence detection
    # (this chunk is as long as reset_seconds=0.5 given to the constructor)
    assert segmenter.process(_ONE_SECOND * 0.5, True)
    assert segmenter.in_command
    # not enough silence to end: the earlier half no longer counts, so this
    # half alone must keep the command open
    assert segmenter.process(_ONE_SECOND * 0.5, False)
    assert segmenter.in_command
    # exactly enough silence now: a full silence_seconds since the last speech
    assert not segmenter.process(_ONE_SECOND * 0.5, False)
    assert not segmenter.in_command
 def test_speech_reset() -> None:
    """Test that silence resets start of voice command detection.

    With speech_seconds=1.0, speech that is interrupted by silence must not
    count towards starting a command: the command is only expected to start
    after a full speech_seconds of speech following the silence.
    """
    segmenter = VoiceCommandSegmenter(
        silence_seconds=1.0, reset_seconds=0.5, speech_seconds=1.0
    )
    # silence before any speech: nothing has started
    assert segmenter.process(_ONE_SECOND, False)
    assert not segmenter.in_command
    # not enough speech to start voice command (half of speech_seconds)
    assert segmenter.process(_ONE_SECOND * 0.5, True)
    assert not segmenter.in_command
    # silence should reset speech detection
    # (this chunk is longer than reset_seconds=0.5 given to the constructor)
    assert segmenter.process(_ONE_SECOND, False)
    assert not segmenter.in_command
    # not enough speech to start voice command: the earlier half no longer
    # counts, so this half alone must not start the command
    assert segmenter.process(_ONE_SECOND * 0.5, True)
    assert not segmenter.in_command
    # exactly enough speech now: a full speech_seconds since the silence
    assert segmenter.process(_ONE_SECOND * 0.5, True)
    assert segmenter.in_command
 def test_timeout() -> None:
    """Test that voice command detection times out.

    With timeout_seconds=1.0, process() is expected to return False and set
    timed_out once the fed audio uses up the timeout, regardless of whether
    the chunks are speech or silence. The flag is expected to clear on the
    next call and the timeout to trigger again after another full period.
    """
    segmenter = VoiceCommandSegmenter(timeout_seconds=1.0)
    # not enough to time out: half of timeout_seconds
    assert not segmenter.timed_out
    assert segmenter.process(_ONE_SECOND * 0.5, False)
    assert not segmenter.timed_out
    # enough to time out: the two halves add up to timeout_seconds, so
    # process() returns False and raises the flag
    assert not segmenter.process(_ONE_SECOND * 0.5, True)
    assert segmenter.timed_out
    # flag resets with more audio: the next call clears timed_out and the
    # segmenter keeps going (returns True)
    assert segmenter.process(_ONE_SECOND * 0.5, True)
    assert not segmenter.timed_out
    # second timeout after another full timeout_seconds of audio
    # (presumably reset() restores the timeout budget -- not visible here)
    assert not segmenter.process(_ONE_SECOND * 0.5, False)
    assert segmenter.timed_out