Adjust VAD seconds better for microVAD (#123942)

This commit is contained in:
Michael Hansen 2024-08-15 04:08:40 -05:00 committed by GitHub
parent 9b78ae5908
commit f2d39feec0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 15 deletions

View file

@ -6,13 +6,11 @@ from collections.abc import Callable, Iterable
from dataclasses import dataclass
from enum import StrEnum
import logging
from typing import Final
from .const import SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
_LOGGER = logging.getLogger(__name__)
_SAMPLE_RATE: Final = 16000 # Hz
_SAMPLE_WIDTH: Final = 2 # bytes
class VadSensitivity(StrEnum):
"""How quickly the end of a voice command is detected."""
@ -26,12 +24,12 @@ class VadSensitivity(StrEnum):
"""Return seconds of silence for sensitivity level."""
sensitivity = VadSensitivity(sensitivity)
if sensitivity == VadSensitivity.RELAXED:
return 2.0
return 1.25
if sensitivity == VadSensitivity.AGGRESSIVE:
return 0.5
return 0.25
return 1.0
return 0.7
class AudioBuffer:
@ -80,7 +78,7 @@ class VoiceCommandSegmenter:
speech_seconds: float = 0.3
"""Seconds of speech before voice command has started."""
silence_seconds: float = 1.0
silence_seconds: float = 0.7
"""Seconds of silence after voice command has ended."""
timeout_seconds: float = 15.0
@ -92,6 +90,9 @@ class VoiceCommandSegmenter:
in_command: bool = False
"""True if inside voice command."""
timed_out: bool = False
"""True if a timeout occurred during voice command."""
_speech_seconds_left: float = 0.0
"""Seconds left before considering voice command as started."""
@ -121,6 +122,9 @@ class VoiceCommandSegmenter:
Returns False when command is done.
"""
if self.timed_out:
self.timed_out = False
self._timeout_seconds_left -= chunk_seconds
if self._timeout_seconds_left <= 0:
_LOGGER.warning(
@ -128,6 +132,7 @@ class VoiceCommandSegmenter:
self.timeout_seconds,
)
self.reset()
self.timed_out = True
return False
if not self.in_command:
@ -179,7 +184,9 @@ class VoiceCommandSegmenter:
"""
if vad_samples_per_chunk is None:
# No chunking
chunk_seconds = (len(chunk) // _SAMPLE_WIDTH) / _SAMPLE_RATE
chunk_seconds = (
len(chunk) // (SAMPLE_WIDTH * SAMPLE_CHANNELS)
) / SAMPLE_RATE
is_speech = vad_is_speech(chunk)
return self.process(chunk_seconds, is_speech)
@ -187,8 +194,8 @@ class VoiceCommandSegmenter:
raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
# With chunking
seconds_per_chunk = vad_samples_per_chunk / _SAMPLE_RATE
bytes_per_chunk = vad_samples_per_chunk * _SAMPLE_WIDTH
seconds_per_chunk = vad_samples_per_chunk / SAMPLE_RATE
bytes_per_chunk = vad_samples_per_chunk * (SAMPLE_WIDTH * SAMPLE_CHANNELS)
for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
is_speech = vad_is_speech(vad_chunk)
if not self.process(seconds_per_chunk, is_speech):

View file

@ -17,15 +17,12 @@ def test_silence() -> None:
# True return value indicates voice command has not finished
assert segmenter.process(_ONE_SECOND * 3, False)
assert not segmenter.in_command
def test_speech() -> None:
"""Test that silence + speech + silence triggers a voice command."""
def is_speech(chunk):
"""Anything non-zero is speech."""
return sum(chunk) > 0
segmenter = VoiceCommandSegmenter()
# silence
@ -33,10 +30,12 @@ def test_speech() -> None:
# "speech"
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.in_command
# silence
# False return value indicates voice command is finished
assert not segmenter.process(_ONE_SECOND, False)
assert not segmenter.in_command
def test_audio_buffer() -> None:
@ -105,3 +104,105 @@ def test_chunk_samples_leftover() -> None:
assert len(chunks) == 1
assert leftover_chunk_buffer.bytes() == bytes([5, 6])
def test_silence_seconds() -> None:
    """Verify a voice command ends once enough silence has accumulated."""
    half_second = _ONE_SECOND * 0.5
    vad = VoiceCommandSegmenter(silence_seconds=1.0)

    # Each row: (chunk seconds, is_speech, process() result, in_command after)
    steps = (
        (_ONE_SECOND, False, True, False),  # leading silence
        (_ONE_SECOND, True, True, True),  # "speech" begins the command
        (half_second, False, True, True),  # not enough silence to end
        (half_second, False, False, False),  # exactly enough silence now
    )
    for seconds, is_speech, keeps_going, in_command in steps:
        # A falsy process() result means the voice command has finished.
        assert bool(vad.process(seconds, is_speech)) is keeps_going
        assert bool(vad.in_command) is in_command
def test_silence_reset() -> None:
    """Verify that speech restarts the end-of-command silence countdown."""
    half_second = _ONE_SECOND * 0.5
    vad = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)

    # Each row: (chunk seconds, is_speech, process() result, in_command after)
    steps = (
        (_ONE_SECOND, False, True, False),  # leading silence
        (_ONE_SECOND, True, True, True),  # "speech" begins the command
        (half_second, False, True, True),  # not enough silence to end
        (half_second, True, True, True),  # speech resets silence detection
        (half_second, False, True, True),  # again not enough silence to end
        (half_second, False, False, False),  # exactly enough silence now
    )
    for seconds, is_speech, keeps_going, in_command in steps:
        # A falsy process() result means the voice command has finished.
        assert bool(vad.process(seconds, is_speech)) is keeps_going
        assert bool(vad.in_command) is in_command
def test_speech_reset() -> None:
    """Verify that silence restarts the start-of-command speech countdown."""
    half_second = _ONE_SECOND * 0.5
    vad = VoiceCommandSegmenter(
        silence_seconds=1.0,
        reset_seconds=0.5,
        speech_seconds=1.0,
    )

    # Each row: (chunk seconds, is_speech, process() result, in_command after)
    steps = (
        (_ONE_SECOND, False, True, False),  # leading silence
        (half_second, True, True, False),  # not enough speech to start
        (_ONE_SECOND, False, True, False),  # silence resets speech detection
        (half_second, True, True, False),  # again not enough speech to start
        (half_second, True, True, True),  # exactly enough speech now
    )
    for seconds, is_speech, keeps_going, in_command in steps:
        assert bool(vad.process(seconds, is_speech)) is keeps_going
        assert bool(vad.in_command) is in_command
def test_timeout() -> None:
    """Verify that voice command detection times out and the flag resets."""
    half_second = _ONE_SECOND * 0.5
    vad = VoiceCommandSegmenter(timeout_seconds=1.0)

    # Nothing has been processed yet, so no timeout can have happened.
    assert not vad.timed_out

    # Each row: (is_speech, process() result, timed_out after)
    steps = (
        (False, True, False),  # not enough audio to time out
        (True, False, True),  # enough audio to time out
        (True, True, False),  # flag resets with more audio
        (False, False, True),  # second timeout after another full period
    )
    for is_speech, keeps_going, timed_out in steps:
        assert bool(vad.process(half_second, is_speech)) is keeps_going
        assert bool(vad.timed_out) is timed_out