Adjust VAD seconds better for microVAD (#123942)

This commit is contained in:
Michael Hansen 2024-08-15 04:08:40 -05:00 committed by GitHub
parent 9b78ae5908
commit f2d39feec0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 15 deletions

View file

@ -6,13 +6,11 @@ from collections.abc import Callable, Iterable
from dataclasses import dataclass from dataclasses import dataclass
from enum import StrEnum from enum import StrEnum
import logging import logging
from typing import Final
from .const import SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
_SAMPLE_RATE: Final = 16000 # Hz
_SAMPLE_WIDTH: Final = 2 # bytes
class VadSensitivity(StrEnum): class VadSensitivity(StrEnum):
"""How quickly the end of a voice command is detected.""" """How quickly the end of a voice command is detected."""
@ -26,12 +24,12 @@ class VadSensitivity(StrEnum):
"""Return seconds of silence for sensitivity level.""" """Return seconds of silence for sensitivity level."""
sensitivity = VadSensitivity(sensitivity) sensitivity = VadSensitivity(sensitivity)
if sensitivity == VadSensitivity.RELAXED: if sensitivity == VadSensitivity.RELAXED:
return 2.0 return 1.25
if sensitivity == VadSensitivity.AGGRESSIVE: if sensitivity == VadSensitivity.AGGRESSIVE:
return 0.5 return 0.25
return 1.0 return 0.7
class AudioBuffer: class AudioBuffer:
@ -80,7 +78,7 @@ class VoiceCommandSegmenter:
speech_seconds: float = 0.3 speech_seconds: float = 0.3
"""Seconds of speech before voice command has started.""" """Seconds of speech before voice command has started."""
silence_seconds: float = 1.0 silence_seconds: float = 0.7
"""Seconds of silence after voice command has ended.""" """Seconds of silence after voice command has ended."""
timeout_seconds: float = 15.0 timeout_seconds: float = 15.0
@ -92,6 +90,9 @@ class VoiceCommandSegmenter:
in_command: bool = False in_command: bool = False
"""True if inside voice command.""" """True if inside voice command."""
timed_out: bool = False
"""True if a timeout occurred during voice command."""
_speech_seconds_left: float = 0.0 _speech_seconds_left: float = 0.0
"""Seconds left before considering voice command as started.""" """Seconds left before considering voice command as started."""
@ -121,6 +122,9 @@ class VoiceCommandSegmenter:
Returns False when command is done. Returns False when command is done.
""" """
if self.timed_out:
self.timed_out = False
self._timeout_seconds_left -= chunk_seconds self._timeout_seconds_left -= chunk_seconds
if self._timeout_seconds_left <= 0: if self._timeout_seconds_left <= 0:
_LOGGER.warning( _LOGGER.warning(
@ -128,6 +132,7 @@ class VoiceCommandSegmenter:
self.timeout_seconds, self.timeout_seconds,
) )
self.reset() self.reset()
self.timed_out = True
return False return False
if not self.in_command: if not self.in_command:
@ -179,7 +184,9 @@ class VoiceCommandSegmenter:
""" """
if vad_samples_per_chunk is None: if vad_samples_per_chunk is None:
# No chunking # No chunking
chunk_seconds = (len(chunk) // _SAMPLE_WIDTH) / _SAMPLE_RATE chunk_seconds = (
len(chunk) // (SAMPLE_WIDTH * SAMPLE_CHANNELS)
) / SAMPLE_RATE
is_speech = vad_is_speech(chunk) is_speech = vad_is_speech(chunk)
return self.process(chunk_seconds, is_speech) return self.process(chunk_seconds, is_speech)
@ -187,8 +194,8 @@ class VoiceCommandSegmenter:
raise ValueError("leftover_chunk_buffer is required when vad uses chunking") raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
# With chunking # With chunking
seconds_per_chunk = vad_samples_per_chunk / _SAMPLE_RATE seconds_per_chunk = vad_samples_per_chunk / SAMPLE_RATE
bytes_per_chunk = vad_samples_per_chunk * _SAMPLE_WIDTH bytes_per_chunk = vad_samples_per_chunk * (SAMPLE_WIDTH * SAMPLE_CHANNELS)
for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer): for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
is_speech = vad_is_speech(vad_chunk) is_speech = vad_is_speech(vad_chunk)
if not self.process(seconds_per_chunk, is_speech): if not self.process(seconds_per_chunk, is_speech):

View file

@ -17,15 +17,12 @@ def test_silence() -> None:
# True return value indicates voice command has not finished # True return value indicates voice command has not finished
assert segmenter.process(_ONE_SECOND * 3, False) assert segmenter.process(_ONE_SECOND * 3, False)
assert not segmenter.in_command
def test_speech() -> None: def test_speech() -> None:
"""Test that silence + speech + silence triggers a voice command.""" """Test that silence + speech + silence triggers a voice command."""
def is_speech(chunk):
"""Anything non-zero is speech."""
return sum(chunk) > 0
segmenter = VoiceCommandSegmenter() segmenter = VoiceCommandSegmenter()
# silence # silence
@ -33,10 +30,12 @@ def test_speech() -> None:
# "speech" # "speech"
assert segmenter.process(_ONE_SECOND, True) assert segmenter.process(_ONE_SECOND, True)
assert segmenter.in_command
# silence # silence
# False return value indicates voice command is finished # False return value indicates voice command is finished
assert not segmenter.process(_ONE_SECOND, False) assert not segmenter.process(_ONE_SECOND, False)
assert not segmenter.in_command
def test_audio_buffer() -> None: def test_audio_buffer() -> None:
@ -105,3 +104,105 @@ def test_chunk_samples_leftover() -> None:
assert len(chunks) == 1 assert len(chunks) == 1
assert leftover_chunk_buffer.bytes() == bytes([5, 6]) assert leftover_chunk_buffer.bytes() == bytes([5, 6])
def test_silence_seconds() -> None:
    """Verify the command ends after the configured trailing silence."""
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0)
    half_second = _ONE_SECOND * 0.5

    # Each step: (audio seconds, is_speech, process() result, in_command after)
    steps = (
        (_ONE_SECOND, False, True, False),  # leading silence
        (_ONE_SECOND, True, True, True),  # "speech"
        (half_second, False, True, True),  # not enough silence to end
        (half_second, False, False, False),  # exactly enough silence now
    )

    for seconds, is_speech, keep_going, in_command in steps:
        assert bool(segmenter.process(seconds, is_speech)) is keep_going
        assert bool(segmenter.in_command) is in_command
def test_silence_reset() -> None:
    """Verify that speech restarts the end-of-command silence countdown."""
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)
    half_second = _ONE_SECOND * 0.5

    # Each step: (audio seconds, is_speech, process() result, in_command after)
    steps = (
        (_ONE_SECOND, False, True, False),  # leading silence
        (_ONE_SECOND, True, True, True),  # "speech"
        (half_second, False, True, True),  # not enough silence to end
        (half_second, True, True, True),  # speech should reset silence detection
        (half_second, False, True, True),  # again not enough silence to end
        (half_second, False, False, False),  # exactly enough silence now
    )

    for seconds, is_speech, keep_going, in_command in steps:
        assert bool(segmenter.process(seconds, is_speech)) is keep_going
        assert bool(segmenter.in_command) is in_command
def test_speech_reset() -> None:
    """Verify that silence restarts the start-of-command speech countdown."""
    segmenter = VoiceCommandSegmenter(
        silence_seconds=1.0, reset_seconds=0.5, speech_seconds=1.0
    )
    half_second = _ONE_SECOND * 0.5

    # Each step: (audio seconds, is_speech, in_command after).
    # process() is expected to return a truthy value at every step.
    steps = (
        (_ONE_SECOND, False, False),  # leading silence
        (half_second, True, False),  # not enough speech to start voice command
        (_ONE_SECOND, False, False),  # silence should reset speech detection
        (half_second, True, False),  # not enough speech to start voice command
        (half_second, True, True),  # exactly enough speech now
    )

    for seconds, is_speech, in_command in steps:
        assert segmenter.process(seconds, is_speech)
        assert bool(segmenter.in_command) is in_command
def test_timeout() -> None:
    """Verify the timeout is reported and the flag clears on further audio."""
    segmenter = VoiceCommandSegmenter(timeout_seconds=1.0)
    half_second = _ONE_SECOND * 0.5

    # Nothing has been processed yet, so no timeout is flagged
    assert not segmenter.timed_out

    # Each step: (audio seconds, is_speech, process() result, timed_out after)
    steps = (
        (half_second, False, True, False),  # not enough to time out
        (half_second, True, False, True),  # enough to time out
        (half_second, True, True, False),  # flag resets with more audio
        (half_second, False, False, True),  # times out a second time
    )

    for seconds, is_speech, keep_going, timed_out in steps:
        assert bool(segmenter.process(seconds, is_speech)) is keep_going
        assert bool(segmenter.timed_out) is timed_out