Adjust VAD seconds better for microVAD (#123942)
This commit is contained in:
parent
9b78ae5908
commit
f2d39feec0
2 changed files with 123 additions and 15 deletions
|
@ -6,13 +6,11 @@ from collections.abc import Callable, Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import StrEnum
|
from enum import StrEnum
|
||||||
import logging
|
import logging
|
||||||
from typing import Final
|
|
||||||
|
from .const import SAMPLE_CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
|
||||||
|
|
||||||
_LOGGER = logging.getLogger(__name__)
|
_LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
_SAMPLE_RATE: Final = 16000 # Hz
|
|
||||||
_SAMPLE_WIDTH: Final = 2 # bytes
|
|
||||||
|
|
||||||
|
|
||||||
class VadSensitivity(StrEnum):
|
class VadSensitivity(StrEnum):
|
||||||
"""How quickly the end of a voice command is detected."""
|
"""How quickly the end of a voice command is detected."""
|
||||||
|
@ -26,12 +24,12 @@ class VadSensitivity(StrEnum):
|
||||||
"""Return seconds of silence for sensitivity level."""
|
"""Return seconds of silence for sensitivity level."""
|
||||||
sensitivity = VadSensitivity(sensitivity)
|
sensitivity = VadSensitivity(sensitivity)
|
||||||
if sensitivity == VadSensitivity.RELAXED:
|
if sensitivity == VadSensitivity.RELAXED:
|
||||||
return 2.0
|
return 1.25
|
||||||
|
|
||||||
if sensitivity == VadSensitivity.AGGRESSIVE:
|
if sensitivity == VadSensitivity.AGGRESSIVE:
|
||||||
return 0.5
|
return 0.25
|
||||||
|
|
||||||
return 1.0
|
return 0.7
|
||||||
|
|
||||||
|
|
||||||
class AudioBuffer:
|
class AudioBuffer:
|
||||||
|
@ -80,7 +78,7 @@ class VoiceCommandSegmenter:
|
||||||
speech_seconds: float = 0.3
|
speech_seconds: float = 0.3
|
||||||
"""Seconds of speech before voice command has started."""
|
"""Seconds of speech before voice command has started."""
|
||||||
|
|
||||||
silence_seconds: float = 1.0
|
silence_seconds: float = 0.7
|
||||||
"""Seconds of silence after voice command has ended."""
|
"""Seconds of silence after voice command has ended."""
|
||||||
|
|
||||||
timeout_seconds: float = 15.0
|
timeout_seconds: float = 15.0
|
||||||
|
@ -92,6 +90,9 @@ class VoiceCommandSegmenter:
|
||||||
in_command: bool = False
|
in_command: bool = False
|
||||||
"""True if inside voice command."""
|
"""True if inside voice command."""
|
||||||
|
|
||||||
|
timed_out: bool = False
|
||||||
|
"""True if a timeout occurred during voice command."""
|
||||||
|
|
||||||
_speech_seconds_left: float = 0.0
|
_speech_seconds_left: float = 0.0
|
||||||
"""Seconds left before considering voice command as started."""
|
"""Seconds left before considering voice command as started."""
|
||||||
|
|
||||||
|
@ -121,6 +122,9 @@ class VoiceCommandSegmenter:
|
||||||
|
|
||||||
Returns False when command is done.
|
Returns False when command is done.
|
||||||
"""
|
"""
|
||||||
|
if self.timed_out:
|
||||||
|
self.timed_out = False
|
||||||
|
|
||||||
self._timeout_seconds_left -= chunk_seconds
|
self._timeout_seconds_left -= chunk_seconds
|
||||||
if self._timeout_seconds_left <= 0:
|
if self._timeout_seconds_left <= 0:
|
||||||
_LOGGER.warning(
|
_LOGGER.warning(
|
||||||
|
@ -128,6 +132,7 @@ class VoiceCommandSegmenter:
|
||||||
self.timeout_seconds,
|
self.timeout_seconds,
|
||||||
)
|
)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
self.timed_out = True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if not self.in_command:
|
if not self.in_command:
|
||||||
|
@ -179,7 +184,9 @@ class VoiceCommandSegmenter:
|
||||||
"""
|
"""
|
||||||
if vad_samples_per_chunk is None:
|
if vad_samples_per_chunk is None:
|
||||||
# No chunking
|
# No chunking
|
||||||
chunk_seconds = (len(chunk) // _SAMPLE_WIDTH) / _SAMPLE_RATE
|
chunk_seconds = (
|
||||||
|
len(chunk) // (SAMPLE_WIDTH * SAMPLE_CHANNELS)
|
||||||
|
) / SAMPLE_RATE
|
||||||
is_speech = vad_is_speech(chunk)
|
is_speech = vad_is_speech(chunk)
|
||||||
return self.process(chunk_seconds, is_speech)
|
return self.process(chunk_seconds, is_speech)
|
||||||
|
|
||||||
|
@ -187,8 +194,8 @@ class VoiceCommandSegmenter:
|
||||||
raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
|
raise ValueError("leftover_chunk_buffer is required when vad uses chunking")
|
||||||
|
|
||||||
# With chunking
|
# With chunking
|
||||||
seconds_per_chunk = vad_samples_per_chunk / _SAMPLE_RATE
|
seconds_per_chunk = vad_samples_per_chunk / SAMPLE_RATE
|
||||||
bytes_per_chunk = vad_samples_per_chunk * _SAMPLE_WIDTH
|
bytes_per_chunk = vad_samples_per_chunk * (SAMPLE_WIDTH * SAMPLE_CHANNELS)
|
||||||
for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
|
for vad_chunk in chunk_samples(chunk, bytes_per_chunk, leftover_chunk_buffer):
|
||||||
is_speech = vad_is_speech(vad_chunk)
|
is_speech = vad_is_speech(vad_chunk)
|
||||||
if not self.process(seconds_per_chunk, is_speech):
|
if not self.process(seconds_per_chunk, is_speech):
|
||||||
|
|
|
@ -17,15 +17,12 @@ def test_silence() -> None:
|
||||||
|
|
||||||
# True return value indicates voice command has not finished
|
# True return value indicates voice command has not finished
|
||||||
assert segmenter.process(_ONE_SECOND * 3, False)
|
assert segmenter.process(_ONE_SECOND * 3, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
|
||||||
def test_speech() -> None:
|
def test_speech() -> None:
|
||||||
"""Test that silence + speech + silence triggers a voice command."""
|
"""Test that silence + speech + silence triggers a voice command."""
|
||||||
|
|
||||||
def is_speech(chunk):
|
|
||||||
"""Anything non-zero is speech."""
|
|
||||||
return sum(chunk) > 0
|
|
||||||
|
|
||||||
segmenter = VoiceCommandSegmenter()
|
segmenter = VoiceCommandSegmenter()
|
||||||
|
|
||||||
# silence
|
# silence
|
||||||
|
@ -33,10 +30,12 @@ def test_speech() -> None:
|
||||||
|
|
||||||
# "speech"
|
# "speech"
|
||||||
assert segmenter.process(_ONE_SECOND, True)
|
assert segmenter.process(_ONE_SECOND, True)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
# silence
|
# silence
|
||||||
# False return value indicates voice command is finished
|
# False return value indicates voice command is finished
|
||||||
assert not segmenter.process(_ONE_SECOND, False)
|
assert not segmenter.process(_ONE_SECOND, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
|
||||||
def test_audio_buffer() -> None:
|
def test_audio_buffer() -> None:
|
||||||
|
@ -105,3 +104,105 @@ def test_chunk_samples_leftover() -> None:
|
||||||
|
|
||||||
assert len(chunks) == 1
|
assert len(chunks) == 1
|
||||||
assert leftover_chunk_buffer.bytes() == bytes([5, 6])
|
assert leftover_chunk_buffer.bytes() == bytes([5, 6])
|
||||||
|
|
||||||
|
|
||||||
|
def test_silence_seconds() -> None:
|
||||||
|
"""Test end of voice command silence seconds."""
|
||||||
|
|
||||||
|
segmenter = VoiceCommandSegmenter(silence_seconds=1.0)
|
||||||
|
|
||||||
|
# silence
|
||||||
|
assert segmenter.process(_ONE_SECOND, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# "speech"
|
||||||
|
assert segmenter.process(_ONE_SECOND, True)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# not enough silence to end
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# exactly enough silence now
|
||||||
|
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
|
||||||
|
def test_silence_reset() -> None:
|
||||||
|
"""Test that speech resets end of voice command detection."""
|
||||||
|
|
||||||
|
segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)
|
||||||
|
|
||||||
|
# silence
|
||||||
|
assert segmenter.process(_ONE_SECOND, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# "speech"
|
||||||
|
assert segmenter.process(_ONE_SECOND, True)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# not enough silence to end
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# speech should reset silence detection
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# not enough silence to end
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
# exactly enough silence now
|
||||||
|
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
|
||||||
|
def test_speech_reset() -> None:
|
||||||
|
"""Test that silence resets start of voice command detection."""
|
||||||
|
|
||||||
|
segmenter = VoiceCommandSegmenter(
|
||||||
|
silence_seconds=1.0, reset_seconds=0.5, speech_seconds=1.0
|
||||||
|
)
|
||||||
|
|
||||||
|
# silence
|
||||||
|
assert segmenter.process(_ONE_SECOND, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# not enough speech to start voice command
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# silence should reset speech detection
|
||||||
|
assert segmenter.process(_ONE_SECOND, False)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# not enough speech to start voice command
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert not segmenter.in_command
|
||||||
|
|
||||||
|
# exactly enough speech now
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert segmenter.in_command
|
||||||
|
|
||||||
|
|
||||||
|
def test_timeout() -> None:
|
||||||
|
"""Test that voice command detection times out."""
|
||||||
|
|
||||||
|
segmenter = VoiceCommandSegmenter(timeout_seconds=1.0)
|
||||||
|
|
||||||
|
# not enough to time out
|
||||||
|
assert not segmenter.timed_out
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert not segmenter.timed_out
|
||||||
|
|
||||||
|
# enough to time out
|
||||||
|
assert not segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert segmenter.timed_out
|
||||||
|
|
||||||
|
# flag resets with more audio
|
||||||
|
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||||
|
assert not segmenter.timed_out
|
||||||
|
|
||||||
|
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||||
|
assert segmenter.timed_out
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue