mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
Chunk speech after max_continuous_speech_s seconds if no pause is detected by the VAD (#328)
* chunk speech after max_continuous_speech_s seconds if no pause detected by VAD
* add attr descriptions in AlgoOptions
* Fix

---------

Co-authored-by: Freddy Boulton <41651716+freddyaboulton@users.noreply.github.com>
This commit is contained in:
@@ -18,11 +18,20 @@ logger = getLogger(__name__)
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class AlgoOptions:
|
class AlgoOptions:
|
||||||
"""Algorithm options."""
|
"""
|
||||||
|
Algorithm options.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
- audio_chunk_duration: Duration in seconds of audio chunks passed to the VAD model.
|
||||||
|
- started_talking_threshold: If the chunk has more than started_talking_threshold seconds of speech, the user started talking.
|
||||||
|
- speech_threshold: If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking.
|
||||||
|
- max_continuous_speech_s: Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model.
|
||||||
|
"""
|
||||||
|
|
||||||
audio_chunk_duration: float = 0.6
|
audio_chunk_duration: float = 0.6
|
||||||
started_talking_threshold: float = 0.2
|
started_talking_threshold: float = 0.2
|
||||||
speech_threshold: float = 0.1
|
speech_threshold: float = 0.1
|
||||||
|
max_continuous_speech_s: float = float("inf")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -216,7 +225,14 @@ class ReplyOnPause(StreamHandler):
|
|||||||
state.stream = audio
|
state.stream = audio
|
||||||
else:
|
else:
|
||||||
state.stream = np.concatenate((state.stream, audio))
|
state.stream = np.concatenate((state.stream, audio))
|
||||||
|
|
||||||
|
# Check if continuous speech limit has been reached
|
||||||
|
current_duration = len(state.stream) / sampling_rate
|
||||||
|
if current_duration >= self.algo_options.max_continuous_speech_s:
|
||||||
|
return True
|
||||||
state.buffer = None
|
state.buffer = None
|
||||||
|
|
||||||
|
# Check if a pause has been detected by the VAD model
|
||||||
if dur_vad < self.algo_options.speech_threshold and state.started_talking:
|
if dur_vad < self.algo_options.speech_threshold and state.started_talking:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user