chunk speech after s if no pause detected by VAD (#328)
* chunk speech after s if no pause detected by VAD
* add attr descriptions in AlgoOptions
* Fix

---------

Co-authored-by: Freddy Boulton <41651716+freddyaboulton@users.noreply.github.com>
@@ -18,11 +18,20 @@ logger = getLogger(__name__)
 
 @dataclass
 class AlgoOptions:
-    """Algorithm options."""
+    """
+    Algorithm options.
+
+    Attributes:
+    - audio_chunk_duration: Duration in seconds of audio chunks passed to the VAD model.
+    - started_talking_threshold: If the chunk has more than started_talking_threshold seconds of speech, the user started talking.
+    - speech_threshold: If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking.
+    - max_continuous_speech_s: Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model.
+    """
 
     audio_chunk_duration: float = 0.6
     started_talking_threshold: float = 0.2
     speech_threshold: float = 0.1
+    max_continuous_speech_s: float = float("inf")
 
 
 @dataclass
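
For reference, a minimal usage sketch of the new field. This assumes the package's public AlgoOptions and ReplyOnPause exports and a ReplyOnPause constructor that accepts an algo_options argument; the 15-second cap and the response function are illustrative, not part of this commit:

    from gradio_webrtc import AlgoOptions, ReplyOnPause

    def response(audio):
        # Placeholder handler; process the accumulated speech here.
        ...

    options = AlgoOptions(
        audio_chunk_duration=0.6,       # seconds of audio fed to the VAD per chunk
        started_talking_threshold=0.2,  # speech seconds needed to mark "started talking"
        speech_threshold=0.1,           # chunks below this count as a pause
        max_continuous_speech_s=15.0,   # fire the handler after 15 s of speech, pause or not
    )
    handler = ReplyOnPause(response, algo_options=options)

By default max_continuous_speech_s is float("inf"), so existing users keep the pause-only behavior unless they opt in.
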
@@ -216,7 +225,14 @@ class ReplyOnPause(StreamHandler):
                 state.stream = audio
             else:
                 state.stream = np.concatenate((state.stream, audio))
+
+            # Check if continuous speech limit has been reached
+            current_duration = len(state.stream) / sampling_rate
+            if current_duration >= self.algo_options.max_continuous_speech_s:
+                return True
         state.buffer = None
+
+        # Check if a pause has been detected by the VAD model
         if dur_vad < self.algo_options.speech_threshold and state.started_talking:
             return True
         return False
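
The hunk above reduces to two exit conditions. A self-contained sketch of the same arithmetic, where stream, sampling_rate, and dur_vad mirror the names in the diff (dur_vad stands in for the VAD model's measured speech duration, which is computed elsewhere in the handler):

    import numpy as np

    def should_trigger(stream, sampling_rate, dur_vad, started_talking,
                       speech_threshold=0.1, max_continuous_speech_s=15.0):
        # Condition 1: the accumulated audio exceeds the continuous-speech cap.
        current_duration = len(stream) / sampling_rate
        if current_duration >= max_continuous_speech_s:
            return True
        # Condition 2: the VAD saw a near-silent chunk after speech began.
        if dur_vad < speech_threshold and started_talking:
            return True
        return False

    # 16 s of accumulated 16 kHz audio trips the 15 s cap even mid-speech.
    stream = np.zeros(16 * 16000, dtype=np.float32)
    assert should_trigger(stream, 16000, dur_vad=0.5, started_talking=True)
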