mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 09:59:22 +08:00
t :# Please enter a commit message for your changes. Lines starting with '#' will be ignored, and an empty commit
54  backend/gradio_webrtc/__init__.py  Normal file
@@ -0,0 +1,54 @@
from .credentials import (
    get_hf_turn_credentials,
    get_turn_credentials,
    get_twilio_turn_credentials,
)
from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
from .reply_on_stopwords import ReplyOnStopWords
from .speech_to_text import stt, stt_for_chunks
from .utils import (
    AdditionalOutputs,
    Warning,
    WebRTCError,
    aggregate_bytes_to_16bit,
    async_aggregate_bytes_to_16bit,
    audio_to_bytes,
    audio_to_file,
    audio_to_float32,
)
from .webrtc import (
    AsyncAudioVideoStreamHandler,
    AsyncStreamHandler,
    AudioVideoStreamHandler,
    StreamHandler,
    WebRTC,
    VideoEmitType,
    AudioEmitType,
)

__all__ = [
    "AsyncStreamHandler",
    "AudioVideoStreamHandler",
    "AudioEmitType",
    "AsyncAudioVideoStreamHandler",
    "AlgoOptions",
    "AdditionalOutputs",
    "aggregate_bytes_to_16bit",
    "async_aggregate_bytes_to_16bit",
    "audio_to_bytes",
    "audio_to_file",
    "audio_to_float32",
    "get_hf_turn_credentials",
    "get_twilio_turn_credentials",
    "get_turn_credentials",
    "ReplyOnPause",
    "ReplyOnStopWords",
    "SileroVadOptions",
    "stt",
    "stt_for_chunks",
    "StreamHandler",
    "VideoEmitType",
    "WebRTC",
    "WebRTCError",
    "Warning",
]
52  backend/gradio_webrtc/credentials.py  Normal file
@@ -0,0 +1,52 @@
import os
from typing import Literal

import requests


def get_hf_turn_credentials(token=None):
    if token is None:
        token = os.getenv("HF_TOKEN")
    credentials = requests.get(
        "https://freddyaboulton-turn-server-login.hf.space/credentials",
        headers={"X-HF-Access-Token": token},
    )
    if credentials.status_code != 200:
        raise ValueError("Failed to get credentials from HF turn server")
    return {
        "iceServers": [
            {
                "urls": "turn:gradio-turn.com:80",
                **credentials.json(),
            },
        ]
    }


def get_twilio_turn_credentials(twilio_sid=None, twilio_token=None):
    try:
        from twilio.rest import Client
    except ImportError:
        raise ImportError("Please install twilio with `pip install twilio`")

    if not twilio_sid and not twilio_token:
        twilio_sid = os.environ.get("TWILIO_ACCOUNT_SID")
        twilio_token = os.environ.get("TWILIO_AUTH_TOKEN")

    client = Client(twilio_sid, twilio_token)

    token = client.tokens.create()

    return {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }


def get_turn_credentials(method: Literal["hf", "twilio"] = "hf", **kwargs):
    if method == "hf":
        return get_hf_turn_credentials(**kwargs)
    elif method == "twilio":
        return get_twilio_turn_credentials(**kwargs)
    else:
        raise ValueError("Invalid method. Must be 'hf' or 'twilio'")
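A minimal usage sketch for the credential helpers above. The ice-server dict they return is meant to be handed to the WebRTC component; the exact parameter name on that component (e.g. rtc_configuration) is an assumption here, since webrtc.py is suppressed in this diff.

    # Hedged sketch: pick Twilio when its env vars are set, else fall back to the HF TURN server.
    import os
    from gradio_webrtc import get_turn_credentials

    if os.getenv("TWILIO_ACCOUNT_SID") and os.getenv("TWILIO_AUTH_TOKEN"):
        rtc_config = get_turn_credentials(method="twilio")
    else:
        rtc_config = get_turn_credentials(method="hf")  # requires HF_TOKEN to be set

    print(rtc_config["iceServers"])  # list of ICE/TURN server entries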
3  backend/gradio_webrtc/pause_detection/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .vad import SileroVADModel, SileroVadOptions

__all__ = ["SileroVADModel", "SileroVadOptions"]
320  backend/gradio_webrtc/pause_detection/vad.py  Normal file
@@ -0,0 +1,320 @@
import logging
import warnings
from dataclasses import dataclass
from typing import List, Literal, overload

import numpy as np
from huggingface_hub import hf_hub_download
from numpy.typing import NDArray

from ..utils import AudioChunk

logger = logging.getLogger(__name__)

# The code below is adapted from https://github.com/snakers4/silero-vad.
# The code below is adapted from https://github.com/gpt-omni/mini-omni/blob/main/utils/vad.py


@dataclass
class SileroVadOptions:
    """VAD options.

    Attributes:
        threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
            probabilities ABOVE this value are considered as SPEECH. It is better to tune this
            parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
        min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
        max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
            than max_speech_duration_s will be split at the timestamp of the last silence that
            lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
            split aggressively just before max_speech_duration_s.
        min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
            before separating it.
        window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
            WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
            Values other than these may affect model performance!!
        speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
        speech_duration: If the length of the speech is less than this value, a pause will be detected.
    """

    threshold: float = 0.5
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    window_size_samples: int = 1024
    speech_pad_ms: int = 400


class SileroVADModel:
    @staticmethod
    def download_model() -> str:
        return hf_hub_download(
            repo_id="freddyaboulton/silero-vad", filename="silero_vad.onnx"
        )

    def __init__(self):
        try:
            import onnxruntime
        except ImportError as e:
            raise RuntimeError(
                "Applying the VAD filter requires the onnxruntime package"
            ) from e

        path = self.download_model()

        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        opts.log_severity_level = 4

        self.session = onnxruntime.InferenceSession(
            path,
            providers=["CPUExecutionProvider"],
            sess_options=opts,
        )

    def get_initial_state(self, batch_size: int):
        h = np.zeros((2, batch_size, 64), dtype=np.float32)
        c = np.zeros((2, batch_size, 64), dtype=np.float32)
        return h, c

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[AudioChunk]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate(
            [audio[chunk["start"] : chunk["end"]] for chunk in chunks]
        )

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: SileroVadOptions,
        **kwargs,
    ) -> List[AudioChunk]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = vad_options.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms

        if window_size_samples not in [512, 1024, 1536]:
            warnings.warn(
                "Unusual window_size_samples! Supported window_size_samples:\n"
                " - [512, 1024, 1536] for 16000 sampling_rate"
            )

        sampling_rate = 16000
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state = self.get_initial_state(batch_size=1)

        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            chunk = audio[
                current_start_sample : current_start_sample + window_size_samples
            ]
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state = self(chunk, state, sampling_rate)
            speech_probs.append(speech_prob)

        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"]
                > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (
                    window_size_samples * i
                ) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    @overload
    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: Literal[True],
    ) -> tuple[float, List[AudioChunk]]: ...

    @overload
    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: bool = False,
    ) -> float: ...

    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: bool = False,
    ) -> float | tuple[float, List[AudioChunk]]:
        sampling_rate, audio = audio_tuple
        logger.debug("VAD audio shape input: %s", audio.shape)
        try:
            if audio.dtype != np.float32:
                audio = audio.astype(np.float32) / 32768.0
            sr = 16000
            if sr != sampling_rate:
                try:
                    import librosa  # type: ignore
                except ImportError as e:
                    raise RuntimeError(
                        "Applying the VAD filter requires librosa if the input sampling rate is not 16000 Hz"
                    ) from e
                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)

            if not vad_parameters:
                vad_parameters = SileroVadOptions()
            speech_chunks = self.get_speech_timestamps(audio, vad_parameters)
            logger.debug("VAD speech chunks: %s", speech_chunks)
            audio = self.collect_chunks(audio, speech_chunks)
            logger.debug("VAD audio shape: %s", audio.shape)
            duration_after_vad = audio.shape[0] / sr
            if return_chunks:
                return duration_after_vad, speech_chunks
            return duration_after_vad
        except Exception as e:
            import math
            import traceback

            logger.debug("VAD Exception: %s", str(e))
            exc = traceback.format_exc()
            logger.debug("traceback %s", exc)
            return math.inf

    def __call__(self, x, state, sr: int):
        if len(x.shape) == 1:
            x = np.expand_dims(x, 0)
        if len(x.shape) > 2:
            raise ValueError(
                f"Too many dimensions for input audio chunk {len(x.shape)}"
            )
        if sr / x.shape[1] > 31.25:  # type: ignore
            raise ValueError("Input audio chunk is too short")

        h, c = state

        ort_inputs = {
            "input": x,
            "h": h,
            "c": c,
            "sr": np.array(sr, dtype="int64"),
        }

        out, h, c = self.session.run(None, ort_inputs)
        state = (h, c)

        return out, state
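A minimal sketch of driving the VAD helper above directly. It assumes onnxruntime is installed and that the silero_vad.onnx weights can be fetched from the Hugging Face Hub; the synthetic silent buffer is only illustrative.

    # Run the Silero VAD on two seconds of silence and inspect the result.
    import numpy as np
    from gradio_webrtc.pause_detection import SileroVADModel, SileroVadOptions

    model = SileroVADModel()                      # downloads silero_vad.onnx on first use
    sample_rate = 16000
    audio = np.zeros(sample_rate * 2, dtype=np.float32)  # two seconds of silence

    duration, chunks = model.vad(
        (sample_rate, audio),
        SileroVadOptions(threshold=0.5, min_silence_duration_ms=2000),
        return_chunks=True,
    )
    # For silence this should report roughly 0 seconds of speech and no chunks.
    print(duration, chunks)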
188  backend/gradio_webrtc/reply_on_pause.py  Normal file
@@ -0,0 +1,188 @@
import asyncio
import inspect
from dataclasses import dataclass
from functools import lru_cache
from logging import getLogger
from threading import Event
from typing import Any, Callable, Generator, Literal, Union, cast

import numpy as np

from gradio_webrtc.pause_detection import SileroVADModel, SileroVadOptions
from gradio_webrtc.webrtc import EmitType, StreamHandler

logger = getLogger(__name__)

counter = 0


@lru_cache
def get_vad_model() -> SileroVADModel:
    """Returns the VAD model instance."""
    return SileroVADModel()


@dataclass
class AlgoOptions:
    """Algorithm options."""

    audio_chunk_duration: float = 0.6
    started_talking_threshold: float = 0.2
    speech_threshold: float = 0.1


@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    started_talking: bool = False
    responding: bool = False
    stopped: bool = False
    buffer: np.ndarray | None = None


ReplyFnGenerator = Union[
    # For two arguments
    Callable[
        [tuple[int, np.ndarray], list[dict[Any, Any]]],
        Generator[EmitType, None, None],
    ],
    Callable[
        [tuple[int, np.ndarray]],
        Generator[EmitType, None, None],
    ],
]


async def iterate(generator: Generator) -> Any:
    return next(generator)


class ReplyOnPause(StreamHandler):
    def __init__(
        self,
        fn: ReplyFnGenerator,
        algo_options: AlgoOptions | None = None,
        model_options: SileroVadOptions | None = None,
        expected_layout: Literal["mono", "stereo"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
        input_sample_rate: int = 48000,
    ):
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=input_sample_rate,
        )
        self.expected_layout: Literal["mono", "stereo"] = expected_layout
        self.output_sample_rate = output_sample_rate
        self.output_frame_size = output_frame_size
        self.model = get_vad_model()
        self.fn = fn
        self.is_async = inspect.isasyncgenfunction(fn)
        self.event = Event()
        self.state = AppState()
        self.generator: Generator[EmitType, None, None] | None = None
        self.model_options = model_options
        self.algo_options = algo_options or AlgoOptions()

    @property
    def _needs_additional_inputs(self) -> bool:
        return len(inspect.signature(self.fn).parameters) > 1

    def copy(self):
        return ReplyOnPause(
            self.fn,
            self.algo_options,
            self.model_options,
            self.expected_layout,
            self.output_sample_rate,
            self.output_frame_size,
            self.input_sample_rate,
        )

    def determine_pause(
        self, audio: np.ndarray, sampling_rate: int, state: AppState
    ) -> bool:
        """Take in the stream and determine if a pause happened."""
        duration = len(audio) / sampling_rate

        if duration >= self.algo_options.audio_chunk_duration:
            dur_vad = self.model.vad((sampling_rate, audio), self.model_options)
            logger.debug("VAD duration: %s", dur_vad)
            if (
                dur_vad > self.algo_options.started_talking_threshold
                and not state.started_talking
            ):
                state.started_talking = True
                logger.debug("Started talking")
            if state.started_talking:
                if state.stream is None:
                    state.stream = audio
                else:
                    state.stream = np.concatenate((state.stream, audio))
                state.buffer = None
            if dur_vad < self.algo_options.speech_threshold and state.started_talking:
                return True
        return False

    def process_audio(self, audio: tuple[int, np.ndarray], state: AppState) -> None:
        frame_rate, array = audio
        array = np.squeeze(array)
        if not state.sampling_rate:
            state.sampling_rate = frame_rate
        if state.buffer is None:
            state.buffer = array
        else:
            state.buffer = np.concatenate((state.buffer, array))

        pause_detected = self.determine_pause(
            state.buffer, state.sampling_rate, self.state
        )
        state.pause_detected = pause_detected

    def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if self.state.responding:
            return
        self.process_audio(frame, self.state)
        if self.state.pause_detected:
            self.event.set()

    def reset(self):
        super().reset()
        self.generator = None
        self.event.clear()
        self.state = AppState()

    async def async_iterate(self, generator) -> EmitType:
        return await anext(generator)

    def emit(self):
        if not self.event.is_set():
            return None
        else:
            if not self.generator:
                if self._needs_additional_inputs and not self.args_set.is_set():
                    asyncio.run_coroutine_threadsafe(
                        self.wait_for_args(), self.loop
                    ).result()
                logger.debug("Creating generator")
                audio = cast(np.ndarray, self.state.stream).reshape(1, -1)
                if self._needs_additional_inputs:
                    self.latest_args[0] = (self.state.sampling_rate, audio)
                    self.generator = self.fn(*self.latest_args)
                else:
                    self.generator = self.fn((self.state.sampling_rate, audio))  # type: ignore
                logger.debug("Latest args: %s", self.latest_args)
            self.state.responding = True
            try:
                if self.is_async:
                    return asyncio.run_coroutine_threadsafe(
                        self.async_iterate(self.generator), self.loop
                    ).result()
                else:
                    return next(self.generator)
            except (StopIteration, StopAsyncIteration):
                self.reset()
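A minimal sketch of a reply generator wrapped in ReplyOnPause. Once a pause is detected the handler calls the generator with the full (sample_rate, audio) buffer and streams whatever it yields back to the client. Wiring the handler to the WebRTC component is assumed to happen through that component's streaming API, which is suppressed in this diff.

    # Hedged sketch: echo the caller's audio back in 480-sample frames.
    import numpy as np
    from gradio_webrtc import AlgoOptions, ReplyOnPause

    def response(audio: tuple[int, np.ndarray]):
        sample_rate, array = audio          # array arrives shaped (1, n_samples)
        for start in range(0, array.shape[-1], 480):
            yield (sample_rate, array[:, start:start + 480])

    handler = ReplyOnPause(
        response,
        algo_options=AlgoOptions(audio_chunk_duration=0.6, speech_threshold=0.1),
    )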
147  backend/gradio_webrtc/reply_on_stopwords.py  Normal file
@@ -0,0 +1,147 @@
import asyncio
import logging
import re
from typing import Literal

import numpy as np

from .reply_on_pause import (
    AlgoOptions,
    AppState,
    ReplyFnGenerator,
    ReplyOnPause,
    SileroVadOptions,
)
from .speech_to_text import get_stt_model, stt_for_chunks
from .utils import audio_to_float32

logger = logging.getLogger(__name__)


class ReplyOnStopWordsState(AppState):
    stop_word_detected: bool = False
    post_stop_word_buffer: np.ndarray | None = None
    started_talking_pre_stop_word: bool = False


class ReplyOnStopWords(ReplyOnPause):
    def __init__(
        self,
        fn: ReplyFnGenerator,
        stop_words: list[str],
        algo_options: AlgoOptions | None = None,
        model_options: SileroVadOptions | None = None,
        expected_layout: Literal["mono", "stereo"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
        input_sample_rate: int = 48000,
    ):
        super().__init__(
            fn,
            algo_options=algo_options,
            model_options=model_options,
            expected_layout=expected_layout,
            output_sample_rate=output_sample_rate,
            output_frame_size=output_frame_size,
            input_sample_rate=input_sample_rate,
        )
        self.stop_words = stop_words
        self.state = ReplyOnStopWordsState()
        # Download Model
        get_stt_model()

    def stop_word_detected(self, text: str) -> bool:
        for stop_word in self.stop_words:
            stop_word = stop_word.lower().strip().split(" ")
            if bool(
                re.search(r"\b" + r"\s+".join(map(re.escape, stop_word)) + r"\b", text)
            ):
                logger.debug("Stop word detected: %s", stop_word)
                return True
        return False

    async def _send_stopword(
        self,
    ):
        if self.channel:
            self.channel.send("stopword")
            logger.debug("Sent stopword")

    def send_stopword(self):
        asyncio.run_coroutine_threadsafe(self._send_stopword(), self.loop)

    def determine_pause(  # type: ignore
        self, audio: np.ndarray, sampling_rate: int, state: ReplyOnStopWordsState
    ) -> bool:
        """Take in the stream and determine if a pause happened."""
        import librosa

        duration = len(audio) / sampling_rate

        if duration >= self.algo_options.audio_chunk_duration:
            if not state.stop_word_detected:
                audio_f32 = audio_to_float32((sampling_rate, audio))
                audio_rs = librosa.resample(
                    audio_f32, orig_sr=sampling_rate, target_sr=16000
                )
                if state.post_stop_word_buffer is None:
                    state.post_stop_word_buffer = audio_rs
                else:
                    state.post_stop_word_buffer = np.concatenate(
                        (state.post_stop_word_buffer, audio_rs)
                    )
                if len(state.post_stop_word_buffer) / 16000 > 2:
                    state.post_stop_word_buffer = state.post_stop_word_buffer[-32000:]
                dur_vad, chunks = self.model.vad(
                    (16000, state.post_stop_word_buffer),
                    self.model_options,
                    return_chunks=True,
                )
                text = stt_for_chunks((16000, state.post_stop_word_buffer), chunks)
                logger.debug(f"STT: {text}")
                state.stop_word_detected = self.stop_word_detected(text)
                if state.stop_word_detected:
                    logger.debug("Stop word detected")
                    self.send_stopword()
                state.buffer = None
            else:
                dur_vad = self.model.vad((sampling_rate, audio), self.model_options)
                logger.debug("VAD duration: %s", dur_vad)
                if (
                    dur_vad > self.algo_options.started_talking_threshold
                    and not state.started_talking
                    and state.stop_word_detected
                ):
                    state.started_talking = True
                    logger.debug("Started talking")
                if state.started_talking:
                    if state.stream is None:
                        state.stream = audio
                    else:
                        state.stream = np.concatenate((state.stream, audio))
                    state.buffer = None
                if (
                    dur_vad < self.algo_options.speech_threshold
                    and state.started_talking
                    and state.stop_word_detected
                ):
                    return True
        return False

    def reset(self):
        super().reset()
        self.generator = None
        self.event.clear()
        self.state = ReplyOnStopWordsState()

    def copy(self):
        return ReplyOnStopWords(
            self.fn,
            self.stop_words,
            self.algo_options,
            self.model_options,
            self.expected_layout,
            self.output_sample_rate,
            self.output_frame_size,
            self.input_sample_rate,
        )
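A sketch of the stop-word variant, reusing the `response` generator from the ReplyOnPause sketch above: the handler transcribes incoming audio until one of the stop words is heard, then falls back to the pause-detection behaviour. It assumes torch, librosa, and the silero STT model used by speech_to_text are available.

    from gradio_webrtc import ReplyOnStopWords

    handler = ReplyOnStopWords(
        response,                    # same generator signature as ReplyOnPause
        stop_words=["computer"],     # phrase(s) that arm the handler
    )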
3  backend/gradio_webrtc/speech_to_text/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .stt_ import get_stt_model, stt, stt_for_chunks

__all__ = ["stt", "stt_for_chunks", "get_stt_model"]
53  backend/gradio_webrtc/speech_to_text/stt_.py  Normal file
@@ -0,0 +1,53 @@
from dataclasses import dataclass
from functools import lru_cache
from typing import Callable

import numpy as np
from numpy.typing import NDArray

from ..utils import AudioChunk


@dataclass
class STTModel:
    encoder: Callable
    decoder: Callable


@lru_cache
def get_stt_model() -> STTModel:
    from silero import silero_stt

    model, decoder, _ = silero_stt(language="en", version="v6", jit_model="jit_xlarge")
    return STTModel(model, decoder)


def stt(audio: tuple[int, NDArray[np.int16]]) -> str:
    model = get_stt_model()
    sr, audio_np = audio
    if audio_np.dtype != np.float32:
        print("converting")
        audio_np = audio_np.astype(np.float32) / 32768.0
    try:
        import torch
    except ImportError:
        raise ImportError(
            "PyTorch is required to run speech-to-text for stopword detection. Run `pip install torch`."
        )
    audio_torch = torch.tensor(audio_np, dtype=torch.float32)
    if audio_torch.ndim == 1:
        audio_torch = audio_torch.unsqueeze(0)
    assert audio_torch.ndim == 2, "Audio must have a batch dimension"
    print("before")
    res = model.decoder(model.encoder(audio_torch)[0])
    print("after")
    return res


def stt_for_chunks(
    audio: tuple[int, NDArray[np.int16]], chunks: list[AudioChunk]
) -> str:
    sr, audio_np = audio
    return " ".join(
        [stt((sr, audio_np[chunk["start"] : chunk["end"]])) for chunk in chunks]
    )
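A minimal sketch of calling the silero STT helper directly on an int16 buffer. It assumes torch and the `silero` package are installed; the model weights are fetched on first use by get_stt_model, and a silent input would likely decode to an empty string.

    import numpy as np
    from gradio_webrtc import stt

    sample_rate = 16000
    audio = np.zeros(sample_rate, dtype=np.int16)  # one second of silence
    text = stt((sample_rate, audio))               # stt converts int16 -> float32 internally
    print(repr(text))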
313  backend/gradio_webrtc/utils.py  Normal file
@@ -0,0 +1,313 @@
import asyncio
import fractions
import io
import json
import logging
import tempfile
from contextvars import ContextVar
from typing import Any, Callable, Protocol, TypedDict, cast

import av
import numpy as np
from pydub import AudioSegment

logger = logging.getLogger(__name__)


AUDIO_PTIME = 0.020


class AudioChunk(TypedDict):
    start: int
    end: int


class AdditionalOutputs:
    def __init__(self, *args) -> None:
        self.args = args


class DataChannel(Protocol):
    def send(self, message: str) -> None: ...


current_channel: ContextVar[DataChannel | None] = ContextVar(
    "current_channel", default=None
)


def _send_log(message: str, type: str) -> None:
    async def _send(channel: DataChannel) -> None:
        channel.send(
            json.dumps(
                {
                    "type": type,
                    "message": message,
                }
            )
        )

    if channel := current_channel.get():
        print("channel", channel)
        try:
            loop = asyncio.get_running_loop()
            asyncio.run_coroutine_threadsafe(_send(channel), loop)
        except RuntimeError:
            asyncio.run(_send(channel))


def Warning(  # noqa: N802
    message: str = "Warning issued.",
):
    """
    Send a warning message that is displayed in the UI of the application.

    Parameters
    ----------
    message : str
        The warning message to send

    Returns
    -------
    None
    """
    _send_log(message, "warning")


class WebRTCError(Exception):
    def __init__(self, message: str) -> None:
        super().__init__(message)
        _send_log(message, "error")


def split_output(data: tuple | Any) -> tuple[Any, AdditionalOutputs | None]:
    if isinstance(data, AdditionalOutputs):
        return None, data
    if isinstance(data, tuple):
        # handle the bare audio case
        if 2 <= len(data) <= 3 and isinstance(data[1], np.ndarray):
            return data, None
        if not len(data) == 2:
            raise ValueError(
                "The tuple must have exactly two elements: the data and an instance of AdditionalOutputs."
            )
        if not isinstance(data[-1], AdditionalOutputs):
            raise ValueError(
                "The last element of the tuple must be an instance of AdditionalOutputs."
            )
        return data[0], cast(AdditionalOutputs, data[1])
    return data, None


async def player_worker_decode(
    next_frame: Callable,
    queue: asyncio.Queue,
    thread_quit: asyncio.Event,
    channel: Callable[[], DataChannel | None] | None,
    set_additional_outputs: Callable | None,
    quit_on_none: bool = False,
    sample_rate: int = 48000,
    frame_size: int = int(48000 * AUDIO_PTIME),
):
    audio_samples = 0
    audio_time_base = fractions.Fraction(1, sample_rate)
    audio_resampler = av.AudioResampler(  # type: ignore
        format="s16",
        layout="stereo",
        rate=sample_rate,
        frame_size=frame_size,
    )

    while not thread_quit.is_set():
        try:
            # Get next frame
            frame, outputs = split_output(
                await asyncio.wait_for(next_frame(), timeout=60)
            )
            if (
                isinstance(outputs, AdditionalOutputs)
                and set_additional_outputs
                and channel
                and channel()
            ):
                set_additional_outputs(outputs)
                cast(DataChannel, channel()).send("change")

            if frame is None:
                if quit_on_none:
                    await queue.put(None)
                    break
                continue

            if len(frame) == 2:
                sample_rate, audio_array = frame
                layout = "mono"
            elif len(frame) == 3:
                sample_rate, audio_array, layout = frame

            logger.debug(
                "received array with shape %s sample rate %s layout %s",
                audio_array.shape,  # type: ignore
                sample_rate,
                layout,  # type: ignore
            )
            format = "s16" if audio_array.dtype == "int16" else "fltp"  # type: ignore

            # Convert to audio frame and resample
            # This runs in the same timeout context
            frame = av.AudioFrame.from_ndarray(  # type: ignore
                audio_array,  # type: ignore
                format=format,
                layout=layout,  # type: ignore
            )
            frame.sample_rate = sample_rate

            for processed_frame in audio_resampler.resample(frame):
                processed_frame.pts = audio_samples
                processed_frame.time_base = audio_time_base
                audio_samples += processed_frame.samples
                await queue.put(processed_frame)
            logger.debug("Queue size utils.py: %s", queue.qsize())

        except (TimeoutError, asyncio.TimeoutError):
            logger.warning(
                "Timeout in frame processing cycle after %s seconds - resetting", 60
            )
            continue
        except Exception as e:
            import traceback

            exc = traceback.format_exc()
            logger.debug("traceback %s", exc)
            logger.error("Error processing frame: %s", str(e))
            continue


def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
    """
    Convert an audio tuple containing sample rate and numpy array data into bytes.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    bytes
        The audio data encoded as bytes, suitable for transmission or storage

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_bytes = audio_to_bytes(audio_tuple)
    """
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=1,
    )
    segment.export(audio_buffer, format="mp3")
    return audio_buffer.getvalue()


def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
    """
    Save an audio tuple containing sample rate and numpy array data to a file.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    str
        The path to the saved audio file

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> file_path = audio_to_file(audio_tuple)
    >>> print(f"Audio saved to: {file_path}")
    """
    bytes_ = audio_to_bytes(audio)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(bytes_)
    return f.name


def audio_to_float32(audio: tuple[int, np.ndarray]) -> np.ndarray:
    """
    Convert an audio tuple containing a sample rate and int16 numpy array data to float32.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    np.ndarray
        The audio data as a numpy array with dtype float32

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_float32 = audio_to_float32(audio_tuple)
    """
    return audio[1].astype(np.float32) / 32768.0


def aggregate_bytes_to_16bit(chunks_iterator):
    leftover = b""  # Store incomplete bytes between chunks

    for chunk in chunks_iterator:
        # Combine with any leftover bytes from previous chunk
        current_bytes = leftover + chunk

        # Calculate complete samples
        n_complete_samples = len(current_bytes) // 2  # int16 = 2 bytes
        bytes_to_process = n_complete_samples * 2

        # Split into complete samples and leftover
        to_process = current_bytes[:bytes_to_process]
        leftover = current_bytes[bytes_to_process:]

        if to_process:  # Only yield if we have complete samples
            audio_array = np.frombuffer(to_process, dtype=np.int16).reshape(1, -1)
            yield audio_array


async def async_aggregate_bytes_to_16bit(chunks_iterator):
    leftover = b""  # Store incomplete bytes between chunks

    async for chunk in chunks_iterator:
        # Combine with any leftover bytes from previous chunk
        current_bytes = leftover + chunk

        # Calculate complete samples
        n_complete_samples = len(current_bytes) // 2  # int16 = 2 bytes
        bytes_to_process = n_complete_samples * 2

        # Split into complete samples and leftover
        to_process = current_bytes[:bytes_to_process]
        leftover = current_bytes[bytes_to_process:]

        if to_process:  # Only yield if we have complete samples
            audio_array = np.frombuffer(to_process, dtype=np.int16).reshape(1, -1)
            yield audio_array
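A small self-contained sketch of aggregate_bytes_to_16bit from utils.py: it stitches an iterator of raw PCM byte chunks, possibly split mid-sample, into (1, N) int16 arrays without dropping samples.

    import numpy as np
    from gradio_webrtc import aggregate_bytes_to_16bit

    pcm = np.arange(10, dtype=np.int16).tobytes()     # 20 bytes of fake PCM
    chunks = [pcm[:7], pcm[7:15], pcm[15:]]           # odd-sized chunks split mid-sample
    frames = list(aggregate_bytes_to_16bit(iter(chunks)))
    assert np.concatenate(frames, axis=1).size == 10  # all 10 samples recovered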
1151  backend/gradio_webrtc/webrtc.py  Normal file
File diff suppressed because it is too large