mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 09:59:22 +08:00
t :# Please enter a commit message for your changes. Lines starting with '#' will be ignored, and an empty commit
54  backend/gradio_webrtc/__init__.py  Normal file
@@ -0,0 +1,54 @@
from .credentials import (
    get_hf_turn_credentials,
    get_turn_credentials,
    get_twilio_turn_credentials,
)
from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
from .reply_on_stopwords import ReplyOnStopWords
from .speech_to_text import stt, stt_for_chunks
from .utils import (
    AdditionalOutputs,
    Warning,
    WebRTCError,
    aggregate_bytes_to_16bit,
    async_aggregate_bytes_to_16bit,
    audio_to_bytes,
    audio_to_file,
    audio_to_float32,
)
from .webrtc import (
    AsyncAudioVideoStreamHandler,
    AsyncStreamHandler,
    AudioVideoStreamHandler,
    StreamHandler,
    WebRTC,
    VideoEmitType,
    AudioEmitType,
)

__all__ = [
    "AsyncStreamHandler",
    "AudioVideoStreamHandler",
    "AudioEmitType",
    "AsyncAudioVideoStreamHandler",
    "AlgoOptions",
    "AdditionalOutputs",
    "aggregate_bytes_to_16bit",
    "async_aggregate_bytes_to_16bit",
    "audio_to_bytes",
    "audio_to_file",
    "audio_to_float32",
    "get_hf_turn_credentials",
    "get_twilio_turn_credentials",
    "get_turn_credentials",
    "ReplyOnPause",
    "ReplyOnStopWords",
    "SileroVadOptions",
    "stt",
    "stt_for_chunks",
    "StreamHandler",
    "VideoEmitType",
    "WebRTC",
    "WebRTCError",
    "Warning",
]
52  backend/gradio_webrtc/credentials.py  Normal file
@@ -0,0 +1,52 @@
import os
from typing import Literal

import requests


def get_hf_turn_credentials(token=None):
    if token is None:
        token = os.getenv("HF_TOKEN")
    credentials = requests.get(
        "https://freddyaboulton-turn-server-login.hf.space/credentials",
        headers={"X-HF-Access-Token": token},
    )
    if credentials.status_code != 200:
        raise ValueError("Failed to get credentials from HF turn server")
    return {
        "iceServers": [
            {
                "urls": "turn:gradio-turn.com:80",
                **credentials.json(),
            },
        ]
    }


def get_twilio_turn_credentials(twilio_sid=None, twilio_token=None):
    try:
        from twilio.rest import Client
    except ImportError:
        raise ImportError("Please install twilio with `pip install twilio`")

    if not twilio_sid and not twilio_token:
        twilio_sid = os.environ.get("TWILIO_ACCOUNT_SID")
        twilio_token = os.environ.get("TWILIO_AUTH_TOKEN")

    client = Client(twilio_sid, twilio_token)

    token = client.tokens.create()

    return {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }


def get_turn_credentials(method: Literal["hf", "twilio"] = "hf", **kwargs):
    if method == "hf":
        return get_hf_turn_credentials(**kwargs)
    elif method == "twilio":
        return get_twilio_turn_credentials(**kwargs)
    else:
        raise ValueError("Invalid method. Must be 'hf' or 'twilio'")
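A minimal usage sketch for the credential helpers above. The ice-server dict they return is meant to be handed to the WebRTC component; the exact parameter name on that component (e.g. rtc_configuration) is an assumption here, since webrtc.py is suppressed in this diff.

    # Hedged sketch: pick Twilio when its env vars are set, else fall back to the HF TURN server.
    import os
    from gradio_webrtc import get_turn_credentials

    if os.getenv("TWILIO_ACCOUNT_SID") and os.getenv("TWILIO_AUTH_TOKEN"):
        rtc_config = get_turn_credentials(method="twilio")
    else:
        rtc_config = get_turn_credentials(method="hf")  # requires HF_TOKEN to be set

    print(rtc_config["iceServers"])  # list of ICE/TURN server entries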
3  backend/gradio_webrtc/pause_detection/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .vad import SileroVADModel, SileroVadOptions

__all__ = ["SileroVADModel", "SileroVadOptions"]
320  backend/gradio_webrtc/pause_detection/vad.py  Normal file
@@ -0,0 +1,320 @@
import logging
import warnings
from dataclasses import dataclass
from typing import List, Literal, overload

import numpy as np
from huggingface_hub import hf_hub_download
from numpy.typing import NDArray

from ..utils import AudioChunk

logger = logging.getLogger(__name__)

# The code below is adapted from https://github.com/snakers4/silero-vad.
# The code below is adapted from https://github.com/gpt-omni/mini-omni/blob/main/utils/vad.py


@dataclass
class SileroVadOptions:
    """VAD options.

    Attributes:
        threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
            probabilities ABOVE this value are considered as SPEECH. It is better to tune this
            parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
        min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
        max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
            than max_speech_duration_s will be split at the timestamp of the last silence that
            lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
            split aggressively just before max_speech_duration_s.
        min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
            before separating it.
        window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
            WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
            Values other than these may affect model performance!!
        speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
        speech_duration: If the length of the speech is less than this value, a pause will be detected.
    """

    threshold: float = 0.5
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    window_size_samples: int = 1024
    speech_pad_ms: int = 400


class SileroVADModel:
    @staticmethod
    def download_model() -> str:
        return hf_hub_download(
            repo_id="freddyaboulton/silero-vad", filename="silero_vad.onnx"
        )

    def __init__(self):
        try:
            import onnxruntime
        except ImportError as e:
            raise RuntimeError(
                "Applying the VAD filter requires the onnxruntime package"
            ) from e

        path = self.download_model()

        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        opts.log_severity_level = 4

        self.session = onnxruntime.InferenceSession(
            path,
            providers=["CPUExecutionProvider"],
            sess_options=opts,
        )

    def get_initial_state(self, batch_size: int):
        h = np.zeros((2, batch_size, 64), dtype=np.float32)
        c = np.zeros((2, batch_size, 64), dtype=np.float32)
        return h, c

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[AudioChunk]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate(
            [audio[chunk["start"] : chunk["end"]] for chunk in chunks]
        )

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: SileroVadOptions,
        **kwargs,
    ) -> List[AudioChunk]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = vad_options.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms

        if window_size_samples not in [512, 1024, 1536]:
            warnings.warn(
                "Unusual window_size_samples! Supported window_size_samples:\n"
                " - [512, 1024, 1536] for 16000 sampling_rate"
            )

        sampling_rate = 16000
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state = self.get_initial_state(batch_size=1)

        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            chunk = audio[
                current_start_sample : current_start_sample + window_size_samples
            ]
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state = self(chunk, state, sampling_rate)
            speech_probs.append(speech_prob)

        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"]
                > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (
                    window_size_samples * i
                ) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    @overload
    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: Literal[True],
    ) -> tuple[float, List[AudioChunk]]: ...

    @overload
    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: bool = False,
    ) -> float: ...

    def vad(
        self,
        audio_tuple: tuple[int, NDArray],
        vad_parameters: None | SileroVadOptions,
        return_chunks: bool = False,
    ) -> float | tuple[float, List[AudioChunk]]:
        sampling_rate, audio = audio_tuple
        logger.debug("VAD audio shape input: %s", audio.shape)
        try:
            if audio.dtype != np.float32:
                audio = audio.astype(np.float32) / 32768.0
            sr = 16000
            if sr != sampling_rate:
                try:
                    import librosa  # type: ignore
                except ImportError as e:
                    raise RuntimeError(
                        "Applying the VAD filter requires librosa if the input sampling rate is not 16000 Hz"
                    ) from e
                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)

            if not vad_parameters:
                vad_parameters = SileroVadOptions()
            speech_chunks = self.get_speech_timestamps(audio, vad_parameters)
            logger.debug("VAD speech chunks: %s", speech_chunks)
            audio = self.collect_chunks(audio, speech_chunks)
            logger.debug("VAD audio shape: %s", audio.shape)
            duration_after_vad = audio.shape[0] / sr
            if return_chunks:
                return duration_after_vad, speech_chunks
            return duration_after_vad
        except Exception as e:
            import math
            import traceback

            logger.debug("VAD Exception: %s", str(e))
            exc = traceback.format_exc()
            logger.debug("traceback %s", exc)
            return math.inf

    def __call__(self, x, state, sr: int):
        if len(x.shape) == 1:
            x = np.expand_dims(x, 0)
        if len(x.shape) > 2:
            raise ValueError(
                f"Too many dimensions for input audio chunk {len(x.shape)}"
            )
        if sr / x.shape[1] > 31.25:  # type: ignore
            raise ValueError("Input audio chunk is too short")

        h, c = state

        ort_inputs = {
            "input": x,
            "h": h,
            "c": c,
            "sr": np.array(sr, dtype="int64"),
        }

        out, h, c = self.session.run(None, ort_inputs)
        state = (h, c)

        return out, state
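A minimal sketch of driving the VAD helper above directly. It assumes onnxruntime is installed and that the silero_vad.onnx weights can be fetched from the Hugging Face Hub; the synthetic silent buffer is only illustrative.

    # Run the Silero VAD on two seconds of silence and inspect the result.
    import numpy as np
    from gradio_webrtc.pause_detection import SileroVADModel, SileroVadOptions

    model = SileroVADModel()                      # downloads silero_vad.onnx on first use
    sample_rate = 16000
    audio = np.zeros(sample_rate * 2, dtype=np.float32)  # two seconds of silence

    duration, chunks = model.vad(
        (sample_rate, audio),
        SileroVadOptions(threshold=0.5, min_silence_duration_ms=2000),
        return_chunks=True,
    )
    # For silence this should report roughly 0 seconds of speech and no chunks.
    print(duration, chunks)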
188  backend/gradio_webrtc/reply_on_pause.py  Normal file
@@ -0,0 +1,188 @@
import asyncio
import inspect
from dataclasses import dataclass
from functools import lru_cache
from logging import getLogger
from threading import Event
from typing import Any, Callable, Generator, Literal, Union, cast

import numpy as np

from gradio_webrtc.pause_detection import SileroVADModel, SileroVadOptions
from gradio_webrtc.webrtc import EmitType, StreamHandler

logger = getLogger(__name__)

counter = 0


@lru_cache
def get_vad_model() -> SileroVADModel:
    """Returns the VAD model instance."""
    return SileroVADModel()


@dataclass
class AlgoOptions:
    """Algorithm options."""

    audio_chunk_duration: float = 0.6
    started_talking_threshold: float = 0.2
    speech_threshold: float = 0.1


@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    started_talking: bool = False
    responding: bool = False
    stopped: bool = False
    buffer: np.ndarray | None = None


ReplyFnGenerator = Union[
    # For two arguments
    Callable[
        [tuple[int, np.ndarray], list[dict[Any, Any]]],
        Generator[EmitType, None, None],
    ],
    Callable[
        [tuple[int, np.ndarray]],
        Generator[EmitType, None, None],
    ],
]


async def iterate(generator: Generator) -> Any:
    return next(generator)


class ReplyOnPause(StreamHandler):
    def __init__(
        self,
        fn: ReplyFnGenerator,
        algo_options: AlgoOptions | None = None,
        model_options: SileroVadOptions | None = None,
        expected_layout: Literal["mono", "stereo"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
        input_sample_rate: int = 48000,
    ):
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=input_sample_rate,
        )
        self.expected_layout: Literal["mono", "stereo"] = expected_layout
        self.output_sample_rate = output_sample_rate
        self.output_frame_size = output_frame_size
        self.model = get_vad_model()
        self.fn = fn
        self.is_async = inspect.isasyncgenfunction(fn)
        self.event = Event()
        self.state = AppState()
        self.generator: Generator[EmitType, None, None] | None = None
        self.model_options = model_options
        self.algo_options = algo_options or AlgoOptions()

    @property
    def _needs_additional_inputs(self) -> bool:
        return len(inspect.signature(self.fn).parameters) > 1

    def copy(self):
        return ReplyOnPause(
            self.fn,
            self.algo_options,
            self.model_options,
            self.expected_layout,
            self.output_sample_rate,
            self.output_frame_size,
            self.input_sample_rate,
        )

    def determine_pause(
        self, audio: np.ndarray, sampling_rate: int, state: AppState
    ) -> bool:
        """Take in the stream and determine if a pause happened."""
        duration = len(audio) / sampling_rate

        if duration >= self.algo_options.audio_chunk_duration:
            dur_vad = self.model.vad((sampling_rate, audio), self.model_options)
            logger.debug("VAD duration: %s", dur_vad)
            if (
                dur_vad > self.algo_options.started_talking_threshold
                and not state.started_talking
            ):
                state.started_talking = True
                logger.debug("Started talking")
            if state.started_talking:
                if state.stream is None:
                    state.stream = audio
                else:
                    state.stream = np.concatenate((state.stream, audio))
                state.buffer = None
            if dur_vad < self.algo_options.speech_threshold and state.started_talking:
                return True
        return False

    def process_audio(self, audio: tuple[int, np.ndarray], state: AppState) -> None:
        frame_rate, array = audio
        array = np.squeeze(array)
        if not state.sampling_rate:
            state.sampling_rate = frame_rate
        if state.buffer is None:
            state.buffer = array
        else:
            state.buffer = np.concatenate((state.buffer, array))

        pause_detected = self.determine_pause(
            state.buffer, state.sampling_rate, self.state
        )
        state.pause_detected = pause_detected

    def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if self.state.responding:
            return
        self.process_audio(frame, self.state)
        if self.state.pause_detected:
            self.event.set()

    def reset(self):
        super().reset()
        self.generator = None
        self.event.clear()
        self.state = AppState()

    async def async_iterate(self, generator) -> EmitType:
        return await anext(generator)

    def emit(self):
        if not self.event.is_set():
            return None
        else:
            if not self.generator:
                if self._needs_additional_inputs and not self.args_set.is_set():
                    asyncio.run_coroutine_threadsafe(
                        self.wait_for_args(), self.loop
                    ).result()
                logger.debug("Creating generator")
                audio = cast(np.ndarray, self.state.stream).reshape(1, -1)
                if self._needs_additional_inputs:
                    self.latest_args[0] = (self.state.sampling_rate, audio)
                    self.generator = self.fn(*self.latest_args)
                else:
                    self.generator = self.fn((self.state.sampling_rate, audio))  # type: ignore
                logger.debug("Latest args: %s", self.latest_args)
            self.state.responding = True
            try:
                if self.is_async:
                    return asyncio.run_coroutine_threadsafe(
                        self.async_iterate(self.generator), self.loop
                    ).result()
                else:
                    return next(self.generator)
            except (StopIteration, StopAsyncIteration):
                self.reset()
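A minimal sketch of a reply generator wrapped in ReplyOnPause. Once a pause is detected the handler calls the generator with the full (sample_rate, audio) buffer and streams whatever it yields back to the client. Wiring the handler to the WebRTC component is assumed to happen through that component's streaming API, which is suppressed in this diff.

    # Hedged sketch: echo the caller's audio back in 480-sample frames.
    import numpy as np
    from gradio_webrtc import AlgoOptions, ReplyOnPause

    def response(audio: tuple[int, np.ndarray]):
        sample_rate, array = audio          # array arrives shaped (1, n_samples)
        for start in range(0, array.shape[-1], 480):
            yield (sample_rate, array[:, start:start + 480])

    handler = ReplyOnPause(
        response,
        algo_options=AlgoOptions(audio_chunk_duration=0.6, speech_threshold=0.1),
    )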
147  backend/gradio_webrtc/reply_on_stopwords.py  Normal file
@@ -0,0 +1,147 @@
import asyncio
import logging
import re
from typing import Literal

import numpy as np

from .reply_on_pause import (
    AlgoOptions,
    AppState,
    ReplyFnGenerator,
    ReplyOnPause,
    SileroVadOptions,
)
from .speech_to_text import get_stt_model, stt_for_chunks
from .utils import audio_to_float32

logger = logging.getLogger(__name__)


class ReplyOnStopWordsState(AppState):
    stop_word_detected: bool = False
    post_stop_word_buffer: np.ndarray | None = None
    started_talking_pre_stop_word: bool = False


class ReplyOnStopWords(ReplyOnPause):
    def __init__(
        self,
        fn: ReplyFnGenerator,
        stop_words: list[str],
        algo_options: AlgoOptions | None = None,
        model_options: SileroVadOptions | None = None,
        expected_layout: Literal["mono", "stereo"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
        input_sample_rate: int = 48000,
    ):
        super().__init__(
            fn,
            algo_options=algo_options,
            model_options=model_options,
            expected_layout=expected_layout,
            output_sample_rate=output_sample_rate,
            output_frame_size=output_frame_size,
            input_sample_rate=input_sample_rate,
        )
        self.stop_words = stop_words
        self.state = ReplyOnStopWordsState()
        # Download Model
        get_stt_model()

    def stop_word_detected(self, text: str) -> bool:
        for stop_word in self.stop_words:
            stop_word = stop_word.lower().strip().split(" ")
            if bool(
                re.search(r"\b" + r"\s+".join(map(re.escape, stop_word)) + r"\b", text)
            ):
                logger.debug("Stop word detected: %s", stop_word)
                return True
        return False

    async def _send_stopword(
        self,
    ):
        if self.channel:
            self.channel.send("stopword")
            logger.debug("Sent stopword")

    def send_stopword(self):
        asyncio.run_coroutine_threadsafe(self._send_stopword(), self.loop)

    def determine_pause(  # type: ignore
        self, audio: np.ndarray, sampling_rate: int, state: ReplyOnStopWordsState
    ) -> bool:
        """Take in the stream and determine if a pause happened."""
        import librosa

        duration = len(audio) / sampling_rate

        if duration >= self.algo_options.audio_chunk_duration:
            if not state.stop_word_detected:
                audio_f32 = audio_to_float32((sampling_rate, audio))
                audio_rs = librosa.resample(
                    audio_f32, orig_sr=sampling_rate, target_sr=16000
                )
                if state.post_stop_word_buffer is None:
                    state.post_stop_word_buffer = audio_rs
                else:
                    state.post_stop_word_buffer = np.concatenate(
                        (state.post_stop_word_buffer, audio_rs)
                    )
                if len(state.post_stop_word_buffer) / 16000 > 2:
                    state.post_stop_word_buffer = state.post_stop_word_buffer[-32000:]
                dur_vad, chunks = self.model.vad(
                    (16000, state.post_stop_word_buffer),
                    self.model_options,
                    return_chunks=True,
                )
                text = stt_for_chunks((16000, state.post_stop_word_buffer), chunks)
                logger.debug(f"STT: {text}")
                state.stop_word_detected = self.stop_word_detected(text)
                if state.stop_word_detected:
                    logger.debug("Stop word detected")
                    self.send_stopword()
                state.buffer = None
            else:
                dur_vad = self.model.vad((sampling_rate, audio), self.model_options)
                logger.debug("VAD duration: %s", dur_vad)
                if (
                    dur_vad > self.algo_options.started_talking_threshold
                    and not state.started_talking
                    and state.stop_word_detected
                ):
                    state.started_talking = True
                    logger.debug("Started talking")
                if state.started_talking:
                    if state.stream is None:
                        state.stream = audio
                    else:
                        state.stream = np.concatenate((state.stream, audio))
                    state.buffer = None
                if (
                    dur_vad < self.algo_options.speech_threshold
                    and state.started_talking
                    and state.stop_word_detected
                ):
                    return True
        return False

    def reset(self):
        super().reset()
        self.generator = None
        self.event.clear()
        self.state = ReplyOnStopWordsState()

    def copy(self):
        return ReplyOnStopWords(
            self.fn,
            self.stop_words,
            self.algo_options,
            self.model_options,
            self.expected_layout,
            self.output_sample_rate,
            self.output_frame_size,
            self.input_sample_rate,
        )
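A sketch of the stop-word variant, reusing the `response` generator from the ReplyOnPause sketch above: the handler transcribes incoming audio until one of the stop words is heard, then falls back to the pause-detection behaviour. It assumes torch, librosa, and the silero STT model used by speech_to_text are available.

    from gradio_webrtc import ReplyOnStopWords

    handler = ReplyOnStopWords(
        response,                    # same generator signature as ReplyOnPause
        stop_words=["computer"],     # phrase(s) that arm the handler
    )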
3  backend/gradio_webrtc/speech_to_text/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .stt_ import get_stt_model, stt, stt_for_chunks

__all__ = ["stt", "stt_for_chunks", "get_stt_model"]
53  backend/gradio_webrtc/speech_to_text/stt_.py  Normal file
@@ -0,0 +1,53 @@
from dataclasses import dataclass
from functools import lru_cache
from typing import Callable

import numpy as np
from numpy.typing import NDArray

from ..utils import AudioChunk


@dataclass
class STTModel:
    encoder: Callable
    decoder: Callable


@lru_cache
def get_stt_model() -> STTModel:
    from silero import silero_stt

    model, decoder, _ = silero_stt(language="en", version="v6", jit_model="jit_xlarge")
    return STTModel(model, decoder)


def stt(audio: tuple[int, NDArray[np.int16]]) -> str:
    model = get_stt_model()
    sr, audio_np = audio
    if audio_np.dtype != np.float32:
        print("converting")
        audio_np = audio_np.astype(np.float32) / 32768.0
    try:
        import torch
    except ImportError:
        raise ImportError(
            "PyTorch is required to run speech-to-text for stopword detection. Run `pip install torch`."
        )
    audio_torch = torch.tensor(audio_np, dtype=torch.float32)
    if audio_torch.ndim == 1:
        audio_torch = audio_torch.unsqueeze(0)
    assert audio_torch.ndim == 2, "Audio must have a batch dimension"
    print("before")
    res = model.decoder(model.encoder(audio_torch)[0])
    print("after")
    return res


def stt_for_chunks(
    audio: tuple[int, NDArray[np.int16]], chunks: list[AudioChunk]
) -> str:
    sr, audio_np = audio
    return " ".join(
        [stt((sr, audio_np[chunk["start"] : chunk["end"]])) for chunk in chunks]
    )
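A minimal sketch of calling the silero STT helper directly on an int16 buffer. It assumes torch and the `silero` package are installed; the model weights are fetched on first use by get_stt_model, and a silent input would likely decode to an empty string.

    import numpy as np
    from gradio_webrtc import stt

    sample_rate = 16000
    audio = np.zeros(sample_rate, dtype=np.int16)  # one second of silence
    text = stt((sample_rate, audio))               # stt converts int16 -> float32 internally
    print(repr(text))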
313  backend/gradio_webrtc/utils.py  Normal file
@@ -0,0 +1,313 @@
import asyncio
import fractions
import io
import json
import logging
import tempfile
from contextvars import ContextVar
from typing import Any, Callable, Protocol, TypedDict, cast

import av
import numpy as np
from pydub import AudioSegment

logger = logging.getLogger(__name__)


AUDIO_PTIME = 0.020


class AudioChunk(TypedDict):
    start: int
    end: int


class AdditionalOutputs:
    def __init__(self, *args) -> None:
        self.args = args


class DataChannel(Protocol):
    def send(self, message: str) -> None: ...


current_channel: ContextVar[DataChannel | None] = ContextVar(
    "current_channel", default=None
)


def _send_log(message: str, type: str) -> None:
    async def _send(channel: DataChannel) -> None:
        channel.send(
            json.dumps(
                {
                    "type": type,
                    "message": message,
                }
            )
        )

    if channel := current_channel.get():
        print("channel", channel)
        try:
            loop = asyncio.get_running_loop()
            asyncio.run_coroutine_threadsafe(_send(channel), loop)
        except RuntimeError:
            asyncio.run(_send(channel))


def Warning(  # noqa: N802
    message: str = "Warning issued.",
):
    """
    Send a warning message that is displayed in the UI of the application.

    Parameters
    ----------
    message : str
        The warning message to send

    Returns
    -------
    None
    """
    _send_log(message, "warning")


class WebRTCError(Exception):
    def __init__(self, message: str) -> None:
        super().__init__(message)
        _send_log(message, "error")


def split_output(data: tuple | Any) -> tuple[Any, AdditionalOutputs | None]:
    if isinstance(data, AdditionalOutputs):
        return None, data
    if isinstance(data, tuple):
        # handle the bare audio case
        if 2 <= len(data) <= 3 and isinstance(data[1], np.ndarray):
            return data, None
        if not len(data) == 2:
            raise ValueError(
                "The tuple must have exactly two elements: the data and an instance of AdditionalOutputs."
            )
        if not isinstance(data[-1], AdditionalOutputs):
            raise ValueError(
                "The last element of the tuple must be an instance of AdditionalOutputs."
            )
        return data[0], cast(AdditionalOutputs, data[1])
    return data, None


async def player_worker_decode(
    next_frame: Callable,
    queue: asyncio.Queue,
    thread_quit: asyncio.Event,
    channel: Callable[[], DataChannel | None] | None,
    set_additional_outputs: Callable | None,
    quit_on_none: bool = False,
    sample_rate: int = 48000,
    frame_size: int = int(48000 * AUDIO_PTIME),
):
    audio_samples = 0
    audio_time_base = fractions.Fraction(1, sample_rate)
    audio_resampler = av.AudioResampler(  # type: ignore
        format="s16",
        layout="stereo",
        rate=sample_rate,
        frame_size=frame_size,
    )

    while not thread_quit.is_set():
        try:
            # Get next frame
            frame, outputs = split_output(
                await asyncio.wait_for(next_frame(), timeout=60)
            )
            if (
                isinstance(outputs, AdditionalOutputs)
                and set_additional_outputs
                and channel
                and channel()
            ):
                set_additional_outputs(outputs)
                cast(DataChannel, channel()).send("change")

            if frame is None:
                if quit_on_none:
                    await queue.put(None)
                    break
                continue

            if len(frame) == 2:
                sample_rate, audio_array = frame
                layout = "mono"
            elif len(frame) == 3:
                sample_rate, audio_array, layout = frame

            logger.debug(
                "received array with shape %s sample rate %s layout %s",
                audio_array.shape,  # type: ignore
                sample_rate,
                layout,  # type: ignore
            )
            format = "s16" if audio_array.dtype == "int16" else "fltp"  # type: ignore

            # Convert to audio frame and resample
            # This runs in the same timeout context
            frame = av.AudioFrame.from_ndarray(  # type: ignore
                audio_array,  # type: ignore
                format=format,
                layout=layout,  # type: ignore
            )
            frame.sample_rate = sample_rate

            for processed_frame in audio_resampler.resample(frame):
                processed_frame.pts = audio_samples
                processed_frame.time_base = audio_time_base
                audio_samples += processed_frame.samples
                await queue.put(processed_frame)
            logger.debug("Queue size utils.py: %s", queue.qsize())

        except (TimeoutError, asyncio.TimeoutError):
            logger.warning(
                "Timeout in frame processing cycle after %s seconds - resetting", 60
            )
            continue
        except Exception as e:
            import traceback

            exc = traceback.format_exc()
            logger.debug("traceback %s", exc)
            logger.error("Error processing frame: %s", str(e))
            continue


def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
    """
    Convert an audio tuple containing sample rate and numpy array data into bytes.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    bytes
        The audio data encoded as bytes, suitable for transmission or storage

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_bytes = audio_to_bytes(audio_tuple)
    """
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=1,
    )
    segment.export(audio_buffer, format="mp3")
    return audio_buffer.getvalue()


def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
    """
    Save an audio tuple containing sample rate and numpy array data to a file.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    str
        The path to the saved audio file

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> file_path = audio_to_file(audio_tuple)
    >>> print(f"Audio saved to: {file_path}")
    """
    bytes_ = audio_to_bytes(audio)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(bytes_)
    return f.name


def audio_to_float32(audio: tuple[int, np.ndarray]) -> np.ndarray:
    """
    Convert an audio tuple containing a sample rate and int16 numpy array data to float32.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    np.ndarray
        The audio data as a numpy array with dtype float32

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_float32 = audio_to_float32(audio_tuple)
    """
    return audio[1].astype(np.float32) / 32768.0


def aggregate_bytes_to_16bit(chunks_iterator):
    leftover = b""  # Store incomplete bytes between chunks

    for chunk in chunks_iterator:
        # Combine with any leftover bytes from previous chunk
        current_bytes = leftover + chunk

        # Calculate complete samples
        n_complete_samples = len(current_bytes) // 2  # int16 = 2 bytes
        bytes_to_process = n_complete_samples * 2

        # Split into complete samples and leftover
        to_process = current_bytes[:bytes_to_process]
        leftover = current_bytes[bytes_to_process:]

        if to_process:  # Only yield if we have complete samples
            audio_array = np.frombuffer(to_process, dtype=np.int16).reshape(1, -1)
            yield audio_array


async def async_aggregate_bytes_to_16bit(chunks_iterator):
    leftover = b""  # Store incomplete bytes between chunks

    async for chunk in chunks_iterator:
        # Combine with any leftover bytes from previous chunk
        current_bytes = leftover + chunk

        # Calculate complete samples
        n_complete_samples = len(current_bytes) // 2  # int16 = 2 bytes
        bytes_to_process = n_complete_samples * 2

        # Split into complete samples and leftover
        to_process = current_bytes[:bytes_to_process]
        leftover = current_bytes[bytes_to_process:]

        if to_process:  # Only yield if we have complete samples
            audio_array = np.frombuffer(to_process, dtype=np.int16).reshape(1, -1)
            yield audio_array
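A small self-contained sketch of aggregate_bytes_to_16bit from utils.py: it stitches an iterator of raw PCM byte chunks, possibly split mid-sample, into (1, N) int16 arrays without dropping samples.

    import numpy as np
    from gradio_webrtc import aggregate_bytes_to_16bit

    pcm = np.arange(10, dtype=np.int16).tobytes()     # 20 bytes of fake PCM
    chunks = [pcm[:7], pcm[7:15], pcm[15:]]           # odd-sized chunks split mid-sample
    frames = list(aggregate_bytes_to_16bit(iter(chunks)))
    assert np.concatenate(frames, axis=1).size == 10  # all 10 samples recovered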
1151  backend/gradio_webrtc/webrtc.py  Normal file
File diff suppressed because it is too large