From 44aac8d9643bc4416c838d94da7b19be9b7c3b7f Mon Sep 17 00:00:00 2001
From: Freddy Boulton
Date: Tue, 18 Mar 2025 18:53:47 -0400
Subject: [PATCH] Fix issue when the audio stream mixes sample rates and numpy array data types (#188)

* Fix code

* Fix

* keep same
---
 backend/fastrtc/utils.py         | 21 ++++++++++++++++-----
 demo/talk_to_azure_openai/app.py |  3 +--
 docs/userguide/audio.md          |  4 ++--
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/backend/fastrtc/utils.py b/backend/fastrtc/utils.py
index bd1772c..21f8677 100644
--- a/backend/fastrtc/utils.py
+++ b/backend/fastrtc/utils.py
@@ -11,6 +11,7 @@ from contextvars import ContextVar
 from typing import Any, Callable, Literal, Protocol, TypedDict, cast

 import av
+import librosa
 import numpy as np
 from numpy.typing import NDArray
 from pydub import AudioSegment
@@ -134,7 +135,7 @@ async def player_worker_decode(
         rate=sample_rate,
         frame_size=frame_size,
     )
-
+    first_sample_rate = None
     while not thread_quit.is_set():
         try:
             # Get next frame
@@ -174,19 +175,29 @@ async def player_worker_decode(
                 layout,  # type: ignore
             )

             format = "s16" if audio_array.dtype == "int16" else "fltp"  # type: ignore
+            if first_sample_rate is None:
+                first_sample_rate = sample_rate
+
+            if format == "s16":
+                audio_array = audio_to_float32((sample_rate, audio_array))
+
+            if first_sample_rate != sample_rate:
+                audio_array = librosa.resample(
+                    audio_array, target_sr=first_sample_rate, orig_sr=sample_rate
+                )

             if audio_array.ndim == 1:
                 audio_array = audio_array.reshape(1, -1)

-            # Convert to audio frame and resample
+            # Convert to audio frame and
+            # This runs in the same timeout context
             frame = av.AudioFrame.from_ndarray(  # type: ignore
                 audio_array,  # type: ignore
-                format=format,
+                format="fltp",
                 layout=layout,  # type: ignore
             )
-            frame.sample_rate = sample_rate
-
+            frame.sample_rate = first_sample_rate
             for processed_frame in audio_resampler.resample(frame):
                 processed_frame.pts = audio_samples
                 processed_frame.time_base = audio_time_base
diff --git a/demo/talk_to_azure_openai/app.py b/demo/talk_to_azure_openai/app.py
index 59b3c8f..a278ccc 100644
--- a/demo/talk_to_azure_openai/app.py
+++ b/demo/talk_to_azure_openai/app.py
@@ -2,11 +2,10 @@ import asyncio
 import base64
 import json
 from pathlib import Path

-import sounddevice as sd
+import aiohttp  # pip install aiohttp
 import gradio as gr
 import numpy as np
-import aiohttp  # pip install aiohttp
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, StreamingResponse
diff --git a/docs/userguide/audio.md b/docs/userguide/audio.md
index dcb937c..e012d31 100644
--- a/docs/userguide/audio.md
+++ b/docs/userguide/audio.md
@@ -86,7 +86,7 @@ from fastrtc import get_tts_model, Stream, ReplyOnPause

 tts_client = get_tts_model()

-def detection(audio: tuple[int, np.ndarray]):
+def echo(audio: tuple[int, np.ndarray]):
     # Implement any iterator that yields audio
     # See "LLM Voice Chat" for a more complete example
     yield audio
@@ -98,7 +98,7 @@ def startup():


 stream = Stream(
-    handler=ReplyOnPause(echo, startup_fn=startup),
+    handler=ReplyOnPause(echo, startup_fn=startup),
     modality="audio",
     mode="send-receive",
     ui_args={"title": "Echo Audio"},
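
Note on the change: incoming audio chunks can arrive with different sample rates and dtypes (int16 vs. float32), so the patched player pins playback to the first chunk's sample rate, converts int16 chunks to float32, and resamples any chunk whose rate differs before building the AudioFrame with format "fltp". Below is a minimal standalone sketch of that normalization, not the fastrtc implementation itself; normalize_chunks is a hypothetical name, and audio_to_float32 here is a local stand-in for fastrtc's helper of the same name.

import librosa
import numpy as np


def audio_to_float32(audio: tuple[int, np.ndarray]) -> np.ndarray:
    # Stand-in for fastrtc's audio_to_float32 helper: scale int16 PCM into [-1.0, 1.0].
    _, array = audio
    return array.astype(np.float32) / 32768.0


def normalize_chunks(chunks):
    # Yield (rate, float32 array) pairs, all resampled to the first chunk's rate.
    first_sample_rate = None
    for sample_rate, audio_array in chunks:
        if first_sample_rate is None:
            first_sample_rate = sample_rate
        if audio_array.dtype == np.int16:
            audio_array = audio_to_float32((sample_rate, audio_array))
        else:
            audio_array = audio_array.astype(np.float32)
        if sample_rate != first_sample_rate:
            # Resample along the time axis to the pinned output rate.
            audio_array = librosa.resample(
                audio_array, orig_sr=sample_rate, target_sr=first_sample_rate
            )
        yield first_sample_rate, audio_array

Chunks normalized this way can then be handed to av.AudioFrame.from_ndarray with format="fltp" and a fixed frame.sample_rate, which roughly mirrors what the patched player_worker_decode does.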