From 44aac8d9643bc4416c838d94da7b19be9b7c3b7f Mon Sep 17 00:00:00 2001
From: Freddy Boulton
Date: Tue, 18 Mar 2025 18:53:47 -0400
Subject: [PATCH] Fix issue when the audio stream mixes sample rates and numpy array data types (#188)

* Fix code

* Fix

* keep same
---
 backend/fastrtc/utils.py         | 21 ++++++++++++++++-----
 demo/talk_to_azure_openai/app.py |  3 +--
 docs/userguide/audio.md          |  4 ++--
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/backend/fastrtc/utils.py b/backend/fastrtc/utils.py
index bd1772c..21f8677 100644
--- a/backend/fastrtc/utils.py
+++ b/backend/fastrtc/utils.py
@@ -11,6 +11,7 @@ from contextvars import ContextVar
 from typing import Any, Callable, Literal, Protocol, TypedDict, cast

 import av
+import librosa
 import numpy as np
 from numpy.typing import NDArray
 from pydub import AudioSegment
@@ -134,7 +135,7 @@ async def player_worker_decode(
         rate=sample_rate,
         frame_size=frame_size,
     )
-
+    first_sample_rate = None
     while not thread_quit.is_set():
         try:
             # Get next frame
@@ -174,19 +175,29 @@ async def player_worker_decode(
                 layout,  # type: ignore
             )

             format = "s16" if audio_array.dtype == "int16" else "fltp"  # type: ignore
+            if first_sample_rate is None:
+                first_sample_rate = sample_rate
+
+            if format == "s16":
+                audio_array = audio_to_float32((sample_rate, audio_array))
+
+            if first_sample_rate != sample_rate:
+                audio_array = librosa.resample(
+                    audio_array, target_sr=first_sample_rate, orig_sr=sample_rate
+                )

             if audio_array.ndim == 1:
                 audio_array = audio_array.reshape(1, -1)

-            # Convert to audio frame and resample
+            # Convert to audio frame and
+            # This runs in the same timeout context
             frame = av.AudioFrame.from_ndarray(  # type: ignore
                 audio_array,  # type: ignore
-                format=format,
+                format="fltp",
                 layout=layout,  # type: ignore
             )
-            frame.sample_rate = sample_rate
-
+            frame.sample_rate = first_sample_rate
             for processed_frame in audio_resampler.resample(frame):
                 processed_frame.pts = audio_samples
                 processed_frame.time_base = audio_time_base
diff --git a/demo/talk_to_azure_openai/app.py b/demo/talk_to_azure_openai/app.py
index 59b3c8f..a278ccc 100644
--- a/demo/talk_to_azure_openai/app.py
+++ b/demo/talk_to_azure_openai/app.py
@@ -2,11 +2,10 @@ import asyncio
 import base64
 import json
 from pathlib import Path

-import sounddevice as sd
+import aiohttp  # pip install aiohttp
 import gradio as gr
 import numpy as np
-import aiohttp  # pip install aiohttp
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, StreamingResponse
diff --git a/docs/userguide/audio.md b/docs/userguide/audio.md
index dcb937c..e012d31 100644
--- a/docs/userguide/audio.md
+++ b/docs/userguide/audio.md
@@ -86,7 +86,7 @@ from fastrtc import get_tts_model, Stream, ReplyOnPause

 tts_client = get_tts_model()

-def detection(audio: tuple[int, np.ndarray]):
+def echo(audio: tuple[int, np.ndarray]):
     # Implement any iterator that yields audio
     # See "LLM Voice Chat" for a more complete example
     yield audio
@@ -98,7 +98,7 @@ def startup():


 stream = Stream(
-    handler=ReplyOnPause(echo, startup_fn=startup),
+    handler=ReplyOnPause(echo, startup_fn=startup),
     modality="audio",
     mode="send-receive",
     ui_args={"title": "Echo Audio"},
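
Note on the change: incoming audio chunks can arrive with different sample rates and dtypes (int16 vs. float32), so the patched player pins playback to the first chunk's sample rate, converts int16 chunks to float32, and resamples any chunk whose rate differs before building the AudioFrame with format "fltp". Below is a minimal standalone sketch of that normalization, not the fastrtc implementation itself; normalize_chunks is a hypothetical name, and audio_to_float32 here is a local stand-in for fastrtc's helper of the same name.

import librosa
import numpy as np


def audio_to_float32(audio: tuple[int, np.ndarray]) -> np.ndarray:
    # Stand-in for fastrtc's audio_to_float32 helper: scale int16 PCM into [-1.0, 1.0].
    _, array = audio
    return array.astype(np.float32) / 32768.0


def normalize_chunks(chunks):
    # Yield (rate, float32 array) pairs, all resampled to the first chunk's rate.
    first_sample_rate = None
    for sample_rate, audio_array in chunks:
        if first_sample_rate is None:
            first_sample_rate = sample_rate
        if audio_array.dtype == np.int16:
            audio_array = audio_to_float32((sample_rate, audio_array))
        else:
            audio_array = audio_array.astype(np.float32)
        if sample_rate != first_sample_rate:
            # Resample along the time axis to the pinned output rate.
            audio_array = librosa.resample(
                audio_array, orig_sr=sample_rate, target_sr=first_sample_rate
            )
        yield first_sample_rate, audio_array

Chunks normalized this way can then be handed to av.AudioFrame.from_ndarray with format="fltp" and a fixed frame.sample_rate, which roughly mirrors what the patched player_worker_decode does.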