diff --git a/backend/fastrtc/reply_on_pause.py b/backend/fastrtc/reply_on_pause.py index 1e3215d..64623dc 100644 --- a/backend/fastrtc/reply_on_pause.py +++ b/backend/fastrtc/reply_on_pause.py @@ -74,7 +74,7 @@ class ReplyOnPause(StreamHandler): can_interrupt: bool = True, expected_layout: Literal["mono", "stereo"] = "mono", output_sample_rate: int = 24000, - output_frame_size: int = 480, + output_frame_size: int | None = None, # Deprecated input_sample_rate: int = 48000, model: PauseDetectionModel | None = None, ): @@ -86,8 +86,6 @@ class ReplyOnPause(StreamHandler): ) self.can_interrupt = can_interrupt self.expected_layout: Literal["mono", "stereo"] = expected_layout - self.output_sample_rate = output_sample_rate - self.output_frame_size = output_frame_size self.model = model or get_silero_model() self.fn = fn self.is_async = inspect.isasyncgenfunction(fn) diff --git a/backend/fastrtc/reply_on_stopwords.py b/backend/fastrtc/reply_on_stopwords.py index 7d082d7..5503851 100644 --- a/backend/fastrtc/reply_on_stopwords.py +++ b/backend/fastrtc/reply_on_stopwords.py @@ -39,7 +39,7 @@ class ReplyOnStopWords(ReplyOnPause): can_interrupt: bool = True, expected_layout: Literal["mono", "stereo"] = "mono", output_sample_rate: int = 24000, - output_frame_size: int = 480, + output_frame_size: int | None = None, # Deprecated input_sample_rate: int = 48000, model: PauseDetectionModel | None = None, ): diff --git a/backend/fastrtc/tracks.py b/backend/fastrtc/tracks.py index 36fcea1..1b272e9 100644 --- a/backend/fastrtc/tracks.py +++ b/backend/fastrtc/tracks.py @@ -10,6 +10,7 @@ import logging import threading import time import traceback +import warnings from abc import ABC, abstractmethod from collections.abc import Callable from dataclasses import dataclass @@ -239,13 +240,12 @@ class StreamHandlerBase(ABC): self, expected_layout: Literal["mono", "stereo"] = "mono", output_sample_rate: int = 24000, - output_frame_size: int = 960, + output_frame_size: int | None = None, 
input_sample_rate: int = 48000, fps: int = 30, ) -> None: self.expected_layout = expected_layout self.output_sample_rate = output_sample_rate - self.output_frame_size = output_frame_size self.input_sample_rate = input_sample_rate self.fps = fps self.latest_args: list[Any] = [] @@ -257,6 +257,30 @@ class StreamHandlerBase(ABC): self._phone_mode = False self._clear_queue: Callable | None = None + sample_rate_to_frame_size_coef = 50 + if output_sample_rate % sample_rate_to_frame_size_coef != 0: + raise ValueError( + "output_sample_rate must be a multiple of " + f"{sample_rate_to_frame_size_coef}, got {output_sample_rate}" + ) + + actual_output_frame_size = output_sample_rate // sample_rate_to_frame_size_coef + if ( + output_frame_size is not None + and output_frame_size != actual_output_frame_size + ): + warnings.warn( + "The output_frame_size parameter is deprecated and will be removed " + "in a future release. The value passed in will be ignored. " + f"The actual output frame size is {actual_output_frame_size}, " + f"corresponding to {1 / sample_rate_to_frame_size_coef:.2f}s " + f"at {output_sample_rate=}Hz.", + # DeprecationWarning is filtered out by default, so use UserWarning + UserWarning, + stacklevel=2, # So that the warning points to the user's code + ) + self.output_frame_size = actual_output_frame_size + @property def clear_queue(self) -> Callable: return cast(Callable, self._clear_queue) diff --git a/demo/gemini_audio_video/app.py b/demo/gemini_audio_video/app.py index b6a2d7f..e62dfc0 100644 --- a/demo/gemini_audio_video/app.py +++ b/demo/gemini_audio_video/app.py @@ -44,7 +44,6 @@ class GeminiHandler(AsyncAudioVideoStreamHandler): super().__init__( "mono", output_sample_rate=24000, - output_frame_size=480, input_sample_rate=16000, ) self.audio_queue = asyncio.Queue() diff --git a/demo/gemini_conversation/app.py b/demo/gemini_conversation/app.py index 907693b..8dcb21c 100644 --- a/demo/gemini_conversation/app.py +++ b/demo/gemini_conversation/app.py @@ 
-42,7 +42,6 @@ class GeminiHandler(AsyncStreamHandler): super().__init__( expected_layout="mono", output_sample_rate=24000, - output_frame_size=480, input_sample_rate=24000, ) self.input_queue: asyncio.Queue = asyncio.Queue() diff --git a/demo/talk_to_azure_openai/app.py b/demo/talk_to_azure_openai/app.py index a278ccc..ce7ad19 100644 --- a/demo/talk_to_azure_openai/app.py +++ b/demo/talk_to_azure_openai/app.py @@ -38,7 +38,6 @@ class AzureAudioHandler(AsyncStreamHandler): super().__init__( expected_layout="mono", output_sample_rate=SAMPLE_RATE, - output_frame_size=480, input_sample_rate=SAMPLE_RATE, ) self.ws = None diff --git a/demo/talk_to_gemini/app.py b/demo/talk_to_gemini/app.py index db5dcff..7e8929c 100644 --- a/demo/talk_to_gemini/app.py +++ b/demo/talk_to_gemini/app.py @@ -43,12 +43,10 @@ class GeminiHandler(AsyncStreamHandler): self, expected_layout: Literal["mono"] = "mono", output_sample_rate: int = 24000, - output_frame_size: int = 480, ) -> None: super().__init__( expected_layout, output_sample_rate, - output_frame_size, input_sample_rate=16000, ) self.input_queue: asyncio.Queue = asyncio.Queue() @@ -59,7 +57,6 @@ class GeminiHandler(AsyncStreamHandler): return GeminiHandler( expected_layout="mono", output_sample_rate=self.output_sample_rate, - output_frame_size=self.output_frame_size, ) async def start_up(self): diff --git a/demo/talk_to_openai/app.py b/demo/talk_to_openai/app.py index e60ec1f..2bf13ab 100644 --- a/demo/talk_to_openai/app.py +++ b/demo/talk_to_openai/app.py @@ -33,7 +33,6 @@ class OpenAIHandler(AsyncStreamHandler): super().__init__( expected_layout="mono", output_sample_rate=SAMPLE_RATE, - output_frame_size=480, input_sample_rate=SAMPLE_RATE, ) self.connection = None diff --git a/docs/advanced-configuration.md b/docs/advanced-configuration.md index 36aacbd..7e3fd8b 100644 --- a/docs/advanced-configuration.md +++ b/docs/advanced-configuration.md @@ -92,27 +92,19 @@ stream = Stream( ## Stream Handler Output Audio -You can configure 
the output audio chunk size of `ReplyOnPause` (and any `StreamHandler`) -with the `output_sample_rate` and `output_frame_size` parameters. - -The following code (which uses the default values of these parameters), states that each output chunk will be a frame of 960 samples at a frame rate of `24,000` hz. So it will correspond to `0.04` seconds. +You can configure the output sampling rate of `ReplyOnPause` (and any `StreamHandler`) +with the `output_sample_rate` parameter. For example: ```python from fastrtc import ReplyOnPause, Stream stream = Stream( - handler=ReplyOnPause(..., output_sample_rate=24000, output_frame_size=960), + handler=ReplyOnPause(..., output_sample_rate=16000), modality="audio", mode="send-receive" ) ``` -!!! tip - - In general it is best to leave these settings untouched. In some cases, - lowering the output_frame_size can yield smoother audio playback. - - ## Audio Icon You can display an icon of your choice instead of the default wave animation for audio streaming.