Audio to float32 util (#32)

* add util * version bump
2026-02-05 01:49:23 +08:00 · 2024-12-06 17:31:12 -05:00
parent 80283f6631
commit 903f1f70bd
4 changed files with 36 additions and 3 deletions
--- a/backend/gradio_webrtc/__init__.py
+++ b/backend/gradio_webrtc/__init__.py
@@ -4,7 +4,7 @@ from .credentials import (
    get_twilio_turn_credentials,
 )
 from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
-from .utils import AdditionalOutputs, audio_to_bytes, audio_to_file
+from .utils import AdditionalOutputs, audio_to_bytes, audio_to_file, audio_to_float32
 from .webrtc import StreamHandler, WebRTC

 __all__ = [
@@ -12,6 +12,7 @@ __all__ = [
    "AdditionalOutputs",
    "audio_to_bytes",
    "audio_to_file",
+    "audio_to_float32",
    "get_hf_turn_credentials",
    "get_twilio_turn_credentials",
    "get_turn_credentials",
--- a/backend/gradio_webrtc/reply_on_pause.py
+++ b/backend/gradio_webrtc/reply_on_pause.py
@@ -83,8 +83,14 @@ class ReplyOnPause(StreamHandler):
        expected_layout: Literal["mono", "stereo"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
+        input_sample_rate: int = 48000,
    ):
-        super().__init__(expected_layout, output_sample_rate, output_frame_size)
+        super().__init__(
+            expected_layout,
+            output_sample_rate,
+            output_frame_size,
+            input_sample_rate=input_sample_rate,
+        )
        self.expected_layout: Literal["mono", "stereo"] = expected_layout
        self.output_sample_rate = output_sample_rate
        self.output_frame_size = output_frame_size
--- a/backend/gradio_webrtc/utils.py
+++ b/backend/gradio_webrtc/utils.py
@@ -187,3 +187,29 @@ def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(bytes_)
    return f.name
+
+
+def audio_to_float32(audio: tuple[int, np.ndarray]) -> np.ndarray:
+    """
+    Convert an audio tuple containing a sample rate and int16 numpy array data to float32.
+
+    Parameters
+    ----------
+    audio : tuple[int, np.ndarray]
+        A tuple containing:
+            - sample_rate (int): The audio sample rate in Hz
+            - data (np.ndarray): The audio data as a numpy array (expected to hold int16 samples)
+
+    Returns
+    -------
+    np.ndarray
+        The audio data as a numpy array with dtype float32, scaled by 1/32768
+
+    Example
+    -------
+    >>> sample_rate = 44100
+    >>> audio_data = np.array([3277, -6554, 9830], dtype=np.int16)  # Example int16 audio samples
+    >>> audio_tuple = (sample_rate, audio_data)
+    >>> audio_float32 = audio_to_float32(audio_tuple)
+    """
+    return audio[1].astype(np.float32) / 32768.0
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"

 [project]
 name = "gradio_webrtc"
-version = "0.0.19"
+version = "0.0.20"
 description = "Stream images in realtime with webrtc"
 readme = "README.md"
 license = "apache-2.0"