Add some utils fns, add moshi to cookbook, fix querySelector, support async functions in ReplyOnPause (#29)

* add * add code
2026-02-05 01:49:23 +08:00 · 2024-12-04 15:14:19 -05:00
parent c85c117576
commit 868e0bfa64
9 changed files with 158 additions and 10 deletions
--- a/backend/gradio_webrtc/init.py
+++ b/backend/gradio_webrtc/init.py
@@ -4,12 +4,14 @@ from .credentials import (
    get_twilio_turn_credentials,
 )
 from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
-from .utils import AdditionalOutputs
+from .utils import AdditionalOutputs, audio_to_bytes, audio_to_file
 from .webrtc import StreamHandler, WebRTC

 __all__ = [
    "AlgoOptions",
    "AdditionalOutputs",
+    "audio_to_bytes",
+    "audio_to_file",
    "get_hf_turn_credentials",
    "get_twilio_turn_credentials",
    "get_turn_credentials",
--- a/backend/gradio_webrtc/reply_on_pause.py
+++ b/backend/gradio_webrtc/reply_on_pause.py
@@ -70,6 +70,10 @@ ReplyFnGenerator = Union[
 ]


+async def iterate(generator: Generator) -> Any:
+    return next(generator)
+
+
 class ReplyOnPause(StreamHandler):
    def __init__(
        self,
@@ -86,6 +90,7 @@ class ReplyOnPause(StreamHandler):
        self.output_frame_size = output_frame_size
        self.model = get_vad_model()
        self.fn = fn
+        self.is_async = inspect.isasyncgenfunction(fn)
        self.event = Event()
        self.state = AppState()
        self.generator = None
@@ -172,6 +177,9 @@ class ReplyOnPause(StreamHandler):
            self.channel.send("tick")
            logger.debug("Sent tick")

+    async def async_iterate(self, generator) -> Any:
+        return await anext(generator)
+
    def emit(self):
        if not self.event.is_set():
            return None
@@ -190,6 +198,11 @@ class ReplyOnPause(StreamHandler):
                logger.debug("Latest args: %s", self.latest_args)
            self.state.responding = True
            try:
-                return next(self.generator)
-            except StopIteration:
+                if self.is_async:
+                    return asyncio.run_coroutine_threadsafe(
+                        self.async_iterate(self.generator), self.loop
+                    ).result()
+                else:
+                    return next(self.generator)
+            except (StopIteration, StopAsyncIteration):
                self.reset()
--- a/backend/gradio_webrtc/utils.py
+++ b/backend/gradio_webrtc/utils.py
@@ -1,10 +1,13 @@
 import asyncio
 import fractions
+import io
 import logging
+import tempfile
 from typing import Any, Callable, Protocol, cast

 import av
 import numpy as np
+from pydub import AudioSegment

 logger = logging.getLogger(__name__)

@@ -120,3 +123,67 @@ async def player_worker_decode(
            logger.debug("traceback %s", exec)
            logger.error("Error processing frame: %s", str(e))
            continue
+
+
+def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
+    """
+    Convert an audio tuple containing sample rate and numpy array data into bytes.
+
+    Parameters
+    ----------
+    audio : tuple[int, np.ndarray]
+        A tuple containing:
+            - sample_rate (int): The audio sample rate in Hz
+            - data (np.ndarray): The audio data as a numpy array
+
+    Returns
+    -------
+    bytes
+        The audio data encoded as bytes, suitable for transmission or storage
+
+    Example
+    -------
+    >>> sample_rate = 44100
+    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
+    >>> audio_tuple = (sample_rate, audio_data)
+    >>> audio_bytes = audio_to_bytes(audio_tuple)
+    """
+    audio_buffer = io.BytesIO()
+    segment = AudioSegment(
+        audio[1].tobytes(),
+        frame_rate=audio[0],
+        sample_width=audio[1].dtype.itemsize,
+        channels=1,
+    )
+    segment.export(audio_buffer, format="mp3")
+    return audio_buffer.getvalue()
+
+
+def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
+    """
+    Save an audio tuple containing sample rate and numpy array data to a file.
+
+    Parameters
+    ----------
+    audio : tuple[int, np.ndarray]
+        A tuple containing:
+            - sample_rate (int): The audio sample rate in Hz
+            - data (np.ndarray): The audio data as a numpy array
+
+    Returns
+    -------
+    str
+        The path to the saved audio file
+
+    Example
+    -------
+    >>> sample_rate = 44100
+    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
+    >>> audio_tuple = (sample_rate, audio_data)
+    >>> file_path = audio_to_file(audio_tuple)
+    >>> print(f"Audio saved to: {file_path}")
+    """
+    bytes_ = audio_to_bytes(audio)
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        f.write(bytes_)
+    return f.name
--- a/docs/cookbook.md
+++ b/docs/cookbook.md
@@ -24,6 +24,18 @@
    
    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)

+-   :speaking_head:{ .lg .middle } __Kyutai Moshi__
+
+    ---
+
+    Kyutai's Moshi is a novel speech-to-speech model for modeling human conversations.
+
+    <video width=98% src="https://github.com/user-attachments/assets/becc7a13-9e89-4a19-9df2-5fb1467a0137" controls style="text-align: center"></video>
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi)
+    
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi/blob/main/app.py)
+
 -   :robot:{ .lg .middle } __Llama Code Editor__

    ---
--- a/docs/index.md
+++ b/docs/index.md
@@ -22,7 +22,4 @@ pip install gradio_webrtc[vad]
 ```

 ## Examples
-1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
-2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
-3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
-4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
+See the [cookbook](cookbook.md).
--- a/docs/utils.md
+++ b/docs/utils.md
@@ -0,0 +1,54 @@
+# Utils
+
+## `audio_to_bytes`
+
+Convert an audio tuple containing sample rate and numpy array data into MP3-encoded bytes.
+Useful for sending audio to external APIs from a `ReplyOnPause` handler.
+
+Parameters
+```
+audio : tuple[int, np.ndarray]
+    A tuple containing:
+        - sample_rate (int): The audio sample rate in Hz
+        - data (np.ndarray): The audio data as a numpy array
+```
+
+Returns
+```
+bytes
+    The audio data encoded as bytes, suitable for transmission or storage
+```
+
+Example
+```python
+>>> sample_rate = 44100
+>>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
+>>> audio_tuple = (sample_rate, audio_data)
+>>> audio_bytes = audio_to_bytes(audio_tuple)
+```
+
+## `audio_to_file`
+
+Save an audio tuple containing sample rate and numpy array data to a temporary `.mp3` file (the file is not deleted automatically).
+
+Parameters
+```
+audio : tuple[int, np.ndarray]
+    A tuple containing:
+        - sample_rate (int): The audio sample rate in Hz
+        - data (np.ndarray): The audio data as a numpy array
+```
+Returns
+```
+str
+    The path to the saved audio file
+```
+Example
+
+```python
+>>> sample_rate = 44100
+>>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
+>>> audio_tuple = (sample_rate, audio_data)
+>>> file_path = audio_to_file(audio_tuple)
+>>> print(f"Audio saved to: {file_path}")
+```
--- a/frontend/shared/AudioWave.svelte
+++ b/frontend/shared/AudioWave.svelte
@@ -41,8 +41,7 @@
  
    function updateBars() {
      analyser.getByteFrequencyData(dataArray);
-      
-      const bars = document.querySelectorAll('.box');
+      const bars = document.querySelectorAll('.waveContainer .box');
      for (let i = 0; i < bars.length; i++) {
        const barHeight = (dataArray[i] / 255) * 2; // Amplify the effect
        bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -19,6 +19,7 @@ nav:
  - Cookbook: cookbook.md
  - Deployment: deployment.md
  - Advanced Configuration: advanced-configuration.md
+  - Utils: utils.md
  - Frequently Asked Questions: faq.md
 markdown_extensions:
  - pymdownx.highlight:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"

 [project]
 name = "gradio_webrtc"
-version = "0.0.15"
+version = "0.0.16"
 description = "Stream images in realtime with webrtc"
 readme = "README.md"
 license = "apache-2.0"
@@ -50,3 +50,6 @@ artifacts = ["/backend/gradio_webrtc/templates", "*.pyi"]

 [tool.hatch.build.targets.wheel]
 packages = ["/backend/gradio_webrtc"]
+
+[tool.ruff]
+target-version = "py310"