Add some utils fns, add moshi to cookbook, fix querySelector, support async functions in ReplyOnPause (#29)

* add

* add code
This commit is contained in:
Freddy Boulton
2024-12-04 15:14:19 -05:00
committed by GitHub
parent c85c117576
commit 868e0bfa64
9 changed files with 158 additions and 10 deletions

View File

@@ -4,12 +4,14 @@ from .credentials import (
get_twilio_turn_credentials, get_twilio_turn_credentials,
) )
from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
from .utils import AdditionalOutputs from .utils import AdditionalOutputs, audio_to_bytes, audio_to_file
from .webrtc import StreamHandler, WebRTC from .webrtc import StreamHandler, WebRTC
__all__ = [ __all__ = [
"AlgoOptions", "AlgoOptions",
"AdditionalOutputs", "AdditionalOutputs",
"audio_to_bytes",
"audio_to_file",
"get_hf_turn_credentials", "get_hf_turn_credentials",
"get_twilio_turn_credentials", "get_twilio_turn_credentials",
"get_turn_credentials", "get_turn_credentials",

View File

@@ -70,6 +70,10 @@ ReplyFnGenerator = Union[
] ]
async def iterate(generator: Generator) -> Any:
    """Advance a synchronous *generator* one step from an async context.

    Returns the next yielded value; lets StopIteration propagate when the
    generator is exhausted so the caller can detect completion.
    """
    return generator.__next__()
class ReplyOnPause(StreamHandler): class ReplyOnPause(StreamHandler):
def __init__( def __init__(
self, self,
@@ -86,6 +90,7 @@ class ReplyOnPause(StreamHandler):
self.output_frame_size = output_frame_size self.output_frame_size = output_frame_size
self.model = get_vad_model() self.model = get_vad_model()
self.fn = fn self.fn = fn
self.is_async = inspect.isasyncgenfunction(fn)
self.event = Event() self.event = Event()
self.state = AppState() self.state = AppState()
self.generator = None self.generator = None
@@ -172,6 +177,9 @@ class ReplyOnPause(StreamHandler):
self.channel.send("tick") self.channel.send("tick")
logger.debug("Sent tick") logger.debug("Sent tick")
async def async_iterate(self, generator) -> Any:
    """Pull the next item from an asynchronous *generator*.

    Awaits one step of the async generator and returns the yielded value;
    StopAsyncIteration propagates to the caller on exhaustion.
    """
    return await generator.__anext__()
def emit(self): def emit(self):
if not self.event.is_set(): if not self.event.is_set():
return None return None
@@ -190,6 +198,11 @@ class ReplyOnPause(StreamHandler):
logger.debug("Latest args: %s", self.latest_args) logger.debug("Latest args: %s", self.latest_args)
self.state.responding = True self.state.responding = True
try: try:
return next(self.generator) if self.is_async:
except StopIteration: return asyncio.run_coroutine_threadsafe(
self.async_iterate(self.generator), self.loop
).result()
else:
return next(self.generator)
except (StopIteration, StopAsyncIteration):
self.reset() self.reset()

View File

@@ -1,10 +1,13 @@
import asyncio import asyncio
import fractions import fractions
import io
import logging import logging
import tempfile
from typing import Any, Callable, Protocol, cast from typing import Any, Callable, Protocol, cast
import av import av
import numpy as np import numpy as np
from pydub import AudioSegment
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -120,3 +123,67 @@ async def player_worker_decode(
logger.debug("traceback %s", exec) logger.debug("traceback %s", exec)
logger.error("Error processing frame: %s", str(e)) logger.error("Error processing frame: %s", str(e))
continue continue
def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
    """
    Encode an (sample_rate, samples) audio tuple as MP3 bytes.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array
          (treated as a single mono channel; sample width is taken
          from the array dtype's item size)

    Returns
    -------
    bytes
        The MP3-encoded audio, suitable for transmission or storage

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_bytes = audio_to_bytes(audio_tuple)
    """
    sample_rate, samples = audio
    segment = AudioSegment(
        data=samples.tobytes(),
        sample_width=samples.dtype.itemsize,
        frame_rate=sample_rate,
        channels=1,
    )
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3")
    return buffer.getvalue()
def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
    """
    Write an (sample_rate, samples) audio tuple to a temporary MP3 file.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
        - sample_rate (int): The audio sample rate in Hz
        - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    str
        The path to the saved audio file; the file is NOT auto-deleted,
        so the caller owns its cleanup.

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> file_path = audio_to_file(audio_tuple)
    >>> print(f"Audio saved to: {file_path}")
    """
    mp3_payload = audio_to_bytes(audio)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out:
        out.write(mp3_payload)
        return out.name

View File

@@ -24,6 +24,18 @@
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py) [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)
- :speaking_head:{ .lg .middle } __Kyutai Moshi__
---
Kyutai's Moshi is a novel speech-to-speech model for modeling human conversations.
<video width=98% src="https://github.com/user-attachments/assets/becc7a13-9e89-4a19-9df2-5fb1467a0137" controls style="text-align: center"></video>
[:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi)
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi/blob/main/app.py)
- :robot:{ .lg .middle } __Llama Code Editor__ - :robot:{ .lg .middle } __Llama Code Editor__
--- ---

View File

@@ -22,7 +22,4 @@ pip install gradio_webrtc[vad]
``` ```
## Examples ## Examples
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷 See the [cookbook](/cookbook)
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️

54
docs/utils.md Normal file
View File

@@ -0,0 +1,54 @@
# Utils
## `audio_to_bytes`
Convert an audio tuple containing sample rate and numpy array data into bytes.
Useful for sending data to external APIs from `ReplyOnPause` handler.
Parameters
```
audio : tuple[int, np.ndarray]
A tuple containing:
- sample_rate (int): The audio sample rate in Hz
- data (np.ndarray): The audio data as a numpy array
```
Returns
```
bytes
The audio data encoded as bytes, suitable for transmission or storage
```
Example
```python
>>> sample_rate = 44100
>>> audio_data = np.array([0.1, -0.2, 0.3]) # Example audio samples
>>> audio_tuple = (sample_rate, audio_data)
>>> audio_bytes = audio_to_bytes(audio_tuple)
```
## `audio_to_file`
Save an audio tuple containing sample rate and numpy array data to a file.
Parameters
```
audio : tuple[int, np.ndarray]
A tuple containing:
- sample_rate (int): The audio sample rate in Hz
- data (np.ndarray): The audio data as a numpy array
```
Returns
```
str
The path to the saved audio file
```
Example
```python
>>> sample_rate = 44100
>>> audio_data = np.array([0.1, -0.2, 0.3]) # Example audio samples
>>> audio_tuple = (sample_rate, audio_data)
>>> file_path = audio_to_file(audio_tuple)
>>> print(f"Audio saved to: {file_path}")
```

View File

@@ -41,8 +41,7 @@
function updateBars() { function updateBars() {
analyser.getByteFrequencyData(dataArray); analyser.getByteFrequencyData(dataArray);
const bars = document.querySelectorAll('.waveContainer .box');
const bars = document.querySelectorAll('.box');
for (let i = 0; i < bars.length; i++) { for (let i = 0; i < bars.length; i++) {
const barHeight = (dataArray[i] / 255) * 2; // Amplify the effect const barHeight = (dataArray[i] / 255) * 2; // Amplify the effect
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`; bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;

View File

@@ -19,6 +19,7 @@ nav:
- Cookbook: cookbook.md - Cookbook: cookbook.md
- Deployment: deployment.md - Deployment: deployment.md
- Advanced Configuration: advanced-configuration.md - Advanced Configuration: advanced-configuration.md
- Utils: utils.md
- Frequently Asked Questions: faq.md - Frequently Asked Questions: faq.md
markdown_extensions: markdown_extensions:
- pymdownx.highlight: - pymdownx.highlight:

View File

@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "gradio_webrtc" name = "gradio_webrtc"
version = "0.0.15" version = "0.0.16"
description = "Stream images in realtime with webrtc" description = "Stream images in realtime with webrtc"
readme = "README.md" readme = "README.md"
license = "apache-2.0" license = "apache-2.0"
@@ -50,3 +50,6 @@ artifacts = ["/backend/gradio_webrtc/templates", "*.pyi"]
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
packages = ["/backend/gradio_webrtc"] packages = ["/backend/gradio_webrtc"]
[tool.ruff]
target-version = "py310"