mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
Add some utils fns, add moshi to cookbook, fix querySelector, support async functions in ReplyOnPause (#29)
* add * add code
This commit is contained in:
@@ -4,12 +4,14 @@ from .credentials import (
|
|||||||
get_twilio_turn_credentials,
|
get_twilio_turn_credentials,
|
||||||
)
|
)
|
||||||
from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
|
from .reply_on_pause import AlgoOptions, ReplyOnPause, SileroVadOptions
|
||||||
from .utils import AdditionalOutputs
|
from .utils import AdditionalOutputs, audio_to_bytes, audio_to_file
|
||||||
from .webrtc import StreamHandler, WebRTC
|
from .webrtc import StreamHandler, WebRTC
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AlgoOptions",
|
"AlgoOptions",
|
||||||
"AdditionalOutputs",
|
"AdditionalOutputs",
|
||||||
|
"audio_to_bytes",
|
||||||
|
"audio_to_file",
|
||||||
"get_hf_turn_credentials",
|
"get_hf_turn_credentials",
|
||||||
"get_twilio_turn_credentials",
|
"get_twilio_turn_credentials",
|
||||||
"get_turn_credentials",
|
"get_turn_credentials",
|
||||||
|
|||||||
@@ -70,6 +70,10 @@ ReplyFnGenerator = Union[
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
async def iterate(generator: Generator) -> Any:
    """Advance a synchronous generator by one step from a coroutine.

    Parameters
    ----------
    generator : Generator
        The generator (or any iterator) to advance.

    Returns
    -------
    Any
        The next item produced by ``generator``.

    Notes
    -----
    ``next`` is called directly, so the generator step runs inline in the
    awaiting task. If ``generator`` is exhausted, the ``StopIteration``
    raised by ``next`` cannot leave a coroutine and is surfaced by Python
    as ``RuntimeError``.
    """
    next_item = next(generator)
    return next_item
|
||||||
|
|
||||||
|
|
||||||
class ReplyOnPause(StreamHandler):
|
class ReplyOnPause(StreamHandler):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -86,6 +90,7 @@ class ReplyOnPause(StreamHandler):
|
|||||||
self.output_frame_size = output_frame_size
|
self.output_frame_size = output_frame_size
|
||||||
self.model = get_vad_model()
|
self.model = get_vad_model()
|
||||||
self.fn = fn
|
self.fn = fn
|
||||||
|
self.is_async = inspect.isasyncgenfunction(fn)
|
||||||
self.event = Event()
|
self.event = Event()
|
||||||
self.state = AppState()
|
self.state = AppState()
|
||||||
self.generator = None
|
self.generator = None
|
||||||
@@ -172,6 +177,9 @@ class ReplyOnPause(StreamHandler):
|
|||||||
self.channel.send("tick")
|
self.channel.send("tick")
|
||||||
logger.debug("Sent tick")
|
logger.debug("Sent tick")
|
||||||
|
|
||||||
|
async def async_iterate(self, generator) -> Any:
|
||||||
|
return await anext(generator)
|
||||||
|
|
||||||
def emit(self):
|
def emit(self):
|
||||||
if not self.event.is_set():
|
if not self.event.is_set():
|
||||||
return None
|
return None
|
||||||
@@ -190,6 +198,11 @@ class ReplyOnPause(StreamHandler):
|
|||||||
logger.debug("Latest args: %s", self.latest_args)
|
logger.debug("Latest args: %s", self.latest_args)
|
||||||
self.state.responding = True
|
self.state.responding = True
|
||||||
try:
|
try:
|
||||||
return next(self.generator)
|
if self.is_async:
|
||||||
except StopIteration:
|
return asyncio.run_coroutine_threadsafe(
|
||||||
|
self.async_iterate(self.generator), self.loop
|
||||||
|
).result()
|
||||||
|
else:
|
||||||
|
return next(self.generator)
|
||||||
|
except (StopIteration, StopAsyncIteration):
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import fractions
|
import fractions
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import tempfile
|
||||||
from typing import Any, Callable, Protocol, cast
|
from typing import Any, Callable, Protocol, cast
|
||||||
|
|
||||||
import av
|
import av
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -120,3 +123,67 @@ async def player_worker_decode(
|
|||||||
logger.debug("traceback %s", exec)
|
logger.debug("traceback %s", exec)
|
||||||
logger.error("Error processing frame: %s", str(e))
|
logger.error("Error processing frame: %s", str(e))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
def audio_to_bytes(audio: tuple[int, np.ndarray]) -> bytes:
    """
    Convert an audio tuple containing sample rate and numpy array data into bytes.

    The samples are wrapped in a single-channel ``AudioSegment`` and exported
    as MP3.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
            - sample_rate (int): The audio sample rate in Hz
            - data (np.ndarray): The audio data as a numpy array. Integer
              arrays are passed through unchanged; floating-point arrays are
              converted to 16-bit PCM first.

    Returns
    -------
    bytes
        The MP3-encoded audio, suitable for transmission or storage

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> audio_bytes = audio_to_bytes(audio_tuple)
    """
    sample_rate, samples = audio
    if np.issubdtype(samples.dtype, np.floating):
        # The raw buffer is interpreted as integer PCM of width
        # ``dtype.itemsize``; float32/float64 bytes would be misread (or
        # rejected for the 8-byte width). Convert to int16 PCM instead.
        # NOTE(review): assumes float samples are normalized to [-1.0, 1.0];
        # values outside that range are clipped - confirm against callers.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )
    segment.export(audio_buffer, format="mp3")
    return audio_buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def audio_to_file(audio: tuple[int, np.ndarray]) -> str:
    """
    Write an audio tuple (sample rate, numpy samples) to a temporary ``.mp3`` file.

    The audio is first encoded with ``audio_to_bytes`` and the result is
    written to a named temporary file created with ``delete=False``, so the
    file is still on disk after this function returns.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        A tuple containing:
            - sample_rate (int): The audio sample rate in Hz
            - data (np.ndarray): The audio data as a numpy array

    Returns
    -------
    str
        The path to the saved audio file

    Example
    -------
    >>> sample_rate = 44100
    >>> audio_data = np.array([0.1, -0.2, 0.3])  # Example audio samples
    >>> audio_tuple = (sample_rate, audio_data)
    >>> file_path = audio_to_file(audio_tuple)
    >>> print(f"Audio saved to: {file_path}")
    """
    encoded = audio_to_bytes(audio)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        handle.write(encoded)
        saved_path = handle.name
    return saved_path
|
||||||
|
|||||||
@@ -24,6 +24,18 @@
|
|||||||
|
|
||||||
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)
|
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)
|
||||||
|
|
||||||
|
- :speaking_head:{ .lg .middle } __Kyutai Moshi__
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Kyutai's moshi is a novel speech-to-speech model for modeling human conversations.
|
||||||
|
|
||||||
|
<video width=98% src="https://github.com/user-attachments/assets/becc7a13-9e89-4a19-9df2-5fb1467a0137" controls style="text-align: center"></video>
|
||||||
|
|
||||||
|
[:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi)
|
||||||
|
|
||||||
|
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-moshi/blob/main/app.py)
|
||||||
|
|
||||||
- :robot:{ .lg .middle } __Llama Code Editor__
|
- :robot:{ .lg .middle } __Llama Code Editor__
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -22,7 +22,4 @@ pip install gradio_webrtc[vad]
|
|||||||
```
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
|
See the [cookbook](/cookbook)
|
||||||
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
|
|
||||||
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
|
|
||||||
4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
|
|
||||||
54
docs/utils.md
Normal file
54
docs/utils.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Utils
|
||||||
|
|
||||||
|
## `audio_to_bytes`
|
||||||
|
|
||||||
|
Convert an audio tuple containing sample rate and numpy array data into bytes.
|
||||||
|
Useful for sending data to external APIs from a `ReplyOnPause` handler.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
```
|
||||||
|
audio : tuple[int, np.ndarray]
|
||||||
|
A tuple containing:
|
||||||
|
- sample_rate (int): The audio sample rate in Hz
|
||||||
|
- data (np.ndarray): The audio data as a numpy array
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns
|
||||||
|
```
|
||||||
|
bytes
|
||||||
|
The audio data encoded as bytes, suitable for transmission or storage
|
||||||
|
```
|
||||||
|
|
||||||
|
Example
|
||||||
|
```python
|
||||||
|
>>> sample_rate = 44100
|
||||||
|
>>> audio_data = np.array([0.1, -0.2, 0.3]) # Example audio samples
|
||||||
|
>>> audio_tuple = (sample_rate, audio_data)
|
||||||
|
>>> audio_bytes = audio_to_bytes(audio_tuple)
|
||||||
|
```
|
||||||
|
|
||||||
|
## `audio_to_file`
|
||||||
|
|
||||||
|
Save an audio tuple containing sample rate and numpy array data to a file.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
```
|
||||||
|
audio : tuple[int, np.ndarray]
|
||||||
|
A tuple containing:
|
||||||
|
- sample_rate (int): The audio sample rate in Hz
|
||||||
|
- data (np.ndarray): The audio data as a numpy array
|
||||||
|
```
|
||||||
|
Returns
|
||||||
|
```
|
||||||
|
str
|
||||||
|
The path to the saved audio file
|
||||||
|
```
|
||||||
|
Example
|
||||||
|
```python
|
||||||
|
>>> sample_rate = 44100
|
||||||
|
>>> audio_data = np.array([0.1, -0.2, 0.3]) # Example audio samples
|
||||||
|
>>> audio_tuple = (sample_rate, audio_data)
|
||||||
|
>>> file_path = audio_to_file(audio_tuple)
|
||||||
|
>>> print(f"Audio saved to: {file_path}")
|
||||||
|
```
|
||||||
@@ -41,8 +41,7 @@
|
|||||||
|
|
||||||
function updateBars() {
|
function updateBars() {
|
||||||
analyser.getByteFrequencyData(dataArray);
|
analyser.getByteFrequencyData(dataArray);
|
||||||
|
const bars = document.querySelectorAll('.waveContainer .box');
|
||||||
const bars = document.querySelectorAll('.box');
|
|
||||||
for (let i = 0; i < bars.length; i++) {
|
for (let i = 0; i < bars.length; i++) {
|
||||||
const barHeight = (dataArray[i] / 255) * 2; // Amplify the effect
|
const barHeight = (dataArray[i] / 255) * 2; // Amplify the effect
|
||||||
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
|
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ nav:
|
|||||||
- Cookbook: cookbook.md
|
- Cookbook: cookbook.md
|
||||||
- Deployment: deployment.md
|
- Deployment: deployment.md
|
||||||
- Advanced Configuration: advanced-configuration.md
|
- Advanced Configuration: advanced-configuration.md
|
||||||
|
- Utils: utils.md
|
||||||
- Frequently Asked Questions: faq.md
|
- Frequently Asked Questions: faq.md
|
||||||
markdown_extensions:
|
markdown_extensions:
|
||||||
- pymdownx.highlight:
|
- pymdownx.highlight:
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "gradio_webrtc"
|
name = "gradio_webrtc"
|
||||||
version = "0.0.15"
|
version = "0.0.16"
|
||||||
description = "Stream images in realtime with webrtc"
|
description = "Stream images in realtime with webrtc"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = "apache-2.0"
|
license = "apache-2.0"
|
||||||
@@ -50,3 +50,6 @@ artifacts = ["/backend/gradio_webrtc/templates", "*.pyi"]
|
|||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
[tool.hatch.build.targets.wheel]
|
||||||
packages = ["/backend/gradio_webrtc"]
|
packages = ["/backend/gradio_webrtc"]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py310"
|
||||||
Reference in New Issue
Block a user