mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
code
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -11,4 +11,6 @@ __tmp/*
|
|||||||
node_modules
|
node_modules
|
||||||
backend/**/templates/
|
backend/**/templates/
|
||||||
demo/MobileNetSSD_deploy.caffemodel
|
demo/MobileNetSSD_deploy.caffemodel
|
||||||
demo/MobileNetSSD_deploy.prototxt.txt
|
demo/MobileNetSSD_deploy.prototxt.txt
|
||||||
|
.DS_Store
|
||||||
|
test/
|
||||||
69
README.md
69
README.md
@@ -1,14 +1,3 @@
|
|||||||
---
|
|
||||||
tags: [gradio-custom-component, Video, streaming, webrtc, realtime]
|
|
||||||
title: gradio_webrtc
|
|
||||||
short_description: Stream images in realtime with webrtc
|
|
||||||
colorFrom: blue
|
|
||||||
colorTo: yellow
|
|
||||||
sdk: gradio
|
|
||||||
pinned: false
|
|
||||||
app_file: space.py
|
|
||||||
---
|
|
||||||
|
|
||||||
<h1 style='text-align: center; margin-bottom: 1rem'> Gradio WebRTC ⚡️ </h1>
|
<h1 style='text-align: center; margin-bottom: 1rem'> Gradio WebRTC ⚡️ </h1>
|
||||||
|
|
||||||
<div style="display: flex; flex-direction: row; justify-content: center">
|
<div style="display: flex; flex-direction: row; justify-content: center">
|
||||||
@@ -30,15 +19,15 @@ pip install gradio_webrtc
|
|||||||
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
|
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
|
||||||
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
|
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
|
||||||
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
|
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
|
||||||
|
4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
The WebRTC component supports the following three use cases:
|
The WebRTC component supports the following four use cases:
|
||||||
1. Streaming video from the user webcam to the server and back
|
1. [Streaming video from the user webcam to the server and back](#h-streaming-video-from-the-user-webcam-to-the-server-and-back)
|
||||||
2. Streaming Video from the server to the client
|
2. [Streaming Video from the server to the client](#h-streaming-video-from-the-server-to-the-client)
|
||||||
3. Streaming Audio from the server to the client
|
3. [Streaming Audio from the server to the client](#h-streaming-audio-from-the-server-to-the-client)
|
||||||
|
4. [Streaming Audio from the client to the server and back (conversational AI)](#h-conversational-ai)
|
||||||
Streaming Audio from client to the server and back (conversational AI) is not supported yet.
|
|
||||||
|
|
||||||
|
|
||||||
## Streaming Video from the User Webcam to the Server and Back
|
## Streaming Video from the User Webcam to the Server and Back
|
||||||
@@ -78,7 +67,7 @@ as a **numpy array** and returns the processed frame also as a **numpy array**.
|
|||||||
* The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.
|
* The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.
|
||||||
* The `time_limit` parameter is the maximum time in seconds the video stream will run. If the time limit is reached, the video stream will stop.
|
* The `time_limit` parameter is the maximum time in seconds the video stream will run. If the time limit is reached, the video stream will stop.
|
||||||
|
|
||||||
## Streaming Video from the User Webcam to the Server and Back
|
## Streaming Video from the server to the client
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
@@ -143,6 +132,52 @@ with gr.Blocks() as demo:
|
|||||||
* The numpy array should be of shape (1, num_samples).
|
* The numpy array should be of shape (1, num_samples).
|
||||||
* The `outputs` parameter should be a list with the WebRTC component as the only element.
|
* The `outputs` parameter should be a list with the WebRTC component as the only element.
|
||||||
|
|
||||||
|
## Conversational AI
|
||||||
|
|
||||||
|
```python
|
||||||
|
import gradio as gr
|
||||||
|
import numpy as np
|
||||||
|
from gradio_webrtc import WebRTC, StreamHandler
|
||||||
|
from queue import Queue
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
class EchoHandler(StreamHandler):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.queue = Queue()
|
||||||
|
|
||||||
|
def receive(self, frame: tuple[int, np.ndarray] | np.ndarray) -> None:
|
||||||
|
self.queue.put(frame)
|
||||||
|
|
||||||
|
def emit(self) -> None:
|
||||||
|
return self.queue.get()
|
||||||
|
|
||||||
|
|
||||||
|
with gr.Blocks() as demo:
|
||||||
|
with gr.Column():
|
||||||
|
with gr.Group():
|
||||||
|
audio = WebRTC(
|
||||||
|
label="Stream",
|
||||||
|
rtc_configuration=None,
|
||||||
|
mode="send-receive",
|
||||||
|
modality="audio",
|
||||||
|
)
|
||||||
|
|
||||||
|
audio.stream(fn=EchoHandler(), inputs=[audio], outputs=[audio], time_limit=15)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
demo.launch()
|
||||||
|
```
|
||||||
|
|
||||||
|
* Instead of passing a function to the `stream` event's `fn` parameter, pass a `StreamHandler` implementation. The `StreamHandler` above simply echoes the audio back to the client.
|
||||||
|
* The `StreamHandler` class has two methods: `receive` and `emit`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client.
|
||||||
|
* An audio frame is represented as a tuple of (frame_rate, audio_samples) where `audio_samples` is a numpy array of shape (num_channels, num_samples).
|
||||||
|
* You can also specify the audio layout ("mono" or "stereo") in the `emit` method by returning it as the third element of the tuple. If not specified, the default is "mono".
|
||||||
|
* The `time_limit` parameter is the maximum time in seconds the conversation will run. If the time limit is reached, the audio stream will stop.
|
||||||
|
|
||||||
|
|
||||||
## Deployment
|
## Deployment
|
||||||
|
|
||||||
When deploying in a cloud environment (like Hugging Face Spaces, EC2, etc), you need to set up a TURN server to relay the WebRTC traffic.
|
When deploying in a cloud environment (like Hugging Face Spaces, EC2, etc), you need to set up a TURN server to relay the WebRTC traffic.
|
||||||
|
|||||||
@@ -58,5 +58,5 @@ def player_worker_decode(
|
|||||||
frame.pts = audio_samples
|
frame.pts = audio_samples
|
||||||
frame.time_base = audio_time_base
|
frame.time_base = audio_time_base
|
||||||
audio_samples += frame.samples
|
audio_samples += frame.samples
|
||||||
|
|
||||||
asyncio.run_coroutine_threadsafe(queue.put(frame), loop)
|
asyncio.run_coroutine_threadsafe(queue.put(frame), loop)
|
||||||
|
logger.debug("Queue size utils.py: %s", queue.qsize())
|
||||||
|
|||||||
@@ -99,9 +99,9 @@ class VideoCallback(VideoStreamTrack):
|
|||||||
|
|
||||||
return new_frame
|
return new_frame
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(e)
|
logger.debug("exception %s", e)
|
||||||
exec = traceback.format_exc()
|
exec = traceback.format_exc()
|
||||||
logger.debug(exec)
|
logger.debug("traceback %s", exec)
|
||||||
|
|
||||||
|
|
||||||
class StreamHandler(ABC):
|
class StreamHandler(ABC):
|
||||||
@@ -161,20 +161,19 @@ class AudioCallback(AudioStreamTrack):
|
|||||||
frame = cast(AudioFrame, await self.track.recv())
|
frame = cast(AudioFrame, await self.track.recv())
|
||||||
for frame in self.event_handler.resample(frame):
|
for frame in self.event_handler.resample(frame):
|
||||||
numpy_array = frame.to_ndarray()
|
numpy_array = frame.to_ndarray()
|
||||||
logger.debug("numpy array shape %s", numpy_array.shape)
|
|
||||||
await anyio.to_thread.run_sync(
|
await anyio.to_thread.run_sync(
|
||||||
self.event_handler.receive, (frame.sample_rate, numpy_array)
|
self.event_handler.receive, (frame.sample_rate, numpy_array)
|
||||||
)
|
)
|
||||||
except MediaStreamError as e:
|
except MediaStreamError:
|
||||||
print("MediaStreamError", e)
|
logger.debug("MediaStreamError in process_input_frames")
|
||||||
break
|
break
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
if not self.has_started:
|
if not self.has_started:
|
||||||
asyncio.create_task(self.process_input_frames())
|
asyncio.create_task(self.process_input_frames())
|
||||||
self.__thread = threading.Thread(
|
self.__thread = threading.Thread(
|
||||||
name="audio-output-decoders",
|
|
||||||
target=player_worker_decode,
|
target=player_worker_decode,
|
||||||
|
daemon=False,
|
||||||
args=(
|
args=(
|
||||||
asyncio.get_event_loop(),
|
asyncio.get_event_loop(),
|
||||||
self.event_handler.emit,
|
self.event_handler.emit,
|
||||||
@@ -214,11 +213,12 @@ class AudioCallback(AudioStreamTrack):
|
|||||||
self.last_timestamp = time.time()
|
self.last_timestamp = time.time()
|
||||||
return frame
|
return frame
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(e)
|
logger.debug("exception %s", e)
|
||||||
exec = traceback.format_exc()
|
exec = traceback.format_exc()
|
||||||
logger.debug(exec)
|
logger.debug("traceback %s", exec)
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
|
logger.debug("audio callback stop")
|
||||||
self.thread_quit.set()
|
self.thread_quit.set()
|
||||||
if self.__thread is not None:
|
if self.__thread is not None:
|
||||||
self.__thread.join()
|
self.__thread.join()
|
||||||
@@ -266,9 +266,9 @@ class ServerToClientVideo(VideoStreamTrack):
|
|||||||
next_frame.time_base = time_base
|
next_frame.time_base = time_base
|
||||||
return next_frame
|
return next_frame
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(e)
|
logger.debug("exception %s", e)
|
||||||
exec = traceback.format_exc()
|
exec = traceback.format_exc()
|
||||||
logger.debug(exec)
|
logger.debug("traceback %s ", exec)
|
||||||
|
|
||||||
|
|
||||||
class ServerToClientAudio(AudioStreamTrack):
|
class ServerToClientAudio(AudioStreamTrack):
|
||||||
@@ -298,13 +298,14 @@ class ServerToClientAudio(AudioStreamTrack):
|
|||||||
frame = next(self.generator)
|
frame = next(self.generator)
|
||||||
return frame
|
return frame
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
self.thread_quit.set()
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
if self.__thread is None:
|
if self.__thread is None:
|
||||||
self.__thread = threading.Thread(
|
self.__thread = threading.Thread(
|
||||||
name="generator-runner",
|
name="generator-runner",
|
||||||
target=player_worker_decode,
|
target=player_worker_decode,
|
||||||
|
daemon=True,
|
||||||
args=(
|
args=(
|
||||||
asyncio.get_event_loop(),
|
asyncio.get_event_loop(),
|
||||||
self.next,
|
self.next,
|
||||||
@@ -338,9 +339,9 @@ class ServerToClientAudio(AudioStreamTrack):
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(e)
|
logger.debug("exception %s", e)
|
||||||
exec = traceback.format_exc()
|
exec = traceback.format_exc()
|
||||||
logger.debug(exec)
|
logger.debug("traceback %s", exec)
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
self.thread_quit.set()
|
self.thread_quit.set()
|
||||||
@@ -606,9 +607,12 @@ class WebRTC(Component):
|
|||||||
|
|
||||||
@pc.on("connectionstatechange")
|
@pc.on("connectionstatechange")
|
||||||
async def on_connectionstatechange():
|
async def on_connectionstatechange():
|
||||||
|
logger.debug("pc.connectionState %s", pc.connectionState)
|
||||||
if pc.connectionState in ["failed", "closed"]:
|
if pc.connectionState in ["failed", "closed"]:
|
||||||
await pc.close()
|
await pc.close()
|
||||||
self.connections.pop(body["webrtc_id"], None)
|
connection = self.connections.pop(body["webrtc_id"], None)
|
||||||
|
if connection:
|
||||||
|
connection.stop()
|
||||||
self.pcs.discard(pc)
|
self.pcs.discard(pc)
|
||||||
if pc.connectionState == "connected":
|
if pc.connectionState == "connected":
|
||||||
if self.time_limit is not None:
|
if self.time_limit is not None:
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ pip install gradio_webrtc
|
|||||||
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
|
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
|
||||||
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
|
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
|
||||||
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
|
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
|
||||||
4. [Conversational AI]()
|
4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@
|
|||||||
let stream_state: "open" | "closed" | "waiting" = "closed";
|
let stream_state: "open" | "closed" | "waiting" = "closed";
|
||||||
let audio_player: HTMLAudioElement;
|
let audio_player: HTMLAudioElement;
|
||||||
let pc: RTCPeerConnection;
|
let pc: RTCPeerConnection;
|
||||||
let _webrtc_id = Math.random().toString(36).substring(2);
|
let _webrtc_id = null;
|
||||||
|
|
||||||
|
|
||||||
const dispatch = createEventDispatcher<{
|
const dispatch = createEventDispatcher<{
|
||||||
@@ -63,6 +63,7 @@
|
|||||||
_time_limit = null;
|
_time_limit = null;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
_webrtc_id = Math.random().toString(36).substring(2);
|
||||||
value = _webrtc_id;
|
value = _webrtc_id;
|
||||||
pc = new RTCPeerConnection(rtc_configuration);
|
pc = new RTCPeerConnection(rtc_configuration);
|
||||||
pc.addEventListener("connectionstatechange",
|
pc.addEventListener("connectionstatechange",
|
||||||
|
|||||||
@@ -47,6 +47,7 @@
|
|||||||
async function start_stream(value: string): Promise<string> {
|
async function start_stream(value: string): Promise<string> {
|
||||||
if( value === "start_webrtc_stream") {
|
if( value === "start_webrtc_stream") {
|
||||||
stream_state = "waiting";
|
stream_state = "waiting";
|
||||||
|
_webrtc_id = Math.random().toString(36).substring(2)
|
||||||
value = _webrtc_id;
|
value = _webrtc_id;
|
||||||
console.log("set value to ", value);
|
console.log("set value to ", value);
|
||||||
pc = new RTCPeerConnection(rtc_configuration);
|
pc = new RTCPeerConnection(rtc_configuration);
|
||||||
|
|||||||
@@ -40,6 +40,7 @@
|
|||||||
)
|
)
|
||||||
|
|
||||||
$: if( value === "start_webrtc_stream") {
|
$: if( value === "start_webrtc_stream") {
|
||||||
|
_webrtc_id = Math.random().toString(36).substring(2);
|
||||||
value = _webrtc_id;
|
value = _webrtc_id;
|
||||||
pc = new RTCPeerConnection(rtc_configuration);
|
pc = new RTCPeerConnection(rtc_configuration);
|
||||||
pc.addEventListener("connectionstatechange",
|
pc.addEventListener("connectionstatechange",
|
||||||
|
|||||||
@@ -138,7 +138,7 @@
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
stream_state = "waiting"
|
stream_state = "waiting"
|
||||||
webrtc_id = _webrtc_id;
|
webrtc_id = Math.random().toString(36).substring(2);
|
||||||
start(stream, pc, video_source, server.offer, webrtc_id).then((connection) => {
|
start(stream, pc, video_source, server.offer, webrtc_id).then((connection) => {
|
||||||
pc = connection;
|
pc = connection;
|
||||||
}).catch(() => {
|
}).catch(() => {
|
||||||
|
|||||||
@@ -134,6 +134,7 @@ export function stop(pc: RTCPeerConnection) {
|
|||||||
// close local audio / video
|
// close local audio / video
|
||||||
if (pc.getSenders()) {
|
if (pc.getSenders()) {
|
||||||
pc.getSenders().forEach((sender) => {
|
pc.getSenders().forEach((sender) => {
|
||||||
|
console.log("sender", sender);
|
||||||
if (sender.track && sender.track.stop) sender.track.stop();
|
if (sender.track && sender.track.stop) sender.track.stop();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "gradio_webrtc"
|
name = "gradio_webrtc"
|
||||||
version = "0.0.5"
|
version = "0.0.6a2"
|
||||||
description = "Stream images in realtime with webrtc"
|
description = "Stream images in realtime with webrtc"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = "apache-2.0"
|
license = "apache-2.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user