diff --git a/.gitignore b/.gitignore
index b919c25..44ced21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,6 @@ __tmp/*
node_modules
backend/**/templates/
demo/MobileNetSSD_deploy.caffemodel
-demo/MobileNetSSD_deploy.prototxt.txt
\ No newline at end of file
+demo/MobileNetSSD_deploy.prototxt.txt
+.DS_Store
+test/
\ No newline at end of file
diff --git a/README.md b/README.md
index 4669e19..f3dea4c 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,3 @@
----
-tags: [gradio-custom-component, Video, streaming, webrtc, realtime]
-title: gradio_webrtc
-short_description: Stream images in realtime with webrtc
-colorFrom: blue
-colorTo: yellow
-sdk: gradio
-pinned: false
-app_file: space.py
----
-
Gradio WebRTC ⚡️
@@ -30,15 +19,15 @@ pip install gradio_webrtc
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
+4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
## Usage
The WebRTC component supports the following three use cases:
-1. Streaming video from the user webcam to the server and back
-2. Streaming Video from the server to the client
-3. Streaming Audio from the server to the client
-
-Streaming Audio from client to the server and back (conversational AI) is not supported yet.
+1. [Streaming video from the user webcam to the server and back](#h-streaming-video-from-the-user-webcam-to-the-server-and-back)
+2. [Streaming Video from the server to the client](#h-streaming-video-from-the-server-to-the-client)
+3. [Streaming Audio from the server to the client](#h-streaming-audio-from-the-server-to-the-client)
+4. [Streaming Audio from the client to the server and back (conversational AI)](#h-conversational-ai)
## Streaming Video from the User Webcam to the Server and Back
@@ -78,7 +67,7 @@ as a **numpy array** and returns the processed frame also as a **numpy array**.
* The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.
* The `time_limit` parameter is the maximum time in seconds the video stream will run. If the time limit is reached, the video stream will stop.
-## Streaming Video from the User Webcam to the Server and Back
+## Streaming Video from the server to the client
```python
import gradio as gr
@@ -143,6 +132,52 @@ with gr.Blocks() as demo:
* The numpy array should be of shape (1, num_samples).
* The `outputs` parameter should be a list with the WebRTC component as the only element.
+## Conversational AI
+
+```python
+import gradio as gr
+import numpy as np
+from gradio_webrtc import WebRTC, StreamHandler
+from queue import Queue
+import time
+
+
+class EchoHandler(StreamHandler):
+ def __init__(self) -> None:
+ super().__init__()
+ self.queue = Queue()
+
+ def receive(self, frame: tuple[int, np.ndarray] | np.ndarray) -> None:
+ self.queue.put(frame)
+
+ def emit(self) -> None:
+ return self.queue.get()
+
+
+with gr.Blocks() as demo:
+ with gr.Column():
+ with gr.Group():
+ audio = WebRTC(
+ label="Stream",
+ rtc_configuration=None,
+ mode="send-receive",
+ modality="audio",
+ )
+
+ audio.stream(fn=EchoHandler(), inputs=[audio], outputs=[audio], time_limit=15)
+
+
+if __name__ == "__main__":
+ demo.launch()
+```
+
+* Instead of passing a function to the `stream` event's `fn` parameter, pass a `StreamHandler` implementation. The `StreamHandler` above simply echoes the audio back to the client.
+* The `StreamHandler` class has two methods: `receive` and `emit`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client.
+* An audio frame is represented as a tuple of (frame_rate, audio_samples) where `audio_samples` is a numpy array of shape (num_channels, num_samples).
+* You can also specify the audio layout ("mono" or "stereo") in the emit method by retuning it as the third element of the tuple. If not specified, the default is "mono".
+* The `time_limit` parameter is the maximum time in seconds the conversation will run. If the time limit is reached, the audio stream will stop.
+
+
## Deployment
When deploying in a cloud environment (like Hugging Face Spaces, EC2, etc), you need to set up a TURN server to relay the WebRTC traffic.
diff --git a/backend/gradio_webrtc/utils.py b/backend/gradio_webrtc/utils.py
index 1f9cb55..f3579ca 100644
--- a/backend/gradio_webrtc/utils.py
+++ b/backend/gradio_webrtc/utils.py
@@ -58,5 +58,5 @@ def player_worker_decode(
frame.pts = audio_samples
frame.time_base = audio_time_base
audio_samples += frame.samples
-
asyncio.run_coroutine_threadsafe(queue.put(frame), loop)
+ logger.debug("Queue size utils.py: %s", queue.qsize())
diff --git a/backend/gradio_webrtc/webrtc.py b/backend/gradio_webrtc/webrtc.py
index 92faa92..3d9a9cc 100644
--- a/backend/gradio_webrtc/webrtc.py
+++ b/backend/gradio_webrtc/webrtc.py
@@ -99,9 +99,9 @@ class VideoCallback(VideoStreamTrack):
return new_frame
except Exception as e:
- logger.debug(e)
+ logger.debug("exception %s", e)
exec = traceback.format_exc()
- logger.debug(exec)
+ logger.debug("traceback %s", exec)
class StreamHandler(ABC):
@@ -161,20 +161,19 @@ class AudioCallback(AudioStreamTrack):
frame = cast(AudioFrame, await self.track.recv())
for frame in self.event_handler.resample(frame):
numpy_array = frame.to_ndarray()
- logger.debug("numpy array shape %s", numpy_array.shape)
await anyio.to_thread.run_sync(
self.event_handler.receive, (frame.sample_rate, numpy_array)
)
- except MediaStreamError as e:
- print("MediaStreamError", e)
- break
+ except MediaStreamError:
+ logger.debug("MediaStreamError in process_input_frames")
+ break
def start(self):
if not self.has_started:
asyncio.create_task(self.process_input_frames())
self.__thread = threading.Thread(
- name="audio-output-decoders",
target=player_worker_decode,
+ daemon=False,
args=(
asyncio.get_event_loop(),
self.event_handler.emit,
@@ -214,11 +213,12 @@ class AudioCallback(AudioStreamTrack):
self.last_timestamp = time.time()
return frame
except Exception as e:
- logger.debug(e)
+ logger.debug("exception %s", e)
exec = traceback.format_exc()
- logger.debug(exec)
+ logger.debug("traceback %s", exec)
def stop(self):
+ logger.debug("audio callback stop")
self.thread_quit.set()
if self.__thread is not None:
self.__thread.join()
@@ -266,9 +266,9 @@ class ServerToClientVideo(VideoStreamTrack):
next_frame.time_base = time_base
return next_frame
except Exception as e:
- logger.debug(e)
+ logger.debug("exception %s", e)
exec = traceback.format_exc()
- logger.debug(exec)
+ logger.debug("traceback %s ", exec)
class ServerToClientAudio(AudioStreamTrack):
@@ -298,13 +298,14 @@ class ServerToClientAudio(AudioStreamTrack):
frame = next(self.generator)
return frame
except StopIteration:
- pass
+ self.thread_quit.set()
def start(self):
if self.__thread is None:
self.__thread = threading.Thread(
name="generator-runner",
target=player_worker_decode,
+ daemon=True,
args=(
asyncio.get_event_loop(),
self.next,
@@ -338,9 +339,9 @@ class ServerToClientAudio(AudioStreamTrack):
return data
except Exception as e:
- logger.debug(e)
+ logger.debug("exception %s", e)
exec = traceback.format_exc()
- logger.debug(exec)
+ logger.debug("traceback %s", exec)
def stop(self):
self.thread_quit.set()
@@ -606,9 +607,12 @@ class WebRTC(Component):
@pc.on("connectionstatechange")
async def on_connectionstatechange():
+ logger.debug("pc.connectionState %s", pc.connectionState)
if pc.connectionState in ["failed", "closed"]:
await pc.close()
- self.connections.pop(body["webrtc_id"], None)
+ connection = self.connections.pop(body["webrtc_id"], None)
+ if connection:
+ connection.stop()
self.pcs.discard(pc)
if pc.connectionState == "connected":
if self.time_limit is not None:
diff --git a/demo/app.py b/demo/app.py
index 1ff27ba..a34c83f 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -57,7 +57,7 @@ pip install gradio_webrtc
1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
-4. [Conversational AI]()
+4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
## Usage
diff --git a/frontend/shared/InteractiveAudio.svelte b/frontend/shared/InteractiveAudio.svelte
index 5dd3ef3..c825e0a 100644
--- a/frontend/shared/InteractiveAudio.svelte
+++ b/frontend/shared/InteractiveAudio.svelte
@@ -36,7 +36,7 @@
let stream_state: "open" | "closed" | "waiting" = "closed";
let audio_player: HTMLAudioElement;
let pc: RTCPeerConnection;
- let _webrtc_id = Math.random().toString(36).substring(2);
+ let _webrtc_id = null;
const dispatch = createEventDispatcher<{
@@ -63,6 +63,7 @@
_time_limit = null;
return;
}
+ _webrtc_id = Math.random().toString(36).substring(2);
value = _webrtc_id;
pc = new RTCPeerConnection(rtc_configuration);
pc.addEventListener("connectionstatechange",
diff --git a/frontend/shared/StaticAudio.svelte b/frontend/shared/StaticAudio.svelte
index 69b4f88..6ed36cc 100644
--- a/frontend/shared/StaticAudio.svelte
+++ b/frontend/shared/StaticAudio.svelte
@@ -47,6 +47,7 @@
async function start_stream(value: string): Promise {
if( value === "start_webrtc_stream") {
stream_state = "waiting";
+ _webrtc_id = Math.random().toString(36).substring(2)
value = _webrtc_id;
console.log("set value to ", value);
pc = new RTCPeerConnection(rtc_configuration);
diff --git a/frontend/shared/StaticVideo.svelte b/frontend/shared/StaticVideo.svelte
index 2364cba..9b5ce1a 100644
--- a/frontend/shared/StaticVideo.svelte
+++ b/frontend/shared/StaticVideo.svelte
@@ -40,6 +40,7 @@
)
$: if( value === "start_webrtc_stream") {
+ _webrtc_id = Math.random().toString(36).substring(2);
value = _webrtc_id;
pc = new RTCPeerConnection(rtc_configuration);
pc.addEventListener("connectionstatechange",
diff --git a/frontend/shared/Webcam.svelte b/frontend/shared/Webcam.svelte
index 73ec809..9c2931d 100644
--- a/frontend/shared/Webcam.svelte
+++ b/frontend/shared/Webcam.svelte
@@ -138,7 +138,7 @@
}
)
stream_state = "waiting"
- webrtc_id = _webrtc_id;
+ webrtc_id = Math.random().toString(36).substring(2);
start(stream, pc, video_source, server.offer, webrtc_id).then((connection) => {
pc = connection;
}).catch(() => {
diff --git a/frontend/shared/webrtc_utils.ts b/frontend/shared/webrtc_utils.ts
index 6e9d13e..d0168a9 100644
--- a/frontend/shared/webrtc_utils.ts
+++ b/frontend/shared/webrtc_utils.ts
@@ -134,6 +134,7 @@ export function stop(pc: RTCPeerConnection) {
// close local audio / video
if (pc.getSenders()) {
pc.getSenders().forEach((sender) => {
+ console.log("sender", sender);
if (sender.track && sender.track.stop) sender.track.stop();
});
}
diff --git a/pyproject.toml b/pyproject.toml
index e0ad402..b764a23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
[project]
name = "gradio_webrtc"
-version = "0.0.5"
+version = "0.0.6a2"
description = "Stream images in realtime with webrtc"
readme = "README.md"
license = "apache-2.0"