diff --git a/backend/gradio_webrtc/webrtc.py b/backend/gradio_webrtc/webrtc.py
index 093efe9..a722ca3 100644
--- a/backend/gradio_webrtc/webrtc.py
+++ b/backend/gradio_webrtc/webrtc.py
@@ -707,6 +707,7 @@ class WebRTC(Component):
icon: str | None = None,
icon_button_color: str | None = None,
pulse_color: str | None = None,
+ button_labels: dict | None = None,
):
"""
Parameters:
@@ -737,6 +738,7 @@ class WebRTC(Component):
icon: Icon to display on the button instead of the wave animation. The icon should be a path/url to a .svg/.png/.jpeg file.
icon_button_color: Color of the icon button. Default is var(--color-accent) of the demo theme.
pulse_color: Color of the pulse animation. Default is var(--color-accent) of the demo theme.
+    button_labels: Text to display on the start, stop, and waiting buttons of the audio or video stream. A dict with keys "start", "stop", and "waiting", each mapping to the label shown on the corresponding button.
"""
self.time_limit = time_limit
self.height = height
@@ -749,6 +751,12 @@ class WebRTC(Component):
self.icon_button_color = icon_button_color
self.pulse_color = pulse_color
self.rtp_params = rtp_params or {}
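+        # Empty strings fall back to the frontend's default i18n labels for each button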
+ self.button_labels = {
+ "start": "",
+ "stop": "",
+ "waiting": "",
+ **(button_labels or {}),
+ }
if track_constraints is None and modality == "audio":
track_constraints = {
"echoCancellation": True,
diff --git a/docs/advanced-configuration.md b/docs/advanced-configuration.md
index 87ee406..4dae09b 100644
--- a/docs/advanced-configuration.md
+++ b/docs/advanced-configuration.md
@@ -140,4 +140,21 @@ You can control the button color and pulse color with `icon_button_color` and `p
pulse_color="black",
)
```
-
\ No newline at end of file
+
+## Changing the Button Text
+
+You can supply a `button_labels` dictionary to change the text displayed on the `Start`, `Stop`, and `Waiting` buttons in the UI.
+The keys must be `"start"`, `"stop"`, and `"waiting"`.
+
+``` python
+webrtc = WebRTC(
+ label="Video Chat",
+ modality="audio-video",
+ mode="send-receive",
+ button_labels={"start": "Start Talking to Gemini"}
+)
+```
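+
+Any keys you omit (here `"stop"` and `"waiting"`) fall back to the component's default labels.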
+
diff --git a/docs/cookbook.md b/docs/cookbook.md
index 57bd2c8..0cb7910 100644
--- a/docs/cookbook.md
+++ b/docs/cookbook.md
@@ -1,16 +1,16 @@
-- :speaking_head:{ .lg .middle } __OpenAI Real Time Voice API__
+- :speaking_head:{ .lg .middle }:eyes:{ .lg .middle } __Gemini Audio Video Chat__
---
- Talk to ChatGPT in real time using OpenAI's voice API.
+    Stream **both** your webcam video and audio feeds to Google Gemini. You can also upload images to augment your conversation!
-
+
- [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/openai-realtime-voice)
+ [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/gemini-audio-video-chat)
- [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/openai-realtime-voice/blob/main/app.py)
+ [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/gemini-audio-video-chat/blob/main/app.py)
- :speaking_head:{ .lg .middle } __Google Gemini Real Time Voice API__
@@ -24,6 +24,18 @@
[:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/gemini-voice/blob/main/app.py)
+- :speaking_head:{ .lg .middle } __OpenAI Real Time Voice API__
+
+ ---
+
+ Talk to ChatGPT in real time using OpenAI's voice API.
+
+
+
+ [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/openai-realtime-voice)
+
+ [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/openai-realtime-voice/blob/main/app.py)
+
- :speaking_head:{ .lg .middle } __Hello Llama: Stop Word Detection__
---
diff --git a/docs/user-guide.md b/docs/user-guide.md
index cb1ecb7..e596c2e 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -419,6 +419,34 @@ Set up a server-to-client stream to stream video from an arbitrary user interact
2. Set `mode="receive"` to only receive audio from the server.
3. The `trigger` parameter is the Gradio event that will start the stream, in this case the button's click event (see the sketch below).
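+
+As a hedged sketch of that wiring: the `generation` generator, the slider, and the exact `inputs` list here are illustrative assumptions, not prescribed by the component.
+
+``` python
+import gradio as gr
+from gradio_webrtc import WebRTC
+
+def generation(num_steps):
+    ...  # placeholder: yield audio frames produced on the server
+
+with gr.Blocks() as demo:
+    audio = WebRTC(label="Stream", mode="receive", modality="audio")
+    num_steps = gr.Slider(1, 10, value=5, label="Steps")
+    button = gr.Button("Generate")
+    audio.stream(
+        fn=generation,
+        inputs=[num_steps],
+        outputs=[audio],
+        trigger=button.click,  # the gradio event that starts the stream
+    )
+```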
+## Audio-Video Streaming
+
+You can stream audio and video simultaneously to/from a server using `AudioVideoStreamHandler` or `AsyncAudioVideoStreamHandler`.
+They are identical to the audio `StreamHandler`s, with the addition of `video_receive` and `video_emit` methods, which accept and return a `numpy` array, respectively.
+
+Here is an example of the video handling functions for connecting to the Gemini multimodal API. In this case, we simply reflect the webcam feed back to the user, but once per second we send the latest webcam frame (and an additional image component) to the Gemini server.
+
+Please see the "Gemini Audio Video Chat" example in the [cookbook](/cookbook) for the complete code.
+
+``` python title="Async Gemini Video Handling"
+
+async def video_receive(self, frame: np.ndarray):
+ """Send video frames to the server"""
+ if self.session:
+ # send image every 1 second
+ # otherwise we flood the API
+ if time.time() - self.last_frame_time > 1:
+ self.last_frame_time = time.time()
+ await self.session.send(encode_image(frame))
+ if self.latest_args[2] is not None:
+ await self.session.send(encode_image(self.latest_args[2]))
+ self.video_queue.put_nowait(frame)
+
+async def video_emit(self) -> VideoEmitType:
+ """Return video frames to the client"""
+ return await self.video_queue.get()
+```
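+
+For orientation, here is a minimal skeleton showing where those methods sit inside a handler subclass. The audio-side `receive`/`emit` bodies and the `copy` method are placeholders based on the audio `StreamHandler` interface, not the demo's actual implementation; see the linked demo for the real code.
+
+``` python title="Handler skeleton (sketch)"
+from gradio_webrtc import AsyncAudioVideoStreamHandler
+
+class GeminiHandler(AsyncAudioVideoStreamHandler):
+    # Audio side: same contract as the audio-only async handler
+    async def receive(self, frame):
+        ...  # forward microphone audio to the model session
+
+    async def emit(self):
+        ...  # return the next audio chunk from the model
+
+    # Video side: the two methods shown above
+    async def video_receive(self, frame):
+        ...
+
+    async def video_emit(self):
+        ...
+
+    def copy(self):
+        # a fresh handler instance is created per connection
+        return GeminiHandler()
+```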
+
## Additional Outputs
diff --git a/frontend/Index.svelte b/frontend/Index.svelte
index 2ebfd69..8c996e5 100644
--- a/frontend/Index.svelte
+++ b/frontend/Index.svelte
@@ -13,6 +13,7 @@
export let elem_classes: string[] = [];
export let visible = true;
export let value: string = "__webrtc_value__";
+ export let button_labels: {start: string, stop: string, waiting: string};
export let label: string;
export let root: string;
@@ -116,6 +117,7 @@
{icon}
{icon_button_color}
{pulse_color}
+ {button_labels}
on:clear={() => gradio.dispatch("clear")}
on:play={() => gradio.dispatch("play")}
on:pause={() => gradio.dispatch("pause")}
@@ -147,6 +149,7 @@
{icon}
{icon_button_color}
{pulse_color}
+ {button_labels}
on:tick={() => gradio.dispatch("tick")}
on:error={({ detail }) => gradio.dispatch("error", detail)}
on:warning={({ detail }) => gradio.dispatch("warning", detail)}
diff --git a/frontend/shared/InteractiveAudio.svelte b/frontend/shared/InteractiveAudio.svelte
index 3638451..5350595 100644
--- a/frontend/shared/InteractiveAudio.svelte
+++ b/frontend/shared/InteractiveAudio.svelte
@@ -34,6 +34,7 @@
export let icon: string | undefined = undefined;
export let icon_button_color: string = "var(--color-accent)";
export let pulse_color: string = "var(--color-accent)";
+ export let button_labels: {start: string, stop: string, waiting: string};
let stopword_recognized = false;
@@ -253,25 +254,25 @@
aria-label={"start stream"}
>
{#if stream_state === "waiting"}
-
+
- {i18n("audio.waiting")}
+ {button_labels.waiting || i18n("audio.waiting")}
{:else if stream_state === "open"}
- {i18n("audio.stop")}
+ {button_labels.stop || i18n("audio.stop")}
{:else}
- {i18n("audio.record")}
+ {button_labels.start || i18n("audio.record")}
{/if}
@@ -377,11 +378,12 @@
}
.icon-with-text {
- width: var(--size-20);
+ min-width: var(--size-16);
align-items: center;
margin: 0 var(--spacing-xl);
display: flex;
- justify-content: space-evenly;
+ justify-content: space-evenly;
+ gap: var(--size-2);
}
@media (--screen-md) {
diff --git a/frontend/shared/InteractiveVideo.svelte b/frontend/shared/InteractiveVideo.svelte
index 594a85f..bf07ece 100644
--- a/frontend/shared/InteractiveVideo.svelte
+++ b/frontend/shared/InteractiveVideo.svelte
@@ -17,6 +17,7 @@
export let handle_reset_value: () => void = () => {};
export let stream_handler: Client["stream"];
export let time_limit: number | null = null;
+ export let button_labels: {start: string, stop: string, waiting: string};
export let server: {
offer: (body: any) => Promise<any>;
};
@@ -63,6 +64,7 @@
{icon}
{icon_button_color}
{pulse_color}
+ {button_labels}
on:error
on:start_recording
on:stop_recording
diff --git a/frontend/shared/Webcam.svelte b/frontend/shared/Webcam.svelte
index 52f8ce2..5809826 100644
--- a/frontend/shared/Webcam.svelte
+++ b/frontend/shared/Webcam.svelte
@@ -33,6 +33,7 @@
export let icon: string | undefined | ComponentType = undefined;
export let icon_button_color: string = "var(--color-accent)";
export let pulse_color: string = "var(--color-accent)";
+ export let button_labels: {start: string, stop: string, waiting: string};
export const modify_stream: (state: "open" | "closed" | "waiting") => void = (
state: "open" | "closed" | "waiting"
@@ -162,12 +163,6 @@
await access_webcam();
}
}
-
- // window.setInterval(() => {
- // if (stream_state == "open") {
- // dispatch("tick");
- // }
- // }, stream_every * 1000);
let options_open = false;
@@ -238,25 +233,25 @@
aria-label={"start stream"}
>
{#if stream_state === "waiting"}
-
+
- {i18n("audio.waiting")}
+ {button_labels.waiting || i18n("audio.waiting")}
{:else if stream_state === "open"}
- {i18n("audio.stop")}
+ {button_labels.stop || i18n("audio.stop")}
{:else}
- {i18n("audio.record")}
+ {button_labels.start || i18n("audio.record")}
{/if}
@@ -334,11 +329,13 @@
}
.icon-with-text {
- width: var(--size-20);
+ min-width: var(--size-16);
align-items: center;
margin: 0 var(--spacing-xl);
display: flex;
- justify-content: space-evenly;
+ justify-content: space-evenly;
+ /* Add gap between icon and text */
+ gap: var(--size-2);
}
.audio-indicator {
diff --git a/pyproject.toml b/pyproject.toml
index 045af4d..207a178 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
[project]
name = "gradio_webrtc"
-version = "0.0.28"
+version = "0.0.29"
description = "Stream images in realtime with webrtc"
readme = "README.md"
license = "apache-2.0"