Additional outputs tweaks + fix track constraints (#28)
* code
* add code
* add code
@@ -167,10 +167,12 @@ class StreamHandler(ABC):
         expected_layout: Literal["mono", "stereo"] = "mono",
         output_sample_rate: int = 24000,
         output_frame_size: int = 960,
+        input_sample_rate: int = 48000,
     ) -> None:
         self.expected_layout = expected_layout
         self.output_sample_rate = output_sample_rate
         self.output_frame_size = output_frame_size
+        self.input_sample_rate = input_sample_rate
         self.latest_args: str | list[Any] = "not_set"
         self._resampler = None
         self._channel: DataChannel | None = None
@@ -191,6 +193,9 @@ class StreamHandler(ABC):
         logger.debug("setting args in audio callback %s", args)
         self.latest_args = ["__webrtc_value__"] + list(args)
 
+    def shutdown(self):
+        pass
+
     @abstractmethod
     def copy(self) -> "StreamHandler":
         pass
@@ -200,17 +205,23 @@ class StreamHandler(ABC):
             self._resampler = av.AudioResampler(  # type: ignore
                 format="s16",
                 layout=self.expected_layout,
-                rate=frame.sample_rate,
+                rate=self.input_sample_rate,
                 frame_size=frame.samples,
             )
         yield from self._resampler.resample(frame)
 
     @abstractmethod
-    def receive(self, frame: tuple[int, np.ndarray] | np.ndarray) -> None:
+    def receive(self, frame: tuple[int, np.ndarray]) -> None:
         pass
 
     @abstractmethod
-    def emit(self) -> None:
+    def emit(
+        self,
+    ) -> (
+        tuple[int, np.ndarray]
+        | AdditionalOutputs
+        | tuple[tuple[int, np.ndarray], AdditionalOutputs]
+    ):
         pass
 
 
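The three hunks above change the `StreamHandler` contract: incoming audio is resampled to a configurable `input_sample_rate` instead of the track's native rate, `receive` always gets a `(sample_rate, samples)` tuple, and `emit` may now return `AdditionalOutputs` alongside (or instead of) audio. A minimal sketch of a handler written against the new contract (illustrative only; `EchoHandler` is not part of this commit, and it assumes `StreamHandler` and `AdditionalOutputs` are importable from the package root):

```python
# Hypothetical handler against the new StreamHandler contract (sketch only).
import numpy as np
from gradio_webrtc import AdditionalOutputs, StreamHandler


class EchoHandler(StreamHandler):
    def __init__(self) -> None:
        # input_sample_rate is the constructor argument added above.
        super().__init__(input_sample_rate=48000)
        self.last_frame: tuple[int, np.ndarray] | None = None

    def copy(self) -> "EchoHandler":
        return EchoHandler()

    def receive(self, frame: tuple[int, np.ndarray]) -> None:
        # Frames now always arrive as (sample_rate, samples), resampled
        # to input_sample_rate rather than the track's native rate.
        self.last_frame = frame

    def emit(self):
        # emit() may return audio, AdditionalOutputs, or both together.
        if self.last_frame is None:
            silence = np.zeros((1, self.output_frame_size), dtype=np.int16)
            return (self.output_sample_rate, silence)
        return (self.last_frame, AdditionalOutputs("echoed one frame"))
```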
@@ -313,6 +324,9 @@ class AudioCallback(AudioStreamTrack):
         self.thread_quit.set()
         super().stop()
 
+    def shutdown(self):
+        self.event_handler.shutdown()
+
 
 class ServerToClientVideo(VideoStreamTrack):
     """
@@ -489,7 +503,7 @@ class WebRTC(Component):
|
|||||||
str, VideoCallback | ServerToClientVideo | ServerToClientAudio | AudioCallback
|
str, VideoCallback | ServerToClientVideo | ServerToClientAudio | AudioCallback
|
||||||
] = {}
|
] = {}
|
||||||
data_channels: dict[str, DataChannel] = {}
|
data_channels: dict[str, DataChannel] = {}
|
||||||
additional_outputs: dict[str, AdditionalOutputs] = {}
|
additional_outputs: dict[str, list[AdditionalOutputs]] = {}
|
||||||
|
|
||||||
EVENTS = ["tick", "state_change"]
|
EVENTS = ["tick", "state_change"]
|
||||||
|
|
||||||
@@ -517,6 +531,7 @@ class WebRTC(Component):
         time_limit: float | None = None,
         mode: Literal["send-receive", "receive", "send"] = "send-receive",
         modality: Literal["video", "audio"] = "video",
+        rtp_params: dict[str, Any] | None = None,
     ):
         """
         Parameters:
@@ -538,15 +553,12 @@ class WebRTC(Component):
             render: if False, component will not render be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later.
             key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
             mirror_webcam: if True webcam will be mirrored. Default is True.
-            include_audio: whether the component should record/retain the audio track for a video. By default, audio is excluded for webcam videos and included for uploaded videos.
-            autoplay: whether to automatically play the video when the component is used as an output. Note: browsers will not autoplay video files if the user has not interacted with the page yet.
-            show_share_button: if True, will show a share icon in the corner of the component that allows user to share outputs to Hugging Face Spaces Discussions. If False, icon does not appear. If set to None (default behavior), then the icon appears if this Gradio app is launched on Spaces, but not otherwise.
-            show_download_button: if True, will show a download icon in the corner of the component that allows user to download the output. If False, icon does not appear. By default, it will be True for output components and False for input components.
-            min_length: the minimum length of video (in seconds) that the user can pass into the prediction function. If None, there is no minimum length.
-            max_length: the maximum length of video (in seconds) that the user can pass into the prediction function. If None, there is no maximum length.
-            loop: if True, the video will loop when it reaches the end and continue playing from the beginning.
-            streaming: when used set as an output, takes video chunks yielded from the backend and combines them into one streaming video output. Each chunk should be a video file with a .ts extension using an h.264 encoding. Mp4 files are also accepted but they will be converted to h.264 encoding.
-            watermark: an image file to be included as a watermark on the video. The image is not scaled and is displayed on the bottom right of the video. Valid formats for the image are: jpeg, png.
+            rtc_configuration: WebRTC configuration options. See https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection/RTCPeerConnection . If running the demo on a remote server, you will need to specify a rtc_configuration. See https://freddyaboulton.github.io/gradio-webrtc/deployment/
+            track_constraints: Media track constraints for WebRTC. For example, to set video height, width use {"width": {"exact": 800}, "height": {"exact": 600}, "aspectRatio": {"exact": 1.33333}}
+            time_limit: Maximum duration in seconds for recording.
+            mode: WebRTC mode - "send-receive", "receive", or "send".
+            modality: Type of media - "video" or "audio".
+            rtp_params: See https://developer.mozilla.org/en-US/docs/Web/API/RTCRtpSender/setParameters. If you are changing the video resolution, you can set this to {"degradationPreference": "maintain-framerate"} to keep the frame rate consistent.
         """
         self.time_limit = time_limit
         self.height = height
@@ -556,6 +568,7 @@ class WebRTC(Component):
         self.rtc_configuration = rtc_configuration
         self.mode = mode
         self.modality = modality
+        self.rtp_params = rtp_params or {}
         if track_constraints is None and modality == "audio":
             track_constraints = {
                 "echoCancellation": True,
@@ -595,7 +608,9 @@ class WebRTC(Component):
|
|||||||
self, webrtc_id: str
|
self, webrtc_id: str
|
||||||
) -> Callable[[AdditionalOutputs], None]:
|
) -> Callable[[AdditionalOutputs], None]:
|
||||||
def set_outputs(outputs: AdditionalOutputs):
|
def set_outputs(outputs: AdditionalOutputs):
|
||||||
self.additional_outputs[webrtc_id] = outputs
|
if webrtc_id not in self.additional_outputs:
|
||||||
|
self.additional_outputs[webrtc_id] = []
|
||||||
|
self.additional_outputs[webrtc_id].append(outputs)
|
||||||
|
|
||||||
return set_outputs
|
return set_outputs
|
||||||
|
|
||||||
@@ -638,8 +653,12 @@ class WebRTC(Component):
|
|||||||
inputs = list(inputs)
|
inputs = list(inputs)
|
||||||
|
|
||||||
def handler(webrtc_id: str, *args):
|
def handler(webrtc_id: str, *args):
|
||||||
if webrtc_id in self.additional_outputs:
|
if (
|
||||||
return fn(*args, *self.additional_outputs[webrtc_id].args) # type: ignore
|
webrtc_id in self.additional_outputs
|
||||||
|
and len(self.additional_outputs[webrtc_id]) > 0
|
||||||
|
):
|
||||||
|
next_outputs = self.additional_outputs[webrtc_id].pop(0)
|
||||||
|
return fn(*args, *next_outputs.args) # type: ignore
|
||||||
return (
|
return (
|
||||||
tuple([None for _ in range(len(outputs))])
|
tuple([None for _ in range(len(outputs))])
|
||||||
if isinstance(outputs, Iterable)
|
if isinstance(outputs, Iterable)
|
||||||
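This hunk, together with the `set_outputs` change above, turns additional outputs from last-write-wins into a per-connection FIFO queue, so a burst of outputs is no longer collapsed into whichever arrived last. A self-contained toy model of the new semantics (illustrative names only, not library code):

```python
# Toy model of the per-connection AdditionalOutputs queue introduced above.
from collections import defaultdict
from typing import Any

additional_outputs: dict[str, list[Any]] = defaultdict(list)


def set_outputs(webrtc_id: str, outputs: Any) -> None:
    # Before this commit the body was `additional_outputs[webrtc_id] = outputs`,
    # so only the most recent output survived until the next poll.
    additional_outputs[webrtc_id].append(outputs)


def drain_one(webrtc_id: str) -> Any | None:
    queue = additional_outputs.get(webrtc_id)
    return queue.pop(0) if queue else None


set_outputs("conn-1", "first")
set_outputs("conn-1", "second")
assert drain_one("conn-1") == "first"  # FIFO: earlier outputs are not dropped
assert drain_one("conn-1") == "second"
assert drain_one("conn-1") is None
```

The `trigger_mode="multiple"` added in the next hunk presumably pairs with this queue, letting each queued output fire its own event rather than being throttled to one.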
@@ -655,6 +674,7 @@ class WebRTC(Component):
             concurrency_id=concurrency_id,
             show_progress=show_progress,
             queue=queue,
+            trigger_mode="multiple",
         )
 
     def stream(
@@ -748,6 +768,8 @@ class WebRTC(Component):
 
     def clean_up(self, webrtc_id: str):
         connection = self.connections.pop(webrtc_id, None)
+        if isinstance(connection, AudioCallback):
+            connection.event_handler.shutdown()
         self.additional_outputs.pop(webrtc_id, None)
         self.data_channels.pop(webrtc_id, None)
         return connection
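With the hunks above, teardown now forms a chain: `WebRTC.clean_up` forwards to `AudioCallback.shutdown`, which calls the user handler's new `shutdown` hook. A runnable toy model of that call path (stub classes mirroring the diff, not the real implementations):

```python
# Toy model of the teardown chain added in this commit (stubs only).
class Handler:
    """Stands in for a user-defined StreamHandler."""

    def shutdown(self) -> None:
        print("handler resources released")  # close files, stop threads, etc.


class AudioCallback:
    def __init__(self, event_handler: Handler) -> None:
        self.event_handler = event_handler


connections: dict[str, AudioCallback] = {"conn-1": AudioCallback(Handler())}


def clean_up(webrtc_id: str) -> AudioCallback | None:
    connection = connections.pop(webrtc_id, None)
    if isinstance(connection, AudioCallback):
        # New in this commit: audio handlers get a chance to release
        # resources when the peer connection is torn down.
        connection.event_handler.shutdown()
    return connection


clean_up("conn-1")  # prints: handler resources released
```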
@@ -1,37 +1,21 @@
-import logging
 import tempfile
 
 import gradio as gr
 import numpy as np
-from dotenv import load_dotenv
 from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC
 from openai import OpenAI
 from pydub import AudioSegment
+from dotenv import load_dotenv
 
 load_dotenv()
 
-# Configure the root logger to WARNING to suppress debug messages from other libraries
-logging.basicConfig(level=logging.WARNING)
-
-# Create a console handler
-console_handler = logging.StreamHandler()
-console_handler.setLevel(logging.DEBUG)
-
-# Create a formatter
-formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
-console_handler.setFormatter(formatter)
-
-# Configure the logger for your specific library
-logger = logging.getLogger("gradio_webrtc")
-logger.setLevel(logging.DEBUG)
-logger.addHandler(console_handler)
-
 
 client = OpenAI()
 
 
 def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
+    print("audio", audio)
     segment = AudioSegment(
         audio[1].tobytes(),
         frame_rate=audio[0],
@@ -39,12 +23,14 @@ def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
         channels=1,
     )
 
+    transcript.append({"role": "user", "content": gr.Audio((audio[0], audio[1].squeeze()))})
+
     with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
         segment.export(temp_audio.name, format="mp3")
         next_chunk = client.audio.transcriptions.create(
             model="whisper-1", file=open(temp_audio.name, "rb")
         ).text
-        transcript.append({"role": "user", "content": next_chunk})
+        transcript.append({"role": "assistant", "content": next_chunk})
     yield AdditionalOutputs(transcript)
 
 
@@ -49,17 +49,14 @@ else:
 
 
 def detection(frame, conf_threshold=0.3):
+    print("frame.shape", frame.shape)
     frame = cv2.flip(frame, 0)
-    global count
-    if random.random() > 0.98:
-        return AdditionalOutputs(count)
-    count += 1
-
+    return AdditionalOutputs(1)
 
 
 css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
 .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
 
 
 with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
@@ -78,7 +75,13 @@ with gr.Blocks(css=css) as demo:
         with gr.Column(elem_classes=["my-column"]):
             with gr.Group(elem_classes=["my-group"]):
                 image = WebRTC(
-                    label="Stream", rtc_configuration=rtc_configuration, mode="send"
+                    label="Stream", rtc_configuration=rtc_configuration,
+                    mode="send",
+                    track_constraints={"width": {"exact": 800},
+                                       "height": {"exact": 600},
+                                       "aspectRatio": {"exact": 1.33333}
+                                       },
+                    rtp_params={"degradationPreference": "maintain-resolution"}
                 )
                 conf_threshold = gr.Slider(
                     label="Confidence Threshold",
@@ -92,6 +95,6 @@ with gr.Blocks(css=css) as demo:
     image.stream(
         fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
     )
-    image.change(lambda n: n, outputs=[number])
+    image.on_additional_outputs(lambda n: n, outputs=number)
 
 demo.launch()
@@ -6,8 +6,8 @@ For example, you can control the size of the frames captured from the webcam lik
 
 ```python
 track_constraints = {
-    "width": {"ideal": 500},
-    "height": {"ideal": 500},
+    "width": {"exact": 500},
+    "height": {"exact": 500},
     "frameRate": {"ideal": 30},
 }
 webrtc = WebRTC(track_constraints=track_constraints,
@@ -16,6 +16,22 @@ webrtc = WebRTC(track_constraints=track_constraints,
 ```
 
 
+!!! warning
+
+    WebRTC may not enforce your constraints. For example, it may rescale your video
+    in order to maintain the desired (or reach a better) frame rate. If you
+    really want to enforce height and width constraints, use the `rtp_params` parameter and set `"degradationPreference": "maintain-resolution"`:
+
+    ```python
+    image = WebRTC(
+        label="Stream",
+        mode="send",
+        track_constraints=track_constraints,
+        rtp_params={"degradationPreference": "maintain-resolution"}
+    )
+    ```
+
+
 ## The RTC Configuration
 
 You can configure how the connection is created on the client by passing an `rtc_configuration` parameter to the `WebRTC` component constructor.
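Background on the `ideal` to `exact` switch in the docs hunk above (standard MediaTrackConstraints semantics, not specific to this library): `exact` makes a constraint mandatory, so `getUserMedia` rejects if the device cannot satisfy it, while `ideal` is a best-effort preference. A sketch mixing the two:

```python
# Hard vs. soft track constraints (per the MediaTrackConstraints spec):
# "exact" is mandatory -- getUserMedia fails if the camera cannot comply;
# "ideal" is a preference -- the browser gets as close as it can.
track_constraints = {
    "width": {"exact": 500},     # must be exactly 500 px wide
    "height": {"exact": 500},    # must be exactly 500 px tall
    "frameRate": {"ideal": 30},  # prefer 30 fps, accept what is available
}
```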
@@ -32,6 +32,7 @@
 	export let time_limit: number | null = null;
 	export let modality: "video" | "audio" = "video";
 	export let mode: "send-receive" | "receive" | "send" = "send-receive";
+	export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
 	export let track_constraints: MediaTrackConstraints = {};
 
 	const on_change_cb = (msg: "change" | "tick") => {
@@ -98,6 +99,8 @@
 		{rtc_configuration}
 		{time_limit}
 		{mode}
+		{track_constraints}
+		{rtp_params}
 		{on_change_cb}
 		on:clear={() => gradio.dispatch("clear")}
 		on:play={() => gradio.dispatch("play")}
@@ -125,6 +128,7 @@
 		{time_limit}
 		{track_constraints}
 		{mode}
+		{rtp_params}
 		i18n={gradio.i18n}
 		on:tick={() => gradio.dispatch("tick")}
 		on:error={({ detail }) => gradio.dispatch("error", detail)}
@@ -31,6 +31,7 @@
 	export let i18n: I18nFormatter;
 	export let time_limit: number | null = null;
 	export let track_constraints: MediaTrackConstraints = {};
+	export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
 	export let on_change_cb: (mg: "tick" | "change") => void;
 
 	let options_open = false;
@@ -143,7 +144,7 @@
 	}
 	if (stream == null) return;
 
-	start(stream, pc, mode === "send" ? null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb).then((connection) => {
+	start(stream, pc, mode === "send" ? null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb, rtp_params).then((connection) => {
 		pc = connection;
 	}).catch(() => {
 		console.info("catching")
@@ -23,6 +23,7 @@
 	export let track_constraints: MediaTrackConstraints = {};
 	export let mode: "send" | "send-receive";
 	export let on_change_cb: (msg: "change" | "tick") => void;
+	export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
 
 	const dispatch = createEventDispatcher<{
 		change: FileData | null;
@@ -53,6 +54,7 @@
 	{time_limit}
 	{track_constraints}
 	{mode}
+	{rtp_params}
 	{on_change_cb}
 	on:error
 	on:start_recording
@@ -27,8 +27,7 @@
 	export let on_change_cb: (msg: "tick" | "change") => void;
 	export let mode: "send-receive" | "send";
 	const _webrtc_id = Math.random().toString(36).substring(2);
-	console.log("mode", mode);
-
+	export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
 
 	export const modify_stream: (state: "open" | "closed" | "waiting") => void = (
 		state: "open" | "closed" | "waiting"
@@ -82,7 +81,7 @@
 
 	async function access_webcam(): Promise<void> {
 		try {
-			get_video_stream(include_audio, video_source)
+			get_video_stream(include_audio, video_source, null, track_constraints)
 				.then(async (local_stream) => {
 					webcam_accessed = true;
 					available_video_devices = await get_devices();
@@ -144,7 +143,7 @@
 		)
 		stream_state = "waiting"
 		webrtc_id = Math.random().toString(36).substring(2);
-		start(stream, pc, mode === "send" ? null: video_source, server.offer, webrtc_id, "video", on_change_cb).then((connection) => {
+		start(stream, pc, mode === "send" ? null: video_source, server.offer, webrtc_id, "video", on_change_cb, rtp_params).then((connection) => {
 			pc = connection;
 		}).catch(() => {
 			console.info("catching")
@@ -52,6 +52,7 @@ export async function start(
 	webrtc_id,
 	modality: "video" | "audio" = "video",
 	on_change_cb: (msg: "change" | "tick") => void = () => {},
+	rtp_params = {},
 ) {
 	pc = createPeerConnection(pc, node);
 	const data_channel = pc.createDataChannel("text");
@@ -70,9 +71,13 @@ export async function start(
 	};
 
 	if (stream) {
-		stream.getTracks().forEach((track) => {
+		stream.getTracks().forEach(async (track) => {
 			console.debug("Track stream callback", track);
-			pc.addTrack(track, stream);
+			const sender = pc.addTrack(track, stream);
+			const params = sender.getParameters();
+			const updated_params = { ...params, ...rtp_params };
+			await sender.setParameters(updated_params)
+			console.debug("sender params", sender.getParameters());
 		});
 	} else {
 		console.debug("Creating transceiver!");
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "gradio_webrtc"
-version = "0.0.14"
+version = "0.0.15"
 description = "Stream images in realtime with webrtc"
 readme = "README.md"
 license = "apache-2.0"