diff --git a/backend/gradio_webrtc/webrtc.py b/backend/gradio_webrtc/webrtc.py index b685a35..ba47140 100644 --- a/backend/gradio_webrtc/webrtc.py +++ b/backend/gradio_webrtc/webrtc.py @@ -167,10 +167,12 @@ class StreamHandler(ABC): expected_layout: Literal["mono", "stereo"] = "mono", output_sample_rate: int = 24000, output_frame_size: int = 960, + input_sample_rate: int = 48000, ) -> None: self.expected_layout = expected_layout self.output_sample_rate = output_sample_rate self.output_frame_size = output_frame_size + self.input_sample_rate = input_sample_rate self.latest_args: str | list[Any] = "not_set" self._resampler = None self._channel: DataChannel | None = None @@ -191,6 +193,9 @@ class StreamHandler(ABC): logger.debug("setting args in audio callback %s", args) self.latest_args = ["__webrtc_value__"] + list(args) + def shutdown(self): + pass + @abstractmethod def copy(self) -> "StreamHandler": pass @@ -200,17 +205,23 @@ class StreamHandler(ABC): self._resampler = av.AudioResampler( # type: ignore format="s16", layout=self.expected_layout, - rate=frame.sample_rate, + rate=self.input_sample_rate, frame_size=frame.samples, ) yield from self._resampler.resample(frame) @abstractmethod - def receive(self, frame: tuple[int, np.ndarray] | np.ndarray) -> None: + def receive(self, frame: tuple[int, np.ndarray]) -> None: pass @abstractmethod - def emit(self) -> None: + def emit( + self, + ) -> ( + tuple[int, np.ndarray] + | AdditionalOutputs + | tuple[tuple[int, np.ndarray], AdditionalOutputs] + ): pass @@ -313,6 +324,9 @@ class AudioCallback(AudioStreamTrack): self.thread_quit.set() super().stop() + def shutdown(self): + self.event_handler.shutdown() + class ServerToClientVideo(VideoStreamTrack): """ @@ -489,7 +503,7 @@ class WebRTC(Component): str, VideoCallback | ServerToClientVideo | ServerToClientAudio | AudioCallback ] = {} data_channels: dict[str, DataChannel] = {} - additional_outputs: dict[str, AdditionalOutputs] = {} + additional_outputs: dict[str, 
list[AdditionalOutputs]] = {} EVENTS = ["tick", "state_change"] @@ -517,6 +531,7 @@ class WebRTC(Component): time_limit: float | None = None, mode: Literal["send-receive", "receive", "send"] = "send-receive", modality: Literal["video", "audio"] = "video", + rtp_params: dict[str, Any] | None = None, ): """ Parameters: @@ -538,15 +553,12 @@ class WebRTC(Component): render: if False, component will not render be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later. key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved. mirror_webcam: if True webcam will be mirrored. Default is True. - include_audio: whether the component should record/retain the audio track for a video. By default, audio is excluded for webcam videos and included for uploaded videos. - autoplay: whether to automatically play the video when the component is used as an output. Note: browsers will not autoplay video files if the user has not interacted with the page yet. - show_share_button: if True, will show a share icon in the corner of the component that allows user to share outputs to Hugging Face Spaces Discussions. If False, icon does not appear. If set to None (default behavior), then the icon appears if this Gradio app is launched on Spaces, but not otherwise. - show_download_button: if True, will show a download icon in the corner of the component that allows user to download the output. If False, icon does not appear. By default, it will be True for output components and False for input components. - min_length: the minimum length of video (in seconds) that the user can pass into the prediction function. If None, there is no minimum length. - max_length: the maximum length of video (in seconds) that the user can pass into the prediction function. If None, there is no maximum length. 
- loop: if True, the video will loop when it reaches the end and continue playing from the beginning. - streaming: when used set as an output, takes video chunks yielded from the backend and combines them into one streaming video output. Each chunk should be a video file with a .ts extension using an h.264 encoding. Mp4 files are also accepted but they will be converted to h.264 encoding. - watermark: an image file to be included as a watermark on the video. The image is not scaled and is displayed on the bottom right of the video. Valid formats for the image are: jpeg, png. + rtc_configuration: WebRTC configuration options. See https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection/RTCPeerConnection . If running the demo on a remote server, you will need to specify a rtc_configuration. See https://freddyaboulton.github.io/gradio-webrtc/deployment/ + track_constraints: Media track constraints for WebRTC. For example, to set video height, width use {"width": {"exact": 800}, "height": {"exact": 600}, "aspectRatio": {"exact": 1.33333}} + time_limit: Maximum duration in seconds for recording. + mode: WebRTC mode - "send-receive", "receive", or "send". + modality: Type of media - "video" or "audio". + rtp_params: See https://developer.mozilla.org/en-US/docs/Web/API/RTCRtpSender/setParameters. If you are changing the video resolution, you can set this to {"degradationPreference": "maintain-framerate"} to keep the frame rate consistent. 
""" self.time_limit = time_limit self.height = height @@ -556,6 +568,7 @@ class WebRTC(Component): self.rtc_configuration = rtc_configuration self.mode = mode self.modality = modality + self.rtp_params = rtp_params or {} if track_constraints is None and modality == "audio": track_constraints = { "echoCancellation": True, @@ -595,7 +608,9 @@ class WebRTC(Component): self, webrtc_id: str ) -> Callable[[AdditionalOutputs], None]: def set_outputs(outputs: AdditionalOutputs): - self.additional_outputs[webrtc_id] = outputs + if webrtc_id not in self.additional_outputs: + self.additional_outputs[webrtc_id] = [] + self.additional_outputs[webrtc_id].append(outputs) return set_outputs @@ -638,8 +653,12 @@ class WebRTC(Component): inputs = list(inputs) def handler(webrtc_id: str, *args): - if webrtc_id in self.additional_outputs: - return fn(*args, *self.additional_outputs[webrtc_id].args) # type: ignore + if ( + webrtc_id in self.additional_outputs + and len(self.additional_outputs[webrtc_id]) > 0 + ): + next_outputs = self.additional_outputs[webrtc_id].pop(0) + return fn(*args, *next_outputs.args) # type: ignore return ( tuple([None for _ in range(len(outputs))]) if isinstance(outputs, Iterable) @@ -655,6 +674,7 @@ class WebRTC(Component): concurrency_id=concurrency_id, show_progress=show_progress, queue=queue, + trigger_mode="multiple", ) def stream( @@ -748,6 +768,8 @@ class WebRTC(Component): def clean_up(self, webrtc_id: str): connection = self.connections.pop(webrtc_id, None) + if isinstance(connection, AudioCallback): + connection.event_handler.shutdown() self.additional_outputs.pop(webrtc_id, None) self.data_channels.pop(webrtc_id, None) return connection diff --git a/demo/stream_whisper.py b/demo/stream_whisper.py index a92ccd6..0da2614 100644 --- a/demo/stream_whisper.py +++ b/demo/stream_whisper.py @@ -1,37 +1,21 @@ -import logging import tempfile import gradio as gr import numpy as np -from dotenv import load_dotenv from gradio_webrtc import AdditionalOutputs, 
ReplyOnPause, WebRTC from openai import OpenAI from pydub import AudioSegment +from dotenv import load_dotenv + load_dotenv() -# Configure the root logger to WARNING to suppress debug messages from other libraries -logging.basicConfig(level=logging.WARNING) - -# Create a console handler -console_handler = logging.StreamHandler() -console_handler.setLevel(logging.DEBUG) - -# Create a formatter -formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s") -console_handler.setFormatter(formatter) - -# Configure the logger for your specific library -logger = logging.getLogger("gradio_webrtc") -logger.setLevel(logging.DEBUG) -logger.addHandler(console_handler) - - client = OpenAI() def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]): + print("audio", audio) segment = AudioSegment( audio[1].tobytes(), frame_rate=audio[0], @@ -39,12 +23,14 @@ def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]): channels=1, ) + transcript.append({"role": "user", "content": gr.Audio((audio[0], audio[1].squeeze()))}) + with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio: segment.export(temp_audio.name, format="mp3") next_chunk = client.audio.transcriptions.create( model="whisper-1", file=open(temp_audio.name, "rb") ).text - transcript.append({"role": "user", "content": next_chunk}) + transcript.append({"role": "assistant", "content": next_chunk}) yield AdditionalOutputs(transcript) diff --git a/demo/video_send_output.py b/demo/video_send_output.py index 5f583c9..3ac5c9b 100644 --- a/demo/video_send_output.py +++ b/demo/video_send_output.py @@ -49,17 +49,14 @@ else: def detection(frame, conf_threshold=0.3): + print("frame.shape", frame.shape) frame = cv2.flip(frame, 0) - global count - if random.random() > 0.98: - return AdditionalOutputs(count) - count += 1 + return AdditionalOutputs(1) css = """.my-group {max-width: 600px !important; max-height: 600 !important;} .my-column {display: flex !important; justify-content: center !important; 
align-items: center !important};""" - with gr.Blocks(css=css) as demo: gr.HTML( """ @@ -78,7 +75,13 @@ with gr.Blocks(css=css) as demo: with gr.Column(elem_classes=["my-column"]): with gr.Group(elem_classes=["my-group"]): image = WebRTC( - label="Stream", rtc_configuration=rtc_configuration, mode="send" + label="Stream", rtc_configuration=rtc_configuration, + mode="send", + track_constraints={"width": {"exact": 800}, + "height": {"exact": 600}, + "aspectRatio": {"exact": 1.33333} + }, + rtp_params={"degradationPreference": "maintain-resolution"} ) conf_threshold = gr.Slider( label="Confidence Threshold", @@ -92,6 +95,6 @@ with gr.Blocks(css=css) as demo: image.stream( fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10 ) - image.change(lambda n: n, outputs=[number]) + image.on_additional_outputs(lambda n: n, outputs=number) demo.launch() diff --git a/docs/advanced-configuration.md b/docs/advanced-configuration.md index d021b3e..392a2d5 100644 --- a/docs/advanced-configuration.md +++ b/docs/advanced-configuration.md @@ -6,8 +6,8 @@ For example, you can control the size of the frames captured from the webcam lik ```python track_constraints = { - "width": {"ideal": 500}, - "height": {"ideal": 500}, + "width": {"exact": 500}, + "height": {"exact": 500}, "frameRate": {"ideal": 30}, } webrtc = WebRTC(track_constraints=track_constraints, @@ -16,6 +16,22 @@ webrtc = WebRTC(track_constraints=track_constraints, ``` +!!! warning + + WebRTC may not enforce your constraints. For example, it may rescale your video + (while keeping the same resolution) in order to maintain the desired (or reach a better) frame rate. If you + really want to enforce height, width and resolution constraints, use the `rtp_params` parameter and set `"degradationPreference": "maintain-resolution"`. 
+ + ```python + image = WebRTC( + label="Stream", + mode="send", + track_constraints=track_constraints, + rtp_params={"degradationPreference": "maintain-resolution"} + ) + ``` + + ## The RTC Configuration You can configure how the connection is created on the client by passing an `rtc_configuration` parameter to the `WebRTC` component constructor. diff --git a/frontend/Index.svelte b/frontend/Index.svelte index 0bf4069..50499e0 100644 --- a/frontend/Index.svelte +++ b/frontend/Index.svelte @@ -32,6 +32,7 @@ export let time_limit: number | null = null; export let modality: "video" | "audio" = "video"; export let mode: "send-receive" | "receive" | "send" = "send-receive"; + export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters; export let track_constraints: MediaTrackConstraints = {}; const on_change_cb = (msg: "change" | "tick") => { @@ -98,6 +99,8 @@ {rtc_configuration} {time_limit} {mode} + {track_constraints} + {rtp_params} {on_change_cb} on:clear={() => gradio.dispatch("clear")} on:play={() => gradio.dispatch("play")} @@ -125,6 +128,7 @@ {time_limit} {track_constraints} {mode} + {rtp_params} i18n={gradio.i18n} on:tick={() => gradio.dispatch("tick")} on:error={({ detail }) => gradio.dispatch("error", detail)} diff --git a/frontend/shared/InteractiveAudio.svelte b/frontend/shared/InteractiveAudio.svelte index aec2899..85762e5 100644 --- a/frontend/shared/InteractiveAudio.svelte +++ b/frontend/shared/InteractiveAudio.svelte @@ -31,6 +31,7 @@ export let i18n: I18nFormatter; export let time_limit: number | null = null; export let track_constraints: MediaTrackConstraints = {}; + export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters; export let on_change_cb: (mg: "tick" | "change") => void; let options_open = false; @@ -143,7 +144,7 @@ } if (stream == null) return; - start(stream, pc, mode === "send" ? null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb).then((connection) => { + start(stream, pc, mode === "send" ? 
null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb, rtp_params).then((connection) => { pc = connection; }).catch(() => { console.info("catching") diff --git a/frontend/shared/InteractiveVideo.svelte b/frontend/shared/InteractiveVideo.svelte index 512c721..45f9012 100644 --- a/frontend/shared/InteractiveVideo.svelte +++ b/frontend/shared/InteractiveVideo.svelte @@ -23,6 +23,7 @@ export let track_constraints: MediaTrackConstraints = {}; export let mode: "send" | "send-receive"; export let on_change_cb: (msg: "change" | "tick") => void; + export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters; const dispatch = createEventDispatcher<{ change: FileData | null; @@ -53,6 +54,7 @@ {time_limit} {track_constraints} {mode} + {rtp_params} {on_change_cb} on:error on:start_recording diff --git a/frontend/shared/Webcam.svelte b/frontend/shared/Webcam.svelte index 6af0f65..858300a 100644 --- a/frontend/shared/Webcam.svelte +++ b/frontend/shared/Webcam.svelte @@ -27,8 +27,7 @@ export let on_change_cb: (msg: "tick" | "change") => void; export let mode: "send-receive" | "send"; const _webrtc_id = Math.random().toString(36).substring(2); - - console.log("mode", mode); + export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters; export const modify_stream: (state: "open" | "closed" | "waiting") => void = ( state: "open" | "closed" | "waiting" @@ -82,7 +81,7 @@ async function access_webcam(): Promise { try { - get_video_stream(include_audio, video_source) + get_video_stream(include_audio, video_source, null, track_constraints) .then(async (local_stream) => { webcam_accessed = true; available_video_devices = await get_devices(); @@ -144,7 +143,7 @@ ) stream_state = "waiting" webrtc_id = Math.random().toString(36).substring(2); - start(stream, pc, mode === "send" ? null: video_source, server.offer, webrtc_id, "video", on_change_cb).then((connection) => { + start(stream, pc, mode === "send" ? 
null: video_source, server.offer, webrtc_id, "video", on_change_cb, rtp_params).then((connection) => { pc = connection; }).catch(() => { console.info("catching") diff --git a/frontend/shared/webrtc_utils.ts b/frontend/shared/webrtc_utils.ts index 9cdd438..8151dc9 100644 --- a/frontend/shared/webrtc_utils.ts +++ b/frontend/shared/webrtc_utils.ts @@ -52,6 +52,7 @@ export async function start( webrtc_id, modality: "video" | "audio" = "video", on_change_cb: (msg: "change" | "tick") => void = () => {}, + rtp_params = {}, ) { pc = createPeerConnection(pc, node); const data_channel = pc.createDataChannel("text"); @@ -70,9 +71,13 @@ export async function start( }; if (stream) { - stream.getTracks().forEach((track) => { + stream.getTracks().forEach(async (track) => { console.debug("Track stream callback", track); - pc.addTrack(track, stream); + const sender = pc.addTrack(track, stream); + const params = sender.getParameters(); + const updated_params = { ...params, ...rtp_params }; + await sender.setParameters(updated_params) + console.debug("sender params", sender.getParameters()); }); } else { console.debug("Creating transceiver!"); diff --git a/pyproject.toml b/pyproject.toml index 0448b7c..cc31015 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "hatchling.build" [project] name = "gradio_webrtc" -version = "0.0.14" +version = "0.0.15" description = "Stream images in realtime with webrtc" readme = "README.md" license = "apache-2.0"