Additional outputs tweaks + fix track constraints (#28)

* code

* add code

* add code
This commit is contained in:
Freddy Boulton
2024-12-03 15:32:43 -05:00
committed by GitHub
parent 65d0ba023f
commit c85c117576
10 changed files with 91 additions and 53 deletions

View File

@@ -167,10 +167,12 @@ class StreamHandler(ABC):
expected_layout: Literal["mono", "stereo"] = "mono",
output_sample_rate: int = 24000,
output_frame_size: int = 960,
input_sample_rate: int = 48000,
) -> None:
self.expected_layout = expected_layout
self.output_sample_rate = output_sample_rate
self.output_frame_size = output_frame_size
self.input_sample_rate = input_sample_rate
self.latest_args: str | list[Any] = "not_set"
self._resampler = None
self._channel: DataChannel | None = None
@@ -191,6 +193,9 @@ class StreamHandler(ABC):
logger.debug("setting args in audio callback %s", args)
self.latest_args = ["__webrtc_value__"] + list(args)
def shutdown(self):
    """Teardown hook; the default implementation is a no-op.

    Subclasses may override this. Elsewhere in this file the hook is reached
    through ``event_handler.shutdown()`` (see ``AudioCallback.shutdown`` and
    ``WebRTC.clean_up``).

    Returns:
        None.
    """
@abstractmethod
def copy(self) -> "StreamHandler":
    """Abstract hook that concrete handlers must override.

    Per the annotation, implementations return a ``StreamHandler``.
    NOTE(review): presumably a fresh handler instance is expected for each
    connection; confirm against the callers of ``copy``.
    """
@@ -200,17 +205,23 @@ class StreamHandler(ABC):
self._resampler = av.AudioResampler( # type: ignore
format="s16",
layout=self.expected_layout,
rate=frame.sample_rate,
rate=self.input_sample_rate,
frame_size=frame.samples,
)
yield from self._resampler.resample(frame)
@abstractmethod
def receive(self, frame: tuple[int, np.ndarray] | np.ndarray) -> None:
def receive(self, frame: tuple[int, np.ndarray]) -> None:
pass
@abstractmethod
def emit(self) -> None:
def emit(
self,
) -> (
tuple[int, np.ndarray]
| AdditionalOutputs
| tuple[tuple[int, np.ndarray], AdditionalOutputs]
):
pass
@@ -313,6 +324,9 @@ class AudioCallback(AudioStreamTrack):
self.thread_quit.set()
super().stop()
def shutdown(self):
    """Forward the shutdown request to the wrapped ``event_handler``."""
    handler = self.event_handler
    handler.shutdown()
class ServerToClientVideo(VideoStreamTrack):
"""
@@ -489,7 +503,7 @@ class WebRTC(Component):
str, VideoCallback | ServerToClientVideo | ServerToClientAudio | AudioCallback
] = {}
data_channels: dict[str, DataChannel] = {}
additional_outputs: dict[str, AdditionalOutputs] = {}
additional_outputs: dict[str, list[AdditionalOutputs]] = {}
EVENTS = ["tick", "state_change"]
@@ -517,6 +531,7 @@ class WebRTC(Component):
time_limit: float | None = None,
mode: Literal["send-receive", "receive", "send"] = "send-receive",
modality: Literal["video", "audio"] = "video",
rtp_params: dict[str, Any] | None = None,
):
"""
Parameters:
@@ -538,15 +553,12 @@ class WebRTC(Component):
render: if False, component will not be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later.
key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
mirror_webcam: if True webcam will be mirrored. Default is True.
include_audio: whether the component should record/retain the audio track for a video. By default, audio is excluded for webcam videos and included for uploaded videos.
autoplay: whether to automatically play the video when the component is used as an output. Note: browsers will not autoplay video files if the user has not interacted with the page yet.
show_share_button: if True, will show a share icon in the corner of the component that allows user to share outputs to Hugging Face Spaces Discussions. If False, icon does not appear. If set to None (default behavior), then the icon appears if this Gradio app is launched on Spaces, but not otherwise.
show_download_button: if True, will show a download icon in the corner of the component that allows user to download the output. If False, icon does not appear. By default, it will be True for output components and False for input components.
min_length: the minimum length of video (in seconds) that the user can pass into the prediction function. If None, there is no minimum length.
max_length: the maximum length of video (in seconds) that the user can pass into the prediction function. If None, there is no maximum length.
loop: if True, the video will loop when it reaches the end and continue playing from the beginning.
streaming: when used set as an output, takes video chunks yielded from the backend and combines them into one streaming video output. Each chunk should be a video file with a .ts extension using an h.264 encoding. Mp4 files are also accepted but they will be converted to h.264 encoding.
watermark: an image file to be included as a watermark on the video. The image is not scaled and is displayed on the bottom right of the video. Valid formats for the image are: jpeg, png.
rtc_configuration: WebRTC configuration options. See https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection/RTCPeerConnection . If running the demo on a remote server, you will need to specify a rtc_configuration. See https://freddyaboulton.github.io/gradio-webrtc/deployment/
track_constraints: Media track constraints for WebRTC. For example, to set video height, width use {"width": {"exact": 800}, "height": {"exact": 600}, "aspectRatio": {"exact": 1.33333}}
time_limit: Maximum duration in seconds for recording.
mode: WebRTC mode - "send-receive", "receive", or "send".
modality: Type of media - "video" or "audio".
rtp_params: See https://developer.mozilla.org/en-US/docs/Web/API/RTCRtpSender/setParameters. If you are changing the video resolution, you can set this to {"degradationPreference": "maintain-framerate"} to keep the frame rate consistent.
"""
self.time_limit = time_limit
self.height = height
@@ -556,6 +568,7 @@ class WebRTC(Component):
self.rtc_configuration = rtc_configuration
self.mode = mode
self.modality = modality
self.rtp_params = rtp_params or {}
if track_constraints is None and modality == "audio":
track_constraints = {
"echoCancellation": True,
@@ -595,7 +608,9 @@ class WebRTC(Component):
self, webrtc_id: str
) -> Callable[[AdditionalOutputs], None]:
def set_outputs(outputs: AdditionalOutputs):
self.additional_outputs[webrtc_id] = outputs
if webrtc_id not in self.additional_outputs:
self.additional_outputs[webrtc_id] = []
self.additional_outputs[webrtc_id].append(outputs)
return set_outputs
@@ -638,8 +653,12 @@ class WebRTC(Component):
inputs = list(inputs)
def handler(webrtc_id: str, *args):
if webrtc_id in self.additional_outputs:
return fn(*args, *self.additional_outputs[webrtc_id].args) # type: ignore
if (
webrtc_id in self.additional_outputs
and len(self.additional_outputs[webrtc_id]) > 0
):
next_outputs = self.additional_outputs[webrtc_id].pop(0)
return fn(*args, *next_outputs.args) # type: ignore
return (
tuple([None for _ in range(len(outputs))])
if isinstance(outputs, Iterable)
@@ -655,6 +674,7 @@ class WebRTC(Component):
concurrency_id=concurrency_id,
show_progress=show_progress,
queue=queue,
trigger_mode="multiple",
)
def stream(
@@ -748,6 +768,8 @@ class WebRTC(Component):
def clean_up(self, webrtc_id: str):
    """Forget everything tracked for ``webrtc_id`` and return its connection.

    The connection is removed from ``self.connections``. When it is an
    ``AudioCallback``, its event handler's ``shutdown()`` hook is invoked.
    The queued additional outputs and the data channel registered under the
    same id are always discarded.

    Parameters:
        webrtc_id: identifier of the connection being torn down.

    Returns:
        The removed connection object, or ``None`` if the id was unknown.

    Raises:
        Whatever the handler's ``shutdown()`` raises; the per-connection
        bookkeeping is still removed before the exception propagates.
    """
    connection = self.connections.pop(webrtc_id, None)
    try:
        if isinstance(connection, AudioCallback):
            connection.event_handler.shutdown()
    finally:
        # shutdown() is a user-overridable hook and may raise. Run the pops
        # in ``finally`` so a failing hook cannot leave stale entries behind
        # for this id.
        self.additional_outputs.pop(webrtc_id, None)
        self.data_channels.pop(webrtc_id, None)
    return connection

View File

@@ -1,37 +1,21 @@
import logging
import tempfile
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC
from openai import OpenAI
from pydub import AudioSegment
from dotenv import load_dotenv
load_dotenv()
# Configure the root logger to WARNING to suppress debug messages from other libraries
logging.basicConfig(level=logging.WARNING)
# Create a console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
# Create a formatter
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
# Configure the logger for your specific library
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)
client = OpenAI()
def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
print("audio", audio)
segment = AudioSegment(
audio[1].tobytes(),
frame_rate=audio[0],
@@ -39,12 +23,14 @@ def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
channels=1,
)
transcript.append({"role": "user", "content": gr.Audio((audio[0], audio[1].squeeze()))})
with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
segment.export(temp_audio.name, format="mp3")
next_chunk = client.audio.transcriptions.create(
model="whisper-1", file=open(temp_audio.name, "rb")
).text
transcript.append({"role": "user", "content": next_chunk})
transcript.append({"role": "assistant", "content": next_chunk})
yield AdditionalOutputs(transcript)

View File

@@ -49,17 +49,14 @@ else:
def detection(frame, conf_threshold=0.3):
print("frame.shape", frame.shape)
frame = cv2.flip(frame, 0)
global count
if random.random() > 0.98:
return AdditionalOutputs(count)
count += 1
return AdditionalOutputs(1)
# Custom styles for the demo layout.
# A non-zero CSS length needs a unit: the previous `max-height: 600` was invalid
# and dropped by browsers, so it now reads `600px`. The stray `;` that followed
# the closing brace of `.my-column` has been moved inside the rule.
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""
with gr.Blocks(css=css) as demo:
gr.HTML(
"""
@@ -78,7 +75,13 @@ with gr.Blocks(css=css) as demo:
with gr.Column(elem_classes=["my-column"]):
with gr.Group(elem_classes=["my-group"]):
image = WebRTC(
label="Stream", rtc_configuration=rtc_configuration, mode="send"
label="Stream", rtc_configuration=rtc_configuration,
mode="send",
track_constraints={"width": {"exact": 800},
"height": {"exact": 600},
"aspectRatio": {"exact": 1.33333}
},
rtp_params={"degradationPreference": "maintain-resolution"}
)
conf_threshold = gr.Slider(
label="Confidence Threshold",
@@ -92,6 +95,6 @@ with gr.Blocks(css=css) as demo:
image.stream(
fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
)
image.change(lambda n: n, outputs=[number])
image.on_additional_outputs(lambda n: n, outputs=number)
demo.launch()

View File

@@ -6,8 +6,8 @@ For example, you can control the size of the frames captured from the webcam lik
```python
track_constraints = {
"width": {"ideal": 500},
"height": {"ideal": 500},
"width": {"exact": 500},
"height": {"exact": 500},
"frameRate": {"ideal": 30},
}
webrtc = WebRTC(track_constraints=track_constraints,
@@ -16,6 +16,22 @@ webrtc = WebRTC(track_constraints=track_constraints,
```
!!! warning
WebRTC may not enforce your constraints. For example, it may rescale your video
(while keeping the same resolution) in order to maintain the desired (or reach a better) frame rate. If you
really want to enforce height, width and resolution constraints, use the `rtp_params` parameter and set `"degradationPreference": "maintain-resolution"`.
```python
image = WebRTC(
label="Stream",
mode="send",
track_constraints=track_constraints,
rtp_params={"degradationPreference": "maintain-resolution"}
)
```
## The RTC Configuration
You can configure how the connection is created on the client by passing an `rtc_configuration` parameter to the `WebRTC` component constructor.

View File

@@ -32,6 +32,7 @@
export let time_limit: number | null = null;
export let modality: "video" | "audio" = "video";
export let mode: "send-receive" | "receive" | "send" = "send-receive";
export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
export let track_constraints: MediaTrackConstraints = {};
const on_change_cb = (msg: "change" | "tick") => {
@@ -98,6 +99,8 @@
{rtc_configuration}
{time_limit}
{mode}
{track_constraints}
{rtp_params}
{on_change_cb}
on:clear={() => gradio.dispatch("clear")}
on:play={() => gradio.dispatch("play")}
@@ -125,6 +128,7 @@
{time_limit}
{track_constraints}
{mode}
{rtp_params}
i18n={gradio.i18n}
on:tick={() => gradio.dispatch("tick")}
on:error={({ detail }) => gradio.dispatch("error", detail)}

View File

@@ -31,6 +31,7 @@
export let i18n: I18nFormatter;
export let time_limit: number | null = null;
export let track_constraints: MediaTrackConstraints = {};
export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
export let on_change_cb: (mg: "tick" | "change") => void;
let options_open = false;
@@ -143,7 +144,7 @@
}
if (stream == null) return;
start(stream, pc, mode === "send" ? null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb).then((connection) => {
start(stream, pc, mode === "send" ? null: audio_player, server.offer, _webrtc_id, "audio", on_change_cb, rtp_params).then((connection) => {
pc = connection;
}).catch(() => {
console.info("catching")

View File

@@ -23,6 +23,7 @@
export let track_constraints: MediaTrackConstraints = {};
export let mode: "send" | "send-receive";
export let on_change_cb: (msg: "change" | "tick") => void;
export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
const dispatch = createEventDispatcher<{
change: FileData | null;
@@ -53,6 +54,7 @@
{time_limit}
{track_constraints}
{mode}
{rtp_params}
{on_change_cb}
on:error
on:start_recording

View File

@@ -27,8 +27,7 @@
export let on_change_cb: (msg: "tick" | "change") => void;
export let mode: "send-receive" | "send";
const _webrtc_id = Math.random().toString(36).substring(2);
console.log("mode", mode);
export let rtp_params: RTCRtpParameters = {} as RTCRtpParameters;
export const modify_stream: (state: "open" | "closed" | "waiting") => void = (
state: "open" | "closed" | "waiting"
@@ -82,7 +81,7 @@
async function access_webcam(): Promise<void> {
try {
get_video_stream(include_audio, video_source)
get_video_stream(include_audio, video_source, null, track_constraints)
.then(async (local_stream) => {
webcam_accessed = true;
available_video_devices = await get_devices();
@@ -144,7 +143,7 @@
)
stream_state = "waiting"
webrtc_id = Math.random().toString(36).substring(2);
start(stream, pc, mode === "send" ? null: video_source, server.offer, webrtc_id, "video", on_change_cb).then((connection) => {
start(stream, pc, mode === "send" ? null: video_source, server.offer, webrtc_id, "video", on_change_cb, rtp_params).then((connection) => {
pc = connection;
}).catch(() => {
console.info("catching")

View File

@@ -52,6 +52,7 @@ export async function start(
webrtc_id,
modality: "video" | "audio" = "video",
on_change_cb: (msg: "change" | "tick") => void = () => {},
rtp_params = {},
) {
pc = createPeerConnection(pc, node);
const data_channel = pc.createDataChannel("text");
@@ -70,9 +71,13 @@ export async function start(
};
if (stream) {
stream.getTracks().forEach((track) => {
stream.getTracks().forEach(async (track) => {
console.debug("Track stream callback", track);
pc.addTrack(track, stream);
const sender = pc.addTrack(track, stream);
const params = sender.getParameters();
const updated_params = { ...params, ...rtp_params };
await sender.setParameters(updated_params)
console.debug("sender params", sender.getParameters());
});
} else {
console.debug("Creating transceiver!");

View File

@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
[project]
name = "gradio_webrtc"
version = "0.0.14"
version = "0.0.15"
description = "Stream images in realtime with webrtc"
readme = "README.md"
license = "apache-2.0"