Additional outputs tweaks + fix track constraints (#28)

* code

* add code

* add code
Author: Freddy Boulton
Date: 2024-12-03 15:32:43 -05:00
Committed by: GitHub
Parent: 65d0ba023f
Commit: c85c117576
10 changed files with 91 additions and 53 deletions
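Both files below use the additional-outputs mechanism named in the title. For orientation, a minimal sketch of that pattern, assuming the gradio_webrtc API exactly as it appears in the diffs; rtc_configuration is omitted (local testing only) and the component names are illustrative:

import gradio as gr
from gradio_webrtc import AdditionalOutputs, WebRTC

def process(frame):
    # A stream handler can wrap extra values in AdditionalOutputs to push them
    # to the UI alongside (or instead of) returning processed media.
    return AdditionalOutputs(frame.shape[0])

with gr.Blocks() as demo:
    webcam = WebRTC(label="Stream", mode="send")  # rtc_configuration omitted: local only
    height = gr.Number(label="Frame height")      # illustrative output component
    webcam.stream(fn=process, inputs=[webcam], outputs=[webcam])
    # Values wrapped in AdditionalOutputs are delivered to this callback.
    webcam.on_additional_outputs(lambda h: h, outputs=height)

demo.launch()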


@@ -1,37 +1,21 @@
import logging
import tempfile
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC
from openai import OpenAI
from pydub import AudioSegment
load_dotenv()
# Configure the root logger to WARNING to suppress debug messages from other libraries
logging.basicConfig(level=logging.WARNING)
# Create a console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
# Create a formatter
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
# Configure the logger for your specific library
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)
client = OpenAI()
def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
print("audio", audio)
segment = AudioSegment(
audio[1].tobytes(),
frame_rate=audio[0],
@@ -39,12 +23,14 @@ def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
        channels=1,
    )
    transcript.append(
        {"role": "user", "content": gr.Audio((audio[0], audio[1].squeeze()))}
    )
    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
        segment.export(temp_audio.name, format="mp3")
        next_chunk = client.audio.transcriptions.create(
            model="whisper-1", file=open(temp_audio.name, "rb")
        ).text
    # The user turn holds the audio itself; the transcription appears as the assistant turn.
    transcript.append({"role": "assistant", "content": next_chunk})
    yield AdditionalOutputs(transcript)
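
The hunks above only cover the handler; the wiring is not part of the visible diff. A sketch of how it would plug in, reusing the imports and transcribe() from above — the mode/modality arguments and the Chatbot output component are assumptions, not something this diff confirms:

with gr.Blocks() as demo:
    chat = gr.Chatbot(type="messages")  # assumed: holds the audio/transcript messages
    audio = WebRTC(mode="send-receive", modality="audio")  # assumed arguments
    audio.stream(
        ReplyOnPause(transcribe),  # run transcribe() each time the speaker pauses
        inputs=[audio, chat],
        outputs=[audio],
    )
    # Each AdditionalOutputs(transcript) yielded above lands here and updates the chat.
    audio.on_additional_outputs(lambda t: t, outputs=chat)

demo.launch()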


@@ -49,17 +49,14 @@ else:
    def detection(frame, conf_threshold=0.3):
        print("frame.shape", frame.shape)
        frame = cv2.flip(frame, 0)
        global count
        if random.random() > 0.98:
            return AdditionalOutputs(count)
        count += 1
        return AdditionalOutputs(1)


css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
@@ -78,7 +75,13 @@ with gr.Blocks(css=css) as demo:
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            image = WebRTC(
                label="Stream",
                rtc_configuration=rtc_configuration,
                mode="send",
                # Exact constraints: capture must be 800x600, with no fallback.
                track_constraints={
                    "width": {"exact": 800},
                    "height": {"exact": 600},
                    "aspectRatio": {"exact": 1.33333},
                },
                # Under bandwidth pressure, drop frame rate rather than resolution.
                rtp_params={"degradationPreference": "maintain-resolution"},
            )
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
@@ -92,6 +95,6 @@ with gr.Blocks(css=css) as demo:
    image.stream(
        fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
    )
    # Route each value emitted via AdditionalOutputs into the `number` component.
    image.on_additional_outputs(lambda n: n, outputs=number)
demo.launch()
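
One note on the track_constraints added above: they follow standard MediaTrackConstraints semantics, so {"exact": ...} tells the browser to fail the capture rather than fall back when the camera cannot deliver 800x600. A softer, hypothetical variant (not what this commit uses) would request ideal values instead:

# "ideal" lets the browser pick the closest supported resolution
# instead of erroring out when exactly 800x600 is unavailable.
track_constraints = {
    "width": {"ideal": 800},
    "height": {"ideal": 600},
    "aspectRatio": {"ideal": 1.33333},
}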