Fix TURN credentials for interactive video + other Gemini Audio Video demo tweaks (#297)

* Gemini

* Add code

* demo tweaks
This commit is contained in:
Freddy Boulton
2025-04-23 12:52:47 -04:00
committed by GitHub
parent f3308b6e81
commit 24349dee0c
6 changed files with 202 additions and 179 deletions

View File

@@ -4,12 +4,12 @@ emoji: ♊️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
sdk_version: 5.25.2
app_file: app.py
pinned: false
license: mit
short_description: Gemini understands audio and video!
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -6,12 +6,14 @@ from io import BytesIO
import gradio as gr
import numpy as np
import websockets
from dotenv import load_dotenv
from fastrtc import (
AsyncAudioVideoStreamHandler,
Stream,
WebRTC,
get_twilio_turn_credentials,
get_cloudflare_turn_credentials_async,
wait_for_item,
)
from google import genai
from gradio.utils import get_space
@@ -61,18 +63,24 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
)
config = {"response_modalities": ["AUDIO"]}
async with client.aio.live.connect(
model="gemini-2.0-flash-exp", config=config
model="gemini-2.0-flash-exp",
config=config, # type: ignore
) as session:
self.session = session
print("set session")
while not self.quit.is_set():
turn = self.session.receive()
async for response in turn:
if data := response.data:
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
try:
async for response in turn:
if data := response.data:
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
self.audio_queue.put_nowait(audio)
except websockets.exceptions.ConnectionClosedOK:
print("connection closed")
break
async def video_receive(self, frame: np.ndarray):
self.video_queue.put_nowait(frame)
if self.session:
# send image every 1 second
print(time.time() - self.last_frame_time)
@@ -82,10 +90,12 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
if self.latest_args[1] is not None:
await self.session.send(input=encode_image(self.latest_args[1]))
self.video_queue.put_nowait(frame)
async def video_emit(self):
return await self.video_queue.get()
frame = await wait_for_item(self.video_queue, 0.01)
if frame is not None:
return frame
else:
return np.zeros((100, 100, 3), dtype=np.uint8)
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
_, array = frame
@@ -95,13 +105,15 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
await self.session.send(input=audio_message)
async def emit(self):
array = await self.audio_queue.get()
return (self.output_sample_rate, array)
array = await wait_for_item(self.audio_queue, 0.01)
if array is not None:
return (self.output_sample_rate, array)
return array
async def shutdown(self) -> None:
if self.session:
self.quit.set()
await self.session._websocket.close()
await self.session.close()
self.quit.clear()
@@ -109,10 +121,8 @@ stream = Stream(
handler=GeminiHandler(),
modality="audio-video",
mode="send-receive",
rtc_configuration=get_twilio_turn_credentials()
if get_space() == "spaces"
else None,
time_limit=90 if get_space() else None,
rtc_configuration=get_cloudflare_turn_credentials_async,
time_limit=180 if get_space() else None,
additional_inputs=[
gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
],
@@ -151,9 +161,7 @@ with gr.Blocks(css=css) as demo:
modality="audio-video",
mode="send-receive",
elem_id="video-source",
rtc_configuration=get_twilio_turn_credentials()
if get_space() == "spaces"
else None,
rtc_configuration=get_cloudflare_turn_credentials_async,
icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
pulse_color="rgb(255, 255, 255)",
icon_button_color="rgb(255, 255, 255)",
@@ -167,7 +175,7 @@ with gr.Blocks(css=css) as demo:
GeminiHandler(),
inputs=[webrtc, image_input],
outputs=[webrtc],
time_limit=60 if get_space() else None,
time_limit=180 if get_space() else None,
concurrency_limit=2 if get_space() else None,
)

View File

@@ -1,4 +1,4 @@
fastrtc
fastrtc==0.0.23.rc1
python-dotenv
google-genai
twilio