Mirror of https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
Synced 2026-02-05 18:09:23 +08:00
Add ability to Hide Title in Built-in UI + llama 4 cartesia tweaks (#299)
* merge title
* Fix
BIN  demo/talk_to_llama4/AV_Huggy.png  (new file; binary file not shown; size: 46 KiB)
@@ -9,11 +9,12 @@ from fastapi import FastAPI
 from fastapi.responses import HTMLResponse, StreamingResponse
 from fastrtc import (
     AdditionalOutputs,
+    CartesiaTTSOptions,
     ReplyOnPause,
     Stream,
     audio_to_bytes,
     get_cloudflare_turn_credentials_async,
     get_current_context,
     get_stt_model,
     get_tts_model,
 )
 from groq import Groq
@@ -22,9 +23,11 @@ from numpy.typing import NDArray
 curr_dir = Path(__file__).parent
 load_dotenv()
 
-tts_model = get_tts_model()
+tts_model = get_tts_model(
+    model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
+)
 groq = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 stt_model = get_stt_model()
 
 conversations: dict[str, list[dict[str, str]]] = {}
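Note: this hunk switches TTS from the default engine to Cartesia by passing a backend name and API key to get_tts_model. A minimal standalone sketch of the same configuration, assuming CARTESIA_API_KEY is set in a local .env file and that each streamed chunk is a (sample_rate, samples) pair, the same frame shape a fastrtc handler yields (the sample phrase and the print are illustrative only):

import os

from dotenv import load_dotenv
from fastrtc import get_tts_model

load_dotenv()  # pick up CARTESIA_API_KEY from a local .env file

# Select the Cartesia backend instead of the default TTS engine.
tts_model = get_tts_model(
    model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
)

# stream_tts_sync yields audio chunks as they arrive, so playback can
# start before the full utterance has been synthesized.
for sample_rate, samples in tts_model.stream_tts_sync("Hello from Cartesia!"):
    print(sample_rate, samples.shape)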
@@ -43,14 +46,8 @@ def response(user_audio: tuple[int, NDArray[np.int16]]):
         ]
     messages = conversations[context.webrtc_id]
 
-    transcription = groq.audio.transcriptions.create(
-        file=("audio.wav", audio_to_bytes(user_audio)),
-        model="distil-whisper-large-v3-en",
-        response_format="verbose_json",
-    )
-    print(transcription.text)
-
-    messages.append({"role": "user", "content": transcription.text})
+    transcription = stt_model.stt(user_audio)
+    messages.append({"role": "user", "content": transcription})
 
     completion = groq.chat.completions.create(  # type: ignore
         model="meta-llama/llama-4-scout-17b-16e-instruct",
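Note: the Groq distil-whisper upload is replaced by the local fastrtc STT model, removing one network round-trip per utterance. stt() takes the same (sample_rate, int16 samples) tuple the handler receives and returns the transcript as a plain string, which is why the file upload and verbose_json handling disappear. A minimal sketch, with one second of silence standing in for real microphone audio:

import numpy as np
from fastrtc import get_stt_model

stt_model = get_stt_model()  # loads the local model on first use

# stt() accepts the same (sample_rate, samples) tuple a handler receives
# and returns a plain string -- no upload, no response_format parsing.
sample_rate = 16_000
silence = np.zeros(sample_rate, dtype=np.int16)  # stand-in for real audio
print(repr(stt_model.stt((sample_rate, silence))))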
@@ -68,7 +65,9 @@ def response(user_audio: tuple[int, NDArray[np.int16]]):
     long_response = response["long"]
     messages.append({"role": "assistant", "content": long_response})
     conversations[context.webrtc_id] = messages
-    yield from tts_model.stream_tts_sync(short_response)
+    yield from tts_model.stream_tts_sync(
+        short_response, options=CartesiaTTSOptions(sample_rate=24_000)
+    )
     yield AdditionalOutputs(messages)
 
 
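Note: pinning the output rate with CartesiaTTSOptions keeps every frame the TTS stream yields in the same format for the WebRTC track, while the handler still speaks only the short response and stores the long one in chat history. A small sketch of the options pattern, assuming the key is configured as above; the speak() helper is hypothetical:

import os

from fastrtc import CartesiaTTSOptions, get_tts_model

tts_model = get_tts_model(
    model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
)

# Fix the output rate at 24 kHz so every yielded frame has the same format.
options = CartesiaTTSOptions(sample_rate=24_000)

def speak(text: str):
    # Hypothetical helper: same shape as the handler above -- yield
    # (sample_rate, samples) frames straight from the TTS stream.
    yield from tts_model.stream_tts_sync(text, options=options)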
@@ -78,9 +77,22 @@ stream = Stream(
     mode="send-receive",
     additional_outputs=[gr.Chatbot(type="messages")],
     additional_outputs_handler=lambda old, new: new,
-    rtc_configuration=get_cloudflare_turn_credentials_async,
+    rtc_configuration=None,
+    ui_args={"hide_title": True},
 )
 
+with gr.Blocks() as demo:
+    gr.HTML(
+        f"""
+        <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
+            <img src="/gradio_api/file={str((Path(__file__).parent / "AV_Huggy.png").resolve())}" alt="AV Huggy" style="height: 100px; margin-right: 10px"> FastRTC + Cartesia TTS = Blazing Fast LLM Audio
+        </h1>
+        """
+    )
+    stream.ui.render()
+
+stream.ui = demo
+
 app = FastAPI()
 stream.mount(app)
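Note: ui_args={"hide_title": True} is the new knob this PR adds. It suppresses the built-in heading so a wrapper gr.Blocks app can supply its own header around stream.ui.render(), and reassigning stream.ui makes launch()/mount() serve the wrapped layout; rtc_configuration=None drops the Cloudflare TURN lookup, presumably for local use. A self-contained sketch of the pattern, with a trivial echo handler and a plain-text header as stand-ins:

import gradio as gr
from fastrtc import ReplyOnPause, Stream

def echo(audio):
    # Stand-in handler: send the caller's audio straight back.
    yield audio

stream = Stream(
    handler=ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    ui_args={"hide_title": True},  # hide the built-in title bar
)

with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>My Custom Title</h1>")
    stream.ui.render()  # embed the stock FastRTC UI below the custom header

stream.ui = demo  # launch()/mount() now serve the wrapped layout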
@@ -109,9 +121,13 @@ async def _(webrtc_id: str):
 
 if __name__ == "__main__":
     import os
+    from pathlib import Path
 
     if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860)
+        stream.ui.launch(
+            server_port=7860,
+            allowed_paths=[str((Path(__file__).parent / "AV_Huggy.png").resolve())],
+        )
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported")
     else:
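Note: Gradio only serves local files that are explicitly whitelisted, so the header image referenced via the /gradio_api/file=... URL has to be passed to launch(allowed_paths=[...]), which is exactly what the last hunk adds. A stripped-down sketch of the same mechanism:

from pathlib import Path

import gradio as gr

# Without the whitelist entry, Gradio refuses to serve the file and the
# /gradio_api/file=... URL in the header image will not resolve.
huggy = (Path(__file__).parent / "AV_Huggy.png").resolve()

with gr.Blocks() as demo:
    gr.HTML(f'<img src="/gradio_api/file={huggy}" alt="AV Huggy">')

demo.launch(server_port=7860, allowed_paths=[str(huggy)])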