[feat] update some feature

sync code of fastrtc, add text support through datachannel, fix safari connect problem support chat without camera or mic
2026-02-05 18:09:23 +08:00 · 2025-03-25 18:05:10 +08:00
parent e1fb40a8a8
commit aefb08150f
222 changed files with 28698 additions and 5889 deletions
--- a/demo/llm_voice_chat/app.py
+++ b/demo/llm_voice_chat/app.py
@@ -0,0 +1,97 @@
+import os
+import time
+
+import gradio as gr
+import numpy as np
+from dotenv import load_dotenv
+from elevenlabs import ElevenLabs
+from fastapi import FastAPI
+from fastrtc import (
+    AdditionalOutputs,
+    ReplyOnPause,
+    Stream,
+    get_stt_model,
+    get_twilio_turn_credentials,
+)
+from gradio.utils import get_space
+from groq import Groq
+from numpy.typing import NDArray
+
+load_dotenv()
+groq_client = Groq()
+tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
+
+
+# See "Talk to Claude" in Cookbook for an example of how to keep
+# track of the chat history.
+def response(
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chatbot: list[dict] | None = None,
+):
+    chatbot = chatbot or []
+    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
+    start = time.time()
+    text = stt_model.stt(audio)
+    print("transcription", time.time() - start)
+    print("prompt", text)
+    chatbot.append({"role": "user", "content": text})
+    yield AdditionalOutputs(chatbot)
+    messages.append({"role": "user", "content": text})
+    response_text = (
+        groq_client.chat.completions.create(
+            model="llama-3.1-8b-instant",
+            max_tokens=200,
+            messages=messages,  # type: ignore
+        )
+        .choices[0]
+        .message.content
+    )
+
+    chatbot.append({"role": "assistant", "content": response_text})
+
+    for i, chunk in enumerate(
+        tts_client.text_to_speech.convert_as_stream(
+            text=response_text,  # type: ignore
+            voice_id="JBFqnCBsd6RMkjVDRZzb",
+            model_id="eleven_multilingual_v2",
+            output_format="pcm_24000",
+        )
+    ):
+        if i == 0:
+            yield AdditionalOutputs(chatbot)
+        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+        yield (24000, audio_array)
+
+
+chatbot = gr.Chatbot(type="messages")
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+    ui_args={"title": "LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)"},
+)
+
+# Mount the STREAM UI to the FastAPI app
+# Because I don't want to build the UI manually
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
+
+
+if __name__ == "__main__":
+    import os
+
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        stream.ui.launch(server_port=7860)