[feat] update some feature

Sync code with fastrtc, add text support through the data channel, fix the Safari connection problem, and support chat without a camera or mic.
2026-02-05 18:09:23 +08:00 · 2025-03-25 18:05:10 +08:00
parent e1fb40a8a8
commit aefb08150f
222 changed files with 28698 additions and 5889 deletions
--- a/demo/nextjs_voice_chat/backend/server.py
+++ b/demo/nextjs_voice_chat/backend/server.py
@@ -0,0 +1,129 @@
+import fastapi
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
+from fastrtc.utils import audio_to_bytes
+from openai import OpenAI
+import logging
+import time
+from fastapi.middleware.cors import CORSMiddleware
+from elevenlabs import VoiceSettings, stream
+from elevenlabs.client import ElevenLabs
+import numpy as np
+
+from .env import LLM_API_KEY, ELEVENLABS_API_KEY
+
+
+# System prompt used as the first entry of the chat history sent to the LLM.
+sys_prompt = """
+You are a helpful assistant. You are witty, engaging and fun. You love being interactive with the user. 
+You also can add minimalistic utterances like 'uh-huh' or 'mm-hmm' to the conversation to make it more natural. However, only vocalization are allowed, no actions or other non-vocal sounds.
+Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm ready for this...' or 'I bet you already regret clicking that button...'
+"""
+
+# Process-wide conversation history: `echo` appends user/assistant turns to it
+# and the `/reset` endpoint rebinds it to a fresh list.
+messages = [{"role": "system", "content": sys_prompt}]
+
+# Client used by `echo` for the streamed chat completion.
+openai_client = OpenAI(api_key=LLM_API_KEY)
+
+# Client used by `echo` for both speech-to-text and text-to-speech.
+elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+
+# Configure the root logger at INFO so the timing/response logs below are shown.
+logging.basicConfig(level=logging.INFO)
+
+
+def echo(audio):
+    stt_time = time.time()
+
+    logging.info("Performing STT")
+
+    transcription = elevenlabs_client.speech_to_text.convert(
+        file=audio_to_bytes(audio),
+        model_id="scribe_v1",
+        tag_audio_events=False,
+        language_code="eng",
+        diarize=False,
+    )
+    prompt = transcription.text
+    if prompt == "":
+        logging.info("STT returned empty string")
+        return
+    logging.info(f"STT response: {prompt}")
+
+    messages.append({"role": "user", "content": prompt})
+
+    logging.info(f"STT took {time.time() - stt_time} seconds")
+
+    llm_time = time.time()
+
+    def text_stream():
+        global full_response
+        full_response = ""
+
+        response = openai_client.chat.completions.create(
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
+        )
+
+        for chunk in response:
+            if chunk.choices[0].finish_reason == "stop":
+                break
+            if chunk.choices[0].delta.content:
+                full_response += chunk.choices[0].delta.content
+                yield chunk.choices[0].delta.content
+
+    audio_stream = elevenlabs_client.generate(
+        text=text_stream(),
+        voice="Rachel",  # Cassidy is also really good
+        voice_settings=VoiceSettings(
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
+        ),
+        model="eleven_multilingual_v2",
+        output_format="pcm_24000",
+        stream=True,
+    )
+
+    for audio_chunk in audio_stream:
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
+        yield (24000, audio_array)
+
+    messages.append({"role": "assistant", "content": full_response + " "})
+    logging.info(f"LLM response: {full_response}")
+    logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)
+
+app = fastapi.FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+stream.mount(app)
+
+
+@app.get("/reset")
+async def reset():
+    global messages
+    logging.info("Resetting chat")
+    messages = [{"role": "system", "content": sys_prompt}]
+    return {"status": "success"}