stt models (#147)

Author: Freddy Boulton (committed by GitHub)
Date: 2025-03-07 17:03:11 -05:00
Parent: cbbfa17679
Commit: 504eb452f0
6 changed files with 55 additions and 64 deletions

@@ -1,5 +1,4 @@
 import fastapi
-from fastapi.responses import FileResponse
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
 from fastrtc.utils import audio_to_bytes
 from openai import OpenAI
@@ -9,7 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
 from elevenlabs import VoiceSettings, stream
 from elevenlabs.client import ElevenLabs
 import numpy as np
-import io
 from .env import LLM_API_KEY, ELEVENLABS_API_KEY
@@ -22,16 +20,14 @@ Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm read
 messages = [{"role": "system", "content": sys_prompt}]

-openai_client = OpenAI(
-    api_key=LLM_API_KEY
-)
+openai_client = OpenAI(api_key=LLM_API_KEY)
 elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

 logging.basicConfig(level=logging.INFO)


 def echo(audio):
     stt_time = time.time()
     logging.info("Performing STT")
@@ -54,18 +50,15 @@ def echo(audio):
     logging.info(f"STT took {time.time() - stt_time} seconds")
     llm_time = time.time()

     def text_stream():
         global full_response
         full_response = ""
         response = openai_client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            max_tokens=200,
-            stream=True
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
         )
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
                 break
@@ -77,41 +70,43 @@ def echo(audio):
         text=text_stream(),
         voice="Rachel",  # Cassidy is also really good
         voice_settings=VoiceSettings(
-            similarity_boost=0.9,
-            stability=0.6,
-            style=0.4,
-            speed=1
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
         ),
         model="eleven_multilingual_v2",
         output_format="pcm_24000",
-        stream=True
+        stream=True,
     )
     for audio_chunk in audio_stream:
-        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
         yield (24000, audio_array)

     messages.append({"role": "assistant", "content": full_response + " "})
     logging.info(f"LLM response: {full_response}")
     logging.info(f"LLM took {time.time() - llm_time} seconds")


-stream = Stream(ReplyOnPause(echo,
-    algo_options=AlgoOptions(
-        audio_chunk_duration=0.5,
-        started_talking_threshold=0.1,
-        speech_threshold=0.03
-    ),
-    model_options=SileroVadOptions(
-        threshold=0.75,
-        min_speech_duration_ms=250,
-        min_silence_duration_ms=1500,
-        speech_pad_ms=400,
-        max_speech_duration_s=15
-    )),
-    modality="audio",
-    mode="send-receive"
-)
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)

 app = fastapi.FastAPI()
@@ -125,9 +120,10 @@ app.add_middleware(
 stream.mount(app)


 @app.get("/reset")
 async def reset():
+    global messages
     logging.info("Resetting chat")
     messages = [{"role": "system", "content": sys_prompt}]
-    return {"status": "success"}
\ No newline at end of file
+    return {"status": "success"}
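
For a quick sanity check of the /reset route after this change, something like the following can be run against a local server (the module path app:app and uvicorn's default host/port are assumptions, not part of this commit):

    # Start the app first, e.g.: uvicorn app:app --host 127.0.0.1 --port 8000
    # (module path is an assumption; adjust to wherever this file lives)
    import requests  # assumes the requests package is installed

    # GET /reset drops the chat history back to just the system prompt
    resp = requests.get("http://127.0.0.1:8000/reset")
    print(resp.json())  # expected: {"status": "success"}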