stt models (#147)

Author: Freddy Boulton (committed by GitHub)
Date: 2025-03-07 17:03:11 -05:00
Parent: cbbfa17679
Commit: 504eb452f0
6 changed files with 55 additions and 64 deletions

@@ -1,5 +1,4 @@
 import fastapi
-from fastapi.responses import FileResponse
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
 from fastrtc.utils import audio_to_bytes
 from openai import OpenAI
@@ -9,7 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
 from elevenlabs import VoiceSettings, stream
 from elevenlabs.client import ElevenLabs
 import numpy as np
-import io
 from .env import LLM_API_KEY, ELEVENLABS_API_KEY
@@ -22,16 +20,14 @@ Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm read
 messages = [{"role": "system", "content": sys_prompt}]

-openai_client = OpenAI(
-    api_key=LLM_API_KEY
-)
+openai_client = OpenAI(api_key=LLM_API_KEY)
 elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

 logging.basicConfig(level=logging.INFO)


 def echo(audio):
     stt_time = time.time()
     logging.info("Performing STT")
@@ -54,18 +50,15 @@ def echo(audio):
     logging.info(f"STT took {time.time() - stt_time} seconds")
     llm_time = time.time()

     def text_stream():
         global full_response
         full_response = ""
         response = openai_client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            max_tokens=200,
-            stream=True
+            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
         )
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
                 break
@@ -77,41 +70,43 @@ def echo(audio):
         text=text_stream(),
         voice="Rachel",  # Cassidy is also really good
         voice_settings=VoiceSettings(
-            similarity_boost=0.9,
-            stability=0.6,
-            style=0.4,
-            speed=1
+            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
         ),
         model="eleven_multilingual_v2",
         output_format="pcm_24000",
-        stream=True
+        stream=True,
     )
     for audio_chunk in audio_stream:
-        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = (
+            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        )
         yield (24000, audio_array)

     messages.append({"role": "assistant", "content": full_response + " "})
     logging.info(f"LLM response: {full_response}")
     logging.info(f"LLM took {time.time() - llm_time} seconds")


-stream = Stream(ReplyOnPause(echo,
-    algo_options=AlgoOptions(
-        audio_chunk_duration=0.5,
-        started_talking_threshold=0.1,
-        speech_threshold=0.03
-    ),
-    model_options=SileroVadOptions(
-        threshold=0.75,
-        min_speech_duration_ms=250,
-        min_silence_duration_ms=1500,
-        speech_pad_ms=400,
-        max_speech_duration_s=15
-    )),
-    modality="audio",
-    mode="send-receive"
-)
+stream = Stream(
+    ReplyOnPause(
+        echo,
+        algo_options=AlgoOptions(
+            audio_chunk_duration=0.5,
+            started_talking_threshold=0.1,
+            speech_threshold=0.03,
+        ),
+        model_options=SileroVadOptions(
+            threshold=0.75,
+            min_speech_duration_ms=250,
+            min_silence_duration_ms=1500,
+            speech_pad_ms=400,
+            max_speech_duration_s=15,
+        ),
+    ),
+    modality="audio",
+    mode="send-receive",
+)

 app = fastapi.FastAPI()
@@ -125,9 +120,10 @@ app.add_middleware(
 stream.mount(app)


 @app.get("/reset")
 async def reset():
+    global messages
     logging.info("Resetting chat")
     messages = [{"role": "system", "content": sys_prompt}]
-    return {"status": "success"}
\ No newline at end of file
+    return {"status": "success"}
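
For a quick sanity check of the /reset route after this change, something like the following can be run against a local server (the module path app:app and uvicorn's default host/port are assumptions, not part of this commit):

    # Start the app first, e.g.: uvicorn app:app --host 127.0.0.1 --port 8000
    # (module path is an assumption; adjust to wherever this file lives)
    import requests  # assumes the requests package is installed

    # GET /reset drops the chat history back to just the system prompt
    resp = requests.get("http://127.0.0.1:8000/reset")
    print(resp.json())  # expected: {"status": "success"}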