import logging
import time

import fastapi
import numpy as np
from fastapi.middleware.cors import CORSMiddleware
from fastrtc import AlgoOptions, ReplyOnPause, SileroVadOptions, Stream
from fastrtc.utils import audio_to_bytes
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from openai import OpenAI

from .env import LLM_API_KEY, ELEVENLABS_API_KEY

sys_prompt = """
You are a helpful assistant. You are witty, engaging and fun. You love being interactive with the user.
You can also add minimalistic utterances like 'uh-huh' or 'mm-hmm' to the conversation to make it more natural.
However, only vocalizations are allowed, no actions or other non-vocal sounds.
Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm ready for this...' or 'I bet you already regret clicking that button...'
"""

messages = [{"role": "system", "content": sys_prompt}]

openai_client = OpenAI(api_key=LLM_API_KEY)
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

logging.basicConfig(level=logging.INFO)


def echo(audio):
    # Speech-to-text: transcribe the user's utterance with ElevenLabs Scribe.
    stt_time = time.time()
    logging.info("Performing STT")
    transcription = elevenlabs_client.speech_to_text.convert(
        file=audio_to_bytes(audio),
        model_id="scribe_v1",
        tag_audio_events=False,
        language_code="eng",
        diarize=False,
    )
    prompt = transcription.text
    if prompt == "":
        logging.info("STT returned empty string")
        return
    logging.info(f"STT response: {prompt}")
    messages.append({"role": "user", "content": prompt})
    logging.info(f"STT took {time.time() - stt_time} seconds")

    llm_time = time.time()

    def text_stream():
        # Stream LLM tokens as they arrive so TTS can start speaking before
        # the full response is generated. The accumulated text is kept in a
        # global so it can be appended to the chat history afterwards.
        global full_response
        full_response = ""
        response = openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=200,
            stream=True,
        )
        for chunk in response:
            if chunk.choices[0].finish_reason == "stop":
                break
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                yield chunk.choices[0].delta.content

    # Text-to-speech: feed the token stream straight into ElevenLabs and
    # receive raw PCM audio back, chunk by chunk.
    audio_stream = elevenlabs_client.generate(
        text=text_stream(),
        voice="Rachel",  # Cassidy is also really good
        voice_settings=VoiceSettings(
            similarity_boost=0.9,
            stability=0.6,
            style=0.4,
            speed=1,
        ),
        model="eleven_multilingual_v2",
        output_format="pcm_24000",
        stream=True,
    )

    # Convert each 16-bit PCM chunk to float32 in [-1, 1] and yield it to FastRTC.
    for audio_chunk in audio_stream:
        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
        yield (24000, audio_array)

    messages.append({"role": "assistant", "content": full_response + " "})
    logging.info(f"LLM response: {full_response}")
    logging.info(f"LLM took {time.time() - llm_time} seconds")


stream = Stream(
    ReplyOnPause(
        echo,
        algo_options=AlgoOptions(
            audio_chunk_duration=0.5,
            started_talking_threshold=0.1,
            speech_threshold=0.03,
        ),
        model_options=SileroVadOptions(
            threshold=0.75,
            min_speech_duration_ms=250,
            min_silence_duration_ms=1500,
            speech_pad_ms=400,
            max_speech_duration_s=15,
        ),
    ),
    modality="audio",
    mode="send-receive",
)

app = fastapi.FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

stream.mount(app)


@app.get("/reset")
async def reset():
    global messages
    logging.info("Resetting chat")
    messages = [{"role": "system", "content": sys_prompt}]
    return {"status": "success"}
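

# A minimal sketch for running the app locally (an assumption, not part of the
# original): serve the FastAPI app with uvicorn. Because this module uses a
# relative import, launch it as a package module, e.g. `python -m app.main`;
# the module path "app.main" and the port below are hypothetical placeholders.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)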