Files
gradio-webrtc/demo/nextjs_voice_chat/backend/server.py
Rohan Richard 6905810f37 Adding nextjs + 11labs + openai streaming demo (#139)
* adding nextjs + 11labs + openai streaming demo

* removing package-lock
2025-03-07 14:24:23 -05:00

133 lines
3.9 KiB
Python

import fastapi
from fastapi.responses import FileResponse
from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
from fastrtc.utils import audio_to_bytes
from openai import OpenAI
import logging
import time
from fastapi.middleware.cors import CORSMiddleware
from elevenlabs import VoiceSettings, stream
from elevenlabs.client import ElevenLabs
import numpy as np
import io
from .env import LLM_API_KEY, ELEVENLABS_API_KEY
# System prompt that seeds every conversation; GET /reset restores the
# history to just this message.
sys_prompt = """
You are a helpful assistant. You are witty, engaging and fun. You love being interactive with the user.
You also can add minimalistic utterances like 'uh-huh' or 'mm-hmm' to the conversation to make it more natural. However, only vocalization are allowed, no actions or other non-vocal sounds.
Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm ready for this...' or 'I bet you already regret clicking that button...'
"""
# Shared module-level chat history, mutated by echo() (appends user/assistant
# turns) and rebound by the /reset endpoint. NOTE(review): global state means
# all connected clients share one conversation — confirm this is intended.
messages = [{"role": "system", "content": sys_prompt}]
# LLM client; key comes from the local .env module.
openai_client = OpenAI(
api_key=LLM_API_KEY
)
# Used for both speech-to-text (Scribe) and text-to-speech streaming.
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
logging.basicConfig(level=logging.INFO)
def echo(audio):
    """One voice-chat turn: STT -> streamed LLM completion -> streamed TTS.

    Transcribes the recorded utterance with ElevenLabs Scribe, streams a
    GPT-3.5 reply token-by-token into ElevenLabs TTS, and yields
    ``(sample_rate, float32_ndarray)`` tuples for fastrtc to play back.
    Appends both the user prompt and the assistant reply to the shared
    ``messages`` history.

    Args:
        audio: the fastrtc audio payload for the finished utterance
            (passed through ``audio_to_bytes`` for the STT upload).

    Yields:
        tuple[int, np.ndarray]: 24 kHz mono audio chunks, float32 in [-1, 1).
    """
    stt_time = time.time()
    logging.info("Performing STT")
    transcription = elevenlabs_client.speech_to_text.convert(
        file=audio_to_bytes(audio),
        model_id="scribe_v1",
        tag_audio_events=False,
        language_code="eng",
        diarize=False,
    )
    prompt = transcription.text
    if not prompt:
        # Nothing intelligible was transcribed; skip the LLM/TTS round-trip.
        logging.info("STT returned empty string")
        return
    logging.info("STT response: %s", prompt)
    messages.append({"role": "user", "content": prompt})
    logging.info("STT took %s seconds", time.time() - stt_time)

    llm_time = time.time()
    # Accumulate the streamed completion in a closure-local list instead of a
    # module-level global: overlapping turns can no longer clobber each
    # other's transcript, and nothing leaks into the module namespace.
    response_parts = []

    def text_stream():
        # Generator consumed lazily by the TTS client: yields each text delta
        # as OpenAI streams it, while recording it for the chat history.
        completion = openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=200,
            stream=True,
        )
        for chunk in completion:
            if chunk.choices[0].finish_reason == "stop":
                break
            delta = chunk.choices[0].delta.content
            if delta:
                response_parts.append(delta)
                yield delta

    audio_stream = elevenlabs_client.generate(
        text=text_stream(),
        voice="Rachel",  # Cassidy is also really good
        voice_settings=VoiceSettings(
            similarity_boost=0.9,
            stability=0.6,
            style=0.4,
            speed=1,
        ),
        model="eleven_multilingual_v2",
        output_format="pcm_24000",
        stream=True,
    )
    for audio_chunk in audio_stream:
        # Raw little-endian 16-bit PCM -> float32 in [-1, 1) for fastrtc.
        audio_array = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
        yield (24000, audio_array)

    # TTS stream is exhausted, so text_stream() has finished populating
    # response_parts; commit the full reply to the shared history.
    full_response = "".join(response_parts)
    messages.append({"role": "assistant", "content": full_response + " "})
    logging.info("LLM response: %s", full_response)
    logging.info("LLM took %s seconds", time.time() - llm_time)
# Bidirectional audio stream: ReplyOnPause invokes echo() once the caller
# stops talking. NOTE(review): this rebinding shadows the `stream` helper
# imported from elevenlabs above (which is never used) — consider dropping
# that import or renaming this variable.
stream = Stream(ReplyOnPause(echo,
# Pause-detection tuning; semantics per fastrtc's AlgoOptions — values here
# were presumably hand-tuned for this demo, verify against fastrtc docs.
algo_options=AlgoOptions(
audio_chunk_duration=0.5,
started_talking_threshold=0.1,
speech_threshold=0.03
),
# Silero VAD settings: require >=250 ms of speech, treat 1.5 s of silence
# as end-of-turn, pad segments by 400 ms, cap an utterance at 15 s.
model_options=SileroVadOptions(
threshold=0.75,
min_speech_duration_ms=250,
min_silence_duration_ms=1500,
speech_pad_ms=400,
max_speech_duration_s=15
)),
modality="audio",
mode="send-receive"
)
app = fastapi.FastAPI()
# Wide-open CORS so the separately-served Next.js frontend can reach this
# backend. NOTE(review): allow_origins=["*"] with allow_credentials=True is
# fine for a local demo but should be locked down for any deployment.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Attach fastrtc's WebRTC signalling/negotiation routes to this app.
stream.mount(app)
@app.get("/reset")
async def reset():
    """Discard the conversation, keeping only the system prompt.

    Rebinds the module-level ``messages`` history so the next turn starts
    a fresh chat. Returns a simple JSON status payload.
    """
    global messages
    logging.info("Resetting chat")
    fresh_history = [{"role": "system", "content": sys_prompt}]
    messages = fresh_history
    return {"status": "success"}