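"""Voice chat demo: fastrtc + ElevenLabs STT/TTS + OpenAI chat completions.

Per user turn: Silero-VAD pause detection -> ElevenLabs speech-to-text ->
streaming gpt-3.5-turbo completion -> streaming ElevenLabs text-to-speech ->
raw PCM audio back over WebRTC.
"""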
import fastapi
from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
from fastrtc.utils import audio_to_bytes
from openai import OpenAI
import logging
import time
from fastapi.middleware.cors import CORSMiddleware
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
import numpy as np

from .env import LLM_API_KEY, ELEVENLABS_API_KEY
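# env.py is expected to sit next to this module and define the two API keys
# as strings.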

sys_prompt = """
You are a helpful assistant. You are witty, engaging, and fun. You love being interactive with the user.
You can also add minimal utterances like 'uh-huh' or 'mm-hmm' to the conversation to make it more natural. However, only vocalizations are allowed, no actions or other non-vocal sounds.
Begin the conversation with a self-deprecating joke like 'I'm not sure if I'm ready for this...' or 'I bet you already regret clicking that button...'
"""

messages = [{"role": "system", "content": sys_prompt}]

openai_client = OpenAI(api_key=LLM_API_KEY)
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

logging.basicConfig(level=logging.INFO)
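

# ReplyOnPause handler: invoked with the caller's audio (a (sample_rate,
# numpy array) tuple) once a pause in speech is detected. It is a generator
# that yields (sample_rate, numpy array) chunks which fastrtc streams back
# to the caller.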
def echo(audio):
    stt_time = time.time()

    logging.info("Performing STT")
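    # Transcribe the turn with ElevenLabs Scribe (scribe_v1). Audio-event
    # tagging and diarization are disabled since there is a single speaker
    # and only plain text is needed.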
    transcription = elevenlabs_client.speech_to_text.convert(
        file=audio_to_bytes(audio),
        model_id="scribe_v1",
        tag_audio_events=False,
        language_code="eng",
        diarize=False,
    )
    prompt = transcription.text
    if prompt == "":
        logging.info("STT returned empty string")
        return
    logging.info(f"STT response: {prompt}")

    messages.append({"role": "user", "content": prompt})
    logging.info(f"STT took {time.time() - stt_time} seconds")

    llm_time = time.time()
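    # Stream LLM tokens to the TTS engine as they arrive; full_response is
    # closed over below so the complete reply can be logged and appended to
    # the history once audio streaming finishes.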
    full_response = ""

    def text_stream():
        nonlocal full_response
        response = openai_client.chat.completions.create(
            model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
        )
        for chunk in response:
            if chunk.choices[0].finish_reason == "stop":
                break
            if chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                yield chunk.choices[0].delta.content

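    # Stream TTS over the token stream; output_format="pcm_24000" returns raw
    # 16-bit PCM at 24 kHz, so chunks can be decoded with a plain frombuffer.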
    audio_stream = elevenlabs_client.generate(
        text=text_stream(),
        voice="Rachel",  # Cassidy is also really good
        voice_settings=VoiceSettings(
            similarity_boost=0.9, stability=0.6, style=0.4, speed=1
        ),
        model="eleven_multilingual_v2",
        output_format="pcm_24000",
        stream=True,
    )

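    # Convert each raw chunk from int16 to float32 in [-1.0, 1.0] and yield it
    # with its 24 kHz sample rate for playback on the client.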
    for audio_chunk in audio_stream:
        audio_array = (
            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
        )
        yield (24000, audio_array)

    messages.append({"role": "assistant", "content": full_response + " "})
    logging.info(f"LLM response: {full_response}")
    logging.info(f"LLM took {time.time() - llm_time} seconds")

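# Pause-detection pipeline: AlgoOptions tunes the chunking and speech
# thresholds, SileroVadOptions configures the Silero VAD model itself;
# min_silence_duration_ms=1500 means the bot starts replying after roughly
# 1.5 s of silence.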
stream = Stream(
    ReplyOnPause(
        echo,
        algo_options=AlgoOptions(
            audio_chunk_duration=0.5,
            started_talking_threshold=0.1,
            speech_threshold=0.03,
        ),
        model_options=SileroVadOptions(
            threshold=0.75,
            min_speech_duration_ms=250,
            min_silence_duration_ms=1500,
            speech_pad_ms=400,
            max_speech_duration_s=15,
        ),
    ),
    modality="audio",
    mode="send-receive",
)

app = fastapi.FastAPI()

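# Wide-open CORS so any frontend origin can reach the endpoints; tighten
# allow_origins before deploying anywhere real.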
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

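# Mount fastrtc's WebRTC signalling routes on the FastAPI app.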
stream.mount(app)

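# Clear the conversation history, keeping only the system prompt.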
@app.get("/reset")
async def reset():
    global messages
    logging.info("Resetting chat")
    messages = [{"role": "system", "content": sys_prompt}]
    return {"status": "success"}
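

# To run locally (module and package names here are illustrative, not from
# the original repo; adjust to wherever this file and its env.py live):
#   uvicorn demo.main:app --host 0.0.0.0 --port 8000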