mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-04 17:39:23 +08:00
Sync code of fastrtc; add text support through the data channel; fix Safari connection problem; support chat without camera or mic.
98 lines
2.8 KiB
Python
98 lines
2.8 KiB
Python
import os
|
|
import time
|
|
|
|
import gradio as gr
|
|
import numpy as np
|
|
from dotenv import load_dotenv
|
|
from elevenlabs import ElevenLabs
|
|
from fastapi import FastAPI
|
|
from fastrtc import (
|
|
AdditionalOutputs,
|
|
ReplyOnPause,
|
|
Stream,
|
|
get_stt_model,
|
|
get_twilio_turn_credentials,
|
|
)
|
|
from gradio.utils import get_space
|
|
from groq import Groq
|
|
from numpy.typing import NDArray
|
|
|
|
# Read variables from a local .env file (if one exists) into the process
# environment before the API clients below are constructed.
load_dotenv()

# LLM client. No key is passed explicitly here; presumably the SDK picks it
# up from the environment -- confirm against the Groq client docs.
groq_client = Groq()

# Text-to-speech client; api_key is None when ELEVENLABS_API_KEY is unset.
tts_client = ElevenLabs(api_key=os.environ.get("ELEVENLABS_API_KEY"))

# Speech-to-text model, created with get_stt_model()'s default arguments.
stt_model = get_stt_model()
|
|
|
|
|
|
# See "Talk to Claude" in Cookbook for an example of how to keep
|
|
# track of the chat history.
|
|
def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    """Transcribe one user utterance, ask the LLM, and stream the spoken reply.

    Args:
        audio: The captured utterance, passed unchanged to ``stt_model.stt``
            (presumably a ``(sample_rate, samples)`` pair -- confirm against
            the fastrtc handler contract).
        chatbot: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts. ``None`` or an empty list starts a fresh history. The
            list is appended to in place.

    Yields:
        ``AdditionalOutputs(chatbot)`` after the user message is recorded and
        again once the assistant message is ready, interleaved with
        ``(24000, samples)`` tuples where ``samples`` is an ``int16`` array
        of shape ``(1, n)`` decoded from the TTS stream.
    """
    chatbot = chatbot or []
    # Only role/content are forwarded to the LLM; any other keys are dropped.
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    start = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - start)
    print("prompt", text)

    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)
    messages.append({"role": "user", "content": text})

    completion = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        max_tokens=200,
        messages=messages,  # type: ignore
    )
    # The message content may be None; normalise it so that neither the chat
    # history nor the TTS request ever receives a non-string.
    response_text = completion.choices[0].message.content or ""

    chatbot.append({"role": "assistant", "content": response_text})

    if not response_text:
        # Nothing to speak: publish the (empty) assistant turn and stop
        # instead of sending an empty prompt to the TTS service.
        yield AdditionalOutputs(chatbot)
        return

    reply_published = False
    # Streamed chunks are not guaranteed to end on a 16-bit sample boundary;
    # np.frombuffer raises ValueError on an odd-sized buffer, so a dangling
    # byte is carried over and prepended to the next chunk.
    leftover = b""
    for chunk in tts_client.text_to_speech.convert_as_stream(
        text=response_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    ):
        if not reply_published:
            # Show the assistant text as soon as the first audio arrives.
            yield AdditionalOutputs(chatbot)
            reply_published = True

        data = leftover + chunk
        usable = len(data) - (len(data) % 2)
        leftover = data[usable:]
        if usable == 0:
            continue

        audio_array = np.frombuffer(data[:usable], dtype=np.int16).reshape(1, -1)
        yield (24000, audio_array)

    if not reply_published:
        # The TTS stream produced no chunks; still surface the reply text.
        yield AdditionalOutputs(chatbot)
|
|
|
|
|
|
# Chat transcript component; it is wired in below as both an additional input
# and an additional output of the stream, so history round-trips through
# response().
chatbot = gr.Chatbot(type="messages")

stream = Stream(
    modality="audio",
    mode="send-receive",
    # response() is wrapped in ReplyOnPause with a 16 kHz input sample rate.
    handler=ReplyOnPause(response, input_sample_rate=16000),
    # Two-argument callback that returns its second argument unchanged
    # (presumably (current value, new value) -> new value; confirm against
    # the fastrtc docs).
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    # TURN credentials and usage limits are only applied when get_space()
    # is truthy; otherwise these options are left as None.
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
    ui_args={"title": "LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)"},
)
|
|
|
|
# Serve the Stream's built-in Gradio UI from a FastAPI application at the
# root path, rather than building a front end by hand.
app = gr.mount_gradio_app(FastAPI(), stream.ui, path="/")
|
|
|
|
|
|
if __name__ == "__main__":
    # `os` is already imported at module level, so no local import is needed.
    os.environ["GRADIO_SSR_MODE"] = "false"

    mode = os.getenv("MODE")
    if mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        # MODE == "UI", any other value, and an unset MODE all launch the
        # Gradio UI on the same port.
        stream.ui.launch(server_port=7860)
|