update v0.2.0
63
demo/app.py
@@ -1,10 +1,17 @@
|
||||
import asyncio
|
||||
import base64
|
||||
from io import BytesIO
|
||||
import json
|
||||
import math
|
||||
import queue
|
||||
import time
|
||||
import uuid
|
||||
import threading
|
||||
|
||||
from fastrtc.utils import Message
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from gradio_webrtc import (
|
||||
from fastrtc import (
|
||||
AsyncAudioVideoStreamHandler,
|
||||
WebRTC,
|
||||
VideoEmitType,
|
||||
@@ -26,6 +33,7 @@ def encode_image(data: np.ndarray) -> dict:
|
||||
base64_str = str(base64.b64encode(bytes_data), "utf-8")
|
||||
return {"mime_type": "image/jpeg", "data": base64_str}
|
||||
|
||||
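# Shared thread-safe queue that bridges the background frame-generator thread and the async handler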
frame_queue = queue.Queue(maxsize=100)
|
||||
|
||||
class VideoChatHandler(AsyncAudioVideoStreamHandler):
|
||||
def __init__(
|
||||
@@ -38,7 +46,7 @@ class VideoChatHandler(AsyncAudioVideoStreamHandler):
|
||||
input_sample_rate=24000,
|
||||
)
|
||||
self.audio_queue = asyncio.Queue()
|
||||
self.video_queue = asyncio.Queue()
|
||||
self.video_queue = frame_queue
|
||||
self.quit = asyncio.Event()
|
||||
self.session = None
|
||||
self.last_frame_time = 0
|
||||
@@ -50,6 +58,25 @@ class VideoChatHandler(AsyncAudioVideoStreamHandler):
|
||||
output_frame_size=self.output_frame_size,
|
||||
)
|
||||
|
||||
chat_id = ''
|
||||
async def on_chat_datachannel(self, message: Message, channel):
|
||||
# Responses sent back over the channel:
|
||||
# {"type":"chat",id:"标识属于同一段话", "message":"Hello, world!"}
|
||||
# {"type":"avatar_end"} 表示本次对话结束
|
||||
if message['type'] == 'stop_chat':
|
||||
self.chat_id = ''
|
||||
channel.send(json.dumps({'type':'avatar_end'}))
|
||||
else:
|
||||
id = uuid.uuid4().hex
|
||||
self.chat_id = id
|
||||
data = message["data"]
|
||||
halfLen = len(data) // 2
|
||||
channel.send(json.dumps({"type":"chat","id":id,"message":data[:halfLen]}))
|
||||
await asyncio.sleep(5)
|
||||
if self.chat_id == id:
|
||||
channel.send(json.dumps({"type":"chat","id":id,"message":data[halfLen:]}))
|
||||
channel.send(json.dumps({'type':'avatar_end'}))
|
||||
|
||||
async def video_receive(self, frame: np.ndarray):
|
||||
# if self.session:
|
||||
# # send image every 1 second
|
||||
@@ -61,10 +88,11 @@ class VideoChatHandler(AsyncAudioVideoStreamHandler):
|
||||
# print(frame.shape)
|
||||
newFrame = np.array(frame)
|
||||
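# Invert the first color channel (red for RGB frames) as a simple visual effect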
newFrame[:, :, 0] = 255 - newFrame[:, :, 0]
|
||||
self.video_queue.put_nowait(newFrame)
|
||||
# self.video_queue.put_nowait(newFrame)
|
||||
|
||||
async def video_emit(self) -> VideoEmitType:
|
||||
return await self.video_queue.get()
|
||||
# print('123123',frame_queue.qsize())
|
||||
return frame_queue.get()
|
||||
|
||||
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
||||
frame_size, array = frame
|
||||
@@ -114,14 +142,35 @@ with gr.Blocks(css=css) as demo:
|
||||
},
|
||||
}
|
||||
)
|
||||
handler = VideoChatHandler()
|
||||
webrtc.stream(
|
||||
VideoChatHandler(),
|
||||
handler,
|
||||
inputs=[webrtc],
|
||||
outputs=[webrtc],
|
||||
time_limit=150,
|
||||
time_limit=1500,
|
||||
concurrency_limit=2,
|
||||
)
|
||||
|
||||
# Worker thread: generate random numpy frames
|
||||
def generate_frames(width=480, height=960, channels=3):
|
||||
while True:
|
||||
try:
|
||||
# Generate a random RGB image frame
|
||||
frame = np.random.randint(188, 256, (height, width, channels), dtype=np.uint8)
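# randint's upper bound is exclusive, so pixel values land in [188, 255], producing a bright test frame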
|
||||
|
||||
# Put the frame into the shared queue
|
||||
frame_queue.put(frame)
|
||||
# print("生成一帧数据,形状:", frame.shape, frame_queue.qsize())
|
||||
|
||||
# Pace the loop to simulate real-time output and avoid excessive CPU usage
|
||||
time.sleep(0.03)  # roughly 30 frames per second
|
||||
except Exception as e:
|
||||
print(f"生成帧时出错: {e}")
|
||||
break
|
||||
thread = threading.Thread(target=generate_frames, daemon=True)
|
||||
thread.start()
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
|
||||
|
||||
|
||||
|
||||
15
demo/echo_audio/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Echo Audio
|
||||
emoji: 🪩
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Simple echo stream - simplest FastRTC demo
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
45
demo/echo_audio/app.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import numpy as np
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import RedirectResponse
|
||||
from fastrtc import ReplyOnPause, Stream, get_twilio_turn_credentials
|
||||
from gradio.utils import get_space
|
||||
|
||||
|
||||
def detection(audio: tuple[int, np.ndarray]):
|
||||
# Implement any iterator that yields audio
|
||||
# See "LLM Voice Chat" for a more complete example
|
||||
yield audio
|
||||
|
||||
|
||||
stream = Stream(
|
||||
handler=ReplyOnPause(detection),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
concurrency_limit=5 if get_space() else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
)
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def index():
|
||||
return RedirectResponse(
|
||||
url="/ui" if not get_space() else "https://fastrtc-echo-audio.hf.space/ui/"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(port=7860)
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
3
demo/echo_audio/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
fastrtc[vad]
|
||||
twilio
|
||||
python-dotenv
|
||||
15
demo/gemini_audio_video/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Gemini Audio Video
|
||||
emoji: ♊️
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Gemini understands audio and video!
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
185
demo/gemini_audio_video/app.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
import time
|
||||
from io import BytesIO
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
AsyncAudioVideoStreamHandler,
|
||||
Stream,
|
||||
WebRTC,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from google import genai
|
||||
from gradio.utils import get_space
|
||||
from PIL import Image
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def encode_audio(data: np.ndarray) -> dict:
|
||||
"""Encode Audio data to send to the server"""
|
||||
return {
|
||||
"mime_type": "audio/pcm",
|
||||
"data": base64.b64encode(data.tobytes()).decode("UTF-8"),
|
||||
}
|
||||
|
||||
|
||||
def encode_image(data: np.ndarray) -> dict:
|
||||
with BytesIO() as output_bytes:
|
||||
pil_image = Image.fromarray(data)
|
||||
pil_image.save(output_bytes, "JPEG")
|
||||
bytes_data = output_bytes.getvalue()
|
||||
base64_str = str(base64.b64encode(bytes_data), "utf-8")
|
||||
return {"mime_type": "image/jpeg", "data": base64_str}
|
||||
|
||||
|
||||
class GeminiHandler(AsyncAudioVideoStreamHandler):
|
||||
def __init__(
|
||||
self,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
"mono",
|
||||
output_sample_rate=24000,
|
||||
output_frame_size=480,
|
||||
input_sample_rate=16000,
|
||||
)
|
||||
self.audio_queue = asyncio.Queue()
|
||||
self.video_queue = asyncio.Queue()
|
||||
self.quit = asyncio.Event()
|
||||
self.session = None
|
||||
self.last_frame_time = 0
|
||||
self.quit = asyncio.Event()
|
||||
|
||||
def copy(self) -> "GeminiHandler":
|
||||
return GeminiHandler()
|
||||
|
||||
async def start_up(self):
|
||||
client = genai.Client(
|
||||
api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
|
||||
)
|
||||
config = {"response_modalities": ["AUDIO"]}
|
||||
async with client.aio.live.connect(
|
||||
model="gemini-2.0-flash-exp", config=config
|
||||
) as session:
|
||||
self.session = session
|
||||
print("set session")
|
||||
while not self.quit.is_set():
|
||||
turn = self.session.receive()
|
||||
async for response in turn:
|
||||
if data := response.data:
|
||||
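# Gemini returns raw 16-bit PCM; reshape to (1, n) so it matches the mono output layout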
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
|
||||
self.audio_queue.put_nowait(audio)
|
||||
|
||||
async def video_receive(self, frame: np.ndarray):
|
||||
if self.session:
|
||||
# send image every 1 second
|
||||
print(time.time() - self.last_frame_time)
|
||||
if time.time() - self.last_frame_time > 1:
|
||||
self.last_frame_time = time.time()
|
||||
await self.session.send(input=encode_image(frame))
|
||||
if self.latest_args[1] is not None:
|
||||
await self.session.send(input=encode_image(self.latest_args[1]))
|
||||
|
||||
self.video_queue.put_nowait(frame)
|
||||
|
||||
async def video_emit(self):
|
||||
return await self.video_queue.get()
|
||||
|
||||
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
||||
_, array = frame
|
||||
array = array.squeeze()
|
||||
audio_message = encode_audio(array)
|
||||
if self.session:
|
||||
await self.session.send(input=audio_message)
|
||||
|
||||
async def emit(self):
|
||||
array = await self.audio_queue.get()
|
||||
return (self.output_sample_rate, array)
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
if self.session:
|
||||
self.quit.set()
|
||||
await self.session._websocket.close()
|
||||
self.quit.clear()
|
||||
|
||||
|
||||
stream = Stream(
|
||||
handler=GeminiHandler(),
|
||||
modality="audio-video",
|
||||
mode="send-receive",
|
||||
rtc_configuration=get_twilio_turn_credentials()
|
||||
if get_space() == "spaces"
|
||||
else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
additional_inputs=[
|
||||
gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
|
||||
],
|
||||
ui_args={
|
||||
"icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
||||
"pulse_color": "rgb(255, 255, 255)",
|
||||
"icon_button_color": "rgb(255, 255, 255)",
|
||||
"title": "Gemini Audio Video Chat",
|
||||
},
|
||||
)
|
||||
|
||||
css = """
|
||||
#video-source {max-width: 600px !important; max-height: 600px !important;}
|
||||
"""
|
||||
|
||||
with gr.Blocks(css=css) as demo:
|
||||
gr.HTML(
|
||||
"""
|
||||
<div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
|
||||
<div style="background-color: var(--block-background-fill); border-radius: 8px">
|
||||
<img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
|
||||
</div>
|
||||
<div>
|
||||
<h1>Gen AI SDK Voice Chat</h1>
|
||||
<p>Speak with Gemini using real-time audio + video streaming</p>
|
||||
<p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href="https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
|
||||
<p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
)
|
||||
with gr.Row() as row:
|
||||
with gr.Column():
|
||||
webrtc = WebRTC(
|
||||
label="Video Chat",
|
||||
modality="audio-video",
|
||||
mode="send-receive",
|
||||
elem_id="video-source",
|
||||
rtc_configuration=get_twilio_turn_credentials()
|
||||
if get_space() == "spaces"
|
||||
else None,
|
||||
icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
||||
pulse_color="rgb(255, 255, 255)",
|
||||
icon_button_color="rgb(255, 255, 255)",
|
||||
)
|
||||
with gr.Column():
|
||||
image_input = gr.Image(
|
||||
label="Image", type="numpy", sources=["upload", "clipboard"]
|
||||
)
|
||||
|
||||
webrtc.stream(
|
||||
GeminiHandler(),
|
||||
inputs=[webrtc, image_input],
|
||||
outputs=[webrtc],
|
||||
time_limit=60 if get_space() else None,
|
||||
concurrency_limit=2 if get_space() else None,
|
||||
)
|
||||
|
||||
stream.ui = demo
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
raise ValueError("Phone mode not supported for this demo")
|
||||
else:
|
||||
stream.ui.launch(server_port=7860)
|
||||
4
demo/gemini_audio_video/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastrtc
|
||||
python-dotenv
|
||||
google-genai
|
||||
twilio
|
||||
15
demo/gemini_conversation/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Gemini Talking to Gemini
|
||||
emoji: ♊️
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.17.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Have two Gemini agents talk to each other
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
232
demo/gemini_conversation/app.py
Normal file
@@ -0,0 +1,232 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
AsyncStreamHandler,
|
||||
Stream,
|
||||
get_tts_model,
|
||||
wait_for_item,
|
||||
)
|
||||
from fastrtc.utils import audio_to_int16
|
||||
from google import genai
|
||||
from google.genai.types import (
|
||||
Content,
|
||||
LiveConnectConfig,
|
||||
Part,
|
||||
PrebuiltVoiceConfig,
|
||||
SpeechConfig,
|
||||
VoiceConfig,
|
||||
)
|
||||
|
||||
load_dotenv()
|
||||
|
||||
cur_dir = Path(__file__).parent
|
||||
|
||||
SAMPLE_RATE = 24000
|
||||
|
||||
tts_model = get_tts_model()
|
||||
|
||||
|
||||
class GeminiHandler(AsyncStreamHandler):
|
||||
"""Handler for the Gemini API"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
expected_layout="mono",
|
||||
output_sample_rate=24000,
|
||||
output_frame_size=480,
|
||||
input_sample_rate=24000,
|
||||
)
|
||||
self.input_queue: asyncio.Queue = asyncio.Queue()
|
||||
self.output_queue: asyncio.Queue = asyncio.Queue()
|
||||
self.quit: asyncio.Event = asyncio.Event()
|
||||
|
||||
def copy(self) -> "GeminiHandler":
|
||||
return GeminiHandler()
|
||||
|
||||
async def start_up(self):
|
||||
voice_name = "Charon"
|
||||
client = genai.Client(
|
||||
api_key=os.getenv("GEMINI_API_KEY"),
|
||||
http_options={"api_version": "v1alpha"},
|
||||
)
|
||||
|
||||
config = LiveConnectConfig(
|
||||
response_modalities=["AUDIO"], # type: ignore
|
||||
speech_config=SpeechConfig(
|
||||
voice_config=VoiceConfig(
|
||||
prebuilt_voice_config=PrebuiltVoiceConfig(
|
||||
voice_name=voice_name,
|
||||
)
|
||||
)
|
||||
),
|
||||
system_instruction=Content(
|
||||
parts=[Part(text="You are a helpful assistant.")],
|
||||
role="system",
|
||||
),
|
||||
)
|
||||
async with client.aio.live.connect(
|
||||
model="gemini-2.0-flash-exp", config=config
|
||||
) as session:
|
||||
async for audio in session.start_stream(
|
||||
stream=self.stream(), mime_type="audio/pcm"
|
||||
):
|
||||
if audio.data:
|
||||
array = np.frombuffer(audio.data, dtype=np.int16)
|
||||
self.output_queue.put_nowait((self.output_sample_rate, array))
|
||||
|
||||
async def stream(self) -> AsyncGenerator[bytes, None]:
|
||||
while not self.quit.is_set():
|
||||
try:
|
||||
audio = await asyncio.wait_for(self.input_queue.get(), 0.1)
|
||||
yield audio
|
||||
except (asyncio.TimeoutError, TimeoutError):
|
||||
pass
|
||||
|
||||
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
||||
_, array = frame
|
||||
array = array.squeeze()
|
||||
audio_message = base64.b64encode(array.tobytes()).decode("UTF-8")
|
||||
self.input_queue.put_nowait(audio_message)
|
||||
|
||||
async def emit(self) -> tuple[int, np.ndarray] | None:
|
||||
return await wait_for_item(self.output_queue)
|
||||
|
||||
def shutdown(self) -> None:
|
||||
self.quit.set()
|
||||
|
||||
|
||||
class GeminiHandler2(GeminiHandler):
|
||||
async def start_up(self):
|
||||
starting_message = tts_model.tts("Can you help me make an omelette?")
|
||||
starting_message = librosa.resample(
|
||||
starting_message[1],
|
||||
orig_sr=starting_message[0],
|
||||
target_sr=self.output_sample_rate,
|
||||
)
|
||||
starting_message = audio_to_int16((self.output_sample_rate, starting_message))
|
||||
await self.output_queue.put((self.output_sample_rate, starting_message))
|
||||
voice_name = "Puck"
|
||||
client = genai.Client(
|
||||
api_key=os.getenv("GEMINI_API_KEY"),
|
||||
http_options={"api_version": "v1alpha"},
|
||||
)
|
||||
|
||||
config = LiveConnectConfig(
|
||||
response_modalities=["AUDIO"], # type: ignore
|
||||
speech_config=SpeechConfig(
|
||||
voice_config=VoiceConfig(
|
||||
prebuilt_voice_config=PrebuiltVoiceConfig(
|
||||
voice_name=voice_name,
|
||||
)
|
||||
)
|
||||
),
|
||||
system_instruction=Content(
|
||||
parts=[
|
||||
Part(
|
||||
text="You are a cooking student who wants to learn how to make an omelette."
|
||||
),
|
||||
Part(
|
||||
text="You are currently in the kitchen with a teacher who is helping you make an omelette."
|
||||
),
|
||||
Part(
|
||||
text="Please wait for the teacher to tell you what to do next. Follow the teacher's instructions carefully."
|
||||
),
|
||||
],
|
||||
role="system",
|
||||
),
|
||||
)
|
||||
async with client.aio.live.connect(
|
||||
model="gemini-2.0-flash-exp", config=config
|
||||
) as session:
|
||||
async for audio in session.start_stream(
|
||||
stream=self.stream(), mime_type="audio/pcm"
|
||||
):
|
||||
if audio.data:
|
||||
array = np.frombuffer(audio.data, dtype=np.int16)
|
||||
self.output_queue.put_nowait((self.output_sample_rate, array))
|
||||
|
||||
def copy(self) -> "GeminiHandler2":
|
||||
return GeminiHandler2()
|
||||
|
||||
|
||||
gemini_stream = Stream(
|
||||
GeminiHandler(),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
ui_args={
|
||||
"title": "Gemini Teacher",
|
||||
"icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
||||
"pulse_color": "rgb(74, 138, 213)",
|
||||
"icon_button_color": "rgb(255, 255, 255)",
|
||||
},
|
||||
)
|
||||
|
||||
gemini_stream_2 = Stream(
|
||||
GeminiHandler2(),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
ui_args={
|
||||
"title": "Gemini Student",
|
||||
"icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
||||
"pulse_color": "rgb(132, 112, 196)",
|
||||
"icon_button_color": "rgb(255, 255, 255)",
|
||||
},
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import gradio as gr
|
||||
from gradio.utils import get_space
|
||||
|
||||
if not get_space():
|
||||
with gr.Blocks() as demo:
|
||||
gr.HTML(
|
||||
"""
|
||||
<div style="display: flex; justify-content: center; align-items: center;">
|
||||
<h1>Gemini Conversation</h1>
|
||||
</div>
|
||||
"""
|
||||
)
|
||||
gr.Markdown(
|
||||
"""# How to run this demo
|
||||
|
||||
- Clone the repo - in the top right of the page, click the vertical three dots and select "Clone repository"
|
||||
- Open the repo in a terminal and install the dependencies
|
||||
- Get a Gemini API key [here](https://ai.google.dev/gemini-api/docs/api-key)
|
||||
- Create a `.env` file in the root of the repo and add the following:
|
||||
```
|
||||
GEMINI_API_KEY=<your_gemini_api_key>
|
||||
```
|
||||
- Run the app with `python app.py`
|
||||
- This will print the two URLs of the agents running locally
|
||||
- Use ngrok to expose one agent to the internet so that you can access it from your phone
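For example, assuming the agent's UI is running on the default port 7860:
```
ngrok http 7860
```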
|
||||
- Use the ngrok URL to access the agent from your phone
|
||||
- Now, start the "teacher Gemini" agent first, then start the "student Gemini" agent. The student will start talking to the teacher, and the teacher will respond!
|
||||
|
||||
Important:
|
||||
- Make sure the audio sources are not too close to each other or too loud. Sometimes that causes them to talk over each other.
|
||||
- Feel free to modify the `system_instruction` to change the behavior of the agents.
|
||||
- You can also modify the `voice_name` to change the voice of the agents.
|
||||
- Have fun!
|
||||
"""
|
||||
)
|
||||
demo.launch()
|
||||
|
||||
import time
|
||||
|
||||
_ = gemini_stream.ui.launch(server_port=7860, prevent_thread_lock=True)
|
||||
_ = gemini_stream_2.ui.launch(server_port=7861, prevent_thread_lock=True)
|
||||
try:
|
||||
while True:
|
||||
time.sleep(1)
|
||||
except KeyboardInterrupt:
|
||||
gemini_stream.ui.close()
|
||||
gemini_stream_2.ui.close()
|
||||
15
demo/hello_computer/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Hello Computer
|
||||
emoji: 💻
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Say computer before asking your question
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
15
demo/hello_computer/README_gradio.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Hello Computer (Gradio)
|
||||
emoji: 💻
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Say computer (Gradio)
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
145
demo/hello_computer/app.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import huggingface_hub
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnStopWords,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from pydantic import BaseModel
|
||||
|
||||
load_dotenv()
|
||||
|
||||
curr_dir = Path(__file__).parent
|
||||
|
||||
|
||||
client = huggingface_hub.InferenceClient(
|
||||
api_key=os.environ.get("SAMBANOVA_API_KEY"),
|
||||
provider="sambanova",
|
||||
)
|
||||
model = get_stt_model()
|
||||
|
||||
|
||||
def response(
|
||||
audio: tuple[int, np.ndarray],
|
||||
gradio_chatbot: list[dict] | None = None,
|
||||
conversation_state: list[dict] | None = None,
|
||||
):
|
||||
gradio_chatbot = gradio_chatbot or []
|
||||
conversation_state = conversation_state or []
|
||||
text = model.stt(audio)
|
||||
print("STT in handler", text)
|
||||
sample_rate, array = audio
|
||||
gradio_chatbot.append(
|
||||
{"role": "user", "content": gr.Audio((sample_rate, array.squeeze()))}
|
||||
)
|
||||
yield AdditionalOutputs(gradio_chatbot, conversation_state)
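# Emit the updated chat state right away so the UI shows the user's audio before the LLM reply is ready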
|
||||
|
||||
conversation_state.append({"role": "user", "content": text})
|
||||
|
||||
request = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
messages=conversation_state, # type: ignore
|
||||
temperature=0.1,
|
||||
top_p=0.1,
|
||||
)
|
||||
response = {"role": "assistant", "content": request.choices[0].message.content}
|
||||
|
||||
conversation_state.append(response)
|
||||
gradio_chatbot.append(response)
|
||||
|
||||
yield AdditionalOutputs(gradio_chatbot, conversation_state)
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages", value=[])
|
||||
state = gr.State(value=[])
|
||||
stream = Stream(
|
||||
ReplyOnStopWords(
|
||||
response, # type: ignore
|
||||
stop_words=["computer"],
|
||||
input_sample_rate=16000,
|
||||
),
|
||||
mode="send",
|
||||
modality="audio",
|
||||
additional_inputs=[chatbot, state],
|
||||
additional_outputs=[chatbot, state],
|
||||
additional_outputs_handler=lambda *a: (a[2], a[3]),
|
||||
concurrency_limit=5 if get_space() else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
)
|
||||
app = FastAPI()
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class InputData(BaseModel):
|
||||
webrtc_id: str
|
||||
chatbot: list[Message]
|
||||
state: list[Message]
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = get_twilio_turn_credentials() if get_space() else None
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content)
|
||||
|
||||
|
||||
@app.post("/input_hook")
|
||||
async def _(data: InputData):
|
||||
body = data.model_dump()
|
||||
stream.set_input(data.webrtc_id, body["chatbot"], body["state"])
|
||||
|
||||
|
||||
def audio_to_base64(file_path):
|
||||
audio_format = "wav"
|
||||
with open(file_path, "rb") as audio_file:
|
||||
encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")
|
||||
return f"data:audio/{audio_format};base64,{encoded_audio}"
|
||||
|
||||
|
||||
@app.get("/outputs")
|
||||
async def _(webrtc_id: str):
|
||||
async def output_stream():
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
chatbot = output.args[0]
|
||||
state = output.args[1]
|
||||
data = {
|
||||
"message": state[-1],
|
||||
"audio": audio_to_base64(chatbot[-1]["content"].value["path"])
|
||||
if chatbot[-1]["role"] == "user"
|
||||
else None,
|
||||
}
|
||||
yield f"event: output\ndata: {json.dumps(data)}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
raise ValueError("Phone mode not supported")
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
486
demo/hello_computer/index.html
Normal file
@@ -0,0 +1,486 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Hello Computer 💻</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
background-color: #f8f9fa;
|
||||
color: #1a1a1a;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
height: calc(100% - 100px);
|
||||
}
|
||||
|
||||
.logo {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
padding: 20px;
|
||||
height: 90%;
|
||||
box-sizing: border-box;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 20px;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 20px;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #e9ecef;
|
||||
margin-left: 20%;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #f1f3f5;
|
||||
margin-right: 20%;
|
||||
}
|
||||
|
||||
.controls {
|
||||
text-align: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
button {
|
||||
background-color: #0066cc;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 24px;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
border-radius: 4px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
background-color: #0052a3;
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid #ffffff;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: #ffffff;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for typing indicator */
|
||||
.typing-indicator {
|
||||
padding: 8px;
|
||||
background-color: #f1f3f5;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 10px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.dots {
|
||||
display: inline-flex;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
background-color: #0066cc;
|
||||
border-radius: 50%;
|
||||
animation: pulse 1.5s infinite;
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.5s;
|
||||
}
|
||||
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 1s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.5;
|
||||
transform: scale(1);
|
||||
}
|
||||
|
||||
50% {
|
||||
opacity: 1;
|
||||
transform: scale(1.2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<div class="logo">
|
||||
<h1>Hello Computer 💻</h1>
|
||||
<h2 style="font-size: 1.2em; color: #666; margin-top: 10px;">Say 'Computer' before asking your question</h2>
|
||||
</div>
|
||||
<div class="chat-container">
|
||||
<div class="chat-messages" id="chat-messages"></div>
|
||||
<div class="typing-indicator" id="typing-indicator">
|
||||
<div class="dots">
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="start-button">Start Conversation</button>
|
||||
</div>
|
||||
</div>
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
|
||||
let audioLevel = 0;
|
||||
let animationFrame;
|
||||
let audioContext, analyser, audioSource;
|
||||
let messages = [];
|
||||
let eventSource;
|
||||
|
||||
function updateButtonState() {
|
||||
const button = document.getElementById('start-button');
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
button.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
button.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
button.innerHTML = 'Start Conversation';
|
||||
}
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser = audioContext.createAnalyser();
|
||||
audioSource = audioContext.createMediaStreamSource(stream);
|
||||
audioSource.connect(analyser);
|
||||
analyser.fftSize = 64;
|
||||
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
|
||||
audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationFrame = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
function handleMessage(event) {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
const typingIndicator = document.getElementById('typing-indicator');
|
||||
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
chatbot: messages,
|
||||
state: messages
|
||||
})
|
||||
});
|
||||
} else if (eventJson.type === "log") {
|
||||
if (eventJson.data === "pause_detected") {
|
||||
typingIndicator.style.display = 'block';
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
} else if (eventJson.data === "response_starting") {
|
||||
typingIndicator.style.display = 'none';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
setupAudioVisualization(stream);
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = handleMessage;
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
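// Wait for ICE gathering to complete so the offer sent to the server contains all candidates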
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
console.log(eventJson);
|
||||
messages.push(eventJson.message);
|
||||
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
|
||||
});
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessage(role, content) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
|
||||
if (role === 'user') {
|
||||
// Create audio element for user messages
|
||||
const audio = document.createElement('audio');
|
||||
audio.controls = true;
|
||||
audio.src = content;
|
||||
messageDiv.appendChild(audio);
|
||||
} else {
|
||||
// Text content for assistant messages
|
||||
messageDiv.textContent = content;
|
||||
}
|
||||
|
||||
chatMessages.appendChild(messageDiv);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (eventSource) {
|
||||
eventSource.close();
|
||||
eventSource = null;
|
||||
}
|
||||
|
||||
if (animationFrame) {
|
||||
cancelAnimationFrame(animationFrame);
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
audioContext = null;
|
||||
analyser = null;
|
||||
audioSource = null;
|
||||
}
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (peerConnection.getSenders) {
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.stop) sender.track.stop();
|
||||
});
|
||||
}
|
||||
peerConnection.close();
|
||||
}
|
||||
updateButtonState();
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') {
|
||||
setupWebRTC();
|
||||
} else {
|
||||
stop();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
4
demo/hello_computer/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastrtc[stopword]
|
||||
python-dotenv
|
||||
huggingface_hub>=0.29.0
|
||||
twilio
|
||||
16
demo/llama_code_editor/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
title: Llama Code Editor
|
||||
emoji: 🦙
|
||||
colorFrom: indigo
|
||||
colorTo: pink
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Create interactive HTML web pages with your voice
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN,
|
||||
secret|SAMBANOVA_API_KEY, secret|GROQ_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
45
demo/llama_code_editor/app.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import RedirectResponse
|
||||
from fastrtc import Stream
|
||||
from gradio.utils import get_space
|
||||
|
||||
try:
|
||||
from demo.llama_code_editor.handler import (
|
||||
CodeHandler,
|
||||
)
|
||||
from demo.llama_code_editor.ui import demo as ui
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
from handler import CodeHandler
|
||||
from ui import demo as ui
|
||||
|
||||
|
||||
stream = Stream(
|
||||
handler=CodeHandler,
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
concurrency_limit=10 if get_space() else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
)
|
||||
|
||||
stream.ui = ui
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
url = "/ui" if not get_space() else "https://fastrtc-llama-code-editor.hf.space/ui/"
|
||||
return RedirectResponse(url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860, server_name="0.0.0.0")
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
37
demo/llama_code_editor/assets/sandbox.html
Normal file
@@ -0,0 +1,37 @@
|
||||
<div style="
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-height: 400px;
|
||||
background: linear-gradient(135deg, #f5f7fa 0%, #e4e8ec 100%);
|
||||
border-radius: 8px;
|
||||
border: 2px dashed #cbd5e1;
|
||||
padding: 2rem;
|
||||
text-align: center;
|
||||
color: #64748b;
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
">
|
||||
<div style="
|
||||
width: 80px;
|
||||
height: 80px;
|
||||
margin-bottom: 1.5rem;
|
||||
border: 3px solid #cbd5e1;
|
||||
border-radius: 12px;
|
||||
position: relative;
|
||||
">
|
||||
<div style="
|
||||
position: absolute;
|
||||
top: 50%;
|
||||
left: 50%;
|
||||
transform: translate(-50%, -50%);
|
||||
font-size: 2rem;
|
||||
">📦</div>
|
||||
</div>
|
||||
<h2 style="
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 1.5rem;
|
||||
font-weight: 600;
|
||||
color: #475569;
|
||||
">No Application Created</h2>
|
||||
</div>
|
||||
60
demo/llama_code_editor/assets/spinner.html
Normal file
@@ -0,0 +1,60 @@
|
||||
<div style="
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-height: 400px;
|
||||
background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%);
|
||||
border-radius: 8px;
|
||||
padding: 2rem;
|
||||
text-align: center;
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
">
|
||||
<!-- Spinner container -->
|
||||
<div style="
|
||||
position: relative;
|
||||
width: 64px;
|
||||
height: 64px;
|
||||
margin-bottom: 1.5rem;
|
||||
">
|
||||
<!-- Static ring -->
|
||||
<div style="
|
||||
position: absolute;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
border: 4px solid #e2e8f0;
|
||||
border-radius: 50%;
|
||||
"></div>
|
||||
<!-- Animated spinner -->
|
||||
<div style="
|
||||
position: absolute;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
border: 4px solid transparent;
|
||||
border-top-color: #3b82f6;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
"></div>
|
||||
</div>
|
||||
|
||||
<!-- Text content -->
|
||||
<h2 style="
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 1.25rem;
|
||||
font-weight: 600;
|
||||
color: #475569;
|
||||
">Generating your application...</h2>
|
||||
|
||||
<p style="
|
||||
margin: 0;
|
||||
font-size: 0.875rem;
|
||||
color: #64748b;
|
||||
">This may take a few moments</p>
|
||||
|
||||
<style>
|
||||
@keyframes spin {
|
||||
0% { transform: rotate(0deg); }
|
||||
100% { transform: rotate(360deg); }
|
||||
}
|
||||
</style>
|
||||
</div>
|
||||
73
demo/llama_code_editor/handler.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import openai
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
audio_to_bytes,
|
||||
)
|
||||
from groq import Groq
|
||||
|
||||
load_dotenv()
|
||||
|
||||
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
||||
|
||||
client = openai.OpenAI(
|
||||
api_key=os.environ.get("SAMBANOVA_API_KEY"),
|
||||
base_url="https://api.sambanova.ai/v1",
|
||||
)
|
||||
|
||||
path = Path(__file__).parent / "assets"
|
||||
|
||||
spinner_html = open(path / "spinner.html").read()
|
||||
|
||||
|
||||
system_prompt = "You are an AI coding assistant. Your task is to write single-file HTML applications based on a user's request. Only return the necessary code. Include all necessary imports and styles. You may also be asked to edit your original response."
|
||||
user_prompt = "Please write a single-file HTML application to fulfill the following request.\nThe message:{user_message}\nCurrent code you have written:{code}"
|
||||
|
||||
|
||||
def extract_html_content(text):
|
||||
"""
|
||||
Extract content including HTML tags.
|
||||
"""
|
||||
match = re.search(r"<!DOCTYPE html>.*?</html>", text, re.DOTALL)
|
||||
return match.group(0) if match else None
|
||||
|
||||
|
||||
def display_in_sandbox(code):
|
||||
encoded_html = base64.b64encode(code.encode("utf-8")).decode("utf-8")
|
||||
data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
|
||||
return f'<iframe src="{data_uri}" width="100%" height="600px"></iframe>'
|
||||
|
||||
|
||||
def generate(user_message: tuple[int, np.ndarray], history: list[dict], code: str):
|
||||
yield AdditionalOutputs(history, spinner_html)
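# Immediately show the loading spinner in the sandbox while transcription and code generation run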
|
||||
|
||||
text = groq_client.audio.transcriptions.create(
|
||||
file=("audio-file.mp3", audio_to_bytes(user_message)),
|
||||
model="whisper-large-v3-turbo",
|
||||
response_format="verbose_json",
|
||||
).text
|
||||
|
||||
user_msg_formatted = user_prompt.format(user_message=text, code=code)
|
||||
history.append({"role": "user", "content": user_msg_formatted})
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="Meta-Llama-3.1-70B-Instruct",
|
||||
messages=history, # type: ignore
|
||||
temperature=0.1,
|
||||
top_p=0.1,
|
||||
)
|
||||
|
||||
output = response.choices[0].message.content
|
||||
html_code = extract_html_content(output)
|
||||
history.append({"role": "assistant", "content": output})
|
||||
yield AdditionalOutputs(history, html_code)
|
||||
|
||||
|
||||
CodeHandler = ReplyOnPause(generate) # type: ignore
|
||||
5
demo/llama_code_editor/requirements.in
Normal file
@@ -0,0 +1,5 @@
|
||||
fastrtc[vad]
|
||||
groq
|
||||
openai
|
||||
python-dotenv
|
||||
twilio
|
||||
295
demo/llama_code_editor/requirements.txt
Normal file
@@ -0,0 +1,295 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile demo/llama_code_editor/requirements.in -o demo/llama_code_editor/requirements.txt
|
||||
aiofiles==23.2.1
|
||||
# via gradio
|
||||
aiohappyeyeballs==2.4.6
|
||||
# via aiohttp
|
||||
aiohttp==3.11.12
|
||||
# via
|
||||
# aiohttp-retry
|
||||
# twilio
|
||||
aiohttp-retry==2.9.1
|
||||
# via twilio
|
||||
aioice==0.9.0
|
||||
# via aiortc
|
||||
aiortc==1.10.1
|
||||
# via fastrtc
|
||||
aiosignal==1.3.2
|
||||
# via aiohttp
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.6.2.post1
|
||||
# via
|
||||
# gradio
|
||||
# groq
|
||||
# httpx
|
||||
# openai
|
||||
# starlette
|
||||
attrs==25.1.0
|
||||
# via aiohttp
|
||||
audioread==3.0.1
|
||||
# via librosa
|
||||
av==12.3.0
|
||||
# via aiortc
|
||||
certifi==2024.8.30
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.17.1
|
||||
# via
|
||||
# aiortc
|
||||
# cryptography
|
||||
# pylibsrtp
|
||||
# soundfile
|
||||
charset-normalizer==3.4.0
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via
|
||||
# typer
|
||||
# uvicorn
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
cryptography==43.0.3
|
||||
# via
|
||||
# aiortc
|
||||
# pyopenssl
|
||||
decorator==5.1.1
|
||||
# via librosa
|
||||
distro==1.9.0
|
||||
# via
|
||||
# groq
|
||||
# openai
|
||||
dnspython==2.7.0
|
||||
# via aioice
|
||||
fastapi==0.115.5
|
||||
# via gradio
|
||||
fastrtc==0.0.2.post4
|
||||
# via -r demo/llama_code_editor/requirements.in
|
||||
ffmpy==0.4.0
|
||||
# via gradio
|
||||
filelock==3.16.1
|
||||
# via huggingface-hub
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
frozenlist==1.5.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.10.0
|
||||
# via
|
||||
# gradio-client
|
||||
# huggingface-hub
|
||||
google-crc32c==1.6.0
|
||||
# via aiortc
|
||||
gradio==5.16.0
|
||||
# via fastrtc
|
||||
gradio-client==1.7.0
|
||||
# via gradio
|
||||
groq==0.18.0
|
||||
# via -r demo/llama_code_editor/requirements.in
|
||||
h11==0.14.0
|
||||
# via
|
||||
# httpcore
|
||||
# uvicorn
|
||||
httpcore==1.0.7
|
||||
# via httpx
|
||||
httpx==0.27.2
|
||||
# via
|
||||
# gradio
|
||||
# gradio-client
|
||||
# groq
|
||||
# openai
|
||||
# safehttpx
|
||||
huggingface-hub==0.28.1
|
||||
# via
|
||||
# gradio
|
||||
# gradio-client
|
||||
humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.10
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
ifaddr==0.2.0
|
||||
# via aioice
|
||||
jinja2==3.1.4
|
||||
# via gradio
|
||||
jiter==0.7.1
|
||||
# via openai
|
||||
joblib==1.4.2
|
||||
# via
|
||||
# librosa
|
||||
# scikit-learn
|
||||
lazy-loader==0.4
|
||||
# via librosa
|
||||
librosa==0.10.2.post1
|
||||
# via fastrtc
|
||||
llvmlite==0.43.0
|
||||
# via numba
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# gradio
|
||||
# jinja2
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
msgpack==1.1.0
|
||||
# via librosa
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
numba==0.60.0
|
||||
# via librosa
|
||||
numpy==2.0.2
|
||||
# via
|
||||
# gradio
|
||||
# librosa
|
||||
# numba
|
||||
# onnxruntime
|
||||
# pandas
|
||||
# scikit-learn
|
||||
# scipy
|
||||
# soxr
|
||||
onnxruntime==1.20.1
|
||||
# via fastrtc
|
||||
openai==1.54.4
|
||||
# via -r demo/llama_code_editor/requirements.in
|
||||
orjson==3.10.11
|
||||
# via gradio
|
||||
packaging==24.2
|
||||
# via
|
||||
# gradio
|
||||
# gradio-client
|
||||
# huggingface-hub
|
||||
# lazy-loader
|
||||
# onnxruntime
|
||||
# pooch
|
||||
pandas==2.2.3
|
||||
# via gradio
|
||||
pillow==11.0.0
|
||||
# via gradio
|
||||
platformdirs==4.3.6
|
||||
# via pooch
|
||||
pooch==1.8.2
|
||||
# via librosa
|
||||
propcache==0.2.1
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
protobuf==5.28.3
|
||||
# via onnxruntime
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==2.9.2
|
||||
# via
|
||||
# fastapi
|
||||
# gradio
|
||||
# groq
|
||||
# openai
|
||||
pydantic-core==2.23.4
|
||||
# via pydantic
|
||||
pydub==0.25.1
|
||||
# via gradio
|
||||
pyee==12.1.1
|
||||
# via aiortc
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyjwt==2.10.1
|
||||
# via twilio
|
||||
pylibsrtp==0.10.0
|
||||
# via aiortc
|
||||
pyopenssl==24.2.1
|
||||
# via aiortc
|
||||
python-dateutil==2.9.0.post0
|
||||
# via pandas
|
||||
python-dotenv==1.0.1
|
||||
# via -r demo/llama_code_editor/requirements.in
|
||||
python-multipart==0.0.20
|
||||
# via gradio
|
||||
pytz==2024.2
|
||||
# via pandas
|
||||
pyyaml==6.0.2
|
||||
# via
|
||||
# gradio
|
||||
# huggingface-hub
|
||||
requests==2.32.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pooch
|
||||
# twilio
|
||||
rich==13.9.4
|
||||
# via typer
|
||||
ruff==0.9.6
|
||||
# via gradio
|
||||
safehttpx==0.1.6
|
||||
# via gradio
|
||||
scikit-learn==1.5.2
|
||||
# via librosa
|
||||
scipy==1.14.1
|
||||
# via
|
||||
# librosa
|
||||
# scikit-learn
|
||||
semantic-version==2.10.0
|
||||
# via gradio
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# groq
|
||||
# httpx
|
||||
# openai
|
||||
soundfile==0.12.1
|
||||
# via librosa
|
||||
soxr==0.5.0.post1
|
||||
# via librosa
|
||||
starlette==0.42.0
|
||||
# via
|
||||
# fastapi
|
||||
# gradio
|
||||
sympy==1.13.3
|
||||
# via onnxruntime
|
||||
threadpoolctl==3.5.0
|
||||
# via scikit-learn
|
||||
tomlkit==0.12.0
|
||||
# via gradio
|
||||
tqdm==4.67.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# openai
|
||||
twilio==9.4.5
|
||||
# via -r demo/llama_code_editor/requirements.in
|
||||
typer==0.13.1
|
||||
# via gradio
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# fastapi
|
||||
# gradio
|
||||
# gradio-client
|
||||
# groq
|
||||
# huggingface-hub
|
||||
# librosa
|
||||
# openai
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# pyee
|
||||
# typer
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
urllib3==2.2.3
|
||||
# via requests
|
||||
uvicorn==0.32.0
|
||||
# via gradio
|
||||
websockets==12.0
|
||||
# via gradio-client
|
||||
yarl==1.18.3
|
||||
# via aiohttp
|
||||
75
demo/llama_code_editor/ui.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import WebRTC, get_twilio_turn_credentials
|
||||
from gradio.utils import get_space
|
||||
|
||||
try:
|
||||
from demo.llama_code_editor.handler import (
|
||||
CodeHandler,
|
||||
display_in_sandbox,
|
||||
system_prompt,
|
||||
)
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
from handler import CodeHandler, display_in_sandbox, system_prompt
|
||||
|
||||
load_dotenv()
|
||||
|
||||
path = Path(__file__).parent / "assets"
|
||||
|
||||
with gr.Blocks(css=".code-component {max-height: 500px !important}") as demo:
|
||||
history = gr.State([{"role": "system", "content": system_prompt}])
|
||||
with gr.Row():
|
||||
with gr.Column(scale=1):
|
||||
gr.HTML(
|
||||
"""
|
||||
<h1 style='text-align: center'>
|
||||
Llama Code Editor
|
||||
</h1>
|
||||
<h2 style='text-align: center'>
|
||||
Powered by SambaNova and Gradio-WebRTC ⚡️
|
||||
</h2>
|
||||
<p style='text-align: center'>
|
||||
Create and edit single-file HTML applications with just your voice!
|
||||
</p>
|
||||
<p style='text-align: center'>
|
||||
Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
|
||||
</p>
|
||||
"""
|
||||
)
|
||||
webrtc = WebRTC(
|
||||
rtc_configuration=get_twilio_turn_credentials()
|
||||
if get_space()
|
||||
else None,
|
||||
mode="send",
|
||||
modality="audio",
|
||||
)
|
||||
with gr.Column(scale=10):
|
||||
with gr.Tabs():
|
||||
with gr.Tab("Sandbox"):
|
||||
sandbox = gr.HTML(value=open(path / "sandbox.html").read())
|
||||
with gr.Tab("Code"):
|
||||
code = gr.Code(
|
||||
language="html",
|
||||
max_lines=50,
|
||||
interactive=False,
|
||||
elem_classes="code-component",
|
||||
)
|
||||
with gr.Tab("Chat"):
|
||||
cb = gr.Chatbot(type="messages")
|
||||
|
||||
webrtc.stream(
|
||||
CodeHandler,
|
||||
inputs=[webrtc, history, code],
|
||||
outputs=[webrtc],
|
||||
time_limit=90 if get_space() else None,
|
||||
concurrency_limit=10 if get_space() else None,
|
||||
)
|
||||
webrtc.on_additional_outputs(
|
||||
lambda history, code: (history, code, history), outputs=[history, code, cb]
|
||||
)
|
||||
code.change(display_in_sandbox, code, sandbox, queue=False)
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
15
demo/llm_voice_chat/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: LLM Voice Chat
|
||||
emoji: 💻
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Talk to an LLM with ElevenLabs
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY, secret|ELEVENLABS_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
15
demo/llm_voice_chat/README_gradio.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: LLM Voice Chat (Gradio)
|
||||
emoji: 💻
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: LLM Voice by ElevenLabs (Gradio)
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY, secret|ELEVENLABS_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
97
demo/llm_voice_chat/app.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import os
|
||||
import time
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from elevenlabs import ElevenLabs
|
||||
from fastapi import FastAPI
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from groq import Groq
|
||||
from numpy.typing import NDArray
|
||||
|
||||
load_dotenv()
|
||||
groq_client = Groq()
|
||||
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
|
||||
stt_model = get_stt_model()
|
||||
|
||||
|
||||
# See "Talk to Claude" in Cookbook for an example of how to keep
|
||||
# track of the chat history.
|
||||
def response(
|
||||
audio: tuple[int, NDArray[np.int16 | np.float32]],
|
||||
chatbot: list[dict] | None = None,
|
||||
):
|
||||
chatbot = chatbot or []
|
||||
messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
|
||||
start = time.time()
|
||||
text = stt_model.stt(audio)
|
||||
print("transcription", time.time() - start)
|
||||
print("prompt", text)
|
||||
chatbot.append({"role": "user", "content": text})
|
||||
yield AdditionalOutputs(chatbot)
|
||||
messages.append({"role": "user", "content": text})
|
||||
response_text = (
|
||||
groq_client.chat.completions.create(
|
||||
model="llama-3.1-8b-instant",
|
||||
max_tokens=200,
|
||||
messages=messages, # type: ignore
|
||||
)
|
||||
.choices[0]
|
||||
.message.content
|
||||
)
|
||||
|
||||
chatbot.append({"role": "assistant", "content": response_text})
|
||||
|
||||
for i, chunk in enumerate(
|
||||
tts_client.text_to_speech.convert_as_stream(
|
||||
text=response_text, # type: ignore
|
||||
voice_id="JBFqnCBsd6RMkjVDRZzb",
|
||||
model_id="eleven_multilingual_v2",
|
||||
output_format="pcm_24000",
|
||||
)
|
||||
):
|
||||
if i == 0:
|
||||
yield AdditionalOutputs(chatbot)
|
||||
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
|
||||
yield (24000, audio_array)
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages")
|
||||
stream = Stream(
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
handler=ReplyOnPause(response, input_sample_rate=16000),
|
||||
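# additional_outputs_handler receives the (previous, latest) values of the
# additional outputs; returning the second argument keeps only the newest
# chatbot state.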
additional_outputs_handler=lambda a, b: b,
|
||||
additional_inputs=[chatbot],
|
||||
additional_outputs=[chatbot],
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
concurrency_limit=5 if get_space() else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
ui_args={"title": "LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)"},
|
||||
)
|
||||
|
||||
# Mount the Stream's auto-generated Gradio UI on the FastAPI app
|
||||
# so the demo does not need a hand-built frontend.
|
||||
app = FastAPI()
|
||||
app = gr.mount_gradio_app(app, stream.ui, path="/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
os.environ["GRADIO_SSR_MODE"] = "false"
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
stream.ui.launch(server_port=7860)
|
||||
6
demo/llm_voice_chat/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
fastrtc[stopword]
|
||||
python-dotenv
|
||||
openai
|
||||
twilio
|
||||
groq
|
||||
elevenlabs
|
||||
16
demo/moonshine_live/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
title: Moonshine Live Transcription
|
||||
emoji: 🌕
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.17.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Real-time captions with Moonshine ONNX
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN]
|
||||
models: [onnx-community/moonshine-base-ONNX, UsefulSensors/moonshine-base]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
73
demo/moonshine_live/app.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from functools import lru_cache
|
||||
from typing import Generator, Literal
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
audio_to_float32,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from moonshine_onnx import MoonshineOnnxModel, load_tokenizer
|
||||
from numpy.typing import NDArray
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def load_moonshine(
|
||||
model_name: Literal["moonshine/base", "moonshine/tiny"],
|
||||
) -> MoonshineOnnxModel:
|
||||
return MoonshineOnnxModel(model_name=model_name)
|
||||
|
||||
|
||||
tokenizer = load_tokenizer()
|
||||
|
||||
|
||||
def stt(
|
||||
audio: tuple[int, NDArray[np.int16 | np.float32]],
|
||||
model_name: Literal["moonshine/base", "moonshine/tiny"],
|
||||
captions: str,
|
||||
) -> Generator[AdditionalOutputs, None, None]:
|
||||
moonshine = load_moonshine(model_name)
|
||||
sr, audio_np = audio # type: ignore
|
||||
if audio_np.dtype == np.int16:
|
||||
audio_np = audio_to_float32(audio)
|
||||
if audio_np.ndim == 1:
|
||||
audio_np = audio_np.reshape(1, -1)
|
||||
tokens = moonshine.generate(audio_np)
|
||||
yield AdditionalOutputs(
|
||||
(captions + "\n" + tokenizer.decode_batch(tokens)[0]).strip()
|
||||
)
|
||||
|
||||
|
||||
captions = gr.Textbox(label="Captions")
|
||||
stream = Stream(
|
||||
ReplyOnPause(stt, input_sample_rate=16000),
|
||||
modality="audio",
|
||||
mode="send",
|
||||
ui_args={
|
||||
"title": "Live Captions by Moonshine",
|
||||
"icon": "default-favicon.ico",
|
||||
"icon_button_color": "#5c5c5c",
|
||||
"pulse_color": "#a7c6fc",
|
||||
"icon_radius": 0,
|
||||
},
|
||||
rtc_configuration=get_twilio_turn_credentials(),
|
||||
additional_inputs=[
|
||||
gr.Radio(
|
||||
choices=["moonshine/base", "moonshine/tiny"],
|
||||
value="moonshine/base",
|
||||
label="Model",
|
||||
),
|
||||
captions,
|
||||
],
|
||||
additional_outputs=[captions],
|
||||
additional_outputs_handler=lambda prev, current: (prev + "\n" + current).strip(),
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
stream.ui.launch()
|
||||
BIN
demo/moonshine_live/default-favicon.ico
Normal file
|
After Width: | Height: | Size: 6.4 KiB |
3
demo/moonshine_live/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
fastrtc[vad]
|
||||
useful-moonshine-onnx@git+https://git@github.com/usefulsensors/moonshine.git#subdirectory=moonshine-onnx
|
||||
twilio
|
||||
74
demo/nextjs_voice_chat/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# FastRTC POC
|
||||
A simple POC for a fast, real-time voice chat application using FastAPI and FastRTC by [rohanprichard](https://github.com/rohanprichard). I wanted to provide an example built on a more production-oriented stack (FastAPI plus a Next.js frontend), rather than just Gradio.
|
||||
|
||||
## Setup
|
||||
1. Set your API keys in an `.env` file based on the `.env.example` file
|
||||
2. Create a virtual environment and install the dependencies
|
||||
```bash
|
||||
python3 -m venv env
|
||||
source env/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Run the server
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
4. Navigate into the frontend directory in another terminal
|
||||
```bash
|
||||
cd frontend/fastrtc-demo
|
||||
```
|
||||
5. Run the frontend
|
||||
```bash
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
6. Go to the URL and click the microphone icon to start chatting!
|
||||
|
||||
7. Reset chats by clicking the trash button on the bottom right
|
||||
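The trash button simply calls the backend's `/reset` endpoint (see `backend/server.py`). Assuming the server started by `./run.sh` is listening on the default `localhost:8000`, you can trigger the same reset by hand, for example:

```python
# Manual reset, equivalent to clicking the trash button.
from urllib.request import urlopen

print(urlopen("http://localhost:8000/reset").read().decode())  # -> {"status": "success"}
```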
|
||||
## Notes
|
||||
You can choose not to install the optional TTS and STT dependencies by removing `[tts, stt]` from the specifier in the `requirements.txt` file.
|
||||
|
||||
- The STT is currently using the ElevenLabs API.
|
||||
- The LLM is currently using the OpenAI API.
|
||||
- The TTS is currently using the ElevenLabs API.
|
||||
- The VAD is currently using the Silero VAD model.
|
||||
- You may need to install ffmpeg if you get errors during STT
|
||||
|
||||
The prompt can be changed in the `backend/server.py` file and modified as you like.
|
||||
|
||||
### Audio Parameters
|
||||
|
||||
#### AlgoOptions
|
||||
|
||||
- **audio_chunk_duration**: Length of audio chunks in seconds. Smaller values allow for faster processing but may be less accurate.
|
||||
- **started_talking_threshold**: If a chunk has more than this many seconds of speech, the system considers that the user has started talking.
|
||||
- **speech_threshold**: After the user has started speaking, if a chunk has less than this many seconds of speech, the system considers that the user has paused.
|
||||
|
||||
#### SileroVadOptions
|
||||
|
||||
- **threshold**: Speech probability threshold (0.0-1.0). Values above this are considered speech. Higher values are more strict.
|
||||
- **min_speech_duration_ms**: Speech segments shorter than this (in milliseconds) are filtered out.
|
||||
- **min_silence_duration_ms**: The system waits for this duration of silence (in milliseconds) before considering speech to be finished.
|
||||
- **speech_pad_ms**: Padding added to both ends of detected speech segments to prevent cutting off words.
|
||||
- **max_speech_duration_s**: Maximum allowed duration for a speech segment in seconds. Prevents indefinite listening.
|
||||
|
||||
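To make the options above concrete, here is a minimal sketch of where they plug in. The numbers are illustrative placeholders, not recommendations; the values this demo actually uses live in `backend/server.py`.

```python
from fastrtc import AlgoOptions, ReplyOnPause, SileroVadOptions, Stream

def echo(audio):
    # Placeholder handler: a real handler would run STT, the LLM, and TTS here
    # and yield (sample_rate, samples) chunks back to the caller.
    yield audio

stream = Stream(
    ReplyOnPause(
        echo,
        algo_options=AlgoOptions(
            audio_chunk_duration=0.6,       # seconds of audio per analysis chunk
            started_talking_threshold=0.2,  # speech seconds needed to count as "started talking"
            speech_threshold=0.1,           # below this much speech per chunk, the user has paused
        ),
        model_options=SileroVadOptions(
            threshold=0.5,                  # speech probability cut-off (0.0-1.0)
            min_speech_duration_ms=250,
            min_silence_duration_ms=1000,
            speech_pad_ms=300,
            max_speech_duration_s=15,
        ),
    ),
    modality="audio",
    mode="send-receive",
)
```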
### Tuning Recommendations
|
||||
|
||||
- If the AI interrupts you too early:
|
||||
- Increase `min_silence_duration_ms`
|
||||
- Increase `speech_threshold`
|
||||
- Increase `speech_pad_ms`
|
||||
|
||||
- If the AI is slow to respond after you finish speaking:
|
||||
- Decrease `min_silence_duration_ms`
|
||||
- Decrease `speech_threshold`
|
||||
|
||||
- If the system fails to detect some speech:
|
||||
- Lower the `threshold` value
|
||||
- Decrease `started_talking_threshold`
|
||||
|
||||
|
||||
## Credits
|
||||
Credit for the UI components goes to Shadcn, Aceternity UI and Kokonut UI.
|
||||
7
demo/nextjs_voice_chat/backend/env.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
|
||||
LLM_API_KEY = os.getenv("LLM_API_KEY")
|
||||
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
|
||||
129
demo/nextjs_voice_chat/backend/server.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import fastapi
|
||||
from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
|
||||
from fastrtc.utils import audio_to_bytes
|
||||
from openai import OpenAI
|
||||
import logging
|
||||
import time
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from elevenlabs import VoiceSettings, stream
|
||||
from elevenlabs.client import ElevenLabs
|
||||
import numpy as np
|
||||
|
||||
from .env import LLM_API_KEY, ELEVENLABS_API_KEY
|
||||
|
||||
|
||||
sys_prompt = """
|
||||
You are a helpful assistant. You are witty, engaging and fun. You love being interactive with the user.
|
||||
You can also add minimal utterances like 'uh-huh' or 'mm-hmm' to the conversation to make it more natural. However, only vocalizations are allowed, no actions or other non-vocal sounds.
|
||||
Begin a conversation with a self-deprecating joke like 'I'm not sure if I'm ready for this...' or 'I bet you already regret clicking that button...'
|
||||
"""
|
||||
|
||||
messages = [{"role": "system", "content": sys_prompt}]
|
||||
|
||||
openai_client = OpenAI(api_key=LLM_API_KEY)
|
||||
|
||||
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
def echo(audio):
|
||||
stt_time = time.time()
|
||||
|
||||
logging.info("Performing STT")
|
||||
|
||||
transcription = elevenlabs_client.speech_to_text.convert(
|
||||
file=audio_to_bytes(audio),
|
||||
model_id="scribe_v1",
|
||||
tag_audio_events=False,
|
||||
language_code="eng",
|
||||
diarize=False,
|
||||
)
|
||||
prompt = transcription.text
|
||||
if prompt == "":
|
||||
logging.info("STT returned empty string")
|
||||
return
|
||||
logging.info(f"STT response: {prompt}")
|
||||
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
logging.info(f"STT took {time.time() - stt_time} seconds")
|
||||
|
||||
llm_time = time.time()
|
||||
|
||||
def text_stream():
|
||||
global full_response
|
||||
full_response = ""
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model="gpt-3.5-turbo", messages=messages, max_tokens=200, stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
if chunk.choices[0].finish_reason == "stop":
|
||||
break
|
||||
if chunk.choices[0].delta.content:
|
||||
full_response += chunk.choices[0].delta.content
|
||||
yield chunk.choices[0].delta.content
|
||||
|
||||
audio_stream = elevenlabs_client.generate(
|
||||
text=text_stream(),
|
||||
voice="Rachel", # Cassidy is also really good
|
||||
voice_settings=VoiceSettings(
|
||||
similarity_boost=0.9, stability=0.6, style=0.4, speed=1
|
||||
),
|
||||
model="eleven_multilingual_v2",
|
||||
output_format="pcm_24000",
|
||||
stream=True,
|
||||
)
|
||||
|
||||
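# ElevenLabs "pcm_24000" chunks are raw 16-bit PCM at 24 kHz; convert each one
# to float32 in [-1, 1] and yield (sample_rate, samples) tuples for playback.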
for audio_chunk in audio_stream:
|
||||
audio_array = (
|
||||
np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
)
|
||||
yield (24000, audio_array)
|
||||
|
||||
messages.append({"role": "assistant", "content": full_response + " "})
|
||||
logging.info(f"LLM response: {full_response}")
|
||||
logging.info(f"LLM took {time.time() - llm_time} seconds")
|
||||
|
||||
|
||||
stream = Stream(
|
||||
ReplyOnPause(
|
||||
echo,
|
||||
algo_options=AlgoOptions(
|
||||
audio_chunk_duration=0.5,
|
||||
started_talking_threshold=0.1,
|
||||
speech_threshold=0.03,
|
||||
),
|
||||
model_options=SileroVadOptions(
|
||||
threshold=0.75,
|
||||
min_speech_duration_ms=250,
|
||||
min_silence_duration_ms=1500,
|
||||
speech_pad_ms=400,
|
||||
max_speech_duration_s=15,
|
||||
),
|
||||
),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
)
|
||||
|
||||
app = fastapi.FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
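# Mount the stream's WebRTC signalling routes on the FastAPI app; the Next.js
# client posts its SDP offer to /webrtc/offer on this server.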
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/reset")
|
||||
async def reset():
|
||||
global messages
|
||||
logging.info("Resetting chat")
|
||||
messages = [{"role": "system", "content": sys_prompt}]
|
||||
return {"status": "success"}
|
||||
41
demo/nextjs_voice_chat/frontend/fastrtc-demo/.gitignore
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.*
|
||||
.yarn/*
|
||||
!.yarn/patches
|
||||
!.yarn/plugins
|
||||
!.yarn/releases
|
||||
!.yarn/versions
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# next.js
|
||||
/.next/
|
||||
/out/
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
*.pem
|
||||
|
||||
# debug
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# env files (can opt-in for committing if needed)
|
||||
.env*
|
||||
|
||||
# vercel
|
||||
.vercel
|
||||
|
||||
# typescript
|
||||
*.tsbuildinfo
|
||||
next-env.d.ts
|
||||
36
demo/nextjs_voice_chat/frontend/fastrtc-demo/README.md
Normal file
@@ -0,0 +1,36 @@
|
||||
This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
|
||||
|
||||
## Getting Started
|
||||
|
||||
First, run the development server:
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
# or
|
||||
yarn dev
|
||||
# or
|
||||
pnpm dev
|
||||
# or
|
||||
bun dev
|
||||
```
|
||||
|
||||
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
|
||||
|
||||
You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
|
||||
|
||||
This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
|
||||
|
||||
## Learn More
|
||||
|
||||
To learn more about Next.js, take a look at the following resources:
|
||||
|
||||
- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
|
||||
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
|
||||
|
||||
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
|
||||
|
||||
## Deploy on Vercel
|
||||
|
||||
The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
|
||||
|
||||
Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
|
||||
BIN
demo/nextjs_voice_chat/frontend/fastrtc-demo/app/favicon.ico
Normal file
|
After Width: | Height: | Size: 25 KiB |
130
demo/nextjs_voice_chat/frontend/fastrtc-demo/app/globals.css
Normal file
@@ -0,0 +1,130 @@
|
||||
@import "tailwindcss";
|
||||
|
||||
@plugin "tailwindcss-animate";
|
||||
|
||||
@custom-variant dark (&:is(.dark *));
|
||||
|
||||
@theme inline {
|
||||
--color-background: var(--background);
|
||||
--color-foreground: var(--foreground);
|
||||
--font-sans: var(--font-geist-sans);
|
||||
--font-mono: var(--font-geist-mono);
|
||||
--color-sidebar-ring: var(--sidebar-ring);
|
||||
--color-sidebar-border: var(--sidebar-border);
|
||||
--color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
|
||||
--color-sidebar-accent: var(--sidebar-accent);
|
||||
--color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
|
||||
--color-sidebar-primary: var(--sidebar-primary);
|
||||
--color-sidebar-foreground: var(--sidebar-foreground);
|
||||
--color-sidebar: var(--sidebar);
|
||||
--color-chart-5: var(--chart-5);
|
||||
--color-chart-4: var(--chart-4);
|
||||
--color-chart-3: var(--chart-3);
|
||||
--color-chart-2: var(--chart-2);
|
||||
--color-chart-1: var(--chart-1);
|
||||
--color-ring: var(--ring);
|
||||
--color-input: var(--input);
|
||||
--color-border: var(--border);
|
||||
--color-destructive-foreground: var(--destructive-foreground);
|
||||
--color-destructive: var(--destructive);
|
||||
--color-accent-foreground: var(--accent-foreground);
|
||||
--color-accent: var(--accent);
|
||||
--color-muted-foreground: var(--muted-foreground);
|
||||
--color-muted: var(--muted);
|
||||
--color-secondary-foreground: var(--secondary-foreground);
|
||||
--color-secondary: var(--secondary);
|
||||
--color-primary-foreground: var(--primary-foreground);
|
||||
--color-primary: var(--primary);
|
||||
--color-popover-foreground: var(--popover-foreground);
|
||||
--color-popover: var(--popover);
|
||||
--color-card-foreground: var(--card-foreground);
|
||||
--color-card: var(--card);
|
||||
--radius-sm: calc(var(--radius) - 4px);
|
||||
--radius-md: calc(var(--radius) - 2px);
|
||||
--radius-lg: var(--radius);
|
||||
--radius-xl: calc(var(--radius) + 4px);
|
||||
}
|
||||
|
||||
:root {
|
||||
--background: oklch(1 0 0);
|
||||
--foreground: oklch(0.129 0.042 264.695);
|
||||
--card: oklch(1 0 0);
|
||||
--card-foreground: oklch(0.129 0.042 264.695);
|
||||
--popover: oklch(1 0 0);
|
||||
--popover-foreground: oklch(0.129 0.042 264.695);
|
||||
--primary: oklch(0.208 0.042 265.755);
|
||||
--primary-foreground: oklch(0.984 0.003 247.858);
|
||||
--secondary: oklch(0.968 0.007 247.896);
|
||||
--secondary-foreground: oklch(0.208 0.042 265.755);
|
||||
--muted: oklch(0.968 0.007 247.896);
|
||||
--muted-foreground: oklch(0.554 0.046 257.417);
|
||||
--accent: oklch(0.968 0.007 247.896);
|
||||
--accent-foreground: oklch(0.208 0.042 265.755);
|
||||
--destructive: oklch(0.577 0.245 27.325);
|
||||
--destructive-foreground: oklch(0.577 0.245 27.325);
|
||||
--border: oklch(0.929 0.013 255.508);
|
||||
--input: oklch(0.929 0.013 255.508);
|
||||
--ring: oklch(0.704 0.04 256.788);
|
||||
--chart-1: oklch(0.646 0.222 41.116);
|
||||
--chart-2: oklch(0.6 0.118 184.704);
|
||||
--chart-3: oklch(0.398 0.07 227.392);
|
||||
--chart-4: oklch(0.828 0.189 84.429);
|
||||
--chart-5: oklch(0.769 0.188 70.08);
|
||||
--radius: 0.625rem;
|
||||
--sidebar: oklch(0.984 0.003 247.858);
|
||||
--sidebar-foreground: oklch(0.129 0.042 264.695);
|
||||
--sidebar-primary: oklch(0.208 0.042 265.755);
|
||||
--sidebar-primary-foreground: oklch(0.984 0.003 247.858);
|
||||
--sidebar-accent: oklch(0.968 0.007 247.896);
|
||||
--sidebar-accent-foreground: oklch(0.208 0.042 265.755);
|
||||
--sidebar-border: oklch(0.929 0.013 255.508);
|
||||
--sidebar-ring: oklch(0.704 0.04 256.788);
|
||||
}
|
||||
|
||||
.dark {
|
||||
--background: oklch(0.129 0.042 264.695);
|
||||
--foreground: oklch(0.984 0.003 247.858);
|
||||
--card: oklch(0.129 0.042 264.695);
|
||||
--card-foreground: oklch(0.984 0.003 247.858);
|
||||
--popover: oklch(0.129 0.042 264.695);
|
||||
--popover-foreground: oklch(0.984 0.003 247.858);
|
||||
--primary: oklch(0.984 0.003 247.858);
|
||||
--primary-foreground: oklch(0.208 0.042 265.755);
|
||||
--secondary: oklch(0.279 0.041 260.031);
|
||||
--secondary-foreground: oklch(0.984 0.003 247.858);
|
||||
--muted: oklch(0.279 0.041 260.031);
|
||||
--muted-foreground: oklch(0.704 0.04 256.788);
|
||||
--accent: oklch(0.279 0.041 260.031);
|
||||
--accent-foreground: oklch(0.984 0.003 247.858);
|
||||
--destructive: oklch(0.396 0.141 25.723);
|
||||
--destructive-foreground: oklch(0.637 0.237 25.331);
|
||||
--border: oklch(0.279 0.041 260.031);
|
||||
--input: oklch(0.279 0.041 260.031);
|
||||
--ring: oklch(0.446 0.043 257.281);
|
||||
--chart-1: oklch(0.488 0.243 264.376);
|
||||
--chart-2: oklch(0.696 0.17 162.48);
|
||||
--chart-3: oklch(0.769 0.188 70.08);
|
||||
--chart-4: oklch(0.627 0.265 303.9);
|
||||
--chart-5: oklch(0.645 0.246 16.439);
|
||||
--sidebar: oklch(0.208 0.042 265.755);
|
||||
--sidebar-foreground: oklch(0.984 0.003 247.858);
|
||||
--sidebar-primary: oklch(0.488 0.243 264.376);
|
||||
--sidebar-primary-foreground: oklch(0.984 0.003 247.858);
|
||||
--sidebar-accent: oklch(0.279 0.041 260.031);
|
||||
--sidebar-accent-foreground: oklch(0.984 0.003 247.858);
|
||||
--sidebar-border: oklch(0.279 0.041 260.031);
|
||||
--sidebar-ring: oklch(0.446 0.043 257.281);
|
||||
}
|
||||
|
||||
@layer base {
|
||||
* {
|
||||
@apply border-border outline-ring/50;
|
||||
}
|
||||
body {
|
||||
@apply bg-background text-foreground;
|
||||
}
|
||||
}
|
||||
|
||||
.no-transitions * {
|
||||
transition: none !important;
|
||||
}
|
||||
44
demo/nextjs_voice_chat/frontend/fastrtc-demo/app/layout.tsx
Normal file
@@ -0,0 +1,44 @@
|
||||
import type { Metadata } from "next";
|
||||
import { Geist, Geist_Mono } from "next/font/google";
|
||||
import "./globals.css";
|
||||
import { ThemeProvider } from "@/components/theme-provider";
|
||||
import { ThemeTransition } from "@/components/ui/theme-transition";
|
||||
|
||||
const geistSans = Geist({
|
||||
variable: "--font-geist-sans",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
const geistMono = Geist_Mono({
|
||||
variable: "--font-geist-mono",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "FastRTC Demo",
|
||||
description: "Interactive WebRTC demo with audio visualization",
|
||||
};
|
||||
|
||||
export default function RootLayout({
|
||||
children,
|
||||
}: Readonly<{
|
||||
children: React.ReactNode;
|
||||
}>) {
|
||||
return (
|
||||
<html lang="en" suppressHydrationWarning>
|
||||
<body
|
||||
className={`${geistSans.variable} ${geistMono.variable} antialiased`}
|
||||
>
|
||||
<ThemeProvider
|
||||
attribute="class"
|
||||
defaultTheme="dark"
|
||||
enableSystem
|
||||
disableTransitionOnChange
|
||||
>
|
||||
{children}
|
||||
<ThemeTransition />
|
||||
</ThemeProvider>
|
||||
</body>
|
||||
</html>
|
||||
);
|
||||
}
|
||||
16
demo/nextjs_voice_chat/frontend/fastrtc-demo/app/page.tsx
Normal file
@@ -0,0 +1,16 @@
|
||||
import { BackgroundCircleProvider } from "@/components/background-circle-provider";
|
||||
import { ThemeToggle } from "@/components/ui/theme-toggle";
|
||||
import { ResetChat } from "@/components/ui/reset-chat";
|
||||
export default function Home() {
|
||||
return (
|
||||
<div className="flex flex-col items-center justify-center h-screen">
|
||||
<BackgroundCircleProvider />
|
||||
<div className="absolute top-4 right-4 z-10">
|
||||
<ThemeToggle />
|
||||
</div>
|
||||
<div className="absolute bottom-4 right-4 z-10">
|
||||
<ResetChat />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
21
demo/nextjs_voice_chat/frontend/fastrtc-demo/components.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"$schema": "https://ui.shadcn.com/schema.json",
|
||||
"style": "new-york",
|
||||
"rsc": true,
|
||||
"tsx": true,
|
||||
"tailwind": {
|
||||
"config": "",
|
||||
"css": "app/globals.css",
|
||||
"baseColor": "slate",
|
||||
"cssVariables": true,
|
||||
"prefix": ""
|
||||
},
|
||||
"aliases": {
|
||||
"components": "@/components",
|
||||
"utils": "@/lib/utils",
|
||||
"ui": "@/components/ui",
|
||||
"lib": "@/lib",
|
||||
"hooks": "@/hooks"
|
||||
},
|
||||
"iconLibrary": "lucide"
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
"use client"
|
||||
|
||||
import { useState, useEffect, useRef, useCallback } from "react";
|
||||
import { BackgroundCircles } from "@/components/ui/background-circles";
|
||||
import { AIVoiceInput } from "@/components/ui/ai-voice-input";
|
||||
import { WebRTCClient } from "@/lib/webrtc-client";
|
||||
|
||||
export function BackgroundCircleProvider() {
|
||||
const [currentVariant, setCurrentVariant] =
|
||||
useState<keyof typeof COLOR_VARIANTS>("octonary");
|
||||
const [isConnected, setIsConnected] = useState(false);
|
||||
const [webrtcClient, setWebrtcClient] = useState<WebRTCClient | null>(null);
|
||||
const [audioLevel, setAudioLevel] = useState(0);
|
||||
const audioRef = useRef<HTMLAudioElement>(null);
|
||||
|
||||
// Memoize callbacks to prevent recreation on each render
|
||||
const handleConnected = useCallback(() => setIsConnected(true), []);
|
||||
const handleDisconnected = useCallback(() => setIsConnected(false), []);
|
||||
|
||||
const handleAudioStream = useCallback((stream: MediaStream) => {
|
||||
if (audioRef.current) {
|
||||
audioRef.current.srcObject = stream;
|
||||
}
|
||||
}, []);
|
||||
|
||||
const handleAudioLevel = useCallback((level: number) => {
|
||||
// Apply some smoothing to the audio level
|
||||
setAudioLevel(prev => prev * 0.7 + level * 0.3);
|
||||
}, []);
|
||||
|
||||
// Get all available variants
|
||||
const variants = Object.keys(
|
||||
COLOR_VARIANTS
|
||||
) as (keyof typeof COLOR_VARIANTS)[];
|
||||
|
||||
// Function to change to the next color variant
|
||||
const changeVariant = () => {
|
||||
const currentIndex = variants.indexOf(currentVariant);
|
||||
const nextVariant = variants[(currentIndex + 1) % variants.length];
|
||||
setCurrentVariant(nextVariant);
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
// Initialize WebRTC client with memoized callbacks
|
||||
const client = new WebRTCClient({
|
||||
onConnected: handleConnected,
|
||||
onDisconnected: handleDisconnected,
|
||||
onAudioStream: handleAudioStream,
|
||||
onAudioLevel: handleAudioLevel
|
||||
});
|
||||
setWebrtcClient(client);
|
||||
|
||||
return () => {
|
||||
client.disconnect();
|
||||
};
|
||||
}, [handleConnected, handleDisconnected, handleAudioStream, handleAudioLevel]);
|
||||
|
||||
const handleStart = () => {
|
||||
webrtcClient?.connect();
|
||||
};
|
||||
|
||||
const handleStop = () => {
|
||||
webrtcClient?.disconnect();
|
||||
};
|
||||
|
||||
return (
|
||||
<div
|
||||
className="relative w-full h-full"
|
||||
onClick={changeVariant} // Add click handler to change color
|
||||
>
|
||||
<BackgroundCircles
|
||||
variant={currentVariant}
|
||||
audioLevel={audioLevel}
|
||||
isActive={isConnected}
|
||||
/>
|
||||
<div className="absolute inset-0 flex items-center justify-center">
|
||||
<AIVoiceInput
|
||||
onStart={handleStart}
|
||||
onStop={handleStop}
|
||||
isConnected={isConnected}
|
||||
/>
|
||||
</div>
|
||||
<audio ref={audioRef} autoPlay hidden />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export default { BackgroundCircleProvider }
|
||||
|
||||
const COLOR_VARIANTS = {
|
||||
primary: {
|
||||
border: [
|
||||
"border-emerald-500/60",
|
||||
"border-cyan-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-emerald-500/30",
|
||||
},
|
||||
secondary: {
|
||||
border: [
|
||||
"border-violet-500/60",
|
||||
"border-fuchsia-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-violet-500/30",
|
||||
},
|
||||
senary: {
|
||||
border: [
|
||||
"border-blue-500/60",
|
||||
"border-sky-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-blue-500/30",
|
||||
}, // blue
|
||||
octonary: {
|
||||
border: [
|
||||
"border-red-500/60",
|
||||
"border-rose-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-red-500/30",
|
||||
},
|
||||
} as const;
|
||||
@@ -0,0 +1,101 @@
|
||||
"use client";
|
||||
|
||||
import { createContext, useContext, useEffect, useState } from "react";
|
||||
|
||||
type Theme = "light" | "dark" | "system";
|
||||
|
||||
type ThemeProviderProps = {
|
||||
children: React.ReactNode;
|
||||
defaultTheme?: Theme;
|
||||
storageKey?: string;
|
||||
attribute?: string;
|
||||
enableSystem?: boolean;
|
||||
disableTransitionOnChange?: boolean;
|
||||
};
|
||||
|
||||
type ThemeProviderState = {
|
||||
theme: Theme;
|
||||
setTheme: (theme: Theme) => void;
|
||||
};
|
||||
|
||||
const initialState: ThemeProviderState = {
|
||||
theme: "system",
|
||||
setTheme: () => null,
|
||||
};
|
||||
|
||||
const ThemeProviderContext = createContext<ThemeProviderState>(initialState);
|
||||
|
||||
export function ThemeProvider({
|
||||
children,
|
||||
defaultTheme = "system",
|
||||
storageKey = "theme",
|
||||
attribute = "class",
|
||||
enableSystem = true,
|
||||
disableTransitionOnChange = false,
|
||||
...props
|
||||
}: ThemeProviderProps) {
|
||||
const [theme, setTheme] = useState<Theme>(defaultTheme);
|
||||
|
||||
useEffect(() => {
|
||||
const savedTheme = localStorage.getItem(storageKey) as Theme | null;
|
||||
|
||||
if (savedTheme) {
|
||||
setTheme(savedTheme);
|
||||
} else if (defaultTheme === "system" && enableSystem) {
|
||||
const systemTheme = window.matchMedia("(prefers-color-scheme: dark)").matches
|
||||
? "dark"
|
||||
: "light";
|
||||
setTheme(systemTheme);
|
||||
}
|
||||
}, [defaultTheme, storageKey, enableSystem]);
|
||||
|
||||
useEffect(() => {
|
||||
const root = window.document.documentElement;
|
||||
|
||||
if (disableTransitionOnChange) {
|
||||
root.classList.add("no-transitions");
|
||||
|
||||
// Force a reflow
|
||||
window.getComputedStyle(root).getPropertyValue("opacity");
|
||||
|
||||
setTimeout(() => {
|
||||
root.classList.remove("no-transitions");
|
||||
}, 0);
|
||||
}
|
||||
|
||||
root.classList.remove("light", "dark");
|
||||
|
||||
if (theme === "system" && enableSystem) {
|
||||
const systemTheme = window.matchMedia("(prefers-color-scheme: dark)").matches
|
||||
? "dark"
|
||||
: "light";
|
||||
root.classList.add(systemTheme);
|
||||
} else {
|
||||
root.classList.add(theme);
|
||||
}
|
||||
|
||||
localStorage.setItem(storageKey, theme);
|
||||
}, [theme, storageKey, enableSystem, disableTransitionOnChange]);
|
||||
|
||||
const value = {
|
||||
theme,
|
||||
setTheme: (theme: Theme) => {
|
||||
setTheme(theme);
|
||||
},
|
||||
};
|
||||
|
||||
return (
|
||||
<ThemeProviderContext.Provider {...props} value={value}>
|
||||
{children}
|
||||
</ThemeProviderContext.Provider>
|
||||
);
|
||||
}
|
||||
|
||||
export const useTheme = () => {
|
||||
const context = useContext(ThemeProviderContext);
|
||||
|
||||
if (context === undefined)
|
||||
throw new Error("useTheme must be used within a ThemeProvider");
|
||||
|
||||
return context;
|
||||
};
|
||||
@@ -0,0 +1,114 @@
|
||||
"use client";
|
||||
|
||||
import { Mic, Square } from "lucide-react";
|
||||
import { useState, useEffect } from "react";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
interface AIVoiceInputProps {
|
||||
onStart?: () => void;
|
||||
onStop?: (duration: number) => void;
|
||||
isConnected?: boolean;
|
||||
className?: string;
|
||||
}
|
||||
|
||||
export function AIVoiceInput({
|
||||
onStart,
|
||||
onStop,
|
||||
isConnected = false,
|
||||
className
|
||||
}: AIVoiceInputProps) {
|
||||
const [active, setActive] = useState(false);
|
||||
const [time, setTime] = useState(0);
|
||||
const [isClient, setIsClient] = useState(false);
|
||||
const [status, setStatus] = useState<'disconnected' | 'connecting' | 'connected'>('disconnected');
|
||||
|
||||
useEffect(() => {
|
||||
setIsClient(true);
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
let intervalId: NodeJS.Timeout;
|
||||
|
||||
if (active) {
|
||||
intervalId = setInterval(() => {
|
||||
setTime((t) => t + 1);
|
||||
}, 1000);
|
||||
} else {
|
||||
setTime(0);
|
||||
}
|
||||
|
||||
return () => clearInterval(intervalId);
|
||||
}, [active]);
|
||||
|
||||
useEffect(() => {
|
||||
if (isConnected) {
|
||||
setStatus('connected');
|
||||
setActive(true);
|
||||
} else {
|
||||
setStatus('disconnected');
|
||||
setActive(false);
|
||||
}
|
||||
}, [isConnected]);
|
||||
|
||||
const formatTime = (seconds: number) => {
|
||||
const mins = Math.floor(seconds / 60);
|
||||
const secs = seconds % 60;
|
||||
return `${mins.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}`;
|
||||
};
|
||||
|
||||
const handleStart = () => {
|
||||
setStatus('connecting');
|
||||
onStart?.();
|
||||
};
|
||||
|
||||
const handleStop = () => {
|
||||
onStop?.(time);
|
||||
setStatus('disconnected');
|
||||
};
|
||||
|
||||
return (
|
||||
<div className={cn("w-full py-4", className)}>
|
||||
<div className="relative max-w-xl w-full mx-auto flex items-center flex-col gap-4">
|
||||
<div className={cn(
|
||||
"px-2 py-1 rounded-md text-xs font-medium bg-black/10 dark:bg-white/10 text-gray-700 dark:text-white"
|
||||
)}>
|
||||
{status === 'connected' ? 'Connected' : status === 'connecting' ? 'Connecting...' : 'Disconnected'}
|
||||
</div>
|
||||
|
||||
<button
|
||||
className={cn(
|
||||
"group w-16 h-16 rounded-xl flex items-center justify-center transition-colors",
|
||||
active
|
||||
? "bg-red-500/20 hover:bg-red-500/30"
|
||||
: "bg-black/10 hover:bg-black/20 dark:bg-white/10 dark:hover:bg-white/20"
|
||||
)}
|
||||
type="button"
|
||||
onClick={active ? handleStop : handleStart}
|
||||
disabled={status === 'connecting'}
|
||||
>
|
||||
{status === 'connecting' ? (
|
||||
<div
|
||||
className="w-6 h-6 rounded-sm animate-spin bg-black dark:bg-white cursor-pointer pointer-events-auto"
|
||||
style={{ animationDuration: "3s" }}
|
||||
/>
|
||||
) : active ? (
|
||||
<Square className="w-6 h-6 text-red-500" />
|
||||
) : (
|
||||
<Mic className="w-6 h-6 text-black/70 dark:text-white/70" />
|
||||
)}
|
||||
</button>
|
||||
|
||||
<span
|
||||
className={cn(
|
||||
"font-mono text-sm transition-opacity duration-300",
|
||||
active
|
||||
? "text-black/70 dark:text-white/70"
|
||||
: "text-black/30 dark:text-white/30"
|
||||
)}
|
||||
>
|
||||
{formatTime(time)}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,309 @@
|
||||
"use client";
|
||||
|
||||
import { motion } from "framer-motion";
|
||||
import clsx from "clsx";
|
||||
import { useState, useEffect } from "react";
|
||||
|
||||
interface BackgroundCirclesProps {
|
||||
title?: string;
|
||||
description?: string;
|
||||
className?: string;
|
||||
variant?: keyof typeof COLOR_VARIANTS;
|
||||
audioLevel?: number;
|
||||
isActive?: boolean;
|
||||
}
|
||||
|
||||
const COLOR_VARIANTS = {
|
||||
primary: {
|
||||
border: [
|
||||
"border-emerald-500/60",
|
||||
"border-cyan-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-emerald-500/30",
|
||||
},
|
||||
secondary: {
|
||||
border: [
|
||||
"border-violet-500/60",
|
||||
"border-fuchsia-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-violet-500/30",
|
||||
},
|
||||
tertiary: {
|
||||
border: [
|
||||
"border-orange-500/60",
|
||||
"border-yellow-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-orange-500/30",
|
||||
},
|
||||
quaternary: {
|
||||
border: [
|
||||
"border-purple-500/60",
|
||||
"border-pink-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-purple-500/30",
|
||||
},
|
||||
quinary: {
|
||||
border: [
|
||||
"border-red-500/60",
|
||||
"border-rose-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-red-500/30",
|
||||
}, // red
|
||||
senary: {
|
||||
border: [
|
||||
"border-blue-500/60",
|
||||
"border-sky-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-blue-500/30",
|
||||
}, // blue
|
||||
septenary: {
|
||||
border: [
|
||||
"border-gray-500/60",
|
||||
"border-gray-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-gray-500/30",
|
||||
},
|
||||
octonary: {
|
||||
border: [
|
||||
"border-red-500/60",
|
||||
"border-rose-400/50",
|
||||
"border-slate-600/30",
|
||||
],
|
||||
gradient: "from-red-500/30",
|
||||
},
|
||||
} as const;
|
||||
|
||||
const AnimatedGrid = () => (
|
||||
<motion.div
|
||||
className="absolute inset-0 [mask-image:radial-gradient(ellipse_at_center,transparent_30%,black)]"
|
||||
animate={{
|
||||
backgroundPosition: ["0% 0%", "100% 100%"],
|
||||
}}
|
||||
transition={{
|
||||
duration: 40,
|
||||
repeat: Number.POSITIVE_INFINITY,
|
||||
ease: "linear",
|
||||
}}
|
||||
>
|
||||
<div className="h-full w-full [background-image:repeating-linear-gradient(100deg,#64748B_0%,#64748B_1px,transparent_1px,transparent_4%)] opacity-20" />
|
||||
</motion.div>
|
||||
);
|
||||
|
||||
export function BackgroundCircles({
|
||||
title = "",
|
||||
description = "",
|
||||
className,
|
||||
variant = "octonary",
|
||||
audioLevel = 0,
|
||||
isActive = false,
|
||||
}: BackgroundCirclesProps) {
|
||||
const variantStyles = COLOR_VARIANTS[variant];
|
||||
const [animationParams, setAnimationParams] = useState({
|
||||
scale: 1,
|
||||
duration: 5,
|
||||
intensity: 0
|
||||
});
|
||||
const [isLoaded, setIsLoaded] = useState(false);
|
||||
|
||||
// Initial page load animation
|
||||
useEffect(() => {
|
||||
// Small delay to ensure the black screen is visible first
|
||||
const timer = setTimeout(() => {
|
||||
setIsLoaded(true);
|
||||
}, 300);
|
||||
|
||||
return () => clearTimeout(timer);
|
||||
}, []);
|
||||
|
||||
// Update animation based on audio level
|
||||
useEffect(() => {
|
||||
if (isActive && audioLevel > 0) {
|
||||
// Simple enhancement of audio level for more dramatic effect
|
||||
const enhancedLevel = Math.min(1, audioLevel * 1.5);
|
||||
|
||||
setAnimationParams({
|
||||
scale: 1 + enhancedLevel * 0.3,
|
||||
duration: Math.max(2, 5 - enhancedLevel * 3),
|
||||
intensity: enhancedLevel
|
||||
});
|
||||
} else if (animationParams.intensity > 0) {
|
||||
// Only reset if we need to (prevents unnecessary updates)
|
||||
const timer = setTimeout(() => {
|
||||
setAnimationParams({
|
||||
scale: 1,
|
||||
duration: 5,
|
||||
intensity: 0
|
||||
});
|
||||
}, 300);
|
||||
|
||||
return () => clearTimeout(timer);
|
||||
}
|
||||
}, [audioLevel, isActive, animationParams.intensity]);
|
||||
|
||||
return (
|
||||
<>
|
||||
{/* Initial black overlay that fades out */}
|
||||
<motion.div
|
||||
className="fixed inset-0 bg-black z-50"
|
||||
initial={{ opacity: 1 }}
|
||||
animate={{ opacity: isLoaded ? 0 : 1 }}
|
||||
transition={{ duration: 1.2, ease: "easeInOut" }}
|
||||
style={{ pointerEvents: isLoaded ? "none" : "auto" }}
|
||||
/>
|
||||
|
||||
<div
|
||||
className={clsx(
|
||||
"relative flex h-screen w-full items-center justify-center overflow-hidden",
|
||||
"bg-white dark:bg-black/5",
|
||||
className
|
||||
)}
|
||||
>
|
||||
<AnimatedGrid />
|
||||
<motion.div
|
||||
className="absolute h-[480px] w-[480px]"
|
||||
initial={{ opacity: 0, scale: 0.9 }}
|
||||
animate={{
|
||||
opacity: isLoaded ? 1 : 0,
|
||||
scale: isLoaded ? 1 : 0.9
|
||||
}}
|
||||
transition={{
|
||||
duration: 1.5,
|
||||
delay: 0.3,
|
||||
ease: "easeOut"
|
||||
}}
|
||||
>
|
||||
{[0, 1, 2].map((i) => (
|
||||
<motion.div
|
||||
key={i}
|
||||
className={clsx(
|
||||
"absolute inset-0 rounded-full",
|
||||
"border-2 bg-gradient-to-br to-transparent",
|
||||
variantStyles.border[i],
|
||||
variantStyles.gradient
|
||||
)}
|
||||
animate={{
|
||||
rotate: 360,
|
||||
scale: [
|
||||
1 + (i * 0.05),
|
||||
(1 + (i * 0.05)) * (1 + (isActive ? animationParams.intensity * 0.2 : 0.02)),
|
||||
1 + (i * 0.05)
|
||||
],
|
||||
opacity: [
|
||||
0.7 + (i * 0.1),
|
||||
0.8 + (i * 0.1) + (isActive ? animationParams.intensity * 0.2 : 0),
|
||||
0.7 + (i * 0.1)
|
||||
]
|
||||
}}
|
||||
transition={{
|
||||
duration: isActive ? animationParams.duration : 8 + (i * 2),
|
||||
repeat: Number.POSITIVE_INFINITY,
|
||||
ease: "easeInOut",
|
||||
}}
|
||||
>
|
||||
<div
|
||||
className={clsx(
|
||||
"absolute inset-0 rounded-full mix-blend-screen",
|
||||
`bg-[radial-gradient(ellipse_at_center,${variantStyles.gradient.replace(
|
||||
"from-",
|
||||
""
|
||||
)}/10%,transparent_70%)]`
|
||||
)}
|
||||
/>
|
||||
</motion.div>
|
||||
))}
|
||||
</motion.div>
|
||||
|
||||
<div className="absolute inset-0 [mask-image:radial-gradient(90%_60%_at_50%_50%,#000_40%,transparent)]">
|
||||
<motion.div
|
||||
className="absolute inset-0 bg-[radial-gradient(ellipse_at_center,#0F766E/30%,transparent_70%)] blur-[120px]"
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{
|
||||
opacity: isLoaded ? 0.7 : 0,
|
||||
scale: [1, 1 + (isActive ? animationParams.intensity * 0.3 : 0.02), 1],
|
||||
}}
|
||||
transition={{
|
||||
opacity: { duration: 1.8, delay: 0.5 },
|
||||
scale: {
|
||||
duration: isActive ? 2 : 12,
|
||||
repeat: Number.POSITIVE_INFINITY,
|
||||
ease: "easeInOut",
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<motion.div
|
||||
className="absolute inset-0 bg-[radial-gradient(ellipse_at_center,#2DD4BF/15%,transparent)] blur-[80px]"
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{
|
||||
opacity: isLoaded ? 1 : 0,
|
||||
scale: [1, 1 + (isActive ? animationParams.intensity * 0.4 : 0.03), 1]
|
||||
}}
|
||||
transition={{
|
||||
opacity: { duration: 2, delay: 0.7 },
|
||||
scale: {
|
||||
duration: isActive ? 1.5 : 15,
|
||||
repeat: Number.POSITIVE_INFINITY,
|
||||
ease: "easeInOut",
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
{/* Additional glow that appears only during high audio levels */}
|
||||
{isActive && animationParams.intensity > 0.4 && (
|
||||
<motion.div
|
||||
className={`absolute inset-0 bg-[radial-gradient(ellipse_at_center,${variantStyles.gradient.replace("from-", "")}/20%,transparent_70%)] blur-[60px]`}
|
||||
initial={{ opacity: 0, scale: 0.8 }}
|
||||
animate={{
|
||||
opacity: [0, animationParams.intensity * 0.6, 0],
|
||||
scale: [0.8, 1.1, 0.8],
|
||||
}}
|
||||
transition={{
|
||||
duration: 0.8,
|
||||
repeat: Number.POSITIVE_INFINITY,
|
||||
ease: "easeInOut",
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
export function DemoCircles() {
|
||||
const [currentVariant, setCurrentVariant] =
|
||||
useState<keyof typeof COLOR_VARIANTS>("octonary");
|
||||
|
||||
const variants = Object.keys(
|
||||
COLOR_VARIANTS
|
||||
) as (keyof typeof COLOR_VARIANTS)[];
|
||||
|
||||
function getNextVariant() {
|
||||
const currentIndex = variants.indexOf(currentVariant);
|
||||
const nextVariant = variants[(currentIndex + 1) % variants.length];
|
||||
return nextVariant;
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<BackgroundCircles variant={currentVariant} />
|
||||
<div className="absolute top-12 right-12">
|
||||
<button
|
||||
type="button"
|
||||
className="bg-slate-950 dark:bg-white text-white dark:text-slate-950 px-4 py-1 rounded-md z-10 text-sm font-medium"
|
||||
onClick={() => {
|
||||
setCurrentVariant(getNextVariant());
|
||||
}}
|
||||
>
|
||||
Change Variant
|
||||
</button>
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
"use client"
|
||||
|
||||
import { Trash } from "lucide-react"
|
||||
|
||||
export function ResetChat() {
|
||||
return (
|
||||
<button
|
||||
className="w-10 h-10 rounded-md flex items-center justify-center transition-colors relative overflow-hidden bg-black/10 hover:bg-black/20 dark:bg-white/10 dark:hover:bg-white/20"
|
||||
aria-label="Reset chat"
|
||||
onClick={() => fetch("http://localhost:8000/reset")}
|
||||
>
|
||||
<div className="relative z-10">
|
||||
<Trash className="h-5 w-5 text-black/70 dark:text-white/70" />
|
||||
</div>
|
||||
</button>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
"use client";
|
||||
|
||||
import { useTheme } from "@/components/theme-provider";
|
||||
import { cn } from "@/lib/utils";
|
||||
import { Moon, Sun } from "lucide-react";
|
||||
import { useRef } from "react";
|
||||
|
||||
interface ThemeToggleProps {
|
||||
className?: string;
|
||||
}
|
||||
|
||||
export function ThemeToggle({ className }: ThemeToggleProps) {
|
||||
const { theme } = useTheme();
|
||||
const buttonRef = useRef<HTMLButtonElement>(null);
|
||||
|
||||
const toggleTheme = () => {
|
||||
// Instead of directly changing the theme, dispatch a custom event
|
||||
const newTheme = theme === "light" ? "dark" : "light";
|
||||
|
||||
// Dispatch custom event with the new theme
|
||||
window.dispatchEvent(
|
||||
new CustomEvent('themeToggleRequest', {
|
||||
detail: { theme: newTheme }
|
||||
})
|
||||
);
|
||||
};
|
||||
|
||||
return (
|
||||
<button
|
||||
ref={buttonRef}
|
||||
onClick={toggleTheme}
|
||||
className={cn(
|
||||
"w-10 h-10 rounded-md flex items-center justify-center transition-colors relative overflow-hidden",
|
||||
"bg-black/10 hover:bg-black/20 dark:bg-white/10 dark:hover:bg-white/20",
|
||||
className
|
||||
)}
|
||||
aria-label="Toggle theme"
|
||||
>
|
||||
<div className="relative z-10">
|
||||
{theme === "light" ? (
|
||||
<Moon className="h-5 w-5 text-black/70" />
|
||||
) : (
|
||||
<Sun className="h-5 w-5 text-white/70" />
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Small inner animation for the button itself */}
|
||||
<div
|
||||
className={cn(
|
||||
"absolute inset-0 transition-transform duration-500",
|
||||
theme === "light"
|
||||
? "bg-gradient-to-br from-blue-500/20 to-purple-500/20 translate-y-full"
|
||||
: "bg-gradient-to-br from-amber-500/20 to-orange-500/20 -translate-y-full"
|
||||
)}
|
||||
style={{
|
||||
transitionTimingFunction: "cubic-bezier(0.22, 1, 0.36, 1)"
|
||||
}}
|
||||
/>
|
||||
</button>
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,120 @@
|
||||
"use client";
|
||||
|
||||
import { useTheme } from "@/components/theme-provider";
|
||||
import { useEffect, useState } from "react";
|
||||
import { motion, AnimatePresence } from "framer-motion";
|
||||
|
||||
interface ThemeTransitionProps {
|
||||
className?: string;
|
||||
}
|
||||
|
||||
export function ThemeTransition({ className }: ThemeTransitionProps) {
|
||||
const { theme, setTheme } = useTheme();
|
||||
const [position, setPosition] = useState({ x: 0, y: 0 });
|
||||
const [isAnimating, setIsAnimating] = useState(false);
|
||||
const [pendingTheme, setPendingTheme] = useState<string | null>(null);
|
||||
const [visualTheme, setVisualTheme] = useState<string | null>(theme);
|
||||
|
||||
// Track mouse/touch position for click events
|
||||
useEffect(() => {
|
||||
const handleMouseMove = (e: MouseEvent) => {
|
||||
setPosition({ x: e.clientX, y: e.clientY });
|
||||
};
|
||||
|
||||
const handleTouchMove = (e: TouchEvent) => {
|
||||
if (e.touches[0]) {
|
||||
setPosition({ x: e.touches[0].clientX, y: e.touches[0].clientY });
|
||||
}
|
||||
};
|
||||
|
||||
window.addEventListener("mousemove", handleMouseMove);
|
||||
window.addEventListener("touchmove", handleTouchMove);
|
||||
|
||||
return () => {
|
||||
window.removeEventListener("mousemove", handleMouseMove);
|
||||
window.removeEventListener("touchmove", handleTouchMove);
|
||||
};
|
||||
}, []);
|
||||
|
||||
// Listen for theme toggle requests
|
||||
useEffect(() => {
|
||||
// Custom event for theme toggle requests
|
||||
const handleThemeToggle = (e: CustomEvent) => {
|
||||
if (isAnimating) return; // Prevent multiple animations
|
||||
|
||||
const newTheme = e.detail.theme;
|
||||
if (newTheme === theme) return;
|
||||
|
||||
// Store the pending theme but don't apply it yet
|
||||
setPendingTheme(newTheme);
|
||||
setIsAnimating(true);
|
||||
|
||||
// The actual theme will be applied mid-animation
|
||||
};
|
||||
|
||||
window.addEventListener('themeToggleRequest' as any, handleThemeToggle as EventListener);
|
||||
|
||||
return () => {
|
||||
window.removeEventListener('themeToggleRequest' as any, handleThemeToggle as EventListener);
|
||||
};
|
||||
}, [theme, isAnimating]);
|
||||
|
||||
// Apply the theme change mid-animation
|
||||
useEffect(() => {
|
||||
if (isAnimating && pendingTheme) {
|
||||
// Set visual theme immediately for the animation
|
||||
setVisualTheme(pendingTheme);
|
||||
|
||||
// Apply the actual theme change after a delay (mid-animation)
|
||||
const timer = setTimeout(() => {
|
||||
setTheme(pendingTheme as any);
|
||||
}, 400); // Half of the animation duration
|
||||
|
||||
// End the animation after it completes
|
||||
const endTimer = setTimeout(() => {
|
||||
setIsAnimating(false);
|
||||
setPendingTheme(null);
|
||||
}, 1000); // Match with animation duration
|
||||
|
||||
return () => {
|
||||
clearTimeout(timer);
|
||||
clearTimeout(endTimer);
|
||||
};
|
||||
}
|
||||
}, [isAnimating, pendingTheme, setTheme]);
|
||||
|
||||
return (
|
||||
<AnimatePresence>
|
||||
{isAnimating && (
|
||||
<motion.div
|
||||
className="fixed inset-0 z-[9999] pointer-events-none"
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{ opacity: 1 }}
|
||||
exit={{ opacity: 0 }}
|
||||
transition={{ duration: 0.3 }}
|
||||
>
|
||||
<motion.div
|
||||
className={`absolute rounded-full ${visualTheme === 'dark' ? 'bg-slate-950' : 'bg-white'}`}
|
||||
initial={{
|
||||
width: 0,
|
||||
height: 0,
|
||||
x: position.x,
|
||||
y: position.y,
|
||||
borderRadius: '100%'
|
||||
}}
|
||||
animate={{
|
||||
width: Math.max(window.innerWidth * 3, window.innerHeight * 3),
|
||||
height: Math.max(window.innerWidth * 3, window.innerHeight * 3),
|
||||
x: position.x - Math.max(window.innerWidth * 3, window.innerHeight * 3) / 2,
|
||||
y: position.y - Math.max(window.innerWidth * 3, window.innerHeight * 3) / 2,
|
||||
}}
|
||||
transition={{
|
||||
duration: 0.8,
|
||||
ease: [0.22, 1, 0.36, 1]
|
||||
}}
|
||||
/>
|
||||
</motion.div>
|
||||
)}
|
||||
</AnimatePresence>
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
import { dirname } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { FlatCompat } from "@eslint/eslintrc";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
const compat = new FlatCompat({
|
||||
baseDirectory: __dirname,
|
||||
});
|
||||
|
||||
const eslintConfig = [
|
||||
...compat.extends("next/core-web-vitals", "next/typescript"),
|
||||
{
|
||||
rules: {
|
||||
"no-unused-vars": "off",
|
||||
"no-explicit-any": "off",
|
||||
"no-console": "off",
|
||||
"no-debugger": "off",
|
||||
"eqeqeq": "off",
|
||||
"curly": "off",
|
||||
"quotes": "off",
|
||||
"semi": "off",
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
export default eslintConfig;
|
||||
@@ -0,0 +1,6 @@
|
||||
import { clsx, type ClassValue } from "clsx"
|
||||
import { twMerge } from "tailwind-merge"
|
||||
|
||||
export function cn(...inputs: ClassValue[]) {
|
||||
return twMerge(clsx(inputs))
|
||||
}
|
||||
@@ -0,0 +1,189 @@
|
||||
interface WebRTCClientOptions {
|
||||
onConnected?: () => void;
|
||||
onDisconnected?: () => void;
|
||||
onMessage?: (message: any) => void;
|
||||
onAudioStream?: (stream: MediaStream) => void;
|
||||
onAudioLevel?: (level: number) => void;
|
||||
}
|
||||
|
||||
export class WebRTCClient {
|
||||
private peerConnection: RTCPeerConnection | null = null;
|
||||
private mediaStream: MediaStream | null = null;
|
||||
private dataChannel: RTCDataChannel | null = null;
|
||||
private options: WebRTCClientOptions;
|
||||
private audioContext: AudioContext | null = null;
|
||||
private analyser: AnalyserNode | null = null;
|
||||
private dataArray: Uint8Array | null = null;
|
||||
private animationFrameId: number | null = null;
|
||||
|
||||
constructor(options: WebRTCClientOptions = {}) {
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
async connect() {
|
||||
try {
|
||||
this.peerConnection = new RTCPeerConnection();
|
||||
|
||||
// Get user media
|
||||
try {
|
||||
this.mediaStream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
} catch (mediaError: any) {
|
||||
console.error('Media error:', mediaError);
|
||||
if (mediaError.name === 'NotAllowedError') {
|
||||
throw new Error('Microphone access denied. Please allow microphone access and try again.');
|
||||
} else if (mediaError.name === 'NotFoundError') {
|
||||
throw new Error('No microphone detected. Please connect a microphone and try again.');
|
||||
} else {
|
||||
throw mediaError;
|
||||
}
|
||||
}
|
||||
|
||||
this.setupAudioAnalysis();
|
||||
|
||||
this.mediaStream.getTracks().forEach(track => {
|
||||
if (this.peerConnection) {
|
||||
this.peerConnection.addTrack(track, this.mediaStream!);
|
||||
}
|
||||
});
|
||||
|
||||
this.peerConnection.addEventListener('track', (event) => {
|
||||
if (this.options.onAudioStream) {
|
||||
this.options.onAudioStream(event.streams[0]);
|
||||
}
|
||||
});
|
||||
|
||||
this.dataChannel = this.peerConnection.createDataChannel('text');
|
||||
|
||||
this.dataChannel.addEventListener('message', (event) => {
|
||||
try {
|
||||
const message = JSON.parse(event.data);
|
||||
console.log('Received message:', message);
|
||||
|
||||
if (this.options.onMessage) {
|
||||
this.options.onMessage(message);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error parsing message:', error);
|
||||
}
|
||||
});
|
||||
|
||||
// Create and send offer
|
||||
const offer = await this.peerConnection.createOffer();
|
||||
await this.peerConnection.setLocalDescription(offer);
|
||||
|
||||
// Send the SDP offer to the backend's /webrtc/offer endpoint
|
||||
const response = await fetch('http://localhost:8000/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'application/json'
|
||||
},
|
||||
mode: 'cors', // Explicitly set CORS mode
|
||||
credentials: 'same-origin',
|
||||
body: JSON.stringify({
|
||||
sdp: offer.sdp,
|
||||
type: offer.type,
|
||||
webrtc_id: Math.random().toString(36).substring(7)
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
await this.peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
if (this.options.onConnected) {
|
||||
this.options.onConnected();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error connecting:', error);
|
||||
this.disconnect();
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private setupAudioAnalysis() {
|
||||
if (!this.mediaStream) return;
|
||||
|
||||
try {
|
||||
this.audioContext = new AudioContext();
|
||||
this.analyser = this.audioContext.createAnalyser();
|
||||
this.analyser.fftSize = 256;
|
||||
|
||||
const source = this.audioContext.createMediaStreamSource(this.mediaStream);
|
||||
source.connect(this.analyser);
|
||||
|
||||
const bufferLength = this.analyser.frequencyBinCount;
|
||||
this.dataArray = new Uint8Array(bufferLength);
|
||||
|
||||
this.startAnalysis();
|
||||
} catch (error) {
|
||||
console.error('Error setting up audio analysis:', error);
|
||||
}
|
||||
}
|
||||
|
||||
private startAnalysis() {
|
||||
if (!this.analyser || !this.dataArray || !this.options.onAudioLevel) return;
|
||||
|
||||
// Add throttling to prevent too many updates
|
||||
let lastUpdateTime = 0;
|
||||
const throttleInterval = 100; // Only update every 100ms
|
||||
|
||||
const analyze = () => {
|
||||
this.analyser!.getByteFrequencyData(this.dataArray!);
|
||||
|
||||
const currentTime = Date.now();
|
||||
// Only update if enough time has passed since last update
|
||||
if (currentTime - lastUpdateTime > throttleInterval) {
|
||||
// Calculate average volume level (0-1)
|
||||
let sum = 0;
|
||||
for (let i = 0; i < this.dataArray!.length; i++) {
|
||||
sum += this.dataArray![i];
|
||||
}
|
||||
const average = sum / this.dataArray!.length / 255;
|
||||
|
||||
this.options.onAudioLevel!(average);
|
||||
lastUpdateTime = currentTime;
|
||||
}
|
||||
|
||||
this.animationFrameId = requestAnimationFrame(analyze);
|
||||
};
|
||||
|
||||
this.animationFrameId = requestAnimationFrame(analyze);
|
||||
}
|
||||
|
||||
private stopAnalysis() {
|
||||
if (this.animationFrameId !== null) {
|
||||
cancelAnimationFrame(this.animationFrameId);
|
||||
this.animationFrameId = null;
|
||||
}
|
||||
|
||||
if (this.audioContext) {
|
||||
this.audioContext.close();
|
||||
this.audioContext = null;
|
||||
}
|
||||
|
||||
this.analyser = null;
|
||||
this.dataArray = null;
|
||||
}
|
||||
|
||||
disconnect() {
|
||||
this.stopAnalysis();
|
||||
|
||||
if (this.mediaStream) {
|
||||
this.mediaStream.getTracks().forEach(track => track.stop());
|
||||
this.mediaStream = null;
|
||||
}
|
||||
|
||||
if (this.peerConnection) {
|
||||
this.peerConnection.close();
|
||||
this.peerConnection = null;
|
||||
}
|
||||
|
||||
this.dataChannel = null;
|
||||
|
||||
if (this.options.onDisconnected) {
|
||||
this.options.onDisconnected();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
import type { NextConfig } from "next";

const nextConfig: NextConfig = {
  /* config options here */
};

export default nextConfig;
33
demo/nextjs_voice_chat/frontend/fastrtc-demo/package.json
Normal file
@@ -0,0 +1,33 @@
{
  "name": "fastrtc-demo",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "dev": "next dev --turbopack",
    "build": "next build --no-lint",
    "start": "next start",
    "lint": "next lint"
  },
  "dependencies": {
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "framer-motion": "^12.4.10",
    "lucide-react": "^0.477.0",
    "next": "15.2.2-canary.1",
    "react": "^19.0.0",
    "react-dom": "^19.0.0",
    "tailwind-merge": "^3.0.2",
    "tailwindcss-animate": "^1.0.7"
  },
  "devDependencies": {
    "@eslint/eslintrc": "^3",
    "@tailwindcss/postcss": "^4",
    "@types/node": "^20",
    "@types/react": "^19",
    "@types/react-dom": "^19",
    "eslint": "^9",
    "eslint-config-next": "15.2.2-canary.1",
    "tailwindcss": "^4",
    "typescript": "^5"
  }
}
@@ -0,0 +1,5 @@
const config = {
  plugins: ["@tailwindcss/postcss"],
};

export default config;
@@ -0,0 +1 @@
|
||||
<svg fill="none" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M14.5 13.5V5.41a1 1 0 0 0-.3-.7L9.8.29A1 1 0 0 0 9.08 0H1.5v13.5A2.5 2.5 0 0 0 4 16h8a2.5 2.5 0 0 0 2.5-2.5m-1.5 0v-7H8v-5H3v12a1 1 0 0 0 1 1h8a1 1 0 0 0 1-1M9.5 5V2.12L12.38 5zM5.13 5h-.62v1.25h2.12V5zm-.62 3h7.12v1.25H4.5zm.62 3h-.62v1.25h7.12V11z" clip-rule="evenodd" fill="#666" fill-rule="evenodd"/></svg>
|
||||
|
After Width: | Height: | Size: 391 B |
@@ -0,0 +1 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><g clip-path="url(#a)"><path fill-rule="evenodd" clip-rule="evenodd" d="M10.27 14.1a6.5 6.5 0 0 0 3.67-3.45q-1.24.21-2.7.34-.31 1.83-.97 3.1M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16m.48-1.52a7 7 0 0 1-.96 0H7.5a4 4 0 0 1-.84-1.32q-.38-.89-.63-2.08a40 40 0 0 0 3.92 0q-.25 1.2-.63 2.08a4 4 0 0 1-.84 1.31zm2.94-4.76q1.66-.15 2.95-.43a7 7 0 0 0 0-2.58q-1.3-.27-2.95-.43a18 18 0 0 1 0 3.44m-1.27-3.54a17 17 0 0 1 0 3.64 39 39 0 0 1-4.3 0 17 17 0 0 1 0-3.64 39 39 0 0 1 4.3 0m1.1-1.17q1.45.13 2.69.34a6.5 6.5 0 0 0-3.67-3.44q.65 1.26.98 3.1M8.48 1.5l.01.02q.41.37.84 1.31.38.89.63 2.08a40 40 0 0 0-3.92 0q.25-1.2.63-2.08a4 4 0 0 1 .85-1.32 7 7 0 0 1 .96 0m-2.75.4a6.5 6.5 0 0 0-3.67 3.44 29 29 0 0 1 2.7-.34q.31-1.83.97-3.1M4.58 6.28q-1.66.16-2.95.43a7 7 0 0 0 0 2.58q1.3.27 2.95.43a18 18 0 0 1 0-3.44m.17 4.71q-1.45-.12-2.69-.34a6.5 6.5 0 0 0 3.67 3.44q-.65-1.27-.98-3.1" fill="#666"/></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h16v16H0z"/></clipPath></defs></svg>
|
||||
|
After Width: | Height: | Size: 1.0 KiB |
@@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>
|
||||
|
After Width: | Height: | Size: 1.3 KiB |
@@ -0,0 +1 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1155 1000"><path d="m577.3 0 577.4 1000H0z" fill="#fff"/></svg>
|
||||
|
After Width: | Height: | Size: 128 B |
@@ -0,0 +1 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.5 2.5h13v10a1 1 0 0 1-1 1h-11a1 1 0 0 1-1-1zM0 1h16v11.5a2.5 2.5 0 0 1-2.5 2.5h-11A2.5 2.5 0 0 1 0 12.5zm3.75 4.5a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5M7 4.75a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0m1.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5" fill="#666"/></svg>
|
||||
|
After Width: | Height: | Size: 385 B |
27
demo/nextjs_voice_chat/frontend/fastrtc-demo/tsconfig.json
Normal file
@@ -0,0 +1,27 @@
{
  "compilerOptions": {
    "target": "ES2017",
    "lib": ["dom", "dom.iterable", "esnext"],
    "allowJs": true,
    "skipLibCheck": true,
    "strict": true,
    "noEmit": true,
    "esModuleInterop": true,
    "module": "esnext",
    "moduleResolution": "bundler",
    "resolveJsonModule": true,
    "isolatedModules": true,
    "jsx": "preserve",
    "incremental": true,
    "plugins": [
      {
        "name": "next"
      }
    ],
    "paths": {
      "@/*": ["./*"]
    }
  },
  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
  "exclude": ["node_modules"]
}
5
demo/nextjs_voice_chat/requirements.txt
Normal file
@@ -0,0 +1,5 @@
openai
fastapi
python-dotenv
elevenlabs
fastrtc[vad, stt, tts]
1
demo/nextjs_voice_chat/run.sh
Executable file
@@ -0,0 +1 @@
uvicorn backend.server:app --host 0.0.0.0 --port 8000
15
demo/object_detection/README.md
Normal file
@@ -0,0 +1,15 @@
---
title: Object Detection
emoji: 📸
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Use YOLOv10 to detect objects in real-time
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
77
demo/object_detection/app.py
Normal file
@@ -0,0 +1,77 @@
import json
from pathlib import Path

import cv2
import gradio as gr
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastrtc import Stream, get_twilio_turn_credentials
from gradio.utils import get_space
from huggingface_hub import hf_hub_download
from pydantic import BaseModel, Field

try:
    from demo.object_detection.inference import YOLOv10
except (ImportError, ModuleNotFoundError):
    from inference import YOLOv10


cur_dir = Path(__file__).parent

model_file = hf_hub_download(
    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
)

model = YOLOv10(model_file)


def detection(image, conf_threshold=0.3):
    image = cv2.resize(image, (model.input_width, model.input_height))
    print("conf_threshold", conf_threshold)
    new_image = model.detect_objects(image, conf_threshold)
    return cv2.resize(new_image, (500, 500))


stream = Stream(
    handler=detection,
    modality="video",
    mode="send-receive",
    additional_inputs=[gr.Slider(minimum=0, maximum=1, step=0.01, value=0.3)],
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=2 if get_space() else None,
)

app = FastAPI()

stream.mount(app)


@app.get("/")
async def _():
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    html_content = open(cur_dir / "index.html").read()
    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content)


class InputData(BaseModel):
    webrtc_id: str
    conf_threshold: float = Field(ge=0, le=1)


@app.post("/input_hook")
async def _(data: InputData):
    stream.set_input(data.webrtc_id, data.conf_threshold)


if __name__ == "__main__":
    import os

    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
340
demo/object_detection/index.html
Normal file
@@ -0,0 +1,340 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Object Detection</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
background: linear-gradient(135deg, #2d2b52 0%, #191731 100%);
|
||||
color: white;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.container {
|
||||
width: 100%;
|
||||
max-width: 800px;
|
||||
text-align: center;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.video-container {
|
||||
width: 100%;
|
||||
max-width: 500px;
|
||||
aspect-ratio: 1/1;
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
border-radius: 12px;
|
||||
overflow: hidden;
|
||||
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
|
||||
margin: 10px 0;
|
||||
}
|
||||
|
||||
#video-output {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
}
|
||||
|
||||
button {
|
||||
background: white;
|
||||
color: #2d2b52;
|
||||
border: none;
|
||||
padding: 12px 32px;
|
||||
border-radius: 24px;
|
||||
font-size: 16px;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
button:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
h1 {
|
||||
font-size: 2.5em;
|
||||
margin-bottom: 0.3em;
|
||||
}
|
||||
|
||||
p {
|
||||
color: rgba(255, 255, 255, 0.8);
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
.controls {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 12px;
|
||||
align-items: center;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.slider-container {
|
||||
width: 100%;
|
||||
max-width: 300px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.slider-container label {
|
||||
color: rgba(255, 255, 255, 0.8);
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
input[type="range"] {
|
||||
width: 100%;
|
||||
height: 6px;
|
||||
-webkit-appearance: none;
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
border-radius: 3px;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
input[type="range"]::-webkit-slider-thumb {
|
||||
-webkit-appearance: none;
|
||||
width: 18px;
|
||||
height: 18px;
|
||||
background: white;
|
||||
border-radius: 50%;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<h1>Real-time Object Detection</h1>
|
||||
<p>Using YOLOv10 to detect objects in your webcam feed</p>
|
||||
<div class="video-container">
|
||||
<video id="video-output" autoplay playsinline></video>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<div class="slider-container">
|
||||
<label>Confidence Threshold: <span id="conf-value">0.3</span></label>
|
||||
<input type="range" id="conf-threshold" min="0" max="1" step="0.01" value="0.3">
|
||||
</div>
|
||||
<button id="start-button">Start</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const startButton = document.getElementById('start-button');
|
||||
const videoOutput = document.getElementById('video-output');
|
||||
const confThreshold = document.getElementById('conf-threshold');
|
||||
const confValue = document.getElementById('conf-value');
|
||||
|
||||
// Update confidence value display
|
||||
confThreshold.addEventListener('input', (e) => {
|
||||
confValue.textContent = e.target.value;
|
||||
if (peerConnection) {
|
||||
updateConfThreshold(e.target.value);
|
||||
}
|
||||
});
|
||||
|
||||
function updateConfThreshold(value) {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
conf_threshold: parseFloat(value)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
video: true
|
||||
});
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('track', (evt) => {
|
||||
if (videoOutput && videoOutput.srcObject !== evt.streams[0]) {
|
||||
videoOutput.srcObject = evt.streams[0];
|
||||
}
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
updateConfThreshold(confThreshold.value);
|
||||
}
|
||||
};
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
startButton.textContent = 'Start';
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
// Send initial confidence threshold
|
||||
updateConfThreshold(confThreshold.value);
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
});
|
||||
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
startButton.textContent = 'Start';
|
||||
}
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (peerConnection.getSenders) {
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.stop) sender.track.stop();
|
||||
});
|
||||
}
|
||||
|
||||
setTimeout(() => {
|
||||
peerConnection.close();
|
||||
}, 500);
|
||||
}
|
||||
|
||||
videoOutput.srcObject = null;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (startButton.textContent === 'Start') {
|
||||
setupWebRTC();
|
||||
startButton.textContent = 'Stop';
|
||||
} else {
|
||||
stop();
|
||||
startButton.textContent = 'Start';
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
153
demo/object_detection/inference.py
Normal file
@@ -0,0 +1,153 @@
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
|
||||
try:
|
||||
from demo.object_detection.utils import draw_detections
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
from utils import draw_detections
|
||||
|
||||
|
||||
class YOLOv10:
|
||||
def __init__(self, path):
|
||||
# Initialize model
|
||||
self.initialize_model(path)
|
||||
|
||||
def __call__(self, image):
|
||||
return self.detect_objects(image)
|
||||
|
||||
def initialize_model(self, path):
|
||||
self.session = onnxruntime.InferenceSession(
|
||||
path, providers=onnxruntime.get_available_providers()
|
||||
)
|
||||
# Get model info
|
||||
self.get_input_details()
|
||||
self.get_output_details()
|
||||
|
||||
def detect_objects(self, image, conf_threshold=0.3):
|
||||
input_tensor = self.prepare_input(image)
|
||||
|
||||
# Perform inference on the image
|
||||
new_image = self.inference(image, input_tensor, conf_threshold)
|
||||
|
||||
return new_image
|
||||
|
||||
def prepare_input(self, image):
|
||||
self.img_height, self.img_width = image.shape[:2]
|
||||
|
||||
input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||
|
||||
# Resize input image
|
||||
input_img = cv2.resize(input_img, (self.input_width, self.input_height))
|
||||
|
||||
# Scale input pixel values to 0 to 1
|
||||
input_img = input_img / 255.0
|
||||
input_img = input_img.transpose(2, 0, 1)
|
||||
input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
|
||||
|
||||
return input_tensor
|
||||
|
||||
def inference(self, image, input_tensor, conf_threshold=0.3):
|
||||
start = time.perf_counter()
|
||||
outputs = self.session.run(
|
||||
self.output_names, {self.input_names[0]: input_tensor}
|
||||
)
|
||||
|
||||
print(f"Inference time: {(time.perf_counter() - start) * 1000:.2f} ms")
|
||||
(
|
||||
boxes,
|
||||
scores,
|
||||
class_ids,
|
||||
) = self.process_output(outputs, conf_threshold)
|
||||
return self.draw_detections(image, boxes, scores, class_ids)
|
||||
|
||||
def process_output(self, output, conf_threshold=0.3):
|
||||
predictions = np.squeeze(output[0])
|
||||
|
||||
# Filter out object confidence scores below threshold
|
||||
scores = predictions[:, 4]
|
||||
predictions = predictions[scores > conf_threshold, :]
|
||||
scores = scores[scores > conf_threshold]
|
||||
|
||||
if len(scores) == 0:
|
||||
return [], [], []
|
||||
|
||||
# Get the class with the highest confidence
|
||||
class_ids = predictions[:, 5].astype(int)
|
||||
|
||||
# Get bounding boxes for each object
|
||||
boxes = self.extract_boxes(predictions)
|
||||
|
||||
return boxes, scores, class_ids
|
||||
|
||||
def extract_boxes(self, predictions):
|
||||
# Extract boxes from predictions
|
||||
boxes = predictions[:, :4]
|
||||
|
||||
# Scale boxes to original image dimensions
|
||||
boxes = self.rescale_boxes(boxes)
|
||||
|
||||
# Convert boxes to xyxy format
|
||||
# boxes = xywh2xyxy(boxes)
|
||||
|
||||
return boxes
|
||||
|
||||
def rescale_boxes(self, boxes):
|
||||
# Rescale boxes to original image dimensions
|
||||
input_shape = np.array(
|
||||
[self.input_width, self.input_height, self.input_width, self.input_height]
|
||||
)
|
||||
boxes = np.divide(boxes, input_shape, dtype=np.float32)
|
||||
boxes *= np.array(
|
||||
[self.img_width, self.img_height, self.img_width, self.img_height]
|
||||
)
|
||||
return boxes
|
||||
|
||||
def draw_detections(
|
||||
self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4
|
||||
):
|
||||
return draw_detections(image, boxes, scores, class_ids, mask_alpha)
|
||||
|
||||
def get_input_details(self):
|
||||
model_inputs = self.session.get_inputs()
|
||||
self.input_names = [model_inputs[i].name for i in range(len(model_inputs))]
|
||||
|
||||
self.input_shape = model_inputs[0].shape
|
||||
self.input_height = self.input_shape[2]
|
||||
self.input_width = self.input_shape[3]
|
||||
|
||||
def get_output_details(self):
|
||||
model_outputs = self.session.get_outputs()
|
||||
self.output_names = [model_outputs[i].name for i in range(len(model_outputs))]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
model_file = hf_hub_download(
|
||||
repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
|
||||
)
|
||||
|
||||
yolov10_detector = YOLOv10(model_file)
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
|
||||
f.write(
|
||||
requests.get(
|
||||
"https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg"
|
||||
).content
|
||||
)
|
||||
f.seek(0)
|
||||
img = cv2.imread(f.name)
|
||||
|
||||
# # Detect Objects
|
||||
combined_image = yolov10_detector.detect_objects(img)
|
||||
|
||||
# Draw detections
|
||||
cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
|
||||
cv2.imshow("Output", combined_image)
|
||||
cv2.waitKey(0)
|
||||
4
demo/object_detection/requirements.txt
Normal file
@@ -0,0 +1,4 @@
fastrtc
opencv-python
twilio
onnxruntime-gpu
237
demo/object_detection/utils.py
Normal file
@@ -0,0 +1,237 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
class_names = [
|
||||
"person",
|
||||
"bicycle",
|
||||
"car",
|
||||
"motorcycle",
|
||||
"airplane",
|
||||
"bus",
|
||||
"train",
|
||||
"truck",
|
||||
"boat",
|
||||
"traffic light",
|
||||
"fire hydrant",
|
||||
"stop sign",
|
||||
"parking meter",
|
||||
"bench",
|
||||
"bird",
|
||||
"cat",
|
||||
"dog",
|
||||
"horse",
|
||||
"sheep",
|
||||
"cow",
|
||||
"elephant",
|
||||
"bear",
|
||||
"zebra",
|
||||
"giraffe",
|
||||
"backpack",
|
||||
"umbrella",
|
||||
"handbag",
|
||||
"tie",
|
||||
"suitcase",
|
||||
"frisbee",
|
||||
"skis",
|
||||
"snowboard",
|
||||
"sports ball",
|
||||
"kite",
|
||||
"baseball bat",
|
||||
"baseball glove",
|
||||
"skateboard",
|
||||
"surfboard",
|
||||
"tennis racket",
|
||||
"bottle",
|
||||
"wine glass",
|
||||
"cup",
|
||||
"fork",
|
||||
"knife",
|
||||
"spoon",
|
||||
"bowl",
|
||||
"banana",
|
||||
"apple",
|
||||
"sandwich",
|
||||
"orange",
|
||||
"broccoli",
|
||||
"carrot",
|
||||
"hot dog",
|
||||
"pizza",
|
||||
"donut",
|
||||
"cake",
|
||||
"chair",
|
||||
"couch",
|
||||
"potted plant",
|
||||
"bed",
|
||||
"dining table",
|
||||
"toilet",
|
||||
"tv",
|
||||
"laptop",
|
||||
"mouse",
|
||||
"remote",
|
||||
"keyboard",
|
||||
"cell phone",
|
||||
"microwave",
|
||||
"oven",
|
||||
"toaster",
|
||||
"sink",
|
||||
"refrigerator",
|
||||
"book",
|
||||
"clock",
|
||||
"vase",
|
||||
"scissors",
|
||||
"teddy bear",
|
||||
"hair drier",
|
||||
"toothbrush",
|
||||
]
|
||||
|
||||
# Create a list of colors for each class where each color is a tuple of 3 integer values
|
||||
rng = np.random.default_rng(3)
|
||||
colors = rng.uniform(0, 255, size=(len(class_names), 3))
|
||||
|
||||
|
||||
def nms(boxes, scores, iou_threshold):
|
||||
# Sort by score
|
||||
sorted_indices = np.argsort(scores)[::-1]
|
||||
|
||||
keep_boxes = []
|
||||
while sorted_indices.size > 0:
|
||||
# Pick the last box
|
||||
box_id = sorted_indices[0]
|
||||
keep_boxes.append(box_id)
|
||||
|
||||
# Compute IoU of the picked box with the rest
|
||||
ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
|
||||
|
||||
# Remove boxes with IoU over the threshold
|
||||
keep_indices = np.where(ious < iou_threshold)[0]
|
||||
|
||||
# print(keep_indices.shape, sorted_indices.shape)
|
||||
sorted_indices = sorted_indices[keep_indices + 1]
|
||||
|
||||
return keep_boxes
|
||||
|
||||
|
||||
def multiclass_nms(boxes, scores, class_ids, iou_threshold):
|
||||
unique_class_ids = np.unique(class_ids)
|
||||
|
||||
keep_boxes = []
|
||||
for class_id in unique_class_ids:
|
||||
class_indices = np.where(class_ids == class_id)[0]
|
||||
class_boxes = boxes[class_indices, :]
|
||||
class_scores = scores[class_indices]
|
||||
|
||||
class_keep_boxes = nms(class_boxes, class_scores, iou_threshold)
|
||||
keep_boxes.extend(class_indices[class_keep_boxes])
|
||||
|
||||
return keep_boxes
|
||||
|
||||
|
||||
def compute_iou(box, boxes):
|
||||
# Compute xmin, ymin, xmax, ymax for both boxes
|
||||
xmin = np.maximum(box[0], boxes[:, 0])
|
||||
ymin = np.maximum(box[1], boxes[:, 1])
|
||||
xmax = np.minimum(box[2], boxes[:, 2])
|
||||
ymax = np.minimum(box[3], boxes[:, 3])
|
||||
|
||||
# Compute intersection area
|
||||
intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
|
||||
|
||||
# Compute union area
|
||||
box_area = (box[2] - box[0]) * (box[3] - box[1])
|
||||
boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
|
||||
union_area = box_area + boxes_area - intersection_area
|
||||
|
||||
# Compute IoU
|
||||
iou = intersection_area / union_area
|
||||
|
||||
return iou
|
||||
|
||||
|
||||
def xywh2xyxy(x):
|
||||
# Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
|
||||
y = np.copy(x)
|
||||
y[..., 0] = x[..., 0] - x[..., 2] / 2
|
||||
y[..., 1] = x[..., 1] - x[..., 3] / 2
|
||||
y[..., 2] = x[..., 0] + x[..., 2] / 2
|
||||
y[..., 3] = x[..., 1] + x[..., 3] / 2
|
||||
return y
|
||||
|
||||
|
||||
def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
|
||||
det_img = image.copy()
|
||||
|
||||
img_height, img_width = image.shape[:2]
|
||||
font_size = min([img_height, img_width]) * 0.0006
|
||||
text_thickness = int(min([img_height, img_width]) * 0.001)
|
||||
|
||||
# det_img = draw_masks(det_img, boxes, class_ids, mask_alpha)
|
||||
|
||||
# Draw bounding boxes and labels of detections
|
||||
for class_id, box, score in zip(class_ids, boxes, scores):
|
||||
color = colors[class_id]
|
||||
|
||||
draw_box(det_img, box, color) # type: ignore
|
||||
|
||||
label = class_names[class_id]
|
||||
caption = f"{label} {int(score * 100)}%"
|
||||
draw_text(det_img, caption, box, color, font_size, text_thickness) # type: ignore
|
||||
|
||||
return det_img
|
||||
|
||||
|
||||
def draw_box(
|
||||
image: np.ndarray,
|
||||
box: np.ndarray,
|
||||
color: tuple[int, int, int] = (0, 0, 255),
|
||||
thickness: int = 2,
|
||||
) -> np.ndarray:
|
||||
x1, y1, x2, y2 = box.astype(int)
|
||||
return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
|
||||
|
||||
|
||||
def draw_text(
|
||||
image: np.ndarray,
|
||||
text: str,
|
||||
box: np.ndarray,
|
||||
color: tuple[int, int, int] = (0, 0, 255),
|
||||
font_size: float = 0.001,
|
||||
text_thickness: int = 2,
|
||||
) -> np.ndarray:
|
||||
x1, y1, x2, y2 = box.astype(int)
|
||||
(tw, th), _ = cv2.getTextSize(
|
||||
text=text,
|
||||
fontFace=cv2.FONT_HERSHEY_SIMPLEX,
|
||||
fontScale=font_size,
|
||||
thickness=text_thickness,
|
||||
)
|
||||
th = int(th * 1.2)
|
||||
|
||||
cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1)
|
||||
|
||||
return cv2.putText(
|
||||
image,
|
||||
text,
|
||||
(x1, y1),
|
||||
cv2.FONT_HERSHEY_SIMPLEX,
|
||||
font_size,
|
||||
(255, 255, 255),
|
||||
text_thickness,
|
||||
cv2.LINE_AA,
|
||||
)
|
||||
|
||||
|
||||
def draw_masks(
|
||||
image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
|
||||
) -> np.ndarray:
|
||||
mask_img = image.copy()
|
||||
|
||||
# Draw bounding boxes and labels of detections
|
||||
for box, class_id in zip(boxes, classes):
|
||||
color = colors[class_id]
|
||||
|
||||
x1, y1, x2, y2 = box.astype(int)
|
||||
|
||||
# Draw fill rectangle in mask image
|
||||
cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1) # type: ignore
|
||||
|
||||
return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0)
|
||||
16
demo/phonic_chat/README.md
Normal file
@@ -0,0 +1,16 @@
---
title: Phonic AI Chat
emoji: 🎙️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to Phonic AI's speech-to-speech model
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|PHONIC_API_KEY]
python_version: 3.11
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
116
demo/phonic_chat/app.py
Normal file
@@ -0,0 +1,116 @@
import asyncio
import base64
import os

import gradio as gr
from gradio.utils import get_space
import numpy as np
from dotenv import load_dotenv
from fastrtc import (
    AdditionalOutputs,
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    audio_to_float32,
    wait_for_item,
)
from phonic.client import PhonicSTSClient, get_voices

load_dotenv()

STS_URI = "wss://api.phonic.co/v1/sts/ws"
API_KEY = os.environ["PHONIC_API_KEY"]
SAMPLE_RATE = 44_100
voices = get_voices(API_KEY)
voice_ids = [voice["id"] for voice in voices]


class PhonicHandler(AsyncStreamHandler):
    def __init__(self):
        super().__init__(input_sample_rate=SAMPLE_RATE, output_sample_rate=SAMPLE_RATE)
        self.output_queue = asyncio.Queue()
        self.client = None

    def copy(self) -> AsyncStreamHandler:
        return PhonicHandler()

    async def start_up(self):
        await self.wait_for_args()
        voice_id = self.latest_args[1]
        async with PhonicSTSClient(STS_URI, API_KEY) as client:
            self.client = client
            sts_stream = client.sts(  # type: ignore
                input_format="pcm_44100",
                output_format="pcm_44100",
                system_prompt="You are a helpful voice assistant. Respond conversationally.",
                # welcome_message="Hello! I'm your voice assistant. How can I help you today?",
                voice_id=voice_id,
            )
            async for message in sts_stream:
                message_type = message.get("type")
                if message_type == "audio_chunk":
                    audio_b64 = message["audio"]
                    audio_bytes = base64.b64decode(audio_b64)
                    await self.output_queue.put(
                        (SAMPLE_RATE, np.frombuffer(audio_bytes, dtype=np.int16))
                    )
                    if text := message.get("text"):
                        msg = {"role": "assistant", "content": text}
                        await self.output_queue.put(AdditionalOutputs(msg))
                elif message_type == "input_text":
                    msg = {"role": "user", "content": message["text"]}
                    await self.output_queue.put(AdditionalOutputs(msg))

    async def emit(self):
        return await wait_for_item(self.output_queue)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if not self.client:
            return
        audio_float32 = audio_to_float32(frame)
        await self.client.send_audio(audio_float32)  # type: ignore

    async def shutdown(self):
        if self.client:
            await self.client._websocket.close()
        return super().shutdown()


def add_to_chatbot(chatbot, message):
    chatbot.append(message)
    return chatbot


chatbot = gr.Chatbot(type="messages", value=[])
stream = Stream(
    handler=PhonicHandler(),
    mode="send-receive",
    modality="audio",
    additional_inputs=[
        gr.Dropdown(
            choices=voice_ids,
            value="victoria",
            label="Voice",
            info="Select a voice from the dropdown",
        )
    ],
    additional_outputs=[chatbot],
    additional_outputs_handler=add_to_chatbot,
    ui_args={
        "title": "Phonic Chat (Powered by FastRTC ⚡️)",
    },
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
)

# with stream.ui:
#     state.change(lambda s: s, inputs=state, outputs=chatbot)

if __name__ == "__main__":
    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)
74
demo/phonic_chat/requirements.txt
Normal file
@@ -0,0 +1,74 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile requirements.in -o requirements.txt
|
||||
aiohappyeyeballs==2.4.6
|
||||
# via aiohttp
|
||||
aiohttp==3.11.12
|
||||
# via
|
||||
# aiohttp-retry
|
||||
# twilio
|
||||
aiohttp-retry==2.9.1
|
||||
# via twilio
|
||||
aiosignal==1.3.2
|
||||
# via aiohttp
|
||||
attrs==25.1.0
|
||||
# via aiohttp
|
||||
certifi==2025.1.31
|
||||
# via requests
|
||||
cffi==1.17.1
|
||||
# via sounddevice
|
||||
charset-normalizer==3.4.1
|
||||
# via requests
|
||||
fastrtc==0.0.1
|
||||
# via -r requirements.in
|
||||
frozenlist==1.5.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
idna==3.10
|
||||
# via
|
||||
# requests
|
||||
# yarl
|
||||
isort==6.0.0
|
||||
# via phonic-python
|
||||
loguru==0.7.3
|
||||
# via phonic-python
|
||||
multidict==6.1.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
numpy==2.2.3
|
||||
# via
|
||||
# phonic-python
|
||||
# scipy
|
||||
phonic-python==0.1.3
|
||||
# via -r requirements.in
|
||||
propcache==0.3.0
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pyjwt==2.10.1
|
||||
# via twilio
|
||||
python-dotenv==1.0.1
|
||||
# via
|
||||
# -r requirements.in
|
||||
# phonic-python
|
||||
requests==2.32.3
|
||||
# via
|
||||
# phonic-python
|
||||
# twilio
|
||||
scipy==1.15.2
|
||||
# via phonic-python
|
||||
sounddevice==0.5.1
|
||||
# via phonic-python
|
||||
twilio==9.4.6
|
||||
# via -r requirements.in
|
||||
typing-extensions==4.12.2
|
||||
# via phonic-python
|
||||
urllib3==2.3.0
|
||||
# via requests
|
||||
websockets==15.0
|
||||
# via phonic-python
|
||||
yarl==1.18.3
|
||||
# via aiohttp
|
||||
15
demo/talk_to_claude/README.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to Claude
emoji: 👨🦰
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to Anthropic's Claude
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY, secret|ANTHROPIC_API_KEY, secret|ELEVENLABS_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
134
demo/talk_to_claude/app.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from elevenlabs import ElevenLabs
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_tts_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from fastrtc.utils import audio_to_bytes
|
||||
from gradio.utils import get_space
|
||||
from groq import Groq
|
||||
from pydantic import BaseModel
|
||||
|
||||
load_dotenv()
|
||||
|
||||
groq_client = Groq()
|
||||
claude_client = anthropic.Anthropic()
|
||||
tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
|
||||
|
||||
curr_dir = Path(__file__).parent
|
||||
|
||||
tts_model = get_tts_model()
|
||||
|
||||
|
||||
def response(
|
||||
audio: tuple[int, np.ndarray],
|
||||
chatbot: list[dict] | None = None,
|
||||
):
|
||||
chatbot = chatbot or []
|
||||
messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
|
||||
prompt = groq_client.audio.transcriptions.create(
|
||||
file=("audio-file.mp3", audio_to_bytes(audio)),
|
||||
model="whisper-large-v3-turbo",
|
||||
response_format="verbose_json",
|
||||
).text
|
||||
chatbot.append({"role": "user", "content": prompt})
|
||||
yield AdditionalOutputs(chatbot)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
response = claude_client.messages.create(
|
||||
model="claude-3-5-haiku-20241022",
|
||||
max_tokens=512,
|
||||
messages=messages, # type: ignore
|
||||
)
|
||||
response_text = " ".join(
|
||||
block.text # type: ignore
|
||||
for block in response.content
|
||||
if getattr(block, "type", None) == "text"
|
||||
)
|
||||
chatbot.append({"role": "assistant", "content": response_text})
|
||||
|
||||
start = time.time()
|
||||
|
||||
print("starting tts", start)
|
||||
for i, chunk in enumerate(tts_model.stream_tts_sync(response_text)):
|
||||
print("chunk", i, time.time() - start)
|
||||
yield chunk
|
||||
print("finished tts", time.time() - start)
|
||||
yield AdditionalOutputs(chatbot)
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages")
|
||||
stream = Stream(
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
handler=ReplyOnPause(response),
|
||||
additional_outputs_handler=lambda a, b: b,
|
||||
additional_inputs=[chatbot],
|
||||
additional_outputs=[chatbot],
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
concurrency_limit=5 if get_space() else None,
|
||||
time_limit=90 if get_space() else None,
|
||||
)
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class InputData(BaseModel):
|
||||
webrtc_id: str
|
||||
chatbot: list[Message]
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = get_twilio_turn_credentials() if get_space() else None
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content, status_code=200)
|
||||
|
||||
|
||||
@app.post("/input_hook")
|
||||
async def _(body: InputData):
|
||||
stream.set_input(body.webrtc_id, body.model_dump()["chatbot"])
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.get("/outputs")
|
||||
def _(webrtc_id: str):
|
||||
async def output_stream():
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
chatbot = output.args[0]
|
||||
yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
546
demo/talk_to_claude/index.html
Normal file
@@ -0,0 +1,546 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>RetroChat Audio</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: monospace;
|
||||
background-color: #1a1a1a;
|
||||
color: #00ff00;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 20px;
|
||||
height: calc(100% - 100px);
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
border: 2px solid #00ff00;
|
||||
padding: 20px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
flex-grow: 1;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.controls-container {
|
||||
border: 2px solid #00ff00;
|
||||
padding: 20px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 20px;
|
||||
height: 128px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.visualization-container {
|
||||
flex-grow: 1;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.box-container {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
height: 64px;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.box {
|
||||
height: 100%;
|
||||
width: 8px;
|
||||
background: #00ff00;
|
||||
border-radius: 8px;
|
||||
transition: transform 0.05s ease;
|
||||
}
|
||||
|
||||
.chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 20px;
|
||||
padding: 10px;
|
||||
border: 1px solid #00ff00;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 10px;
|
||||
padding: 8px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #003300;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #002200;
|
||||
}
|
||||
|
||||
button {
|
||||
height: 64px;
|
||||
min-width: 120px;
|
||||
background-color: #000;
|
||||
color: #00ff00;
|
||||
border: 2px solid #00ff00;
|
||||
padding: 10px 20px;
|
||||
font-family: monospace;
|
||||
font-size: 16px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
border-width: 3px;
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Retro CRT effect */
|
||||
.crt-overlay {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background: repeating-linear-gradient(0deg,
|
||||
rgba(0, 255, 0, 0.03),
|
||||
rgba(0, 255, 0, 0.03) 1px,
|
||||
transparent 1px,
|
||||
transparent 2px);
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
/* Add these new styles */
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid #00ff00;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: #00ff00;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for typing indicator */
|
||||
.typing-indicator {
|
||||
padding: 8px;
|
||||
background-color: #002200;
|
||||
border-radius: 4px;
|
||||
margin-bottom: 10px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.dots {
|
||||
display: inline-flex;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
background-color: #00ff00;
|
||||
border-radius: 50%;
|
||||
animation: pulse 1.5s infinite;
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.5s;
|
||||
}
|
||||
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 1s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.5;
|
||||
transform: scale(1);
|
||||
}
|
||||
|
||||
50% {
|
||||
opacity: 1;
|
||||
transform: scale(1.2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<div class="chat-container">
|
||||
<div class="chat-messages" id="chat-messages"></div>
|
||||
<!-- Move typing indicator outside the chat messages -->
|
||||
<div class="typing-indicator" id="typing-indicator">
|
||||
<div class="dots">
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="controls-container">
|
||||
<div class="visualization-container">
|
||||
<div class="box-container">
|
||||
<!-- Boxes will be dynamically added here -->
|
||||
</div>
|
||||
</div>
|
||||
<button id="start-button">Start</button>
|
||||
</div>
|
||||
</div>
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let audioContext;
|
||||
let analyser_input, analyser_output;
|
||||
let dataArray_input, dataArray_output;
|
||||
let animationId_input, animationId_output;
|
||||
let chatHistory = [];
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
|
||||
const audioOutput = document.getElementById('audio-output');
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
|
||||
function updateButtonState() {
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
startButton.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
startButton.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
startButton.innerHTML = 'Start';
|
||||
}
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
// Set up input visualization
|
||||
audioContext = new AudioContext();
|
||||
analyser_input = audioContext.createAnalyser();
|
||||
const inputSource = audioContext.createMediaStreamSource(stream);
|
||||
inputSource.connect(analyser_input);
|
||||
analyser_input.fftSize = 64;
|
||||
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser_input.getByteFrequencyData(dataArray_input);
|
||||
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
|
||||
const audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
                    animationId_input = requestAnimationFrame(updateAudioLevel);
                }
                updateAudioLevel();

                stream.getTracks().forEach(track => {
                    peerConnection.addTrack(track, stream);
                });

                // Add connection state change listener
                peerConnection.addEventListener('connectionstatechange', () => {
                    console.log('Connection state:', peerConnection.connectionState);
                    if (peerConnection.connectionState === 'connected') {
                        clearTimeout(timeoutId);
                        const toast = document.getElementById('error-toast');
                        toast.style.display = 'none';
                    }
                    updateButtonState();
                });

                // Handle incoming audio
                peerConnection.addEventListener('track', (evt) => {
                    if (audioOutput.srcObject !== evt.streams[0]) {
                        audioOutput.srcObject = evt.streams[0];
                        audioOutput.play();

                        // Set up output visualization
                        analyser_output = audioContext.createAnalyser();
                        const outputSource = audioContext.createMediaStreamSource(evt.streams[0]);
                        outputSource.connect(analyser_output);
                        analyser_output.fftSize = 2048;
                        dataArray_output = new Uint8Array(analyser_output.frequencyBinCount);
                        updateVisualization();
                    }
                });

                // Create data channel for messages
                const dataChannel = peerConnection.createDataChannel('text');
                dataChannel.onmessage = (event) => {
                    const eventJson = JSON.parse(event.data);
                    const typingIndicator = document.getElementById('typing-indicator');

                    if (eventJson.type === "error") {
                        showError(eventJson.message);
                    } else if (eventJson.type === "send_input") {
                        fetch('/input_hook', {
                            method: 'POST',
                            headers: {
                                'Content-Type': 'application/json',
                            },
                            body: JSON.stringify({
                                webrtc_id: webrtc_id,
                                chatbot: chatHistory
                            })
                        });
                    } else if (eventJson.type === "log") {
                        if (eventJson.data === "pause_detected") {
                            typingIndicator.style.display = 'block';
                            chatMessages.scrollTop = chatMessages.scrollHeight;
                        } else if (eventJson.data === "response_starting") {
                            typingIndicator.style.display = 'none';
                        }
                    }
                };

                // Create and send offer
                const offer = await peerConnection.createOffer();
                await peerConnection.setLocalDescription(offer);

                await new Promise((resolve) => {
                    if (peerConnection.iceGatheringState === "complete") {
                        resolve();
                    } else {
                        const checkState = () => {
                            if (peerConnection.iceGatheringState === "complete") {
                                peerConnection.removeEventListener("icegatheringstatechange", checkState);
                                resolve();
                            }
                        };
                        peerConnection.addEventListener("icegatheringstatechange", checkState);
                    }
                });

                webrtc_id = Math.random().toString(36).substring(7);

                const response = await fetch('/webrtc/offer', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        sdp: peerConnection.localDescription.sdp,
                        type: peerConnection.localDescription.type,
                        webrtc_id: webrtc_id
                    })
                });

                const serverResponse = await response.json();

                if (serverResponse.status === 'failed') {
                    showError(serverResponse.meta.error === 'concurrency_limit_reached'
                        ? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
                        : serverResponse.meta.error);
                    stop();
                    return;
                }

                await peerConnection.setRemoteDescription(serverResponse);

                // Start visualization
                updateVisualization();

                // Create event stream to receive messages from /outputs
                const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
                eventSource.addEventListener("output", (event) => {
                    const eventJson = JSON.parse(event.data);
                    addMessage(eventJson.role, eventJson.content);
                });
            } catch (err) {
                clearTimeout(timeoutId);
                console.error('Error setting up WebRTC:', err);
                showError('Failed to establish connection. Please try again.');
                stop();
            }
        }

        function addMessage(role, content) {
            const messageDiv = document.createElement('div');
            messageDiv.classList.add('message', role);
            messageDiv.textContent = content;
            chatMessages.appendChild(messageDiv);
            chatMessages.scrollTop = chatMessages.scrollHeight;
            chatHistory.push({ role, content });
        }

        // Add this after other const declarations
        const boxContainer = document.querySelector('.box-container');
        const numBars = 32;
        for (let i = 0; i < numBars; i++) {
            const box = document.createElement('div');
            box.className = 'box';
            boxContainer.appendChild(box);
        }

        // Replace the draw function with updateVisualization
        function updateVisualization() {
            animationId_output = requestAnimationFrame(updateVisualization);

            analyser_output.getByteFrequencyData(dataArray_output);
            const bars = document.querySelectorAll('.box');

            for (let i = 0; i < bars.length; i++) {
                const barHeight = (dataArray_output[i] / 255) * 2;
                bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
            }
        }

        function stop() {
            if (peerConnection) {
                if (peerConnection.getTransceivers) {
                    peerConnection.getTransceivers().forEach(transceiver => {
                        if (transceiver.stop) {
                            transceiver.stop();
                        }
                    });
                }

                if (peerConnection.getSenders) {
                    peerConnection.getSenders().forEach(sender => {
                        if (sender.track && sender.track.stop) sender.track.stop();
                    });
                }

                peerConnection.close();
            }

            if (animationId_input) {
                cancelAnimationFrame(animationId_input);
            }
            if (animationId_output) {
                cancelAnimationFrame(animationId_output);
            }
            if (audioContext) {
                audioContext.close();
            }

            updateButtonState();
        }

        startButton.addEventListener('click', () => {
            if (startButton.textContent === 'Start') {
                setupWebRTC();
            } else {
                stop();
            }
        });
    </script>
</body>

</html>
6
demo/talk_to_claude/requirements.txt
Normal file
@@ -0,0 +1,6 @@
fastrtc[vad, tts]
elevenlabs
groq
anthropic
twilio
python-dotenv
15
demo/talk_to_gemini/README.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to Gemini
emoji: ♊️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to Gemini using Google's multimodal API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
demo/talk_to_gemini/README_gradio.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to Gemini (Gradio UI)
emoji: ♊️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to Gemini (Gradio UI)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
181
demo/talk_to_gemini/app.py
Normal file
@@ -0,0 +1,181 @@
import asyncio
import base64
import json
import os
import pathlib
from typing import AsyncGenerator, Literal

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastrtc import (
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    wait_for_item,
)
from google import genai
from google.genai.types import (
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,
)
from gradio.utils import get_space
from pydantic import BaseModel

current_dir = pathlib.Path(__file__).parent

load_dotenv()


def encode_audio(data: np.ndarray) -> str:
    """Encode Audio data to send to the server"""
    return base64.b64encode(data.tobytes()).decode("UTF-8")


class GeminiHandler(AsyncStreamHandler):
    """Handler for the Gemini API"""

    def __init__(
        self,
        expected_layout: Literal["mono"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
    ) -> None:
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=16000,
        )
        self.input_queue: asyncio.Queue = asyncio.Queue()
        self.output_queue: asyncio.Queue = asyncio.Queue()
        self.quit: asyncio.Event = asyncio.Event()

    def copy(self) -> "GeminiHandler":
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            output_frame_size=self.output_frame_size,
        )

    async def start_up(self):
        if not self.phone_mode:
            await self.wait_for_args()
            api_key, voice_name = self.latest_args[1:]
        else:
            api_key, voice_name = None, "Puck"

        client = genai.Client(
            api_key=api_key or os.getenv("GEMINI_API_KEY"),
            http_options={"api_version": "v1alpha"},
        )

        config = LiveConnectConfig(
            response_modalities=["AUDIO"],  # type: ignore
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        )
        async with client.aio.live.connect(
            model="gemini-2.0-flash-exp", config=config
        ) as session:
            async for audio in session.start_stream(
                stream=self.stream(), mime_type="audio/pcm"
            ):
                if audio.data:
                    array = np.frombuffer(audio.data, dtype=np.int16)
                    self.output_queue.put_nowait((self.output_sample_rate, array))

    async def stream(self) -> AsyncGenerator[bytes, None]:
        while not self.quit.is_set():
            try:
                audio = await asyncio.wait_for(self.input_queue.get(), 0.1)
                yield audio
            except (asyncio.TimeoutError, TimeoutError):
                pass

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)
        self.input_queue.put_nowait(audio_message)

    async def emit(self) -> tuple[int, np.ndarray] | None:
        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:
        self.quit.set()


stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=GeminiHandler(),
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
    additional_inputs=[
        gr.Textbox(
            label="API Key",
            type="password",
            value=os.getenv("GEMINI_API_KEY") if not get_space() else "",
        ),
        gr.Dropdown(
            label="Voice",
            choices=[
                "Puck",
                "Charon",
                "Kore",
                "Fenrir",
                "Aoede",
            ],
            value="Puck",
        ),
    ],
)


class InputData(BaseModel):
    webrtc_id: str
    voice_name: str
    api_key: str


app = FastAPI()

stream.mount(app)


@app.post("/input_hook")
async def _(body: InputData):
    stream.set_input(body.webrtc_id, body.api_key, body.voice_name)
    return {"status": "ok"}


@app.get("/")
async def index():
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    html_content = (current_dir / "index.html").read_text()
    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content)


if __name__ == "__main__":
    import os

    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
452
demo/talk_to_gemini/index.html
Normal file
@@ -0,0 +1,452 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Gemini Voice Chat</title>
|
||||
<style>
|
||||
:root {
|
||||
--color-accent: #6366f1;
|
||||
--color-background: #0f172a;
|
||||
--color-surface: #1e293b;
|
||||
--color-text: #e2e8f0;
|
||||
--boxSize: 8px;
|
||||
--gutter: 4px;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background-color: var(--color-background);
|
||||
color: var(--color-text);
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.container {
|
||||
width: 90%;
|
||||
max-width: 800px;
|
||||
background-color: var(--color-surface);
|
||||
padding: 2rem;
|
||||
border-radius: 1rem;
|
||||
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25);
|
||||
}
|
||||
|
||||
.wave-container {
|
||||
position: relative;
|
||||
display: flex;
|
||||
min-height: 100px;
|
||||
max-height: 128px;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.box-container {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
height: 64px;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.box {
|
||||
height: 100%;
|
||||
width: var(--boxSize);
|
||||
background: var(--color-accent);
|
||||
border-radius: 8px;
|
||||
transition: transform 0.05s ease;
|
||||
}
|
||||
|
||||
.controls {
|
||||
display: grid;
|
||||
gap: 1rem;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.input-group {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
label {
|
||||
font-size: 0.875rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
input,
|
||||
select {
|
||||
padding: 0.75rem;
|
||||
border-radius: 0.5rem;
|
||||
border: 1px solid rgba(255, 255, 255, 0.1);
|
||||
background-color: var(--color-background);
|
||||
color: var(--color-text);
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
button {
|
||||
padding: 1rem 2rem;
|
||||
border-radius: 0.5rem;
|
||||
border: none;
|
||||
background-color: var(--color-accent);
|
||||
color: white;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s ease;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
opacity: 0.9;
|
||||
transform: translateY(-1px);
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid white;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: white;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div style="text-align: center">
|
||||
<h1>Gemini Voice Chat</h1>
|
||||
<p>Speak with Gemini using real-time audio streaming</p>
|
||||
<p>
|
||||
Get a Gemini API key
|
||||
<a href="https://ai.google.dev/gemini-api/docs/api-key">here</a>
|
||||
</p>
|
||||
</div>
|
||||
<div class="container">
|
||||
<div class="controls">
|
||||
<div class="input-group">
|
||||
<label for="api-key">API Key</label>
|
||||
<input type="password" id="api-key" placeholder="Enter your API key">
|
||||
</div>
|
||||
<div class="input-group">
|
||||
<label for="voice">Voice</label>
|
||||
<select id="voice">
|
||||
<option value="Puck">Puck</option>
|
||||
<option value="Charon">Charon</option>
|
||||
<option value="Kore">Kore</option>
|
||||
<option value="Fenrir">Fenrir</option>
|
||||
<option value="Aoede">Aoede</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wave-container">
|
||||
<div class="box-container">
|
||||
<!-- Boxes will be dynamically added here -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button id="start-button">Start Recording</button>
|
||||
</div>
|
||||
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let audioContext;
|
||||
let dataChannel;
|
||||
let isRecording = false;
|
||||
let webrtc_id;
|
||||
|
||||
const startButton = document.getElementById('start-button');
|
||||
const apiKeyInput = document.getElementById('api-key');
|
||||
const voiceSelect = document.getElementById('voice');
|
||||
const audioOutput = document.getElementById('audio-output');
|
||||
const boxContainer = document.querySelector('.box-container');
|
||||
|
||||
const numBars = 32;
|
||||
for (let i = 0; i < numBars; i++) {
|
||||
const box = document.createElement('div');
|
||||
box.className = 'box';
|
||||
boxContainer.appendChild(box);
|
||||
}
|
||||
|
||||
function updateButtonState() {
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
startButton.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
startButton.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Recording</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
startButton.innerHTML = 'Start Recording';
|
||||
}
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
stream.getTracks().forEach(track => peerConnection.addTrack(track, stream));
|
||||
|
||||
// Update audio visualization setup
|
||||
audioContext = new AudioContext();
|
||||
analyser_input = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(analyser_input);
|
||||
analyser_input.fftSize = 64;
|
||||
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser_input.getByteFrequencyData(dataArray_input);
|
||||
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
|
||||
const audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
console.log("audioLevel", audioLevel);
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationId = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
|
||||
// Add connection state change listener
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
// Handle incoming audio
|
||||
peerConnection.addEventListener('track', (evt) => {
|
||||
if (audioOutput && audioOutput.srcObject !== evt.streams[0]) {
|
||||
audioOutput.srcObject = evt.streams[0];
|
||||
audioOutput.play();
|
||||
|
||||
// Set up audio visualization on the output stream
|
||||
audioContext = new AudioContext();
|
||||
analyser = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(evt.streams[0]);
|
||||
source.connect(analyser);
|
||||
analyser.fftSize = 2048;
|
||||
dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
updateVisualization();
|
||||
}
|
||||
});
|
||||
|
||||
// Create data channel for messages
|
||||
dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
api_key: apiKeyInput.value,
|
||||
voice_name: voiceSelect.value
|
||||
})
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Create and send offer
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id,
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
startButton.textContent = 'Start Recording';
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
startButton.textContent = 'Start Recording';
|
||||
}
|
||||
}
|
||||
|
||||
function updateVisualization() {
|
||||
if (!analyser) return;
|
||||
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const bars = document.querySelectorAll('.box');
|
||||
|
||||
for (let i = 0; i < bars.length; i++) {
|
||||
const barHeight = (dataArray[i] / 255) * 2;
|
||||
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
|
||||
}
|
||||
|
||||
animationId = requestAnimationFrame(updateVisualization);
|
||||
}
|
||||
|
||||
function stopWebRTC() {
|
||||
if (peerConnection) {
|
||||
peerConnection.close();
|
||||
}
|
||||
if (animationId) {
|
||||
cancelAnimationFrame(animationId);
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
}
|
||||
updateButtonState();
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (!isRecording) {
|
||||
setupWebRTC();
|
||||
startButton.classList.add('recording');
|
||||
} else {
|
||||
stopWebRTC();
|
||||
startButton.classList.remove('recording');
|
||||
}
|
||||
isRecording = !isRecording;
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
4
demo/talk_to_gemini/requirements.txt
Normal file
@@ -0,0 +1,4 @@
fastrtc
python-dotenv
google-genai
twilio
15
demo/talk_to_openai/README.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to OpenAI
emoji: 🗣️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to OpenAI using their multimodal API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
demo/talk_to_openai/README_gradio.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to OpenAI (Gradio UI)
emoji: 🗣️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Talk to OpenAI (Gradio UI)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
141
demo/talk_to_openai/app.py
Normal file
@@ -0,0 +1,141 @@
import asyncio
import base64
import json
from pathlib import Path

import gradio as gr
import numpy as np
import openai
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
    AdditionalOutputs,
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    wait_for_item,
)
from gradio.utils import get_space
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent

load_dotenv()

cur_dir = Path(__file__).parent

SAMPLE_RATE = 24000


class OpenAIHandler(AsyncStreamHandler):
    def __init__(
        self,
    ) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=SAMPLE_RATE,
            output_frame_size=480,
            input_sample_rate=SAMPLE_RATE,
        )
        self.connection = None
        self.output_queue = asyncio.Queue()

    def copy(self):
        return OpenAIHandler()

    async def start_up(
        self,
    ):
        """Connect to realtime API. Run forever in separate thread to keep connection open."""
        self.client = openai.AsyncOpenAI()
        async with self.client.beta.realtime.connect(
            model="gpt-4o-mini-realtime-preview-2024-12-17"
        ) as conn:
            await conn.session.update(
                session={"turn_detection": {"type": "server_vad"}}
            )
            self.connection = conn
            async for event in self.connection:
                if event.type == "response.audio_transcript.done":
                    await self.output_queue.put(AdditionalOutputs(event))
                if event.type == "response.audio.delta":
                    await self.output_queue.put(
                        (
                            self.output_sample_rate,
                            np.frombuffer(
                                base64.b64decode(event.delta), dtype=np.int16
                            ).reshape(1, -1),
                        ),
                    )

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if not self.connection:
            return
        _, array = frame
        array = array.squeeze()
        audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
        await self.connection.input_audio_buffer.append(audio=audio_message)  # type: ignore

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
        return await wait_for_item(self.output_queue)

    async def shutdown(self) -> None:
        if self.connection:
            await self.connection.close()
            self.connection = None


def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
    chatbot.append({"role": "assistant", "content": response.transcript})
    return chatbot


chatbot = gr.Chatbot(type="messages")
latest_message = gr.Textbox(type="text", visible=False)
stream = Stream(
    OpenAIHandler(),
    mode="send-receive",
    modality="audio",
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    additional_outputs_handler=update_chatbot,
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
)

app = FastAPI()

stream.mount(app)


@app.get("/")
async def _():
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    html_content = (cur_dir / "index.html").read_text()
    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content)


@app.get("/outputs")
def _(webrtc_id: str):
    async def output_stream():
        import json

        async for output in stream.output_stream(webrtc_id):
            s = json.dumps({"role": "assistant", "content": output.args[0].transcript})
            yield f"event: output\ndata: {s}\n\n"

    return StreamingResponse(output_stream(), media_type="text/event-stream")


if __name__ == "__main__":
    import os

    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
404
demo/talk_to_openai/index.html
Normal file
@@ -0,0 +1,404 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>OpenAI Real-Time Chat</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, sans-serif;
|
||||
background-color: #0a0a0a;
|
||||
color: #ffffff;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
height: calc(100% - 100px);
|
||||
}
|
||||
|
||||
.logo {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
border: 1px solid #333;
|
||||
padding: 20px;
|
||||
height: 90%;
|
||||
box-sizing: border-box;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 20px;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 20px;
|
||||
padding: 12px;
|
||||
border-radius: 4px;
|
||||
font-size: 16px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #1a1a1a;
|
||||
margin-left: 20%;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #262626;
|
||||
margin-right: 20%;
|
||||
}
|
||||
|
||||
.controls {
|
||||
text-align: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
button {
|
||||
background-color: transparent;
|
||||
color: #ffffff;
|
||||
border: 1px solid #ffffff;
|
||||
padding: 12px 24px;
|
||||
font-family: inherit;
|
||||
font-size: 16px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 1px;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
border-width: 2px;
|
||||
transform: scale(1.02);
|
||||
box-shadow: 0 0 10px rgba(255, 255, 255, 0.2);
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid #ffffff;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: #ffffff;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<div class="logo">
|
||||
<h1>OpenAI Real-Time Chat</h1>
|
||||
</div>
|
||||
<div class="chat-container">
|
||||
<div class="chat-messages" id="chat-messages"></div>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="start-button">Start Conversation</button>
|
||||
</div>
|
||||
</div>
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const audioOutput = document.getElementById('audio-output');
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
|
||||
let audioLevel = 0;
|
||||
let animationFrame;
|
||||
let audioContext, analyser, audioSource;
|
||||
|
||||
function updateButtonState() {
|
||||
const button = document.getElementById('start-button');
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
button.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
button.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
button.innerHTML = 'Start Conversation';
|
||||
}
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser = audioContext.createAnalyser();
|
||||
audioSource = audioContext.createMediaStreamSource(stream);
|
||||
audioSource.connect(analyser);
|
||||
analyser.fftSize = 64;
|
||||
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
|
||||
audioLevel = average / 255;
|
||||
|
||||
// Update CSS variable instead of rebuilding the button
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationFrame = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
isConnecting = true;
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
setupAudioVisualization(stream);
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('track', (evt) => {
|
||||
if (audioOutput.srcObject !== evt.streams[0]) {
|
||||
audioOutput.srcObject = evt.streams[0];
|
||||
audioOutput.play();
|
||||
}
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
}
|
||||
};
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
addMessage("assistant", eventJson.content);
|
||||
|
||||
});
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessage(role, content) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
messageDiv.textContent = content;
|
||||
chatMessages.appendChild(messageDiv);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (animationFrame) {
|
||||
cancelAnimationFrame(animationFrame);
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
audioContext = null;
|
||||
analyser = null;
|
||||
audioSource = null;
|
||||
}
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (peerConnection.getSenders) {
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.stop) sender.track.stop();
|
||||
});
|
||||
}
|
||||
console.log('closing');
|
||||
peerConnection.close();
|
||||
}
|
||||
updateButtonState();
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
console.log('clicked');
|
||||
console.log(peerConnection, peerConnection?.connectionState);
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') {
|
||||
setupWebRTC();
|
||||
} else {
|
||||
console.log('stopping');
|
||||
stop();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
4
demo/talk_to_openai/requirements.txt
Normal file
@@ -0,0 +1,4 @@
fastrtc[vad]
openai
twilio
python-dotenv
15
demo/talk_to_sambanova/README.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to Sambanova
emoji: 💻
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Llama 3.2 - SambaNova API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
demo/talk_to_sambanova/README_gradio.md
Normal file
@@ -0,0 +1,15 @@
---
title: Talk to Sambanova (Gradio)
emoji: 💻
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
app_file: app.py
pinned: false
license: mit
short_description: Llama 3.2 - SambaNova API (Gradio)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
144
demo/talk_to_sambanova/app.py
Normal file
@@ -0,0 +1,144 @@
import base64
import json
import os
from pathlib import Path

import gradio as gr
import huggingface_hub
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    get_stt_model,
    get_twilio_turn_credentials,
)
from gradio.utils import get_space
from pydantic import BaseModel

load_dotenv()

curr_dir = Path(__file__).parent


client = huggingface_hub.InferenceClient(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    provider="sambanova",
)
stt_model = get_stt_model()


def response(
    audio: tuple[int, np.ndarray],
    gradio_chatbot: list[dict] | None = None,
    conversation_state: list[dict] | None = None,
):
    gradio_chatbot = gradio_chatbot or []
    conversation_state = conversation_state or []
    print("chatbot", gradio_chatbot)

    text = stt_model.stt(audio)
    sample_rate, array = audio
    gradio_chatbot.append(
        {"role": "user", "content": gr.Audio((sample_rate, array.squeeze()))}
    )
    yield AdditionalOutputs(gradio_chatbot, conversation_state)

    conversation_state.append({"role": "user", "content": text})
    request = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=conversation_state,  # type: ignore
        temperature=0.1,
        top_p=0.1,
    )
    response = {"role": "assistant", "content": request.choices[0].message.content}

    conversation_state.append(response)
    gradio_chatbot.append(response)

    yield AdditionalOutputs(gradio_chatbot, conversation_state)


chatbot = gr.Chatbot(type="messages", value=[])
state = gr.State(value=[])
stream = Stream(
    ReplyOnPause(
        response,  # type: ignore
        input_sample_rate=16000,
    ),
    mode="send",
    modality="audio",
    additional_inputs=[chatbot, state],
    additional_outputs=[chatbot, state],
    additional_outputs_handler=lambda *a: (a[2], a[3]),
    concurrency_limit=20 if get_space() else None,
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
)

app = FastAPI()
stream.mount(app)


class Message(BaseModel):
    role: str
    content: str


class InputData(BaseModel):
    webrtc_id: str
    chatbot: list[Message]
    state: list[Message]


@app.get("/")
async def _():
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    html_content = (curr_dir / "index.html").read_text()
    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content)


@app.post("/input_hook")
async def _(data: InputData):
    body = data.model_dump()
    stream.set_input(data.webrtc_id, body["chatbot"], body["state"])


def audio_to_base64(file_path):
    audio_format = "wav"
    with open(file_path, "rb") as audio_file:
        encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")
    return f"data:audio/{audio_format};base64,{encoded_audio}"


@app.get("/outputs")
async def _(webrtc_id: str):
    async def output_stream():
        async for output in stream.output_stream(webrtc_id):
            chatbot = output.args[0]
            state = output.args[1]
            data = {
                "message": state[-1],
                "audio": audio_to_base64(chatbot[-1]["content"].value["path"])
                if chatbot[-1]["role"] == "user"
                else None,
            }
            yield f"event: output\ndata: {json.dumps(data)}\n\n"

    return StreamingResponse(output_stream(), media_type="text/event-stream")


if __name__ == "__main__":
    import os

    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        raise ValueError("Phone mode not supported")
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
487
demo/talk_to_sambanova/index.html
Normal file
@@ -0,0 +1,487 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Talk to Sambanova</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
background-color: #f8f9fa;
|
||||
color: #1a1a1a;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
height: 80%;
|
||||
}
|
||||
|
||||
.logo {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
padding: 20px;
|
||||
height: 90%;
|
||||
box-sizing: border-box;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 20px;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 20px;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #e9ecef;
|
||||
margin-left: 20%;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #f1f3f5;
|
||||
margin-right: 20%;
|
||||
}
|
||||
|
||||
.controls {
|
||||
text-align: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
button {
|
||||
background-color: #0066cc;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 24px;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
border-radius: 4px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
background-color: #0052a3;
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid #ffffff;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: #ffffff;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for typing indicator */
|
||||
.typing-indicator {
|
||||
padding: 8px;
|
||||
background-color: #f1f3f5;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 10px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.dots {
|
||||
display: inline-flex;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
background-color: #0066cc;
|
||||
border-radius: 50%;
|
||||
animation: pulse 1.5s infinite;
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.5s;
|
||||
}
|
||||
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 1s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.5;
|
||||
transform: scale(1);
|
||||
}
|
||||
|
||||
50% {
|
||||
opacity: 1;
|
||||
transform: scale(1.2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<div class="logo">
|
||||
<h1>Talk to Sambanova 🗣️</h1>
|
||||
<h2 style="font-size: 1.2em; color: #666; margin-top: 10px;">Speak to Llama 3.2 powered by Sambanova API
|
||||
</h2>
|
||||
</div>
|
||||
<div class="chat-container">
|
||||
<div class="chat-messages" id="chat-messages"></div>
|
||||
<div class="typing-indicator" id="typing-indicator">
|
||||
<div class="dots">
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="start-button">Start Conversation</button>
|
||||
</div>
|
||||
</div>
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
|
||||
let audioLevel = 0;
|
||||
let animationFrame;
|
||||
let audioContext, analyser, audioSource;
|
||||
let messages = [];
|
||||
let eventSource;
|
||||
|
||||
function updateButtonState() {
|
||||
const button = document.getElementById('start-button');
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
button.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
button.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
button.innerHTML = 'Start Conversation';
|
||||
}
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser = audioContext.createAnalyser();
|
||||
audioSource = audioContext.createMediaStreamSource(stream);
|
||||
audioSource.connect(analyser);
|
||||
analyser.fftSize = 64;
|
||||
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
|
||||
audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationFrame = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
function handleMessage(event) {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
const typingIndicator = document.getElementById('typing-indicator');
|
||||
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
chatbot: messages,
|
||||
state: messages
|
||||
})
|
||||
});
|
||||
} else if (eventJson.type === "log") {
|
||||
if (eventJson.data === "pause_detected") {
|
||||
typingIndicator.style.display = 'block';
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
} else if (eventJson.data === "response_starting") {
|
||||
typingIndicator.style.display = 'none';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
setupAudioVisualization(stream);
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = handleMessage;
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
console.log(eventJson);
|
||||
messages.push(eventJson.message);
|
||||
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
|
||||
});
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessage(role, content) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
|
||||
if (role === 'user') {
|
||||
// Create audio element for user messages
|
||||
const audio = document.createElement('audio');
|
||||
audio.controls = true;
|
||||
audio.src = content;
|
||||
messageDiv.appendChild(audio);
|
||||
} else {
|
||||
// Text content for assistant messages
|
||||
messageDiv.textContent = content;
|
||||
}
|
||||
|
||||
chatMessages.appendChild(messageDiv);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
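        // Tear down the SSE connection, audio analysis graph, and WebRTC peer connection, then reset the UI state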
function stop() {
|
||||
if (eventSource) {
|
||||
eventSource.close();
|
||||
eventSource = null;
|
||||
}
|
||||
|
||||
if (animationFrame) {
|
||||
cancelAnimationFrame(animationFrame);
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
audioContext = null;
|
||||
analyser = null;
|
||||
audioSource = null;
|
||||
}
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (peerConnection.getSenders) {
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.stop) sender.track.stop();
|
||||
});
|
||||
}
|
||||
peerConnection.close();
|
||||
}
|
||||
updateButtonState();
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') {
|
||||
setupWebRTC();
|
||||
} else {
|
||||
stop();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
4
demo/talk_to_sambanova/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastrtc[vad, stt]
|
||||
python-dotenv
|
||||
huggingface_hub>=0.29.0
|
||||
twilio
|
||||
98
demo/talk_to_smolagents/README.md
Normal file
@@ -0,0 +1,98 @@
|
||||
---
|
||||
title: Talk to Smolagents
|
||||
emoji: 💻
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: FastRTC Voice Agent with smolagents
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN]
|
||||
---
|
||||
|
||||
# Voice LLM Agent with Image Generation
|
||||
|
||||
A voice-enabled AI assistant powered by FastRTC that can:
|
||||
1. Stream audio in real-time using WebRTC
|
||||
2. Listen and respond with natural pauses in conversation
|
||||
3. Generate images based on your requests
|
||||
4. Maintain conversation context across exchanges
|
||||
|
||||
This app combines the real-time communication capabilities of FastRTC with the powerful agent framework of smolagents.
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Real-time Streaming**: Uses FastRTC's WebRTC-based audio streaming
|
||||
- **Voice Activation**: Automatic detection of speech pauses to trigger responses
|
||||
- **Multi-modal Interaction**: Combines voice and image generation in a single interface
|
||||
|
||||
## Setup
|
||||
|
||||
1. Install Python 3.9+ and create a virtual environment:
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Create a `.env` file with the following:
|
||||
```
|
||||
HF_TOKEN=your_huggingface_api_key
|
||||
MODE=UI # Use 'UI' for Gradio interface, leave blank for HTML interface
|
||||
```
|
||||
|
||||
## Running the App
|
||||
|
||||
### With Gradio UI (Recommended)
|
||||
|
||||
```bash
|
||||
MODE=UI python app.py
|
||||
```
|
||||
|
||||
This launches a Gradio UI at http://localhost:7860 with:
|
||||
- FastRTC's built-in streaming audio components
|
||||
- A chat interface showing the conversation
|
||||
- An image display panel for generated images
|
||||
|
||||
## How to Use
|
||||
|
||||
1. Click the microphone button to start streaming your voice.
|
||||
2. Speak naturally - the app will automatically detect when you pause.
|
||||
3. Ask the agent to generate an image, for example:
|
||||
- "Create an image of a magical forest with glowing mushrooms."
|
||||
- "Generate a picture of a futuristic city with flying cars."
|
||||
4. View the generated image and hear the agent's response.
|
||||
|
||||
## Technical Architecture
|
||||
|
||||
### FastRTC Components
|
||||
|
||||
- **Stream**: Core component that handles WebRTC connections and audio streaming
|
||||
- **ReplyOnPause**: Detects when the user stops speaking to trigger a response
|
||||
- **get_stt_model/get_tts_model**: Provides optimized speech-to-text and text-to-speech models
|
||||
|
||||
### smolagents Components
|
||||
|
||||
- **CodeAgent**: Intelligent agent that can use tools based on natural language inputs
|
||||
- **Tool.from_space**: Integration with Hugging Face Spaces for image generation
|
||||
- **HfApiModel**: Connection to powerful language models for understanding requests (see the sketch below)
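
As a rough illustration of how these pieces could fit together (the space id, tool name, and default model below are illustrative assumptions, not necessarily what this demo ships with):

```python
from smolagents import CodeAgent, HfApiModel, Tool

# Hypothetical image-generation tool loaded from a Hugging Face Space;
# the space id and tool name are placeholders.
image_tool = Tool.from_space(
    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generate an image from a text prompt",
)

model = HfApiModel()  # hosted LLM the agent uses to plan tool calls
agent = CodeAgent(tools=[image_tool], model=model)

# The agent decides whether to call the tool based on the transcribed request.
result = agent.run("Create an image of a magical forest with glowing mushrooms.")
```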
|
||||
|
||||
### Integration Flow
|
||||
|
||||
1. FastRTC streams and processes audio input in real-time
|
||||
2. Speech is converted to text and passed to the smolagents CodeAgent
|
||||
3. The agent processes the request and calls tools when needed
|
||||
4. Responses and generated images are streamed back through FastRTC
|
||||
5. The UI updates to show both text responses and generated images (a simplified sketch of this loop follows)
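
A simplified sketch of that loop, mirroring the helpers configured in `app.py` below (error handling and the system prompt are omitted):

```python
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel

stt_model = get_stt_model()
tts_model = get_tts_model()
agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=HfApiModel())

def reply(audio):
    # Steps 1-2: transcribe the user's speech once a pause is detected
    text = stt_model.stt(audio)
    # Step 3: hand the transcript to the smolagents agent
    answer = agent.run(text)
    # Steps 4-5: stream the spoken answer back over the same connection
    yield from tts_model.stream_tts_sync(answer)

stream = Stream(handler=ReplyOnPause(reply), modality="audio", mode="send-receive")

if __name__ == "__main__":
    stream.ui.launch()
```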
|
||||
|
||||
## Advanced Features
|
||||
|
||||
- Conversation history is maintained across exchanges
|
||||
- Error handling ensures the app continues working even if agent processing fails
|
||||
- The application leverages FastRTC's streaming capabilities for efficient audio transmission
|
||||
99
demo/talk_to_smolagents/app.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_tts_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Initialize file paths
|
||||
curr_dir = Path(__file__).parent
|
||||
|
||||
# Initialize models
|
||||
stt_model = get_stt_model()
|
||||
tts_model = get_tts_model()
|
||||
|
||||
# Conversation state to maintain history
|
||||
conversation_state: List[Dict[str, str]] = []
|
||||
|
||||
# System prompt for agent
|
||||
system_prompt = """You are a helpful assistant that can helps with finding places to
|
||||
work remotely from. You should specifically check the reviews and ratings of the
|
||||
place. You should use these criteria to find the best place to work from:
|
||||
- Price
|
||||
- Reviews
|
||||
- Ratings
|
||||
- Location
|
||||
- WIFI
|
||||
Only return the name and address of the place, plus a short description.
|
||||
Always search for real places.
|
||||
Only return real places, not fake ones.
|
||||
If you receive anything other than a location, you should ask for a location.
|
||||
<example>
|
||||
User: I am in Paris, France. Can you find me a place to work from?
|
||||
Assistant: I found a place called "Le Café de la Paix" at 123 Rue de la Paix,
|
||||
Paris, France. It has good reviews and is in a great location.
|
||||
</example>
|
||||
<example>
|
||||
User: I am in London, UK. Can you find me a place to work from?
|
||||
Assistant: I found a place called "The London Coffee Company".
|
||||
</example>
|
||||
<example>
|
||||
User: How many people are in the room?
|
||||
Assistant: I only respond to requests about finding places to work from.
|
||||
</example>
|
||||
|
||||
"""
|
||||
|
||||
model = HfApiModel(provider="together", model="Qwen/Qwen2.5-Coder-32B-Instruct")
|
||||
|
||||
agent = CodeAgent(
|
||||
tools=[
|
||||
DuckDuckGoSearchTool(),
|
||||
],
|
||||
model=model,
|
||||
max_steps=10,
|
||||
verbosity_level=2,
|
||||
description="Search the web for cafes to work from.",
|
||||
)
|
||||
|
||||
|
||||
def process_response(audio):
|
||||
"""Process audio input and generate LLM response with TTS"""
|
||||
# Convert speech to text using STT model
|
||||
text = stt_model.stt(audio)
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
input_text = f"{system_prompt}\n\n{text}"
|
||||
# Get response from agent
|
||||
response_content = agent.run(input_text)
|
||||
|
||||
# Convert response to audio using TTS model
|
||||
for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
|
||||
# Yield the audio chunk
|
||||
yield audio_chunk
|
||||
|
||||
|
||||
stream = Stream(
|
||||
handler=ReplyOnPause(process_response, input_sample_rate=16000),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
ui_args={
|
||||
"pulse_color": "rgb(255, 255, 255)",
|
||||
"icon_button_color": "rgb(255, 255, 255)",
|
||||
"title": "🧑💻The Coworking Agent",
|
||||
},
|
||||
rtc_configuration=get_twilio_turn_credentials(),
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
stream.ui.launch(server_port=7860)
|
||||
136
demo/talk_to_smolagents/requirements.txt
Normal file
@@ -0,0 +1,136 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv export --format requirements-txt --no-hashes
|
||||
aiofiles==23.2.1
|
||||
aiohappyeyeballs==2.4.6
|
||||
aiohttp==3.11.13
|
||||
aiohttp-retry==2.9.1
|
||||
aioice==0.9.0
|
||||
aiortc==1.10.1
|
||||
aiosignal==1.3.2
|
||||
annotated-types==0.7.0
|
||||
anyio==4.8.0
|
||||
async-timeout==5.0.1 ; python_full_version < '3.11'
|
||||
attrs==25.1.0
|
||||
audioop-lts==0.2.1 ; python_full_version >= '3.13'
|
||||
audioread==3.0.1
|
||||
av==13.1.0
|
||||
babel==2.17.0
|
||||
beautifulsoup4==4.13.3
|
||||
certifi==2025.1.31
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.1
|
||||
click==8.1.8
|
||||
colorama==0.4.6
|
||||
coloredlogs==15.0.1
|
||||
colorlog==6.9.0
|
||||
cryptography==44.0.1
|
||||
csvw==3.5.1
|
||||
decorator==5.2.1
|
||||
dlinfo==2.0.0
|
||||
dnspython==2.7.0
|
||||
duckduckgo-search==7.5.0
|
||||
espeakng-loader==0.2.4
|
||||
exceptiongroup==1.2.2 ; python_full_version < '3.11'
|
||||
fastapi==0.115.8
|
||||
fastrtc==0.0.8.post1
|
||||
fastrtc-moonshine-onnx==20241016
|
||||
ffmpy==0.5.0
|
||||
filelock==3.17.0
|
||||
flatbuffers==25.2.10
|
||||
frozenlist==1.5.0
|
||||
fsspec==2025.2.0
|
||||
google-crc32c==1.6.0
|
||||
gradio==5.19.0
|
||||
gradio-client==1.7.2
|
||||
h11==0.14.0
|
||||
httpcore==1.0.7
|
||||
httpx==0.28.1
|
||||
huggingface-hub==0.29.1
|
||||
humanfriendly==10.0
|
||||
idna==3.10
|
||||
ifaddr==0.2.0
|
||||
isodate==0.7.2
|
||||
jinja2==3.1.5
|
||||
joblib==1.4.2
|
||||
jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
kokoro-onnx==0.4.3
|
||||
language-tags==1.2.0
|
||||
lazy-loader==0.4
|
||||
librosa==0.10.2.post1
|
||||
llvmlite==0.44.0
|
||||
lxml==5.3.1
|
||||
markdown-it-py==3.0.0
|
||||
markdownify==1.0.0
|
||||
markupsafe==2.1.5
|
||||
mdurl==0.1.2
|
||||
mpmath==1.3.0
|
||||
msgpack==1.1.0
|
||||
multidict==6.1.0
|
||||
numba==0.61.0
|
||||
numpy==2.1.3
|
||||
onnxruntime==1.20.1
|
||||
orjson==3.10.15
|
||||
packaging==24.2
|
||||
pandas==2.2.3
|
||||
phonemizer-fork==3.3.1
|
||||
pillow==11.1.0
|
||||
platformdirs==4.3.6
|
||||
pooch==1.8.2
|
||||
primp==0.14.0
|
||||
propcache==0.3.0
|
||||
protobuf==5.29.3
|
||||
pycparser==2.22
|
||||
pydantic==2.10.6
|
||||
pydantic-core==2.27.2
|
||||
pydub==0.25.1
|
||||
pyee==12.1.1
|
||||
pygments==2.19.1
|
||||
pyjwt==2.10.1
|
||||
pylibsrtp==0.11.0
|
||||
pyopenssl==25.0.0
|
||||
pyparsing==3.2.1
|
||||
pyreadline3==3.5.4 ; sys_platform == 'win32'
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.20
|
||||
pytz==2025.1
|
||||
pyyaml==6.0.2
|
||||
rdflib==7.1.3
|
||||
referencing==0.36.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.3
|
||||
rfc3986==1.5.0
|
||||
rich==13.9.4
|
||||
rpds-py==0.23.1
|
||||
ruff==0.9.7 ; sys_platform != 'emscripten'
|
||||
safehttpx==0.1.6
|
||||
scikit-learn==1.6.1
|
||||
scipy==1.15.2
|
||||
segments==2.3.0
|
||||
semantic-version==2.10.0
|
||||
shellingham==1.5.4 ; sys_platform != 'emscripten'
|
||||
six==1.17.0
|
||||
smolagents==1.9.2
|
||||
sniffio==1.3.1
|
||||
soundfile==0.13.1
|
||||
soupsieve==2.6
|
||||
soxr==0.5.0.post1
|
||||
standard-aifc==3.13.0 ; python_full_version >= '3.13'
|
||||
standard-chunk==3.13.0 ; python_full_version >= '3.13'
|
||||
standard-sunau==3.13.0 ; python_full_version >= '3.13'
|
||||
starlette==0.45.3
|
||||
sympy==1.13.3
|
||||
threadpoolctl==3.5.0
|
||||
tokenizers==0.21.0
|
||||
tomlkit==0.13.2
|
||||
tqdm==4.67.1
|
||||
twilio==9.4.6
|
||||
typer==0.15.1 ; sys_platform != 'emscripten'
|
||||
typing-extensions==4.12.2
|
||||
tzdata==2025.1
|
||||
uritemplate==4.1.1
|
||||
urllib3==2.3.0
|
||||
uvicorn==0.34.0 ; sys_platform != 'emscripten'
|
||||
websockets==15.0
|
||||
yarl==1.18.3
|
||||
BIN
demo/video.mp4
Normal file
19
demo/voice_text_editor/README.md
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
title: Voice Text Editor
|
||||
emoji: 📝
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Edit text documents with your voice!
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|SAMBANOVA_API_KEY]
|
||||
---
|
||||
|
||||
# Voice Text Editor
|
||||
|
||||
Edit text documents with your voice!
|
||||
|
||||
|
||||
113
demo/voice_text_editor/app.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
|
||||
from openai import OpenAI
|
||||
|
||||
load_dotenv()
|
||||
|
||||
sambanova_client = OpenAI(
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
|
||||
)
|
||||
stt_model = get_stt_model()
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.
|
||||
|
||||
For each interaction:
|
||||
1. You will receive the current state of a text document and a voice input from the user.
|
||||
2. Determine if the input is:
|
||||
a) A command to modify the document (e.g., "delete the last line", "capitalize that")
|
||||
b) Content to be added to the document (e.g., "buy 12 eggs at the store")
|
||||
c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
|
||||
3. Return ONLY the new document state after the changes have been applied.
|
||||
|
||||
Example:
|
||||
|
||||
CURRENT DOCUMENT:
|
||||
|
||||
|
||||
Meeting notes:
|
||||
- Buy GPUs
|
||||
- Meet with Joe
|
||||
|
||||
USER INPUT: Make that 100 GPUS
|
||||
|
||||
NEW DOCUMENT STATE:
|
||||
|
||||
Meeting notes:
|
||||
- Buy 100 GPUs
|
||||
- Meet with Joe
|
||||
|
||||
Example 2:
|
||||
|
||||
CURRENT DOCUMENT:
|
||||
|
||||
Project Proposal
|
||||
|
||||
USER INPUT: Make that a header
|
||||
|
||||
NEW DOCUMENT STATE:
|
||||
|
||||
# Project Proposal
|
||||
|
||||
When handling commands:
|
||||
- Apply the requested changes precisely to the document
|
||||
- Support operations like adding, deleting, modifying, and moving text
|
||||
- Understand contextual references like "that", "the last line", "the second paragraph"
|
||||
|
||||
When handling content additions:
|
||||
- Add the new text at the appropriate location (usually at the end or cursor position)
|
||||
- Format it appropriately based on the document context
|
||||
- If the user says to "add" or "insert" do not remove text that was already in the document.
|
||||
|
||||
When handling content modifications:
|
||||
- Identify what part of the document the user is referring to
|
||||
- Apply the requested change while preserving the rest of the content
|
||||
- Be smart about contextual references (e.g., "make that 24" should know to replace a number)
|
||||
|
||||
NEVER include any text in the new document state that is not part of the user's input.
|
||||
NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
|
||||
NEVER reword the user's input unless you are explicitly asked to do so.
|
||||
"""
|
||||
|
||||
|
||||
def edit(audio, current_document: str):
|
||||
prompt = stt_model.stt(audio)
|
||||
print(f"Prompt: {prompt}")
|
||||
response = sambanova_client.chat.completions.create(
|
||||
model="Meta-Llama-3.3-70B-Instruct",
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}",
|
||||
},
|
||||
],
|
||||
max_tokens=200,
|
||||
)
|
||||
doc = response.choices[0].message.content
|
||||
yield AdditionalOutputs(doc)
|
||||
|
||||
|
||||
doc = gr.Textbox(value="", label="Current Document")
|
||||
|
||||
|
||||
stream = Stream(
|
||||
ReplyOnPause(edit),
|
||||
modality="audio",
|
||||
mode="send",
|
||||
additional_inputs=[doc],
|
||||
additional_outputs=[doc],
|
||||
additional_outputs_handler=lambda prev, current: current,
|
||||
ui_args={"title": "Voice Text Editor with FastRTC 🗣️"},
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
stream.ui.launch(server_port=7860)
|
||||
126
demo/voice_text_editor_local/app.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
|
||||
|
||||
load_dotenv()
|
||||
|
||||
stt_model = get_stt_model()
|
||||
|
||||
SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.
|
||||
|
||||
For each interaction:
|
||||
1. You will receive the current state of a text document and a voice input from the user.
|
||||
2. Determine if the input is:
|
||||
a) A command to modify the document (e.g., "delete the last line", "capitalize that")
|
||||
b) Content to be added to the document (e.g., "buy 12 eggs at the store")
|
||||
c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
|
||||
3. Return ONLY the new document state after the changes have been applied.
|
||||
|
||||
Example:
|
||||
|
||||
CURRENT DOCUMENT:
|
||||
|
||||
Meeting notes:
|
||||
- Buy GPUs
|
||||
- Meet with Joe
|
||||
|
||||
USER INPUT: Make that 100 GPUS
|
||||
|
||||
NEW DOCUMENT STATE:
|
||||
|
||||
Meeting notes:
|
||||
- Buy 100 GPUs
|
||||
- Meet with Joe
|
||||
|
||||
Example 2:
|
||||
|
||||
CURRENT DOCUMENT:
|
||||
|
||||
Project Proposal
|
||||
|
||||
USER INPUT: Make that a header
|
||||
|
||||
NEW DOCUMENT STATE:
|
||||
|
||||
# Project Proposal
|
||||
|
||||
When handling commands:
|
||||
- Apply the requested changes precisely to the document
|
||||
- Support operations like adding, deleting, modifying, and moving text
|
||||
- Understand contextual references like "that", "the last line", "the second paragraph"
|
||||
|
||||
When handling content additions:
|
||||
- Add the new text at the appropriate location (usually at the end or cursor position)
|
||||
- Format it appropriately based on the document context
|
||||
- If the user says to "add" or "insert" do not remove text that was already in the document.
|
||||
|
||||
When handling content modifications:
|
||||
- Identify what part of the document the user is referring to
|
||||
- Apply the requested change while preserving the rest of the content
|
||||
- Be smart about contextual references (e.g., "make that 24" should know to replace a number)
|
||||
|
||||
NEVER include any text in the new document state that is not part of the user's input.
|
||||
NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
|
||||
NEVER reword the user's input unless you are explicitly asked to do so.
|
||||
"""
|
||||
|
||||
|
||||
def edit(audio, current_document: str):
|
||||
prompt = stt_model.stt(audio)
|
||||
print(f"Prompt: {prompt}")
|
||||
|
||||
# Construct the prompt for ollama
|
||||
full_prompt = (
|
||||
f"{SYSTEM_PROMPT}\n\n"
|
||||
f"User: CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}\n\n"
|
||||
f"Assistant:"
|
||||
)
|
||||
|
||||
try:
|
||||
# Send request to ollama's API
|
||||
response = requests.post(
|
||||
"http://localhost:11434/api/generate",
|
||||
json={
|
||||
"model": "qwen2.5",
|
||||
"prompt": full_prompt,
|
||||
"stream": False,
|
||||
"max_tokens": 200,
|
||||
},
|
||||
)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
|
||||
# Parse the response
|
||||
doc = response.json()["response"]
|
||||
# Clean up the response to remove "Assistant:" and any extra whitespace
|
||||
        doc = doc.strip().removeprefix("Assistant:").strip()
|
||||
yield AdditionalOutputs(doc)
|
||||
|
||||
except requests.RequestException as e:
|
||||
# Handle API errors gracefully
|
||||
error_message = "Error: Could not connect to ollama. Please ensure it's running and qwen2.5 is loaded."
|
||||
print(f"API Error: {e}")
|
||||
yield AdditionalOutputs(error_message)
|
||||
|
||||
|
||||
doc = gr.Textbox(value="", label="Current Document")
|
||||
|
||||
stream = Stream(
|
||||
ReplyOnPause(edit),
|
||||
modality="audio",
|
||||
mode="send",
|
||||
additional_inputs=[doc],
|
||||
additional_outputs=[doc],
|
||||
additional_outputs_handler=lambda prev, current: current,
|
||||
ui_args={"title": "Voice Text Editor with FastRTC 🗣️"},
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
stream.ui.launch(server_port=7860)
|
||||
15
demo/webrtc_vs_websocket/README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Webrtc Vs Websocket
|
||||
emoji: 🧪
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.16.0
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Compare Round Trip Times between WebRTC and Websockets
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|ELEVENLABS_API_KEY, secret|GROQ_API_KEY, secret|ANTHROPIC_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
147
demo/webrtc_vs_websocket/app.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from elevenlabs import ElevenLabs
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_twilio_turn_credentials
|
||||
from fastrtc.utils import aggregate_bytes_to_16bit, audio_to_bytes
|
||||
from gradio.utils import get_space
|
||||
from groq import Groq
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Configure the root logger to WARNING to suppress debug messages from other libraries
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
|
||||
# Create a file handler for debug logs
|
||||
console_handler = logging.FileHandler("gradio_webrtc.log")
|
||||
console_handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Create a formatter
|
||||
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
||||
console_handler.setFormatter(formatter)
|
||||
|
||||
# Configure the fastrtc logger to write debug output to the file handler
|
||||
logger = logging.getLogger("fastrtc")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
groq_client = Groq()
|
||||
claude_client = anthropic.Anthropic()
|
||||
tts_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
|
||||
|
||||
curr_dir = Path(__file__).parent
|
||||
|
||||
|
||||
def response(
|
||||
audio: tuple[int, np.ndarray],
|
||||
chatbot: list[dict] | None = None,
|
||||
):
|
||||
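    """Transcribe the user's turn, get a Claude reply, and stream it back as ElevenLabs TTS audio."""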
chatbot = chatbot or []
|
||||
messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
|
||||
prompt = groq_client.audio.transcriptions.create(
|
||||
file=("audio-file.mp3", audio_to_bytes(audio)),
|
||||
model="whisper-large-v3-turbo",
|
||||
response_format="verbose_json",
|
||||
).text
|
||||
print("prompt", prompt)
|
||||
chatbot.append({"role": "user", "content": prompt})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
response = claude_client.messages.create(
|
||||
model="claude-3-5-haiku-20241022",
|
||||
max_tokens=512,
|
||||
messages=messages, # type: ignore
|
||||
)
|
||||
response_text = " ".join(
|
||||
block.text # type: ignore
|
||||
for block in response.content
|
||||
if getattr(block, "type", None) == "text"
|
||||
)
|
||||
chatbot.append({"role": "assistant", "content": response_text})
|
||||
yield AdditionalOutputs(chatbot)
|
||||
iterator = tts_client.text_to_speech.convert_as_stream(
|
||||
text=response_text,
|
||||
voice_id="JBFqnCBsd6RMkjVDRZzb",
|
||||
model_id="eleven_multilingual_v2",
|
||||
output_format="pcm_24000",
|
||||
)
|
||||
for chunk in aggregate_bytes_to_16bit(iterator):
|
||||
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
|
||||
yield (24000, audio_array, "mono")
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages")
|
||||
stream = Stream(
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
handler=ReplyOnPause(response),
|
||||
additional_outputs_handler=lambda a, b: b,
|
||||
additional_inputs=[chatbot],
|
||||
additional_outputs=[chatbot],
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
concurrency_limit=20 if get_space() else None,
|
||||
)
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class InputData(BaseModel):
|
||||
webrtc_id: str
|
||||
chatbot: list[Message]
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = get_twilio_turn_credentials() if get_space() else None
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content, status_code=200)
|
||||
|
||||
|
||||
@app.post("/input_hook")
|
||||
async def _(body: InputData):
|
||||
stream.set_input(body.webrtc_id, body.model_dump()["chatbot"])
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.get("/outputs")
|
||||
def _(webrtc_id: str):
|
||||
print("outputs", webrtc_id)
|
||||
|
||||
async def output_stream():
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
chatbot = output.args[0]
|
||||
yield f"event: output\ndata: {json.dumps(chatbot[-2])}\n\n"
|
||||
yield f"event: output\ndata: {json.dumps(chatbot[-1])}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860, server_name="0.0.0.0")
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
630
demo/webrtc_vs_websocket/index.html
Normal file
@@ -0,0 +1,630 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>WebRTC vs WebSocket Benchmark</title>
|
||||
<script src="https://cdn.jsdelivr.net/npm/alawmulaw"></script>
|
||||
<style>
|
||||
body {
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
.container {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 30px;
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.panel {
|
||||
background: white;
|
||||
border-radius: 12px;
|
||||
padding: 20px;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
height: 400px;
|
||||
overflow-y: auto;
|
||||
border: 1px solid #e0e0e0;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 10px;
|
||||
padding: 8px 12px;
|
||||
border-radius: 8px;
|
||||
max-width: 80%;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #e3f2fd;
|
||||
margin-left: auto;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
.metrics {
|
||||
margin-top: 15px;
|
||||
padding: 10px;
|
||||
background: #f8f9fa;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
.metric {
|
||||
margin: 5px 0;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
button {
|
||||
background-color: #1976d2;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 10px 20px;
|
||||
border-radius: 6px;
|
||||
cursor: pointer;
|
||||
font-size: 14px;
|
||||
transition: background-color 0.2s;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
background-color: #1565c0;
|
||||
}
|
||||
|
||||
button:disabled {
|
||||
background-color: #bdbdbd;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
h2 {
|
||||
margin-top: 0;
|
||||
color: #1976d2;
|
||||
}
|
||||
|
||||
.visualizer {
|
||||
width: 100%;
|
||||
height: 100px;
|
||||
margin: 10px 0;
|
||||
background: #fafafa;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
/* Add styles for disclaimer */
|
||||
.disclaimer {
|
||||
background-color: #fff3e0;
|
||||
padding: 15px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 20px;
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
max-width: 1400px;
|
||||
margin: 0 auto 20px auto;
|
||||
}
|
||||
|
||||
/* Update nav bar styles */
|
||||
.nav-bar {
|
||||
background-color: #f5f5f5;
|
||||
padding: 10px 20px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.nav-container {
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.nav-button {
|
||||
background-color: #1976d2;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 8px 16px;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
text-decoration: none;
|
||||
font-size: 14px;
|
||||
transition: background-color 0.2s;
|
||||
}
|
||||
|
||||
.nav-button:hover {
|
||||
background-color: #1565c0;
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<nav class="nav-bar">
|
||||
<div class="nav-container">
|
||||
<a href="./webrtc/docs" class="nav-button">WebRTC Docs</a>
|
||||
<a href="./websocket/docs" class="nav-button">WebSocket Docs</a>
|
||||
<a href="./telephone/docs" class="nav-button">Telephone Docs</a>
|
||||
<a href="./ui" class="nav-button">UI</a>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<div class="disclaimer">
|
||||
This page compares the WebRTC round-trip time (RTT) calculated from <code>getStats()</code> to the time taken to
|
||||
complete a ping/pong exchange over WebSockets. It may not be a gold-standard benchmark. Both WebRTC and
|
||||
WebSockets have their own merits, which is why FastRTC supports both. Artifacts in the WebSocket playback
|
||||
audio are due to gaps in my frontend processing code, not the FastRTC web server.
|
||||
</div>
|
||||
|
||||
<div class="container">
|
||||
<div class="panel">
|
||||
<h2>WebRTC Connection</h2>
|
||||
<div id="webrtc-chat" class="chat-container"></div>
|
||||
<div id="webrtc-metrics" class="metrics">
|
||||
<div class="metric">RTT (Round Trip Time): <span id="webrtc-rtt">-</span></div>
|
||||
</div>
|
||||
<button id="webrtc-button">Connect WebRTC</button>
|
||||
</div>
|
||||
|
||||
<div class="panel">
|
||||
<h2>WebSocket Connection</h2>
|
||||
<div id="ws-chat" class="chat-container"></div>
|
||||
<div id="ws-metrics" class="metrics">
|
||||
<div class="metric">RTT (Round Trip Time): <span id="ws-rtt">0</span></div>
|
||||
</div>
|
||||
<button id="ws-button">Connect WebSocket</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<audio id="webrtc-audio" style="display: none;"></audio>
|
||||
<audio id="ws-audio" style="display: none;"></audio>
|
||||
|
||||
<div id="error-toast" class="toast"></div>
|
||||
|
||||
<script>
|
||||
// Shared utilities
|
||||
function generateId() {
|
||||
return Math.random().toString(36).substring(7);
|
||||
}
|
||||
|
||||
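        // Returns a data-channel message handler that POSTs the current WebRTC chat history when the server requests input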
function sendInput(id) {
|
||||
|
||||
return function handleMessage(event) {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: id,
|
||||
chatbot: chatHistoryWebRTC
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let chatHistoryWebRTC = [];
|
||||
let chatHistoryWebSocket = [];
|
||||
|
||||
function addMessage(containerId, role, content) {
|
||||
const container = document.getElementById(containerId);
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
messageDiv.textContent = content;
|
||||
container.appendChild(messageDiv);
|
||||
container.scrollTop = container.scrollHeight;
|
||||
if (containerId === 'webrtc-chat') {
|
||||
chatHistoryWebRTC.push({ role, content });
|
||||
} else {
|
||||
chatHistoryWebSocket.push({ role, content });
|
||||
}
|
||||
}
|
||||
|
||||
// WebRTC Implementation
|
||||
let webrtcPeerConnection;
|
||||
|
||||
// Add this function to collect RTT stats
|
||||
async function updateWebRTCStats() {
|
||||
if (!webrtcPeerConnection) return;
|
||||
|
||||
const stats = await webrtcPeerConnection.getStats();
|
||||
stats.forEach(report => {
|
||||
if (report.type === 'candidate-pair' && report.state === 'succeeded') {
|
||||
const rtt = report.currentRoundTripTime * 1000; // Convert to ms
|
||||
document.getElementById('webrtc-rtt').textContent = `${rtt.toFixed(2)}ms`;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const button = document.getElementById('webrtc-button');
|
||||
button.textContent = "Stop";
|
||||
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
webrtcPeerConnection = new RTCPeerConnection(config);
|
||||
const webrtcId = generateId();
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
stream.getTracks().forEach(track => {
|
||||
webrtcPeerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
webrtcPeerConnection.addEventListener('track', (evt) => {
|
||||
const audio = document.getElementById('webrtc-audio');
|
||||
if (audio.srcObject !== evt.streams[0]) {
|
||||
audio.srcObject = evt.streams[0];
|
||||
audio.play();
|
||||
}
|
||||
});
|
||||
|
||||
const dataChannel = webrtcPeerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = sendInput(webrtcId);
|
||||
|
||||
const offer = await webrtcPeerConnection.createOffer();
|
||||
await webrtcPeerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (webrtcPeerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (webrtcPeerConnection.iceGatheringState === "complete") {
|
||||
webrtcPeerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
webrtcPeerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: webrtcPeerConnection.localDescription.sdp,
|
||||
type: webrtcPeerConnection.localDescription.type,
|
||||
webrtc_id: webrtcId
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
await webrtcPeerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
// Setup event source for messages
|
||||
const eventSource = new EventSource('/outputs?webrtc_id=' + webrtcId);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
addMessage('webrtc-chat', eventJson.role, eventJson.content);
|
||||
});
|
||||
|
||||
// Add periodic stats collection
|
||||
const statsInterval = setInterval(updateWebRTCStats, 1000);
|
||||
|
||||
// Store the interval ID on the connection
|
||||
webrtcPeerConnection.statsInterval = statsInterval;
|
||||
|
||||
webrtcPeerConnection.addEventListener('connectionstatechange', () => {
|
||||
if (webrtcPeerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
});
|
||||
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('WebRTC setup error:', err);
|
||||
}
|
||||
}
|
||||
|
||||
function webrtc_stop() {
|
||||
if (webrtcPeerConnection) {
|
||||
// Clear the stats interval
|
||||
if (webrtcPeerConnection.statsInterval) {
|
||||
clearInterval(webrtcPeerConnection.statsInterval);
|
||||
}
|
||||
|
||||
// Close all tracks
|
||||
webrtcPeerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track) {
|
||||
sender.track.stop();
|
||||
}
|
||||
});
|
||||
|
||||
webrtcPeerConnection.close();
|
||||
webrtcPeerConnection = null;
|
||||
|
||||
// Reset metrics display
|
||||
document.getElementById('webrtc-rtt').textContent = '-';
|
||||
}
|
||||
}
|
||||
|
||||
// WebSocket Implementation
|
||||
let webSocket;
|
||||
let wsMetrics = {
|
||||
pingStartTime: 0,
|
||||
rttValues: []
|
||||
};
|
||||
|
||||
// Load mu-law library
|
||||
|
||||
// Add load promise to track when the script is ready
|
||||
|
||||
|
||||
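        // Naive linear-interpolation resampler used to downsample microphone audio before mu-law encoding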
function resample(audioData, fromSampleRate, toSampleRate) {
|
||||
const ratio = fromSampleRate / toSampleRate;
|
||||
const newLength = Math.round(audioData.length / ratio);
|
||||
const result = new Float32Array(newLength);
|
||||
|
||||
for (let i = 0; i < newLength; i++) {
|
||||
const position = i * ratio;
|
||||
const index = Math.floor(position);
|
||||
const fraction = position - index;
|
||||
|
||||
if (index + 1 < audioData.length) {
|
||||
result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
|
||||
} else {
|
||||
result[i] = audioData[index];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
function convertToMulaw(audioData, sampleRate) {
|
||||
// Resample to 8000 Hz if needed
|
||||
if (sampleRate !== 8000) {
|
||||
audioData = resample(audioData, sampleRate, 8000);
|
||||
}
|
||||
|
||||
// Convert float32 [-1,1] to int16 [-32768,32767]
|
||||
const int16Data = new Int16Array(audioData.length);
|
||||
for (let i = 0; i < audioData.length; i++) {
|
||||
                int16Data[i] = Math.max(-32768, Math.min(32767, Math.round(audioData[i] * 32767)));  // clamp to the int16 range to avoid overflow at ±1.0
|
||||
}
|
||||
|
||||
// Convert to mu-law using the library
|
||||
return alawmulaw.mulaw.encode(int16Data);
|
||||
}
|
||||
|
||||
async function setupWebSocket() {
|
||||
const button = document.getElementById('ws-button');
|
||||
button.textContent = "Stop";
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
"echoCancellation": true,
|
||||
"noiseSuppression": { "exact": true },
|
||||
"autoGainControl": { "exact": true },
|
||||
"sampleRate": { "ideal": 24000 },
|
||||
"sampleSize": { "ideal": 16 },
|
||||
"channelCount": { "exact": 1 },
|
||||
}
|
||||
});
|
||||
const wsId = generateId();
|
||||
wsMetrics.startTime = performance.now();
|
||||
|
||||
// Create audio context and analyser for visualization
|
||||
const audioContext = new AudioContext();
|
||||
const analyser = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(analyser);
|
||||
|
||||
// Connect to websocket endpoint
|
||||
webSocket = new WebSocket(`${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/websocket/offer`);
|
||||
|
||||
webSocket.onopen = () => {
|
||||
// Send initial start message
|
||||
webSocket.send(JSON.stringify({
|
||||
event: "start",
|
||||
websocket_id: wsId
|
||||
}));
|
||||
|
||||
// Setup audio processing
|
||||
const processor = audioContext.createScriptProcessor(2048, 1, 1);
|
||||
source.connect(processor);
|
||||
processor.connect(audioContext.destination);
|
||||
|
||||
processor.onaudioprocess = (e) => {
|
||||
const inputData = e.inputBuffer.getChannelData(0);
|
||||
const mulawData = convertToMulaw(inputData, audioContext.sampleRate);
|
||||
const base64Audio = btoa(String.fromCharCode.apply(null, mulawData));
|
||||
if (webSocket.readyState === WebSocket.OPEN) {
|
||||
webSocket.send(JSON.stringify({
|
||||
event: "media",
|
||||
media: {
|
||||
payload: base64Audio
|
||||
}
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
// Add ping interval
|
||||
webSocket.pingInterval = setInterval(() => {
|
||||
wsMetrics.pingStartTime = performance.now();
|
||||
webSocket.send(JSON.stringify({
|
||||
event: "ping"
|
||||
}));
|
||||
}, 500);
|
||||
};
|
||||
|
||||
// Setup audio output context
|
||||
const outputContext = new AudioContext({ sampleRate: 24000 });
|
||||
const sampleRate = 24000; // Updated to match server sample rate
|
||||
let audioQueue = [];
|
||||
let isPlaying = false;
|
||||
|
||||
webSocket.onmessage = (event) => {
|
||||
const data = JSON.parse(event.data);
|
||||
if (data?.type === "send_input") {
|
||||
console.log("sending input")
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ webrtc_id: wsId, chatbot: chatHistoryWebSocket })
|
||||
});
|
||||
}
|
||||
if (data.event === "media") {
|
||||
// Process received audio
|
||||
const audioData = atob(data.media.payload);
|
||||
const mulawData = new Uint8Array(audioData.length);
|
||||
for (let i = 0; i < audioData.length; i++) {
|
||||
mulawData[i] = audioData.charCodeAt(i);
|
||||
}
|
||||
|
||||
// Convert mu-law to linear PCM
|
||||
const linearData = alawmulaw.mulaw.decode(mulawData);
|
||||
|
||||
// Create an AudioBuffer
|
||||
const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
|
||||
const channelData = audioBuffer.getChannelData(0);
|
||||
|
||||
// Fill the buffer with the decoded data
|
||||
for (let i = 0; i < linearData.length; i++) {
|
||||
channelData[i] = linearData[i] / 32768.0;
|
||||
}
|
||||
|
||||
// Queue the audio buffer
|
||||
audioQueue.push(audioBuffer);
|
||||
|
||||
// Start playing if not already playing
|
||||
if (!isPlaying) {
|
||||
playNextBuffer();
|
||||
}
|
||||
}
|
||||
|
||||
// Add pong handler
|
||||
if (data.event === "pong") {
|
||||
const rtt = performance.now() - wsMetrics.pingStartTime;
|
||||
wsMetrics.rttValues.push(rtt);
|
||||
// Keep only last 20 values for running mean
|
||||
if (wsMetrics.rttValues.length > 20) {
|
||||
wsMetrics.rttValues.shift();
|
||||
}
|
||||
const avgRtt = wsMetrics.rttValues.reduce((a, b) => a + b, 0) / wsMetrics.rttValues.length;
|
||||
document.getElementById('ws-rtt').textContent = `${avgRtt.toFixed(2)}ms`;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
function playNextBuffer() {
|
||||
if (audioQueue.length === 0) {
|
||||
isPlaying = false;
|
||||
return;
|
||||
}
|
||||
|
||||
isPlaying = true;
|
||||
const bufferSource = outputContext.createBufferSource();
|
||||
bufferSource.buffer = audioQueue.shift();
|
||||
bufferSource.connect(outputContext.destination);
|
||||
|
||||
bufferSource.onended = playNextBuffer;
|
||||
bufferSource.start();
|
||||
}
|
||||
|
||||
const eventSource = new EventSource('/outputs?webrtc_id=' + wsId);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
console.log("ws output", event);
|
||||
const eventJson = JSON.parse(event.data);
|
||||
addMessage('ws-chat', eventJson.role, eventJson.content);
|
||||
});
|
||||
|
||||
} catch (err) {
|
||||
console.error('WebSocket setup error:', err);
|
||||
button.disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
function ws_stop() {
|
||||
if (webSocket) {
|
||||
webSocket.send(JSON.stringify({
|
||||
event: "stop"
|
||||
}));
|
||||
// Clear ping interval
|
||||
if (webSocket.pingInterval) {
|
||||
clearInterval(webSocket.pingInterval);
|
||||
}
|
||||
// Reset RTT display
|
||||
document.getElementById('ws-rtt').textContent = '-';
|
||||
wsMetrics.rttValues = [];
|
||||
|
||||
// Clear the stats interval
|
||||
if (webSocket.statsInterval) {
|
||||
clearInterval(webSocket.statsInterval);
|
||||
}
|
||||
webSocket.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Event Listeners
|
||||
document.getElementById('webrtc-button').addEventListener('click', () => {
|
||||
const button = document.getElementById('webrtc-button');
|
||||
if (button.textContent === 'Connect WebRTC') {
|
||||
setupWebRTC();
|
||||
} else {
|
||||
webrtc_stop();
|
||||
button.textContent = 'Connect WebRTC';
|
||||
}
|
||||
});
|
||||
const ws_start_button = document.getElementById('ws-button')
|
||||
ws_start_button.addEventListener('click', () => {
|
||||
if (ws_start_button.textContent === 'Connect WebSocket') {
|
||||
setupWebSocket();
|
||||
ws_start_button.textContent = 'Stop';
|
||||
} else {
|
||||
ws_stop();
|
||||
ws_start_button.textContent = 'Connect WebSocket';
|
||||
}
|
||||
});
|
||||
document.addEventListener("beforeunload", () => {
|
||||
ws_stop();
|
||||
webrtc_stop();
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
6
demo/webrtc_vs_websocket/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
fastrtc[vad]
|
||||
elevenlabs
|
||||
groq
|
||||
anthropic
|
||||
twilio
|
||||
python-dotenv
|
||||