Merge remote-tracking branch 'origin/main' into open-avatar-chat-0.4.0

bingochaos
2025-06-17 20:39:40 +08:00
142 changed files with 117010 additions and 814 deletions

View File

@@ -4,12 +4,12 @@ emoji: ♊️
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.16.0
sdk_version: 5.25.2
app_file: app.py
pinned: false
license: mit
short_description: Gemini understands audio and video!
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -6,12 +6,14 @@ from io import BytesIO
import gradio as gr
import numpy as np
import websockets
from dotenv import load_dotenv
from fastrtc import (
AsyncAudioVideoStreamHandler,
Stream,
WebRTC,
get_twilio_turn_credentials,
get_cloudflare_turn_credentials_async,
wait_for_item,
)
from google import genai
from gradio.utils import get_space
@@ -61,18 +63,24 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
)
config = {"response_modalities": ["AUDIO"]}
async with client.aio.live.connect(
model="gemini-2.0-flash-exp", config=config
model="gemini-2.0-flash-exp",
config=config, # type: ignore
) as session:
self.session = session
print("set session")
while not self.quit.is_set():
turn = self.session.receive()
async for response in turn:
if data := response.data:
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
try:
async for response in turn:
if data := response.data:
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
self.audio_queue.put_nowait(audio)
except websockets.exceptions.ConnectionClosedOK:
print("connection closed")
break
async def video_receive(self, frame: np.ndarray):
self.video_queue.put_nowait(frame)
if self.session:
# send image every 1 second
print(time.time() - self.last_frame_time)
@@ -82,10 +90,12 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
if self.latest_args[1] is not None:
await self.session.send(input=encode_image(self.latest_args[1]))
self.video_queue.put_nowait(frame)
async def video_emit(self):
return await self.video_queue.get()
frame = await wait_for_item(self.video_queue, 0.01)
if frame is not None:
return frame
else:
return np.zeros((100, 100, 3), dtype=np.uint8)
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
_, array = frame
@@ -95,13 +105,15 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
await self.session.send(input=audio_message)
async def emit(self):
array = await self.audio_queue.get()
return (self.output_sample_rate, array)
array = await wait_for_item(self.audio_queue, 0.01)
if array is not None:
return (self.output_sample_rate, array)
return array
async def shutdown(self) -> None:
if self.session:
self.quit.set()
await self.session._websocket.close()
await self.session.close()
self.quit.clear()
@@ -109,10 +121,8 @@ stream = Stream(
handler=GeminiHandler(),
modality="audio-video",
mode="send-receive",
rtc_configuration=get_twilio_turn_credentials()
if get_space() == "spaces"
else None,
time_limit=90 if get_space() else None,
rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
time_limit=180 if get_space() else None,
additional_inputs=[
gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
],
@@ -151,8 +161,8 @@ with gr.Blocks(css=css) as demo:
modality="audio-video",
mode="send-receive",
elem_id="video-source",
rtc_configuration=get_twilio_turn_credentials()
if get_space() == "spaces"
rtc_configuration=get_cloudflare_turn_credentials_async
if get_space()
else None,
icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
pulse_color="rgb(255, 255, 255)",
@@ -167,7 +177,7 @@ with gr.Blocks(css=css) as demo:
GeminiHandler(),
inputs=[webrtc, image_input],
outputs=[webrtc],
time_limit=60 if get_space() else None,
time_limit=180 if get_space() else None,
concurrency_limit=2 if get_space() else None,
)
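These hunks (and several later ones in this commit) replace get_twilio_turn_credentials() with fastrtc's Cloudflare helper and raise the Spaces time limit to 180 s. A minimal sketch of the two ways the new helper is consumed, assuming only what the diffs show (the secret|HF_TOKEN tag added to the README above is presumably what the helper authenticates with):

import asyncio

from fastrtc import get_cloudflare_turn_credentials_async


async def main():
    # Awaited form: resolve the TURN credentials up front, e.g. to inline them
    # into an HTML page (used later in this commit by the FastAPI demos).
    rtc_config = await get_cloudflare_turn_credentials_async()
    print(rtc_config)


# Callable form: hand the coroutine function itself to Stream/WebRTC, as the
# hunk above does with rtc_configuration=get_cloudflare_turn_credentials_async.

if __name__ == "__main__":
    asyncio.run(main())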

View File

@@ -1,4 +1,4 @@
fastrtc
fastrtc==0.0.23.rc1
python-dotenv
google-genai
twilio

View File

@@ -1,8 +1,8 @@
import asyncio
import base64
import os
from collections.abc import AsyncGenerator
from pathlib import Path
from typing import AsyncGenerator
import librosa
import numpy as np
@@ -190,13 +190,13 @@ if __name__ == "__main__":
gr.HTML(
"""
<div style="display: flex; justify-content: center; align-items: center;">
<h1>Gemini Conversation</h1>
<h1>Gemini Conversation</h1>
</div>
"""
)
gr.Markdown(
"""# How to run this demo
- Clone the repo - at the top right of the page, click the vertical three dots and select "Clone repository"
- Open the repo in a terminal and install the dependencies
- Get a gemini API key [here](https://ai.google.dev/gemini-api/docs/api-key)

View File

@@ -0,0 +1,19 @@
---
title: Integrated Text Box
emoji: 📝
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.31.0
app_file: app.py
pinned: false
license: mit
short_description: Talk or type to ANY LLM!
tags: [webrtc, websocket, gradio, secret|HF_TOKEN]
---
# Integrated Textbox
Talk or type to ANY LLM!

View File

@@ -0,0 +1,143 @@
# /// script
# dependencies = [
# "fastrtc[vad, stt]">=0.0.26",
# "openai",
# ]
# ///
import gradio as gr
import huggingface_hub
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
WebRTC,
WebRTCData,
WebRTCError,
get_hf_turn_credentials,
get_stt_model,
)
from gradio.utils import get_space
from openai import OpenAI
stt_model = get_stt_model()
conversations = {}
def response(
data: WebRTCData,
conversation: list[dict],
token: str | None = None,
model: str = "meta-llama/Llama-3.2-3B-Instruct",
provider: str = "sambanova",
):
print("conversation before", conversation)
if not provider.startswith("http") and not token:
raise WebRTCError("Please add your HF token.")
if data.audio is not None and data.audio[1].size > 0:
user_audio_text = stt_model.stt(data.audio)
conversation.append({"role": "user", "content": user_audio_text})
else:
conversation.append({"role": "user", "content": data.textbox})
yield AdditionalOutputs(conversation)
if provider.startswith("http"):
client = OpenAI(base_url=provider, api_key="ollama")
else:
client = huggingface_hub.InferenceClient(
api_key=token,
provider=provider, # type: ignore
)
request = client.chat.completions.create(
model=model,
messages=conversation, # type: ignore
temperature=1,
top_p=0.1,
)
response = {"role": "assistant", "content": request.choices[0].message.content}
conversation.append(response)
print("conversation after", conversation)
yield AdditionalOutputs(conversation)
css = """
footer {
display: none !important;
}
"""
providers = [
"black-forest-labs",
"cerebras",
"cohere",
"fal-ai",
"fireworks-ai",
"hf-inference",
"hyperbolic",
"nebius",
"novita",
"openai",
"replicate",
"sambanova",
"together",
]
def hide_token(provider: str):
if provider.startswith("http"):
return gr.Textbox(visible=False)
return gr.skip()
with gr.Blocks(css=css) as demo:
gr.HTML(
"""
<h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
<img src="https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/AV_Huggy.png" alt="Streaming Huggy" style="height: 50px; margin-right: 10px"> FastRTC Chat
</h1>
"""
)
with gr.Sidebar():
token = gr.Textbox(
placeholder="Place your HF token here", type="password", label="HF Token"
)
model = gr.Dropdown(
choices=["meta-llama/Llama-3.2-3B-Instruct"],
allow_custom_value=True,
label="Model",
)
provider = gr.Dropdown(
label="Provider",
choices=providers,
value="sambanova",
info="Select a hf-compatible provider or type the url of your server, e.g. http://127.0.0.1:11434/v1 for ollama",
allow_custom_value=True,
)
provider.change(hide_token, inputs=[provider], outputs=[token])
cb = gr.Chatbot(type="messages", height=600)
webrtc = WebRTC(
modality="audio",
mode="send",
variant="textbox",
rtc_configuration=get_hf_turn_credentials if get_space() else None,
server_rtc_configuration=get_hf_turn_credentials(ttl=3_600 * 24 * 30)
if get_space()
else None,
)
webrtc.stream(
ReplyOnPause(response), # type: ignore
inputs=[webrtc, cb, token, model, provider],
outputs=[cb],
concurrency_limit=100,
)
webrtc.on_additional_outputs(
lambda old, new: new, inputs=[cb], outputs=[cb], concurrency_limit=100
)
if __name__ == "__main__":
demo.launch(server_port=7860)
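In response() above, a provider that starts with http is treated as a raw base URL, so the textbox demo can talk to any OpenAI-compatible server instead of a Hugging Face provider. A minimal sketch of that branch on its own, using the local Ollama URL from the dropdown's info text; the model name is a hypothetical placeholder:

from openai import OpenAI

# Point the standard OpenAI client at a local OpenAI-compatible server.
client = OpenAI(base_url="http://127.0.0.1:11434/v1", api_key="ollama")
completion = client.chat.completions.create(
    model="llama3.2",  # placeholder: whichever model the local server serves
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)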

View File

@@ -0,0 +1,2 @@
fastrtc[vad, stt]
openai

View File

@@ -1,5 +1,6 @@
from functools import lru_cache
from typing import Generator, Literal
from collections.abc import Generator
from functools import cache
from typing import Literal
import gradio as gr
import numpy as np
@@ -17,7 +18,7 @@ from numpy.typing import NDArray
load_dotenv()
@lru_cache(maxsize=None)
@cache
def load_moonshine(
model_name: Literal["moonshine/base", "moonshine/tiny"],
) -> MoonshineOnnxModel:

View File

@@ -1,6 +1,6 @@
import fastapi
from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
from fastrtc.utils import audio_to_bytes
from fastrtc.utils import audio_to_bytes, audio_to_float32
from openai import OpenAI
import logging
import time
@@ -78,8 +78,8 @@ def echo(audio):
)
for audio_chunk in audio_stream:
audio_array = (
np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
audio_array = audio_to_float32(
np.frombuffer(audio_chunk, dtype=np.int16)
)
yield (24000, audio_array)
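The hunk above swaps the hand-rolled int16 → float32 scaling for fastrtc's audio_to_float32 helper. A small sketch of the equivalence being relied on (that the helper scales int16 PCM into [-1, 1] is an assumption implied by the replacement, not something the diff states):

import numpy as np
from fastrtc.utils import audio_to_float32

chunk = (np.sin(np.linspace(0, 2 * np.pi, 480)) * 16000).astype(np.int16)  # fake PCM chunk
manual = chunk.astype(np.float32) / 32768.0   # conversion being removed
helper = audio_to_float32(chunk)              # conversion being added
print(np.max(np.abs(manual - helper)))        # expected to be ~0 under the assumption above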

demo/patient_intake/app.py (new file, 363 lines)
View File

@@ -0,0 +1,363 @@
import json
import os
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastrtc import (
AdditionalOutputs,
CloseStream,
ReplyOnPause,
Stream,
get_current_context,
get_stt_model,
get_tts_model,
)
from numpy.typing import NDArray
from openai import OpenAI
load_dotenv()
tts = get_tts_model()
stt = get_stt_model()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
conversations: dict[str, list[dict]] = {}
FUNCTIONS = [
{
"name": "verify_birthday",
"description": "Use this function to verify the user has provided their correct birthday.",
"parameters": {
"type": "object",
"properties": {
"birthday": {
"type": "string",
"description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.",
}
},
},
},
{
"name": "list_prescriptions",
"description": "Once the user has provided a list of their prescription medications, call this function.",
"parameters": {
"type": "object",
"properties": {
"prescriptions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"medication": {
"type": "string",
"description": "The medication's name",
},
"dosage": {
"type": "string",
"description": "The prescription's dosage",
},
},
},
}
},
},
},
{
"name": "list_allergies",
"description": "Once the user has provided a list of their allergies, call this function.",
"parameters": {
"type": "object",
"properties": {
"allergies": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "What the user is allergic to",
}
},
},
}
},
},
},
{
"name": "list_conditions",
"description": "Once the user has provided a list of their medical conditions, call this function.",
"parameters": {
"type": "object",
"properties": {
"conditions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The user's medical condition",
}
},
},
}
},
},
},
{
"name": "list_visit_reasons",
"description": "Once the user has provided a list of the reasons they are visiting a doctor today, call this function.",
"parameters": {
"type": "object",
"properties": {
"visit_reasons": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The user's reason for visiting the doctor",
}
},
},
}
},
},
},
]
def create_system_message():
system_message = [
{
"role": "system",
"content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Freddy. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function.",
}
]
return system_message
def start_up():
stream_id = get_current_context().webrtc_id
conversation = create_system_message()
response = client.chat.completions.create(
model="gpt-4o",
messages=conversation, # type: ignore
)
llm_response = response.choices[0].message.content
assert llm_response is not None
yield from tts.stream_tts_sync(llm_response)
llm_dict = {"role": "assistant", "content": llm_response}
yield AdditionalOutputs(llm_dict, conversation)
conversation.append(llm_dict)
conversations[stream_id] = conversation
def response(audio: tuple[int, NDArray[np.int16]]):
stream_id = get_current_context().webrtc_id
if stream_id not in conversations:
conversations[stream_id] = create_system_message()
message = stt.stt(audio)
print("message", message)
conversation = conversations[stream_id]
conversation.append({"role": "user", "content": message})
yield AdditionalOutputs({"role": "user", "content": message})
response = client.chat.completions.create(
model="gpt-4o",
messages=conversation, # type: ignore
functions=FUNCTIONS, # type: ignore
function_call="auto",
)
should_end = False
response_message = response.choices[0].message
if response_message.function_call:
function_name = response_message.function_call.name
function_args = json.loads(response_message.function_call.arguments)
yield AdditionalOutputs(
{
"role": "assistant",
"content": f"Function call: {function_name} with arguments: {function_args}",
}
)
if function_name == "verify_birthday":
if function_args.get("birthday") == "1983-01-01":
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Successfully verified birthday",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Success",
}
)
conversation.append(
{
"role": "system",
"content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions if they have any. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages. Once they have listed their prescriptions or confirmed they don't have any, call the list_prescriptions function.",
}
)
else:
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Failed to verify birthday",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Failed",
}
)
conversation.append(
{
"role": "system",
"content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function.",
}
)
elif function_name == "list_prescriptions":
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Successfully listed prescriptions",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Success",
}
)
conversation.append(
{
"role": "system",
"content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function.",
}
)
elif function_name == "list_allergies":
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Successfully listed allergies",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Success",
}
)
conversation.append(
{
"role": "system",
"content": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function.",
}
)
elif function_name == "list_conditions":
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Successfully listed conditions",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Success",
}
)
conversation.append(
{
"role": "system",
"content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function.",
}
)
elif function_name == "list_visit_reasons":
yield AdditionalOutputs(
{
"role": "assistant",
"content": "Successfully listed visit reasons",
}
)
conversation.append(response_message.model_dump())
conversation.append(
{
"role": "function",
"name": function_name,
"content": "Success",
}
)
conversation.append(
{
"role": "system",
"content": "Now, thank the user and end the conversation.",
}
)
should_end = True
llm_response = (
client.chat.completions.create(
model="gpt-4o",
messages=conversation, # type: ignore
functions=FUNCTIONS, # type: ignore
function_call="auto",
)
.choices[0]
.message.content
)
else:
llm_response = response.choices[0].message.content
assert llm_response is not None
yield from tts.stream_tts_sync(llm_response)
llm_dict = {"role": "assistant", "content": llm_response}
yield AdditionalOutputs(llm_dict, conversation)
conversation.append(llm_dict)
if should_end:
yield CloseStream()
def update_chatbot(
chatbot: list[dict],
conversation_old,
response: dict,
conversation: list[dict] | None = None,
):
chatbot.append(response)
return chatbot, conversation
chatbot = gr.Chatbot(type="messages")
stream = Stream(
ReplyOnPause(response, start_up),
mode="send-receive",
modality="audio",
additional_inputs=[chatbot],
additional_outputs=[chatbot, gr.JSON(label="Conversation")],
additional_outputs_handler=update_chatbot,
)
if __name__ == "__main__":
if (mode := os.getenv("MODE")) == "UI":
stream.ui.launch(server_port=7860)
elif mode == "PHONE":
stream.fastphone(host="0.0.0.0", port=7860)
else:
stream.ui.launch(server_port=7860)
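Each successful branch above appends three messages to the conversation: the assistant turn that requested the function call (response_message.model_dump()), a role "function" result, and a steering system message. For orientation, a verify_birthday round trip leaves a fragment shaped roughly like the sketch below; the values are hypothetical and the exact model_dump() fields depend on the openai client version.

# Hypothetical conversation fragment after a successful verify_birthday call.
example_turns = [
    {"role": "user", "content": "My birthday is January 1st, 1983."},
    # assistant turn that asked for the function call (response_message.model_dump())
    {
        "role": "assistant",
        "content": None,
        "function_call": {
            "name": "verify_birthday",
            "arguments": '{"birthday": "1983-01-01"}',
        },
    },
    # function result appended by the handler
    {"role": "function", "name": "verify_birthday", "content": "Success"},
    # steering message that sets up the next question
    {"role": "system", "content": "Next, thank the user for confirming their identity ..."},
]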

View File

@@ -0,0 +1,14 @@
---
title: Qwen Phone Chat
emoji: 📞
colorFrom: pink
colorTo: green
sdk: gradio
sdk_version: 5.25.2
app_file: app.py
pinned: false
license: mit
short_description: Talk with Qwen 2.5 Omni over the Phone
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

demo/qwen_phone_chat/app.py (new file, 217 lines)
View File

@@ -0,0 +1,217 @@
import asyncio
import base64
import json
import os
import secrets
from pathlib import Path
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastrtc import (
AdditionalOutputs,
AsyncStreamHandler,
Stream,
get_cloudflare_turn_credentials_async,
wait_for_item,
)
from websockets.asyncio.client import connect
load_dotenv()
cur_dir = Path(__file__).parent
API_KEY = os.getenv("MODELSCOPE_API_KEY", "")
API_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen-omni-turbo-realtime-2025-03-26"
VOICES = ["Chelsie", "Serena", "Ethan", "Cherry"]
headers = {"Authorization": "Bearer " + API_KEY}
class QwenOmniHandler(AsyncStreamHandler):
def __init__(
self,
) -> None:
super().__init__(
expected_layout="mono",
output_sample_rate=24_000,
input_sample_rate=16_000,
)
self.connection = None
self.output_queue = asyncio.Queue()
def copy(self):
return QwenOmniHandler()
@staticmethod
def msg_id() -> str:
return f"event_{secrets.token_hex(10)}"
async def start_up(
self,
):
"""Connect to realtime API. Run forever in separate thread to keep connection open."""
voice_id = "Serena"
print("voice_id", voice_id)
async with connect(
API_URL,
additional_headers=headers,
) as conn:
self.client = conn
await conn.send(
json.dumps(
{
"event_id": self.msg_id(),
"type": "session.update",
"session": {
"modalities": [
"text",
"audio",
],
"voice": voice_id,
"input_audio_format": "pcm16",
},
}
)
)
self.connection = conn
try:
async for data in self.connection:
event = json.loads(data)
print("event", event["type"])
if "type" not in event:
continue
# Handle interruptions
if event["type"] == "input_audio_buffer.speech_started":
self.clear_queue()
if event["type"] == "response.audio.delta":
print("putting output")
await self.output_queue.put(
(
self.output_sample_rate,
np.frombuffer(
base64.b64decode(event["delta"]), dtype=np.int16
).reshape(1, -1),
),
)
except Exception as e:
print("error", e)
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
if not self.connection:
return
_, array = frame
array = array.squeeze()
audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
try:
await self.connection.send(
json.dumps(
{
"event_id": self.msg_id(),
"type": "input_audio_buffer.append",
"audio": audio_message,
}
)
)
except Exception as e:
print("error", e)
async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
return await wait_for_item(self.output_queue)
async def shutdown(self) -> None:
if self.connection:
await self.connection.close()
self.connection = None
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], type="value", label="Voice")
stream = Stream(
QwenOmniHandler(),
mode="send-receive",
modality="audio",
additional_inputs=[voice],
additional_outputs=None,
rtc_configuration=get_cloudflare_turn_credentials_async,
concurrency_limit=20,
time_limit=180,
)
app = FastAPI()
@app.post("/telephone/incoming")
async def handle_incoming_call(request: Request):
"""
Handle incoming telephone calls (e.g., via Twilio).
Generates TwiML instructions to connect the incoming call to the
WebSocket handler (`/telephone/handler`) for audio streaming.
Args:
request: The FastAPI Request object for the incoming call webhook.
Returns:
An HTMLResponse containing the TwiML instructions as XML.
"""
from twilio.twiml.voice_response import Connect, VoiceResponse
if len(stream.connections) > (stream.concurrency_limit or 20):
response = VoiceResponse()
response.say("Qwen is busy please try again later!")
return HTMLResponse(content=str(response), media_type="application/xml")
response = VoiceResponse()
response.say("Connecting to Qwen")
connect = Connect()
print("request.url.hostname", request.url.hostname)
connect.stream(url=f"wss://{request.url.hostname}/telephone/handler")
response.append(connect)
response.say("The call has been disconnected.")
return HTMLResponse(content=str(response), media_type="application/xml")
stream.mount(app)
@app.get("/")
async def _():
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Qwen Phone Chat</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
line-height: 1.6;
}
pre {
background-color: #f5f5f5;
padding: 15px;
border-radius: 5px;
overflow-x: auto;
}
h1 {
color: #333;
}
</style>
</head>
<body>
<h1>Qwen Phone Chat</h1>
<p>Call +1 (877) 853-7936</p>
</body>
</html>
"""
return HTMLResponse(content=html_content)
if __name__ == "__main__":
# stream.fastphone(host="0.0.0.0", port=7860)
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
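The start_up() loop above turns every response.audio.delta event into a (sample_rate, int16 frame) tuple for the output queue. A self-contained sketch of just that decode step, with a fabricated event payload standing in for real API output:

import base64

import numpy as np

# Fabricated stand-in for one `response.audio.delta` event.
fake_pcm = np.zeros(480, dtype=np.int16)
event = {
    "type": "response.audio.delta",
    "delta": base64.b64encode(fake_pcm.tobytes()).decode("utf-8"),
}

# The same decode the handler performs before queueing the frame.
audio = np.frombuffer(base64.b64decode(event["delta"]), dtype=np.int16).reshape(1, -1)
frame = (24_000, audio)  # 24 kHz mono, matching the handler's output_sample_rate
print(frame[0], audio.shape)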

View File

@@ -0,0 +1,2 @@
fastrtc
websockets>=14.0

View File

@@ -0,0 +1,173 @@
import base64
import json
import os
from pathlib import Path
from typing import cast
import gradio as gr
import huggingface_hub
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
get_stt_model,
get_twilio_turn_credentials,
)
from gradio.utils import get_space
from pydantic import BaseModel
load_dotenv()
curr_dir = Path(__file__).parent
client = huggingface_hub.InferenceClient(
api_key=os.environ.get("SAMBANOVA_API_KEY"),
provider="sambanova",
)
stt_model = get_stt_model()
def response(
audio: tuple[int, np.ndarray],
gradio_chatbot: list[dict] | None = None,
conversation_state: list[dict] | None = None,
textbox: str | None = None,
):
gradio_chatbot = gradio_chatbot or []
conversation_state = conversation_state or []
print("chatbot", gradio_chatbot)
if textbox:
text = textbox
else:
text = stt_model.stt(audio)
sample_rate, array = audio
gradio_chatbot.append({"role": "user", "content": text})
yield AdditionalOutputs(gradio_chatbot, conversation_state)
conversation_state.append({"role": "user", "content": text})
request = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=conversation_state, # type: ignore
temperature=0.1,
top_p=0.1,
)
response = {"role": "assistant", "content": request.choices[0].message.content}
conversation_state.append(response)
gradio_chatbot.append(response)
yield AdditionalOutputs(gradio_chatbot, conversation_state)
chatbot = gr.Chatbot(type="messages", value=[])
state = gr.State(value=[])
textbox = gr.Textbox(value="", interactive=True)
stream = Stream(
ReplyOnPause(
response, # type: ignore
input_sample_rate=16000,
),
mode="send",
modality="audio",
additional_inputs=[
chatbot,
state,
textbox,
],
additional_outputs=[chatbot, state],
additional_outputs_handler=lambda *a: (a[2], a[3]),
concurrency_limit=20 if get_space() else 5,
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
)
def trigger_response(webrtc_id: str):
cast(ReplyOnPause, stream.webrtc_component.handlers[webrtc_id]).trigger_response()
return ""
with stream.ui as demo:
button = gr.Button("Send")
button.click(
trigger_response,
inputs=[stream.webrtc_component],
outputs=[textbox],
)
stream.ui = demo
app = FastAPI()
stream.mount(app)
class Message(BaseModel):
role: str
content: str
class InputData(BaseModel):
webrtc_id: str
chatbot: list[Message]
state: list[Message]
textbox: str
@app.get("/")
async def _():
rtc_config = get_twilio_turn_credentials() if get_space() else None
html_content = (curr_dir / "index.html").read_text()
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
return HTMLResponse(content=html_content)
@app.post("/input_hook")
async def _(data: InputData):
body = data.model_dump()
stream.set_input(data.webrtc_id, body["chatbot"], body["state"], body["textbox"])
cast(ReplyOnPause, stream.handlers[data.webrtc_id]).trigger_response()
def audio_to_base64(file_path):
audio_format = "wav"
with open(file_path, "rb") as audio_file:
encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")
return f"data:audio/{audio_format};base64,{encoded_audio}"
@app.get("/outputs")
async def _(webrtc_id: str):
async def output_stream():
async for output in stream.output_stream(webrtc_id):
chatbot = output.args[0]
state = output.args[1]
user_message = chatbot[-1]["content"]
data = {
"message": state[-1],
"audio": (
audio_to_base64(user_message["path"])
if isinstance(user_message, dict) and "path" in user_message
else None
),
}
yield f"event: output\ndata: {json.dumps(data)}\n\n"
return StreamingResponse(output_stream(), media_type="text/event-stream")
if __name__ == "__main__":
import os
if (mode := os.getenv("MODE")) == "UI":
stream.ui.launch(server_port=7860)
elif mode == "PHONE":
raise ValueError("Phone mode not supported")
else:
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)

View File

@@ -0,0 +1,539 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Talk to Sambanova</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background-color: #f8f9fa;
color: #1a1a1a;
margin: 0;
padding: 20px;
height: 100vh;
box-sizing: border-box;
}
.container {
max-width: 800px;
margin: 0 auto;
height: 80%;
}
.logo {
text-align: center;
margin-bottom: 40px;
}
.chat-container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
padding: 20px;
height: 90%;
box-sizing: border-box;
display: flex;
flex-direction: column;
}
.chat-messages {
flex-grow: 1;
overflow-y: auto;
margin-bottom: 20px;
padding: 10px;
}
.message {
margin-bottom: 20px;
padding: 12px;
border-radius: 8px;
font-size: 14px;
line-height: 1.5;
}
.message.user {
background-color: #e9ecef;
margin-left: 20%;
}
.message.assistant {
background-color: #f1f3f5;
margin-right: 20%;
}
.controls {
text-align: center;
margin-top: 20px;
}
button {
background-color: #0066cc;
color: white;
border: none;
padding: 12px 24px;
font-family: inherit;
font-size: 14px;
cursor: pointer;
transition: all 0.3s;
border-radius: 4px;
font-weight: 500;
}
button:hover {
background-color: #0052a3;
}
#audio-output {
display: none;
}
.icon-with-spinner {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.spinner {
width: 20px;
height: 20px;
border: 2px solid #ffffff;
border-top-color: transparent;
border-radius: 50%;
animation: spin 1s linear infinite;
flex-shrink: 0;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
.pulse-container {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.pulse-circle {
width: 20px;
height: 20px;
border-radius: 50%;
background-color: #ffffff;
opacity: 0.2;
flex-shrink: 0;
transform: translateX(-0%) scale(var(--audio-level, 1));
transition: transform 0.1s ease;
}
/* Add styles for typing indicator */
.typing-indicator {
padding: 8px;
background-color: #f1f3f5;
border-radius: 8px;
margin-bottom: 10px;
display: none;
}
.dots {
display: inline-flex;
gap: 4px;
}
.dot {
width: 8px;
height: 8px;
background-color: #0066cc;
border-radius: 50%;
animation: pulse 1.5s infinite;
opacity: 0.5;
}
.dot:nth-child(2) {
animation-delay: 0.5s;
}
.dot:nth-child(3) {
animation-delay: 1s;
}
@keyframes pulse {
0%,
100% {
opacity: 0.5;
transform: scale(1);
}
50% {
opacity: 1;
transform: scale(1.2);
}
}
/* Add styles for toast notifications */
.toast {
position: fixed;
top: 20px;
left: 50%;
transform: translateX(-50%);
padding: 16px 24px;
border-radius: 4px;
font-size: 14px;
z-index: 1000;
display: none;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
}
.toast.error {
background-color: #f44336;
color: white;
}
.toast.warning {
background-color: #ffd700;
color: black;
}
/* Add styles for text input */
.text-input-container {
display: flex;
margin-top: 10px;
gap: 10px;
}
#text-input {
flex-grow: 1;
padding: 10px;
border: 1px solid #ddd;
border-radius: 4px;
font-family: inherit;
font-size: 14px;
}
.text-input-container button {
padding: 10px 15px;
}
</style>
</head>
<body>
<!-- Add toast element after body opening tag -->
<div id="error-toast" class="toast"></div>
<div class="container">
<div class="logo">
<h1>Talk to Sambanova 🗣️</h1>
<h2 style="font-size: 1.2em; color: #666; margin-top: 10px;">Speak to Llama 3.2 powered by Sambanova API
</h2>
</div>
<div class="chat-container">
<div class="chat-messages" id="chat-messages"></div>
<div class="typing-indicator" id="typing-indicator">
<div class="dots">
<div class="dot"></div>
<div class="dot"></div>
<div class="dot"></div>
</div>
</div>
<!-- Added text input form -->
<form id="text-input-form" class="text-input-container">
<input type="text" id="text-input" placeholder="Type your message..." />
<button type="submit">Send</button>
</form>
</div>
<div class="controls">
<button id="start-button">Start Conversation</button>
</div>
</div>
<audio id="audio-output"></audio>
<script>
let peerConnection;
let webrtc_id;
const startButton = document.getElementById('start-button');
const chatMessages = document.getElementById('chat-messages');
let audioLevel = 0;
let animationFrame;
let audioContext, analyser, audioSource;
let messages = [];
let eventSource;
function updateButtonState() {
const button = document.getElementById('start-button');
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
button.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
`;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
button.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
</div>
`;
} else {
button.innerHTML = 'Start Conversation';
}
}
function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
audioSource = audioContext.createMediaStreamSource(stream);
audioSource.connect(analyser);
analyser.fftSize = 64;
const dataArray = new Uint8Array(analyser.frequencyBinCount);
function updateAudioLevel() {
analyser.getByteFrequencyData(dataArray);
const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
audioLevel = average / 255;
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
}
animationFrame = requestAnimationFrame(updateAudioLevel);
}
updateAudioLevel();
}
function showError(message) {
const toast = document.getElementById('error-toast');
toast.textContent = message;
toast.className = 'toast error';
toast.style.display = 'block';
// Hide toast after 5 seconds
setTimeout(() => {
toast.style.display = 'none';
}, 5000);
}
function handleMessage(event) {
const eventJson = JSON.parse(event.data);
const typingIndicator = document.getElementById('typing-indicator');
const textInput = document.getElementById('text-input');
if (eventJson.type === "error") {
showError(eventJson.message);
} else if (eventJson.type === "send_input") {
fetch('/input_hook', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
webrtc_id: webrtc_id,
chatbot: messages,
state: messages,
textbox: textInput.value
})
});
} else if (eventJson.type === "log") {
if (eventJson.data === "pause_detected") {
typingIndicator.style.display = 'block';
chatMessages.scrollTop = chatMessages.scrollHeight;
} else if (eventJson.data === "response_starting") {
typingIndicator.style.display = 'none';
}
}
}
async function setupWebRTC() {
const config = __RTC_CONFIGURATION__;
peerConnection = new RTCPeerConnection(config);
const timeoutId = setTimeout(() => {
const toast = document.getElementById('error-toast');
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
toast.className = 'toast warning';
toast.style.display = 'block';
// Hide warning after 5 seconds
setTimeout(() => {
toast.style.display = 'none';
}, 5000);
}, 5000);
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: true
});
setupAudioVisualization(stream);
stream.getTracks().forEach(track => {
peerConnection.addTrack(track, stream);
});
const dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = handleMessage;
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);
peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};
peerConnection.addEventListener('connectionstatechange', () => {
console.log('connectionstatechange', peerConnection.connectionState);
if (peerConnection.connectionState === 'connected') {
clearTimeout(timeoutId);
const toast = document.getElementById('error-toast');
toast.style.display = 'none';
}
updateButtonState();
});
webrtc_id = Math.random().toString(36).substring(7);
const response = await fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
sdp: peerConnection.localDescription.sdp,
type: peerConnection.localDescription.type,
webrtc_id: webrtc_id
})
});
const serverResponse = await response.json();
if (serverResponse.status === 'failed') {
showError(serverResponse.meta.error === 'concurrency_limit_reached'
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
: serverResponse.meta.error);
stop();
return;
}
await peerConnection.setRemoteDescription(serverResponse);
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
eventSource.addEventListener("output", (event) => {
const eventJson = JSON.parse(event.data);
console.log(eventJson);
messages.push(eventJson.message);
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
});
} catch (err) {
clearTimeout(timeoutId);
console.error('Error setting up WebRTC:', err);
showError('Failed to establish connection. Please try again.');
stop();
}
}
function addMessage(role, content) {
const messageDiv = document.createElement('div');
messageDiv.classList.add('message', role);
if (role === 'user' && content.startsWith("data:audio/wav;base64,")) {
// Create audio element for user messages
const audio = document.createElement('audio');
audio.controls = true;
audio.src = content;
messageDiv.appendChild(audio);
} else {
// Text content for assistant messages
messageDiv.textContent = content;
}
chatMessages.appendChild(messageDiv);
chatMessages.scrollTop = chatMessages.scrollHeight;
}
function stop() {
if (eventSource) {
eventSource.close();
eventSource = null;
}
if (animationFrame) {
cancelAnimationFrame(animationFrame);
}
if (audioContext) {
audioContext.close();
audioContext = null;
analyser = null;
audioSource = null;
}
if (peerConnection) {
if (peerConnection.getTransceivers) {
peerConnection.getTransceivers().forEach(transceiver => {
if (transceiver.stop) {
transceiver.stop();
}
});
}
if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.stop) sender.track.stop();
});
}
peerConnection.close();
}
updateButtonState();
audioLevel = 0;
}
startButton.addEventListener('click', () => {
if (!peerConnection || peerConnection.connectionState !== 'connected') {
setupWebRTC();
} else {
stop();
}
});
// Add event listener for text input form
document.getElementById('text-input-form').addEventListener('submit', function (e) {
e.preventDefault();
const textInput = document.getElementById('text-input');
if (textInput.value.trim() !== '') {
fetch('/input_hook', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
webrtc_id: webrtc_id,
chatbot: messages,
state: messages,
textbox: textInput.value
})
});
// Clear the input after submission
textInput.value = '';
}
});
</script>
</body>
</html>

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Talk to Gemini using Google's multimodal API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Talk to Gemini (Gradio UI)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -3,7 +3,8 @@ import base64
import json
import os
import pathlib
from typing import AsyncGenerator, Literal
from collections.abc import AsyncGenerator
from typing import Literal
import gradio as gr
import numpy as np
@@ -13,7 +14,7 @@ from fastapi.responses import HTMLResponse
from fastrtc import (
AsyncStreamHandler,
Stream,
get_twilio_turn_credentials,
get_cloudflare_turn_credentials_async,
wait_for_item,
)
from google import genai
@@ -116,7 +117,7 @@ stream = Stream(
modality="audio",
mode="send-receive",
handler=GeminiHandler(),
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
concurrency_limit=5 if get_space() else None,
time_limit=90 if get_space() else None,
additional_inputs=[
@@ -159,7 +160,7 @@ async def _(body: InputData):
@app.get("/")
async def index():
rtc_config = get_twilio_turn_credentials() if get_space() else None
rtc_config = await get_cloudflare_turn_credentials_async() if get_space() else None
html_content = (current_dir / "index.html").read_text()
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
return HTMLResponse(content=html_content)

View File

@@ -98,6 +98,11 @@
font-weight: 600;
cursor: pointer;
transition: all 0.2s ease;
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
button:hover {
@@ -134,7 +139,6 @@
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.pulse-circle {
@@ -171,6 +175,23 @@
background-color: #ffd700;
color: black;
}
/* Add styles for the mute toggle */
.mute-toggle {
width: 24px;
height: 24px;
cursor: pointer;
flex-shrink: 0;
}
.mute-toggle svg {
display: block;
}
#start-button {
margin-left: auto;
margin-right: auto;
}
</style>
</head>
@@ -221,6 +242,11 @@
let dataChannel;
let isRecording = false;
let webrtc_id;
let isMuted = false;
let analyser_input, dataArray_input;
let analyser, dataArray;
let source_input = null;
let source_output = null;
const startButton = document.getElementById('start-button');
const apiKeyInput = document.getElementById('api-key');
@@ -235,7 +261,28 @@
boxContainer.appendChild(box);
}
// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;
const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;
function updateButtonState() {
startButton.innerHTML = '';
startButton.onclick = null;
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
startButton.innerHTML = `
<div class="icon-with-spinner">
@@ -243,15 +290,28 @@
<span>Connecting...</span>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
startButton.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Recording</span>
</div>
const pulseContainer = document.createElement('div');
pulseContainer.className = 'pulse-container';
pulseContainer.innerHTML = `
<div class="pulse-circle"></div>
<span>Stop Recording</span>
`;
const muteToggle = document.createElement('div');
muteToggle.className = 'mute-toggle';
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
muteToggle.addEventListener('click', toggleMute);
startButton.appendChild(pulseContainer);
startButton.appendChild(muteToggle);
startButton.disabled = false;
} else {
startButton.innerHTML = 'Start Recording';
startButton.disabled = false;
}
}
@@ -267,6 +327,23 @@
}, 5000);
}
function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
isMuted = !isMuted;
console.log("Mute toggled:", isMuted);
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});
updateButtonState();
}
async function setupWebRTC() {
const config = __RTC_CONFIGURATION__;
peerConnection = new RTCPeerConnection(config);
@@ -288,58 +365,74 @@
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getTracks().forEach(track => peerConnection.addTrack(track, stream));
// Update audio visualization setup
audioContext = new AudioContext();
if (!audioContext || audioContext.state === 'closed') {
audioContext = new AudioContext();
}
if (source_input) {
try { source_input.disconnect(); } catch (e) { console.warn("Error disconnecting previous input source:", e); }
source_input = null;
}
source_input = audioContext.createMediaStreamSource(stream);
analyser_input = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser_input);
source_input.connect(analyser_input);
analyser_input.fftSize = 64;
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
function updateAudioLevel() {
analyser_input.getByteFrequencyData(dataArray_input);
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
const audioLevel = average / 255;
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
console.log("audioLevel", audioLevel);
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
}
animationId = requestAnimationFrame(updateAudioLevel);
}
updateAudioLevel();
// Add connection state change listener
peerConnection.addEventListener('connectionstatechange', () => {
console.log('connectionstatechange', peerConnection.connectionState);
if (peerConnection.connectionState === 'connected') {
clearTimeout(timeoutId);
const toast = document.getElementById('error-toast');
toast.style.display = 'none';
if (analyser_input) updateAudioLevel();
if (analyser) updateVisualization();
} else if (['disconnected', 'failed', 'closed'].includes(peerConnection.connectionState)) {
// Explicitly stop animations if connection drops unexpectedly
// Note: stopWebRTC() handles the normal stop case
}
updateButtonState();
});
// Handle incoming audio
peerConnection.addEventListener('track', (evt) => {
if (audioOutput && audioOutput.srcObject !== evt.streams[0]) {
audioOutput.srcObject = evt.streams[0];
audioOutput.play();
peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};
// Set up audio visualization on the output stream
audioContext = new AudioContext();
analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(evt.streams[0]);
source.connect(analyser);
analyser.fftSize = 2048;
dataArray = new Uint8Array(analyser.frequencyBinCount);
updateVisualization();
peerConnection.addEventListener('track', (evt) => {
if (evt.track.kind === 'audio' && audioOutput) {
if (audioOutput.srcObject !== evt.streams[0]) {
audioOutput.srcObject = evt.streams[0];
audioOutput.play().catch(e => console.error("Audio play failed:", e));
if (!audioContext || audioContext.state === 'closed') {
console.warn("AudioContext not ready for output track analysis.");
return;
}
if (source_output) {
try { source_output.disconnect(); } catch (e) { console.warn("Error disconnecting previous output source:", e); }
source_output = null;
}
source_output = audioContext.createMediaStreamSource(evt.streams[0]);
analyser = audioContext.createAnalyser();
source_output.connect(analyser);
analyser.fftSize = 2048;
dataArray = new Uint8Array(analyser.frequencyBinCount);
updateVisualization();
}
}
});
// Create data channel for messages
dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = (event) => {
const eventJson = JSON.parse(event.data);
@@ -360,24 +453,9 @@
}
};
// Create and send offer
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);
await new Promise((resolve) => {
if (peerConnection.iceGatheringState === "complete") {
resolve();
} else {
const checkState = () => {
if (peerConnection.iceGatheringState === "complete") {
peerConnection.removeEventListener("icegatheringstatechange", checkState);
resolve();
}
};
peerConnection.addEventListener("icegatheringstatechange", checkState);
}
});
const response = await fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@@ -394,7 +472,7 @@
showError(serverResponse.meta.error === 'concurrency_limit_reached'
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
: serverResponse.meta.error);
stop();
stopWebRTC();
startButton.textContent = 'Start Recording';
return;
}
@@ -404,13 +482,17 @@
clearTimeout(timeoutId);
console.error('Error setting up WebRTC:', err);
showError('Failed to establish connection. Please try again.');
stop();
stopWebRTC();
startButton.textContent = 'Start Recording';
}
}
function updateVisualization() {
if (!analyser) return;
if (!analyser || !peerConnection || !['connected', 'connecting'].includes(peerConnection.connectionState)) {
const bars = document.querySelectorAll('.box');
bars.forEach(bar => bar.style.transform = 'scaleY(0.1)');
return;
}
analyser.getByteFrequencyData(dataArray);
const bars = document.querySelectorAll('.box');
@@ -420,32 +502,114 @@
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
}
animationId = requestAnimationFrame(updateVisualization);
requestAnimationFrame(updateVisualization);
}
function updateAudioLevel() {
if (!analyser_input || !peerConnection || !['connected', 'connecting'].includes(peerConnection.connectionState)) {
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
pulseCircle.style.setProperty('--audio-level', 1);
}
return;
}
analyser_input.getByteFrequencyData(dataArray_input);
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
const audioLevel = average / 255;
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
}
requestAnimationFrame(updateAudioLevel);
}
function stopWebRTC() {
console.log("Running stopWebRTC");
if (peerConnection) {
peerConnection.close();
peerConnection.getSenders().forEach(sender => {
if (sender.track) {
sender.track.stop();
}
});
peerConnection.ontrack = null;
peerConnection.onicegatheringstatechange = null;
peerConnection.onconnectionstatechange = null;
if (dataChannel) {
dataChannel.onmessage = null;
try { dataChannel.close(); } catch (e) { console.warn("Error closing data channel:", e); }
dataChannel = null;
}
try { peerConnection.close(); } catch (e) { console.warn("Error closing peer connection:", e); }
peerConnection = null;
}
if (animationId) {
cancelAnimationFrame(animationId);
if (audioOutput) {
audioOutput.pause();
audioOutput.srcObject = null;
}
if (audioContext) {
audioContext.close();
if (source_input) {
try { source_input.disconnect(); } catch (e) { console.warn("Error disconnecting input source:", e); }
source_input = null;
}
if (source_output) {
try { source_output.disconnect(); } catch (e) { console.warn("Error disconnecting output source:", e); }
source_output = null;
}
if (audioContext && audioContext.state !== 'closed') {
audioContext.close().then(() => {
console.log("AudioContext closed successfully.");
audioContext = null;
}).catch(e => {
console.error("Error closing AudioContext:", e);
audioContext = null;
});
} else {
audioContext = null;
}
analyser_input = null;
dataArray_input = null;
analyser = null;
dataArray = null;
isMuted = false;
isRecording = false;
updateButtonState();
const bars = document.querySelectorAll('.box');
bars.forEach(bar => bar.style.transform = 'scaleY(0.1)');
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
pulseCircle.style.setProperty('--audio-level', 1);
}
}
startButton.addEventListener('click', () => {
if (!isRecording) {
setupWebRTC();
startButton.classList.add('recording');
} else {
stopWebRTC();
startButton.classList.remove('recording');
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}
if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stopWebRTC();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
if (!apiKeyInput.value) {
showError("Please enter your API Key.");
return;
}
setupWebRTC();
isRecording = true;
updateButtonState();
}
isRecording = !isRecording;
});
updateButtonState();
</script>
</body>

View File

@@ -1,4 +1,4 @@
fastrtc
fastrtc[vad]==0.0.20.rc2
python-dotenv
google-genai
twilio

Binary file not shown (image, 46 KiB).

View File

@@ -0,0 +1,15 @@
---
title: Talk to Llama 4
emoji: 🦙
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.23.3
app_file: app.py
pinned: false
license: mit
short_description: Talk to Llama 4 using Groq + Cloudflare
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

demo/talk_to_llama4/app.py (new file, 136 lines)
View File

@@ -0,0 +1,136 @@
import json
import os
from pathlib import Path
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
AdditionalOutputs,
CartesiaTTSOptions,
ReplyOnPause,
Stream,
get_cloudflare_turn_credentials_async,
get_current_context,
get_stt_model,
get_tts_model,
)
from groq import Groq
from numpy.typing import NDArray
curr_dir = Path(__file__).parent
load_dotenv()
tts_model = get_tts_model(
model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
)
groq = Groq(api_key=os.getenv("GROQ_API_KEY"))
stt_model = get_stt_model()
conversations: dict[str, list[dict[str, str]]] = {}
def response(user_audio: tuple[int, NDArray[np.int16]]):
context = get_current_context()
if context.webrtc_id not in conversations:
conversations[context.webrtc_id] = [
{
"role": "system",
"content": (
"You are a helpful assistant that can answer questions and help with tasks."
'Please return a short (that will be converted to audio using a text-to-speech model) response and long response to this question. They can be the same if appropriate. Please return in JSON format\n\n{"short":, "long"}\n\n'
),
}
]
messages = conversations[context.webrtc_id]
transcription = stt_model.stt(user_audio)
messages.append({"role": "user", "content": transcription})
completion = groq.chat.completions.create( # type: ignore
model="meta-llama/llama-4-scout-17b-16e-instruct",
messages=messages, # type: ignore
temperature=1,
max_completion_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = completion.choices[0].message.content
response = json.loads(response)
short_response = response["short"]
long_response = response["long"]
messages.append({"role": "assistant", "content": long_response})
conversations[context.webrtc_id] = messages
yield from tts_model.stream_tts_sync(
short_response, options=CartesiaTTSOptions(sample_rate=24_000)
)
yield AdditionalOutputs(messages)
stream = Stream(
ReplyOnPause(response),
modality="audio",
mode="send-receive",
additional_outputs=[gr.Chatbot(type="messages")],
additional_outputs_handler=lambda old, new: new,
rtc_configuration=None,
ui_args={"hide_title": True},
)
with gr.Blocks() as demo:
gr.HTML(
f"""
<h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
<img src="/gradio_api/file={str((Path(__file__).parent / "AV_Huggy.png").resolve())}" alt="AV Huggy" style="height: 100px; margin-right: 10px"> FastRTC + Cartesia TTS = Blazing Fast LLM Audio
</h1>
"""
)
stream.ui.render()
stream.ui = demo
app = FastAPI()
stream.mount(app)
@app.get("/")
async def _():
rtc_config = await get_cloudflare_turn_credentials_async()
html_content = (curr_dir / "index.html").read_text()
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
return HTMLResponse(content=html_content)
@app.get("/outputs")
async def _(webrtc_id: str):
async def output_stream():
async for output in stream.output_stream(webrtc_id):
state = output.args[0]
for msg in state[-2:]:
data = {
"message": msg,
}
yield f"event: output\ndata: {json.dumps(data)}\n\n"
return StreamingResponse(output_stream(), media_type="text/event-stream")
if __name__ == "__main__":
import os
from pathlib import Path
if (mode := os.getenv("MODE")) == "UI":
stream.ui.launch(
server_port=7860,
allowed_paths=[str((Path(__file__).parent / "AV_Huggy.png").resolve())],
)
elif mode == "PHONE":
raise ValueError("Phone mode not supported")
else:
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)

View File

@@ -0,0 +1,839 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Talk to Llama 4</title>
<style>
:root {
--color-primary: #3b82f6;
--color-secondary: #f97316;
--color-background: #0f172a;
--color-surface: #1e293b;
--color-text: #f1f5f9;
--color-message-user: #334155;
--color-message-assistant: #1e40af;
--gradient-primary: linear-gradient(135deg, #3b82f6, #8b5cf6);
--gradient-secondary: linear-gradient(135deg, #f97316, #ec4899);
--boxSize: 8px;
--gutter: 4px;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background-color: var(--color-background);
color: var(--color-text);
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem 1rem;
background-image:
radial-gradient(circle at 25% 25%, rgba(59, 130, 246, 0.1) 0%, transparent 50%),
radial-gradient(circle at 75% 75%, rgba(249, 115, 22, 0.1) 0%, transparent 50%);
}
.header-container {
display: flex;
align-items: center;
gap: 2rem;
margin-bottom: 2rem;
width: 100%;
max-width: 800px;
animation: fadeIn 1s ease-out;
}
.header {
text-align: left;
}
.header h1 {
font-size: 2.5rem;
margin-bottom: 0.5rem;
background: var(--gradient-primary);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-weight: 800;
}
.header h2 {
font-size: 1.2rem;
font-weight: 400;
color: rgba(241, 245, 249, 0.8);
margin-bottom: 1rem;
}
.logo {
width: 120px;
height: 120px;
background: var(--color-surface);
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
box-shadow: 0 15px 30px rgba(0, 0, 0, 0.3);
position: relative;
overflow: hidden;
animation: float 6s ease-in-out infinite;
flex-shrink: 0;
}
.logo::before {
content: "";
position: absolute;
width: 200%;
height: 200%;
background: var(--gradient-secondary);
opacity: 0.2;
animation: rotate 10s linear infinite;
}
.logo img {
width: 75%;
height: 75%;
object-fit: contain;
z-index: 2;
}
.container {
width: 100%;
max-width: 800px;
background-color: var(--color-surface);
border-radius: 1rem;
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25);
overflow: hidden;
animation: slideUp 0.5s ease-out;
}
.chat-container {
height: 400px;
overflow-y: auto;
padding: 1.5rem;
display: flex;
flex-direction: column;
gap: 1rem;
scroll-behavior: smooth;
}
.message {
max-width: 80%;
padding: 1rem;
border-radius: 1rem;
line-height: 1.5;
animation: fadeIn 0.3s ease-out;
}
.message.user {
background-color: var(--color-message-user);
color: var(--color-text);
align-self: flex-end;
border-bottom-right-radius: 0.25rem;
}
.message.assistant {
background-color: var(--color-message-assistant);
color: var(--color-text);
align-self: flex-start;
border-bottom-left-radius: 0.25rem;
}
.wave-visualizer {
height: 100px;
padding: 1rem;
background-color: rgba(30, 41, 59, 0.8);
display: flex;
align-items: center;
justify-content: center;
position: relative;
overflow: hidden;
border-top: 1px solid rgba(255, 255, 255, 0.1);
}
.box-container {
display: flex;
justify-content: space-between;
align-items: center;
width: 100%;
height: 64px;
padding: 0 1rem;
}
.box {
height: 100%;
width: var(--boxSize);
background: var(--gradient-primary);
border-radius: 4px;
transform: scaleY(0.1);
transition: transform 0.05s ease;
}
.controls {
display: flex;
justify-content: center;
align-items: center;
padding: 1.5rem;
gap: 1rem;
border-top: 1px solid rgba(255, 255, 255, 0.1);
}
#start-button {
display: flex;
align-items: center;
justify-content: center;
background: var(--gradient-primary);
color: white;
border: none;
border-radius: 9999px;
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 14px rgba(59, 130, 246, 0.4);
}
#start-button:hover {
transform: translateY(-2px);
box-shadow: 0 6px 20px rgba(59, 130, 246, 0.6);
}
#start-button:active {
transform: translateY(1px);
}
.icon-with-spinner {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.spinner {
width: 20px;
height: 20px;
border: 2px solid white;
border-top-color: transparent;
border-radius: 50%;
animation: spin 1s linear infinite;
flex-shrink: 0;
}
.pulse-container {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
}
.pulse-circle {
width: 20px;
height: 20px;
border-radius: 50%;
background: var(--color-secondary);
opacity: 0.85;
flex-shrink: 0;
transform: scale(var(--audio-level, 1));
transition: transform 0.1s ease;
}
.mute-toggle {
width: 24px;
height: 24px;
cursor: pointer;
margin-left: 12px;
flex-shrink: 0;
filter: drop-shadow(0 4px 6px rgba(0, 0, 0, 0.2));
}
.mute-toggle svg {
width: 100%;
height: 100%;
stroke: white;
}
.typing-indicator {
padding: 0.5rem 1rem;
display: inline-flex;
align-items: center;
background-color: var(--color-message-assistant);
border-radius: 1rem;
align-self: flex-start;
margin-bottom: 0.5rem;
display: none;
animation: fadeIn 0.3s ease-out;
}
.dots {
display: inline-flex;
gap: 4px;
}
.dot {
width: 8px;
height: 8px;
background-color: white;
border-radius: 50%;
animation: bounce 1.5s infinite;
opacity: 0.7;
}
.dot:nth-child(2) {
animation-delay: 0.15s;
}
.dot:nth-child(3) {
animation-delay: 0.3s;
}
.toast {
position: fixed;
top: 20px;
left: 50%;
transform: translateX(-50%);
padding: 1rem 1.5rem;
border-radius: 0.5rem;
font-size: 0.875rem;
z-index: 1000;
display: none;
box-shadow: 0 10px 25px rgba(0, 0, 0, 0.3);
animation: slideDown 0.3s ease-out;
}
.toast.error {
background-color: #ef4444;
color: white;
}
.toast.warning {
background-color: #f59e0b;
color: black;
}
#audio-output {
display: none;
}
@keyframes float {
0%,
100% {
transform: translateY(0);
}
50% {
transform: translateY(-10px);
}
}
@keyframes rotate {
0% {
transform: rotate(0deg);
}
100% {
transform: rotate(360deg);
}
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
@keyframes bounce {
0%,
100% {
transform: translateY(0);
}
50% {
transform: translateY(-4px);
}
}
@keyframes fadeIn {
from {
opacity: 0;
}
to {
opacity: 1;
}
}
@keyframes slideUp {
from {
opacity: 0;
transform: translateY(20px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
@keyframes slideDown {
from {
opacity: 0;
transform: translate(-50%, -20px);
}
to {
opacity: 1;
transform: translate(-50%, 0);
}
}
</style>
</head>
<body>
<div id="error-toast" class="toast"></div>
<div class="header-container">
<div class="logo">
<img src="https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/Video%26Audio%20huggy.png"
alt="LLaMA Logo">
</div>
<div class="header">
<h1>Talk to LLaMA 4</h1>
<h2>Experience seamless real-time conversation thanks to Cloudflare and Hugging Face's FastRTC.</h2>
</div>
</div>
<div class="container">
<div class="chat-container" id="chat-messages">
<!-- Messages will appear here -->
</div>
<div class="typing-indicator" id="typing-indicator">
<div class="dots">
<div class="dot"></div>
<div class="dot"></div>
<div class="dot"></div>
</div>
</div>
<div class="wave-visualizer">
<div class="box-container" id="box-container">
<!-- Boxes will be dynamically added here -->
</div>
</div>
<div class="controls">
<button id="start-button">Start Conversation</button>
</div>
</div>
<audio id="audio-output"></audio>
<script>
let peerConnection;
let webrtc_id;
const startButton = document.getElementById('start-button');
const chatMessages = document.getElementById('chat-messages');
const boxContainer = document.getElementById('box-container');
const typingIndicator = document.getElementById('typing-indicator');
const audioOutput = document.getElementById('audio-output');
let audioLevel = 0;
let animationFrame_input, animationFrame_output;
let audioContext_input, audioContext_output;
let analyser_input, dataArray_input;
let analyser_output, dataArray_output;
let audioSource_input, audioSource_output;
let messages = [];
let eventSource;
let isMuted = false;
// Create wave visualizer boxes
const numBars = 32;
for (let i = 0; i < numBars; i++) {
const box = document.createElement('div');
box.className = 'box';
boxContainer.appendChild(box);
}
// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;
const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;
function updateButtonState() {
const existingMuteButton = startButton.querySelector('.mute-toggle');
if (existingMuteButton) {
existingMuteButton.removeEventListener('click', toggleMute);
}
startButton.innerHTML = '';
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
startButton.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
const pulseContainer = document.createElement('div');
pulseContainer.className = 'pulse-container';
pulseContainer.innerHTML = `
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
`;
const muteToggle = document.createElement('div');
muteToggle.className = 'mute-toggle';
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
muteToggle.addEventListener('click', toggleMute);
startButton.appendChild(pulseContainer);
startButton.appendChild(muteToggle);
startButton.disabled = false;
} else {
startButton.textContent = 'Start Conversation';
startButton.disabled = false;
}
}
function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
isMuted = !isMuted;
console.log("Mute toggled:", isMuted);
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});
updateButtonState();
}
function setupAudioVisualization(stream) {
// Input audio context for pulse circle
audioContext_input = new (window.AudioContext || window.webkitAudioContext)();
analyser_input = audioContext_input.createAnalyser();
audioSource_input = audioContext_input.createMediaStreamSource(stream);
audioSource_input.connect(analyser_input);
analyser_input.fftSize = 64;
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
function updateAudioLevel() {
// Update input audio visualization (pulse circle)
analyser_input.getByteFrequencyData(dataArray_input);
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
audioLevel = average / 255;
const pulseCircle = document.querySelector('.pulse-circle');
if (pulseCircle) {
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
}
animationFrame_input = requestAnimationFrame(updateAudioLevel);
}
updateAudioLevel();
}
function setupOutputVisualization(stream) {
// Create separate audio context for output visualization
audioContext_output = new (window.AudioContext || window.webkitAudioContext)();
analyser_output = audioContext_output.createAnalyser();
audioSource_output = audioContext_output.createMediaStreamSource(stream);
audioSource_output.connect(analyser_output);
analyser_output.fftSize = 2048;
dataArray_output = new Uint8Array(analyser_output.frequencyBinCount);
function updateVisualization() {
// Update output audio visualization (wave bars)
analyser_output.getByteFrequencyData(dataArray_output);
const boxes = document.querySelectorAll('.box');
for (let i = 0; i < boxes.length; i++) {
const index = Math.floor(i * dataArray_output.length / boxes.length);
const value = dataArray_output[index] / 255;
boxes[i].style.transform = `scaleY(${Math.max(0.1, value * 1.5)})`;
}
animationFrame_output = requestAnimationFrame(updateVisualization);
}
updateVisualization();
}
// Reset wave visualization bars to minimum height
function resetVisualization() {
const boxes = document.querySelectorAll('.box');
boxes.forEach(box => box.style.transform = 'scaleY(0.1)');
}
function showError(message) {
const toast = document.getElementById('error-toast');
toast.textContent = message;
toast.className = 'toast error';
toast.style.display = 'block';
setTimeout(() => {
toast.style.display = 'none';
}, 5000);
}
function handleMessage(event) {
const eventJson = JSON.parse(event.data);
if (eventJson.type === "error") {
showError(eventJson.message);
} else if (eventJson.type === "send_input") {
fetch('/input_hook', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
webrtc_id: webrtc_id,
chatbot: messages,
state: messages
})
});
} else if (eventJson.type === "log") {
if (eventJson.data === "pause_detected") {
typingIndicator.style.display = 'block';
chatMessages.scrollTop = chatMessages.scrollHeight;
} else if (eventJson.data === "response_starting") {
typingIndicator.style.display = 'none';
}
}
}
async function setupWebRTC() {
const config = __RTC_CONFIGURATION__;
peerConnection = new RTCPeerConnection(config);
const timeoutId = setTimeout(() => {
const toast = document.getElementById('error-toast');
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
toast.className = 'toast warning';
toast.style.display = 'block';
setTimeout(() => {
toast.style.display = 'none';
}, 5000);
}, 5000);
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: true
});
setupAudioVisualization(stream);
stream.getTracks().forEach(track => {
peerConnection.addTrack(track, stream);
});
// Add this listener to handle incoming audio track
peerConnection.addEventListener('track', (event) => {
if (event.track.kind === 'audio') {
console.log("Received audio track from server");
if (audioOutput) {
audioOutput.srcObject = event.streams[0];
audioOutput.play().catch(e => console.error("Audio play failed:", e));
}
// Set up visualization for output audio with separate context
setupOutputVisualization(event.streams[0]);
}
});
const dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = handleMessage;
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);
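// Trickle ICE: POST each candidate to /webrtc/offer as it is gathered instead of waiting for gathering to complete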
peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};
peerConnection.addEventListener('connectionstatechange', () => {
console.log('connectionstatechange', peerConnection.connectionState);
if (peerConnection.connectionState === 'connected') {
clearTimeout(timeoutId);
const toast = document.getElementById('error-toast');
toast.style.display = 'none';
} else if (['closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
stop();
}
updateButtonState();
});
webrtc_id = Math.random().toString(36).substring(7);
const response = await fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
sdp: peerConnection.localDescription.sdp,
type: peerConnection.localDescription.type,
webrtc_id: webrtc_id
})
});
const serverResponse = await response.json();
if (serverResponse.status === 'failed') {
showError(serverResponse.meta.error === 'concurrency_limit_reached'
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
: serverResponse.meta.error);
stop();
return;
}
await peerConnection.setRemoteDescription(serverResponse);
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
eventSource.addEventListener("output", (event) => {
const eventJson = JSON.parse(event.data);
console.log(eventJson);
messages.push(eventJson.message);
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
})
} catch (err) {
clearTimeout(timeoutId);
console.error('Error setting up WebRTC:', err);
showError('Failed to establish connection. Please try again.');
stop();
}
}
function addMessage(role, content) {
const messageDiv = document.createElement('div');
messageDiv.classList.add('message', role);
messageDiv.textContent = content;
chatMessages.appendChild(messageDiv);
chatMessages.scrollTop = chatMessages.scrollHeight;
}
function stop() {
if (eventSource) {
eventSource.close();
eventSource = null;
}
if (animationFrame_input) {
cancelAnimationFrame(animationFrame_input);
animationFrame_input = null;
}
if (animationFrame_output) {
cancelAnimationFrame(animationFrame_output);
animationFrame_output = null;
}
if (audioContext_input) {
audioContext_input.close().catch(e => console.error("Error closing input AudioContext:", e));
audioContext_input = null;
analyser_input = null;
dataArray_input = null;
audioSource_input = null;
}
if (audioContext_output) {
audioContext_output.close().catch(e => console.error("Error closing output AudioContext:", e));
audioContext_output = null;
analyser_output = null;
dataArray_output = null;
audioSource_output = null;
}
if (audioOutput) {
audioOutput.pause();
audioOutput.srcObject = null;
}
// Reset visualization
resetVisualization();
if (peerConnection) {
if (peerConnection.getTransceivers) {
peerConnection.getTransceivers().forEach(transceiver => {
if (transceiver.stop) {
transceiver.stop();
}
});
}
peerConnection.onicecandidate = null;
peerConnection.ondatachannel = null;
peerConnection.onconnectionstatechange = null;
peerConnection.close();
peerConnection = null;
}
isMuted = false;
updateButtonState();
audioLevel = 0;
}
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}
if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stop();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
messages = [];
chatMessages.innerHTML = '';
setupWebRTC();
updateButtonState();
}
});
</script>
</body>
</html>

View File

@@ -0,0 +1,3 @@
fastrtc[vad, tts]==0.0.20.rc2
groq
python-dotenv

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Talk to OpenAI using their multimodal API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|OPENAI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Talk to OpenAI (Gradio UI)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|OPENAI_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -17,7 +17,6 @@ from fastrtc import (
wait_for_item,
)
from gradio.utils import get_space
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
load_dotenv()
@@ -50,12 +49,32 @@ class OpenAIHandler(AsyncStreamHandler):
model="gpt-4o-mini-realtime-preview-2024-12-17"
) as conn:
await conn.session.update(
session={"turn_detection": {"type": "server_vad"}}
session={
"turn_detection": {"type": "server_vad"},
"input_audio_transcription": {
"model": "whisper-1",
"language": "en",
},
}
)
self.connection = conn
async for event in self.connection:
# Handle interruptions
if event.type == "input_audio_buffer.speech_started":
self.clear_queue()
if (
event.type
== "conversation.item.input_audio_transcription.completed"
):
await self.output_queue.put(
AdditionalOutputs({"role": "user", "content": event.transcript})
)
if event.type == "response.audio_transcript.done":
await self.output_queue.put(AdditionalOutputs(event))
await self.output_queue.put(
AdditionalOutputs(
{"role": "assistant", "content": event.transcript}
)
)
if event.type == "response.audio.delta":
await self.output_queue.put(
(
@@ -83,8 +102,8 @@ class OpenAIHandler(AsyncStreamHandler):
self.connection = None
def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
chatbot.append({"role": "assistant", "content": response.transcript})
def update_chatbot(chatbot: list[dict], response: dict):
chatbot.append(response)
return chatbot
@@ -121,7 +140,7 @@ def _(webrtc_id: str):
import json
async for output in stream.output_stream(webrtc_id):
s = json.dumps({"role": "assistant", "content": output.args[0].transcript})
s = json.dumps(output.args[0])
yield f"event: output\ndata: {s}\n\n"
return StreamingResponse(output_stream(), media_type="text/event-stream")

View File

@@ -45,20 +45,26 @@
.message {
margin-bottom: 20px;
padding: 12px;
border-radius: 4px;
padding: 12px 16px;
border-radius: 8px;
font-size: 16px;
line-height: 1.5;
max-width: 70%;
clear: both;
}
.message.user {
background-color: #1a1a1a;
margin-left: 20%;
background-color: #2c2c2c;
float: right;
border-bottom-right-radius: 2px;
border: 1px solid #404040;
}
.message.assistant {
background-color: #262626;
margin-right: 20%;
float: left;
border-bottom-left-radius: 2px;
border: 1px solid #333;
}
.controls {
@@ -67,16 +73,21 @@
}
button {
display: inline-flex;
align-items: center;
justify-content: center;
gap: 10px;
padding: 12px 24px;
background-color: transparent;
color: #ffffff;
border: 1px solid #ffffff;
padding: 12px 24px;
font-family: inherit;
font-size: 16px;
cursor: pointer;
transition: all 0.3s;
text-transform: uppercase;
letter-spacing: 1px;
position: relative;
}
button:hover {
@@ -116,9 +127,7 @@
.pulse-container {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.pulse-circle {
@@ -128,10 +137,47 @@
background-color: #ffffff;
opacity: 0.2;
flex-shrink: 0;
transform: translateX(-0%) scale(var(--audio-level, 1));
transform: scale(var(--audio-level, 1));
transition: transform 0.1s ease;
}
/* Fix button layout */
button {
display: inline-flex;
align-items: center;
justify-content: center;
gap: 10px;
padding: 12px 24px;
background-color: transparent;
color: #ffffff;
border: 1px solid #ffffff;
font-family: inherit;
font-size: 16px;
cursor: pointer;
transition: all 0.3s;
text-transform: uppercase;
letter-spacing: 1px;
position: relative;
}
.mute-toggle {
width: 24px;
height: 24px;
cursor: pointer;
flex-shrink: 0;
}
.mute-toggle svg {
display: block;
width: 100%;
height: 100%;
}
#start-button {
margin-left: auto;
margin-right: auto;
}
/* Add styles for toast notifications */
.toast {
position: fixed;
@@ -177,6 +223,7 @@
<script>
let peerConnection;
let webrtc_id;
let isMuted = false;
const audioOutput = document.getElementById('audio-output');
const startButton = document.getElementById('start-button');
const chatMessages = document.getElementById('chat-messages');
@@ -185,27 +232,82 @@
let animationFrame;
let audioContext, analyser, audioSource;
// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;
const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;
function updateButtonState() {
const button = document.getElementById('start-button');
// Clear previous content
button.innerHTML = '';
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
button.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
`;
const spinner = document.createElement('div');
spinner.className = 'spinner';
const text = document.createElement('span');
text.textContent = 'Connecting...';
button.appendChild(spinner);
button.appendChild(text);
} else if (peerConnection && peerConnection.connectionState === 'connected') {
button.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
</div>
`;
// Create pulse circle
const pulseCircle = document.createElement('div');
pulseCircle.className = 'pulse-circle';
// Create mic icon
const micIcon = document.createElement('div');
micIcon.className = 'mute-toggle';
micIcon.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
micIcon.addEventListener('click', toggleMute);
// Create text
const text = document.createElement('span');
text.textContent = 'Stop Conversation';
// Add elements in correct order
button.appendChild(pulseCircle);
button.appendChild(micIcon);
button.appendChild(text);
} else {
button.innerHTML = 'Start Conversation';
const text = document.createElement('span');
text.textContent = 'Start Conversation';
button.appendChild(text);
}
}
function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
isMuted = !isMuted;
console.log("Mute toggled:", isMuted);
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});
updateButtonState();
}
function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
@@ -276,6 +378,21 @@
}
});
peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};
const dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = (event) => {
const eventJson = JSON.parse(event.data);
@@ -287,20 +404,6 @@
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);
await new Promise((resolve) => {
if (peerConnection.iceGatheringState === "complete") {
resolve();
} else {
const checkState = () => {
if (peerConnection.iceGatheringState === "complete") {
peerConnection.removeEventListener("icegatheringstatechange", checkState);
resolve();
}
};
peerConnection.addEventListener("icegatheringstatechange", checkState);
}
});
peerConnection.addEventListener('connectionstatechange', () => {
console.log('connectionstatechange', peerConnection.connectionState);
if (peerConnection.connectionState === 'connected') {
@@ -338,7 +441,7 @@
const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
eventSource.addEventListener("output", (event) => {
const eventJson = JSON.parse(event.data);
addMessage("assistant", eventJson.content);
addMessage(eventJson.role, eventJson.content);
});
} catch (err) {
@@ -388,7 +491,12 @@
audioLevel = 0;
}
startButton.addEventListener('click', () => {
startButton.addEventListener('click', (event) => {
// Skip if clicking the mute toggle
if (event.target.closest('.mute-toggle')) {
return;
}
console.log('clicked');
console.log(peerConnection, peerConnection?.connectionState);
if (!peerConnection || peerConnection.connectionState !== 'connected') {

View File

@@ -1,4 +1,4 @@
fastrtc[vad]
fastrtc[vad]==0.0.20.rc2
openai
twilio
python-dotenv

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Llama 3.2 - SambaNova API
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN_ALT, secret|SAMBANOVA_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Llama 3.2 - SambaNova API (Gradio)
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN_ALT, secret|SAMBANOVA_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -13,8 +13,9 @@ from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
get_cloudflare_turn_credentials,
get_cloudflare_turn_credentials_async,
get_stt_model,
get_twilio_turn_credentials,
)
from gradio.utils import get_space
from pydantic import BaseModel
@@ -75,7 +76,8 @@ stream = Stream(
additional_outputs=[chatbot, state],
additional_outputs_handler=lambda *a: (a[2], a[3]),
concurrency_limit=20 if get_space() else None,
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
rtc_configuration=get_cloudflare_turn_credentials_async,
server_rtc_configuration=get_cloudflare_turn_credentials(ttl=36_000),
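# Client connections get Cloudflare TURN credentials from the async helper; the server-side credentials are issued with a 10-hour TTL (36_000 s)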
)
app = FastAPI()
@@ -95,7 +97,9 @@ class InputData(BaseModel):
@app.get("/")
async def _():
rtc_config = get_twilio_turn_credentials() if get_space() else None
rtc_config = await get_cloudflare_turn_credentials_async(
hf_token=os.getenv("HF_TOKEN_ALT")
)
html_content = (curr_dir / "index.html").read_text()
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
return HTMLResponse(content=html_content)

View File

@@ -72,13 +72,17 @@
background-color: #0066cc;
color: white;
border: none;
padding: 12px 24px;
padding: 12px 18px;
font-family: inherit;
font-size: 14px;
cursor: pointer;
transition: all 0.3s;
border-radius: 4px;
font-weight: 500;
display: inline-flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
@@ -94,7 +98,6 @@
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.spinner {
@@ -118,7 +121,6 @@
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}
.pulse-circle {
@@ -200,6 +202,23 @@
background-color: #ffd700;
color: black;
}
/* Styles for the mute toggle icon */
.mute-toggle {
width: 20px;
height: 20px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
flex-shrink: 0;
}
.mute-toggle svg {
width: 100%;
height: 100%;
stroke: white;
}
</style>
</head>
@@ -239,28 +258,82 @@
let audioContext, analyser, audioSource;
let messages = [];
let eventSource;
let isMuted = false;
// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;
const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;
function updateButtonState() {
const button = document.getElementById('start-button');
const existingMuteButton = startButton.querySelector('.mute-toggle');
if (existingMuteButton) {
existingMuteButton.removeEventListener('click', toggleMute);
}
startButton.innerHTML = '';
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
button.innerHTML = `
startButton.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
button.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
</div>
const pulseContainer = document.createElement('div');
pulseContainer.className = 'pulse-container';
pulseContainer.innerHTML = `
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
`;
const muteToggle = document.createElement('div');
muteToggle.className = 'mute-toggle';
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
muteToggle.addEventListener('click', toggleMute);
startButton.appendChild(pulseContainer);
startButton.appendChild(muteToggle);
startButton.disabled = false;
} else {
button.innerHTML = 'Start Conversation';
startButton.textContent = 'Start Conversation';
startButton.disabled = false;
}
}
function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
isMuted = !isMuted;
console.log("Mute toggled:", isMuted);
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});
updateButtonState();
}
function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
@@ -378,6 +451,8 @@
clearTimeout(timeoutId);
const toast = document.getElementById('error-toast');
toast.style.display = 'none';
} else if (['closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
stop();
}
updateButtonState();
});
@@ -448,9 +523,10 @@
if (animationFrame) {
cancelAnimationFrame(animationFrame);
animationFrame = null;
}
if (audioContext) {
audioContext.close();
audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
audioContext = null;
analyser = null;
audioSource = null;
@@ -464,22 +540,33 @@
});
}
if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.stop) sender.track.stop();
});
}
peerConnection.onicecandidate = null;
peerConnection.ondatachannel = null;
peerConnection.onconnectionstatechange = null;
peerConnection.close();
peerConnection = null;
console.log("Peer connection closed.");
}
isMuted = false;
updateButtonState();
audioLevel = 0;
}
startButton.addEventListener('click', () => {
if (!peerConnection || peerConnection.connectionState !== 'connected') {
setupWebRTC();
} else {
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}
if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stop();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
messages = [];
chatMessages.innerHTML = '';
setupWebRTC();
updateButtonState();
}
});
</script>

View File

@@ -1,4 +1,4 @@
fastrtc[vad, stt]
fastrtc[vad, stt]==0.0.20.rc2
python-dotenv
huggingface_hub>=0.29.0
twilio

View File

@@ -1,5 +1,4 @@
from pathlib import Path
from typing import Dict, List
from dotenv import load_dotenv
from fastrtc import (
@@ -22,11 +21,11 @@ stt_model = get_stt_model()
tts_model = get_tts_model()
# Conversation state to maintain history
conversation_state: List[Dict[str, str]] = []
conversation_state: list[dict[str, str]] = []
# System prompt for agent
system_prompt = """You are a helpful assistant that can helps with finding places to
workremotely from. You should specifically check against reviews and ratings of the
work remotely from. You should specifically check against reviews and ratings of the
place. You should use these criteria to find the best place to work from:
- Price
- Reviews
@@ -78,9 +77,7 @@ def process_response(audio):
response_content = agent.run(input_text)
# Convert response to audio using TTS model
for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
# Yield the audio chunk
yield audio_chunk
yield from tts_model.stream_tts_sync(response_content or "")
stream = Stream(

View File

@@ -76,14 +76,14 @@ def response(
)
for chunk in aggregate_bytes_to_16bit(iterator):
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
yield (24000, audio_array, "mono")
yield (24000, audio_array)
chatbot = gr.Chatbot(type="messages")
stream = Stream(
modality="audio",
mode="send-receive",
handler=ReplyOnPause(response),
handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
additional_outputs_handler=lambda a, b: b,
additional_inputs=[chatbot],
additional_outputs=[chatbot],

View File

@@ -390,35 +390,8 @@
rttValues: []
};
// Load mu-law library
// Add load promise to track when the script is ready
function resample(audioData, fromSampleRate, toSampleRate) {
const ratio = fromSampleRate / toSampleRate;
const newLength = Math.round(audioData.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const position = i * ratio;
const index = Math.floor(position);
const fraction = position - index;
if (index + 1 < audioData.length) {
result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
} else {
result[i] = audioData[index];
}
}
return result;
}
function convertToMulaw(audioData, sampleRate) {
// Resample to 8000 Hz if needed
if (sampleRate !== 8000) {
audioData = resample(audioData, sampleRate, 8000);
}
// Convert float32 [-1,1] to int16 [-32768,32767]
const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
wsMetrics.startTime = performance.now();
// Create audio context and analyser for visualization
const audioContext = new AudioContext();
const audioContext = new AudioContext({ sampleRate: 24000 });
const analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);

View File

@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Transcribe audio in realtime with Whisper
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

View File

@@ -12,8 +12,7 @@ tags:
- webrtc
- websocket
- gradio
- secret|TWILIO_ACCOUNT_SID
- secret|TWILIO_AUTH_TOKEN
- secret|HF_TOKEN
- secret|GROQ_API_KEY
title: Whisper Realtime Transcription (Gradio UI)
---

View File

@@ -9,14 +9,21 @@
:root {
--primary-gradient: linear-gradient(135deg, #f9a45c 0%, #e66465 100%);
--background-cream: #faf8f5;
--background-cream-end: #f7f5f2;
/* Slightly warmer end color for body gradient */
--text-dark: #2d2d2d;
--transcript-bg: #ffffff;
/* White background for transcript area */
--transcript-border: #e0e0e0;
/* Light border for transcript items */
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
margin: 0;
padding: 0;
background-color: var(--background-cream);
/* Apply a subtle vertical gradient to the body */
background: linear-gradient(to bottom, var(--background-cream), var(--background-cream-end));
color: var(--text-dark);
min-height: 100vh;
}
@@ -43,18 +50,26 @@
.container {
max-width: 1000px;
margin: 1.5rem auto;
margin: 2.5rem auto;
/* Increased top/bottom margin */
padding: 0 2rem;
}
.transcript-container {
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.06);
border-radius: 12px;
/* Slightly larger radius */
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08);
/* Enhanced shadow */
padding: 1.5rem;
height: 300px;
height: 350px;
/* Increased height */
overflow-y: auto;
margin-bottom: 1.5rem;
border: 1px solid rgba(0, 0, 0, 0.1);
margin-bottom: 2rem;
/* Increased margin */
border: 1px solid rgba(0, 0, 0, 0.05);
/* Softer border */
background-color: var(--transcript-bg);
/* Use the new variable */
}
.controls {
@@ -73,6 +88,8 @@
transition: all 0.2s ease;
font-weight: 500;
min-width: 180px;
position: relative;
padding-right: 50px;
}
button:hover {
@@ -86,22 +103,39 @@
/* Transcript text styling */
.transcript-container p {
margin: 0.4rem 0;
padding: 0.6rem;
margin: 0.6rem 0;
/* Increased vertical margin */
padding: 0.8rem 1rem;
/* Increased padding */
background: var(--background-cream);
border-radius: 4px;
line-height: 1.4;
font-size: 0.95rem;
/* Use the lighter cream for contrast */
border-radius: 6px;
/* Slightly larger radius */
line-height: 1.5;
/* Improved line spacing */
font-size: 0.98rem;
/* Slightly larger font */
border-left: 3px solid var(--transcript-border);
/* Add a subtle left border */
transition: background-color 0.2s ease;
/* Smooth hover effect */
}
/* Custom scrollbar - made thinner */
.transcript-container p:hover {
background-color: #fdfbf9;
/* Slightly change background on hover */
}
/* Custom scrollbar - update track color */
.transcript-container::-webkit-scrollbar {
width: 6px;
width: 8px;
/* Slightly wider scrollbar */
}
.transcript-container::-webkit-scrollbar-track {
background: var(--background-cream);
border-radius: 3px;
background: var(--background-cream-end);
/* Match body end gradient */
border-radius: 4px;
}
.transcript-container::-webkit-scrollbar-thumb {
@@ -176,6 +210,40 @@
transition: transform 0.1s ease;
}
/* Styles for the mute button */
.mute-toggle {
position: absolute;
right: 10px;
top: 50%;
transform: translateY(-50%);
width: 24px;
height: 24px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
}
.mute-toggle svg {
width: 20px;
height: 20px;
stroke: white;
}
/* Adjust layout for button content when mute is present */
.button-content {
display: flex;
align-items: center;
justify-content: center;
width: calc(100% - 40px);
margin-right: 40px;
}
.icon-with-spinner,
.pulse-container {
width: 100%;
}
@keyframes spin {
to {
transform: rotate(360deg);
@@ -206,10 +274,29 @@
let audioContext, analyser, audioSource;
let audioLevel = 0;
let animationFrame;
let isMuted = false;
const startButton = document.getElementById('start-button');
const transcriptDiv = document.getElementById('transcript');
// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;
const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;
function showError(message) {
const toast = document.getElementById('error-toast');
toast.textContent = message;
@@ -241,25 +328,63 @@
}
function updateButtonState() {
// Remove existing mute listener if present
const existingMuteButton = startButton.querySelector('.mute-toggle');
if (existingMuteButton) {
existingMuteButton.removeEventListener('click', toggleMute);
existingMuteButton.remove();
}
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
startButton.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
<div class="button-content">
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
startButton.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Recording</span>
<div class="button-content">
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Recording</span>
</div>
</div>
<div class="mute-toggle" title="${isMuted ? 'Unmute' : 'Mute'}">
${isMuted ? micMutedIconSVG : micIconSVG}
</div>
`;
startButton.disabled = false;
const muteButton = startButton.querySelector('.mute-toggle');
if (muteButton) {
muteButton.addEventListener('click', toggleMute);
}
} else {
startButton.innerHTML = 'Start Recording';
startButton.disabled = false;
}
}
function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
isMuted = !isMuted;
console.log("Mute toggled:", isMuted);
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});
updateButtonState();
}
function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
@@ -321,6 +446,21 @@
updateButtonState();
});
peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};
// Create data channel for messages
const dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = handleMessage;
@@ -329,20 +469,6 @@
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);
await new Promise((resolve) => {
if (peerConnection.iceGatheringState === "complete") {
resolve();
} else {
const checkState = () => {
if (peerConnection.iceGatheringState === "complete") {
peerConnection.removeEventListener("icegatheringstatechange", checkState);
resolve();
}
};
peerConnection.addEventListener("icegatheringstatechange", checkState);
}
});
webrtc_id = Math.random().toString(36).substring(7);
const response = await fetch('/webrtc/offer', {
@@ -392,41 +518,45 @@
function stop() {
if (animationFrame) {
cancelAnimationFrame(animationFrame);
animationFrame = null;
}
if (audioContext) {
audioContext.close();
audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
audioContext = null;
analyser = null;
audioSource = null;
}
if (peerConnection) {
if (peerConnection.getTransceivers) {
peerConnection.getTransceivers().forEach(transceiver => {
if (transceiver.stop) {
transceiver.stop();
if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track) {
sender.track.stop();
console.log(`Track ${sender.track.id} stopped.`);
}
});
}
if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.stop) sender.track.stop();
});
}
setTimeout(() => {
peerConnection.close();
}, 500);
peerConnection.close();
peerConnection = null;
console.log("Peer connection closed.");
}
audioLevel = 0;
isMuted = false;
updateButtonState();
}
startButton.addEventListener('click', () => {
if (startButton.textContent === 'Start Recording') {
setupWebRTC();
} else {
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}
if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stop();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
transcriptDiv.innerHTML = '';
setupWebRTC();
updateButtonState();
}
});
</script>

View File

@@ -1,4 +1,3 @@
fastrtc[vad]
fastrtc[vad]==0.0.20.rc2
groq
python-dotenv
twilio
python-dotenv