Mirror of https://github.com/HumanAIGC-Engineering/gradio-webrtc.git (synced 2026-02-05 18:09:23 +08:00)
Merge remote-tracking branch 'origin/main' into open-avatar-chat-0.4.0
@@ -4,12 +4,12 @@ emoji: ♊️
 colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 5.16.0
+sdk_version: 5.25.2
 app_file: app.py
 pinned: false
 license: mit
 short_description: Gemini understands audio and video!
-tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
@@ -6,12 +6,14 @@ from io import BytesIO

 import gradio as gr
 import numpy as np
+import websockets
 from dotenv import load_dotenv
 from fastrtc import (
     AsyncAudioVideoStreamHandler,
     Stream,
     WebRTC,
-    get_twilio_turn_credentials,
+    get_cloudflare_turn_credentials_async,
+    wait_for_item,
 )
 from google import genai
 from gradio.utils import get_space
@@ -61,18 +63,24 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         )
         config = {"response_modalities": ["AUDIO"]}
         async with client.aio.live.connect(
-            model="gemini-2.0-flash-exp", config=config
+            model="gemini-2.0-flash-exp",
+            config=config,  # type: ignore
         ) as session:
             self.session = session
             print("set session")
             while not self.quit.is_set():
                 turn = self.session.receive()
-                async for response in turn:
-                    if data := response.data:
-                        audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                try:
+                    async for response in turn:
+                        if data := response.data:
+                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                            self.audio_queue.put_nowait(audio)
+                except websockets.exceptions.ConnectionClosedOK:
+                    print("connection closed")
+                    break

     async def video_receive(self, frame: np.ndarray):
-        self.video_queue.put_nowait(frame)

         if self.session:
             # send image every 1 second
             print(time.time() - self.last_frame_time)
||||
@@ -82,10 +90,12 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
                 if self.latest_args[1] is not None:
                     await self.session.send(input=encode_image(self.latest_args[1]))

+        self.video_queue.put_nowait(frame)

     async def video_emit(self):
-        return await self.video_queue.get()
+        frame = await wait_for_item(self.video_queue, 0.01)
+        if frame is not None:
+            return frame
+        else:
+            return np.zeros((100, 100, 3), dtype=np.uint8)

     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
@@ -95,13 +105,15 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         await self.session.send(input=audio_message)

     async def emit(self):
-        array = await self.audio_queue.get()
-        return (self.output_sample_rate, array)
+        array = await wait_for_item(self.audio_queue, 0.01)
+        if array is not None:
+            return (self.output_sample_rate, array)
+        return array

     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
-            await self.session._websocket.close()
+            await self.session.close()
             self.quit.clear()

@@ -109,10 +121,8 @@ stream = Stream(
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
-    rtc_configuration=get_twilio_turn_credentials()
-    if get_space() == "spaces"
-    else None,
-    time_limit=90 if get_space() else None,
+    rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
+    time_limit=180 if get_space() else None,
     additional_inputs=[
         gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
     ],
@@ -151,8 +161,8 @@ with gr.Blocks(css=css) as demo:
             modality="audio-video",
             mode="send-receive",
             elem_id="video-source",
-            rtc_configuration=get_twilio_turn_credentials()
-            if get_space() == "spaces"
+            rtc_configuration=get_cloudflare_turn_credentials_async
+            if get_space()
             else None,
             icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
             pulse_color="rgb(255, 255, 255)",
@@ -167,7 +177,7 @@ with gr.Blocks(css=css) as demo:
             GeminiHandler(),
             inputs=[webrtc, image_input],
             outputs=[webrtc],
-            time_limit=60 if get_space() else None,
+            time_limit=180 if get_space() else None,
             concurrency_limit=2 if get_space() else None,
         )

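The change above swaps a Twilio token fetched once at import time for fastrtc's Cloudflare helper, passed as the coroutine function itself so credentials are resolved per connection. A minimal sketch of that pattern, using only the fastrtc API already shown in this diff (the trivial echo handler is illustrative, not part of this commit):

```python
from fastrtc import ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
from gradio.utils import get_space


def echo(audio):
    # illustrative handler: send the caller's audio straight back
    yield audio


stream = Stream(
    ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    # Pass the coroutine function, not its result; fastrtc calls it when a
    # peer connects, so each session gets fresh TURN credentials.
    rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
)
```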
@@ -1,4 +1,4 @@
-fastrtc
+fastrtc==0.0.23.rc1
 python-dotenv
 google-genai
 twilio

@@ -1,8 +1,8 @@
 import asyncio
 import base64
 import os
+from collections.abc import AsyncGenerator
 from pathlib import Path
-from typing import AsyncGenerator

 import librosa
 import numpy as np
@@ -190,13 +190,13 @@ if __name__ == "__main__":
         gr.HTML(
             """
         <div style="display: flex; justify-content: center; align-items: center;">
-            <h1>Gemini Conversation</h1>
+            <h1>Gemini Conversation</h1>
         </div>
         """
         )
         gr.Markdown(
             """# How to run this demo


             - Clone the repo - top right of the page click the vertical three dots and select "Clone repository"
             - Open the repo in a terminal and install the dependencies
             - Get a gemini API key [here](https://ai.google.dev/gemini-api/docs/api-key)
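The hunk above is cut off mid-list, but the steps it describes (clone, install dependencies, get a Gemini API key) come down to putting the key where the app can read it. A hedged sketch, assuming the key is stored in a local .env file under the GEMINI_API_KEY name used in this Space's tags:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # read .env from the working directory, as app.py does
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Set GEMINI_API_KEY in .env before launching the demo")
```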
demo/integrated_textbox/README.md (new file, 19 lines)
@@ -0,0 +1,19 @@
---
title: Integrated Text Box
emoji: 📝
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 5.31.0
app_file: app.py
pinned: false
license: mit
short_description: Talk or type to ANY LLM!
tags: [webrtc, websocket, gradio, secret|HF_TOKEN]
---

# Integrated Textbox

Talk or type to ANY LLM!

demo/integrated_textbox/app.py (new file, 143 lines)
@@ -0,0 +1,143 @@
# /// script
# dependencies = [
#     "fastrtc[vad, stt]>=0.0.26",
#     "openai",
# ]
# ///


import gradio as gr
import huggingface_hub
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    WebRTC,
    WebRTCData,
    WebRTCError,
    get_hf_turn_credentials,
    get_stt_model,
)
from gradio.utils import get_space
from openai import OpenAI

stt_model = get_stt_model()

conversations = {}


def response(
    data: WebRTCData,
    conversation: list[dict],
    token: str | None = None,
    model: str = "meta-llama/Llama-3.2-3B-Instruct",
    provider: str = "sambanova",
):
    print("conversation before", conversation)
    if not provider.startswith("http") and not token:
        raise WebRTCError("Please add your HF token.")

    if data.audio is not None and data.audio[1].size > 0:
        user_audio_text = stt_model.stt(data.audio)
        conversation.append({"role": "user", "content": user_audio_text})
    else:
        conversation.append({"role": "user", "content": data.textbox})

    yield AdditionalOutputs(conversation)

    if provider.startswith("http"):
        client = OpenAI(base_url=provider, api_key="ollama")
    else:
        client = huggingface_hub.InferenceClient(
            api_key=token,
            provider=provider,  # type: ignore
        )

    request = client.chat.completions.create(
        model=model,
        messages=conversation,  # type: ignore
        temperature=1,
        top_p=0.1,
    )
    response = {"role": "assistant", "content": request.choices[0].message.content}

    conversation.append(response)
    print("conversation after", conversation)
    yield AdditionalOutputs(conversation)


css = """
footer {
    display: none !important;
}
"""

providers = [
    "black-forest-labs",
    "cerebras",
    "cohere",
    "fal-ai",
    "fireworks-ai",
    "hf-inference",
    "hyperbolic",
    "nebius",
    "novita",
    "openai",
    "replicate",
    "sambanova",
    "together",
]


def hide_token(provider: str):
    if provider.startswith("http"):
        return gr.Textbox(visible=False)
    return gr.skip()


with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
    <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
        <img src="https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/AV_Huggy.png" alt="Streaming Huggy" style="height: 50px; margin-right: 10px"> FastRTC Chat
    </h1>
    """
    )
    with gr.Sidebar():
        token = gr.Textbox(
            placeholder="Place your HF token here", type="password", label="HF Token"
        )
        model = gr.Dropdown(
            choices=["meta-llama/Llama-3.2-3B-Instruct"],
            allow_custom_value=True,
            label="Model",
        )
        provider = gr.Dropdown(
            label="Provider",
            choices=providers,
            value="sambanova",
            info="Select a hf-compatible provider or type the url of your server, e.g. http://127.0.0.1:11434/v1 for ollama",
            allow_custom_value=True,
        )
        provider.change(hide_token, inputs=[provider], outputs=[token])
    cb = gr.Chatbot(type="messages", height=600)
    webrtc = WebRTC(
        modality="audio",
        mode="send",
        variant="textbox",
        rtc_configuration=get_hf_turn_credentials if get_space() else None,
        server_rtc_configuration=get_hf_turn_credentials(ttl=3_600 * 24 * 30)
        if get_space()
        else None,
    )
    webrtc.stream(
        ReplyOnPause(response),  # type: ignore
        inputs=[webrtc, cb, token, model, provider],
        outputs=[cb],
        concurrency_limit=100,
    )
    webrtc.on_additional_outputs(
        lambda old, new: new, inputs=[cb], outputs=[cb], concurrency_limit=100
    )

if __name__ == "__main__":
    demo.launch(server_port=7860)
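In the new demo above, any provider value beginning with "http" is treated as an OpenAI-compatible base URL rather than a Hugging Face inference provider. A minimal sketch of that branch, assuming a local Ollama server at the example URL from the dropdown's help text (the model tag is hypothetical):

```python
from openai import OpenAI

# Point the OpenAI client at a local OpenAI-compatible server instead of a
# hosted provider; the "ollama" api_key is a placeholder Ollama ignores.
client = OpenAI(base_url="http://127.0.0.1:11434/v1", api_key="ollama")
reply = client.chat.completions.create(
    model="llama3.2",  # hypothetical local model tag
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(reply.choices[0].message.content)
```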
demo/integrated_textbox/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
fastrtc[vad, stt]
openai
@@ -1,5 +1,6 @@
-from functools import lru_cache
-from typing import Generator, Literal
+from collections.abc import Generator
+from functools import cache
+from typing import Literal

 import gradio as gr
 import numpy as np
@@ -17,7 +18,7 @@ from numpy.typing import NDArray
 load_dotenv()


-@lru_cache(maxsize=None)
+@cache
 def load_moonshine(
     model_name: Literal["moonshine/base", "moonshine/tiny"],
 ) -> MoonshineOnnxModel:

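functools.cache (Python 3.9+) is an unbounded memoizing decorator, equivalent to lru_cache(maxsize=None), so the swap above does not change behavior. A small sketch of the behavior the decorator provides (the toy function is illustrative, not the demo's actual loader):

```python
from functools import cache


@cache
def load_model(name: str) -> str:
    print(f"loading {name} once")  # printed only on the first call per name
    return name.upper()


load_model("moonshine/tiny")
load_model("moonshine/tiny")  # served from the cache; nothing is printed
```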
@@ -1,6 +1,6 @@
 import fastapi
 from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
-from fastrtc.utils import audio_to_bytes
+from fastrtc.utils import audio_to_bytes, audio_to_float32
 from openai import OpenAI
 import logging
 import time
@@ -78,8 +78,8 @@ def echo(audio):
     )

     for audio_chunk in audio_stream:
-        audio_array = (
-            np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
+        audio_array = audio_to_float32(
+            np.frombuffer(audio_chunk, dtype=np.int16)
         )
         yield (24000, audio_array)

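The second hunk replaces a hand-rolled int16-to-float32 rescale with fastrtc's audio_to_float32 helper. A small sanity sketch of the equivalence the change assumes (the exact scaling factor is the helper's concern, so the comparison is tolerant):

```python
import numpy as np
from fastrtc.utils import audio_to_float32

pcm = np.array([0, 16384, -32768, 32767], dtype=np.int16)
manual = pcm.astype(np.float32) / 32768.0  # the old, hand-rolled conversion
helper = audio_to_float32(pcm)             # the helper the diff switches to
print(np.allclose(manual, helper, atol=1e-3))
```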
demo/patient_intake/app.py (new file, 363 lines)
@@ -0,0 +1,363 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
CloseStream,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_current_context,
|
||||
get_stt_model,
|
||||
get_tts_model,
|
||||
)
|
||||
from numpy.typing import NDArray
|
||||
from openai import OpenAI
|
||||
|
||||
load_dotenv()
|
||||
|
||||
tts = get_tts_model()
|
||||
stt = get_stt_model()
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
|
||||
conversations: dict[str, list[dict]] = {}
|
||||
|
||||
FUNCTIONS = [
|
||||
{
|
||||
"name": "verify_birthday",
|
||||
"description": "Use this function to verify the user has provided their correct birthday.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"birthday": {
|
||||
"type": "string",
|
||||
"description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.",
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "list_prescriptions",
|
||||
"description": "Once the user has provided a list of their prescription medications, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prescriptions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"medication": {
|
||||
"type": "string",
|
||||
"description": "The medication's name",
|
||||
},
|
||||
"dosage": {
|
||||
"type": "string",
|
||||
"description": "The prescription's dosage",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "list_allergies",
|
||||
"description": "Once the user has provided a list of their allergies, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"allergies": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "What the user is allergic to",
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "list_conditions",
|
||||
"description": "Once the user has provided a list of their medical conditions, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"conditions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "The user's medical condition",
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "list_visit_reasons",
|
||||
"description": "Once the user has provided a list of the reasons they are visiting a doctor today, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"visit_reasons": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "The user's reason for visiting the doctor",
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def create_system_message():
|
||||
system_message = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Freddy. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function.",
|
||||
}
|
||||
]
|
||||
return system_message
|
||||
|
||||
|
||||
def start_up():
|
||||
stream_id = get_current_context().webrtc_id
|
||||
conversation = create_system_message()
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=conversation, # type: ignore
|
||||
)
|
||||
llm_response = response.choices[0].message.content
|
||||
assert llm_response is not None
|
||||
yield from tts.stream_tts_sync(llm_response)
|
||||
llm_dict = {"role": "assistant", "content": llm_response}
|
||||
yield AdditionalOutputs(llm_dict, conversation)
|
||||
conversation.append(llm_dict)
|
||||
conversations[stream_id] = conversation
|
||||
|
||||
|
||||
def response(audio: tuple[int, NDArray[np.int16]]):
|
||||
stream_id = get_current_context().webrtc_id
|
||||
if stream_id not in conversations:
|
||||
conversations[stream_id] = create_system_message()
|
||||
message = stt.stt(audio)
|
||||
print("message", message)
|
||||
conversation = conversations[stream_id]
|
||||
conversation.append({"role": "user", "content": message})
|
||||
yield AdditionalOutputs({"role": "user", "content": message})
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=conversation, # type: ignore
|
||||
functions=FUNCTIONS, # type: ignore
|
||||
function_call="auto",
|
||||
)
|
||||
should_end = False
|
||||
response_message = response.choices[0].message
|
||||
if response_message.function_call:
|
||||
function_name = response_message.function_call.name
|
||||
function_args = json.loads(response_message.function_call.arguments)
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": f"Function call: {function_name} with arguments: {function_args}",
|
||||
}
|
||||
)
|
||||
if function_name == "verify_birthday":
|
||||
if function_args.get("birthday") == "1983-01-01":
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Successfully verified birthday",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Success",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions if they have any. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages. Once they have listed their prescriptions or confirmed they don't have any, call the list_prescriptions function.",
|
||||
}
|
||||
)
|
||||
else:
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Failed to verify birthday",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Failed",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function.",
|
||||
}
|
||||
)
|
||||
elif function_name == "list_prescriptions":
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Successfully listed prescriptions",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Success",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function.",
|
||||
}
|
||||
)
|
||||
elif function_name == "list_allergies":
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Successfully listed allergies",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Success",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function.",
|
||||
}
|
||||
)
|
||||
elif function_name == "list_conditions":
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Successfully listed conditions",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Success",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function.",
|
||||
}
|
||||
)
|
||||
elif function_name == "list_visit_reasons":
|
||||
yield AdditionalOutputs(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Successfully listed visit reasons",
|
||||
}
|
||||
)
|
||||
conversation.append(response_message.model_dump())
|
||||
conversation.append(
|
||||
{
|
||||
"role": "function",
|
||||
"name": function_name,
|
||||
"content": "Success",
|
||||
}
|
||||
)
|
||||
conversation.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Now, thank the user and end the conversation.",
|
||||
}
|
||||
)
|
||||
should_end = True
|
||||
llm_response = (
|
||||
client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=conversation, # type: ignore
|
||||
functions=FUNCTIONS, # type: ignore
|
||||
function_call="auto",
|
||||
)
|
||||
.choices[0]
|
||||
.message.content
|
||||
)
|
||||
else:
|
||||
llm_response = response.choices[0].message.content
|
||||
assert llm_response is not None
|
||||
yield from tts.stream_tts_sync(llm_response)
|
||||
llm_dict = {"role": "assistant", "content": llm_response}
|
||||
yield AdditionalOutputs(llm_dict, conversation)
|
||||
conversation.append(llm_dict)
|
||||
if should_end:
|
||||
yield CloseStream()
|
||||
|
||||
|
||||
def update_chatbot(
|
||||
chatbot: list[dict],
|
||||
conversation_old,
|
||||
response: dict,
|
||||
conversation: list[dict] | None = None,
|
||||
):
|
||||
chatbot.append(response)
|
||||
return chatbot, conversation
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages")
|
||||
|
||||
|
||||
stream = Stream(
|
||||
ReplyOnPause(response, start_up),
|
||||
mode="send-receive",
|
||||
modality="audio",
|
||||
additional_inputs=[chatbot],
|
||||
additional_outputs=[chatbot, gr.JSON(label="Conversation")],
|
||||
additional_outputs_handler=update_chatbot,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
stream.fastphone(host="0.0.0.0", port=7860)
|
||||
else:
|
||||
stream.ui.launch(server_port=7860)
|
||||
demo/qwen_phone_chat/README.md (new file, 14 lines)
@@ -0,0 +1,14 @@
---
title: Qwen Phone Chat
emoji: 📞
colorFrom: pink
colorTo: green
sdk: gradio
sdk_version: 5.25.2
app_file: app.py
pinned: false
license: mit
short_description: Talk with Qwen 2.5 Omni over the Phone
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
demo/qwen_phone_chat/app.py (new file, 217 lines)
@@ -0,0 +1,217 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import secrets
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
AsyncStreamHandler,
|
||||
Stream,
|
||||
get_cloudflare_turn_credentials_async,
|
||||
wait_for_item,
|
||||
)
|
||||
from websockets.asyncio.client import connect
|
||||
|
||||
load_dotenv()
|
||||
|
||||
cur_dir = Path(__file__).parent
|
||||
|
||||
API_KEY = os.getenv("MODELSCOPE_API_KEY", "")
|
||||
API_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen-omni-turbo-realtime-2025-03-26"
|
||||
VOICES = ["Chelsie", "Serena", "Ethan", "Cherry"]
|
||||
headers = {"Authorization": "Bearer " + API_KEY}
|
||||
|
||||
|
||||
class QwenOmniHandler(AsyncStreamHandler):
|
||||
def __init__(
|
||||
self,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
expected_layout="mono",
|
||||
output_sample_rate=24_000,
|
||||
input_sample_rate=16_000,
|
||||
)
|
||||
self.connection = None
|
||||
self.output_queue = asyncio.Queue()
|
||||
|
||||
def copy(self):
|
||||
return QwenOmniHandler()
|
||||
|
||||
@staticmethod
|
||||
def msg_id() -> str:
|
||||
return f"event_{secrets.token_hex(10)}"
|
||||
|
||||
async def start_up(
|
||||
self,
|
||||
):
|
||||
"""Connect to realtime API. Run forever in separate thread to keep connection open."""
|
||||
voice_id = "Serena"
|
||||
print("voice_id", voice_id)
|
||||
async with connect(
|
||||
API_URL,
|
||||
additional_headers=headers,
|
||||
) as conn:
|
||||
self.client = conn
|
||||
await conn.send(
|
||||
json.dumps(
|
||||
{
|
||||
"event_id": self.msg_id(),
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"modalities": [
|
||||
"text",
|
||||
"audio",
|
||||
],
|
||||
"voice": voice_id,
|
||||
"input_audio_format": "pcm16",
|
||||
},
|
||||
}
|
||||
)
|
||||
)
|
||||
self.connection = conn
|
||||
try:
|
||||
async for data in self.connection:
|
||||
event = json.loads(data)
|
||||
print("event", event["type"])
|
||||
if "type" not in event:
|
||||
continue
|
||||
# Handle interruptions
|
||||
if event["type"] == "input_audio_buffer.speech_started":
|
||||
self.clear_queue()
|
||||
if event["type"] == "response.audio.delta":
|
||||
print("putting output")
|
||||
await self.output_queue.put(
|
||||
(
|
||||
self.output_sample_rate,
|
||||
np.frombuffer(
|
||||
base64.b64decode(event["delta"]), dtype=np.int16
|
||||
).reshape(1, -1),
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
print("error", e)
|
||||
|
||||
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
||||
if not self.connection:
|
||||
return
|
||||
_, array = frame
|
||||
array = array.squeeze()
|
||||
audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
|
||||
try:
|
||||
await self.connection.send(
|
||||
json.dumps(
|
||||
{
|
||||
"event_id": self.msg_id(),
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": audio_message,
|
||||
}
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
print("error", e)
|
||||
|
||||
async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
|
||||
return await wait_for_item(self.output_queue)
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
if self.connection:
|
||||
await self.connection.close()
|
||||
self.connection = None
|
||||
|
||||
|
||||
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], type="value", label="Voice")
|
||||
stream = Stream(
|
||||
QwenOmniHandler(),
|
||||
mode="send-receive",
|
||||
modality="audio",
|
||||
additional_inputs=[voice],
|
||||
additional_outputs=None,
|
||||
rtc_configuration=get_cloudflare_turn_credentials_async,
|
||||
concurrency_limit=20,
|
||||
time_limit=180,
|
||||
)
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
@app.post("/telephone/incoming")
|
||||
async def handle_incoming_call(request: Request):
|
||||
"""
|
||||
Handle incoming telephone calls (e.g., via Twilio).
|
||||
|
||||
Generates TwiML instructions to connect the incoming call to the
|
||||
WebSocket handler (`/telephone/handler`) for audio streaming.
|
||||
|
||||
Args:
|
||||
request: The FastAPI Request object for the incoming call webhook.
|
||||
|
||||
Returns:
|
||||
An HTMLResponse containing the TwiML instructions as XML.
|
||||
"""
|
||||
from twilio.twiml.voice_response import Connect, VoiceResponse
|
||||
|
||||
if len(stream.connections) > (stream.concurrency_limit or 20):
|
||||
response = VoiceResponse()
|
||||
response.say("Qwen is busy please try again later!")
|
||||
return HTMLResponse(content=str(response), media_type="application/xml")
|
||||
|
||||
response = VoiceResponse()
|
||||
response.say("Connecting to Qwen")
|
||||
connect = Connect()
|
||||
print("request.url.hostname", request.url.hostname)
|
||||
connect.stream(url=f"wss://{request.url.hostname}/telephone/handler")
|
||||
response.append(connect)
|
||||
response.say("The call has been disconnected.")
|
||||
return HTMLResponse(content=str(response), media_type="application/xml")
|
||||
|
||||
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
html_content = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Qwen Phone Chat</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
line-height: 1.6;
|
||||
}
|
||||
pre {
|
||||
background-color: #f5f5f5;
|
||||
padding: 15px;
|
||||
border-radius: 5px;
|
||||
overflow-x: auto;
|
||||
}
|
||||
h1 {
|
||||
color: #333;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Qwen Phone Chat</h1>
|
||||
<p>Call +1 (877) 853-7936</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
return HTMLResponse(content=html_content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# stream.fastphone(host="0.0.0.0", port=7860)
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
demo/qwen_phone_chat/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
fastrtc
websockets>=14.0
demo/send_text_or_audio/app.py (new file, 173 lines)
@@ -0,0 +1,173 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
import gradio as gr
|
||||
import huggingface_hub
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from pydantic import BaseModel
|
||||
|
||||
load_dotenv()
|
||||
|
||||
curr_dir = Path(__file__).parent
|
||||
|
||||
|
||||
client = huggingface_hub.InferenceClient(
|
||||
api_key=os.environ.get("SAMBANOVA_API_KEY"),
|
||||
provider="sambanova",
|
||||
)
|
||||
stt_model = get_stt_model()
|
||||
|
||||
|
||||
def response(
|
||||
audio: tuple[int, np.ndarray],
|
||||
gradio_chatbot: list[dict] | None = None,
|
||||
conversation_state: list[dict] | None = None,
|
||||
textbox: str | None = None,
|
||||
):
|
||||
gradio_chatbot = gradio_chatbot or []
|
||||
conversation_state = conversation_state or []
|
||||
print("chatbot", gradio_chatbot)
|
||||
|
||||
if textbox:
|
||||
text = textbox
|
||||
else:
|
||||
text = stt_model.stt(audio)
|
||||
|
||||
sample_rate, array = audio
|
||||
gradio_chatbot.append({"role": "user", "content": text})
|
||||
yield AdditionalOutputs(gradio_chatbot, conversation_state)
|
||||
|
||||
conversation_state.append({"role": "user", "content": text})
|
||||
request = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
messages=conversation_state, # type: ignore
|
||||
temperature=0.1,
|
||||
top_p=0.1,
|
||||
)
|
||||
response = {"role": "assistant", "content": request.choices[0].message.content}
|
||||
|
||||
conversation_state.append(response)
|
||||
gradio_chatbot.append(response)
|
||||
|
||||
yield AdditionalOutputs(gradio_chatbot, conversation_state)
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages", value=[])
|
||||
state = gr.State(value=[])
|
||||
textbox = gr.Textbox(value="", interactive=True)
|
||||
stream = Stream(
|
||||
ReplyOnPause(
|
||||
response, # type: ignore
|
||||
input_sample_rate=16000,
|
||||
),
|
||||
mode="send",
|
||||
modality="audio",
|
||||
additional_inputs=[
|
||||
chatbot,
|
||||
state,
|
||||
textbox,
|
||||
],
|
||||
additional_outputs=[chatbot, state],
|
||||
additional_outputs_handler=lambda *a: (a[2], a[3]),
|
||||
concurrency_limit=20 if get_space() else 5,
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
)
|
||||
|
||||
|
||||
def trigger_response(webrtc_id: str):
|
||||
cast(ReplyOnPause, stream.webrtc_component.handlers[webrtc_id]).trigger_response()
|
||||
return ""
|
||||
|
||||
|
||||
with stream.ui as demo:
|
||||
button = gr.Button("Send")
|
||||
button.click(
|
||||
trigger_response,
|
||||
inputs=[stream.webrtc_component],
|
||||
outputs=[textbox],
|
||||
)
|
||||
|
||||
stream.ui = demo
|
||||
app = FastAPI()
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
class Message(BaseModel):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class InputData(BaseModel):
|
||||
webrtc_id: str
|
||||
chatbot: list[Message]
|
||||
state: list[Message]
|
||||
textbox: str
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = get_twilio_turn_credentials() if get_space() else None
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content)
|
||||
|
||||
|
||||
@app.post("/input_hook")
|
||||
async def _(data: InputData):
|
||||
body = data.model_dump()
|
||||
stream.set_input(data.webrtc_id, body["chatbot"], body["state"], body["textbox"])
|
||||
cast(ReplyOnPause, stream.handlers[data.webrtc_id]).trigger_response()
|
||||
|
||||
|
||||
def audio_to_base64(file_path):
|
||||
audio_format = "wav"
|
||||
with open(file_path, "rb") as audio_file:
|
||||
encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")
|
||||
return f"data:audio/{audio_format};base64,{encoded_audio}"
|
||||
|
||||
|
||||
@app.get("/outputs")
|
||||
async def _(webrtc_id: str):
|
||||
async def output_stream():
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
chatbot = output.args[0]
|
||||
state = output.args[1]
|
||||
user_message = chatbot[-1]["content"]
|
||||
data = {
|
||||
"message": state[-1],
|
||||
"audio": (
|
||||
audio_to_base64(user_message["path"])
|
||||
if isinstance(user_message, dict) and "path" in user_message
|
||||
else None
|
||||
),
|
||||
}
|
||||
yield f"event: output\ndata: {json.dumps(data)}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(server_port=7860)
|
||||
elif mode == "PHONE":
|
||||
raise ValueError("Phone mode not supported")
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
demo/send_text_or_audio/index.html (new file, 539 lines)
@@ -0,0 +1,539 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Talk to Sambanova</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
background-color: #f8f9fa;
|
||||
color: #1a1a1a;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
height: 100vh;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
height: 80%;
|
||||
}
|
||||
|
||||
.logo {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
padding: 20px;
|
||||
height: 90%;
|
||||
box-sizing: border-box;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 20px;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 20px;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #e9ecef;
|
||||
margin-left: 20%;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #f1f3f5;
|
||||
margin-right: 20%;
|
||||
}
|
||||
|
||||
.controls {
|
||||
text-align: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
button {
|
||||
background-color: #0066cc;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 24px;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
border-radius: 4px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
background-color: #0052a3;
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid #ffffff;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background-color: #ffffff;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Add styles for typing indicator */
|
||||
.typing-indicator {
|
||||
padding: 8px;
|
||||
background-color: #f1f3f5;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 10px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.dots {
|
||||
display: inline-flex;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
background-color: #0066cc;
|
||||
border-radius: 50%;
|
||||
animation: pulse 1.5s infinite;
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.5s;
|
||||
}
|
||||
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 1s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.5;
|
||||
transform: scale(1);
|
||||
}
|
||||
|
||||
50% {
|
||||
opacity: 1;
|
||||
transform: scale(1.2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 16px 24px;
|
||||
border-radius: 4px;
|
||||
font-size: 14px;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #ffd700;
|
||||
color: black;
|
||||
}
|
||||
|
||||
/* Add styles for text input */
|
||||
.text-input-container {
|
||||
display: flex;
|
||||
margin-top: 10px;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
#text-input {
|
||||
flex-grow: 1;
|
||||
padding: 10px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 4px;
|
||||
font-family: inherit;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.text-input-container button {
|
||||
padding: 10px 15px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<!-- Add toast element after body opening tag -->
|
||||
<div id="error-toast" class="toast"></div>
|
||||
<div class="container">
|
||||
<div class="logo">
|
||||
<h1>Talk to Sambanova 🗣️</h1>
|
||||
<h2 style="font-size: 1.2em; color: #666; margin-top: 10px;">Speak to Llama 3.2 powered by Sambanova API
|
||||
</h2>
|
||||
</div>
|
||||
<div class="chat-container">
|
||||
<div class="chat-messages" id="chat-messages"></div>
|
||||
<div class="typing-indicator" id="typing-indicator">
|
||||
<div class="dots">
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Added text input form -->
|
||||
<form id="text-input-form" class="text-input-container">
|
||||
<input type="text" id="text-input" placeholder="Type your message..." />
|
||||
<button type="submit">Send</button>
|
||||
</form>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="start-button">Start Conversation</button>
|
||||
</div>
|
||||
</div>
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
|
||||
let audioLevel = 0;
|
||||
let animationFrame;
|
||||
let audioContext, analyser, audioSource;
|
||||
let messages = [];
|
||||
let eventSource;
|
||||
|
||||
function updateButtonState() {
|
||||
const button = document.getElementById('start-button');
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
button.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
button.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
button.innerHTML = 'Start Conversation';
|
||||
}
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser = audioContext.createAnalyser();
|
||||
audioSource = audioContext.createMediaStreamSource(stream);
|
||||
audioSource.connect(analyser);
|
||||
analyser.fftSize = 64;
|
||||
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
|
||||
audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationFrame = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide toast after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
function handleMessage(event) {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
const typingIndicator = document.getElementById('typing-indicator');
|
||||
const textInput = document.getElementById('text-input');
|
||||
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
chatbot: messages,
|
||||
state: messages,
|
||||
textbox: textInput.value
|
||||
})
|
||||
});
|
||||
} else if (eventJson.type === "log") {
|
||||
if (eventJson.data === "pause_detected") {
|
||||
typingIndicator.style.display = 'block';
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
} else if (eventJson.data === "response_starting") {
|
||||
typingIndicator.style.display = 'none';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
// Hide warning after 5 seconds
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
setupAudioVisualization(stream);
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = handleMessage;
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
peerConnection.onicecandidate = ({ candidate }) => {
|
||||
if (candidate) {
|
||||
console.debug("Sending ICE candidate", candidate);
|
||||
fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
candidate: candidate.toJSON(),
|
||||
webrtc_id: webrtc_id,
|
||||
type: "ice-candidate",
|
||||
})
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
console.log(eventJson);
|
||||
messages.push(eventJson.message);
|
||||
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
|
||||
});
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessage(role, content) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
|
||||
if (role === 'user' && content.startsWith("data:audio/wav;base64,")) {
|
||||
// Create audio element for user messages
|
||||
const audio = document.createElement('audio');
|
||||
audio.controls = true;
|
||||
audio.src = content;
|
||||
messageDiv.appendChild(audio);
|
||||
} else {
|
||||
// Text content for assistant messages
|
||||
messageDiv.textContent = content;
|
||||
}
|
||||
|
||||
chatMessages.appendChild(messageDiv);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (eventSource) {
|
||||
eventSource.close();
|
||||
eventSource = null;
|
||||
}
|
||||
|
||||
if (animationFrame) {
|
||||
cancelAnimationFrame(animationFrame);
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
audioContext = null;
|
||||
analyser = null;
|
||||
audioSource = null;
|
||||
}
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (peerConnection.getSenders) {
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.stop) sender.track.stop();
|
||||
});
|
||||
}
|
||||
peerConnection.close();
|
||||
}
|
||||
updateButtonState();
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') {
|
||||
setupWebRTC();
|
||||
} else {
|
||||
stop();
|
||||
}
|
||||
});
|
||||
|
||||
// Add event listener for text input form
|
||||
document.getElementById('text-input-form').addEventListener('submit', function (e) {
|
||||
e.preventDefault();
|
||||
const textInput = document.getElementById('text-input');
|
||||
|
||||
if (textInput.value.trim() !== '') {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
chatbot: messages,
|
||||
state: messages,
|
||||
textbox: textInput.value
|
||||
})
|
||||
});
|
||||
|
||||
// Clear the input after submission
|
||||
textInput.value = '';
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Talk to Gemini using Google's multimodal API
-tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Talk to Gemini (Gradio UI)
-tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
@@ -3,7 +3,8 @@ import base64
 import json
 import os
 import pathlib
-from typing import AsyncGenerator, Literal
+from collections.abc import AsyncGenerator
+from typing import Literal

 import gradio as gr
 import numpy as np
@@ -13,7 +14,7 @@ from fastapi.responses import HTMLResponse
 from fastrtc import (
     AsyncStreamHandler,
     Stream,
-    get_twilio_turn_credentials,
+    get_cloudflare_turn_credentials_async,
     wait_for_item,
 )
 from google import genai
@@ -116,7 +117,7 @@ stream = Stream(
     modality="audio",
     mode="send-receive",
     handler=GeminiHandler(),
-    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
     concurrency_limit=5 if get_space() else None,
     time_limit=90 if get_space() else None,
     additional_inputs=[
@@ -159,7 +160,7 @@ async def _(body: InputData):

 @app.get("/")
 async def index():
-    rtc_config = get_twilio_turn_credentials() if get_space() else None
+    rtc_config = await get_cloudflare_turn_credentials_async() if get_space() else None
     html_content = (current_dir / "index.html").read_text()
     html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
     return HTMLResponse(content=html_content)

@@ -98,6 +98,11 @@
             font-weight: 600;
             cursor: pointer;
             transition: all 0.2s ease;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 12px;
+            min-width: 180px;
         }

         button:hover {
@@ -134,7 +139,6 @@
             align-items: center;
             justify-content: center;
             gap: 12px;
-            min-width: 180px;
         }

         .pulse-circle {
@@ -171,6 +175,23 @@
             background-color: #ffd700;
             color: black;
         }

+        /* Add styles for the mute toggle */
+        .mute-toggle {
+            width: 24px;
+            height: 24px;
+            cursor: pointer;
+            flex-shrink: 0;
+        }
+
+        .mute-toggle svg {
+            display: block;
+        }
+
+        #start-button {
+            margin-left: auto;
+            margin-right: auto;
+        }
     </style>
</head>

@@ -221,6 +242,11 @@
|
||||
let dataChannel;
|
||||
let isRecording = false;
|
||||
let webrtc_id;
|
||||
let isMuted = false;
|
||||
let analyser_input, dataArray_input;
|
||||
let analyser, dataArray;
|
||||
let source_input = null;
|
||||
let source_output = null;
|
||||
|
||||
const startButton = document.getElementById('start-button');
|
||||
const apiKeyInput = document.getElementById('api-key');
|
||||
@@ -235,7 +261,28 @@
|
||||
boxContainer.appendChild(box);
|
||||
}
|
||||
|
||||
// SVG Icons
|
||||
const micIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
const micMutedIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
<line x1="1" y1="1" x2="23" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
function updateButtonState() {
|
||||
startButton.innerHTML = '';
|
||||
startButton.onclick = null;
|
||||
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
startButton.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
@@ -243,15 +290,28 @@
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
startButton.disabled = true;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
startButton.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Recording</span>
|
||||
</div>
|
||||
const pulseContainer = document.createElement('div');
|
||||
pulseContainer.className = 'pulse-container';
|
||||
pulseContainer.innerHTML = `
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Recording</span>
|
||||
`;
|
||||
|
||||
const muteToggle = document.createElement('div');
|
||||
muteToggle.className = 'mute-toggle';
|
||||
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
|
||||
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
|
||||
muteToggle.addEventListener('click', toggleMute);
|
||||
|
||||
startButton.appendChild(pulseContainer);
|
||||
startButton.appendChild(muteToggle);
|
||||
startButton.disabled = false;
|
||||
|
||||
} else {
|
||||
startButton.innerHTML = 'Start Recording';
|
||||
startButton.disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,6 +327,23 @@
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
function toggleMute(event) {
|
||||
event.stopPropagation();
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
|
||||
|
||||
isMuted = !isMuted;
|
||||
console.log("Mute toggled:", isMuted);
|
||||
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.kind === 'audio') {
|
||||
sender.track.enabled = !isMuted;
|
||||
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
|
||||
}
|
||||
});
|
||||
|
||||
updateButtonState();
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
@@ -288,58 +365,74 @@
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
stream.getTracks().forEach(track => peerConnection.addTrack(track, stream));
|
||||
|
||||
// Update audio visualization setup
|
||||
audioContext = new AudioContext();
|
||||
if (!audioContext || audioContext.state === 'closed') {
|
||||
audioContext = new AudioContext();
|
||||
}
|
||||
if (source_input) {
|
||||
try { source_input.disconnect(); } catch (e) { console.warn("Error disconnecting previous input source:", e); }
|
||||
source_input = null;
|
||||
}
|
||||
source_input = audioContext.createMediaStreamSource(stream);
|
||||
analyser_input = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(analyser_input);
|
||||
source_input.connect(analyser_input);
|
||||
analyser_input.fftSize = 64;
|
||||
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
analyser_input.getByteFrequencyData(dataArray_input);
|
||||
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
|
||||
const audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
console.log("audioLevel", audioLevel);
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationId = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
updateAudioLevel();
|
||||
|
||||
// Add connection state change listener
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
if (analyser_input) updateAudioLevel();
|
||||
if (analyser) updateVisualization();
|
||||
} else if (['disconnected', 'failed', 'closed'].includes(peerConnection.connectionState)) {
|
||||
// Explicitly stop animations if connection drops unexpectedly
|
||||
// Note: stopWebRTC() handles the normal stop case
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
// Handle incoming audio
|
||||
peerConnection.addEventListener('track', (evt) => {
|
||||
if (audioOutput && audioOutput.srcObject !== evt.streams[0]) {
|
||||
audioOutput.srcObject = evt.streams[0];
|
||||
audioOutput.play();
|
||||
peerConnection.onicecandidate = ({ candidate }) => {
|
||||
if (candidate) {
|
||||
console.debug("Sending ICE candidate", candidate);
|
||||
fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
candidate: candidate.toJSON(),
|
||||
webrtc_id: webrtc_id,
|
||||
type: "ice-candidate",
|
||||
})
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
// Set up audio visualization on the output stream
|
||||
audioContext = new AudioContext();
|
||||
analyser = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(evt.streams[0]);
|
||||
source.connect(analyser);
|
||||
analyser.fftSize = 2048;
|
||||
dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
updateVisualization();
|
||||
peerConnection.addEventListener('track', (evt) => {
|
||||
if (evt.track.kind === 'audio' && audioOutput) {
|
||||
if (audioOutput.srcObject !== evt.streams[0]) {
|
||||
audioOutput.srcObject = evt.streams[0];
|
||||
audioOutput.play().catch(e => console.error("Audio play failed:", e));
|
||||
|
||||
if (!audioContext || audioContext.state === 'closed') {
|
||||
console.warn("AudioContext not ready for output track analysis.");
|
||||
return;
|
||||
}
|
||||
if (source_output) {
|
||||
try { source_output.disconnect(); } catch (e) { console.warn("Error disconnecting previous output source:", e); }
|
||||
source_output = null;
|
||||
}
|
||||
source_output = audioContext.createMediaStreamSource(evt.streams[0]);
|
||||
analyser = audioContext.createAnalyser();
|
||||
source_output.connect(analyser);
|
||||
analyser.fftSize = 2048;
|
||||
dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
updateVisualization();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Create data channel for messages
|
||||
dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
@@ -360,24 +453,9 @@
|
||||
}
|
||||
};
|
||||
|
||||
// Create and send offer
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
@@ -394,7 +472,7 @@
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
stopWebRTC();
|
||||
startButton.textContent = 'Start Recording';
|
||||
return;
|
||||
}
|
||||
@@ -404,13 +482,17 @@
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
stopWebRTC();
|
||||
startButton.textContent = 'Start Recording';
|
||||
}
|
||||
}
|
||||
|
||||
function updateVisualization() {
|
||||
if (!analyser) return;
|
||||
if (!analyser || !peerConnection || !['connected', 'connecting'].includes(peerConnection.connectionState)) {
|
||||
const bars = document.querySelectorAll('.box');
|
||||
bars.forEach(bar => bar.style.transform = 'scaleY(0.1)');
|
||||
return;
|
||||
}
|
||||
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
const bars = document.querySelectorAll('.box');
|
||||
@@ -420,32 +502,114 @@
|
||||
bars[i].style.transform = `scaleY(${Math.max(0.1, barHeight)})`;
|
||||
}
|
||||
|
||||
animationId = requestAnimationFrame(updateVisualization);
|
||||
requestAnimationFrame(updateVisualization);
|
||||
}
|
||||
|
||||
function updateAudioLevel() {
|
||||
if (!analyser_input || !peerConnection || !['connected', 'connecting'].includes(peerConnection.connectionState)) {
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
analyser_input.getByteFrequencyData(dataArray_input);
|
||||
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
|
||||
const audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
|
||||
function stopWebRTC() {
|
||||
console.log("Running stopWebRTC");
|
||||
if (peerConnection) {
|
||||
peerConnection.close();
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track) {
|
||||
sender.track.stop();
|
||||
}
|
||||
});
|
||||
peerConnection.ontrack = null;
|
||||
peerConnection.onicegatheringstatechange = null;
|
||||
peerConnection.onconnectionstatechange = null;
|
||||
|
||||
if (dataChannel) {
|
||||
dataChannel.onmessage = null;
|
||||
try { dataChannel.close(); } catch (e) { console.warn("Error closing data channel:", e); }
|
||||
dataChannel = null;
|
||||
}
|
||||
try { peerConnection.close(); } catch (e) { console.warn("Error closing peer connection:", e); }
|
||||
peerConnection = null;
|
||||
}
|
||||
if (animationId) {
|
||||
cancelAnimationFrame(animationId);
|
||||
|
||||
if (audioOutput) {
|
||||
audioOutput.pause();
|
||||
audioOutput.srcObject = null;
|
||||
}
|
||||
if (audioContext) {
|
||||
audioContext.close();
|
||||
|
||||
if (source_input) {
|
||||
try { source_input.disconnect(); } catch (e) { console.warn("Error disconnecting input source:", e); }
|
||||
source_input = null;
|
||||
}
|
||||
if (source_output) {
|
||||
try { source_output.disconnect(); } catch (e) { console.warn("Error disconnecting output source:", e); }
|
||||
source_output = null;
|
||||
}
|
||||
|
||||
if (audioContext && audioContext.state !== 'closed') {
|
||||
audioContext.close().then(() => {
|
||||
console.log("AudioContext closed successfully.");
|
||||
audioContext = null;
|
||||
}).catch(e => {
|
||||
console.error("Error closing AudioContext:", e);
|
||||
audioContext = null;
|
||||
});
|
||||
} else {
|
||||
audioContext = null;
|
||||
}
|
||||
|
||||
analyser_input = null;
|
||||
dataArray_input = null;
|
||||
analyser = null;
|
||||
dataArray = null;
|
||||
|
||||
isMuted = false;
|
||||
isRecording = false;
|
||||
updateButtonState();
|
||||
|
||||
const bars = document.querySelectorAll('.box');
|
||||
bars.forEach(bar => bar.style.transform = 'scaleY(0.1)');
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1);
|
||||
}
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
if (!isRecording) {
|
||||
setupWebRTC();
|
||||
startButton.classList.add('recording');
|
||||
} else {
|
||||
stopWebRTC();
|
||||
startButton.classList.remove('recording');
|
||||
startButton.addEventListener('click', (event) => {
|
||||
if (event.target.closest('.mute-toggle')) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
console.log("Stop button clicked");
|
||||
stopWebRTC();
|
||||
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
|
||||
console.log("Start button clicked");
|
||||
if (!apiKeyInput.value) {
|
||||
showError("Please enter your API Key.");
|
||||
return;
|
||||
}
|
||||
setupWebRTC();
|
||||
isRecording = true;
|
||||
updateButtonState();
|
||||
}
|
||||
isRecording = !isRecording;
|
||||
});
|
||||
|
||||
updateButtonState();
|
||||
</script>
|
||||
</body>
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
fastrtc
|
||||
fastrtc[vad]==0.0.20.rc2
|
||||
python-dotenv
|
||||
google-genai
|
||||
twilio
|
||||
|
||||
BIN demo/talk_to_llama4/AV_Huggy.png (new binary file, 46 KiB; contents not shown)
demo/talk_to_llama4/README.md (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Talk to Llama 4
|
||||
emoji: 🦙
|
||||
colorFrom: purple
|
||||
colorTo: red
|
||||
sdk: gradio
|
||||
sdk_version: 5.23.3
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Talk to Llama 4 using Groq + Cloudflare
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
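The tags line above declares the secrets this Space expects. A quick sanity check before launching locally (a sketch only; GROQ_API_KEY and CARTESIA_API_KEY are read in app.py below, and HF_TOKEN is the token listed in the tags for the Cloudflare TURN helpers):

import os

# Fail fast if any of the secrets this demo relies on is missing.
for var in ("GROQ_API_KEY", "CARTESIA_API_KEY", "HF_TOKEN"):
    if not os.getenv(var):
        raise RuntimeError(f"Missing required environment variable: {var}")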
demo/talk_to_llama4/app.py (new file, 136 lines)
@@ -0,0 +1,136 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
CartesiaTTSOptions,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_cloudflare_turn_credentials_async,
|
||||
get_current_context,
|
||||
get_stt_model,
|
||||
get_tts_model,
|
||||
)
|
||||
from groq import Groq
|
||||
from numpy.typing import NDArray
|
||||
|
||||
curr_dir = Path(__file__).parent
|
||||
load_dotenv()
|
||||
|
||||
tts_model = get_tts_model(
|
||||
model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
|
||||
)
|
||||
groq = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
||||
stt_model = get_stt_model()
|
||||
|
||||
conversations: dict[str, list[dict[str, str]]] = {}
|
||||
|
||||
|
||||
def response(user_audio: tuple[int, NDArray[np.int16]]):
|
||||
context = get_current_context()
|
||||
if context.webrtc_id not in conversations:
|
||||
conversations[context.webrtc_id] = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful assistant that can answer questions and help with tasks."
|
||||
'Please return a short response (it will be converted to audio by a text-to-speech model) and a long response to this question. They can be the same if appropriate. Return them as JSON in the format\n\n{"short": ..., "long": ...}\n\n'
|
||||
),
|
||||
}
|
||||
]
|
||||
messages = conversations[context.webrtc_id]
|
||||
|
||||
transcription = stt_model.stt(user_audio)
|
||||
messages.append({"role": "user", "content": transcription})
|
||||
|
||||
completion = groq.chat.completions.create( # type: ignore
|
||||
model="meta-llama/llama-4-scout-17b-16e-instruct",
|
||||
messages=messages, # type: ignore
|
||||
temperature=1,
|
||||
max_completion_tokens=1024,
|
||||
top_p=1,
|
||||
stream=False,
|
||||
response_format={"type": "json_object"},
|
||||
stop=None,
|
||||
)
|
||||
response = completion.choices[0].message.content
|
||||
response = json.loads(response)
|
||||
short_response = response["short"]
|
||||
long_response = response["long"]
|
||||
messages.append({"role": "assistant", "content": long_response})
|
||||
conversations[context.webrtc_id] = messages
|
||||
yield from tts_model.stream_tts_sync(
|
||||
short_response, options=CartesiaTTSOptions(sample_rate=24_000)
|
||||
)
|
||||
yield AdditionalOutputs(messages)
|
||||
|
||||
|
||||
stream = Stream(
|
||||
ReplyOnPause(response),
|
||||
modality="audio",
|
||||
mode="send-receive",
|
||||
additional_outputs=[gr.Chatbot(type="messages")],
|
||||
additional_outputs_handler=lambda old, new: new,
|
||||
rtc_configuration=None,
|
||||
ui_args={"hide_title": True},
|
||||
)
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
gr.HTML(
|
||||
f"""
|
||||
<h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
|
||||
<img src="/gradio_api/file={str((Path(__file__).parent / "AV_Huggy.png").resolve())}" alt="AV Huggy" style="height: 100px; margin-right: 10px"> FastRTC + Cartesia TTS = Blazing Fast LLM Audio
|
||||
</h1>
|
||||
"""
|
||||
)
|
||||
stream.ui.render()
|
||||
|
||||
stream.ui = demo
|
||||
|
||||
app = FastAPI()
|
||||
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = await get_cloudflare_turn_credentials_async()
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content)
|
||||
|
||||
|
||||
@app.get("/outputs")
|
||||
async def _(webrtc_id: str):
|
||||
async def output_stream():
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
state = output.args[0]
|
||||
for msg in state[-2:]:
|
||||
data = {
|
||||
"message": msg,
|
||||
}
|
||||
yield f"event: output\ndata: {json.dumps(data)}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
if (mode := os.getenv("MODE")) == "UI":
|
||||
stream.ui.launch(
|
||||
server_port=7860,
|
||||
allowed_paths=[str((Path(__file__).parent / "AV_Huggy.png").resolve())],
|
||||
)
|
||||
elif mode == "PHONE":
|
||||
raise ValueError("Phone mode not supported")
|
||||
else:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
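For reference, the transcript that app.py pushes through the /outputs route can also be read outside the browser. The sketch below is illustrative only: it assumes the demo is running locally on port 7860, that httpx is installed, and that you already have a webrtc_id from an active connection; the payload shape matches the {"message": ...} dicts yielded by output_stream above.

import json

import httpx


def follow_outputs(webrtc_id: str, base_url: str = "http://localhost:7860") -> None:
    # Each SSE "output" event carries: data: {"message": {"role": ..., "content": ...}}
    with httpx.stream(
        "GET", f"{base_url}/outputs", params={"webrtc_id": webrtc_id}, timeout=None
    ) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                msg = json.loads(line[len("data: "):])["message"]
                print(f"{msg['role']}: {msg['content']}")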
demo/talk_to_llama4/index.html (new file, 839 lines)
@@ -0,0 +1,839 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Talk to Llama 4</title>
|
||||
<style>
|
||||
:root {
|
||||
--color-primary: #3b82f6;
|
||||
--color-secondary: #f97316;
|
||||
--color-background: #0f172a;
|
||||
--color-surface: #1e293b;
|
||||
--color-text: #f1f5f9;
|
||||
--color-message-user: #334155;
|
||||
--color-message-assistant: #1e40af;
|
||||
--gradient-primary: linear-gradient(135deg, #3b82f6, #8b5cf6);
|
||||
--gradient-secondary: linear-gradient(135deg, #f97316, #ec4899);
|
||||
--boxSize: 8px;
|
||||
--gutter: 4px;
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
background-color: var(--color-background);
|
||||
color: var(--color-text);
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
padding: 2rem 1rem;
|
||||
background-image:
|
||||
radial-gradient(circle at 25% 25%, rgba(59, 130, 246, 0.1) 0%, transparent 50%),
|
||||
radial-gradient(circle at 75% 75%, rgba(249, 115, 22, 0.1) 0%, transparent 50%);
|
||||
}
|
||||
|
||||
.header-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 2rem;
|
||||
margin-bottom: 2rem;
|
||||
width: 100%;
|
||||
max-width: 800px;
|
||||
animation: fadeIn 1s ease-out;
|
||||
}
|
||||
|
||||
.header {
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
.header h1 {
|
||||
font-size: 2.5rem;
|
||||
margin-bottom: 0.5rem;
|
||||
background: var(--gradient-primary);
|
||||
-webkit-background-clip: text;
|
||||
-webkit-text-fill-color: transparent;
|
||||
font-weight: 800;
|
||||
}
|
||||
|
||||
.header h2 {
|
||||
font-size: 1.2rem;
|
||||
font-weight: 400;
|
||||
color: rgba(241, 245, 249, 0.8);
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.logo {
|
||||
width: 120px;
|
||||
height: 120px;
|
||||
background: var(--color-surface);
|
||||
border-radius: 50%;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
box-shadow: 0 15px 30px rgba(0, 0, 0, 0.3);
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
animation: float 6s ease-in-out infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.logo::before {
|
||||
content: "";
|
||||
position: absolute;
|
||||
width: 200%;
|
||||
height: 200%;
|
||||
background: var(--gradient-secondary);
|
||||
opacity: 0.2;
|
||||
animation: rotate 10s linear infinite;
|
||||
}
|
||||
|
||||
.logo img {
|
||||
width: 75%;
|
||||
height: 75%;
|
||||
object-fit: contain;
|
||||
z-index: 2;
|
||||
}
|
||||
|
||||
.container {
|
||||
width: 100%;
|
||||
max-width: 800px;
|
||||
background-color: var(--color-surface);
|
||||
border-radius: 1rem;
|
||||
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.25);
|
||||
overflow: hidden;
|
||||
animation: slideUp 0.5s ease-out;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
height: 400px;
|
||||
overflow-y: auto;
|
||||
padding: 1.5rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
scroll-behavior: smooth;
|
||||
}
|
||||
|
||||
.message {
|
||||
max-width: 80%;
|
||||
padding: 1rem;
|
||||
border-radius: 1rem;
|
||||
line-height: 1.5;
|
||||
animation: fadeIn 0.3s ease-out;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: var(--color-message-user);
|
||||
color: var(--color-text);
|
||||
align-self: flex-end;
|
||||
border-bottom-right-radius: 0.25rem;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: var(--color-message-assistant);
|
||||
color: var(--color-text);
|
||||
align-self: flex-start;
|
||||
border-bottom-left-radius: 0.25rem;
|
||||
}
|
||||
|
||||
.wave-visualizer {
|
||||
height: 100px;
|
||||
padding: 1rem;
|
||||
background-color: rgba(30, 41, 59, 0.8);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
border-top: 1px solid rgba(255, 255, 255, 0.1);
|
||||
}
|
||||
|
||||
.box-container {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
width: 100%;
|
||||
height: 64px;
|
||||
padding: 0 1rem;
|
||||
}
|
||||
|
||||
.box {
|
||||
height: 100%;
|
||||
width: var(--boxSize);
|
||||
background: var(--gradient-primary);
|
||||
border-radius: 4px;
|
||||
transform: scaleY(0.1);
|
||||
transition: transform 0.05s ease;
|
||||
}
|
||||
|
||||
.controls {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
padding: 1.5rem;
|
||||
gap: 1rem;
|
||||
border-top: 1px solid rgba(255, 255, 255, 0.1);
|
||||
}
|
||||
|
||||
#start-button {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: var(--gradient-primary);
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 9999px;
|
||||
padding: 0.75rem 1.5rem;
|
||||
font-size: 1rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
box-shadow: 0 4px 14px rgba(59, 130, 246, 0.4);
|
||||
}
|
||||
|
||||
#start-button:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 6px 20px rgba(59, 130, 246, 0.6);
|
||||
}
|
||||
|
||||
#start-button:active {
|
||||
transform: translateY(1px);
|
||||
}
|
||||
|
||||
.icon-with-spinner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.spinner {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border: 2px solid white;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
border-radius: 50%;
|
||||
background: var(--color-secondary);
|
||||
opacity: 0.85;
|
||||
flex-shrink: 0;
|
||||
transform: scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
.mute-toggle {
|
||||
width: 24px;
|
||||
height: 24px;
|
||||
cursor: pointer;
|
||||
margin-left: 12px;
|
||||
flex-shrink: 0;
|
||||
filter: drop-shadow(0 4px 6px rgba(0, 0, 0, 0.2));
|
||||
}
|
||||
|
||||
.mute-toggle svg {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
stroke: white;
|
||||
}
|
||||
|
||||
.typing-indicator {
|
||||
padding: 0.5rem 1rem;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
background-color: var(--color-message-assistant);
|
||||
border-radius: 1rem;
|
||||
align-self: flex-start;
|
||||
margin-bottom: 0.5rem;
|
||||
display: none;
|
||||
animation: fadeIn 0.3s ease-out;
|
||||
}
|
||||
|
||||
.dots {
|
||||
display: inline-flex;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
background-color: white;
|
||||
border-radius: 50%;
|
||||
animation: bounce 1.5s infinite;
|
||||
opacity: 0.7;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.15s;
|
||||
}
|
||||
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 0.3s;
|
||||
}
|
||||
|
||||
.toast {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
padding: 1rem 1.5rem;
|
||||
border-radius: 0.5rem;
|
||||
font-size: 0.875rem;
|
||||
z-index: 1000;
|
||||
display: none;
|
||||
box-shadow: 0 10px 25px rgba(0, 0, 0, 0.3);
|
||||
animation: slideDown 0.3s ease-out;
|
||||
}
|
||||
|
||||
.toast.error {
|
||||
background-color: #ef4444;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.toast.warning {
|
||||
background-color: #f59e0b;
|
||||
color: black;
|
||||
}
|
||||
|
||||
#audio-output {
|
||||
display: none;
|
||||
}
|
||||
|
||||
@keyframes float {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
50% {
|
||||
transform: translateY(-10px);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes rotate {
|
||||
0% {
|
||||
transform: rotate(0deg);
|
||||
}
|
||||
|
||||
100% {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes bounce {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
50% {
|
||||
transform: translateY(-4px);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from {
|
||||
opacity: 0;
|
||||
}
|
||||
|
||||
to {
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes slideUp {
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translateY(20px);
|
||||
}
|
||||
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes slideDown {
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translate(-50%, -20px);
|
||||
}
|
||||
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translate(-50%, 0);
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="error-toast" class="toast"></div>
|
||||
|
||||
<div class="header-container">
|
||||
<div class="logo">
|
||||
<img src="https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/Video%26Audio%20huggy.png"
|
||||
alt="LLaMA Logo">
|
||||
</div>
|
||||
<div class="header">
|
||||
<h1>Talk to LLaMA 4</h1>
|
||||
<h2>Experience seamless real-time conversation thanks to Cloudflare and Hugging Face's FastRTC.</h2>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="container">
|
||||
<div class="chat-container" id="chat-messages">
|
||||
<!-- Messages will appear here -->
|
||||
</div>
|
||||
|
||||
<div class="typing-indicator" id="typing-indicator">
|
||||
<div class="dots">
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
<div class="dot"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="wave-visualizer">
|
||||
<div class="box-container" id="box-container">
|
||||
<!-- Boxes will be dynamically added here -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="controls">
|
||||
<button id="start-button">Start Conversation</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<audio id="audio-output"></audio>
|
||||
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
const boxContainer = document.getElementById('box-container');
|
||||
const typingIndicator = document.getElementById('typing-indicator');
|
||||
const audioOutput = document.getElementById('audio-output');
|
||||
|
||||
let audioLevel = 0;
|
||||
let animationFrame_input, animationFrame_output;
|
||||
let audioContext_input, audioContext_output;
|
||||
let analyser_input, dataArray_input;
|
||||
let analyser_output, dataArray_output;
|
||||
let audioSource_input, audioSource_output;
|
||||
let messages = [];
|
||||
let eventSource;
|
||||
let isMuted = false;
|
||||
|
||||
// Create wave visualizer boxes
|
||||
const numBars = 32;
|
||||
for (let i = 0; i < numBars; i++) {
|
||||
const box = document.createElement('div');
|
||||
box.className = 'box';
|
||||
boxContainer.appendChild(box);
|
||||
}
|
||||
|
||||
// SVG Icons
|
||||
const micIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
const micMutedIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
<line x1="1" y1="1" x2="23" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
function updateButtonState() {
|
||||
const existingMuteButton = startButton.querySelector('.mute-toggle');
|
||||
if (existingMuteButton) {
|
||||
existingMuteButton.removeEventListener('click', toggleMute);
|
||||
}
|
||||
startButton.innerHTML = '';
|
||||
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
startButton.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
startButton.disabled = true;
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
const pulseContainer = document.createElement('div');
|
||||
pulseContainer.className = 'pulse-container';
|
||||
pulseContainer.innerHTML = `
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
`;
|
||||
|
||||
const muteToggle = document.createElement('div');
|
||||
muteToggle.className = 'mute-toggle';
|
||||
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
|
||||
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
|
||||
muteToggle.addEventListener('click', toggleMute);
|
||||
|
||||
startButton.appendChild(pulseContainer);
|
||||
startButton.appendChild(muteToggle);
|
||||
startButton.disabled = false;
|
||||
|
||||
} else {
|
||||
startButton.textContent = 'Start Conversation';
|
||||
startButton.disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
function toggleMute(event) {
|
||||
event.stopPropagation();
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
|
||||
|
||||
isMuted = !isMuted;
|
||||
console.log("Mute toggled:", isMuted);
|
||||
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.kind === 'audio') {
|
||||
sender.track.enabled = !isMuted;
|
||||
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
|
||||
}
|
||||
});
|
||||
|
||||
updateButtonState();
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
// Input audio context for pulse circle
|
||||
audioContext_input = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser_input = audioContext_input.createAnalyser();
|
||||
audioSource_input = audioContext_input.createMediaStreamSource(stream);
|
||||
audioSource_input.connect(analyser_input);
|
||||
analyser_input.fftSize = 64;
|
||||
dataArray_input = new Uint8Array(analyser_input.frequencyBinCount);
|
||||
|
||||
function updateAudioLevel() {
|
||||
// Update input audio visualization (pulse circle)
|
||||
analyser_input.getByteFrequencyData(dataArray_input);
|
||||
const average = Array.from(dataArray_input).reduce((a, b) => a + b, 0) / dataArray_input.length;
|
||||
audioLevel = average / 255;
|
||||
|
||||
const pulseCircle = document.querySelector('.pulse-circle');
|
||||
if (pulseCircle) {
|
||||
pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
|
||||
}
|
||||
|
||||
animationFrame_input = requestAnimationFrame(updateAudioLevel);
|
||||
}
|
||||
|
||||
updateAudioLevel();
|
||||
}
|
||||
|
||||
function setupOutputVisualization(stream) {
|
||||
// Create separate audio context for output visualization
|
||||
audioContext_output = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser_output = audioContext_output.createAnalyser();
|
||||
audioSource_output = audioContext_output.createMediaStreamSource(stream);
|
||||
audioSource_output.connect(analyser_output);
|
||||
analyser_output.fftSize = 2048;
|
||||
dataArray_output = new Uint8Array(analyser_output.frequencyBinCount);
|
||||
|
||||
function updateVisualization() {
|
||||
// Update output audio visualization (wave bars)
|
||||
analyser_output.getByteFrequencyData(dataArray_output);
|
||||
|
||||
const boxes = document.querySelectorAll('.box');
|
||||
for (let i = 0; i < boxes.length; i++) {
|
||||
const index = Math.floor(i * dataArray_output.length / boxes.length);
|
||||
const value = dataArray_output[index] / 255;
|
||||
boxes[i].style.transform = `scaleY(${Math.max(0.1, value * 1.5)})`;
|
||||
}
|
||||
|
||||
animationFrame_output = requestAnimationFrame(updateVisualization);
|
||||
}
|
||||
|
||||
updateVisualization();
|
||||
}
|
||||
|
||||
// Reset wave visualization bars to minimum height
|
||||
function resetVisualization() {
|
||||
const boxes = document.querySelectorAll('.box');
|
||||
boxes.forEach(box => box.style.transform = 'scaleY(0.1)');
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = message;
|
||||
toast.className = 'toast error';
|
||||
toast.style.display = 'block';
|
||||
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}
|
||||
|
||||
function handleMessage(event) {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
|
||||
if (eventJson.type === "error") {
|
||||
showError(eventJson.message);
|
||||
} else if (eventJson.type === "send_input") {
|
||||
fetch('/input_hook', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
webrtc_id: webrtc_id,
|
||||
chatbot: messages,
|
||||
state: messages
|
||||
})
|
||||
});
|
||||
} else if (eventJson.type === "log") {
|
||||
if (eventJson.data === "pause_detected") {
|
||||
typingIndicator.style.display = 'block';
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
} else if (eventJson.data === "response_starting") {
|
||||
typingIndicator.style.display = 'none';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function setupWebRTC() {
|
||||
const config = __RTC_CONFIGURATION__;
|
||||
peerConnection = new RTCPeerConnection(config);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
|
||||
toast.className = 'toast warning';
|
||||
toast.style.display = 'block';
|
||||
|
||||
setTimeout(() => {
|
||||
toast.style.display = 'none';
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: true
|
||||
});
|
||||
|
||||
setupAudioVisualization(stream);
|
||||
|
||||
stream.getTracks().forEach(track => {
|
||||
peerConnection.addTrack(track, stream);
|
||||
});
|
||||
|
||||
// Add this listener to handle incoming audio track
|
||||
peerConnection.addEventListener('track', (event) => {
|
||||
if (event.track.kind === 'audio') {
|
||||
console.log("Received audio track from server");
|
||||
|
||||
if (audioOutput) {
|
||||
audioOutput.srcObject = event.streams[0];
|
||||
audioOutput.play().catch(e => console.error("Audio play failed:", e));
|
||||
}
|
||||
|
||||
// Set up visualization for output audio with separate context
|
||||
setupOutputVisualization(event.streams[0]);
|
||||
}
|
||||
});
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = handleMessage;
|
||||
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
peerConnection.onicecandidate = ({ candidate }) => {
|
||||
if (candidate) {
|
||||
console.debug("Sending ICE candidate", candidate);
|
||||
fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
candidate: candidate.toJSON(),
|
||||
webrtc_id: webrtc_id,
|
||||
type: "ice-candidate",
|
||||
})
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
clearTimeout(timeoutId);
|
||||
const toast = document.getElementById('error-toast');
|
||||
toast.style.display = 'none';
|
||||
} else if (['closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
|
||||
stop();
|
||||
}
|
||||
updateButtonState();
|
||||
});
|
||||
|
||||
webrtc_id = Math.random().toString(36).substring(7);
|
||||
|
||||
const response = await fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
sdp: peerConnection.localDescription.sdp,
|
||||
type: peerConnection.localDescription.type,
|
||||
webrtc_id: webrtc_id
|
||||
})
|
||||
});
|
||||
|
||||
const serverResponse = await response.json();
|
||||
|
||||
if (serverResponse.status === 'failed') {
|
||||
showError(serverResponse.meta.error === 'concurrency_limit_reached'
|
||||
? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
|
||||
: serverResponse.meta.error);
|
||||
stop();
|
||||
return;
|
||||
}
|
||||
|
||||
await peerConnection.setRemoteDescription(serverResponse);
|
||||
|
||||
eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
console.log(eventJson);
|
||||
messages.push(eventJson.message);
|
||||
addMessage(eventJson.message.role, eventJson.audio ?? eventJson.message.content);
|
||||
})
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
console.error('Error setting up WebRTC:', err);
|
||||
showError('Failed to establish connection. Please try again.');
|
||||
stop();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessage(role, content) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.classList.add('message', role);
|
||||
messageDiv.textContent = content;
|
||||
chatMessages.appendChild(messageDiv);
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (eventSource) {
|
||||
eventSource.close();
|
||||
eventSource = null;
|
||||
}
|
||||
|
||||
if (animationFrame_input) {
|
||||
cancelAnimationFrame(animationFrame_input);
|
||||
animationFrame_input = null;
|
||||
}
|
||||
|
||||
if (animationFrame_output) {
|
||||
cancelAnimationFrame(animationFrame_output);
|
||||
animationFrame_output = null;
|
||||
}
|
||||
|
||||
if (audioContext_input) {
|
||||
audioContext_input.close().catch(e => console.error("Error closing input AudioContext:", e));
|
||||
audioContext_input = null;
|
||||
analyser_input = null;
|
||||
dataArray_input = null;
|
||||
audioSource_input = null;
|
||||
}
|
||||
|
||||
if (audioContext_output) {
|
||||
audioContext_output.close().catch(e => console.error("Error closing output AudioContext:", e));
|
||||
audioContext_output = null;
|
||||
analyser_output = null;
|
||||
dataArray_output = null;
|
||||
audioSource_output = null;
|
||||
}
|
||||
|
||||
if (audioOutput) {
|
||||
audioOutput.pause();
|
||||
audioOutput.srcObject = null;
|
||||
}
|
||||
|
||||
// Reset visualization
|
||||
resetVisualization();
|
||||
|
||||
if (peerConnection) {
|
||||
if (peerConnection.getTransceivers) {
|
||||
peerConnection.getTransceivers().forEach(transceiver => {
|
||||
if (transceiver.stop) {
|
||||
transceiver.stop();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
peerConnection.onicecandidate = null;
|
||||
peerConnection.ondatachannel = null;
|
||||
peerConnection.onconnectionstatechange = null;
|
||||
|
||||
peerConnection.close();
|
||||
peerConnection = null;
|
||||
}
|
||||
|
||||
isMuted = false;
|
||||
updateButtonState();
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', (event) => {
|
||||
if (event.target.closest('.mute-toggle')) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
console.log("Stop button clicked");
|
||||
stop();
|
||||
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
|
||||
console.log("Start button clicked");
|
||||
messages = [];
|
||||
chatMessages.innerHTML = '';
|
||||
setupWebRTC();
|
||||
updateButtonState();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
demo/talk_to_llama4/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
fastrtc[vad, tts]==0.0.20.rc2
|
||||
groq
|
||||
python-dotenv
|
||||
@@ -9,7 +9,7 @@ app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Talk to OpenAI using their multimodal API
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|OPENAI_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
@@ -9,7 +9,7 @@ app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Talk to OpenAI (Gradio UI)
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|OPENAI_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
@@ -17,7 +17,6 @@ from fastrtc import (
|
||||
wait_for_item,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@@ -50,12 +49,32 @@ class OpenAIHandler(AsyncStreamHandler):
|
||||
model="gpt-4o-mini-realtime-preview-2024-12-17"
|
||||
) as conn:
|
||||
await conn.session.update(
|
||||
session={"turn_detection": {"type": "server_vad"}}
|
||||
session={
|
||||
"turn_detection": {"type": "server_vad"},
|
||||
"input_audio_transcription": {
|
||||
"model": "whisper-1",
|
||||
"language": "en",
|
||||
},
|
||||
}
|
||||
)
|
||||
self.connection = conn
|
||||
async for event in self.connection:
|
||||
# Handle interruptions
|
||||
if event.type == "input_audio_buffer.speech_started":
|
||||
self.clear_queue()
|
||||
if (
|
||||
event.type
|
||||
== "conversation.item.input_audio_transcription.completed"
|
||||
):
|
||||
await self.output_queue.put(
|
||||
AdditionalOutputs({"role": "user", "content": event.transcript})
|
||||
)
|
||||
if event.type == "response.audio_transcript.done":
|
||||
await self.output_queue.put(AdditionalOutputs(event))
|
||||
await self.output_queue.put(
|
||||
AdditionalOutputs(
|
||||
{"role": "assistant", "content": event.transcript}
|
||||
)
|
||||
)
|
||||
if event.type == "response.audio.delta":
|
||||
await self.output_queue.put(
|
||||
(
|
||||
@@ -83,8 +102,8 @@ class OpenAIHandler(AsyncStreamHandler):
|
||||
self.connection = None
|
||||
|
||||
|
||||
def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
|
||||
chatbot.append({"role": "assistant", "content": response.transcript})
|
||||
def update_chatbot(chatbot: list[dict], response: dict):
|
||||
chatbot.append(response)
|
||||
return chatbot
|
||||
|
||||
|
||||
@@ -121,7 +140,7 @@ def _(webrtc_id: str):
|
||||
import json
|
||||
|
||||
async for output in stream.output_stream(webrtc_id):
|
||||
s = json.dumps({"role": "assistant", "content": output.args[0].transcript})
|
||||
s = json.dumps(output.args[0])
|
||||
yield f"event: output\ndata: {s}\n\n"
|
||||
|
||||
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
||||
|
||||
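With these changes the handler emits plain role/content dicts for both sides of the conversation, the Whisper transcription of the user's audio and the assistant's audio transcript, so update_chatbot and the /outputs route only have to forward them. A minimal sketch of that mapping (illustrative only; the event types are the ones checked in the handler above, the helper itself is hypothetical):

def event_to_message(event) -> dict | None:
    # User speech, transcribed server-side via "input_audio_transcription" (whisper-1).
    if event.type == "conversation.item.input_audio_transcription.completed":
        return {"role": "user", "content": event.transcript}
    # Assistant reply transcript, emitted when the audio transcript is complete.
    if event.type == "response.audio_transcript.done":
        return {"role": "assistant", "content": event.transcript}
    return None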
@@ -45,20 +45,26 @@
|
||||
|
||||
.message {
|
||||
margin-bottom: 20px;
|
||||
padding: 12px;
|
||||
border-radius: 4px;
|
||||
padding: 12px 16px;
|
||||
border-radius: 8px;
|
||||
font-size: 16px;
|
||||
line-height: 1.5;
|
||||
max-width: 70%;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
background-color: #1a1a1a;
|
||||
margin-left: 20%;
|
||||
background-color: #2c2c2c;
|
||||
float: right;
|
||||
border-bottom-right-radius: 2px;
|
||||
border: 1px solid #404040;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
background-color: #262626;
|
||||
margin-right: 20%;
|
||||
float: left;
|
||||
border-bottom-left-radius: 2px;
|
||||
border: 1px solid #333;
|
||||
}
|
||||
|
||||
.controls {
|
||||
@@ -67,16 +73,21 @@
|
||||
}
|
||||
|
||||
button {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 10px;
|
||||
padding: 12px 24px;
|
||||
background-color: transparent;
|
||||
color: #ffffff;
|
||||
border: 1px solid #ffffff;
|
||||
padding: 12px 24px;
|
||||
font-family: inherit;
|
||||
font-size: 16px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 1px;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
@@ -116,9 +127,7 @@
|
||||
.pulse-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
min-width: 180px;
|
||||
}
|
||||
|
||||
.pulse-circle {
|
||||
@@ -128,10 +137,47 @@
|
||||
background-color: #ffffff;
|
||||
opacity: 0.2;
|
||||
flex-shrink: 0;
|
||||
transform: translateX(-0%) scale(var(--audio-level, 1));
|
||||
transform: scale(var(--audio-level, 1));
|
||||
transition: transform 0.1s ease;
|
||||
}
|
||||
|
||||
/* Fix button layout */
|
||||
button {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 10px;
|
||||
padding: 12px 24px;
|
||||
background-color: transparent;
|
||||
color: #ffffff;
|
||||
border: 1px solid #ffffff;
|
||||
font-family: inherit;
|
||||
font-size: 16px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 1px;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.mute-toggle {
|
||||
width: 24px;
|
||||
height: 24px;
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.mute-toggle svg {
|
||||
display: block;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
#start-button {
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
}
|
||||
|
||||
/* Add styles for toast notifications */
|
||||
.toast {
|
||||
position: fixed;
|
||||
@@ -177,6 +223,7 @@
|
||||
<script>
|
||||
let peerConnection;
|
||||
let webrtc_id;
|
||||
let isMuted = false;
|
||||
const audioOutput = document.getElementById('audio-output');
|
||||
const startButton = document.getElementById('start-button');
|
||||
const chatMessages = document.getElementById('chat-messages');
|
||||
@@ -185,27 +232,82 @@
|
||||
let animationFrame;
|
||||
let audioContext, analyser, audioSource;
|
||||
|
||||
// SVG Icons
|
||||
const micIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
const micMutedIconSVG = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
<line x1="1" y1="1" x2="23" y2="23"></line>
|
||||
</svg>`;
|
||||
|
||||
function updateButtonState() {
|
||||
const button = document.getElementById('start-button');
|
||||
|
||||
// Clear previous content
|
||||
button.innerHTML = '';
|
||||
|
||||
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
||||
button.innerHTML = `
|
||||
<div class="icon-with-spinner">
|
||||
<div class="spinner"></div>
|
||||
<span>Connecting...</span>
|
||||
</div>
|
||||
`;
|
||||
const spinner = document.createElement('div');
|
||||
spinner.className = 'spinner';
|
||||
|
||||
const text = document.createElement('span');
|
||||
text.textContent = 'Connecting...';
|
||||
|
||||
button.appendChild(spinner);
|
||||
button.appendChild(text);
|
||||
} else if (peerConnection && peerConnection.connectionState === 'connected') {
|
||||
button.innerHTML = `
|
||||
<div class="pulse-container">
|
||||
<div class="pulse-circle"></div>
|
||||
<span>Stop Conversation</span>
|
||||
</div>
|
||||
`;
|
||||
// Create pulse circle
|
||||
const pulseCircle = document.createElement('div');
|
||||
pulseCircle.className = 'pulse-circle';
|
||||
|
||||
// Create mic icon
|
||||
const micIcon = document.createElement('div');
|
||||
micIcon.className = 'mute-toggle';
|
||||
micIcon.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
|
||||
micIcon.addEventListener('click', toggleMute);
|
||||
|
||||
// Create text
|
||||
const text = document.createElement('span');
|
||||
text.textContent = 'Stop Conversation';
|
||||
|
||||
// Add elements in correct order
|
||||
button.appendChild(pulseCircle);
|
||||
button.appendChild(micIcon);
|
||||
button.appendChild(text);
|
||||
} else {
|
||||
button.innerHTML = 'Start Conversation';
|
||||
const text = document.createElement('span');
|
||||
text.textContent = 'Start Conversation';
|
||||
button.appendChild(text);
|
||||
}
|
||||
}
|
||||
|
||||
function toggleMute(event) {
|
||||
event.stopPropagation();
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') return;
|
||||
|
||||
isMuted = !isMuted;
|
||||
console.log("Mute toggled:", isMuted);
|
||||
|
||||
peerConnection.getSenders().forEach(sender => {
|
||||
if (sender.track && sender.track.kind === 'audio') {
|
||||
sender.track.enabled = !isMuted;
|
||||
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
|
||||
}
|
||||
});
|
||||
|
||||
updateButtonState();
|
||||
}
|
||||
|
||||
function setupAudioVisualization(stream) {
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
analyser = audioContext.createAnalyser();
|
||||
@@ -276,6 +378,21 @@
|
||||
}
|
||||
});
|
||||
|
||||
peerConnection.onicecandidate = ({ candidate }) => {
|
||||
if (candidate) {
|
||||
console.debug("Sending ICE candidate", candidate);
|
||||
fetch('/webrtc/offer', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
candidate: candidate.toJSON(),
|
||||
webrtc_id: webrtc_id,
|
||||
type: "ice-candidate",
|
||||
})
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
const dataChannel = peerConnection.createDataChannel('text');
|
||||
dataChannel.onmessage = (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
@@ -287,20 +404,6 @@
|
||||
const offer = await peerConnection.createOffer();
|
||||
await peerConnection.setLocalDescription(offer);
|
||||
|
||||
await new Promise((resolve) => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
resolve();
|
||||
} else {
|
||||
const checkState = () => {
|
||||
if (peerConnection.iceGatheringState === "complete") {
|
||||
peerConnection.removeEventListener("icegatheringstatechange", checkState);
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
peerConnection.addEventListener("icegatheringstatechange", checkState);
|
||||
}
|
||||
});
|
||||
|
||||
peerConnection.addEventListener('connectionstatechange', () => {
|
||||
console.log('connectionstatechange', peerConnection.connectionState);
|
||||
if (peerConnection.connectionState === 'connected') {
|
||||
@@ -338,7 +441,7 @@
|
||||
const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
|
||||
eventSource.addEventListener("output", (event) => {
|
||||
const eventJson = JSON.parse(event.data);
|
||||
addMessage("assistant", eventJson.content);
|
||||
addMessage(eventJson.role, eventJson.content);
|
||||
|
||||
});
|
||||
} catch (err) {
|
||||
@@ -388,7 +491,12 @@
|
||||
audioLevel = 0;
|
||||
}
|
||||
|
||||
startButton.addEventListener('click', () => {
|
||||
startButton.addEventListener('click', (event) => {
|
||||
// Skip if clicking the mute toggle
|
||||
if (event.target.closest('.mute-toggle')) {
|
||||
return;
|
||||
}
|
||||
|
||||
console.log('clicked');
|
||||
console.log(peerConnection, peerConnection?.connectionState);
|
||||
if (!peerConnection || peerConnection.connectionState !== 'connected') {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
fastrtc[vad]
|
||||
fastrtc[vad]==0.0.20.rc2
|
||||
openai
|
||||
twilio
|
||||
python-dotenv
|
||||
@@ -9,7 +9,7 @@ app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Llama 3.2 - SambaNova API
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN_ALT, secret|SAMBANOVA_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
@@ -9,7 +9,7 @@ app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Llama 3.2 - SambaNova API (Gradio)
|
||||
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|SAMBANOVA_API_KEY]
|
||||
tags: [webrtc, websocket, gradio, secret|HF_TOKEN_ALT, secret|SAMBANOVA_API_KEY]
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
@@ -13,8 +13,9 @@ from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_cloudflare_turn_credentials,
|
||||
get_cloudflare_turn_credentials_async,
|
||||
get_stt_model,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from pydantic import BaseModel
|
||||
@@ -75,7 +76,8 @@ stream = Stream(
|
||||
additional_outputs=[chatbot, state],
|
||||
additional_outputs_handler=lambda *a: (a[2], a[3]),
|
||||
concurrency_limit=20 if get_space() else None,
|
||||
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
||||
rtc_configuration=get_cloudflare_turn_credentials_async,
|
||||
server_rtc_configuration=get_cloudflare_turn_credentials(ttl=36_000),
|
||||
)
|
||||
|
||||
app = FastAPI()
|
||||
@@ -95,7 +97,9 @@ class InputData(BaseModel):
|
||||
|
||||
@app.get("/")
|
||||
async def _():
|
||||
rtc_config = get_twilio_turn_credentials() if get_space() else None
|
||||
rtc_config = await get_cloudflare_turn_credentials_async(
|
||||
hf_token=os.getenv("HF_TOKEN_ALT")
|
||||
)
|
||||
html_content = (curr_dir / "index.html").read_text()
|
||||
html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
|
||||
return HTMLResponse(content=html_content)
|
||||
|
||||
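The SambaNova demo now sources TURN credentials from Cloudflare instead of Twilio: the Stream gets the async helper as a callable (so each browser connection receives fresh short-lived credentials) plus a long-lived server-side configuration, while the "/" route resolves credentials eagerly with an explicit token. A condensed sketch of the pattern (function names and keyword arguments as used in the hunk above; HF_TOKEN_ALT is specific to this Space):

import os

from fastrtc import (
    get_cloudflare_turn_credentials,
    get_cloudflare_turn_credentials_async,
)

# Passed as a callable: fastrtc invokes it for every new peer connection.
client_rtc_configuration = get_cloudflare_turn_credentials_async

# Fetched once for the server side, with a 10-hour TTL.
server_rtc_configuration = get_cloudflare_turn_credentials(ttl=36_000)

# Resolved eagerly when serving the custom index.html:
# rtc_config = await get_cloudflare_turn_credentials_async(hf_token=os.getenv("HF_TOKEN_ALT"))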
@@ -72,13 +72,17 @@
background-color: #0066cc;
color: white;
border: none;
padding: 12px 24px;
padding: 12px 18px;
font-family: inherit;
font-size: 14px;
cursor: pointer;
transition: all 0.3s;
border-radius: 4px;
font-weight: 500;
display: inline-flex;
align-items: center;
justify-content: center;
gap: 8px;
}

button:hover {
@@ -94,7 +98,6 @@
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}

.spinner {
@@ -118,7 +121,6 @@
align-items: center;
justify-content: center;
gap: 12px;
min-width: 180px;
}

.pulse-circle {
@@ -200,6 +202,23 @@
background-color: #ffd700;
color: black;
}

/* Styles for the mute toggle icon */
.mute-toggle {
width: 20px;
height: 20px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
flex-shrink: 0;
}

.mute-toggle svg {
width: 100%;
height: 100%;
stroke: white;
}
</style>
</head>

@@ -239,28 +258,82 @@
let audioContext, analyser, audioSource;
let messages = [];
let eventSource;
let isMuted = false;

// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;

const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;

function updateButtonState() {
const button = document.getElementById('start-button');
const existingMuteButton = startButton.querySelector('.mute-toggle');
if (existingMuteButton) {
existingMuteButton.removeEventListener('click', toggleMute);
}
startButton.innerHTML = '';

if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
button.innerHTML = `
startButton.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
button.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
</div>
const pulseContainer = document.createElement('div');
pulseContainer.className = 'pulse-container';
pulseContainer.innerHTML = `
<div class="pulse-circle"></div>
<span>Stop Conversation</span>
`;

const muteToggle = document.createElement('div');
muteToggle.className = 'mute-toggle';
muteToggle.title = isMuted ? 'Unmute' : 'Mute';
muteToggle.innerHTML = isMuted ? micMutedIconSVG : micIconSVG;
muteToggle.addEventListener('click', toggleMute);

startButton.appendChild(pulseContainer);
startButton.appendChild(muteToggle);
startButton.disabled = false;

} else {
button.innerHTML = 'Start Conversation';
startButton.textContent = 'Start Conversation';
startButton.disabled = false;
}
}

function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;

isMuted = !isMuted;
console.log("Mute toggled:", isMuted);

peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});

updateButtonState();
}

function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
@@ -378,6 +451,8 @@
clearTimeout(timeoutId);
const toast = document.getElementById('error-toast');
toast.style.display = 'none';
} else if (['closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
stop();
}
updateButtonState();
});
@@ -448,9 +523,10 @@

if (animationFrame) {
cancelAnimationFrame(animationFrame);
animationFrame = null;
}
if (audioContext) {
audioContext.close();
audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
audioContext = null;
analyser = null;
audioSource = null;
@@ -464,22 +540,33 @@
});
}

if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.stop) sender.track.stop();
});
}
peerConnection.onicecandidate = null;
peerConnection.ondatachannel = null;
peerConnection.onconnectionstatechange = null;

peerConnection.close();
peerConnection = null;
console.log("Peer connection closed.");
}
isMuted = false;
updateButtonState();
audioLevel = 0;
}

startButton.addEventListener('click', () => {
if (!peerConnection || peerConnection.connectionState !== 'connected') {
setupWebRTC();
} else {
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}

if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stop();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
messages = [];
chatMessages.innerHTML = '';
setupWebRTC();
updateButtonState();
}
});
</script>

@@ -1,4 +1,4 @@
fastrtc[vad, stt]
fastrtc[vad, stt]==0.0.20.rc2
python-dotenv
huggingface_hub>=0.29.0
twilio
@@ -1,5 +1,4 @@
from pathlib import Path
from typing import Dict, List

from dotenv import load_dotenv
from fastrtc import (
@@ -22,11 +21,11 @@ stt_model = get_stt_model()
tts_model = get_tts_model()

# Conversation state to maintain history
conversation_state: List[Dict[str, str]] = []
conversation_state: list[dict[str, str]] = []

# System prompt for agent
system_prompt = """You are a helpful assistant that can helps with finding places to
workremotely from. You should specifically check against reviews and ratings of the
work remotely from. You should specifically check against reviews and ratings of the
place. You should use this criteria to find the best place to work from:
- Price
- Reviews
@@ -78,9 +77,7 @@ def process_response(audio):
response_content = agent.run(input_text)

# Convert response to audio using TTS model
for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
# Yield the audio chunk
yield audio_chunk
yield from tts_model.stream_tts_sync(response_content or "")


stream = Stream(

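For context, a hedged sketch (not part of the commit) of the pause-driven voice pipeline the hunks above rely on; `transcribe` and `generate_reply` are hypothetical stand-ins for the speech-to-text call and the agent call, which are not shown in this excerpt.

```python
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model

stt_model = get_stt_model()
tts_model = get_tts_model()


def process_response(audio):
    # Transcribe the completed user turn (assumed helper), ask the agent for a
    # reply, then stream synthesized audio chunks straight back to the caller.
    input_text = transcribe(stt_model, audio)      # hypothetical wrapper
    response_content = generate_reply(input_text)  # hypothetical agent call
    yield from tts_model.stream_tts_sync(response_content or "")


stream = Stream(handler=ReplyOnPause(process_response), modality="audio", mode="send-receive")
```
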
@@ -76,14 +76,14 @@ def response(
)
for chunk in aggregate_bytes_to_16bit(iterator):
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
yield (24000, audio_array, "mono")
yield (24000, audio_array)


chatbot = gr.Chatbot(type="messages")
stream = Stream(
modality="audio",
mode="send-receive",
handler=ReplyOnPause(response),
handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
additional_outputs_handler=lambda a, b: b,
additional_inputs=[chatbot],
additional_outputs=[chatbot],

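A short sketch (not part of the commit) of the output format implied above: with explicit sample rates on `ReplyOnPause`, the handler yields `(sample_rate, int16_array)` tuples without a channel-layout string. The `aggregate_bytes_to_16bit` import path and the `pcm_byte_chunks()` byte source are assumptions for illustration.

```python
import numpy as np
from fastrtc import ReplyOnPause, Stream, aggregate_bytes_to_16bit  # import path assumed


def response(audio):
    iterator = pcm_byte_chunks()  # hypothetical stream of raw PCM bytes, e.g. from a TTS API
    for chunk in aggregate_bytes_to_16bit(iterator):
        # Each chunk becomes a (1, n) int16 frame played back at 24 kHz.
        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
        yield (24000, audio_array)


stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
)
```
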
@@ -390,35 +390,8 @@
rttValues: []
};

// Load mu-law library

// Add load promise to track when the script is ready


function resample(audioData, fromSampleRate, toSampleRate) {
const ratio = fromSampleRate / toSampleRate;
const newLength = Math.round(audioData.length / ratio);
const result = new Float32Array(newLength);

for (let i = 0; i < newLength; i++) {
const position = i * ratio;
const index = Math.floor(position);
const fraction = position - index;

if (index + 1 < audioData.length) {
result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
} else {
result[i] = audioData[index];
}
}
return result;
}

function convertToMulaw(audioData, sampleRate) {
// Resample to 8000 Hz if needed
if (sampleRate !== 8000) {
audioData = resample(audioData, sampleRate, 8000);
}

// Convert float32 [-1,1] to int16 [-32768,32767]
const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
wsMetrics.startTime = performance.now();

// Create audio context and analyser for visualization
const audioContext = new AudioContext();
const audioContext = new AudioContext({ sampleRate: 24000 });
const analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);

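The same linear-interpolation resampler as the JavaScript helper above, sketched in NumPy for reference (not part of the commit):

```python
import numpy as np


def resample(audio_data: np.ndarray, from_rate: int, to_rate: int) -> np.ndarray:
    # Mirror of the JS resample(): sample the signal at fractional positions
    # spaced by from_rate / to_rate and interpolate between neighbouring samples.
    ratio = from_rate / to_rate
    new_length = round(len(audio_data) / ratio)
    positions = np.arange(new_length) * ratio
    indices = np.floor(positions).astype(int)
    fractions = positions - indices
    next_indices = np.minimum(indices + 1, len(audio_data) - 1)
    return audio_data[indices] * (1 - fractions) + audio_data[next_indices] * fractions
```
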
@@ -9,7 +9,7 @@ app_file: app.py
pinned: false
license: mit
short_description: Transcribe audio in realtime with Whisper
tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY]
tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
@@ -12,8 +12,7 @@ tags:
- webrtc
- websocket
- gradio
- secret|TWILIO_ACCOUNT_SID
- secret|TWILIO_AUTH_TOKEN
- secret|HF_TOKEN
- secret|GROQ_API_KEY
title: Whisper Realtime Transcription (Gradio UI)
---

@@ -9,14 +9,21 @@
:root {
--primary-gradient: linear-gradient(135deg, #f9a45c 0%, #e66465 100%);
--background-cream: #faf8f5;
--background-cream-end: #f7f5f2;
/* Slightly warmer end color for body gradient */
--text-dark: #2d2d2d;
--transcript-bg: #ffffff;
/* White background for transcript area */
--transcript-border: #e0e0e0;
/* Light border for transcript items */
}

body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
margin: 0;
padding: 0;
background-color: var(--background-cream);
/* Apply a subtle vertical gradient to the body */
background: linear-gradient(to bottom, var(--background-cream), var(--background-cream-end));
color: var(--text-dark);
min-height: 100vh;
}
@@ -43,18 +50,26 @@

.container {
max-width: 1000px;
margin: 1.5rem auto;
margin: 2.5rem auto;
/* Increased top/bottom margin */
padding: 0 2rem;
}

.transcript-container {
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.06);
border-radius: 12px;
/* Slightly larger radius */
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08);
/* Enhanced shadow */
padding: 1.5rem;
height: 300px;
height: 350px;
/* Increased height */
overflow-y: auto;
margin-bottom: 1.5rem;
border: 1px solid rgba(0, 0, 0, 0.1);
margin-bottom: 2rem;
/* Increased margin */
border: 1px solid rgba(0, 0, 0, 0.05);
/* Softer border */
background-color: var(--transcript-bg);
/* Use the new variable */
}

.controls {
@@ -73,6 +88,8 @@
transition: all 0.2s ease;
font-weight: 500;
min-width: 180px;
position: relative;
padding-right: 50px;
}

button:hover {
@@ -86,22 +103,39 @@

/* Transcript text styling */
.transcript-container p {
margin: 0.4rem 0;
padding: 0.6rem;
margin: 0.6rem 0;
/* Increased vertical margin */
padding: 0.8rem 1rem;
/* Increased padding */
background: var(--background-cream);
border-radius: 4px;
line-height: 1.4;
font-size: 0.95rem;
/* Use the lighter cream for contrast */
border-radius: 6px;
/* Slightly larger radius */
line-height: 1.5;
/* Improved line spacing */
font-size: 0.98rem;
/* Slightly larger font */
border-left: 3px solid var(--transcript-border);
/* Add a subtle left border */
transition: background-color 0.2s ease;
/* Smooth hover effect */
}

/* Custom scrollbar - made thinner */
.transcript-container p:hover {
background-color: #fdfbf9;
/* Slightly change background on hover */
}

/* Custom scrollbar - update track color */
.transcript-container::-webkit-scrollbar {
width: 6px;
width: 8px;
/* Slightly wider scrollbar */
}

.transcript-container::-webkit-scrollbar-track {
background: var(--background-cream);
border-radius: 3px;
background: var(--background-cream-end);
/* Match body end gradient */
border-radius: 4px;
}

.transcript-container::-webkit-scrollbar-thumb {
@@ -176,6 +210,40 @@
transition: transform 0.1s ease;
}

/* Styles for the mute button */
.mute-toggle {
position: absolute;
right: 10px;
top: 50%;
transform: translateY(-50%);
width: 24px;
height: 24px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
}

.mute-toggle svg {
width: 20px;
height: 20px;
stroke: white;
}

/* Adjust layout for button content when mute is present */
.button-content {
display: flex;
align-items: center;
justify-content: center;
width: calc(100% - 40px);
margin-right: 40px;
}

.icon-with-spinner,
.pulse-container {
width: 100%;
}

@keyframes spin {
to {
transform: rotate(360deg);
@@ -206,10 +274,29 @@
let audioContext, analyser, audioSource;
let audioLevel = 0;
let animationFrame;
let isMuted = false;

const startButton = document.getElementById('start-button');
const transcriptDiv = document.getElementById('transcript');

// SVG Icons
const micIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>`;

const micMutedIconSVG = `
<svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
<line x1="1" y1="1" x2="23" y2="23"></line>
</svg>`;

function showError(message) {
const toast = document.getElementById('error-toast');
toast.textContent = message;
@@ -241,25 +328,63 @@
}

function updateButtonState() {
// Remove existing mute listener if present
const existingMuteButton = startButton.querySelector('.mute-toggle');
if (existingMuteButton) {
existingMuteButton.removeEventListener('click', toggleMute);
existingMuteButton.remove();
}

if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
startButton.innerHTML = `
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
<div class="button-content">
<div class="icon-with-spinner">
<div class="spinner"></div>
<span>Connecting...</span>
</div>
</div>
`;
startButton.disabled = true;
} else if (peerConnection && peerConnection.connectionState === 'connected') {
startButton.innerHTML = `
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Recording</span>
<div class="button-content">
<div class="pulse-container">
<div class="pulse-circle"></div>
<span>Stop Recording</span>
</div>
</div>
<div class="mute-toggle" title="${isMuted ? 'Unmute' : 'Mute'}">
${isMuted ? micMutedIconSVG : micIconSVG}
</div>
`;
startButton.disabled = false;
const muteButton = startButton.querySelector('.mute-toggle');
if (muteButton) {
muteButton.addEventListener('click', toggleMute);
}
} else {
startButton.innerHTML = 'Start Recording';
startButton.disabled = false;
}
}

function toggleMute(event) {
event.stopPropagation();
if (!peerConnection || peerConnection.connectionState !== 'connected') return;

isMuted = !isMuted;
console.log("Mute toggled:", isMuted);

peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.kind === 'audio') {
sender.track.enabled = !isMuted;
console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
}
});

updateButtonState();
}

function setupAudioVisualization(stream) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
@@ -321,6 +446,21 @@
updateButtonState();
});

peerConnection.onicecandidate = ({ candidate }) => {
if (candidate) {
console.debug("Sending ICE candidate", candidate);
fetch('/webrtc/offer', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
candidate: candidate.toJSON(),
webrtc_id: webrtc_id,
type: "ice-candidate",
})
})
}
};

// Create data channel for messages
const dataChannel = peerConnection.createDataChannel('text');
dataChannel.onmessage = handleMessage;
@@ -329,20 +469,6 @@
const offer = await peerConnection.createOffer();
await peerConnection.setLocalDescription(offer);

await new Promise((resolve) => {
if (peerConnection.iceGatheringState === "complete") {
resolve();
} else {
const checkState = () => {
if (peerConnection.iceGatheringState === "complete") {
peerConnection.removeEventListener("icegatheringstatechange", checkState);
resolve();
}
};
peerConnection.addEventListener("icegatheringstatechange", checkState);
}
});

webrtc_id = Math.random().toString(36).substring(7);

const response = await fetch('/webrtc/offer', {
@@ -392,41 +518,45 @@
function stop() {
if (animationFrame) {
cancelAnimationFrame(animationFrame);
animationFrame = null;
}
if (audioContext) {
audioContext.close();
audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
audioContext = null;
analyser = null;
audioSource = null;
}
if (peerConnection) {
if (peerConnection.getTransceivers) {
peerConnection.getTransceivers().forEach(transceiver => {
if (transceiver.stop) {
transceiver.stop();
if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track) {
sender.track.stop();
console.log(`Track ${sender.track.id} stopped.`);
}
});
}

if (peerConnection.getSenders) {
peerConnection.getSenders().forEach(sender => {
if (sender.track && sender.track.stop) sender.track.stop();
});
}

setTimeout(() => {
peerConnection.close();
}, 500);
peerConnection.close();
peerConnection = null;
console.log("Peer connection closed.");
}
audioLevel = 0;
isMuted = false;
updateButtonState();
}

startButton.addEventListener('click', () => {
if (startButton.textContent === 'Start Recording') {
setupWebRTC();
} else {
startButton.addEventListener('click', (event) => {
if (event.target.closest('.mute-toggle')) {
return;
}

if (peerConnection && peerConnection.connectionState === 'connected') {
console.log("Stop button clicked");
stop();
} else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
console.log("Start button clicked");
transcriptDiv.innerHTML = '';
setupWebRTC();
updateButtonState();
}
});
</script>

@@ -1,4 +1,3 @@
fastrtc[vad]
fastrtc[vad]==0.0.20.rc2
groq
python-dotenv
twilio
python-dotenv