diff --git a/demo/talk_to_azure_openai/README.md b/demo/talk_to_azure_openai/README.md
new file mode 100644
index 0000000..0fa2a09
--- /dev/null
+++ b/demo/talk_to_azure_openai/README.md
@@ -0,0 +1,15 @@
+---
+title: Talk to Azure OpenAI
+emoji: 🗣️
+colorFrom: purple
+colorTo: red
+sdk: gradio
+sdk_version: 5.16.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: Talk to Azure OpenAI using their multimodal API
+tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
diff --git a/demo/talk_to_azure_openai/README_gradio.md b/demo/talk_to_azure_openai/README_gradio.md
new file mode 100644
index 0000000..8353f3a
--- /dev/null
+++ b/demo/talk_to_azure_openai/README_gradio.md
@@ -0,0 +1,15 @@
+---
+title: Talk to Azure OpenAI (Gradio UI)
+emoji: 🗣️
+colorFrom: purple
+colorTo: red
+sdk: gradio
+sdk_version: 5.16.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: Talk to Azure OpenAI (Gradio UI)
+tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
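Both READMEs declare `TWILIO_ACCOUNT_SID`, `TWILIO_AUTH_TOKEN`, and `OPENAI_API_KEY` as Space secrets, while `start_up()` in app.py below hardcodes placeholder Azure credentials. A minimal sketch of feeding those values through the environment instead — the `AZURE_OPENAI_*` names are illustrative only and are not defined by the demo:

```python
import os

# Hypothetical variable names: only OPENAI_API_KEY and the TWILIO_* secrets
# are actually declared in the Space READMEs above.
azure_api_key = os.getenv("OPENAI_API_KEY", "your-api-key")
azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE", "your-resource-name")
deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT", "your-deployment-id")
```

The Twilio secrets are consumed by fastrtc's `get_twilio_turn_credentials()`, which app.py calls when running on Spaces to obtain TURN credentials for WebRTC.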
diff --git a/demo/talk_to_azure_openai/app.py b/demo/talk_to_azure_openai/app.py
new file mode 100644
index 0000000..59b3c8f
--- /dev/null
+++ b/demo/talk_to_azure_openai/app.py
@@ -0,0 +1,233 @@
+import asyncio
+import base64
+import json
+from pathlib import Path
+
+import sounddevice as sd
+
+import gradio as gr
+import numpy as np
+import aiohttp  # pip install aiohttp
+from dotenv import load_dotenv
+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse, StreamingResponse
+from fastrtc import (
+    AdditionalOutputs,
+    AsyncStreamHandler,
+    Stream,
+    get_twilio_turn_credentials,
+    wait_for_item,
+)
+from gradio.utils import get_space
+
+load_dotenv()
+cur_dir = Path(__file__).parent
+load_dotenv("key.env")
+# sd.default.device = (3, 3)  # (input device, output device)
+# print(f"Used Mic: {sd.query_devices(3)['name']}")
+# print(f"Used Speaker: {sd.query_devices(3)['name']}")
+SAMPLE_RATE = 24000
+
+instruction = """
+
+You are a helpful assistant.
+"""
+
+
+class AzureAudioHandler(AsyncStreamHandler):
+    def __init__(self) -> None:
+        super().__init__(
+            expected_layout="mono",
+            output_sample_rate=SAMPLE_RATE,
+            output_frame_size=480,
+            input_sample_rate=SAMPLE_RATE,
+        )
+        self.ws = None
+        self.session = None
+        self.output_queue = asyncio.Queue()
+        # No internal buffer is used in receive_messages; instead, multiple
+        # audio chunks are collected in the emit() method. If needed, a
+        # continuous buffer could also be implemented here:
+        # self.audio_buffer = bytearray()
+
+    def copy(self):
+        return AzureAudioHandler()
+
+    async def start_up(self):
+        """Connects to the Azure real-time audio API via WebSocket using aiohttp."""
+        # Replace the following placeholders with your actual Azure values:
+        azure_api_key = "your-api-key"
+        azure_resource_name = "your-resource-name"  # e.g., "aigdopenai"
+        deployment_id = "your-deployment-id"  # e.g., "gpt-4o-realtime-preview"
+        api_version = "2024-10-01-preview"
+        azure_endpoint = (
+            f"wss://{azure_resource_name}.openai.azure.com/openai/realtime"
+            f"?api-version={api_version}&deployment={deployment_id}"
+        )
+        headers = {"api-key": azure_api_key}
+
+        self.session = aiohttp.ClientSession()
+        self.ws = await self.session.ws_connect(azure_endpoint, headers=headers)
+        # Send initial session parameters
+        session_update_message = {
+            "type": "session.update",
+            "session": {
+                "turn_detection": {"type": "server_vad"},
+                "instructions": instruction,
+                # For possible voices, see
+                # https://platform.openai.com/docs/guides/realtime-model-capabilities#voice-options
+                "voice": "ballad",
+            },
+        }
+        await self.ws.send_str(json.dumps(session_update_message))
+        # Start receiving messages asynchronously
+        asyncio.create_task(self.receive_messages())
+
+    async def receive_messages(self):
+        """Handles incoming WebSocket messages and processes them accordingly."""
+        async for msg in self.ws:
+            if msg.type == aiohttp.WSMsgType.TEXT:
+                print("Received event:", msg.data)  # Debug output
+                event = json.loads(msg.data)
+                event_type = event.get("type")
+                if event_type in ["final", "response.audio_transcript.done"]:
+                    transcript = event.get("transcript", "")
+
+                    # Wrap the transcript in an object with a .transcript attribute
+                    class TranscriptEvent:
+                        pass
+
+                    te = TranscriptEvent()
+                    te.transcript = transcript
+                    await self.output_queue.put(AdditionalOutputs(te))
+                elif event_type == "partial":
+                    print("Partial transcript:", event.get("transcript", ""))
+                elif event_type == "response.audio.delta":
+                    audio_message = event.get("delta")
+                    if audio_message:
+                        try:
+                            audio_bytes = base64.b64decode(audio_message)
+                            # Assuming 16-bit PCM (int16)
+                            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+                            # Interpret as mono audio:
+                            audio_array = audio_array.reshape(1, -1)
+                            # Instead of playing the audio, add the chunk to the output queue
+                            await self.output_queue.put(
+                                (self.output_sample_rate, audio_array)
+                            )
+                        except Exception as e:
+                            print("Error processing audio data:", e)
+                else:
+                    print("Unknown event:", event)
+            elif msg.type == aiohttp.WSMsgType.ERROR:
+                break
+
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        """Sends received audio frames to the WebSocket."""
+        if not self.ws or self.ws.closed:
+            return
+        try:
+            _, array = frame
+            array = array.squeeze()
+            audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+            message = {"type": "input_audio_buffer.append", "audio": audio_message}
+            await self.ws.send_str(json.dumps(message))
+        except aiohttp.ClientConnectionError as e:
+            print("Connection closed while sending:", e)
+            return
+
+    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        """
+        Collects multiple audio chunks from the queue before returning them as a
+        single contiguous audio array. This helps smooth playback.
+        """
+        item = await wait_for_item(self.output_queue)
+        # If it's a transcript event, return it immediately.
+        if not isinstance(item, tuple):
+            return item
+        # Otherwise, it is an audio chunk (sample_rate, audio_array)
+        sample_rate, first_chunk = item
+        audio_chunks = [first_chunk]
+        # Define a minimum length (e.g., 0.1 seconds)
+        min_samples = int(SAMPLE_RATE * 0.1)  # 0.1 sec
+        # Collect more audio chunks until we have enough samples
+        total_samples = first_chunk.shape[1]
+        while total_samples < min_samples:
+            try:
+                extra = self.output_queue.get_nowait()
+                if isinstance(extra, tuple):
+                    _, chunk = extra
+                    audio_chunks.append(chunk)
+                    total_samples += chunk.shape[1]
+                else:
+                    # If it's not an audio chunk, put it back
+                    await self.output_queue.put(extra)
+                    break
+            except asyncio.QueueEmpty:
+                break
+        # Concatenate the collected chunks along the time axis (axis=1)
+        full_audio = np.concatenate(audio_chunks, axis=1)
+        return (sample_rate, full_audio)
+
+    async def shutdown(self) -> None:
+        """Closes the WebSocket and session properly."""
+        if self.ws:
+            await self.ws.close()
+            self.ws = None
+        if self.session:
+            await self.session.close()
+            self.session = None
+
+
+def update_chatbot(chatbot: list[dict], response) -> list[dict]:
+    """Appends the AI assistant's transcript response to the chatbot messages."""
+    chatbot.append({"role": "assistant", "content": response.transcript})
+    return chatbot
+
+
+chatbot = gr.Chatbot(type="messages")
+latest_message = gr.Textbox(type="text", visible=False)
+stream = Stream(
+    AzureAudioHandler(),
+    mode="send-receive",
+    modality="audio",
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    additional_outputs_handler=update_chatbot,
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+)
+
+app = FastAPI()
+stream.mount(app)
+
+
+@app.get("/")
+async def _():
+    rtc_config = get_twilio_turn_credentials() if get_space() else None
+    html_content = (cur_dir / "index.html").read_text()
+    html_content = html_content.replace(
+        "__RTC_CONFIGURATION__", json.dumps(rtc_config)
+    )
+    return HTMLResponse(content=html_content)
+
+
+@app.get("/outputs")
+def _(webrtc_id: str):
+    async def output_stream():
+        async for output in stream.output_stream(webrtc_id):
+            s = json.dumps(
+                {"role": "assistant", "content": output.args[0].transcript}
+            )
+            yield f"event: output\ndata: {s}\n\n"
+
+    return StreamingResponse(output_stream(), media_type="text/event-stream")
+
+
+if __name__ == "__main__":
+    import os
+
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        import uvicorn
+
+        uvicorn.run(app, host="0.0.0.0", port=7860)
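The audio path in `AzureAudioHandler` can be sanity-checked without a live Azure connection. The sketch below mimics what `receive_messages()` does with a `response.audio.delta` payload (base64 → int16 → a `(1, n_samples)` array) and the 0.1-second threshold `emit()` buffers to before concatenating chunks along the time axis; the payload here is fabricated silence, not real model output.

```python
import base64

import numpy as np

SAMPLE_RATE = 24000

# Fake "response.audio.delta" payload: 0.05 s of 16-bit PCM silence.
delta = base64.b64encode(np.zeros(1200, dtype=np.int16).tobytes()).decode("utf-8")

# Decode the same way receive_messages() does: base64 -> int16 -> (1, n_samples).
chunk = np.frombuffer(base64.b64decode(delta), dtype=np.int16).reshape(1, -1)

# emit() keeps pulling chunks until at least 0.1 s of audio is buffered.
min_samples = int(SAMPLE_RATE * 0.1)          # 2400 samples at 24 kHz
chunks = [chunk, chunk, chunk]                # three 0.05 s chunks = 0.15 s
full_audio = np.concatenate(chunks, axis=1)   # join along the time axis
print(full_audio.shape, full_audio.shape[1] >= min_samples)  # (1, 3600) True
```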
diff --git a/demo/talk_to_azure_openai/index.html b/demo/talk_to_azure_openai/index.html
new file mode 100644
index 0000000..936065e
--- /dev/null
+++ b/demo/talk_to_azure_openai/index.html
@@ -0,0 +1,356 @@
+    <title>Azure OpenAI Real-Time Chat</title>
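index.html (its markup is not reproduced above) is served by the `/` route with `__RTC_CONFIGURATION__` substituted, and it is expected to read assistant transcripts from the `/outputs` server-sent-events endpoint defined in app.py. A rough Python equivalent of that client-side loop, assuming the app is running locally on port 7860 and that a real `webrtc_id` has been obtained from the WebRTC offer exchange the page performs:

```python
import json

import httpx  # already pinned in requirements.txt below


def follow_transcripts(webrtc_id: str, base_url: str = "http://localhost:7860") -> None:
    """Print each assistant transcript pushed on the /outputs SSE stream."""
    with httpx.stream(
        "GET", f"{base_url}/outputs", params={"webrtc_id": webrtc_id}, timeout=None
    ) as response:
        for line in response.iter_lines():
            if line.startswith("data: "):
                message = json.loads(line[len("data: "):])
                print(f"{message['role']}: {message['content']}")


# follow_transcripts("replace-with-a-real-webrtc-id")
```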
diff --git a/demo/talk_to_azure_openai/requirements.txt b/demo/talk_to_azure_openai/requirements.txt
new file mode 100644
index 0000000..dc13284
--- /dev/null
+++ b/demo/talk_to_azure_openai/requirements.txt
@@ -0,0 +1,123 @@
+aiofiles==23.2.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.13
+aiohttp-retry==2.9.1
+aioice==0.9.0
+aiortc==1.10.1
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==25.2.0
+audioread==3.0.1
+av==13.1.0
+babel==2.17.0
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+coloredlogs==15.0.1
+colorlog==6.9.0
+cryptography==44.0.2
+csvw==3.5.1
+decorator==5.2.1
+distro==1.9.0
+dlinfo==2.0.0
+dnspython==2.7.0
+espeakng-loader==0.2.4
+fastapi==0.115.11
+fastrtc==0.0.14
+ffmpy==0.5.0
+filelock==3.17.0
+flatbuffers==25.2.10
+frozenlist==1.5.0
+fsspec==2025.3.0
+google-crc32c==1.6.0
+gradio==5.20.1
+gradio_client==1.7.2
+groovy==0.1.2
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.29.3
+humanfriendly==10.0
+idna==3.10
+ifaddr==0.2.0
+isodate==0.7.2
+Jinja2==3.1.6
+jiter==0.9.0
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kokoro-onnx==0.4.5
+language-tags==1.2.0
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+numba==0.61.0
+numpy==2.1.3
+onnxruntime==1.21.0
+openai==1.66.2
+orjson==3.10.15
+packaging==24.2
+pandas==2.2.3
+phonemizer-fork==3.3.1
+pillow==11.1.0
+platformdirs==4.3.6
+pooch==1.8.2
+propcache==0.3.0
+protobuf==6.30.0
+pycparser==2.22
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydub==0.25.1
+pyee==12.1.1
+Pygments==2.19.1
+PyJWT==2.10.1
+pylibsrtp==0.11.0
+pyOpenSSL==25.0.0
+pyparsing==3.2.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.20
+pytz==2025.1
+PyYAML==6.0.2
+rdflib==7.1.3
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rfc3986==1.5.0
+rich==13.9.4
+rpds-py==0.23.1
+ruff==0.9.10
+safehttpx==0.1.6
+scikit-learn==1.6.1
+scipy==1.15.2
+segments==2.3.0
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+sounddevice==0.5.1
+soundfile==0.13.1
+soxr==0.5.0.post1
+starlette==0.46.1
+sympy==1.13.3
+threadpoolctl==3.5.0
+tomlkit==0.13.2
+tqdm==4.67.1
+twilio==9.5.0
+typer==0.15.2
+typing_extensions==4.12.2
+tzdata==2025.1
+uritemplate==4.1.1
+urllib3==2.3.0
+uvicorn==0.34.0
+websockets==15.0.1
+yarl==1.18.3
diff --git a/docs/cookbook.md b/docs/cookbook.md
index c19107c..98a968f 100644
--- a/docs/cookbook.md
+++ b/docs/cookbook.md
@@ -35,6 +35,7 @@ A collection of applications built with FastRTC. Click on the tags below to find
+