mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
Fix websocket interruption (#291)
* Code * Fix * add code * interruptions * Add code * code * Add code * Add code * code
This commit is contained in:
14
demo/qwen_phone_chat/README.md
Normal file
14
demo/qwen_phone_chat/README.md
Normal file
@@ -0,0 +1,14 @@
|
||||
---
|
||||
title: Qwen Phone Chat
|
||||
emoji: 📞
|
||||
colorFrom: pink
|
||||
colorTo: green
|
||||
sdk: gradio
|
||||
sdk_version: 5.25.2
|
||||
app_file: app.py
|
||||
pinned: false
|
||||
license: mit
|
||||
short_description: Talk with Qwen 2.5 Omni over the Phone
|
||||
---
|
||||
|
||||
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
||||
217
demo/qwen_phone_chat/app.py
Normal file
217
demo/qwen_phone_chat/app.py
Normal file
@@ -0,0 +1,217 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import secrets
|
||||
from pathlib import Path
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastrtc import (
|
||||
AdditionalOutputs,
|
||||
AsyncStreamHandler,
|
||||
Stream,
|
||||
get_cloudflare_turn_credentials_async,
|
||||
wait_for_item,
|
||||
)
|
||||
from websockets.asyncio.client import connect
|
||||
|
||||
load_dotenv()

# Directory containing this file (available for locating bundled assets).
cur_dir = Path(__file__).parent

# API key for the DashScope realtime endpoint; empty string when unset.
API_KEY = os.getenv("MODELSCOPE_API_KEY", "")
# Realtime websocket endpoint, pinned to a specific qwen-omni-turbo snapshot.
API_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen-omni-turbo-realtime-2025-03-26"
# Voice names offered in the UI dropdown defined below.
VOICES = ["Chelsie", "Serena", "Ethan", "Cherry"]
# Bearer-token auth header sent when opening the realtime websocket.
headers = {"Authorization": "Bearer " + API_KEY}
||||
class QwenOmniHandler(AsyncStreamHandler):
    """Bridge audio between a fastrtc stream and the Qwen Omni realtime API.

    Caller audio arriving via ``receive`` is base64-encoded and forwarded over
    the realtime websocket; model audio deltas are decoded and queued for
    ``emit``. When the API signals that the caller started speaking, any
    queued (not yet played) model audio is dropped to support interruption.
    """

    def __init__(
        self,
    ) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24_000,
            input_sample_rate=16_000,
        )
        # Open realtime websocket; set once start_up() has connected.
        self.connection = None
        # (sample_rate, np.int16 array) tuples drained by emit().
        self.output_queue = asyncio.Queue()

    def copy(self):
        """Return a fresh handler instance for a new stream connection."""
        return QwenOmniHandler()

    @staticmethod
    def msg_id() -> str:
        """Generate a unique event id for realtime API messages."""
        return f"event_{secrets.token_hex(10)}"

    async def start_up(
        self,
    ):
        """Connect to realtime API. Run forever in separate thread to keep connection open."""
        # NOTE(review): voice is hardcoded; the UI's voice dropdown is not
        # consumed here — confirm whether it should be.
        voice_id = "Serena"
        print("voice_id", voice_id)
        async with connect(
            API_URL,
            additional_headers=headers,
        ) as conn:
            await conn.send(
                json.dumps(
                    {
                        "event_id": self.msg_id(),
                        "type": "session.update",
                        "session": {
                            "modalities": [
                                "text",
                                "audio",
                            ],
                            "voice": voice_id,
                            "input_audio_format": "pcm16",
                        },
                    }
                )
            )
            self.connection = conn
            try:
                async for data in self.connection:
                    event = json.loads(data)
                    # BUG FIX: the original printed event["type"] BEFORE the
                    # membership check below, so a type-less event raised
                    # KeyError and killed the receive loop. Guard first.
                    if "type" not in event:
                        continue
                    print("event", event["type"])
                    # Handle interruptions: the caller started talking, so
                    # discard any model audio still waiting to be played.
                    if event["type"] == "input_audio_buffer.speech_started":
                        self.clear_queue()
                    if event["type"] == "response.audio.delta":
                        await self.output_queue.put(
                            (
                                self.output_sample_rate,
                                np.frombuffer(
                                    base64.b64decode(event["delta"]), dtype=np.int16
                                ).reshape(1, -1),
                            ),
                        )
            except Exception as e:
                print("error", e)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward one caller audio frame to the realtime API (no-op until connected)."""
        if not self.connection:
            return
        _, array = frame
        array = array.squeeze()
        audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
        try:
            await self.connection.send(
                json.dumps(
                    {
                        "event_id": self.msg_id(),
                        "type": "input_audio_buffer.append",
                        "audio": audio_message,
                    }
                )
            )
        except Exception as e:
            print("error", e)

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
        """Yield the next queued model audio chunk, or None when nothing is ready."""
        return await wait_for_item(self.output_queue)

    async def shutdown(self) -> None:
        """Close the realtime websocket if it is still open."""
        if self.connection:
            await self.connection.close()
            self.connection = None
|
||||
|
||||
|
||||
# Voice selector shown as an additional input in the stream UI.
# NOTE(review): the handler's start_up() hardcodes "Serena" and does not read
# this dropdown — confirm whether it should be wired through.
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], type="value", label="Voice")
# Bidirectional audio stream: caller audio in, Qwen audio out, capped at
# 20 concurrent connections and 3 minutes per call.
stream = Stream(
    QwenOmniHandler(),
    mode="send-receive",
    modality="audio",
    additional_inputs=[voice],
    additional_outputs=None,
    rtc_configuration=get_cloudflare_turn_credentials_async,
    concurrency_limit=20,
    time_limit=180,
)
|
||||
|
||||
# FastAPI app hosting the Twilio webhook and the landing page.
app = FastAPI()
|
||||
|
||||
|
||||
@app.post("/telephone/incoming")
async def handle_incoming_call(request: Request):
    """
    Handle incoming telephone calls (e.g., via Twilio).

    Generates TwiML instructions to connect the incoming call to the
    WebSocket handler (`/telephone/handler`) for audio streaming.

    Args:
        request: The FastAPI Request object for the incoming call webhook.

    Returns:
        An HTMLResponse containing the TwiML instructions as XML.
    """
    from twilio.twiml.voice_response import Connect, VoiceResponse

    # Reject the call politely when the stream is at its concurrency cap.
    if len(stream.connections) > (stream.concurrency_limit or 20):
        response = VoiceResponse()
        response.say("Qwen is busy please try again later!")
        return HTMLResponse(content=str(response), media_type="application/xml")

    response = VoiceResponse()
    response.say("Connecting to Qwen")
    # FIX: renamed from `connect`, which shadowed the module-level
    # `connect` imported from websockets.asyncio.client.
    connect_verb = Connect()
    print("request.url.hostname", request.url.hostname)
    connect_verb.stream(url=f"wss://{request.url.hostname}/telephone/handler")
    response.append(connect_verb)
    # Played only after the <Connect> stream ends.
    response.say("The call has been disconnected.")
    return HTMLResponse(content=str(response), media_type="application/xml")
|
||||
|
||||
|
||||
# Mount the fastrtc stream's endpoints (including /telephone/handler) on the app.
stream.mount(app)
|
||||
|
||||
|
||||
@app.get("/")
async def _():
    """Serve a static landing page showing the phone number to call."""
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Qwen Phone Chat</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                max-width: 800px;
                margin: 0 auto;
                padding: 20px;
                line-height: 1.6;
            }
            pre {
                background-color: #f5f5f5;
                padding: 15px;
                border-radius: 5px;
                overflow-x: auto;
            }
            h1 {
                color: #333;
            }
        </style>
    </head>
    <body>
        <h1>Qwen Phone Chat</h1>
        <p>Call +1 (877) 853-7936</p>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Alternative: let fastrtc provision a temporary phone number.
    # stream.fastphone(host="0.0.0.0", port=7860)
    import uvicorn

    # Serve the FastAPI app (webhook + landing page) on all interfaces.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
||||
2
demo/qwen_phone_chat/requirements.txt
Normal file
2
demo/qwen_phone_chat/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
fastrtc
|
||||
websockets>=14.0
|
||||
@@ -17,7 +17,6 @@ from fastrtc import (
|
||||
wait_for_item,
|
||||
)
|
||||
from gradio.utils import get_space
|
||||
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@@ -103,8 +102,8 @@ class OpenAIHandler(AsyncStreamHandler):
|
||||
self.connection = None
|
||||
|
||||
|
||||
def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
|
||||
chatbot.append({"role": "assistant", "content": response.transcript})
|
||||
def update_chatbot(chatbot: list[dict], response: dict):
|
||||
chatbot.append(response)
|
||||
return chatbot
|
||||
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ def response(
|
||||
)
|
||||
for chunk in aggregate_bytes_to_16bit(iterator):
|
||||
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
|
||||
yield (24000, audio_array, "mono")
|
||||
yield (24000, audio_array)
|
||||
|
||||
|
||||
chatbot = gr.Chatbot(type="messages")
|
||||
|
||||
Reference in New Issue
Block a user