Fix Websocket Client Processing (#286)

* Fix * Add code
2026-02-05 18:09:23 +08:00 · 2025-04-17 12:21:13 -04:00
parent c9bca428af
commit a68023101d
4 changed files with 52 additions and 36 deletions
--- a/backend/fastrtc/websocket.py
+++ b/backend/fastrtc/websocket.py
@@ -43,7 +43,9 @@ def convert_to_mulaw(
    audio_data = audio_to_float32(audio_data)
    if original_rate != target_rate:
-        audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000)
+        audio_data = librosa.resample(
            audio_data, orig_sr=original_rate, target_sr=target_rate
        )
    audio_data = audio_to_int16(audio_data)
@@ -128,7 +130,10 @@ class WebSocketHandler:
                        audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
                    )
-                    if self.stream_handler.input_sample_rate != 8000:
+                    if (
                        self.stream_handler.phone_mode
                        and self.stream_handler.input_sample_rate != 8000
                    ):
                        audio_array = audio_to_float32(audio_array)
                        audio_array = librosa.resample(
                            audio_array,
@@ -269,14 +274,15 @@ class WebSocketHandler:
                    if not isinstance(frame, tuple):
                        continue
                    target_rate = (
-                        self.stream_handler.output_sample_rate
+                        8_000
-                        if not self.stream_handler.phone_mode
+                        if self.stream_handler.phone_mode
-                        else 8000
+                        else self.stream_handler.output_sample_rate
                    )
                    mulaw_audio = convert_to_mulaw(
-                        frame[1], frame[0], target_rate=target_rate
+                        frame[1],
                        frame[0],
                        target_rate=target_rate,
                    )
                    audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")
--- a/demo/webrtc_vs_websocket/app.py
+++ b/demo/webrtc_vs_websocket/app.py
@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
 stream = Stream(
    modality="audio",
    mode="send-receive",
-    handler=ReplyOnPause(response),
+    handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
--- a/demo/webrtc_vs_websocket/index.html
+++ b/demo/webrtc_vs_websocket/index.html
@@ -390,35 +390,8 @@
            rttValues: []
        };
        // Load mu-law library
        // Add load promise to track when the script is ready
        /**
         * Resample audio by linear interpolation between neighbouring samples.
         *
         * @param {ArrayLike<number>} audioData - Input samples.
         * @param {number} fromSampleRate - Sample rate of `audioData`.
         * @param {number} toSampleRate - Desired sample rate of the result.
         * @returns {Float32Array} Samples at `toSampleRate`; the length is
         *     `audioData.length * toSampleRate / fromSampleRate`, rounded.
         */
        function resample(audioData, fromSampleRate, toSampleRate) {
            // Distance travelled in the input for each output sample.
            const step = fromSampleRate / toSampleRate;
            const outLength = Math.round(audioData.length / step);
            const output = new Float32Array(outLength);

            for (let n = 0; n < outLength; n += 1) {
                const pos = n * step;
                const lo = Math.floor(pos);
                const hi = lo + 1;
                const weight = pos - lo;

                // No right-hand neighbour at the tail: repeat the last input sample.
                output[n] = hi < audioData.length
                    ? audioData[lo] * (1 - weight) + audioData[hi] * weight
                    : audioData[lo];
            }

            return output;
        }
        function convertToMulaw(audioData, sampleRate) {
            // Resample to 8000 Hz if needed
            if (sampleRate !== 8000) {
                audioData = resample(audioData, sampleRate, 8000);
            }
            // Convert float32 [-1,1] to int16 [-32768,32767]
            const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
                wsMetrics.startTime = performance.now();
                // Create audio context and analyser for visualization
-                const audioContext = new AudioContext();
+                const audioContext = new AudioContext({ sampleRate: 24000 });
                const analyser = audioContext.createAnalyser();
                const source = audioContext.createMediaStreamSource(stream);
                source.connect(analyser);
--- a/docs/userguide/api.md
+++ b/docs/userguide/api.md
@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mod
 To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
 The input audio must be mu-law encoded with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz.
 The output audio will also be mu-law encoded, with a sample rate equal to the output_sample_rate of the handler. By default it is 48 kHz.
 \`\`\`javascript
 // Setup audio context and stream
 const audioContext = new AudioContext();
@@ -441,6 +444,40 @@ ws.onopen = () => {
        }
    };
 };
 ws.onmessage = (event) => {
    // Each incoming message is parsed as JSON; `type` and `event` select the handling below.
    const data = JSON.parse(event.data);

    if (data?.type === "send_input") {
        // POST this connection's id (plus any extra inputs) to the input hook.
        fetch('/input_hook', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            // Send additional input data here
            body: JSON.stringify({ webrtc_id: wsId })
        }).catch((err) => {
            // Report request failures instead of leaving an unhandled rejection.
            console.error('Failed to send input data:', err);
        });
    }

    // Optional chaining here too, so a JSON `null` payload is ignored rather than throwing.
    if (data?.event === "media") {
        // Process received audio: the payload is base64; copy its bytes into a Uint8Array
        const audioData = atob(data.media.payload);
        const mulawData = new Uint8Array(audioData.length);
        for (let i = 0; i < audioData.length; i++) {
            mulawData[i] = audioData.charCodeAt(i);
        }

        // Convert mu-law to linear PCM
        const linearData = alawmulaw.mulaw.decode(mulawData);

        // Create an AudioBuffer
        const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
        const channelData = audioBuffer.getChannelData(0);

        // Fill the buffer with the decoded data, scaled by 1/32768
        for (let i = 0; i < linearData.length; i++) {
            channelData[i] = linearData[i] / 32768.0;
        }

        // Do something with Audio Buffer
    }
 };
 \`\`\`
 {{?}}
 `);