Mirror of https://github.com/HumanAIGC-Engineering/gradio-webrtc.git (synced 2026-02-04 17:39:23 +08:00)
@@ -43,7 +43,9 @@ def convert_to_mulaw(
     audio_data = audio_to_float32(audio_data)
 
     if original_rate != target_rate:
-        audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000)
+        audio_data = librosa.resample(
+            audio_data, orig_sr=original_rate, target_sr=target_rate
+        )
 
     audio_data = audio_to_int16(audio_data)
 
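The fix above replaces a hardcoded `target_sr=8000` with the `target_rate` argument, so the helper can now resample to whatever rate the caller asks for. For context, a minimal self-contained sketch of what the fixed helper plausibly looks like; the `audio_to_float32`/`audio_to_int16` helpers are approximated inline, and the final mu-law step via the stdlib `audioop` module is an assumption:

```python
import audioop  # stdlib; deprecated since Python 3.11, removed in 3.13

import librosa
import numpy as np


def convert_to_mulaw(
    audio_data: np.ndarray, original_rate: int, target_rate: int = 8_000
) -> bytes:
    """Resample int16 PCM to target_rate, then mu-law encode it."""
    # Approximation of audio_to_float32: int16 -> float32 in [-1, 1]
    audio_data = audio_data.astype(np.float32) / 32768.0
    if original_rate != target_rate:
        # The fix: resample to target_rate instead of a hardcoded 8000
        audio_data = librosa.resample(
            audio_data, orig_sr=original_rate, target_sr=target_rate
        )
    # Approximation of audio_to_int16: float32 [-1, 1] -> int16
    audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    return audioop.lin2ulaw(audio_int16.tobytes(), 2)  # width=2 -> 16-bit samples
```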
@@ -128,7 +130,10 @@ class WebSocketHandler:
                 audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
             )
 
-            if self.stream_handler.input_sample_rate != 8000:
+            if (
+                self.stream_handler.phone_mode
+                and self.stream_handler.input_sample_rate != 8000
+            ):
                 audio_array = audio_to_float32(audio_array)
                 audio_array = librosa.resample(
                     audio_array,
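The new condition resamples incoming audio only in phone mode, where mu-law frames arrive at the telephony-standard 8 kHz and the handler may expect a different rate. A sketch of that receive path as a standalone function, with hypothetical names mirroring the handler code above:

```python
import audioop
import base64

import librosa
import numpy as np


def decode_media_payload(
    payload_b64: str, handler_rate: int, phone_mode: bool
) -> np.ndarray:
    """Decode a base64 mu-law "media" payload into int16 PCM at handler_rate."""
    audio_array = np.frombuffer(
        audioop.ulaw2lin(base64.b64decode(payload_b64), 2), dtype=np.int16
    )
    # Telephone audio arrives at 8 kHz; resample only in phone mode and only
    # when the handler expects a different rate (the condition fixed above).
    if phone_mode and handler_rate != 8_000:
        audio_float = audio_array.astype(np.float32) / 32768.0
        audio_float = librosa.resample(
            audio_float, orig_sr=8_000, target_sr=handler_rate
        )
        audio_array = (np.clip(audio_float, -1.0, 1.0) * 32767).astype(np.int16)
    return audio_array
```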
@@ -269,14 +274,15 @@ class WebSocketHandler:
 
                 if not isinstance(frame, tuple):
                     continue
 
                 target_rate = (
-                    self.stream_handler.output_sample_rate
-                    if not self.stream_handler.phone_mode
-                    else 8000
+                    8_000
+                    if self.stream_handler.phone_mode
+                    else self.stream_handler.output_sample_rate
                 )
                 mulaw_audio = convert_to_mulaw(
-                    frame[1], frame[0], target_rate=target_rate
+                    frame[1],
+                    frame[0],
+                    target_rate=target_rate,
                 )
                 audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")
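On the send side the handler now targets 8 kHz only in phone mode and the handler's own output_sample_rate otherwise, then mu-law encodes and base64-wraps the frame. A sketch of the final framing step, using the "media" event shape the client example at the end of this document reads (any sibling fields, such as a stream id, are an assumption and omitted here):

```python
import base64
import json


def build_media_event(mulaw_audio: bytes) -> str:
    """Wrap mu-law bytes in the "media" event shape the client example expects."""
    payload = base64.b64encode(mulaw_audio).decode("utf-8")
    return json.dumps({"event": "media", "media": {"payload": payload}})
```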
@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
     mode="send-receive",
-    handler=ReplyOnPause(response),
+    handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
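Pinning both rates to 24_000 here matches the 24 kHz AudioContext the docs example now creates (see the @@ -449,7 +422,7 @@ hunk below), so the browser and the handler agree on a sample rate without extra client-side resampling.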
@@ -390,35 +390,8 @@
     rttValues: []
 };
 
-// Load mu-law library
-// Add load promise to track when the script is ready
-
-function resample(audioData, fromSampleRate, toSampleRate) {
-    const ratio = fromSampleRate / toSampleRate;
-    const newLength = Math.round(audioData.length / ratio);
-    const result = new Float32Array(newLength);
-
-    for (let i = 0; i < newLength; i++) {
-        const position = i * ratio;
-        const index = Math.floor(position);
-        const fraction = position - index;
-
-        if (index + 1 < audioData.length) {
-            result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
-        } else {
-            result[i] = audioData[index];
-        }
-    }
-    return result;
-}
-
-function convertToMulaw(audioData, sampleRate) {
-    // Resample to 8000 Hz if needed
-    if (sampleRate !== 8000) {
-        audioData = resample(audioData, sampleRate, 8000);
-    }
-
-    // Convert float32 [-1,1] to int16 [-32768,32767]
-    const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
     wsMetrics.startTime = performance.now();
 
     // Create audio context and analyser for visualization
-    const audioContext = new AudioContext();
+    const audioContext = new AudioContext({ sampleRate: 24000 });
     const analyser = audioContext.createAnalyser();
     const source = audioContext.createMediaStreamSource(stream);
     source.connect(analyser);
@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mode
 
 To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
 
+The input audio must be mu-law encoded, with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz.
+The output audio will also be mu-law encoded, and its sample rate will be equal to the output_sample_rate of the handler. By default it is 48 kHz.
+
 \`\`\`javascript
 // Setup audio context and stream
 const audioContext = new AudioContext();
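The same send path can be exercised from Python as well. A hedged sketch using the `websockets` package: only the "media" frame shape is taken from the examples in this document, while the URL and any handshake messages are assumptions and elided here.

```python
import audioop
import base64
import json

import websockets  # pip install websockets


async def stream_pcm_chunks(chunks, url="ws://localhost:8000/websocket/offer"):
    """Send int16 PCM chunks (already at the handler's input_sample_rate)
    as base64 mu-law "media" events. Drive with asyncio.run(...)."""
    async with websockets.connect(url) as ws:  # url is an assumption
        for chunk in chunks:  # each chunk: bytes of 16-bit PCM
            payload = base64.b64encode(audioop.lin2ulaw(chunk, 2)).decode("utf-8")
            await ws.send(
                json.dumps({"event": "media", "media": {"payload": payload}})
            )
```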
@@ -441,6 +444,40 @@ ws.onopen = () => {
         }
     };
 };
+
+ws.onmessage = (event) => {
+    const data = JSON.parse(event.data);
+    if (data?.type === "send_input") {
+        fetch('/input_hook', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            // Send additional input data here
+            body: JSON.stringify({ webrtc_id: wsId })
+        });
+    }
+    if (data.event === "media") {
+        // Process received audio
+        const audioData = atob(data.media.payload);
+        const mulawData = new Uint8Array(audioData.length);
+        for (let i = 0; i < audioData.length; i++) {
+            mulawData[i] = audioData.charCodeAt(i);
+        }
+
+        // Convert mu-law to linear PCM
+        const linearData = alawmulaw.mulaw.decode(mulawData);
+
+        // Create an AudioBuffer
+        const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
+        const channelData = audioBuffer.getChannelData(0);
+
+        // Fill the buffer with the decoded data
+        for (let i = 0; i < linearData.length; i++) {
+            channelData[i] = linearData[i] / 32768.0;
+        }
+
+        // Do something with Audio Buffer
+    }
+};
 \`\`\`
 {{?}}
 `);