diff --git a/backend/fastrtc/websocket.py b/backend/fastrtc/websocket.py index 041779f..4100208 100644 --- a/backend/fastrtc/websocket.py +++ b/backend/fastrtc/websocket.py @@ -43,7 +43,9 @@ def convert_to_mulaw( audio_data = audio_to_float32(audio_data) if original_rate != target_rate: - audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000) + audio_data = librosa.resample( + audio_data, orig_sr=original_rate, target_sr=target_rate + ) audio_data = audio_to_int16(audio_data) @@ -128,7 +130,10 @@ class WebSocketHandler: audioop.ulaw2lin(audio_payload, 2), dtype=np.int16 ) - if self.stream_handler.input_sample_rate != 8000: + if ( + self.stream_handler.phone_mode + and self.stream_handler.input_sample_rate != 8000 + ): audio_array = audio_to_float32(audio_array) audio_array = librosa.resample( audio_array, @@ -269,14 +274,15 @@ class WebSocketHandler: if not isinstance(frame, tuple): continue - target_rate = ( - self.stream_handler.output_sample_rate - if not self.stream_handler.phone_mode - else 8000 + 8_000 + if self.stream_handler.phone_mode + else self.stream_handler.output_sample_rate ) mulaw_audio = convert_to_mulaw( - frame[1], frame[0], target_rate=target_rate + frame[1], + frame[0], + target_rate=target_rate, ) audio_payload = base64.b64encode(mulaw_audio).decode("utf-8") diff --git a/demo/webrtc_vs_websocket/app.py b/demo/webrtc_vs_websocket/app.py index 36e4e5b..bf98a35 100644 --- a/demo/webrtc_vs_websocket/app.py +++ b/demo/webrtc_vs_websocket/app.py @@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages") stream = Stream( modality="audio", mode="send-receive", - handler=ReplyOnPause(response), + handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000), additional_outputs_handler=lambda a, b: b, additional_inputs=[chatbot], additional_outputs=[chatbot], diff --git a/demo/webrtc_vs_websocket/index.html b/demo/webrtc_vs_websocket/index.html index cbc72b0..869e06f 100644 --- 
a/demo/webrtc_vs_websocket/index.html +++ b/demo/webrtc_vs_websocket/index.html @@ -390,35 +390,8 @@ rttValues: [] }; - // Load mu-law library - - // Add load promise to track when the script is ready - - - function resample(audioData, fromSampleRate, toSampleRate) { - const ratio = fromSampleRate / toSampleRate; - const newLength = Math.round(audioData.length / ratio); - const result = new Float32Array(newLength); - - for (let i = 0; i < newLength; i++) { - const position = i * ratio; - const index = Math.floor(position); - const fraction = position - index; - - if (index + 1 < audioData.length) { - result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction; - } else { - result[i] = audioData[index]; - } - } - return result; - } function convertToMulaw(audioData, sampleRate) { - // Resample to 8000 Hz if needed - if (sampleRate !== 8000) { - audioData = resample(audioData, sampleRate, 8000); - } // Convert float32 [-1,1] to int16 [-32768,32767] const int16Data = new Int16Array(audioData.length); @@ -449,7 +422,7 @@ wsMetrics.startTime = performance.now(); // Create audio context and analyser for visualization - const audioContext = new AudioContext(); + const audioContext = new AudioContext({ sampleRate: 24000 }); const analyser = audioContext.createAnalyser(); const source = audioContext.createMediaStreamSource(stream); source.connect(analyser); diff --git a/docs/userguide/api.md b/docs/userguide/api.md index b91e7c9..b912650 100644 --- a/docs/userguide/api.md +++ b/docs/userguide/api.md @@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mod To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback. +The input audio must be mu-law encoded with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz. 
+The output audio will also be mu-law encoded and the sample rate will be equal to the output_sample_rate of the handler. By default it is 48 kHz. + \`\`\`javascript // Setup audio context and stream const audioContext = new AudioContext(); @@ -441,6 +444,40 @@ ws.onopen = () => { } }; }; + +ws.onmessage = (event) => { + const data = JSON.parse(event.data); + if (data?.type === "send_input") { + fetch('/input_hook', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + // Send additional input data here + body: JSON.stringify({ webrtc_id: wsId }) + }); + } + if (data.event === "media") { + // Process received audio + const audioData = atob(data.media.payload); + const mulawData = new Uint8Array(audioData.length); + for (let i = 0; i < audioData.length; i++) { + mulawData[i] = audioData.charCodeAt(i); + } + + // Convert mu-law to linear PCM + const linearData = alawmulaw.mulaw.decode(mulawData); + + // Create an AudioBuffer + const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate); + const channelData = audioBuffer.getChannelData(0); + + // Fill the buffer with the decoded data + for (let i = 0; i < linearData.length; i++) { + channelData[i] = linearData[i] / 32768.0; + } + + // Do something with Audio Buffer + } +}; \`\`\` {{?}} `);