Fix Websocket Client Processing (#286)

* Fix

* Add code
This commit is contained in:
Freddy Boulton
2025-04-17 12:21:13 -04:00
committed by GitHub
parent c9bca428af
commit a68023101d
4 changed files with 52 additions and 36 deletions

View File

@@ -43,7 +43,9 @@ def convert_to_mulaw(
audio_data = audio_to_float32(audio_data) audio_data = audio_to_float32(audio_data)
if original_rate != target_rate: if original_rate != target_rate:
audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000) audio_data = librosa.resample(
audio_data, orig_sr=original_rate, target_sr=target_rate
)
audio_data = audio_to_int16(audio_data) audio_data = audio_to_int16(audio_data)
@@ -128,7 +130,10 @@ class WebSocketHandler:
audioop.ulaw2lin(audio_payload, 2), dtype=np.int16 audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
) )
if self.stream_handler.input_sample_rate != 8000: if (
self.stream_handler.phone_mode
and self.stream_handler.input_sample_rate != 8000
):
audio_array = audio_to_float32(audio_array) audio_array = audio_to_float32(audio_array)
audio_array = librosa.resample( audio_array = librosa.resample(
audio_array, audio_array,
@@ -269,14 +274,15 @@ class WebSocketHandler:
if not isinstance(frame, tuple): if not isinstance(frame, tuple):
continue continue
target_rate = ( target_rate = (
self.stream_handler.output_sample_rate 8_000
if not self.stream_handler.phone_mode if self.stream_handler.phone_mode
else 8000 else self.stream_handler.output_sample_rate
) )
mulaw_audio = convert_to_mulaw( mulaw_audio = convert_to_mulaw(
frame[1], frame[0], target_rate=target_rate frame[1],
frame[0],
target_rate=target_rate,
) )
audio_payload = base64.b64encode(mulaw_audio).decode("utf-8") audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")

View File

@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
stream = Stream( stream = Stream(
modality="audio", modality="audio",
mode="send-receive", mode="send-receive",
handler=ReplyOnPause(response), handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
additional_outputs_handler=lambda a, b: b, additional_outputs_handler=lambda a, b: b,
additional_inputs=[chatbot], additional_inputs=[chatbot],
additional_outputs=[chatbot], additional_outputs=[chatbot],

View File

@@ -390,35 +390,8 @@
rttValues: [] rttValues: []
}; };
// Load mu-law library
// Add load promise to track when the script is ready
/**
 * Resamples mono audio with linear interpolation.
 *
 * @param {Float32Array|number[]} audioData - Input samples.
 * @param {number} fromSampleRate - Sample rate of `audioData`.
 * @param {number} toSampleRate - Desired sample rate of the result.
 * @returns {Float32Array} A new array holding
 *   `Math.round(audioData.length / (fromSampleRate / toSampleRate))` samples.
 */
function resample(audioData, fromSampleRate, toSampleRate) {
  const step = fromSampleRate / toSampleRate;
  const outputLength = Math.round(audioData.length / step);
  const lastIndex = audioData.length - 1;

  return new Float32Array(outputLength).map((_, i) => {
    const sourcePos = i * step;
    const left = Math.floor(sourcePos);

    // No right-hand neighbour to blend with: reuse the sample as-is.
    if (left >= lastIndex) {
      return audioData[left];
    }

    const weight = sourcePos - left;
    return audioData[left] * (1 - weight) + audioData[left + 1] * weight;
  });
}
function convertToMulaw(audioData, sampleRate) { function convertToMulaw(audioData, sampleRate) {
// Resample to 8000 Hz if needed
if (sampleRate !== 8000) {
audioData = resample(audioData, sampleRate, 8000);
}
// Convert float32 [-1,1] to int16 [-32768,32767] // Convert float32 [-1,1] to int16 [-32768,32767]
const int16Data = new Int16Array(audioData.length); const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
wsMetrics.startTime = performance.now(); wsMetrics.startTime = performance.now();
// Create audio context and analyser for visualization // Create audio context and analyser for visualization
const audioContext = new AudioContext(); const audioContext = new AudioContext({ sampleRate: 24000 });
const analyser = audioContext.createAnalyser(); const analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream); const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser); source.connect(analyser);

View File

@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mod
To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback. To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
The input audio must be mu-law encoded with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz.
The output audio will also be mu-law encoded, and its sample rate will be equal to the output_sample_rate of the handler. By default it is 48 kHz.
\`\`\`javascript \`\`\`javascript
// Setup audio context and stream // Setup audio context and stream
const audioContext = new AudioContext(); const audioContext = new AudioContext();
@@ -441,6 +444,40 @@ ws.onopen = () => {
} }
}; };
}; };
// Handle messages sent by the server over the WebSocket. Two message shapes
// are handled: { type: "send_input" } and { event: "media", media: { payload } }.
ws.onmessage = (event) => {
  const data = JSON.parse(event.data);

  if (data?.type === "send_input") {
    fetch('/input_hook', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // Send additional input data here
      body: JSON.stringify({ webrtc_id: wsId })
    }).catch((error) => {
      // Surface network failures instead of leaving the promise unhandled.
      console.error("Failed to send input", error);
    });
  }

  // Optional chaining here too, so a JSON null message is ignored rather
  // than throwing on property access.
  if (data?.event === "media") {
    // Process received audio: base64 -> binary string -> bytes
    const binary = atob(data.media.payload);
    const mulawData = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));

    // Convert mu-law to linear PCM
    const linearData = alawmulaw.mulaw.decode(mulawData);

    // Create an AudioBuffer
    // NOTE(review): outputContext and sampleRate are not defined in this
    // snippet; they are assumed to be set up by the surrounding code.
    const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
    const channelData = audioBuffer.getChannelData(0);

    // Fill the buffer with the decoded data, scaling each sample by 1/32768
    // (presumably 16-bit PCM -> float range; confirm against the decoder).
    for (let i = 0; i < linearData.length; i++) {
      channelData[i] = linearData[i] / 32768.0;
    }

    // Do something with Audio Buffer
  }
};
\`\`\` \`\`\`
{{?}} {{?}}
`); `);