Fix Websocket Client Processing (#286)

* Fix

* Add code
This commit is contained in:
Freddy Boulton
2025-04-17 12:21:13 -04:00
committed by GitHub
parent c9bca428af
commit a68023101d
4 changed files with 52 additions and 36 deletions

View File

@@ -43,7 +43,9 @@ def convert_to_mulaw(
audio_data = audio_to_float32(audio_data)
if original_rate != target_rate:
audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000)
audio_data = librosa.resample(
audio_data, orig_sr=original_rate, target_sr=target_rate
)
audio_data = audio_to_int16(audio_data)
@@ -128,7 +130,10 @@ class WebSocketHandler:
audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
)
if self.stream_handler.input_sample_rate != 8000:
if (
self.stream_handler.phone_mode
and self.stream_handler.input_sample_rate != 8000
):
audio_array = audio_to_float32(audio_array)
audio_array = librosa.resample(
audio_array,
@@ -269,14 +274,15 @@ class WebSocketHandler:
if not isinstance(frame, tuple):
continue
target_rate = (
self.stream_handler.output_sample_rate
if not self.stream_handler.phone_mode
else 8000
8_000
if self.stream_handler.phone_mode
else self.stream_handler.output_sample_rate
)
mulaw_audio = convert_to_mulaw(
frame[1], frame[0], target_rate=target_rate
frame[1],
frame[0],
target_rate=target_rate,
)
audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")

View File

@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
stream = Stream(
modality="audio",
mode="send-receive",
handler=ReplyOnPause(response),
handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
additional_outputs_handler=lambda a, b: b,
additional_inputs=[chatbot],
additional_outputs=[chatbot],

View File

@@ -390,35 +390,8 @@
rttValues: []
};
// Load mu-law library
// Add load promise to track when the script is ready
/**
 * Resample audio samples to a different sample rate using linear interpolation.
 *
 * @param {ArrayLike<number>} audioData - Input samples.
 * @param {number} fromSampleRate - Sample rate of the input.
 * @param {number} toSampleRate - Desired sample rate of the output.
 * @returns {Float32Array} Resampled samples; length is
 *   Math.round(audioData.length / (fromSampleRate / toSampleRate)).
 */
function resample(audioData, fromSampleRate, toSampleRate) {
  // Distance, in input samples, between two consecutive output samples.
  const step = fromSampleRate / toSampleRate;
  const outputLength = Math.round(audioData.length / step);
  const lastSourceIndex = audioData.length - 1;
  const output = new Float32Array(outputLength);

  let outIndex = 0;
  while (outIndex < outputLength) {
    const sourcePos = outIndex * step;
    const left = Math.floor(sourcePos);
    const weight = sourcePos - left;
    // Blend the two neighbouring samples; at the final input sample there is
    // no right-hand neighbour, so it is copied as-is.
    output[outIndex] = left < lastSourceIndex
      ? audioData[left] * (1 - weight) + audioData[left + 1] * weight
      : audioData[left];
    outIndex += 1;
  }

  return output;
}
function convertToMulaw(audioData, sampleRate) {
// Resample to 8000 Hz if needed
if (sampleRate !== 8000) {
audioData = resample(audioData, sampleRate, 8000);
}
// Convert float32 [-1,1] to int16 [-32768,32767]
const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
wsMetrics.startTime = performance.now();
// Create audio context and analyser for visualization
const audioContext = new AudioContext();
const audioContext = new AudioContext({ sampleRate: 24000 });
const analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);

View File

@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mod
To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
The input audio must be mu-law encoded, with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default, this is 48 kHz.
The output audio will also be mu-law encoded, with a sample rate equal to the output_sample_rate of the handler. By default, this is 48 kHz.
\`\`\`javascript
// Setup audio context and stream
const audioContext = new AudioContext();
@@ -441,6 +444,40 @@ ws.onopen = () => {
}
};
};
// Handle JSON messages received from the server over the WebSocket.
// NOTE(review): wsId, outputContext, sampleRate and alawmulaw are not defined
// in this snippet; they are assumed to come from the surrounding setup code.
ws.onmessage = (event) => {
  const data = JSON.parse(event.data);

  if (data?.type === "send_input") {
    fetch('/input_hook', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // Send additional input data here
      body: JSON.stringify({ webrtc_id: wsId })
    }).catch((err) => {
      // Surface network failures instead of leaving an unhandled rejection.
      console.error("Failed to POST to /input_hook:", err);
    });
  }

  // Null-safe access, matching the check above: JSON.parse may yield null.
  if (data?.event === "media") {
    // Process received audio: base64 -> raw mu-law bytes
    const audioData = atob(data.media.payload);
    const mulawData = new Uint8Array(audioData.length);
    for (let i = 0; i < audioData.length; i++) {
      mulawData[i] = audioData.charCodeAt(i);
    }

    // Convert mu-law to linear PCM
    const linearData = alawmulaw.mulaw.decode(mulawData);

    // Create an AudioBuffer
    const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
    const channelData = audioBuffer.getChannelData(0);

    // Fill the buffer with the decoded data, dividing each sample by 32768
    // to scale it for the AudioBuffer
    for (let i = 0; i < linearData.length; i++) {
      channelData[i] = linearData[i] / 32768.0;
    }

    // Do something with Audio Buffer
  }
};
\`\`\`
{{?}}
`);