mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
@@ -43,7 +43,9 @@ def convert_to_mulaw(
|
|||||||
audio_data = audio_to_float32(audio_data)
|
audio_data = audio_to_float32(audio_data)
|
||||||
|
|
||||||
if original_rate != target_rate:
|
if original_rate != target_rate:
|
||||||
audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000)
|
audio_data = librosa.resample(
|
||||||
|
audio_data, orig_sr=original_rate, target_sr=target_rate
|
||||||
|
)
|
||||||
|
|
||||||
audio_data = audio_to_int16(audio_data)
|
audio_data = audio_to_int16(audio_data)
|
||||||
|
|
||||||
@@ -128,7 +130,10 @@ class WebSocketHandler:
|
|||||||
audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
|
audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.stream_handler.input_sample_rate != 8000:
|
if (
|
||||||
|
self.stream_handler.phone_mode
|
||||||
|
and self.stream_handler.input_sample_rate != 8000
|
||||||
|
):
|
||||||
audio_array = audio_to_float32(audio_array)
|
audio_array = audio_to_float32(audio_array)
|
||||||
audio_array = librosa.resample(
|
audio_array = librosa.resample(
|
||||||
audio_array,
|
audio_array,
|
||||||
@@ -269,14 +274,15 @@ class WebSocketHandler:
|
|||||||
|
|
||||||
if not isinstance(frame, tuple):
|
if not isinstance(frame, tuple):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
target_rate = (
|
target_rate = (
|
||||||
self.stream_handler.output_sample_rate
|
8_000
|
||||||
if not self.stream_handler.phone_mode
|
if self.stream_handler.phone_mode
|
||||||
else 8000
|
else self.stream_handler.output_sample_rate
|
||||||
)
|
)
|
||||||
mulaw_audio = convert_to_mulaw(
|
mulaw_audio = convert_to_mulaw(
|
||||||
frame[1], frame[0], target_rate=target_rate
|
frame[1],
|
||||||
|
frame[0],
|
||||||
|
target_rate=target_rate,
|
||||||
)
|
)
|
||||||
audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")
|
audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")
|
||||||
|
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
|
|||||||
stream = Stream(
|
stream = Stream(
|
||||||
modality="audio",
|
modality="audio",
|
||||||
mode="send-receive",
|
mode="send-receive",
|
||||||
handler=ReplyOnPause(response),
|
handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
|
||||||
additional_outputs_handler=lambda a, b: b,
|
additional_outputs_handler=lambda a, b: b,
|
||||||
additional_inputs=[chatbot],
|
additional_inputs=[chatbot],
|
||||||
additional_outputs=[chatbot],
|
additional_outputs=[chatbot],
|
||||||
|
|||||||
@@ -390,35 +390,8 @@
|
|||||||
rttValues: []
|
rttValues: []
|
||||||
};
|
};
|
||||||
|
|
||||||
// Load mu-law library
|
|
||||||
|
|
||||||
// Add load promise to track when the script is ready
|
|
||||||
|
|
||||||
|
|
||||||
function resample(audioData, fromSampleRate, toSampleRate) {
|
|
||||||
const ratio = fromSampleRate / toSampleRate;
|
|
||||||
const newLength = Math.round(audioData.length / ratio);
|
|
||||||
const result = new Float32Array(newLength);
|
|
||||||
|
|
||||||
for (let i = 0; i < newLength; i++) {
|
|
||||||
const position = i * ratio;
|
|
||||||
const index = Math.floor(position);
|
|
||||||
const fraction = position - index;
|
|
||||||
|
|
||||||
if (index + 1 < audioData.length) {
|
|
||||||
result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
|
|
||||||
} else {
|
|
||||||
result[i] = audioData[index];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
function convertToMulaw(audioData, sampleRate) {
|
function convertToMulaw(audioData, sampleRate) {
|
||||||
// Resample to 8000 Hz if needed
|
|
||||||
if (sampleRate !== 8000) {
|
|
||||||
audioData = resample(audioData, sampleRate, 8000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert float32 [-1,1] to int16 [-32768,32767]
|
// Convert float32 [-1,1] to int16 [-32768,32767]
|
||||||
const int16Data = new Int16Array(audioData.length);
|
const int16Data = new Int16Array(audioData.length);
|
||||||
@@ -449,7 +422,7 @@
|
|||||||
wsMetrics.startTime = performance.now();
|
wsMetrics.startTime = performance.now();
|
||||||
|
|
||||||
// Create audio context and analyser for visualization
|
// Create audio context and analyser for visualization
|
||||||
const audioContext = new AudioContext();
|
const audioContext = new AudioContext({ sampleRate: 24000 });
|
||||||
const analyser = audioContext.createAnalyser();
|
const analyser = audioContext.createAnalyser();
|
||||||
const source = audioContext.createMediaStreamSource(stream);
|
const source = audioContext.createMediaStreamSource(stream);
|
||||||
source.connect(analyser);
|
source.connect(analyser);
|
||||||
|
|||||||
@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mod
|
|||||||
|
|
||||||
To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
|
To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
|
||||||
|
|
||||||
|
The input audio must be mu-law encoded with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz.
|
||||||
|
The output audio will also be mu-law encoded, and its sample rate will be equal to the output_sample_rate of the handler. By default it is 48 kHz.
|
||||||
|
|
||||||
\`\`\`javascript
|
\`\`\`javascript
|
||||||
// Setup audio context and stream
|
// Setup audio context and stream
|
||||||
const audioContext = new AudioContext();
|
const audioContext = new AudioContext();
|
||||||
@@ -441,6 +444,40 @@ ws.onopen = () => {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ws.onmessage = (event) => {
|
||||||
|
const data = JSON.parse(event.data);
|
||||||
|
if (data?.type === "send_input") {
|
||||||
|
fetch('/input_hook', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
// Send additional input data here
|
||||||
|
body: JSON.stringify({ webrtc_id: wsId })
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (data.event === "media") {
|
||||||
|
// Process received audio
|
||||||
|
const audioData = atob(data.media.payload);
|
||||||
|
const mulawData = new Uint8Array(audioData.length);
|
||||||
|
for (let i = 0; i < audioData.length; i++) {
|
||||||
|
mulawData[i] = audioData.charCodeAt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert mu-law to linear PCM
|
||||||
|
const linearData = alawmulaw.mulaw.decode(mulawData);
|
||||||
|
|
||||||
|
// Create an AudioBuffer
|
||||||
|
const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
|
||||||
|
const channelData = audioBuffer.getChannelData(0);
|
||||||
|
|
||||||
|
// Fill the buffer with the decoded data
|
||||||
|
for (let i = 0; i < linearData.length; i++) {
|
||||||
|
channelData[i] = linearData[i] / 32768.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do something with Audio Buffer
|
||||||
|
}
|
||||||
|
};
|
||||||
\`\`\`
|
\`\`\`
|
||||||
{{?}}
|
{{?}}
|
||||||
`);
|
`);
|
||||||
|
|||||||
Reference in New Issue
Block a user