Mirror of https://github.com/HumanAIGC-Engineering/gradio-webrtc.git (synced 2026-02-04 17:39:23 +08:00)
@@ -43,7 +43,9 @@ def convert_to_mulaw(
     audio_data = audio_to_float32(audio_data)
 
     if original_rate != target_rate:
-        audio_data = librosa.resample(audio_data, orig_sr=original_rate, target_sr=8000)
+        audio_data = librosa.resample(
+            audio_data, orig_sr=original_rate, target_sr=target_rate
+        )
 
     audio_data = audio_to_int16(audio_data)
 
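The fix above replaces a hardcoded `target_sr=8000` with the `target_rate` argument, so the helper can now resample to whatever rate the caller asks for. For context, a minimal self-contained sketch of what the fixed helper plausibly looks like; the `audio_to_float32`/`audio_to_int16` helpers are approximated inline, and the final mu-law step via the stdlib `audioop` module is an assumption:

```python
import audioop  # stdlib; deprecated since Python 3.11, removed in 3.13

import librosa
import numpy as np


def convert_to_mulaw(
    audio_data: np.ndarray, original_rate: int, target_rate: int = 8_000
) -> bytes:
    """Resample int16 PCM to target_rate, then mu-law encode it."""
    # Approximation of audio_to_float32: int16 -> float32 in [-1, 1]
    audio_data = audio_data.astype(np.float32) / 32768.0
    if original_rate != target_rate:
        # The fix: resample to target_rate instead of a hardcoded 8000
        audio_data = librosa.resample(
            audio_data, orig_sr=original_rate, target_sr=target_rate
        )
    # Approximation of audio_to_int16: float32 [-1, 1] -> int16
    audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    return audioop.lin2ulaw(audio_int16.tobytes(), 2)  # width=2 -> 16-bit samples
```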
@@ -128,7 +130,10 @@ class WebSocketHandler:
                 audioop.ulaw2lin(audio_payload, 2), dtype=np.int16
             )
 
-            if self.stream_handler.input_sample_rate != 8000:
+            if (
+                self.stream_handler.phone_mode
+                and self.stream_handler.input_sample_rate != 8000
+            ):
                 audio_array = audio_to_float32(audio_array)
                 audio_array = librosa.resample(
                     audio_array,
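The new condition resamples incoming audio only in phone mode, where mu-law frames arrive at the telephony-standard 8 kHz and the handler may expect a different rate. A sketch of that receive path as a standalone function, with hypothetical names mirroring the handler code above:

```python
import audioop
import base64

import librosa
import numpy as np


def decode_media_payload(
    payload_b64: str, handler_rate: int, phone_mode: bool
) -> np.ndarray:
    """Decode a base64 mu-law "media" payload into int16 PCM at handler_rate."""
    audio_array = np.frombuffer(
        audioop.ulaw2lin(base64.b64decode(payload_b64), 2), dtype=np.int16
    )
    # Telephone audio arrives at 8 kHz; resample only in phone mode and only
    # when the handler expects a different rate (the condition fixed above).
    if phone_mode and handler_rate != 8_000:
        audio_float = audio_array.astype(np.float32) / 32768.0
        audio_float = librosa.resample(
            audio_float, orig_sr=8_000, target_sr=handler_rate
        )
        audio_array = (np.clip(audio_float, -1.0, 1.0) * 32767).astype(np.int16)
    return audio_array
```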
@@ -269,14 +274,15 @@ class WebSocketHandler:
 
                 if not isinstance(frame, tuple):
                     continue
 
                 target_rate = (
-                    self.stream_handler.output_sample_rate
-                    if not self.stream_handler.phone_mode
-                    else 8000
+                    8_000
+                    if self.stream_handler.phone_mode
+                    else self.stream_handler.output_sample_rate
                 )
                 mulaw_audio = convert_to_mulaw(
-                    frame[1], frame[0], target_rate=target_rate
+                    frame[1],
+                    frame[0],
+                    target_rate=target_rate,
                 )
                 audio_payload = base64.b64encode(mulaw_audio).decode("utf-8")
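On the send side the handler now targets 8 kHz only in phone mode and the handler's own output_sample_rate otherwise, then mu-law encodes and base64-wraps the frame. A sketch of the final framing step, using the "media" event shape the client example at the end of this document reads (any sibling fields, such as a stream id, are an assumption and omitted here):

```python
import base64
import json


def build_media_event(mulaw_audio: bytes) -> str:
    """Wrap mu-law bytes in the "media" event shape the client example expects."""
    payload = base64.b64encode(mulaw_audio).decode("utf-8")
    return json.dumps({"event": "media", "media": {"payload": payload}})
```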
@@ -83,7 +83,7 @@ chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
     mode="send-receive",
-    handler=ReplyOnPause(response),
+    handler=ReplyOnPause(response, input_sample_rate=24_000, output_sample_rate=24_000),
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
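Pinning both rates to 24_000 here matches the 24 kHz AudioContext the docs example now creates (see the @@ -449,7 +422,7 @@ hunk below), so the browser and the handler agree on a sample rate without extra client-side resampling.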
@@ -390,35 +390,8 @@
     rttValues: []
 };
 
-// Load mu-law library
-// Add load promise to track when the script is ready
-
-function resample(audioData, fromSampleRate, toSampleRate) {
-    const ratio = fromSampleRate / toSampleRate;
-    const newLength = Math.round(audioData.length / ratio);
-    const result = new Float32Array(newLength);
-
-    for (let i = 0; i < newLength; i++) {
-        const position = i * ratio;
-        const index = Math.floor(position);
-        const fraction = position - index;
-
-        if (index + 1 < audioData.length) {
-            result[i] = audioData[index] * (1 - fraction) + audioData[index + 1] * fraction;
-        } else {
-            result[i] = audioData[index];
-        }
-    }
-    return result;
-}
-
-function convertToMulaw(audioData, sampleRate) {
-    // Resample to 8000 Hz if needed
-    if (sampleRate !== 8000) {
-        audioData = resample(audioData, sampleRate, 8000);
-    }
-
-    // Convert float32 [-1,1] to int16 [-32768,32767]
-    const int16Data = new Int16Array(audioData.length);
@@ -449,7 +422,7 @@
     wsMetrics.startTime = performance.now();
 
     // Create audio context and analyser for visualization
-    const audioContext = new AudioContext();
+    const audioContext = new AudioContext({ sampleRate: 24000 });
     const analyser = audioContext.createAnalyser();
     const source = audioContext.createMediaStreamSource(stream);
     source.connect(analyser);
@@ -403,6 +403,9 @@ WebSocket connections are currently only supported for audio in send-receive mode
 
 To connect to the server via WebSocket, you'll need to establish a WebSocket connection and handle audio processing. The code below assumes there is an HTML audio element for output playback.
 
+The input audio must be mu-law encoded, with a sample rate equal to the input_sample_rate of the handler you are connecting to. By default it is 48 kHz.
+The output audio will also be mu-law encoded, and its sample rate will be equal to the output_sample_rate of the handler. By default it is 48 kHz.
+
 \`\`\`javascript
 // Setup audio context and stream
 const audioContext = new AudioContext();
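The same send path can be exercised from Python as well. A hedged sketch using the `websockets` package: only the "media" frame shape is taken from the examples in this document, while the URL and any handshake messages are assumptions and elided here.

```python
import audioop
import base64
import json

import websockets  # pip install websockets


async def stream_pcm_chunks(chunks, url="ws://localhost:8000/websocket/offer"):
    """Send int16 PCM chunks (already at the handler's input_sample_rate)
    as base64 mu-law "media" events. Drive with asyncio.run(...)."""
    async with websockets.connect(url) as ws:  # url is an assumption
        for chunk in chunks:  # each chunk: bytes of 16-bit PCM
            payload = base64.b64encode(audioop.lin2ulaw(chunk, 2)).decode("utf-8")
            await ws.send(
                json.dumps({"event": "media", "media": {"payload": payload}})
            )
```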
@@ -441,6 +444,40 @@ ws.onopen = () => {
         }
     };
 };
+
+ws.onmessage = (event) => {
+    const data = JSON.parse(event.data);
+    if (data?.type === "send_input") {
+        fetch('/input_hook', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            // Send additional input data here
+            body: JSON.stringify({ webrtc_id: wsId })
+        });
+    }
+    if (data.event === "media") {
+        // Process received audio
+        const audioData = atob(data.media.payload);
+        const mulawData = new Uint8Array(audioData.length);
+        for (let i = 0; i < audioData.length; i++) {
+            mulawData[i] = audioData.charCodeAt(i);
+        }
+
+        // Convert mu-law to linear PCM
+        const linearData = alawmulaw.mulaw.decode(mulawData);
+
+        // Create an AudioBuffer
+        const audioBuffer = outputContext.createBuffer(1, linearData.length, sampleRate);
+        const channelData = audioBuffer.getChannelData(0);
+
+        // Fill the buffer with the decoded data
+        for (let i = 0; i < linearData.length; i++) {
+            channelData[i] = linearData[i] / 32768.0;
+        }
+
+        // Do something with Audio Buffer
+    }
+};
 \`\`\`
 {{?}}
 `);