mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
Fix issue when the audio stream mixes sample rates and numpy array data types (#188)
* Fix code
* Fix
* keep same
This commit is contained in:
@@ -11,6 +11,7 @@ from contextvars import ContextVar
|
|||||||
from typing import Any, Callable, Literal, Protocol, TypedDict, cast
|
from typing import Any, Callable, Literal, Protocol, TypedDict, cast
|
||||||
|
|
||||||
import av
|
import av
|
||||||
|
import librosa
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy.typing import NDArray
|
from numpy.typing import NDArray
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
@@ -134,7 +135,7 @@ async def player_worker_decode(
|
|||||||
rate=sample_rate,
|
rate=sample_rate,
|
||||||
frame_size=frame_size,
|
frame_size=frame_size,
|
||||||
)
|
)
|
||||||
|
first_sample_rate = None
|
||||||
while not thread_quit.is_set():
|
while not thread_quit.is_set():
|
||||||
try:
|
try:
|
||||||
# Get next frame
|
# Get next frame
|
||||||
@@ -174,19 +175,29 @@ async def player_worker_decode(
|
|||||||
layout, # type: ignore
|
layout, # type: ignore
|
||||||
)
|
)
|
||||||
format = "s16" if audio_array.dtype == "int16" else "fltp" # type: ignore
|
format = "s16" if audio_array.dtype == "int16" else "fltp" # type: ignore
|
||||||
|
if first_sample_rate is None:
|
||||||
|
first_sample_rate = sample_rate
|
||||||
|
|
||||||
|
if format == "s16":
|
||||||
|
audio_array = audio_to_float32((sample_rate, audio_array))
|
||||||
|
|
||||||
|
if first_sample_rate != sample_rate:
|
||||||
|
audio_array = librosa.resample(
|
||||||
|
audio_array, target_sr=first_sample_rate, orig_sr=sample_rate
|
||||||
|
)
|
||||||
|
|
||||||
if audio_array.ndim == 1:
|
if audio_array.ndim == 1:
|
||||||
audio_array = audio_array.reshape(1, -1)
|
audio_array = audio_array.reshape(1, -1)
|
||||||
|
|
||||||
# Convert to audio frame and resample
|
# Convert to audio frame and resample
|
||||||
|
|
||||||
# This runs in the same timeout context
|
# This runs in the same timeout context
|
||||||
frame = av.AudioFrame.from_ndarray( # type: ignore
|
frame = av.AudioFrame.from_ndarray( # type: ignore
|
||||||
audio_array, # type: ignore
|
audio_array, # type: ignore
|
||||||
format=format,
|
format="fltp",
|
||||||
layout=layout, # type: ignore
|
layout=layout, # type: ignore
|
||||||
)
|
)
|
||||||
frame.sample_rate = sample_rate
|
frame.sample_rate = first_sample_rate
|
||||||
|
|
||||||
for processed_frame in audio_resampler.resample(frame):
|
for processed_frame in audio_resampler.resample(frame):
|
||||||
processed_frame.pts = audio_samples
|
processed_frame.pts = audio_samples
|
||||||
processed_frame.time_base = audio_time_base
|
processed_frame.time_base = audio_time_base
|
||||||
|
|||||||
@@ -2,11 +2,10 @@ import asyncio
|
|||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sounddevice as sd
|
|
||||||
|
|
||||||
|
import aiohttp # pip install aiohttp
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import aiohttp # pip install aiohttp
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.responses import HTMLResponse, StreamingResponse
|
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ from fastrtc import get_tts_model, Stream, ReplyOnPause
|
|||||||
tts_client = get_tts_model()
|
tts_client = get_tts_model()
|
||||||
|
|
||||||
|
|
||||||
def detection(audio: tuple[int, np.ndarray]):
|
def echo(audio: tuple[int, np.ndarray]):
|
||||||
# Implement any iterator that yields audio
|
# Implement any iterator that yields audio
|
||||||
# See "LLM Voice Chat" for a more complete example
|
# See "LLM Voice Chat" for a more complete example
|
||||||
yield audio
|
yield audio
|
||||||
@@ -98,7 +98,7 @@ def startup():
|
|||||||
|
|
||||||
|
|
||||||
stream = Stream(
|
stream = Stream(
|
||||||
handler=ReplyOnPause(detection, startup_fn=startup),
|
handler=ReplyOnPause(echo, startup_fn=startup),
|
||||||
modality="audio",
|
modality="audio",
|
||||||
mode="send-receive",
|
mode="send-receive",
|
||||||
ui_args={"title": "Echo Audio"},
|
ui_args={"title": "Echo Audio"},
|
||||||
|
|||||||
Reference in New Issue
Block a user