mirror of https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-04 17:39:23 +08:00
68 lines
1.9 KiB
Python
import logging
import tempfile

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC
from openai import OpenAI
from pydub import AudioSegment

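# Pull environment variables from a local .env file; the OpenAI client below
# expects OPENAI_API_KEY to be set there or in the shell environment.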
load_dotenv()

# Configure the root logger to WARNING to suppress debug messages from other libraries
logging.basicConfig(level=logging.WARNING)

# Create a console handler that accepts DEBUG-level records
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# Create a formatter
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)

# Enable DEBUG output for the gradio_webrtc logger specifically
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)

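# One shared client handles all transcription requests.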
client = OpenAI()

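# ReplyOnPause invokes this handler whenever the speaker pauses. `audio` is
# the (sample_rate, samples) tuple delivered by the WebRTC stream, and
# `transcript` is the Chatbot's current message list.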
def transcribe(audio: tuple[int, np.ndarray], transcript: list[dict]):
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=1,
    )

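    # The transcription endpoint expects a file upload, so export the raw PCM
    # to a temporary mp3 before sending it to whisper-1.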
    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
        segment.export(temp_audio.name, format="mp3")
        with open(temp_audio.name, "rb") as audio_file:
            next_chunk = client.audio.transcriptions.create(
                model="whisper-1", file=audio_file
            ).text
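        # Record the utterance in Chatbot "messages" format and push the
        # updated transcript to the UI via AdditionalOutputs (no audio reply).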
        transcript.append({"role": "user", "content": next_chunk})
    yield AdditionalOutputs(transcript)

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")

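    # mode="send" streams microphone audio from the browser to the server
    # only; time_limit=30 caps each streaming session at 30 seconds.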
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transcript],
        outputs=[audio],
        time_limit=30,
    )
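    # Values yielded through AdditionalOutputs land here; the identity lambda
    # forwards the transcript list into the Chatbot.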
    audio.on_additional_outputs(lambda s: s, outputs=transcript)

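# Serves the UI at a local URL (http://127.0.0.1:7860 by default); the browser
# prompts for microphone access when streaming starts.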
if __name__ == "__main__":
    demo.launch()