diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..f58f8c4
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,28 @@
+name: docs
+on:
+  push:
+    branches:
+      - main
+permissions:
+  contents: write
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure Git Credentials
+        run: |
+          git config user.name github-actions[bot]
+          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+      - uses: actions/cache@v4
+        with:
+          key: mkdocs-material-${{ env.cache_id }}
+          path: .cache
+          restore-keys: |
+            mkdocs-material-
+      - run: pip install mkdocs-material
+      - run: mkdocs gh-deploy --force
\ No newline at end of file
diff --git a/backend/gradio_webrtc/__init__.py b/backend/gradio_webrtc/__init__.py
index f99f83b..806216e 100644
--- a/backend/gradio_webrtc/__init__.py
+++ b/backend/gradio_webrtc/__init__.py
@@ -1,5 +1,6 @@
-from .reply_on_pause import ReplyOnPause
+from .reply_on_pause import ReplyOnPause, AlgoOptions, SileroVadOptions
 from .utils import AdditionalOutputs
 from .webrtc import StreamHandler, WebRTC
 
-__all__ = ["AdditionalOutputs", "ReplyOnPause", "StreamHandler", "WebRTC"]
+__all__ = ["AlgoOptions", "AdditionalOutputs", "ReplyOnPause",
+           "SileroVadOptions", "StreamHandler", "WebRTC"]
diff --git a/docs/additional-outputs.md b/docs/additional-outputs.md
new file mode 100644
index 0000000..e69de29
diff --git a/docs/advanced-configuration.md b/docs/advanced-configuration.md
new file mode 100644
index 0000000..d021b3e
--- /dev/null
+++ b/docs/advanced-configuration.md
@@ -0,0 +1,73 @@
+## Track Constraints
+
+You can specify the `track_constraints` parameter to control how the data is streamed to the server. The full documentation on track constraints is [here](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints#constraints).
+
+For example, you can control the size of the frames captured from the webcam like so:
+
+```python
+track_constraints = {
+    "width": {"ideal": 500},
+    "height": {"ideal": 500},
+    "frameRate": {"ideal": 30},
+}
+webrtc = WebRTC(track_constraints=track_constraints,
+                modality="video",
+                mode="send-receive")
+```
+
+## The RTC Configuration
+
+You can configure how the connection is created on the client by passing an `rtc_configuration` parameter to the `WebRTC` component constructor.
+See the list of available arguments [here](https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection/RTCPeerConnection#configuration).
+
+When deploying on a remote server, an `rtc_configuration` parameter must be passed in. See [Deployment](/deployment).
+
+## Reply on Pause Voice-Activity-Detection
+
+The `ReplyOnPause` class runs a Voice Activity Detection (VAD) algorithm to determine when a user has stopped speaking.
+
+1. First, the algorithm determines when the user has started speaking.
+2. Then it groups the audio into chunks.
+3. For each chunk, it measures the length of human speech in the chunk.
+4. If the length of human speech is below a threshold, a pause is detected.
+
+The following parameters control this algorithm:
+
+```python
+import gradio as gr
+from gradio_webrtc import AlgoOptions, ReplyOnPause, WebRTC
+
+algo_options = AlgoOptions(audio_chunk_duration=0.6, # (1)
+                           started_talking_threshold=0.2, # (2)
+                           speech_threshold=0.1, # (3)
+                           )
+
+with gr.Blocks() as demo:
+    audio = WebRTC(...)
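+    # The package also exports `SileroVadOptions` (see the __init__.py change above),
+    # which tunes the underlying Silero VAD model itself, e.g. the speech probability
+    # threshold or the minimum silence duration. The commented sketch below is only an
+    # assumption: the field names and the `model_options` keyword are not documented
+    # here, so check the `SileroVadOptions` signature in your installed version.
+    #
+    # from gradio_webrtc import SileroVadOptions
+    # vad_options = SileroVadOptions(threshold=0.5, min_silence_duration_ms=2000)
+    # handler = ReplyOnPause(..., algo_options=algo_options, model_options=vad_options)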
+    audio.stream(ReplyOnPause(..., algo_options=algo_options)
+                 )
+
+demo.launch()
+```
+
+1. This is the length (in seconds) of audio chunks.
+2. If the chunk has more than 0.2 seconds of speech, the user started talking.
+3. If, after the user started speaking, there is a chunk with less than 0.1 seconds of speech, the user stopped speaking.
+
+## Stream Handler Output Audio
+
+You can configure the output audio chunk size of `ReplyOnPause` (and any `StreamHandler`)
+with the `output_sample_rate` and `output_frame_size` parameters.
+
+The following code (which uses the default values of these parameters) states that each output chunk will be a frame of 960 samples at a sample rate of 24,000 Hz, so each chunk corresponds to 0.04 seconds of audio.
+
+```python
+import gradio as gr
+from gradio_webrtc import ReplyOnPause, WebRTC
+
+with gr.Blocks() as demo:
+    audio = WebRTC(...)
+    audio.stream(ReplyOnPause(..., output_sample_rate=24000, output_frame_size=960)
+                 )
+
+demo.launch()
+```
\ No newline at end of file
diff --git a/docs/bolt.svg b/docs/bolt.svg
new file mode 100644
index 0000000..f3a0046
--- /dev/null
+++ b/docs/bolt.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/cookbook.md b/docs/cookbook.md
new file mode 100644
index 0000000..4b0ee7f
--- /dev/null
+++ b/docs/cookbook.md
@@ -0,0 +1,87 @@
+<div class="grid cards" markdown>
+
+- :speaking_head:{ .lg .middle } __Audio Input/Output with mini-omni2__
+
+    ---
+
+    Build a GPT-4o like experience with mini-omni2, an audio-native LLM.
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/mini-omni2-webrtc)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/mini-omni2-webrtc/blob/main/app.py)
+
+- :speaking_head:{ .lg .middle } __Talk to Claude__
+
+    ---
+
+    Use the Anthropic and Play.Ht APIs to have an audio conversation with Claude.
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-claude)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)
+
+- :speaking_head:{ .lg .middle } __Talk to Llama 3.2 3b__
+
+    ---
+
+    Use the Lepton API to make Llama 3.2 talk back to you!
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/llama-3.2-3b-voice-webrtc)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/llama-3.2-3b-voice-webrtc/blob/main/app.py)
+
+- :speaking_head:{ .lg .middle } __Talk to Ultravox__
+
+    ---
+
+    Talk to Fixie.AI's audio-native Ultravox LLM with the transformers library.
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-ultravox)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-ultravox/blob/main/app.py)
+
+- :robot:{ .lg .middle } __Talk to Qwen2-Audio__
+
+    ---
+
+    Qwen2-Audio is a SOTA audio-to-text LLM developed by Alibaba.
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-qwen-webrtc)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-qwen-webrtc/blob/main/app.py)
+
+- :camera:{ .lg .middle } __Yolov10 Object Detection__
+
+    ---
+
+    Run the Yolov10 model on a user webcam stream in real time!
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n/blob/main/app.py)
+
+- :camera:{ .lg .middle } __Video Object Detection with RT-DETR__
+
+    ---
+
+    Upload a video and stream out frames with detected objects (powered by the RT-DETR model).
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc/blob/main/app.py)
+
+- :speaker:{ .lg .middle } __Text-to-Speech with Parler__
+
+    ---
+
+    Stream out audio generated by Parler TTS!
+
+    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc)
+
+    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc/blob/main/app.py)
+
+</div>
\ No newline at end of file
diff --git a/docs/deployment.md b/docs/deployment.md
new file mode 100644
index 0000000..f3ac7ee
--- /dev/null
+++ b/docs/deployment.md
@@ -0,0 +1,24 @@
+When deploying in a cloud environment (like Hugging Face Spaces, EC2, etc.), you need to set up a TURN server to relay the WebRTC traffic.
+The easiest way to do this is to use a service like Twilio.
+
+```python
+from twilio.rest import Client
+import os
+
+import gradio as gr
+from gradio_webrtc import WebRTC
+
+account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+
+client = Client(account_sid, auth_token)
+
+token = client.tokens.create()
+
+rtc_configuration = {
+    "iceServers": token.ice_servers,
+    "iceTransportPolicy": "relay",
+}
+
+with gr.Blocks() as demo:
+    ...
+    rtc = WebRTC(rtc_configuration=rtc_configuration, ...)
+    ...
+```
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 0000000..3323b45
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,3 @@
+## Demo does not work when deploying to the cloud
+
+Make sure you are using a TURN server. See [deployment](/deployment).
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..b5269b0
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,28 @@
+<h1 style='text-align: center'>
+Gradio WebRTC ⚡️
+</h1>
+
+<h3 style='text-align: center'>
+Stream video and audio in real time with Gradio using WebRTC.
+</h3>
+
+## Installation
+
+```bash
+pip install gradio_webrtc
+```
+
+To use built-in pause detection (see [conversational ai](#conversational-ai)), install the `vad` extra:
+
+```bash
+pip install gradio_webrtc[vad]
+```
+
+## Examples
+1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
+2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
+3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
+4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
\ No newline at end of file
diff --git a/docs/user-guide.md b/docs/user-guide.md
new file mode 100644
index 0000000..a07169b
--- /dev/null
+++ b/docs/user-guide.md
@@ -0,0 +1,291 @@
+# User Guide
+
+To get started with WebRTC streams, all that's needed is to import the `WebRTC` component from this package and implement its `stream` event.
+
+This page will show how to do so with simple code examples.
+For complete implementations of common tasks, see the [cookbook](/cookbook).
+
+## Audio Streaming
+
+### Reply on Pause
+
+Typically, you want to run an AI model that generates audio when the user has stopped speaking. This can be done by wrapping a python generator with the `ReplyOnPause` class
+and passing it to the `stream` event of the `WebRTC` component.
+
+=== "Code"
+    ``` py title="ReplyOnPause"
+    import gradio as gr
+    import numpy as np
+    from gradio_webrtc import WebRTC, ReplyOnPause
+
+    def response(audio: tuple[int, np.ndarray]): # (1)
+        """This function must yield audio frames"""
+        ...
+        for numpy_array in generated_audio:
+            yield (sampling_rate, numpy_array, "mono") # (2)
+
+
+    with gr.Blocks() as demo:
+        gr.HTML(
+        """
+        <h1 style='text-align: center'>
+        Chat (Powered by WebRTC ⚡️)
+        </h1>
+        """
+        )
+        with gr.Column():
+            with gr.Group():
+                audio = WebRTC(
+                    mode="send-receive", # (3)
+                    modality="audio",
+                )
+            audio.stream(fn=ReplyOnPause(response),
+                         inputs=[audio], outputs=[audio], # (4)
+                         time_limit=60) # (5)
+
+    demo.launch()
+    ```
+
+    1. The python generator will receive the **entire** audio up until the user stopped. It will be a tuple of the form (sampling_rate, numpy array of audio). The array will have a shape of (1, num_samples). You can also pass in additional input components.
+
+    2. The generator must yield audio chunks as a tuple of (sampling_rate, numpy audio array). Each numpy audio array must have a shape of (1, num_samples).
+
+    3. The `mode` and `modality` arguments must be set to `"send-receive"` and `"audio"`.
+
+    4. The `WebRTC` component must be the first input and output component.
+
+    5. Set a `time_limit` to control how long a conversation will last. If the `concurrency_count` is 1 (default), only one conversation will be handled at a time.
+=== "Notes"
+    1. The python generator will receive the **entire** audio up until the user stopped. It will be a tuple of the form (sampling_rate, numpy array of audio). The array will have a shape of (1, num_samples). You can also pass in additional input components.
+
+    2. The generator must yield audio chunks as a tuple of (sampling_rate, numpy audio array). Each numpy audio array must have a shape of (1, num_samples).
+
+    3. The `mode` and `modality` arguments must be set to `"send-receive"` and `"audio"`.
+
+    4. The `WebRTC` component must be the first input and output component.
+
+    5. Set a `time_limit` to control how long a conversation will last. If the `concurrency_count` is 1 (default), only one conversation will be handled at a time.
+
+### Stream Handler
+
+`ReplyOnPause` is an implementation of a `StreamHandler`. The `StreamHandler` is a low-level
+abstraction that gives you arbitrary control over how the input audio stream and output audio stream are created. The following example echoes back the user audio.
+
+=== "Code"
+    ``` py title="Stream Handler"
+    import gradio as gr
+    import numpy as np
+    from gradio_webrtc import WebRTC, StreamHandler
+    from queue import Queue
+
+    class EchoHandler(StreamHandler):
+        def __init__(self) -> None:
+            super().__init__()
+            self.queue = Queue()
+
+        def receive(self, frame: tuple[int, np.ndarray]) -> None: # (1)
+            self.queue.put(frame)
+
+        def emit(self) -> tuple[int, np.ndarray]: # (2)
+            return self.queue.get()
+
+        def copy(self) -> StreamHandler:
+            return EchoHandler()
+
+
+    with gr.Blocks() as demo:
+        with gr.Column():
+            with gr.Group():
+                audio = WebRTC(
+                    mode="send-receive",
+                    modality="audio",
+                )
+
+            audio.stream(fn=EchoHandler(),
+                         inputs=[audio], outputs=[audio],
+                         time_limit=15)
+
+    demo.launch()
+    ```
+
+    1. The `StreamHandler` class implements three methods: `receive`, `emit` and `copy`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client. The `copy` method is called at the beginning of the stream to ensure each user has a unique stream handler.
+    2. The `emit` method SHOULD NOT block. If a frame is not ready to be sent, the method should return `None`.
+
+=== "Notes"
+    1. The `StreamHandler` class implements three methods: `receive`, `emit` and `copy`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client.
+    The `copy` method is called at the beginning of the stream to ensure each user has a unique stream handler.
+    2. The `emit` method SHOULD NOT block. If a frame is not ready to be sent, the method should return `None`.
+
+### Server-To-Client Only
+
+To stream only from the server to the client, implement a python generator and pass it to the component's `stream` event. The stream event must also specify a `trigger` corresponding to a UI interaction that starts the stream. In this case, it's a button click.
+
+=== "Code"
+
+    ``` py title="Server-To-Client"
+    import gradio as gr
+    import numpy as np
+    from gradio_webrtc import WebRTC
+    from pydub import AudioSegment
+
+    def generation(num_steps):
+        for _ in range(num_steps):
+            segment = AudioSegment.from_file("audio_file.wav")
+            array = np.array(segment.get_array_of_samples()).reshape(1, -1)
+            yield (segment.frame_rate, array)
+
+    with gr.Blocks() as demo:
+        audio = WebRTC(label="Stream", mode="receive", # (1)
+                       modality="audio")
+        num_steps = gr.Slider(label="Number of Steps", minimum=1,
+                              maximum=10, step=1, value=5)
+        button = gr.Button("Generate")
+
+        audio.stream(
+            fn=generation, inputs=[num_steps], outputs=[audio],
+            trigger=button.click # (2)
+        )
+
+    demo.launch()
+    ```
+
+    1. Set `mode="receive"` to only receive audio from the server.
+    2. The `stream` event must take a `trigger` that corresponds to the gradio event that starts the stream. In this case, it's the button click.
+=== "Notes"
+    1. Set `mode="receive"` to only receive audio from the server.
+    2. The `stream` event must take a `trigger` that corresponds to the gradio event that starts the stream. In this case, it's the button click.
+
+## Video Streaming
+
+### Input/Output Streaming
+Set up a video Input/Output stream to continuously receive webcam frames from the user and run an arbitrary python function to return a modified frame.
+
+=== "Code"
+
+    ``` py title="Input/Output Streaming"
+    import gradio as gr
+    from gradio_webrtc import WebRTC
+
+
+    def detection(image, conf_threshold=0.3): # (1)
+        # ... your detection code here ...
+        return modified_frame # (2)
+
+
+    with gr.Blocks() as demo:
+        image = WebRTC(label="Stream", mode="send-receive", modality="video") # (3)
+        conf_threshold = gr.Slider(
+            label="Confidence Threshold",
+            minimum=0.0,
+            maximum=1.0,
+            step=0.05,
+            value=0.30,
+        )
+        image.stream(
+            fn=detection,
+            inputs=[image, conf_threshold], # (4)
+            outputs=[image], time_limit=10
+        )
+
+    if __name__ == "__main__":
+        demo.launch()
+    ```
+
+    1. The webcam frame will be represented as a numpy array of shape (height, width, RGB).
+    2. The function must return a numpy array. It can take arbitrary values from other components.
+    3. Set the `modality="video"` and `mode="send-receive"`.
+    4. The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.
+=== "Notes"
+    1. The webcam frame will be represented as a numpy array of shape (height, width, RGB).
+    2. The function must return a numpy array. It can take arbitrary values from other components.
+    3. Set the `modality="video"` and `mode="send-receive"`.
+    4. The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.
+
+### Server-to-Client Only
+
+Set up a server-to-client stream to stream video from an arbitrary user interaction.
+
+=== "Code"
+
+    ``` py title="Server-To-Client"
+    import gradio as gr
+    from gradio_webrtc import WebRTC
+    import cv2
+
+    def generation():
+        url = "https://download.tsi.telecom-paristech.fr/gpac/dataset/dash/uhd/mux_sources/hevcds_720p30_2M.mp4"
+        cap = cv2.VideoCapture(url)
+        iterating = True
+        while iterating:
+            iterating, frame = cap.read()
+            yield frame # (1)
+
+    with gr.Blocks() as demo:
+        output_video = WebRTC(label="Video Stream", mode="receive", # (2)
+                              modality="video")
+        button = gr.Button("Start", variant="primary")
+        output_video.stream(
+            fn=generation, inputs=None, outputs=[output_video],
+            trigger=button.click # (3)
+        )
+    demo.launch()
+    ```
+
+    1. The `stream` event's `fn` parameter is a generator function that yields the next frame from the video as a **numpy array**.
+    2. Set `mode="receive"` to only receive video from the server.
+    3. The `trigger` parameter specifies the gradio event that will trigger the stream. In this case, the button click event.
+=== "Notes"
+    1. The `stream` event's `fn` parameter is a generator function that yields the next frame from the video as a **numpy array**.
+    2. Set `mode="receive"` to only receive video from the server.
+    3. The `trigger` parameter specifies the gradio event that will trigger the stream. In this case, the button click event.
+
+
+## Additional Outputs
+
+In order to modify other components from within the WebRTC stream, you must yield an instance of `AdditionalOutputs` and add an `on_additional_outputs` event to the `WebRTC` component.
+
+This is common for displaying a multimodal text/audio conversation in a Chatbot UI.
+
+=== "Code"
+
+    ``` py title="Additional Outputs"
+    import gradio as gr
+    import numpy as np
+    from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC
+
+    def transcribe(audio: tuple[int, np.ndarray],
+                   transformers_convo: list[dict],
+                   gradio_convo: list[dict]):
+        # ... generate text response ...
+        response = model.generate(**inputs, max_length=256)
+        transformers_convo.append({"role": "assistant", "content": response})
+        gradio_convo.append({"role": "assistant", "content": response})
+        yield AdditionalOutputs(transformers_convo, gradio_convo) # (1)
+
+
+    with gr.Blocks() as demo:
+        gr.HTML(
+        """
+        <h1 style='text-align: center'>
+        Talk to Qwen2Audio (Powered by WebRTC ⚡️)
+        </h1>
+        """
+        )
+        transformers_convo = gr.State(value=[])
+        with gr.Row():
+            with gr.Column():
+                audio = WebRTC(
+                    label="Stream",
+                    mode="send", # (2)
+                    modality="audio",
+                )
+            with gr.Column():
+                transcript = gr.Chatbot(label="transcript", type="messages")
+
+        audio.stream(ReplyOnPause(transcribe),
+                     inputs=[audio, transformers_convo, transcript],
+                     outputs=[audio], time_limit=90)
+        audio.on_additional_outputs(lambda s, a: (s, a), # (3)
+                                    outputs=[transformers_convo, transcript],
+                                    queue=False, show_progress="hidden")
+    demo.launch()
+    ```
+
+    1. Pass your data to `AdditionalOutputs` and yield it.
+    2. In this case, no audio is being returned, so we set `mode="send"`. However, if we set `mode="send-receive"`, we could also yield generated audio and `AdditionalOutputs`.
+    3. The `on_additional_outputs` event does not take `inputs`. It's common practice to not run this event on the queue since it is just a quick UI update.
+=== "Notes"
+    1. Pass your data to `AdditionalOutputs` and yield it.
+    2. In this case, no audio is being returned, so we set `mode="send"`. However, if we set `mode="send-receive"`, we could also yield generated audio and `AdditionalOutputs`.
+    3. The `on_additional_outputs` event does not take `inputs`. It's common practice to not run this event on the queue since it is just a quick UI update.
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..80ea79c
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,36 @@
+site_name: Gradio WebRTC
+site_url: https://sitename.example
+repo_name: gradio-webrtc
+repo_url: https://github.com/freddyaboulton/gradio-webrtc
+theme:
+  name: material
+  palette:
+    scheme: slate
+    primary: black
+    accent: yellow
+  features:
+    - content.code.copy
+    - content.code.annotate
+  logo: bolt.svg
+  favicon: bolt.svg
+nav:
+  - Home: index.md
+  - User Guide: user-guide.md
+  - Cookbook: cookbook.md
+  - Deployment: deployment.md
+  - Advanced Configuration: advanced-configuration.md
+markdown_extensions:
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
+  - pymdownx.tabbed:
+      alternate_style: true
+  - attr_list
+  - md_in_html
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 26a2707..a31a9fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "gradio_webrtc"
-version = "0.0.12"
+version = "0.0.13"
 description = "Stream images in realtime with webrtc"
 readme = "README.md"
 license = "apache-2.0"