.github/workflows/docs.yml (vendored, new file, 28 lines)
@@ -0,0 +1,28 @@
name: docs
on:
  push:
    branches:
      - main
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Configure Git Credentials
        run: |
          git config user.name github-actions[bot]
          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v4
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install mkdocs-material
      - run: mkdocs gh-deploy --force
@@ -1,5 +1,6 @@
-from .reply_on_pause import ReplyOnPause
+from .reply_on_pause import ReplyOnPause, AlgoOptions, SileroVadOptions
 from .utils import AdditionalOutputs
 from .webrtc import StreamHandler, WebRTC
 
-__all__ = ["AdditionalOutputs", "ReplyOnPause", "StreamHandler", "WebRTC"]
+__all__ = ["AlgoOptions", "AdditionalOutputs", "ReplyOnPause",
+           "SileroVadOptions", "StreamHandler", "WebRTC"]
docs/additional-outputs.md (new file, 0 lines)

docs/advanced-configuration.md (new file, 73 lines)
@@ -0,0 +1,73 @@
## Track Constraints

You can specify the `track_constraints` parameter to control how the data is streamed to the server. The full documentation on track constraints is [here](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints#constraints).

For example, you can control the size of the frames captured from the webcam like so:

```python
track_constraints = {
    "width": {"ideal": 500},
    "height": {"ideal": 500},
    "frameRate": {"ideal": 30},
}
webrtc = WebRTC(track_constraints=track_constraints,
                modality="video",
                mode="send-receive")
```

## The RTC Configuration

You can configure how the connection is created on the client by passing an `rtc_configuration` parameter to the `WebRTC` component constructor.
See the list of available arguments [here](https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection/RTCPeerConnection#configuration).

When deploying on a remote server, an `rtc_configuration` parameter must be passed in. See [Deployment](/deployment).
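
As a minimal sketch, the dictionary mirrors the browser's `RTCConfiguration`; the example below assumes a public STUN server is reachable and sufficient for your network, which is often not the case behind restrictive NATs (most production deployments will also need TURN, see [Deployment](/deployment)):

```python
# A minimal sketch: point the client at a public STUN server only.
# Behind restrictive NATs you will likely need TURN servers as well.
rtc_configuration = {
    "iceServers": [{"urls": "stun:stun.l.google.com:19302"}],
}

webrtc = WebRTC(rtc_configuration=rtc_configuration,
                modality="audio",
                mode="send-receive")
```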

## Reply on Pause Voice-Activity-Detection

The `ReplyOnPause` class runs a Voice Activity Detection (VAD) algorithm to determine when a user has stopped speaking.

1. First, the algorithm determines when the user has started speaking.
2. Then it groups the audio into chunks.
3. For each chunk, it measures the length of human speech in the chunk.
4. If the length of human speech is below a threshold, a pause is detected.

The following parameters control this algorithm:

```python
import gradio as gr
from gradio_webrtc import AlgoOptions, ReplyOnPause, WebRTC

options = AlgoOptions(audio_chunk_duration=0.6, # (1)
                      started_talking_threshold=0.2, # (2)
                      speech_threshold=0.1, # (3)
                      )

with gr.Blocks() as demo:
    audio = WebRTC(...)
    audio.stream(ReplyOnPause(..., algo_options=options))

demo.launch()
```

1. This is the length (in seconds) of audio chunks.
2. If the chunk has more than 0.2 seconds of speech, the user started talking.
3. If, after the user started speaking, there is a chunk with less than 0.1 seconds of speech, the user stopped speaking.

## Stream Handler Output Audio

You can configure the output audio chunk size of `ReplyOnPause` (and any `StreamHandler`)
with the `output_sample_rate` and `output_frame_size` parameters.

The following code (which uses the default values of these parameters) states that each output chunk will be a frame of 960 samples at a sample rate of 24,000 Hz, so each chunk corresponds to 0.04 seconds of audio.

```python
import gradio as gr
from gradio_webrtc import ReplyOnPause, WebRTC

with gr.Blocks() as demo:
    audio = WebRTC(...)
    audio.stream(ReplyOnPause(..., output_sample_rate=24000, output_frame_size=960))

demo.launch()
```
docs/bolt.svg (new file, 1 line)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960" width="24px" fill="#e8eaed"><path d="m422-232 207-248H469l29-227-185 267h139l-30 208ZM320-80l40-280H160l360-520h80l-40 320h240L400-80h-80Zm151-390Z"/></svg>
docs/cookbook.md (new file, 87 lines)
@@ -0,0 +1,87 @@
<div class="grid cards" markdown>

- :speaking_head:{ .lg .middle } __Audio Input/Output with mini-omni2__

    ---

    Build a GPT-4o like experience with mini-omni2, an audio-native LLM.

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/mini-omni2-webrtc)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/mini-omni2-webrtc/blob/main/app.py)

- :speaking_head:{ .lg .middle } __Talk to Claude__

    ---

    Use the Anthropic and Play.Ht APIs to have an audio conversation with Claude.

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-claude)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-claude/blob/main/app.py)

- :speaking_head:{ .lg .middle } __Talk to Llama 3.2 3b__

    ---

    Use the Lepton API to make Llama 3.2 talk back to you!

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/llama-3.2-3b-voice-webrtc)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/llama-3.2-3b-voice-webrtc/blob/main/app.py)

- :speaking_head:{ .lg .middle } __Talk to Ultravox__

    ---

    Talk to Fixie.AI's audio-native Ultravox LLM with the transformers library.

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-ultravox)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-ultravox/blob/main/app.py)

- :robot:{ .lg .middle } __Talk to Qwen2-Audio__

    ---

    Qwen2-Audio is a SOTA audio-to-text LLM developed by Alibaba.

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/talk-to-qwen-webrtc)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/talk-to-qwen-webrtc/blob/main/app.py)

- :camera:{ .lg .middle } __Yolov10 Object Detection__

    ---

    Run the Yolov10 model on a user webcam stream in real time!

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n/blob/main/app.py)

- :camera:{ .lg .middle } __Video Object Detection with RT-DETR__

    ---

    Upload a video and stream out frames with detected objects (powered by the RT-DETR model).

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc/blob/main/app.py)

- :speaker:{ .lg .middle } __Text-to-Speech with Parler__

    ---

    Stream out audio generated by Parler TTS!

    [:octicons-arrow-right-24: Demo](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc)

    [:octicons-code-16: Code](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc/blob/main/app.py)

</div>
docs/deployment.md (new file, 24 lines)
@@ -0,0 +1,24 @@
When deploying in a cloud environment (like Hugging Face Spaces, EC2, etc.), you need to set up a TURN server to relay the WebRTC traffic.
The easiest way to do this is to use a service like Twilio.

```python
import gradio as gr
from gradio_webrtc import WebRTC
from twilio.rest import Client
import os

account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

client = Client(account_sid, auth_token)

token = client.tokens.create()

rtc_configuration = {
    "iceServers": token.ice_servers,
    "iceTransportPolicy": "relay",
}

with gr.Blocks() as demo:
    ...
    rtc = WebRTC(rtc_configuration=rtc_configuration, ...)
    ...
```
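
A common pattern (a sketch, not part of the library) is to build the `rtc_configuration` only when Twilio credentials are actually present, so the same app still runs locally without a TURN server. This assumes the component accepts `rtc_configuration=None` as its default:

```python
import os

import gradio as gr
from gradio_webrtc import WebRTC


def get_rtc_configuration():
    """Hypothetical helper: return a Twilio-backed config when credentials
    are set, otherwise None (the component then uses its default behavior)."""
    account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
    auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
    if not (account_sid and auth_token):
        return None
    from twilio.rest import Client

    token = Client(account_sid, auth_token).tokens.create()
    return {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }


with gr.Blocks() as demo:
    rtc = WebRTC(rtc_configuration=get_rtc_configuration(),
                 modality="audio", mode="send-receive")
```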
docs/faq.md (new file, 3 lines)
@@ -0,0 +1,3 @@
## Demo does not work when deploying to the cloud

Make sure you are using a TURN server. See [deployment](/deployment).
docs/index.md (new file, 28 lines)
@@ -0,0 +1,28 @@
<h1 style='text-align: center; margin-bottom: 1rem; color: white;'> Gradio WebRTC ⚡️ </h1>

<div style="display: flex; flex-direction: row; justify-content: center">
<img style="display: block; padding-right: 5px; height: 20px;" alt="Static Badge" src="https://img.shields.io/pypi/v/gradio_webrtc">
<a href="https://github.com/freddyaboulton/gradio-webrtc" target="_blank"><img alt="Static Badge" src="https://img.shields.io/badge/github-white?logo=github&logoColor=black"></a>
</div>

<h3 style='text-align: center'>
Stream video and audio in real time with Gradio using WebRTC.
</h3>

## Installation

```bash
pip install gradio_webrtc
```

To use built-in pause detection (see [conversational ai](#conversational-ai)), install the `vad` extra:

```bash
pip install gradio_webrtc[vad]
```

## Examples

1. [Object Detection from Webcam with YOLOv10](https://huggingface.co/spaces/freddyaboulton/webrtc-yolov10n) 📷
2. [Streaming Object Detection from Video with RT-DETR](https://huggingface.co/spaces/freddyaboulton/rt-detr-object-detection-webrtc) 🎥
3. [Text-to-Speech](https://huggingface.co/spaces/freddyaboulton/parler-tts-streaming-webrtc) 🗣️
4. [Conversational AI](https://huggingface.co/spaces/freddyaboulton/omni-mini-webrtc) 🤖🗣️
docs/user-guide.md (new file, 291 lines)
@@ -0,0 +1,291 @@
# User Guide

To get started with WebRTC streams, all that's needed is to import the `WebRTC` component from this package and implement its `stream` event.

This page will show how to do so with simple code examples.
For complete implementations of common tasks, see the [cookbook](/cookbook).

## Audio Streaming

### Reply on Pause

Typically, you want to run an AI model that generates audio when the user has stopped speaking. This can be done by wrapping a python generator with the `ReplyOnPause` class
and passing it to the `stream` event of the `WebRTC` component.

=== "Code"
    ``` py title="ReplyOnPause"
    import gradio as gr
    import numpy as np
    from gradio_webrtc import WebRTC, ReplyOnPause


    def response(audio: tuple[int, np.ndarray]): # (1)
        """This function must yield audio frames"""
        ...
        for numpy_array in generated_audio:
            yield (sampling_rate, numpy_array, "mono") # (2)


    with gr.Blocks() as demo:
        gr.HTML(
            """
            <h1 style='text-align: center'>
            Chat (Powered by WebRTC ⚡️)
            </h1>
            """
        )
        with gr.Column():
            with gr.Group():
                audio = WebRTC(
                    mode="send-receive", # (3)
                    modality="audio",
                )
            audio.stream(fn=ReplyOnPause(response),
                         inputs=[audio], outputs=[audio], # (4)
                         time_limit=60) # (5)

    demo.launch()
    ```

    1. The python generator will receive the **entire** audio up until the user stopped speaking. It will be a tuple of the form (sampling_rate, numpy array of audio). The array will have a shape of (1, num_samples). You can also pass in additional input components, as shown in the sketch after this section.

    2. The generator must yield audio chunks as a tuple of (sampling_rate, numpy audio array). Each numpy audio array must have a shape of (1, num_samples).

    3. The `mode` and `modality` arguments must be set to `"send-receive"` and `"audio"`.

    4. The `WebRTC` component must be the first input and output component.

    5. Set a `time_limit` to control how long a conversation will last. If the `concurrency_count` is 1 (default), only one conversation will be handled at a time.

=== "Notes"
    1. The python generator will receive the **entire** audio up until the user stopped speaking. It will be a tuple of the form (sampling_rate, numpy array of audio). The array will have a shape of (1, num_samples). You can also pass in additional input components, as shown in the sketch after this section.

    2. The generator must yield audio chunks as a tuple of (sampling_rate, numpy audio array). Each numpy audio array must have a shape of (1, num_samples).

    3. The `mode` and `modality` arguments must be set to `"send-receive"` and `"audio"`.

    4. The `WebRTC` component must be the first input and output component.

    5. Set a `time_limit` to control how long a conversation will last. If the `concurrency_count` is 1 (default), only one conversation will be handled at a time.
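
The notes above mention that additional input components can be passed alongside the `WebRTC` component. A minimal sketch (the `gr.Slider` and the way `response` consumes it are illustrative assumptions, not part of the library API):

``` py title="ReplyOnPause with extra inputs"
import gradio as gr
import numpy as np
from gradio_webrtc import ReplyOnPause, WebRTC


def response(audio: tuple[int, np.ndarray], gain: float):
    """Echo the recorded audio back, scaled by the (hypothetical) gain slider."""
    sampling_rate, array = audio
    yield (sampling_rate, (array * gain).astype(array.dtype))


with gr.Blocks() as demo:
    audio = WebRTC(mode="send-receive", modality="audio")
    gain = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, label="Gain")
    # Extra inputs go after the WebRTC component and arrive as extra arguments.
    audio.stream(fn=ReplyOnPause(response),
                 inputs=[audio, gain], outputs=[audio], time_limit=60)

demo.launch()
```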

### Stream Handler

`ReplyOnPause` is an implementation of a `StreamHandler`. The `StreamHandler` is a low-level
abstraction that gives you arbitrary control over how the input audio stream and output audio stream are created. The following example echoes back the user audio.

=== "Code"
    ``` py title="Stream Handler"
    import gradio as gr
    import numpy as np
    from gradio_webrtc import WebRTC, StreamHandler
    from queue import Queue


    class EchoHandler(StreamHandler):
        def __init__(self) -> None:
            super().__init__()
            self.queue = Queue()

        def receive(self, frame: tuple[int, np.ndarray]) -> None: # (1)
            self.queue.put(frame)

        def emit(self): # (2)
            return self.queue.get()

        def copy(self) -> StreamHandler:
            return EchoHandler()


    with gr.Blocks() as demo:
        with gr.Column():
            with gr.Group():
                audio = WebRTC(
                    mode="send-receive",
                    modality="audio",
                )

            audio.stream(fn=EchoHandler(),
                         inputs=[audio], outputs=[audio],
                         time_limit=15)

    demo.launch()
    ```

    1. The `StreamHandler` class implements three methods: `receive`, `emit` and `copy`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client. The `copy` method is called at the beginning of the stream to ensure each user has a unique stream handler.

    2. The `emit` method SHOULD NOT block. If a frame is not ready to be sent, the method should return `None` (see the sketch after this section).

=== "Notes"
    1. The `StreamHandler` class implements three methods: `receive`, `emit` and `copy`. The `receive` method is called when a new frame is received from the client, and the `emit` method returns the next frame to send to the client. The `copy` method is called at the beginning of the stream to ensure each user has a unique stream handler.

    2. The `emit` method SHOULD NOT block. If a frame is not ready to be sent, the method should return `None` (see the sketch after this section).
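
Since `Queue.get()` blocks while the queue is empty, a non-blocking variant of `emit` (a sketch, not the library's reference implementation) can return `None` whenever no frame is queued:

``` py title="Non-blocking emit"
from queue import Empty, Queue

import numpy as np
from gradio_webrtc import StreamHandler


class NonBlockingEchoHandler(StreamHandler):
    def __init__(self) -> None:
        super().__init__()
        self.queue = Queue()

    def receive(self, frame: tuple[int, np.ndarray]) -> None:
        self.queue.put(frame)

    def emit(self):
        # Return the next frame if one is ready, otherwise None so that the
        # caller is never blocked waiting on the queue.
        try:
            return self.queue.get_nowait()
        except Empty:
            return None

    def copy(self) -> StreamHandler:
        return NonBlockingEchoHandler()
```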

### Server-To-Client Only

To stream only from the server to the client, implement a python generator and pass it to the component's `stream` event. The stream event must also specify a `trigger` corresponding to a UI interaction that starts the stream. In this case, it's a button click.

=== "Code"

    ``` py title="Server-To-Client"
    import gradio as gr
    import numpy as np
    from gradio_webrtc import WebRTC
    from pydub import AudioSegment


    def generation(num_steps):
        for _ in range(num_steps):
            segment = AudioSegment.from_file("audio_file.wav")
            array = np.array(segment.get_array_of_samples()).reshape(1, -1)
            yield (segment.frame_rate, array)


    with gr.Blocks() as demo:
        audio = WebRTC(label="Stream", mode="receive", # (1)
                       modality="audio")
        num_steps = gr.Slider(label="Number of Steps", minimum=1,
                              maximum=10, step=1, value=5)
        button = gr.Button("Generate")

        audio.stream(
            fn=generation, inputs=[num_steps], outputs=[audio],
            trigger=button.click # (2)
        )
    ```

    1. Set `mode="receive"` to only receive audio from the server.

    2. The `stream` event must take a `trigger` that corresponds to the gradio event that starts the stream. In this case, it's the button click.

=== "Notes"
    1. Set `mode="receive"` to only receive audio from the server.

    2. The `stream` event must take a `trigger` that corresponds to the gradio event that starts the stream. In this case, it's the button click.

## Video Streaming

### Input/Output Streaming

Set up a video Input/Output stream to continuously receive webcam frames from the user and run an arbitrary python function to return a modified frame.

=== "Code"

    ``` py title="Input/Output Streaming"
    import gradio as gr
    from gradio_webrtc import WebRTC


    def detection(image, conf_threshold=0.3): # (1)
        # ... your detection code here ...
        return modified_frame # (2)


    with gr.Blocks() as demo:
        image = WebRTC(label="Stream", mode="send-receive", modality="video") # (3)
        conf_threshold = gr.Slider(
            label="Confidence Threshold",
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.30,
        )
        image.stream(
            fn=detection,
            inputs=[image, conf_threshold], # (4)
            outputs=[image], time_limit=10
        )

    if __name__ == "__main__":
        demo.launch()
    ```

    1. The webcam frame will be represented as a numpy array of shape (height, width, 3) in RGB order.

    2. The function must return a numpy array. It can take arbitrary values from other components.

    3. Set `modality="video"` and `mode="send-receive"`.

    4. The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.

=== "Notes"
    1. The webcam frame will be represented as a numpy array of shape (height, width, 3) in RGB order.

    2. The function must return a numpy array. It can take arbitrary values from other components.

    3. Set `modality="video"` and `mode="send-receive"`.

    4. The `inputs` parameter should be a list where the first element is the WebRTC component. The only output allowed is the WebRTC component.

### Server-to-Client Only

Set up a server-to-client stream to stream video from an arbitrary user interaction.

=== "Code"
    ``` py title="Server-To-Client"
    import gradio as gr
    from gradio_webrtc import WebRTC
    import cv2


    def generation():
        url = "https://download.tsi.telecom-paristech.fr/gpac/dataset/dash/uhd/mux_sources/hevcds_720p30_2M.mp4"
        cap = cv2.VideoCapture(url)
        iterating = True
        while iterating:
            iterating, frame = cap.read()
            yield frame # (1)


    with gr.Blocks() as demo:
        output_video = WebRTC(label="Video Stream", mode="receive", # (2)
                              modality="video")
        button = gr.Button("Start", variant="primary")
        output_video.stream(
            fn=generation, inputs=None, outputs=[output_video],
            trigger=button.click # (3)
        )

    demo.launch()
    ```

    1. The `stream` event's `fn` parameter is a generator function that yields the next frame from the video as a **numpy array**.

    2. Set `mode="receive"` to only receive video from the server.

    3. The `trigger` parameter is the gradio event that will trigger the stream. In this case, the button click event.

=== "Notes"
    1. The `stream` event's `fn` parameter is a generator function that yields the next frame from the video as a **numpy array**.

    2. Set `mode="receive"` to only receive video from the server.

    3. The `trigger` parameter is the gradio event that will trigger the stream. In this case, the button click event.

## Additional Outputs

In order to modify other components from within the WebRTC stream, you must yield an instance of `AdditionalOutputs` and add an `on_additional_outputs` event to the `WebRTC` component.

This is common for displaying a multimodal text/audio conversation in a Chatbot UI.

=== "Code"

    ``` py title="Additional Outputs"
    import gradio as gr
    import numpy as np
    from gradio_webrtc import AdditionalOutputs, ReplyOnPause, WebRTC


    def transcribe(audio: tuple[int, np.ndarray],
                   transformers_convo: list[dict],
                   gradio_convo: list[dict]):
        # ... generate text response ...
        response = model.generate(**inputs, max_length=256)
        transformers_convo.append({"role": "assistant", "content": response})
        gradio_convo.append({"role": "assistant", "content": response})
        yield AdditionalOutputs(transformers_convo, gradio_convo) # (1)


    with gr.Blocks() as demo:
        gr.HTML(
            """
            <h1 style='text-align: center'>
            Talk to Qwen2Audio (Powered by WebRTC ⚡️)
            </h1>
            """
        )
        transformers_convo = gr.State(value=[])
        with gr.Row():
            with gr.Column():
                audio = WebRTC(
                    label="Stream",
                    mode="send", # (2)
                    modality="audio",
                )
            with gr.Column():
                transcript = gr.Chatbot(label="transcript", type="messages")

        audio.stream(ReplyOnPause(transcribe),
                     inputs=[audio, transformers_convo, transcript],
                     outputs=[audio], time_limit=90)
        audio.on_additional_outputs(lambda s, a: (s, a), # (3)
                                    outputs=[transformers_convo, transcript],
                                    queue=False, show_progress="hidden")

    demo.launch()
    ```

    1. Pass your data to `AdditionalOutputs` and yield it.

    2. In this case, no audio is being returned, so we set `mode="send"`. However, if we set `mode="send-receive"`, we could also yield generated audio and `AdditionalOutputs`.

    3. The `on_additional_outputs` event does not take `inputs`. It's common practice to not run this event on the queue since it is just a quick UI update.

=== "Notes"
    1. Pass your data to `AdditionalOutputs` and yield it.

    2. In this case, no audio is being returned, so we set `mode="send"`. However, if we set `mode="send-receive"`, we could also yield generated audio and `AdditionalOutputs`.

    3. The `on_additional_outputs` event does not take `inputs`. It's common practice to not run this event on the queue since it is just a quick UI update.
mkdocs.yml (new file, 36 lines)
@@ -0,0 +1,36 @@
site_name: Gradio WebRTC
site_url: https://sitename.example
repo_name: gradio-webrtc
repo_url: https://github.com/freddyaboulton/gradio-webrtc
theme:
  name: material
  palette:
    scheme: slate
    primary: black
    accent: yellow
  features:
    - content.code.copy
    - content.code.annotate
  logo: bolt.svg
  favicon: bolt.svg
nav:
  - Home: index.md
  - User Guide: user-guide.md
  - Cookbook: cookbook.md
  - Deployment: deployment.md
  - Advanced Configuration: advanced-configuration.md
markdown_extensions:
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.superfences
  - pymdownx.tabbed:
      alternate_style: true
  - attr_list
  - md_in_html
  - pymdownx.emoji:
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
@@ -8,7 +8,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "gradio_webrtc"
-version = "0.0.12"
+version = "0.0.13"
 description = "Stream images in realtime with webrtc"
 readme = "README.md"
 license = "apache-2.0"