From 24ed2ca178fcc4474cb38a28b5c86a1b60ec93f1 Mon Sep 17 00:00:00 2001
From: Freddy Boulton
Date: Mon, 10 Mar 2025 17:23:25 -0400
Subject: [PATCH] Add docs on how to contribute (#161)

* Add code

* add code

* Add code
---
 docs/speech_to_text_gallery.md | 37 ++++++++++++++++++++++++-
 docs/vad_gallery.md            | 50 +++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/docs/speech_to_text_gallery.md b/docs/speech_to_text_gallery.md
index 9ff8e92..22e0091 100644
--- a/docs/speech_to_text_gallery.md
+++ b/docs/speech_to_text_gallery.md
@@ -79,4 +79,39 @@ document.querySelectorAll('.tag-button').forEach(button => {
 
   [:octicons-arrow-right-24: Demo](Your demo here)
 
-  [:octicons-code-16: Repository](Code here)
\ No newline at end of file
+  [:octicons-code-16: Repository](Code here)
+
+
+## How to add your own STT model
+
+1. Your model can be implemented in **any** framework you want, but it must implement the `STTModel` protocol.
+
+    ```python
+    from typing import Protocol
+
+    import numpy as np
+    from numpy.typing import NDArray
+
+
+    class STTModel(Protocol):
+        def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str: ...
+    ```
+
+    * The `stt` method takes an audio tuple `(sample_rate, audio_array)` and returns the transcribed text as a string.
+
+    * The `audio` tuple has the form `(sample_rate, audio_array)`, where `sample_rate` is the sample rate of the audio array and `audio_array` is a numpy array of the audio data with dtype `np.int16` or `np.float32`.
+
+2. Once you have your model implemented, you can use it in your handler!
+
+    ```python
+    import gradio as gr
+    from fastrtc import AdditionalOutputs, ReplyOnPause, Stream
+    from your_model import YourModel
+
+    model = YourModel()  # implements the STTModel protocol
+
+    def echo(audio):
+        text = model.stt(audio)
+        yield AdditionalOutputs(text)
+
+    stream = Stream(
+        ReplyOnPause(echo),
+        mode="send-receive",
+        modality="audio",
+        additional_outputs=[gr.Textbox(label="Transcription")],
+        additional_outputs_handler=lambda old, new: old + new,
+    )
+    stream.ui.launch()
+    ```
+
+3. Open a [PR](https://github.com/freddyaboulton/fastrtc/edit/main/docs/speech_to_text_gallery.md) to add your model to the gallery! Ideally, your model package should be pip installable so others can try it out easily.
\ No newline at end of file
diff --git a/docs/vad_gallery.md b/docs/vad_gallery.md
index e8632e0..1acc106 100644
--- a/docs/vad_gallery.md
+++ b/docs/vad_gallery.md
@@ -57,4 +57,52 @@ document.querySelectorAll('.tag-button').forEach(button => {
 
   [:octicons-arrow-right-24: Demo](Your demo here)
 
-  [:octicons-code-16: Repository](Code here)
\ No newline at end of file
+  [:octicons-code-16: Repository](Code here)
+
+
+## How to add your own VAD model
+
+1. Your model can be implemented in **any** framework you want, but it must implement the `PauseDetectionModel` protocol; a toy sketch follows the bullets below.
+
+    ```python
+    from typing import Any, Protocol, TypeAlias, TypedDict
+
+    import numpy as np
+    from numpy.typing import NDArray
+
+    ModelOptions: TypeAlias = Any
+
+
+    class AudioChunk(TypedDict):
+        # One stretch of detected human speech (see the bullets below)
+        start: int
+        end: int
+
+
+    class PauseDetectionModel(Protocol):
+        def vad(
+            self,
+            audio: tuple[int, NDArray[np.int16] | NDArray[np.float32]],
+            options: ModelOptions,
+        ) -> tuple[float, list[AudioChunk]]: ...
+
+        def warmup(
+            self,
+        ) -> None: ...
+    ```
+
+    * The `vad` method takes an audio tuple and returns a tuple `(speech_duration, audio_chunks)`, where `speech_duration` is the duration of the human speech in the audio and `audio_chunks` is a `list[AudioChunk]`. Each `AudioChunk` is a dictionary with `start` and `end` fields marking where the human speech starts and ends within the audio array.
+
+    * The `audio` tuple has the form `(sample_rate, audio_array)`, where `sample_rate` is the sample rate of the audio array and `audio_array` is a numpy array of the audio data with dtype `np.int16` or `np.float32`.
+
+    * The `warmup` method is optional, but it is recommended for warming up the model when the server starts.
+
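+    As a concrete illustration, here is a minimal sketch of the protocol: a hypothetical `EnergyVAD` that thresholds sample energy. It reuses `ModelOptions` and `AudioChunk` from the snippet above, and the `0.02` threshold, the seconds-based `speech_duration`, and the sample-index `start`/`end` values are illustrative assumptions, not requirements of fastrtc.
+
+    ```python
+    import numpy as np
+    from numpy.typing import NDArray
+
+
+    class EnergyVAD:
+        """Toy pause detector: flags samples above an energy threshold."""
+
+        def vad(
+            self,
+            audio: tuple[int, NDArray[np.int16] | NDArray[np.float32]],
+            options: ModelOptions,  # unused in this toy example
+        ) -> tuple[float, list[AudioChunk]]:
+            sample_rate, audio_array = audio
+            # Normalize int16 samples to [-1, 1] floats
+            if audio_array.dtype == np.int16:
+                audio_array = audio_array.astype(np.float32) / 32768.0
+            # Mark samples louder than the (illustrative) threshold
+            speech_mask = np.abs(audio_array) > 0.02
+            speech_duration = float(speech_mask.sum()) / sample_rate
+            # Report one chunk spanning the first to the last loud sample
+            indices = np.flatnonzero(speech_mask)
+            chunks: list[AudioChunk] = []
+            if indices.size > 0:
+                chunks.append({"start": int(indices[0]), "end": int(indices[-1])})
+            return speech_duration, chunks
+
+        def warmup(self) -> None:
+            # One dummy inference so the first real request is fast
+            self.vad((16000, np.zeros(16000, dtype=np.float32)), None)
+    ```
+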
+2. Once you have your model implemented, you can use it in the `ReplyOnPause` class by passing in the model and any options you need.
+
+    ```python
+    from fastrtc import ReplyOnPause, Stream
+    from your_model import YourModel, YourModelOptions
+
+    def echo(audio):
+        yield audio
+
+    model = YourModel()  # implements the PauseDetectionModel protocol
+    reply_on_pause = ReplyOnPause(
+        echo,
+        model=model,
+        model_options=YourModelOptions(),
+    )
+    stream = Stream(reply_on_pause, mode="send-receive", modality="audio")
+    stream.ui.launch()
+    ```
+
+3. Open a [PR](https://github.com/freddyaboulton/fastrtc/edit/main/docs/vad_gallery.md) to add your model to the gallery! Ideally, your model package should be pip installable so others can try it out easily.
\ No newline at end of file