mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
Add API Reference and llms.txt (#256)
* stream api reference * docs * Add code * Add code * code
This commit is contained in:
@@ -20,15 +20,33 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReplyOnStopWordsState(AppState):
    """State container for stop-word-then-pause detection, built on AppState.

    Adds three fields, each starting from a "nothing detected yet" default,
    on top of whatever AppState already carries.
    """

    # True once a stop word has been recognised in the transcribed audio.
    stop_word_detected: bool = False
    # presumably holds the audio captured after the stop word; None until
    # then -- TODO confirm against determine_pause.
    post_stop_word_buffer: np.ndarray | None = None
    # presumably marks that speech began before any stop word was heard --
    # TODO confirm against determine_pause.
    started_talking_pre_stop_word: bool = False

    def new(self):
        """Return a fresh ReplyOnStopWordsState with every field at its default."""
        fresh_state = ReplyOnStopWordsState()
        return fresh_state
|
||||
|
||||
|
||||
class ReplyOnStopWords(ReplyOnPause):
|
||||
"""
|
||||
A stream handler that extends ReplyOnPause to trigger based on stop words
|
||||
followed by a pause.
|
||||
|
||||
This handler listens to the incoming audio stream, performs Speech-to-Text (STT)
|
||||
to detect predefined stop words. Once a stop word is detected, it waits for a
|
||||
subsequent pause in speech (using the VAD model) before triggering the reply
|
||||
function (`fn`) with the audio recorded *after* the stop word.
|
||||
|
||||
Attributes:
|
||||
stop_words (list[str]): A list of words or phrases that trigger the pause detection.
|
||||
state (ReplyOnStopWordsState): The current state of the stop word and pause detection logic.
|
||||
stt_model: The Speech-to-Text model instance used for detecting stop words.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fn: ReplyFnGenerator,
|
||||
@@ -43,6 +61,25 @@ class ReplyOnStopWords(ReplyOnPause):
|
||||
input_sample_rate: int = 48000,
|
||||
model: PauseDetectionModel | None = None,
|
||||
):
|
||||
"""
|
||||
Initializes the ReplyOnStopWords handler.
|
||||
|
||||
Args:
|
||||
fn: The generator function to execute upon stop word and pause detection.
|
||||
It receives `(sample_rate, audio_array)` and optionally `*args`.
|
||||
stop_words: A list of strings (words or phrases) to listen for.
|
||||
Detection is case-insensitive and ignores punctuation.
|
||||
startup_fn: An optional function to run once at the beginning.
|
||||
algo_options: Options for the pause detection algorithm (used after stop word).
|
||||
model_options: Options for the VAD model.
|
||||
can_interrupt: If True, incoming audio during `fn` execution
|
||||
will stop the generator and process the new audio.
|
||||
expected_layout: Expected input audio layout ('mono' or 'stereo').
|
||||
output_sample_rate: The sample rate expected for audio yielded by `fn`.
|
||||
output_frame_size: Deprecated.
|
||||
input_sample_rate: The expected sample rate of incoming audio.
|
||||
model: An optional pre-initialized VAD model instance.
|
||||
"""
|
||||
super().__init__(
|
||||
fn,
|
||||
algo_options=algo_options,
|
||||
@@ -60,6 +97,18 @@ class ReplyOnStopWords(ReplyOnPause):
|
||||
self.stt_model = get_stt_model("moonshine/base")
|
||||
|
||||
def stop_word_detected(self, text: str) -> bool:
|
||||
"""
|
||||
Checks if any of the configured stop words are present in the text.
|
||||
|
||||
Performs a case-insensitive search, treating multi-word stop phrases
|
||||
correctly and ignoring basic punctuation.
|
||||
|
||||
Args:
|
||||
text: The text transcribed from the audio.
|
||||
|
||||
Returns:
|
||||
True if a stop word is found, False otherwise.
|
||||
"""
|
||||
for stop_word in self.stop_words:
|
||||
stop_word = stop_word.lower().strip().split(" ")
|
||||
if bool(
|
||||
@@ -75,17 +124,36 @@ class ReplyOnStopWords(ReplyOnPause):
|
||||
async def _send_stopword(self):
    """Push a 'stopword' message over the channel, if one is attached.

    Does nothing when ``self.channel`` is falsy. Otherwise sends a message
    built with ``create_message("stopword", "")`` and emits a debug log line.
    """
    # NOTE(review): indentation was lost in the extracted source; the debug
    # log is assumed to sit inside the channel check -- confirm upstream.
    if not self.channel:
        return
    stopword_message = create_message("stopword", "")
    self.channel.send(stopword_message)
    logger.debug("Sent stopword")
|
||||
|
||||
def send_stopword(self):
    """Schedule delivery of a 'stopword' message on the handler's event loop.

    The ``_send_stopword`` coroutine is handed to ``self.loop`` through
    ``asyncio.run_coroutine_threadsafe``, the asyncio entry point for
    submitting a coroutine to a loop from another thread. The future it
    returns is discarded, so this call neither waits for the send nor
    reports its outcome.
    """
    pending_send = self._send_stopword()
    asyncio.run_coroutine_threadsafe(pending_send, self.loop)
|
||||
|
||||
def determine_pause( # type: ignore
|
||||
self, audio: np.ndarray, sampling_rate: int, state: ReplyOnStopWordsState
|
||||
) -> bool:
|
||||
"""Take in the stream, determine if a pause happened"""
|
||||
"""
|
||||
Analyzes an audio chunk to detect stop words and subsequent pauses.
|
||||
|
||||
Overrides the `ReplyOnPause.determine_pause` method.
|
||||
First, it performs STT on the audio buffer to detect stop words.
|
||||
Once a stop word is detected (`state.stop_word_detected` is True), it then
|
||||
uses the VAD model (similar to `ReplyOnPause`) to detect a pause in the
|
||||
audio *following* the stop word.
|
||||
|
||||
Args:
|
||||
audio: The numpy array containing the audio chunk.
|
||||
sampling_rate: The sample rate of the audio chunk.
|
||||
state: The current application state (ReplyOnStopWordsState).
|
||||
|
||||
Returns:
|
||||
True if a stop word has been detected and a subsequent pause
|
||||
satisfying the configured thresholds is detected, False otherwise.
|
||||
"""
|
||||
import librosa
|
||||
|
||||
duration = len(audio) / sampling_rate
|
||||
@@ -142,12 +210,19 @@ class ReplyOnStopWords(ReplyOnPause):
|
||||
return False
|
||||
|
||||
def reset(self):
    """Return the handler to its initial condition.

    Runs the parent class's ``reset`` first, then drops the reference to any
    active generator, clears the event flag, and installs a brand-new
    ``ReplyOnStopWordsState`` so the stop-word flags start from their
    defaults again.
    """
    super().reset()
    self.generator = None
    self.event.clear()
    # Install the stop-word-aware state type on this handler.
    fresh_state = ReplyOnStopWordsState()
    self.state = fresh_state
|
||||
|
||||
def copy(self):
|
||||
"""Creates a new instance of ReplyOnStopWords with the same configuration."""
|
||||
return ReplyOnStopWords(
|
||||
self.fn,
|
||||
self.stop_words,
|
||||
|
||||
Reference in New Issue
Block a user