From d1d9e6368fe1deaebce582f91a682202809f30c8 Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Tue, 26 Jan 2021 11:54:09 +0000 Subject: [PATCH 1/2] add min_speech_samples param --- README.md | 1 + utils.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index eb03f7a..9edb550 100644 --- a/README.md +++ b/README.md @@ -322,6 +322,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks - `neg_trig_sum` - same as `trig_sum`, but for switching from triggered to non-triggered state (non-speech) - `num_steps` - nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8) - `num_samples_per_window` - number of samples in each window, our models were trained using `4000` samples (250 ms) per window, so this is preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434)); +- `min_speech_sample` - minimum speech chunk duration in samples ### How VAD Works diff --git a/utils.py b/utils.py index bff6a3d..5fafca7 100644 --- a/utils.py +++ b/utils.py @@ -59,6 +59,7 @@ def get_speech_ts(wav: torch.Tensor, num_steps: int = 8, batch_size: int = 200, num_samples_per_window: int = 4000, + min_speech_samples: int = 10000, #samples run_function=validate): num_samples = num_samples_per_window @@ -97,7 +98,7 @@ def get_speech_ts(wav: torch.Tensor, current_speech['start'] = step * max(0, i-num_steps) if ((sum(buffer) / len(buffer)) < neg_trig_sum) and triggered: current_speech['end'] = step * i - if (current_speech['end'] - current_speech['start']) > 10000: + if (current_speech['end'] - current_speech['start']) > min_speech_samples: speeches.append(current_speech) current_speech = {} triggered = False From 5a7028ebfe07b036a53e3dd8a790b5a57ae597be Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Tue, 26 Jan 2021 11:56:18 +0000 Subject: [PATCH 2/2] fx --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9edb550..0c9ef5c 100644 --- a/README.md +++ b/README.md @@ -322,7 +322,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks - `neg_trig_sum` - same as `trig_sum`, but for switching from triggered to non-triggered state (non-speech) - `num_steps` - nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8) - `num_samples_per_window` - number of samples in each window, our models were trained using `4000` samples (250 ms) per window, so this is preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434)); -- `min_speech_sample` - minimum speech chunk duration in samples +- `min_speech_samples` - minimum speech chunk duration in samples ### How VAD Works