mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 18:09:22 +08:00
@@ -322,6 +322,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks
|
|||||||
- `neg_trig_sum` - same as `trig_sum`, but for switching from triggered to non-triggered state (non-speech)
|
- `neg_trig_sum` - same as `trig_sum`, but for switching from triggered to non-triggered state (non-speech)
|
||||||
- `num_steps` - number of overlapping windows to split each audio chunk into (we recommend 4 or 8)
|
- `num_steps` - number of overlapping windows to split each audio chunk into (we recommend 4 or 8)
|
||||||
- `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferred value (smaller values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
|
- `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferred value (smaller values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
|
||||||
|
- `min_speech_samples` - minimum speech chunk duration in samples
|
||||||
|
|
||||||
### How VAD Works
|
### How VAD Works
|
||||||
|
|
||||||
|
|||||||
3
utils.py
3
utils.py
@@ -59,6 +59,7 @@ def get_speech_ts(wav: torch.Tensor,
|
|||||||
num_steps: int = 8,
|
num_steps: int = 8,
|
||||||
batch_size: int = 200,
|
batch_size: int = 200,
|
||||||
num_samples_per_window: int = 4000,
|
num_samples_per_window: int = 4000,
|
||||||
|
min_speech_samples: int = 10000, #samples
|
||||||
run_function=validate):
|
run_function=validate):
|
||||||
|
|
||||||
num_samples = num_samples_per_window
|
num_samples = num_samples_per_window
|
||||||
@@ -97,7 +98,7 @@ def get_speech_ts(wav: torch.Tensor,
|
|||||||
current_speech['start'] = step * max(0, i-num_steps)
|
current_speech['start'] = step * max(0, i-num_steps)
|
||||||
if ((sum(buffer) / len(buffer)) < neg_trig_sum) and triggered:
|
if ((sum(buffer) / len(buffer)) < neg_trig_sum) and triggered:
|
||||||
current_speech['end'] = step * i
|
current_speech['end'] = step * i
|
||||||
if (current_speech['end'] - current_speech['start']) > 10000:
|
if (current_speech['end'] - current_speech['start']) > min_speech_samples:
|
||||||
speeches.append(current_speech)
|
speeches.append(current_speech)
|
||||||
current_speech = {}
|
current_speech = {}
|
||||||
triggered = False
|
triggered = False
|
||||||
|
|||||||
Reference in New Issue
Block a user