mirror of https://github.com/snakers4/silero-vad.git
synced 2026-02-05 18:09:22 +08:00
@@ -25,7 +25,7 @@
 # Silero VAD
 
 
 
 **Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier.**
 
 Enterprise-grade Speech Products made refreshingly simple (see our [STT](https://github.com/snakers4/silero-models) models).
@@ -60,6 +60,7 @@ The models are small enough to be included directly into this repository. Newer
 | model= | Params | Model type | Streaming | Languages | PyTorch | ONNX | Colab |
 |--------------------------------|--------|---------------------|--------------------|----------------|---------|------|-------|
 | `'silero_vad'` | 1.1M | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [Open In Colab](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
+| `'silero_vad_micro'` | 10K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [Open In Colab](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_number_detector'` | 1.1M | Number Detector | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [Open In Colab](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_lang_detector'` | 1.1M | Language Classifier | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [Open In Colab](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 
@@ -79,6 +80,7 @@ What models do:
 | `v1.1` | 2020-12-24 | better vad models compatible with chunks shorter than 250 ms |
 | `v1.2` | 2020-12-30 | Number Detector added |
 | `v2` | 2021-01-11 | Add Language Classifier heads (en, ru, de, es) |
+| `v2.1` | 2021-02-11 | Add micro (10k params) VAD models |
 
 ### PyTorch
 
@@ -333,7 +335,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks
 
 [Auditok](https://github.com/amsehili/auditok) - the logic is the same as for WebRTC, but we use 50 ms frames.
 
 
 
 ## FAQ
 
@@ -346,6 +348,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks
 - `num_steps` - number of overlapping windows to split the audio chunk into (we recommend 4 or 8)
 - `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferable value (lower values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434))
 - `min_speech_samples` - minimum speech chunk duration in samples
+- `min_silence_samples` - minimum silence duration in samples between two separate speech chunks
 
 Optimal parameters may vary per domain, but we provide a tiny tool to learn the best parameters. You can invoke `speech_timestamps` with `visualize_probs=True` (`pandas` required):
 
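For example, a hedged sketch of such a call (the `'test.wav'` path is hypothetical; the utils tuple order follows `hubconf.py` below):

```python
import torch

# load the full-size VAD model plus its utilities via torch.hub
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')
(get_speech_ts, save_audio, read_audio,
 state_generator, single_audio_stream, collect_chunks) = utils

wav = read_audio('test.wav')  # hypothetical 16 kHz mono file

# probe the parameters above; visualize_probs=True additionally plots
# the smoothed speech probabilities (requires pandas)
speech_timestamps = get_speech_ts(wav, model,
                                  num_steps=4,
                                  num_samples_per_window=4000,  # 250 ms windows
                                  min_speech_samples=10000,     # ~0.6 s of speech minimum
                                  min_silence_samples=500,
                                  visualize_probs=True)
print(speech_timestamps)
```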
BIN  files/model_micro.jit   (new file; binary file not shown)
BIN  files/model_micro.onnx  (new file; binary file not shown)

hubconf.py  (17 changed lines)
@@ -29,6 +29,23 @@ def silero_vad(**kwargs):
     return model, utils
 
 
+def silero_vad_micro(**kwargs):
+    """Silero Voice Activity Detector
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    hub_dir = torch.hub.get_dir()
+    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_micro.jit')
+    utils = (get_speech_ts,
+             save_audio,
+             read_audio,
+             state_generator,
+             single_audio_stream,
+             collect_chunks)
+
+    return model, utils
+
+
 def silero_number_detector(**kwargs):
     """Silero Number Detector
     Returns a model with a set of utils
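For reference, a minimal usage sketch for the new entrypoint (assuming the standard `torch.hub` flow the repo already documents for the other models):

```python
import torch

# the 10k-parameter micro model loads through the same hub interface
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad_micro')
get_speech_ts, save_audio, read_audio, *_ = utils  # same utils tuple as the full model
```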
utils_vad.py  (22 changed lines)
@@ -60,6 +60,7 @@ def get_speech_ts(wav: torch.Tensor,
                   batch_size: int = 200,
                   num_samples_per_window: int = 4000,
                   min_speech_samples: int = 10000,  # samples
+                  min_silence_samples: int = 500,
                   run_function=validate,
                   visualize_probs=False):
 
@@ -95,20 +96,31 @@ def get_speech_ts(wav: torch.Tensor,
         smoothed_probs = []
 
     speech_probs = outs[:, 1]  # this is very misleading
+    temp_end = 0
     for i, predict in enumerate(speech_probs):  # add name
         buffer.append(predict)
         smoothed_prob = (sum(buffer) / len(buffer))
         if visualize_probs:
             smoothed_probs.append(float(smoothed_prob))
+        if (smoothed_prob >= trig_sum) and temp_end:
+            temp_end = 0
         if (smoothed_prob >= trig_sum) and not triggered:
             triggered = True
             current_speech['start'] = step * max(0, i-num_steps)
+            continue
         if (smoothed_prob < neg_trig_sum) and triggered:
-            current_speech['end'] = step * i
-            if (current_speech['end'] - current_speech['start']) > min_speech_samples:
-                speeches.append(current_speech)
-            current_speech = {}
-            triggered = False
+            if not temp_end:
+                temp_end = step * i
+            if step * i - temp_end < min_silence_samples:
+                continue
+            else:
+                current_speech['end'] = temp_end
+                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                    speeches.append(current_speech)
+                temp_end = 0
+                current_speech = {}
+                triggered = False
+                continue
     if current_speech:
         current_speech['end'] = len(wav)
         speeches.append(current_speech)
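In plain terms, the new `temp_end` bookkeeping adds hysteresis: a segment now closes only once silence has persisted for `min_silence_samples`, so a brief dip in probability no longer splits one utterance in two. A self-contained sketch of that behaviour (hypothetical probabilities and a simplified `start` rule, not the repo's full function):

```python
def merge_segments(probs, step=1000, trig=0.6, neg_trig=0.4,
                   min_speech=2000, min_silence=3000):
    """Toy re-implementation of the hysteresis logic in the diff above."""
    speeches, current, triggered, temp_end = [], {}, False, 0
    for i, p in enumerate(probs):
        if p >= trig and temp_end:
            temp_end = 0                 # speech resumed before the gap expired
        if p >= trig and not triggered:
            triggered = True
            current['start'] = step * i
            continue
        if p < neg_trig and triggered:
            if not temp_end:
                temp_end = step * i      # remember where silence began
            if step * i - temp_end < min_silence:
                continue                 # gap still short: keep segment open
            current['end'] = temp_end
            if current['end'] - current['start'] > min_speech:
                speeches.append(current)
            current, triggered, temp_end = {}, False, 0
    if current:                          # flush a segment still open at the end
        current['end'] = step * len(probs)
        speeches.append(current)
    return speeches

# a 2-step dip (2000 samples) inside speech is shorter than min_silence=3000,
# so the two bursts merge into a single segment
print(merge_segments([0.9]*5 + [0.1]*2 + [0.9]*5 + [0.1]*10))
# -> [{'start': 0, 'end': 12000}]
```

With `min_silence=1000` the same input would instead yield two segments, which is exactly the pre-change behaviour the patch is relaxing.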