Mirror of https://github.com/snakers4/silero-vad.git (synced 2026-02-05 01:49:22 +08:00)

Compare commits (41 commits)
README.md (29 changed lines)
@@ -15,7 +15,7 @@ This repository also includes Number Detector and Language classifier [models](h
 <br/>

 <p align="center">
-<img src="https://user-images.githubusercontent.com/36505480/145563071-681b57e3-06b5-4cd0-bdee-e2ade3d50a60.png" />
+<img src="https://user-images.githubusercontent.com/36505480/198026365-8da383e0-5398-4a12-b7f8-22c2c0059512.png" />
 </p>

 <details>
@@ -29,17 +29,17 @@ https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-
 <h2 align="center">Key Features</h2>
 <br/>

-- **High accuracy**
+- **Stellar accuracy**

 Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.

 - **Fast**

-One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) around **1ms** to be processed on a single CPU thread. Using batching or GPU can also improve performance considerably.
+One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1ms** to be processed on a single CPU thread. Using batching or GPU can also improve performance considerably. Under certain conditions ONNX may even run up to 4-5x faster.

 - **Lightweight**

-JIT model is less than one megabyte in size.
+JIT model is around one megabyte in size.

 - **General**

@@ -47,11 +47,19 @@ https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-

 - **Flexible sampling rate**

-Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** (JIT) and **16000 Hz** (ONNX) [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).
+Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).

 - **Flexible chunk size**

-Model was trained on audio chunks of different lengths. **30 ms**, **60 ms** and **100 ms** long chunks are supported directly, others may work as well.
+Model was trained on **30 ms** audio chunks. Longer chunks are supported directly, others may work as well.

+- **Highly Portable**
+
+Silero VAD reaps benefits from the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available.
+
+- **No Strings Attached**
+
+Published under a permissive license (MIT), Silero VAD has zero strings attached - no telemetry, no keys, no registration, no built-in expiration, no vendor lock-in.
+
 <br/>
 <h2 align="center">Typical Use Cases</h2>
@@ -70,9 +78,10 @@ https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-
 - [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies)
 - [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics)
 - [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics)
-- Number Detector and Language classifier [models](https://github.com/snakers4/silero-vad/wiki/Other-Models)
+- [Number Detector and Language classifier models](https://github.com/snakers4/silero-vad/wiki/Other-Models)
 - [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models)
 - [Further reading](https://github.com/snakers4/silero-models#further-reading)
+- [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ)

 <br/>
 <h2 align="center">Get In Touch</h2>
@@ -96,3 +105,9 @@ Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers
   email = {hello@silero.ai}
 }
 ```
+
+<br/>
+<h2 align="center">VAD-based Community Apps</h2>
+<br/>
+
+- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
Two binary files changed (contents not shown).
hubconf.py (38 changed lines)
@@ -1,5 +1,6 @@
 dependencies = ['torch', 'torchaudio']
 import torch
+import os
 import json
 from utils_vad import (init_jit_model,
                        get_speech_timestamps,
@@ -15,16 +16,27 @@ from utils_vad import (init_jit_model,
                        OnnxWrapper)


-def silero_vad(onnx=False):
+def versiontuple(v):
+    return tuple(map(int, (v.split('+')[0].split("."))))
+
+
+def silero_vad(onnx=False, force_onnx_cpu=False):
     """Silero Voice Activity Detector
     Returns a model with a set of utils
     Please see https://github.com/snakers4/silero-vad for usage examples
     """
-    hub_dir = torch.hub.get_dir()
+    if not onnx:
+        installed_version = torch.__version__
+        supported_version = '1.12.0'
+        if versiontuple(installed_version) < versiontuple(supported_version):
+            raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)')
+
+    model_dir = os.path.join(os.path.dirname(__file__), 'files')
     if onnx:
-        model = OnnxWrapper(f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.onnx')
+        model = OnnxWrapper(os.path.join(model_dir, 'silero_vad.onnx'), force_onnx_cpu)
     else:
-        model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.jit')
+        model = init_jit_model(os.path.join(model_dir, 'silero_vad.jit'))
     utils = (get_speech_timestamps,
              save_audio,
              read_audio,
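A minimal loading sketch for the updated entry point (hedged: assumes torch >= 1.12 is installed and the hub cache can be populated; keyword arguments after the model name are forwarded to the hubconf function):

```python
import torch

# versiontuple strips the local build tag before comparing, so e.g.
# versiontuple('1.13.1+cu117') -> (1, 13, 1) passes the >= 1.12.0 check.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              onnx=True,            # ONNX model instead of TorchScript
                              force_onnx_cpu=True)  # new flag: pin onnxruntime to CPU
get_speech_timestamps, save_audio, read_audio, *rest = utils
```

Note that model files are now resolved relative to hubconf.py itself via `os.path.dirname(__file__)` rather than the hard-coded `{hub_dir}/snakers4_silero-vad_master` path, so the entry points also work from a local clone loaded with `source='local'`.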
@@ -34,7 +46,7 @@ def silero_vad(onnx=False):
     return model, utils


-def silero_number_detector(onnx=False):
+def silero_number_detector(onnx=False, force_onnx_cpu=False):
     """Silero Number Detector
     Returns a model with a set of utils
     Please see https://github.com/snakers4/silero-vad for usage examples
@@ -43,7 +55,7 @@ def silero_number_detector(onnx=False):
         url = 'https://models.silero.ai/vad_models/number_detector.onnx'
     else:
         url = 'https://models.silero.ai/vad_models/number_detector.jit'
-    model = Validator(url)
+    model = Validator(url, force_onnx_cpu)
     utils = (get_number_ts,
              save_audio,
              read_audio,
@@ -53,7 +65,7 @@ def silero_number_detector(onnx=False):
     return model, utils


-def silero_lang_detector(onnx=False):
+def silero_lang_detector(onnx=False, force_onnx_cpu=False):
     """Silero Language Classifier
     Returns a model with a set of utils
     Please see https://github.com/snakers4/silero-vad for usage examples
@@ -62,30 +74,30 @@ def silero_lang_detector(onnx=False):
         url = 'https://models.silero.ai/vad_models/number_detector.onnx'
     else:
         url = 'https://models.silero.ai/vad_models/number_detector.jit'
-    model = Validator(url)
+    model = Validator(url, force_onnx_cpu)
     utils = (get_language,
              read_audio)

     return model, utils


-def silero_lang_detector_95(onnx=False):
+def silero_lang_detector_95(onnx=False, force_onnx_cpu=False):
     """Silero Language Classifier (95 languages)
     Returns a model with a set of utils
     Please see https://github.com/snakers4/silero-vad for usage examples
     """

-    hub_dir = torch.hub.get_dir()
     if onnx:
         url = 'https://models.silero.ai/vad_models/lang_classifier_95.onnx'
     else:
         url = 'https://models.silero.ai/vad_models/lang_classifier_95.jit'
-    model = Validator(url)
+    model = Validator(url, force_onnx_cpu)

-    with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_95.json', 'r') as f:
+    model_dir = os.path.join(os.path.dirname(__file__), 'files')
+    with open(os.path.join(model_dir, 'lang_dict_95.json'), 'r') as f:
         lang_dict = json.load(f)

-    with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_95.json', 'r') as f:
+    with open(os.path.join(model_dir, 'lang_group_dict_95.json'), 'r') as f:
         lang_group_dict = json.load(f)

     utils = (get_language_and_group, read_audio)
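The same flag threads through the auxiliary models. A hedged loading sketch (silero_lang_detector is used because its return shape, model plus utils, is visible in this hunk):

```python
import torch

model, utils = torch.hub.load('snakers4/silero-vad',
                              'silero_lang_detector',
                              onnx=True, force_onnx_cpu=True)
get_language, read_audio = utils
```

Validator downloads the weights to `inf.model` in the working directory at construction time, so the first call needs network access.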
(Jupyter notebook; file name not shown in this view)
@@ -138,7 +138,10 @@
     "\n",
     "window_size_samples = 1536 # number of samples in a single audio chunk\n",
     "for i in range(0, len(wav), window_size_samples):\n",
-    "    speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n",
+    "    chunk = wav[i: i+ window_size_samples]\n",
+    "    if len(chunk) < window_size_samples:\n",
+    "        break\n",
+    "    speech_dict = vad_iterator(chunk, return_seconds=True)\n",
     "    if speech_dict:\n",
     "        print(speech_dict, end=' ')\n",
     "vad_iterator.reset_states() # reset model states after each audio"
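This cell is stored as JSON source strings; assembled, it reads as below. The new guard matters because `_validate_input` in the utils_vad.py hunk further down rejects chunks shorter than `sr / 31.25` samples (512 samples at 16 kHz) with "Input audio chunk is too short", so an unpadded trailing chunk could raise. Earlier notebook cells are assumed to define `wav` and `vad_iterator`:

```python
# assumed from earlier cells:
#   wav = read_audio('example.wav', sampling_rate=16000)
#   vad_iterator = VADIterator(model)
window_size_samples = 1536  # number of samples in a single audio chunk
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        break  # drop the trailing chunk that is shorter than one window
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states()  # reset model states after each audio
```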
@@ -158,7 +161,10 @@
     "speech_probs = []\n",
     "window_size_samples = 1536\n",
     "for i in range(0, len(wav), window_size_samples):\n",
-    "    speech_prob = model(wav[i: i+ window_size_samples], SAMPLING_RATE).item()\n",
+    "    chunk = wav[i: i+ window_size_samples]\n",
+    "    if len(chunk) < window_size_samples:\n",
+    "        break\n",
+    "    speech_prob = model(chunk, SAMPLING_RATE).item()\n",
     "    speech_probs.append(speech_prob)\n",
     "vad_iterator.reset_states() # reset model states after each audio\n",
     "\n",
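The same guard appears in this probability-collection cell. If dropping the tail is undesirable, zero-padding it, as the new `audio_forward` helper in utils_vad.py does, is a hedged alternative:

```python
import torch.nn.functional as F

chunk = wav[i: i + window_size_samples]
if len(chunk) < window_size_samples:
    # pad instead of break: mirrors audio_forward's constant zero padding,
    # at the cost of biasing the final probability slightly toward silence
    chunk = F.pad(chunk, (0, window_size_samples - len(chunk)))
speech_prob = model(chunk, SAMPLING_RATE).item()
```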
utils_vad.py (106 changed lines)
@@ -9,51 +9,98 @@ languages = ['ru', 'en', 'de', 'es']

 class OnnxWrapper():

-    def __init__(self, path):
+    def __init__(self, path, force_onnx_cpu=False):
         import numpy as np
         global np
         import onnxruntime
-        self.session = onnxruntime.InferenceSession(path)
+        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
+            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'])
+        else:
+            self.session = onnxruntime.InferenceSession(path)
         self.session.intra_op_num_threads = 1
         self.session.inter_op_num_threads = 1

         self.reset_states()
+        self.sample_rates = [8000, 16000]

-    def reset_states(self):
-        self._h = np.zeros((2, 1, 64)).astype('float32')
-        self._c = np.zeros((2, 1, 64)).astype('float32')
-
-    def __call__(self, x, sr: int):
+    def _validate_input(self, x, sr: int):
         if x.dim() == 1:
             x = x.unsqueeze(0)
         if x.dim() > 2:
             raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")

-        if x.shape[0] > 1:
-            raise ValueError("Onnx model does not support batching")
+        if sr != 16000 and (sr % 16000 == 0):
+            step = sr // 16000
+            x = x[::step]
+            sr = 16000

-        if sr not in [16000]:
-            raise ValueError(f"Supported sample rates: {[16000]}")
+        if sr not in self.sample_rates:
+            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")

         if sr / x.shape[1] > 31.25:
             raise ValueError("Input audio chunk is too short")

-        ort_inputs = {'input': x.numpy(), 'h0': self._h, 'c0': self._c}
-        ort_outs = self.session.run(None, ort_inputs)
-        out, self._h, self._c = ort_outs
+        return x, sr

-        out = torch.tensor(out).squeeze(2)[:, 1]  # make output type match JIT analog
+    def reset_states(self, batch_size=1):
+        self._h = np.zeros((2, batch_size, 64)).astype('float32')
+        self._c = np.zeros((2, batch_size, 64)).astype('float32')
+        self._last_sr = 0
+        self._last_batch_size = 0
+
+    def __call__(self, x, sr: int):
+
+        x, sr = self._validate_input(x, sr)
+        batch_size = x.shape[0]
+
+        if not self._last_batch_size:
+            self.reset_states(batch_size)
+        if (self._last_sr) and (self._last_sr != sr):
+            self.reset_states(batch_size)
+        if (self._last_batch_size) and (self._last_batch_size != batch_size):
+            self.reset_states(batch_size)
+
+        if sr in [8000, 16000]:
+            ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr)}
+            ort_outs = self.session.run(None, ort_inputs)
+            out, self._h, self._c = ort_outs
+        else:
+            raise ValueError()
+
+        self._last_sr = sr
+        self._last_batch_size = batch_size
+
+        out = torch.tensor(out)
         return out

+    def audio_forward(self, x, sr: int, num_samples: int = 512):
+        outs = []
+        x, sr = self._validate_input(x, sr)
+
+        if x.shape[1] % num_samples:
+            pad_num = num_samples - (x.shape[1] % num_samples)
+            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)
+
+        self.reset_states(x.shape[0])
+        for i in range(0, x.shape[1], num_samples):
+            wavs_batch = x[:, i:i+num_samples]
+            out_chunk = self.__call__(wavs_batch, sr)
+            outs.append(out_chunk)
+
+        stacked = torch.cat(outs, dim=1)
+        return stacked.cpu()
+
+
 class Validator():
-    def __init__(self, url):
+    def __init__(self, url, force_onnx_cpu):
         self.onnx = True if url.endswith('.onnx') else False
         torch.hub.download_url_to_file(url, 'inf.model')
         if self.onnx:
             import onnxruntime
-            self.model = onnxruntime.InferenceSession('inf.model')
+            if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
+                self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider'])
+            else:
+                self.model = onnxruntime.InferenceSession('inf.model')
         else:
             self.model = init_jit_model(model_path='inf.model')
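A hedged sketch of the reworked ONNX path (the model path and tensor shapes are illustrative assumptions, not from the diff):

```python
import torch
from utils_vad import OnnxWrapper

model = OnnxWrapper('files/silero_vad.onnx', force_onnx_cpu=True)

# Batching now works: hidden states are allocated per batch size and reset
# automatically whenever the batch size or the sampling rate changes.
batch = torch.randn(4, 512)     # 4 chunks x 512 samples of noise at 16 kHz
probs = model(batch, 16000)     # one speech-probability row per batch item

# audio_forward zero-pads the tail and scores a whole waveform chunk by chunk.
wav = torch.randn(1, 16000)     # one second at 16 kHz
per_chunk = model.audio_forward(wav, 16000, num_samples=512)
```

Note that the sampling rate is now fed to the network (`'sr': np.array(sr)`) and the state tensors were renamed from `h0`/`c0` to `h`/`c`, so this wrapper requires the matching updated .onnx file, which explains the binary file changes above.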
@@ -117,7 +164,7 @@ def get_speech_timestamps(audio: torch.Tensor,
                           sampling_rate: int = 16000,
                           min_speech_duration_ms: int = 250,
                           min_silence_duration_ms: int = 100,
-                          window_size_samples: int = 1536,
+                          window_size_samples: int = 512,
                           speech_pad_ms: int = 30,
                           return_seconds: bool = False,
                           visualize_probs: bool = False):
@@ -177,8 +224,16 @@ def get_speech_timestamps(audio: torch.Tensor,
     if len(audio.shape) > 1:
         raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")

+    if sampling_rate > 16000 and (sampling_rate % 16000 == 0):
+        step = sampling_rate // 16000
+        sampling_rate = 16000
+        audio = audio[::step]
+        warnings.warn('Sampling rate is a multiply of 16000, casting to 16000 manually!')
+    else:
+        step = 1
+
     if sampling_rate == 8000 and window_size_samples > 768:
-        warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 1536 for 8000 sample rate!')
+        warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!')
     if window_size_samples not in [256, 512, 768, 1024, 1536]:
         warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')

@@ -226,7 +281,7 @@ def get_speech_timestamps(audio: torch.Tensor,
             triggered = False
             continue

-    if current_speech:
+    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
         current_speech['end'] = audio_length_samples
         speeches.append(current_speech)

@@ -239,7 +294,8 @@ def get_speech_timestamps(audio: torch.Tensor,
                 speech['end'] += int(silence_duration // 2)
                 speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2))
             else:
-                speech['end'] += int(speech_pad_samples)
+                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+                speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples))
         else:
             speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))

@@ -247,6 +303,10 @@ def get_speech_timestamps(audio: torch.Tensor,
         for speech_dict in speeches:
             speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
             speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
+    elif step > 1:
+        for speech_dict in speeches:
+            speech_dict['start'] *= step
+            speech_dict['end'] *= step

     if visualize_probs:
         make_visualization(speech_probs, window_size_samples / sampling_rate)
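With the decimation block above and this scale-back branch combined, rates that are multiples of 16000 now work end to end. A sketch (file name and rate are assumptions):

```python
# 48000 % 16000 == 0, so the audio is decimated to 16 kHz with a warning
# and the returned sample indices are scaled back to the 48 kHz grid.
wav = read_audio('speech_48k.wav', sampling_rate=48000)
speech_ts = get_speech_timestamps(wav, model, sampling_rate=48000)
print(speech_ts)  # [{'start': ..., 'end': ...}, ...] in 48 kHz sample offsets
```

The cast is plain slicing (`audio[::step]`) with no anti-aliasing low-pass filter, which is usually acceptable for VAD but worth knowing.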
@@ -353,6 +413,10 @@ class VADIterator:
         self.model = model
         self.threshold = threshold
         self.sampling_rate = sampling_rate
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
+
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
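A short sketch of the stricter constructor (the 44.1 kHz line is illustrative):

```python
vad_iterator = VADIterator(model, sampling_rate=16000)  # fine
vad_iterator = VADIterator(model, sampling_rate=8000)   # fine
# VADIterator(model, sampling_rate=44100)
# -> now raises ValueError at construction instead of failing later at inference
```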