mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 09:59:20 +08:00
Merge branch 'master' of https://github.com/snakers4/silero-vad into max_speech_duration_v4
This commit is contained in:
80
utils_vad.py
80
utils_vad.py
@@ -9,21 +9,21 @@ languages = ['ru', 'en', 'de', 'es']
|
||||
|
||||
class OnnxWrapper():
|
||||
|
||||
def __init__(self, path):
|
||||
def __init__(self, path, force_onnx_cpu=False):
|
||||
import numpy as np
|
||||
global np
|
||||
import onnxruntime
|
||||
self.session = onnxruntime.InferenceSession(path)
|
||||
if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
|
||||
self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'])
|
||||
else:
|
||||
self.session = onnxruntime.InferenceSession(path)
|
||||
self.session.intra_op_num_threads = 1
|
||||
self.session.inter_op_num_threads = 1
|
||||
|
||||
self.reset_states()
|
||||
self.sample_rates = [8000, 16000]
|
||||
|
||||
def reset_states(self):
|
||||
self._h = np.zeros((2, 1, 64)).astype('float32')
|
||||
self._c = np.zeros((2, 1, 64)).astype('float32')
|
||||
|
||||
def __call__(self, x, sr: int):
|
||||
def _validate_input(self, x, sr: int):
|
||||
if x.dim() == 1:
|
||||
x = x.unsqueeze(0)
|
||||
if x.dim() > 2:
|
||||
@@ -34,31 +34,73 @@ class OnnxWrapper():
|
||||
x = x[::step]
|
||||
sr = 16000
|
||||
|
||||
if x.shape[0] > 1:
|
||||
raise ValueError("Onnx model does not support batching")
|
||||
|
||||
if sr not in [16000]:
|
||||
raise ValueError(f"Supported sample rates: {[16000]}")
|
||||
if sr not in self.sample_rates:
|
||||
raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
|
||||
|
||||
if sr / x.shape[1] > 31.25:
|
||||
raise ValueError("Input audio chunk is too short")
|
||||
|
||||
ort_inputs = {'input': x.numpy(), 'h0': self._h, 'c0': self._c}
|
||||
ort_outs = self.session.run(None, ort_inputs)
|
||||
out, self._h, self._c = ort_outs
|
||||
return x, sr
|
||||
|
||||
out = torch.tensor(out).squeeze(2)[:, 1] # make output type match JIT analog
|
||||
def reset_states(self, batch_size=1):
|
||||
self._h = np.zeros((2, batch_size, 64)).astype('float32')
|
||||
self._c = np.zeros((2, batch_size, 64)).astype('float32')
|
||||
self._last_sr = 0
|
||||
self._last_batch_size = 0
|
||||
|
||||
def __call__(self, x, sr: int):
|
||||
|
||||
x, sr = self._validate_input(x, sr)
|
||||
batch_size = x.shape[0]
|
||||
|
||||
if not self._last_batch_size:
|
||||
self.reset_states(batch_size)
|
||||
if (self._last_sr) and (self._last_sr != sr):
|
||||
self.reset_states(batch_size)
|
||||
if (self._last_batch_size) and (self._last_batch_size != batch_size):
|
||||
self.reset_states(batch_size)
|
||||
|
||||
if sr in [8000, 16000]:
|
||||
ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr)}
|
||||
ort_outs = self.session.run(None, ort_inputs)
|
||||
out, self._h, self._c = ort_outs
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
self._last_sr = sr
|
||||
self._last_batch_size = batch_size
|
||||
|
||||
out = torch.tensor(out)
|
||||
return out
|
||||
|
||||
def audio_forward(self, x, sr: int, num_samples: int = 512):
|
||||
outs = []
|
||||
x, sr = self._validate_input(x, sr)
|
||||
|
||||
if x.shape[1] % num_samples:
|
||||
pad_num = num_samples - (x.shape[1] % num_samples)
|
||||
x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)
|
||||
|
||||
self.reset_states(x.shape[0])
|
||||
for i in range(0, x.shape[1], num_samples):
|
||||
wavs_batch = x[:, i:i+num_samples]
|
||||
out_chunk = self.__call__(wavs_batch, sr)
|
||||
outs.append(out_chunk)
|
||||
|
||||
stacked = torch.cat(outs, dim=1)
|
||||
return stacked.cpu()
|
||||
|
||||
|
||||
class Validator():
|
||||
def __init__(self, url):
|
||||
def __init__(self, url, force_onnx_cpu):
|
||||
self.onnx = True if url.endswith('.onnx') else False
|
||||
torch.hub.download_url_to_file(url, 'inf.model')
|
||||
if self.onnx:
|
||||
import onnxruntime
|
||||
self.model = onnxruntime.InferenceSession('inf.model')
|
||||
if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
|
||||
self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider'])
|
||||
else:
|
||||
self.model = onnxruntime.InferenceSession('inf.model')
|
||||
else:
|
||||
self.model = init_jit_model(model_path='inf.model')
|
||||
|
||||
@@ -123,7 +165,7 @@ def get_speech_timestamps(audio: torch.Tensor,
|
||||
min_speech_duration_ms: int = 250,
|
||||
max_speech_duration_s: float = float('inf'),
|
||||
min_silence_duration_ms: int = 100,
|
||||
window_size_samples: int = 1536,
|
||||
window_size_samples: int = 512,
|
||||
speech_pad_ms: int = 30,
|
||||
return_seconds: bool = False,
|
||||
visualize_probs: bool = False):
|
||||
|
||||
Reference in New Issue
Block a user