diff --git a/README.md b/README.md
index 3044536..e0c6a94 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,8 @@ The models are small enough to be included directly into this repository. Newer
 | `'silero_vad'` | 1.1M | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_vad_micro'` | 10K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_vad_micro_8k'` | 10K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
+| `'silero_vad_mini'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
+| `'silero_vad_mini_8k'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_number_detector'` | 1.1M | Number Detector | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
 | `'silero_lang_detector'` | 1.1M | Language Classifier | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
@@ -84,6 +86,7 @@ What models do:
 | `v2` | 2021-01-11 | Add Language Classifier heads (en, ru, de, es) |
 | `v2.1` | 2021-02-11 | Add micro (10k params) VAD models |
 | `v2.2` | 2021-03-22 | Add micro 8000 sample rate VAD models |
+| `v2.3` | 2021-04-12 | Add mini (100k params) VAD models (8k and 16k sample rate) + **new** adaptive utils for full audio and single audio stream |
 
 ### PyTorch
diff --git a/files/model_mini.jit b/files/model_mini.jit
new file mode 100644
index 0000000..4dee04e
Binary files /dev/null and b/files/model_mini.jit differ
diff --git a/files/model_mini.onnx b/files/model_mini.onnx
new file mode 100644
index 0000000..2a33c2f
Binary files /dev/null and b/files/model_mini.onnx differ
diff --git a/files/model_mini_8k.jit b/files/model_mini_8k.jit
new file mode 100644
index 0000000..aa1e980
Binary files /dev/null and b/files/model_mini_8k.jit differ
diff --git a/files/model_mini_8k.onnx b/files/model_mini_8k.onnx
new file mode 100644
index 0000000..4dd1867
Binary files /dev/null and b/files/model_mini_8k.onnx differ
diff --git a/hubconf.py b/hubconf.py
index 64a8c0a..a45ca4d 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -2,6 +2,7 @@ dependencies = ['torch', 'torchaudio']
 import torch
 from utils_vad import (init_jit_model,
                        get_speech_ts,
+                       get_speech_ts_adaptive,
                        get_number_ts,
                        get_language,
                        save_audio,
@@ -20,6 +21,7 @@ def silero_vad(**kwargs):
     hub_dir = torch.hub.get_dir()
     model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model.jit')
     utils = (get_speech_ts,
+             get_speech_ts_adaptive,
              save_audio,
              read_audio,
              state_generator,
@@ -37,6 +39,7 @@ def silero_vad_micro(**kwargs):
     hub_dir = torch.hub.get_dir()
     model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_micro.jit')
     utils = (get_speech_ts,
+             get_speech_ts_adaptive,
              save_audio,
              read_audio,
              state_generator,
@@ -54,6 +57,43 @@ def silero_vad_micro_8k(**kwargs):
     hub_dir = torch.hub.get_dir()
     model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_micro_8k.jit')
     utils = (get_speech_ts,
+             get_speech_ts_adaptive,
+             save_audio,
+             read_audio,
+             state_generator,
+             single_audio_stream,
+             collect_chunks)
+
+    return model, utils
+
+
+def silero_vad_mini(**kwargs):
+    """Silero Voice Activity Detector
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    hub_dir = torch.hub.get_dir()
+    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_mini.jit')
+    utils = (get_speech_ts,
+             get_speech_ts_adaptive,
+             save_audio,
+             read_audio,
+             state_generator,
+             single_audio_stream,
+             collect_chunks)
+
+    return model, utils
+
+
+def silero_vad_mini_8k(**kwargs):
+    """Silero Voice Activity Detector
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    hub_dir = torch.hub.get_dir()
+    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_mini_8k.jit')
+    utils = (get_speech_ts,
+             get_speech_ts_adaptive,
              save_audio,
              read_audio,
              state_generator,
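The new entry points are consumed exactly like the existing ones. A minimal loading sketch (the repo path and entry-point names follow hubconf.py above; `force_reload` is optional, and the utils tuple order matches the code):

```python
import torch

# Load the new 100k-parameter model together with the extended utils tuple.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad_mini',   # or 'silero_vad_mini_8k'
                              force_reload=True)

# get_speech_ts_adaptive is now the second element of the utils tuple.
(get_speech_ts,
 get_speech_ts_adaptive,
 save_audio,
 read_audio,
 state_generator,
 single_audio_stream,
 collect_chunks) = utils
```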
diff --git a/utils_vad.py b/utils_vad.py
index 296cbf3..c4fd682 100644
--- a/utils_vad.py
+++ b/utils_vad.py
@@ -12,6 +12,30 @@ torchaudio.set_audio_backend("soundfile")  # switch backend
 languages = ['ru', 'en', 'de', 'es']
 
 
+class IterativeMedianMeter():
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.median = 0
+        self.counts = {}
+        for i in range(0, 101, 1):
+            self.counts[i / 100] = 0
+        self.total_values = 0
+
+    def __call__(self, val):
+        self.total_values += 1
+        rounded = round(abs(val), 2)
+        self.counts[rounded] += 1
+        bin_sum = 0
+        for j in self.counts:
+            bin_sum += self.counts[j]
+            if bin_sum >= self.total_values / 2:
+                self.median = j
+                break
+        return self.median
+
+
 def validate(model,
              inputs: torch.Tensor):
     with torch.no_grad():
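`IterativeMedianMeter` keeps a histogram of all probabilities seen so far at 0.01 resolution and returns the running median on every call; the adaptive utils below derive their trigger thresholds from that median. A quick sketch with made-up probabilities:

```python
from utils_vad import IterativeMedianMeter

meter = IterativeMedianMeter()

# Feed probabilities one by one; each call returns the median of everything
# seen so far, binned at 0.01 resolution (the values below are made up).
for prob in [0.03, 0.05, 0.91, 0.88, 0.02, 0.04]:
    median = meter(prob)

# The adaptive utils derive their thresholds from this running median:
trig_sum = 0.89 * median + 0.08   # speech trigger (0.08 at median 0, 0.97 at median 1)
neg_trig_sum = 0.6 * median       # non-speech trigger used by the streaming iterator
```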
@@ -130,6 +154,127 @@ def get_speech_ts(wav: torch.Tensor,
     return speeches
 
 
+def get_speech_ts_adaptive(wav: torch.Tensor,
+                           model,
+                           batch_size: int = 200,
+                           step: int = 500,
+                           num_samples_per_window: int = 4000,  # Number of samples per audio chunk to feed to NN (4000 for 16k SR, 2000 for 8k SR is optimal)
+                           min_speech_samples: int = 10000,  # samples
+                           min_silence_samples: int = 4000,
+                           speech_pad_samples: int = 2000,
+                           run_function=validate,
+                           visualize_probs=False,
+                           device='cpu'):
+    """
+    This function is used for splitting long audios into speech chunks using silero VAD
+    Attention! All sample-count defaults are optimal for the 16000 sample rate model; if you are using the 8000 sample rate model, the optimal values are half as much!
+
+    Parameters
+    ----------
+    batch_size: int
+        batch size to feed to silero VAD (default - 200)
+
+    step: int
+        step size in samples (default - 500)
+
+    num_samples_per_window: int
+        window size in samples (chunk length in samples to feed to NN, default - 4000)
+
+    min_speech_samples: int
+        if speech duration is shorter than this value, do not consider it speech (default - 10000)
+
+    min_silence_samples: int
+        number of samples to wait before considering it the end of speech (default - 4000)
+
+    speech_pad_samples: int
+        widen speech by this amount of samples on each side (default - 2000)
+
+    run_function: function
+        function to use for the model call
+
+    visualize_probs: bool
+        whether to draw the smoothed probability plot or not (default - False)
+
+    device: string
+        torch device to use for the model call (default - "cpu")
+
+    Returns
+    ----------
+    speeches: list
+        list containing the beginnings and ends of speech chunks (in samples)
+    """
+
+    num_samples = num_samples_per_window
+    num_steps = int(num_samples / step)
+    assert min_silence_samples >= step
+    outs = []
+    to_concat = []
+    for i in range(0, len(wav), step):
+        chunk = wav[i: i+num_samples]
+        if len(chunk) < num_samples:
+            chunk = F.pad(chunk, (0, num_samples - len(chunk)))
+        to_concat.append(chunk.unsqueeze(0))
+        if len(to_concat) >= batch_size:
+            chunks = torch.Tensor(torch.cat(to_concat, dim=0)).to(device)
+            out = run_function(model, chunks)
+            outs.append(out)
+            to_concat = []
+
+    if to_concat:
+        chunks = torch.Tensor(torch.cat(to_concat, dim=0)).to(device)
+        out = run_function(model, chunks)
+        outs.append(out)
+
+    outs = torch.cat(outs, dim=0).cpu()
+
+    buffer = deque(maxlen=num_steps)
+    triggered = False
+    speeches = []
+    smoothed_probs = []
+    current_speech = {}
+    speech_probs = outs[:, 1]  # 0 index for silence probs, 1 index for speech probs
+    median_probs = speech_probs.median()
+
+    trig_sum = 0.89 * median_probs + 0.08  # 0.08 when median is zero, 0.97 when median is 1
+
+    temp_end = 0
+    for i, predict in enumerate(speech_probs):
+        buffer.append(predict)
+        smoothed_prob = max(buffer)
+        if visualize_probs:
+            smoothed_probs.append(float(smoothed_prob))
+        if (smoothed_prob >= trig_sum) and temp_end:
+            temp_end = 0
+        if (smoothed_prob >= trig_sum) and not triggered:
+            triggered = True
+            current_speech['start'] = step * max(0, i-num_steps)
+            continue
+        if (smoothed_prob < trig_sum) and triggered:
+            if not temp_end:
+                temp_end = step * i
+            if step * i - temp_end < min_silence_samples:
+                continue
+            else:
+                current_speech['end'] = temp_end
+                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                    speeches.append(current_speech)
+                temp_end = 0
+                current_speech = {}
+                triggered = False
+                continue
+    if current_speech:
+        current_speech['end'] = len(wav)
+        speeches.append(current_speech)
+    if visualize_probs:
+        pd.DataFrame({'probs': smoothed_probs}).plot(figsize=(16, 8))
+
+    for ts in speeches:
+        ts['start'] = max(0, ts['start'] - speech_pad_samples)
+        ts['end'] += speech_pad_samples
+
+    return speeches
+
+
 def get_number_ts(wav: torch.Tensor,
                   model,
                   model_stride=8,
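A minimal full-audio sketch, assuming `model`, `read_audio` and `get_speech_ts_adaptive` come from the hub call shown earlier and `test.wav` is a placeholder 16 kHz mono file:

```python
# 'model', 'read_audio' and 'get_speech_ts_adaptive' come from the utils tuple above;
# 'test.wav' is a placeholder for a 16 kHz mono file.
wav = read_audio('test.wav')

# No manually tuned threshold is needed: the trigger level is derived from the
# median speech probability over the whole file.
speech_timestamps = get_speech_ts_adaptive(wav, model)
# -> e.g. [{'start': ..., 'end': ...}, ...] with positions in samples

# For the 8k models, halve the sample-count defaults, e.g.
# num_samples_per_window=2000, min_speech_samples=5000,
# min_silence_samples=2000, speech_pad_samples=1000.
```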
@@ -234,6 +379,109 @@ class VADiterator:
         return current_speech, self.current_name
 
 
+class VADiteratorAdaptive:
+    def __init__(self,
+                 trig_sum: float = 0.26,
+                 neg_trig_sum: float = 0.06,
+                 step: int = 500,
+                 num_samples_per_window: int = 4000,
+                 speech_pad_samples: int = 1000,
+                 accum_period: int = 50):
+        """
+        This class is used for streaming silero VAD usage
+
+        Parameters
+        ----------
+        trig_sum: float
+            trigger value for speech probabilities: probs above this value are considered speech, switching to the TRIGGERED state (default - 0.26)
+
+        neg_trig_sum: float
+            in the triggered state, probabilities below this value are considered non-speech, switching to the NONTRIGGERED state (default - 0.06)
+
+        step: int
+            step size in samples (default - 500)
+
+        num_samples_per_window: int
+            window size in samples (chunk length in samples to feed to NN, default - 4000)
+
+        speech_pad_samples: int
+            widen speech by this amount of samples on each side (default - 1000)
+
+        accum_period: int
+            number of chunks (iterations) to wait before switching from the constant (initial) trig and neg_trig coefficients to the adaptive median-based coefficients (default - 50)
+
+        """
+        self.num_samples = num_samples_per_window
+        self.num_steps = int(num_samples_per_window / step)
+        self.step = step
+        self.prev = torch.zeros(self.num_samples)
+        self.last = False
+        self.triggered = False
+        self.buffer = deque(maxlen=self.num_steps)
+        self.num_frames = 0
+        self.trig_sum = trig_sum
+        self.neg_trig_sum = neg_trig_sum
+        self.current_name = ''
+        self.median_meter = IterativeMedianMeter()
+        self.median = 0
+        self.total_steps = 0
+        self.accum_period = accum_period
+        self.speech_pad_samples = speech_pad_samples
+
+    def refresh(self):
+        self.prev = torch.zeros(self.num_samples)
+        self.last = False
+        self.triggered = False
+        self.buffer = deque(maxlen=self.num_steps)
+        self.num_frames = 0
+        self.median_meter.reset()
+        self.median = 0
+        self.total_steps = 0
+
+    def prepare_batch(self, wav_chunk, name=None):
+        if (name is not None) and (name != self.current_name):
+            self.refresh()
+            self.current_name = name
+        assert len(wav_chunk) <= self.num_samples
+        self.num_frames += len(wav_chunk)
+        if len(wav_chunk) < self.num_samples:
+            wav_chunk = F.pad(wav_chunk, (0, self.num_samples - len(wav_chunk)))  # short chunk => eof audio
+            self.last = True
+
+        stacked = torch.cat([self.prev, wav_chunk])
+        self.prev = wav_chunk
+
+        overlap_chunks = [stacked[i:i+self.num_samples].unsqueeze(0)
+                          for i in range(self.step, self.num_samples+1, self.step)]
+        return torch.cat(overlap_chunks, dim=0)
+
+    def state(self, model_out):
+        current_speech = {}
+        speech_probs = model_out[:, 1]  # 0 index for silence probs, 1 index for speech probs
+        for i, predict in enumerate(speech_probs):
+            self.median = self.median_meter(predict.item())
+            if self.total_steps < self.accum_period:
+                trig_sum = self.trig_sum
+                neg_trig_sum = self.neg_trig_sum
+            else:
+                trig_sum = 0.89 * self.median + 0.08  # 0.08 when median is zero, 0.97 when median is 1
+                neg_trig_sum = 0.6 * self.median
+            self.total_steps += 1
+            self.buffer.append(predict)
+            smoothed_prob = max(self.buffer)
+            if (smoothed_prob >= trig_sum) and not self.triggered:
+                self.triggered = True
+                current_speech[max(0, self.num_frames - (self.num_steps-i) * self.step - self.speech_pad_samples)] = 'start'
+            if (smoothed_prob < neg_trig_sum) and self.triggered:
+                current_speech[self.num_frames - (self.num_steps-i) * self.step + self.speech_pad_samples] = 'end'
+                self.triggered = False
+        if self.triggered and self.last:
+            current_speech[self.num_frames] = 'end'
+        if self.last:
+            self.refresh()
+        return current_speech, self.current_name
+
+
 def state_generator(model,
                     audios: List[str],
                     onnx: bool = False,
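The adaptive iterator can also be driven manually, chunk by chunk. A hedged sketch (`wav` and `model` as in the previous examples; 4000 samples per chunk matches the default `num_samples_per_window` for the 16k models):

```python
from utils_vad import VADiteratorAdaptive, validate

vad_iter = VADiteratorAdaptive()  # defaults target the 16k models

# 'wav' and 'model' as in the previous examples.
for i in range(0, len(wav), 4000):
    chunk = wav[i:i + 4000]                 # last chunk may be shorter; prepare_batch pads it
    batch = vad_iter.prepare_batch(chunk)
    out = validate(model, batch)
    speech_dict, _ = vad_iter.state(out)
    if speech_dict:
        print(speech_dict)                  # e.g. {52000: 'start'} or {64000: 'end'}, in samples
```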
@@ -296,25 +544,27 @@ def stream_imitator(audios: List[str],
 
 def single_audio_stream(model,
-                        audio: str,
-                        onnx: bool = False,
-                        trig_sum: float = 0.26,
-                        neg_trig_sum: float = 0.07,
-                        num_steps: int = 8,
-                        num_samples_per_window: int = 4000,
-                        run_function=validate):
+                        audio: torch.Tensor,
+                        num_samples_per_window: int = 4000,
+                        run_function=validate,
+                        iterator_type='basic',
+                        **kwargs):
+
     num_samples = num_samples_per_window
-    VADiter = VADiterator(trig_sum, neg_trig_sum, num_steps, num_samples_per_window)
+    if iterator_type == 'basic':
+        VADiter = VADiterator(num_samples_per_window=num_samples_per_window, **kwargs)
+    elif iterator_type == 'adaptive':
+        VADiter = VADiteratorAdaptive(num_samples_per_window=num_samples_per_window, **kwargs)
+
     wav = read_audio(audio)
     wav_chunks = iter([wav[i:i+num_samples] for i in range(0, len(wav), num_samples)])
 
     for chunk in wav_chunks:
         batch = VADiter.prepare_batch(chunk)
 
         outs = run_function(model, batch)
-        vad_outs = outs  # this is very misleading
 
         states = []
-        state = VADiter.state(vad_outs)
+        state = VADiter.state(outs)
         if state[0]:
             states.append(state[0])
         yield states
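After the refactor the streaming helper picks the iterator via `iterator_type` and forwards any extra keyword arguments to it. Note that the body still calls `read_audio(audio)`, so a file path is what the code as written expects despite the new `torch.Tensor` annotation. A minimal sketch (placeholder path, `model` and `single_audio_stream` as loaded above):

```python
# 'model' and 'single_audio_stream' come from the utils tuple unpacked earlier;
# 'test.wav' is a placeholder path. Extra keyword arguments (e.g. accum_period)
# are forwarded to the chosen iterator.
for speech_events in single_audio_stream(model, 'test.wav',
                                         iterator_type='adaptive',
                                         accum_period=50):
    if speech_events:
        print(speech_events)  # e.g. [{52000: 'start'}], positions in samples
```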