diff --git a/silero-vad.ipynb b/silero-vad.ipynb
index ec9f8b1..aefc8b5 100644
--- a/silero-vad.ipynb
+++ b/silero-vad.ipynb
@@ -2,14 +2,25 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:14:25.443732Z",
-     "start_time": "2020-12-11T14:14:24.835612Z"
+     "end_time": "2020-12-11T15:10:52.128138Z",
+     "start_time": "2020-12-11T15:10:51.548322Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
+      " warnings.warn(\n",
+      "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
+      " warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "import glob\n",
     "import torch\n",
@@ -18,7 +29,7 @@
     "# import torch.nn.functional as F\n",
     "from IPython.display import Audio\n",
     "torch.set_num_threads(1)\n",
-    "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
+    "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator, single_audio_stream\n",
     "extractor = STFTExtractor()"
    ]
   },
@@ -26,16 +37,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Full audio example"
+    "# Full audio example"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:19:25.895033Z",
-     "start_time": "2020-12-11T14:19:25.891112Z"
+     "end_time": "2020-12-11T14:25:05.274301Z",
+     "start_time": "2020-12-11T14:25:05.271313Z"
     }
    },
    "outputs": [],
@@ -49,11 +60,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:19:41.758975Z",
-     "start_time": "2020-12-11T14:19:41.522818Z"
+     "end_time": "2020-12-11T14:25:06.395183Z",
+     "start_time": "2020-12-11T14:25:06.082595Z"
     }
    },
    "outputs": [],
@@ -63,67 +74,58 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:19:52.024425Z",
-     "start_time": "2020-12-11T14:19:51.978279Z"
+     "end_time": "2020-12-11T14:25:25.523423Z",
+     "start_time": "2020-12-11T14:25:25.493581Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       " \n",
+       " "
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "Audio('files/test_audio_6.wav')"
+    "Audio('files/test_audio_8.wav')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:20:12.363579Z",
-     "start_time": "2020-12-11T14:20:12.346354Z"
+     "end_time": "2020-12-11T14:25:43.023784Z",
+     "start_time": "2020-12-11T14:25:43.017360Z"
     }
    },
    "outputs": [],
    "source": [
-    "wav = read_audio('files/test_audio_6.wav')"
+    "wav = read_audio('files/test_audio_8.wav')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:20:49.910862Z",
-     "start_time": "2020-12-11T14:20:49.906902Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "torch.__version__"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2020-12-11T14:20:42.130546Z",
-     "start_time": "2020-12-11T14:20:42.122245Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "torch.vstack"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2020-12-11T14:20:28.888271Z",
-     "start_time": "2020-12-11T14:20:28.787459Z"
+     "end_time": "2020-12-11T14:25:45.083872Z",
+     "start_time": "2020-12-11T14:25:43.371366Z"
     }
    },
    "outputs": [],
@@ -133,47 +135,145 @@
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T13:32:11.698816Z",
-     "start_time": "2020-12-11T13:32:11.671735Z"
+     "end_time": "2020-12-11T14:25:45.130371Z",
+     "start_time": "2020-12-11T14:25:45.091010Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       " \n",
+       " "
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
     "Audio('only_speech.wav')"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
-   "source": [
-    "## Stream example"
-   ]
+   "outputs": [],
+   "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Single stream example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T14:22:54.451814Z",
-     "start_time": "2020-12-11T14:22:54.211738Z"
+     "end_time": "2020-12-11T15:10:55.789272Z",
+     "start_time": "2020-12-11T15:10:55.543652Z"
     }
    },
    "outputs": [],
    "source": [
-    "!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar"
+    "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')\n",
+    "audio = 'files/test_audio_6.wav'"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-12-11T15:10:59.503301Z",
+     "start_time": "2020-12-11T15:10:55.790671Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.8/site-packages/torch/functional.py:515: UserWarning: stft will require the return_complex parameter be explicitly specified in a future PyTorch release. Use return_complex=False to preserve the current behavior or return_complex=True to return a complex output. (Triggered internally at /opt/conda/conda-bld/pytorch_1603729096996/work/aten/src/ATen/native/SpectralOps.cpp:653.)\n",
+      " return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore\n",
+      "/opt/conda/lib/python3.8/site-packages/torch/functional.py:515: UserWarning: The function torch.rfft is deprecated and will be removed in a future PyTorch release. Use the new torch.fft module functions, instead, by importing torch.fft and calling torch.fft.fft or torch.fft.rfft. (Triggered internally at /opt/conda/conda-bld/pytorch_1603729096996/work/aten/src/ATen/native/SpectralOps.cpp:590.)\n",
+      " return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{183500: 'start'}]\n",
+      "[{202500: 'end'}]\n",
+      "[{226500: 'start'}]\n",
+      "[{283500: 'end'}]\n",
+      "[{337500: 'start'}]\n",
+      "[{503000: 'end'}]\n",
+      "[{507500: 'start'}]\n",
+      "[{627500: 'end'}]\n",
+      "[{631500: 'start'}]\n",
+      "[{927488: 'end'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in single_audio_stream(model, audio, extractor):\n",
+    "    if i:\n",
+    "        print(i)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Multiple stream example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T13:31:34.137062Z",
-     "start_time": "2020-12-11T13:31:33.957092Z"
+     "end_time": "2020-12-11T14:28:09.649303Z",
+     "start_time": "2020-12-11T14:28:09.373634Z"
     }
    },
    "outputs": [],
@@ -183,14 +283,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T13:31:36.332200Z",
-     "start_time": "2020-12-11T13:31:36.328087Z"
+     "end_time": "2020-12-11T14:28:12.273951Z",
+     "start_time": "2020-12-11T14:28:12.269729Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "audios_for_stream = glob.glob('files/test*.wav')\n",
     "len(audios_for_stream)"
@@ -198,19 +309,113 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-11T13:31:52.668041Z",
-     "start_time": "2020-12-11T13:31:37.357340Z"
+     "end_time": "2020-12-11T14:28:32.459872Z",
+     "start_time": "2020-12-11T14:28:14.502871Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done initial Loading\n",
+      "[({106500: 'start'}, 'files/test_audio_1.wav')]\n",
+      "[({174000: 'start'}, 'files/test_audio_3.wav')]\n",
+      "[({261000: 'end'}, 'files/test_audio_1.wav')]\n",
+      "Loading next wav: files/test_audio_7.wav\n",
+      "[({134000: 'start'}, 'files/test_audio_7.wav')]\n",
+      "[({147500: 'end'}, 'files/test_audio_7.wav')]\n",
+      "[({442000: 'end'}, 'files/test_audio_3.wav')]\n",
+      "[({450500: 'start'}, 'files/test_audio_3.wav')]\n",
+      "[({209500: 'start'}, 'files/test_audio_7.wav')]\n",
+      "[({519500: 'end'}, 'files/test_audio_3.wav')]\n",
+      "[({533500: 'start'}, 'files/test_audio_3.wav')]\n",
+      "[({599904: 'end'}, 'files/test_audio_3.wav')]\n",
+      "Loading next wav: files/test_audio_6.wav\n",
+      "[({183500: 'start'}, 'files/test_audio_6.wav')]\n",
+      "[({503500: 'end'}, 'files/test_audio_7.wav')]\n",
+      "[({202500: 'end'}, 'files/test_audio_6.wav')]\n",
+      "[({537500: 'start'}, 'files/test_audio_7.wav')]\n",
+      "[({226500: 'start'}, 'files/test_audio_6.wav')]\n",
+      "[({283500: 'end'}, 'files/test_audio_6.wav')]\n",
+      "[({616500: 'end'}, 'files/test_audio_7.wav')]\n",
+      "[({337500: 'start'}, 'files/test_audio_6.wav')]\n",
+      "[({661500: 'start'}, 'files/test_audio_7.wav')]\n",
+      "[({785000: 'end'}, 'files/test_audio_7.wav')]\n",
+      "[({503000: 'end'}, 'files/test_audio_6.wav')]\n",
+      "[({507500: 'start'}, 'files/test_audio_6.wav')]\n",
+      "[({851500: 'start'}, 'files/test_audio_7.wav')]\n",
+      "[({919000: 'end'}, 'files/test_audio_7.wav')]\n",
+      "Loading next wav: files/test_audio_5.wav\n",
+      "[({627500: 'end'}, 'files/test_audio_6.wav')]\n",
+      "[({631500: 'start'}, 'files/test_audio_6.wav')]\n",
+      "[({151000: 'start'}, 'files/test_audio_5.wav')]\n",
+      "[({169000: 'end'}, 'files/test_audio_5.wav')]\n",
+      "[({211000: 'start'}, 'files/test_audio_5.wav')]\n",
+      "[({221500: 'end'}, 'files/test_audio_5.wav')]\n",
+      "Loading next wav: files/test_audio_2.wav\n",
+      "[({927488: 'end'}, 'files/test_audio_6.wav')]\n",
+      "Loading next wav: files/test_audio_8.wav\n",
+      "[({228000: 'start'}, 'files/test_audio_2.wav')]\n",
+      "[({179500: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({241500: 'end'}, 'files/test_audio_2.wav')]\n",
+      "[({279000: 'start'}, 'files/test_audio_2.wav')]\n",
+      "[({274500: 'end'}, 'files/test_audio_8.wav')]\n",
+      "[({300500: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({369500: 'end'}, 'files/test_audio_2.wav')]\n",
+      "[({378500: 'start'}, 'files/test_audio_2.wav')]\n",
+      "[({436500: 'end'}, 'files/test_audio_2.wav')]\n",
+      "[({423000: 'end'}, 'files/test_audio_8.wav')]\n",
+      "[({488500: 'start'}, 'files/test_audio_2.wav')]\n",
+      "[({458500: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({599904: 'end'}, 'files/test_audio_2.wav')]\n",
+      "Loading next wav: files/test_audio_4.wav\n",
+      "[({583500: 'end'}, 'files/test_audio_8.wav')]\n",
+      "[({599500: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({632500: 'end'}, 'files/test_audio_8.wav')]\n",
+      "[({660000: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({737000: 'end'}, 'files/test_audio_8.wav')]\n",
+      "[({761000: 'start'}, 'files/test_audio_8.wav')]\n",
+      "[({249500: 'start'}, 'files/test_audio_4.wav')]\n",
+      "[({257168: 'end'}, 'files/test_audio_4.wav')]\n",
+      "Loading next wav: files/test_audio_9.wav\n",
+      "[({843000: 'end'}, 'files/test_audio_8.wav')]\n",
+      "Loading next wav: files/test_audio_0.wav\n",
+      "[({133000: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({143500: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({272000: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({256500: 'start'}, 'files/test_audio_0.wav')]\n",
+      "[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n",
+      "[({406500: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({460000: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({476000: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({494500: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({544500: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({564500: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({595000: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({682000: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({728500: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({786000: 'end'}, 'files/test_audio_9.wav')]\n",
+      "[({814000: 'start'}, 'files/test_audio_9.wav')]\n",
+      "[({826000: 'end'}, 'files/test_audio_9.wav')]\n"
+     ]
+    }
+   ],
    "source": [
     "for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
     "    if i:\n",
     "        print(i)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -229,7 +434,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.8.3"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/utils.py b/utils.py
index 30ee9a5..a4e05ed 100644
--- a/utils.py
+++ b/utils.py
@@ -77,15 +77,16 @@ def get_speech_ts(wav, model, extractor,
                   trig_sum=0.25, neg_trig_sum=0.01,
                   num_steps=8, batch_size=200):
-    assert 4000 % num_steps == 0
-    step = int(4000 / num_steps)  # stride / hop
+    num_samples = 4000
+    assert num_samples % num_steps == 0
+    step = int(num_samples / num_steps)  # stride / hop
     outs = []
     to_concat = []
     for i in range(0, len(wav), step):
-        chunk = wav[i: i+4000]
-        if len(chunk) < 4000:
-            chunk = F.pad(chunk, (0, 4000 - len(chunk)))
+        chunk = wav[i: i+num_samples]
+        if len(chunk) < num_samples:
+            chunk = F.pad(chunk, (0, num_samples - len(chunk)))
         to_concat.append(chunk)
         if len(to_concat) >= batch_size:
             chunks = torch.Tensor(torch.vstack(to_concat))
@@ -107,7 +108,8 @@

     speeches = []
     current_speech = {}
-    for i, predict in enumerate(outs[:, 1]):  # add name
+    speech_probs = outs[:, 1]
+    for i, predict in enumerate(speech_probs):  # add name
         buffer.append(predict)
         if (np.mean(buffer) >= trig_sum) and not triggered:
             triggered = True
@@ -158,44 +160,46 @@
 class VADiterator:
     def __init__(self,
                  trig_sum=0.26, neg_trig_sum=0.01,
                  num_steps=8):
+        self.num_samples = 4000
         self.num_steps = num_steps
-        assert 4000 % num_steps == 0
-        self.step = int(4000 / num_steps)
-        self.prev = torch.zeros(4000)
+        assert self.num_samples % num_steps == 0
+        self.step = int(self.num_samples / num_steps)
+        self.prev = torch.zeros(self.num_samples)
         self.last = False
         self.triggered = False
-        self.buffer = deque(maxlen=8)
+        self.buffer = deque(maxlen=num_steps)
         self.num_frames = 0
         self.trig_sum = trig_sum
         self.neg_trig_sum = neg_trig_sum
         self.current_name = ''

     def refresh(self):
-        self.prev = torch.zeros(4000)
+        self.prev = torch.zeros(self.num_samples)
         self.last = False
         self.triggered = False
-        self.buffer = deque(maxlen=8)
+        self.buffer = deque(maxlen=self.num_steps)
         self.num_frames = 0

     def prepare_batch(self, wav_chunk, name=None):
         if (name is not None) and (name != self.current_name):
             self.refresh()
             self.current_name = name
-        assert len(wav_chunk) <= 4000
+        assert len(wav_chunk) <= self.num_samples
         self.num_frames += len(wav_chunk)
-        if len(wav_chunk) < 4000:
-            wav_chunk = F.pad(wav_chunk, (0, 4000 - len(wav_chunk)))  # assume that short chunk means end of the audio
+        if len(wav_chunk) < self.num_samples:
+            wav_chunk = F.pad(wav_chunk, (0, self.num_samples - len(wav_chunk)))  # assume that short chunk means end of the audio
             self.last = True
         stacked = torch.hstack([self.prev, wav_chunk])
         self.prev = wav_chunk
-        overlap_chunks = [stacked[i:i+4000] for i in range(self.step, 4001, self.step)]  # 500 step is good enough
+        overlap_chunks = [stacked[i:i+self.num_samples] for i in range(self.step, self.num_samples+1, self.step)]  # 500 step is good enough
         return torch.vstack(overlap_chunks)

     def state(self, model_out):
         current_speech = {}
-        for i, predict in enumerate(model_out[:, 1]):  # add name
+        speech_probs = model_out[:, 1]
+        for i, predict in enumerate(speech_probs):  # add name
             self.buffer.append(predict)
             if (np.mean(self.buffer) >= self.trig_sum) and not self.triggered:
                 self.triggered = True
@@ -236,14 +240,15 @@ def state_generator(model, audios, extractor,
         yield states


-def stream_imitator(stereo, audios_in_stream):
-    stereo_iter = iter(stereo)
+def stream_imitator(audios, audios_in_stream):
+    audio_iter = iter(audios)
     iterators = []
+    num_samples = 4000
     # initial wavs
     for i in range(audios_in_stream):
-        next_wav = next(stereo_iter)
+        next_wav = next(audio_iter)
         wav = read_audio(next_wav)
-        wav_chunks = iter([(wav[i:i+4000], next_wav) for i in range(0, len(wav), 4000)])
+        wav_chunks = iter([(wav[i:i+num_samples], next_wav) for i in range(0, len(wav), num_samples)])
         iterators.append(wav_chunks)
     print('Done initial Loading')
     good_iters = audios_in_stream
@@ -254,16 +259,40 @@
                 out, wav_name = next(it)
             except StopIteration:
                 try:
-                    next_wav = next(stereo_iter)
+                    next_wav = next(audio_iter)
                     print('Loading next wav: ', next_wav)
                     wav = read_audio(next_wav)
-                    iterators[i] = iter([(wav[i:i+4000], next_wav) for i in range(0, len(wav), 4000)])
+                    iterators[i] = iter([(wav[i:i+num_samples], next_wav) for i in range(0, len(wav), num_samples)])
                     out, wav_name = next(iterators[i])
                 except StopIteration:
                     good_iters -= 1
-                    iterators[i] = repeat((torch.zeros(4000), 'junk'))
+                    iterators[i] = repeat((torch.zeros(num_samples), 'junk'))
                     out, wav_name = next(iterators[i])
                     if good_iters == 0:
                         return
         values.append((out, wav_name))
         yield values
+
+
+def single_audio_stream(model, audio, extractor, onnx=False, trig_sum=0.26,
+                        neg_trig_sum=0.01, num_steps=8):
+    num_samples = 4000
+    VADiter = VADiterator(trig_sum, neg_trig_sum, num_steps)
+    wav = read_audio(audio)
+    wav_chunks = iter([wav[i:i+num_samples] for i in range(0, len(wav), num_samples)])
+    for chunk in wav_chunks:
+        batch = VADiter.prepare_batch(chunk)
+
+        with torch.no_grad():
+            if onnx:
+                ort_inputs = {'input': to_numpy(extractor(batch))}
+                ort_outs = model.run(None, ort_inputs)
+                vad_outs = ort_outs[-2]
+            else:
+                outs = model(extractor(batch))
+                vad_outs = outs[-2]
+
+        states = []
+        state = VADiter.state(vad_outs)
+        if state[0]:
+            states.append(state[0])
+        yield states
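
Note for reviewers (not part of the patch): the `num_samples` refactor above parametrizes the sliding-window batching that was previously hard-coded as 4000. A minimal sketch of that scheme, using the defaults the patch introduces in `VADiterator` (`num_samples=4000`, `num_steps=8`, hence a 500-sample hop; 4000 samples is 250 ms at the notebook's 16 kHz rate) — the random tensor merely stands in for a real audio chunk, and the model/extractor calls are omitted:

    import torch

    # Defaults from the patch: a 4000-sample model window, sliced into 8 overlapping views.
    num_samples = 4000
    num_steps = 8
    assert num_samples % num_steps == 0
    step = num_samples // num_steps  # 500-sample stride, as in VADiterator.step

    prev = torch.zeros(num_samples)   # previous chunk; zeros at the start of a stream
    chunk = torch.randn(num_samples)  # stand-in for the next 4000 samples of audio

    # Same slicing as VADiterator.prepare_batch: stack prev + current chunk,
    # then take num_steps windows of num_samples each, offset by `step`.
    stacked = torch.hstack([prev, chunk])
    overlap_chunks = [stacked[i:i + num_samples]
                      for i in range(step, num_samples + 1, step)]
    batch = torch.vstack(overlap_chunks)
    print(batch.shape)  # torch.Size([8, 4000]) -> one model batch per incoming chunk

Each incoming chunk therefore yields one `(num_steps, num_samples)` batch (the sketch reuses the same `torch.hstack`/`torch.vstack` calls as the patch), and `VADiterator.state` averages the resulting per-window speech probabilities over a `deque(maxlen=num_steps)` buffer against `trig_sum`/`neg_trig_sum` to decide the start/end triggers printed in the notebook output.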