diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 81fa68f..ec9f8b1 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -2,31 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-11T13:30:32.615246Z", - "start_time": "2020-12-11T13:30:32.126553Z" + "end_time": "2020-12-11T14:14:25.443732Z", + "start_time": "2020-12-11T14:14:24.835612Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", - " warnings.warn(\n", - "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "import torch\n", - "import numpy as np\n", "import glob\n", - "import torch.nn.functional as F\n", + "import torch\n", + "import numpy as np # use only torch?\n", "import soundfile as sf\n", + "# import torch.nn.functional as F\n", "from IPython.display import Audio\n", "torch.set_num_threads(1)\n", "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n", @@ -42,11 +31,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-11T13:32:01.978079Z", - "start_time": "2020-12-11T13:32:01.974912Z" + "end_time": "2020-12-11T14:19:25.895033Z", + "start_time": "2020-12-11T14:19:25.891112Z" } }, "outputs": [], @@ -60,25 +49,39 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-11T13:31:55.255097Z", - "start_time": "2020-12-11T13:31:55.020705Z" + "end_time": "2020-12-11T14:19:41.758975Z", + "start_time": "2020-12-11T14:19:41.522818Z" } }, "outputs": [], "source": [ - "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')" + "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu') # from yml file" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-11T13:32:10.391589Z", - "start_time": "2020-12-11T13:32:10.387109Z" + "end_time": "2020-12-11T14:19:52.024425Z", + "start_time": "2020-12-11T14:19:51.978279Z" + } + }, + "outputs": [], + "source": [ + "Audio('files/test_audio_6.wav')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-12-11T14:20:12.363579Z", + "start_time": "2020-12-11T14:20:12.346354Z" } }, "outputs": [], @@ -88,47 +91,56 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-11T13:32:11.670091Z", - "start_time": "2020-12-11T13:32:10.814378Z" + "end_time": "2020-12-11T14:20:49.910862Z", + "start_time": "2020-12-11T14:20:49.906902Z" } }, "outputs": [], "source": [ - "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)" + "torch.__version__" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-12-11T14:20:42.130546Z", + "start_time": "2020-12-11T14:20:42.122245Z" + } + }, + "outputs": [], + "source": [ + "torch.vstack" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-12-11T14:20:28.888271Z", + "start_time": "2020-12-11T14:20:28.787459Z" + } + }, + "outputs": [], + "source": [ + "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4) # kill extractor" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:32:11.698816Z", "start_time": "2020-12-11T13:32:11.671735Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n", "Audio('only_speech.wav')" @@ -143,7 +155,21 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-12-11T14:22:54.451814Z", + "start_time": "2020-12-11T14:22:54.211738Z" + } + }, + "outputs": [], + "source": [ + "!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:34.137062Z", @@ -157,25 +183,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:36.332200Z", "start_time": "2020-12-11T13:31:36.328087Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "10" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "audios_for_stream = glob.glob('files/test*.wav')\n", "len(audios_for_stream)" @@ -183,101 +198,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:52.668041Z", "start_time": "2020-12-11T13:31:37.357340Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done initial Loading\n", - "[({106500: 'start'}, 'files/test_audio_1.wav')]\n", - "[({174000: 'start'}, 'files/test_audio_3.wav')]\n", - "[({261000: 'end'}, 'files/test_audio_1.wav')]\n", - "Loading next wav: files/test_audio_7.wav\n", - "[({134000: 'start'}, 'files/test_audio_7.wav')]\n", - "[({147500: 'end'}, 'files/test_audio_7.wav')]\n", - "[({442000: 'end'}, 'files/test_audio_3.wav')]\n", - "[({450500: 'start'}, 'files/test_audio_3.wav')]\n", - "[({209500: 'start'}, 'files/test_audio_7.wav')]\n", - "[({519500: 'end'}, 'files/test_audio_3.wav')]\n", - "[({533500: 'start'}, 'files/test_audio_3.wav')]\n", - "[({599904: 'end'}, 'files/test_audio_3.wav')]\n", - "Loading next wav: files/test_audio_6.wav\n", - "[({183500: 'start'}, 'files/test_audio_6.wav')]\n", - "[({503500: 'end'}, 'files/test_audio_7.wav')]\n", - "[({202500: 'end'}, 'files/test_audio_6.wav')]\n", - "[({537500: 'start'}, 'files/test_audio_7.wav')]\n", - "[({226500: 'start'}, 'files/test_audio_6.wav')]\n", - "[({283500: 'end'}, 'files/test_audio_6.wav')]\n", - "[({616500: 'end'}, 'files/test_audio_7.wav')]\n", - "[({337500: 'start'}, 'files/test_audio_6.wav')]\n", - "[({661500: 'start'}, 'files/test_audio_7.wav')]\n", - "[({785000: 'end'}, 'files/test_audio_7.wav')]\n", - "[({503000: 'end'}, 'files/test_audio_6.wav')]\n", - "[({507500: 'start'}, 'files/test_audio_6.wav')]\n", - "[({851500: 'start'}, 'files/test_audio_7.wav')]\n", - "[({919000: 'end'}, 'files/test_audio_7.wav')]\n", - "Loading next wav: files/test_audio_5.wav\n", - "[({627500: 'end'}, 'files/test_audio_6.wav')]\n", - "[({631500: 'start'}, 'files/test_audio_6.wav')]\n", - "[({151000: 'start'}, 'files/test_audio_5.wav')]\n", - "[({169000: 'end'}, 'files/test_audio_5.wav')]\n", - "[({211000: 'start'}, 'files/test_audio_5.wav')]\n", - "[({221500: 'end'}, 'files/test_audio_5.wav')]\n", - "Loading next wav: files/test_audio_2.wav\n", - "[({927488: 'end'}, 'files/test_audio_6.wav')]\n", - "Loading next wav: files/test_audio_8.wav\n", - "[({228000: 'start'}, 'files/test_audio_2.wav')]\n", - "[({179500: 'start'}, 'files/test_audio_8.wav')]\n", - "[({241500: 'end'}, 'files/test_audio_2.wav')]\n", - "[({279000: 'start'}, 'files/test_audio_2.wav')]\n", - "[({274500: 'end'}, 'files/test_audio_8.wav')]\n", - "[({300500: 'start'}, 'files/test_audio_8.wav')]\n", - "[({369500: 'end'}, 'files/test_audio_2.wav')]\n", - "[({378500: 'start'}, 'files/test_audio_2.wav')]\n", - "[({436500: 'end'}, 'files/test_audio_2.wav')]\n", - "[({423000: 'end'}, 'files/test_audio_8.wav')]\n", - "[({488500: 'start'}, 'files/test_audio_2.wav')]\n", - "[({458500: 'start'}, 'files/test_audio_8.wav')]\n", - "[({599904: 'end'}, 'files/test_audio_2.wav')]\n", - "Loading next wav: files/test_audio_4.wav\n", - "[({583500: 'end'}, 'files/test_audio_8.wav')]\n", - "[({599500: 'start'}, 'files/test_audio_8.wav')]\n", - "[({632500: 'end'}, 'files/test_audio_8.wav')]\n", - "[({660000: 'start'}, 'files/test_audio_8.wav')]\n", - "[({737000: 'end'}, 'files/test_audio_8.wav')]\n", - "[({761000: 'start'}, 'files/test_audio_8.wav')]\n", - "[({249500: 'start'}, 'files/test_audio_4.wav')]\n", - "[({257168: 'end'}, 'files/test_audio_4.wav')]\n", - "Loading next wav: files/test_audio_9.wav\n", - "[({843000: 'end'}, 'files/test_audio_8.wav')]\n", - "Loading next wav: files/test_audio_0.wav\n", - "[({133000: 'start'}, 'files/test_audio_9.wav')]\n", - "[({143500: 'end'}, 'files/test_audio_9.wav')]\n", - "[({272000: 'start'}, 'files/test_audio_9.wav')]\n", - "[({256500: 'start'}, 'files/test_audio_0.wav')]\n", - "[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n", - "[({406500: 'start'}, 'files/test_audio_9.wav')]\n", - "[({460000: 'end'}, 'files/test_audio_9.wav')]\n", - "[({476000: 'start'}, 'files/test_audio_9.wav')]\n", - "[({494500: 'end'}, 'files/test_audio_9.wav')]\n", - "[({544500: 'start'}, 'files/test_audio_9.wav')]\n", - "[({564500: 'end'}, 'files/test_audio_9.wav')]\n", - "[({595000: 'start'}, 'files/test_audio_9.wav')]\n", - "[({682000: 'end'}, 'files/test_audio_9.wav')]\n", - "[({728500: 'start'}, 'files/test_audio_9.wav')]\n", - "[({786000: 'end'}, 'files/test_audio_9.wav')]\n", - "[({814000: 'start'}, 'files/test_audio_9.wav')]\n", - "[({826000: 'end'}, 'files/test_audio_9.wav')]\n" - ] - } - ], + "outputs": [], "source": [ "for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n", " if i:\n", diff --git a/utils.py b/utils.py index 985ba5e..30ee9a5 100644 --- a/utils.py +++ b/utils.py @@ -195,7 +195,7 @@ class VADiterator: def state(self, model_out): current_speech = {} - for i, predict in enumerate(model_out[:, 1]): + for i, predict in enumerate(model_out[:, 1]): # add name self.buffer.append(predict) if (np.mean(self.buffer) >= self.trig_sum) and not self.triggered: self.triggered = True @@ -210,7 +210,10 @@ class VADiterator: return current_speech, self.current_name -def state_generator(model, audios, extractor, onnx=False, trig_sum=0.26, neg_trig_sum=0.01, num_steps=8, audios_in_stream=5): +def state_generator(model, audios, extractor, + onnx=False, + trig_sum=0.26, neg_trig_sum=0.01, + num_steps=8, audios_in_stream=5): VADiters = [VADiterator(trig_sum, neg_trig_sum, num_steps) for i in range(audios_in_stream)] for i, current_pieces in enumerate(stream_imitator(audios, audios_in_stream)): for_batch = [x.prepare_batch(*y) for x, y in zip(VADiters, current_pieces)] @@ -264,4 +267,3 @@ def stream_imitator(stereo, audios_in_stream): return values.append((out, wav_name)) yield values -