diff --git a/silero-vad.ipynb b/silero-vad.ipynb
index 81fa68f..ec9f8b1 100644
--- a/silero-vad.ipynb
+++ b/silero-vad.ipynb
@@ -2,31 +2,20 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:30:32.615246Z",
- "start_time": "2020-12-11T13:30:32.126553Z"
+ "end_time": "2020-12-11T14:14:25.443732Z",
+ "start_time": "2020-12-11T14:14:24.835612Z"
}
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
- " warnings.warn(\n",
- "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "import torch\n",
- "import numpy as np\n",
"import glob\n",
- "import torch.nn.functional as F\n",
+ "import torch\n",
+ "import numpy as np # TODO: check whether numpy can be dropped in favour of torch only\n",
"import soundfile as sf\n",
+ "# import torch.nn.functional as F\n",
"from IPython.display import Audio\n",
"torch.set_num_threads(1)\n",
"from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
@@ -42,11 +31,11 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:32:01.978079Z",
- "start_time": "2020-12-11T13:32:01.974912Z"
+ "end_time": "2020-12-11T14:19:25.895033Z",
+ "start_time": "2020-12-11T14:19:25.891112Z"
}
},
"outputs": [],
@@ -60,25 +49,39 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:31:55.255097Z",
- "start_time": "2020-12-11T13:31:55.020705Z"
+ "end_time": "2020-12-11T14:19:41.758975Z",
+ "start_time": "2020-12-11T14:19:41.522818Z"
}
},
"outputs": [],
"source": [
- "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')"
+ "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu') # TODO: take the model path from the yml file"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:32:10.391589Z",
- "start_time": "2020-12-11T13:32:10.387109Z"
+ "end_time": "2020-12-11T14:19:52.024425Z",
+ "start_time": "2020-12-11T14:19:51.978279Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "Audio('files/test_audio_6.wav')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-11T14:20:12.363579Z",
+ "start_time": "2020-12-11T14:20:12.346354Z"
}
},
"outputs": [],
@@ -88,47 +91,56 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:32:11.670091Z",
- "start_time": "2020-12-11T13:32:10.814378Z"
+ "end_time": "2020-12-11T14:20:49.910862Z",
+ "start_time": "2020-12-11T14:20:49.906902Z"
}
},
"outputs": [],
"source": [
- "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)"
+ "torch.__version__"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-11T14:20:42.130546Z",
+ "start_time": "2020-12-11T14:20:42.122245Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "torch.vstack"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-11T14:20:28.888271Z",
+ "start_time": "2020-12-11T14:20:28.787459Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4) # TODO: remove the extractor argument"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:32:11.698816Z",
"start_time": "2020-12-11T13:32:11.671735Z"
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
"Audio('only_speech.wav')"
@@ -143,7 +155,21 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-11T14:22:54.451814Z",
+ "start_time": "2020-12-11T14:22:54.211738Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:34.137062Z",
@@ -157,25 +183,14 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:36.332200Z",
"start_time": "2020-12-11T13:31:36.328087Z"
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "10"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"audios_for_stream = glob.glob('files/test*.wav')\n",
"len(audios_for_stream)"
@@ -183,101 +198,14 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:52.668041Z",
"start_time": "2020-12-11T13:31:37.357340Z"
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done initial Loading\n",
- "[({106500: 'start'}, 'files/test_audio_1.wav')]\n",
- "[({174000: 'start'}, 'files/test_audio_3.wav')]\n",
- "[({261000: 'end'}, 'files/test_audio_1.wav')]\n",
- "Loading next wav: files/test_audio_7.wav\n",
- "[({134000: 'start'}, 'files/test_audio_7.wav')]\n",
- "[({147500: 'end'}, 'files/test_audio_7.wav')]\n",
- "[({442000: 'end'}, 'files/test_audio_3.wav')]\n",
- "[({450500: 'start'}, 'files/test_audio_3.wav')]\n",
- "[({209500: 'start'}, 'files/test_audio_7.wav')]\n",
- "[({519500: 'end'}, 'files/test_audio_3.wav')]\n",
- "[({533500: 'start'}, 'files/test_audio_3.wav')]\n",
- "[({599904: 'end'}, 'files/test_audio_3.wav')]\n",
- "Loading next wav: files/test_audio_6.wav\n",
- "[({183500: 'start'}, 'files/test_audio_6.wav')]\n",
- "[({503500: 'end'}, 'files/test_audio_7.wav')]\n",
- "[({202500: 'end'}, 'files/test_audio_6.wav')]\n",
- "[({537500: 'start'}, 'files/test_audio_7.wav')]\n",
- "[({226500: 'start'}, 'files/test_audio_6.wav')]\n",
- "[({283500: 'end'}, 'files/test_audio_6.wav')]\n",
- "[({616500: 'end'}, 'files/test_audio_7.wav')]\n",
- "[({337500: 'start'}, 'files/test_audio_6.wav')]\n",
- "[({661500: 'start'}, 'files/test_audio_7.wav')]\n",
- "[({785000: 'end'}, 'files/test_audio_7.wav')]\n",
- "[({503000: 'end'}, 'files/test_audio_6.wav')]\n",
- "[({507500: 'start'}, 'files/test_audio_6.wav')]\n",
- "[({851500: 'start'}, 'files/test_audio_7.wav')]\n",
- "[({919000: 'end'}, 'files/test_audio_7.wav')]\n",
- "Loading next wav: files/test_audio_5.wav\n",
- "[({627500: 'end'}, 'files/test_audio_6.wav')]\n",
- "[({631500: 'start'}, 'files/test_audio_6.wav')]\n",
- "[({151000: 'start'}, 'files/test_audio_5.wav')]\n",
- "[({169000: 'end'}, 'files/test_audio_5.wav')]\n",
- "[({211000: 'start'}, 'files/test_audio_5.wav')]\n",
- "[({221500: 'end'}, 'files/test_audio_5.wav')]\n",
- "Loading next wav: files/test_audio_2.wav\n",
- "[({927488: 'end'}, 'files/test_audio_6.wav')]\n",
- "Loading next wav: files/test_audio_8.wav\n",
- "[({228000: 'start'}, 'files/test_audio_2.wav')]\n",
- "[({179500: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({241500: 'end'}, 'files/test_audio_2.wav')]\n",
- "[({279000: 'start'}, 'files/test_audio_2.wav')]\n",
- "[({274500: 'end'}, 'files/test_audio_8.wav')]\n",
- "[({300500: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({369500: 'end'}, 'files/test_audio_2.wav')]\n",
- "[({378500: 'start'}, 'files/test_audio_2.wav')]\n",
- "[({436500: 'end'}, 'files/test_audio_2.wav')]\n",
- "[({423000: 'end'}, 'files/test_audio_8.wav')]\n",
- "[({488500: 'start'}, 'files/test_audio_2.wav')]\n",
- "[({458500: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({599904: 'end'}, 'files/test_audio_2.wav')]\n",
- "Loading next wav: files/test_audio_4.wav\n",
- "[({583500: 'end'}, 'files/test_audio_8.wav')]\n",
- "[({599500: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({632500: 'end'}, 'files/test_audio_8.wav')]\n",
- "[({660000: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({737000: 'end'}, 'files/test_audio_8.wav')]\n",
- "[({761000: 'start'}, 'files/test_audio_8.wav')]\n",
- "[({249500: 'start'}, 'files/test_audio_4.wav')]\n",
- "[({257168: 'end'}, 'files/test_audio_4.wav')]\n",
- "Loading next wav: files/test_audio_9.wav\n",
- "[({843000: 'end'}, 'files/test_audio_8.wav')]\n",
- "Loading next wav: files/test_audio_0.wav\n",
- "[({133000: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({143500: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({272000: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({256500: 'start'}, 'files/test_audio_0.wav')]\n",
- "[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n",
- "[({406500: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({460000: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({476000: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({494500: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({544500: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({564500: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({595000: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({682000: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({728500: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({786000: 'end'}, 'files/test_audio_9.wav')]\n",
- "[({814000: 'start'}, 'files/test_audio_9.wav')]\n",
- "[({826000: 'end'}, 'files/test_audio_9.wav')]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
" if i:\n",
diff --git a/utils.py b/utils.py
index 985ba5e..30ee9a5 100644
--- a/utils.py
+++ b/utils.py
@@ -195,7 +195,7 @@ class VADiterator:
def state(self, model_out):
current_speech = {}
- for i, predict in enumerate(model_out[:, 1]):
+ for i, predict in enumerate(model_out[:, 1]): # add name
self.buffer.append(predict)
if (np.mean(self.buffer) >= self.trig_sum) and not self.triggered:
self.triggered = True
@@ -210,7 +210,10 @@ class VADiterator:
return current_speech, self.current_name
-def state_generator(model, audios, extractor, onnx=False, trig_sum=0.26, neg_trig_sum=0.01, num_steps=8, audios_in_stream=5):
+def state_generator(model, audios, extractor,
+ onnx=False,
+ trig_sum=0.26, neg_trig_sum=0.01,
+ num_steps=8, audios_in_stream=5):
VADiters = [VADiterator(trig_sum, neg_trig_sum, num_steps) for i in range(audios_in_stream)]
for i, current_pieces in enumerate(stream_imitator(audios, audios_in_stream)):
for_batch = [x.prepare_batch(*y) for x, y in zip(VADiters, current_pieces)]
@@ -264,4 +267,3 @@ def stream_imitator(stereo, audios_in_stream):
return
values.append((out, wav_name))
yield values
-