From c762bb5b529d24102a4dee94380f853142c08bea Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Thu, 15 Apr 2021 14:01:05 +0000 Subject: [PATCH 1/6] add adaptive examples --- README.md | 37 ++++++- silero-vad.ipynb | 260 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 281 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index e0c6a94..3f13f39 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', force_reload=True) (get_speech_ts, + get_speech_ts_adaptive, _, read_audio, _, _, _) = utils @@ -122,9 +123,15 @@ files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files' wav = read_audio(f'{files_dir}/en.wav') # full audio # get speech timestamps from full audio file + +# classic way speech_timestamps = get_speech_ts(wav, model, num_steps=4) pprint(speech_timestamps) + +# adaptive way +speech_timestamps = get_speech_ts_adaptive(wav, model) +pprint(speech_timestamps) ``` #### Number Detector @@ -195,6 +202,7 @@ _, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', force_reload=True) (get_speech_ts, + get_speech_ts_adaptive, _, read_audio, _, _, _) = utils @@ -208,14 +216,20 @@ def validate_onnx(model, inputs): ort_inputs = {'input': inputs.cpu().numpy()} outs = model.run(None, ort_inputs) outs = [torch.Tensor(x) for x in outs] - return outs + return outs[0] model = init_onnx_model(f'{files_dir}/model.onnx') wav = read_audio(f'{files_dir}/en.wav') # get speech timestamps from full audio file + +# classic way speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx) pprint(speech_timestamps) + +# adaptive way +speech_timestamps = get_speech_ts_adaptive(wav, model, run_function=validate_onnx) +pprint(speech_timestamps) ``` #### Number Detector @@ -347,6 +361,9 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks ### VAD Parameter Fine Tuning +#### **Classic way** + +**This is straightforward classic method 
`get_speech_ts` where thresholds (`trig_sum` and `neg_trig_sum`) are specified by users** - Among others, we provide several [utils](https://github.com/snakers4/silero-vad/blob/8b28767292b424e3e505c55f15cd3c4b91e4804b/utils.py#L52-L59) to simplify working with VAD; - We provide sensible basic hyper-parameters that work for us, but your case can be different; - `trig_sum` - overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state); @@ -365,6 +382,24 @@ speech_timestamps = get_speech_ts(wav, model, visualize_probs=True) ``` +#### **Adaptive way** + +**Adaptive algorithm (`get_speech_ts_adaptive`) automatically selects thresholds (`trig_sum` and `neg_trig_sum`) based on median speech probabilities over whole audio, SOME ARGUMENTS VARY FROM CLASSIC WAY FUNCTION ARGUMENTS** +- `batch_size` - batch size to feed to silero VAD (default - `200`) +- `step` - step size in samples, (default - `500`) (`num_samples_per_window` / `num_steps` from classic method) +- `num_samples_per_window` - number of samples in each window, our models were trained using `4000` samples (250 ms) per window, so this is preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434)); +- `min_speech_samples` - minimum speech chunk duration in samples (default - `10000`) +- `min_silence_samples` - minimum silence duration in samples between two separate speech chunks (default - `4000`) +- `speech_pad_samples` - widen speech by this amount of samples each side (default - `2000`) + +``` +speech_timestamps = get_speech_ts_adaptive(wav, model, + num_samples_per_window=4000, + step=500, + visualize_probs=True) +``` + + The chart should looks something like this: ![image](https://user-images.githubusercontent.com/12515440/106242896-79142580-6219-11eb-9add-fa7195d6fd26.png) diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 0cb0a9d..c8235cf 100755 --- 
a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -3,6 +3,7 @@ { "cell_type": "markdown", "metadata": { + "heading_collapsed": true, "id": "sVNOuHQQjsrp" }, "source": [ @@ -12,7 +13,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## VAD" @@ -57,6 +59,7 @@ " force_reload=True)\n", "\n", "(get_speech_ts,\n", + " get_speech_ts_adaptive,\n", " save_audio,\n", " read_audio,\n", " state_generator,\n", @@ -77,6 +80,15 @@ "### Full Audio" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,6 +128,43 @@ "Audio('only_speech.wav')" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en.wav')\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_ts_adaptive(wav, model, step=500, num_samples_per_window=4000)\n", + "pprint(speech_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav',\n", + " collect_chunks(speech_timestamps, wav), 16000) \n", + "Audio('only_speech.wav')" + ] + }, { "cell_type": "markdown", "metadata": { @@ -127,6 +176,19 @@ "### Single Audio Stream" ] }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2021-04-15T13:29:04.224833Z", + "start_time": "2021-04-15T13:29:04.220588Z" + }, + "hidden": true + }, + "source": [ + "**Classic way of getting speech 
chunks, you may need to select the tresholds yourself**" + ] + }, { "cell_type": "code", "execution_count": null, @@ -147,6 +209,30 @@ " print(batch)" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "wav = f'{files_dir}/en.wav'\n", + "\n", + "for batch in single_audio_stream(model, wav, iterator_type='adaptive'):\n", + " if batch:\n", + " print(batch)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -196,7 +282,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## Number detector" @@ -315,7 +402,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## Language detector" @@ -387,6 +475,7 @@ { "cell_type": "markdown", "metadata": { + "heading_collapsed": true, "id": "57avIBd6jsrz" }, "source": [ @@ -396,7 +485,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## VAD" @@ -415,13 +505,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { + "ExecuteTime": { + "end_time": "2021-04-15T13:30:22.938755Z", + "start_time": "2021-04-15T13:30:20.970574Z" + }, "cellView": "form", "hidden": true, "id": "Q4QIfSpprnkI" }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'torch' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfrom\u001b[0m 
\u001b[0mIPython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdisplay\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m _, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'silero_vad'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m force_reload=True)\n", + "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined" + ] + } + ], "source": [ "#@title Install and Import Dependencies\n", "\n", @@ -439,6 +545,7 @@ " force_reload=True)\n", "\n", "(get_speech_ts,\n", + " get_speech_ts_adaptive,\n", " save_audio,\n", " read_audio,\n", " state_generator,\n", @@ -470,17 +577,42 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-12-15T13:09:06.643812Z", - "start_time": "2020-12-15T13:09:06.473386Z" + "end_time": "2021-04-15T13:34:22.554010Z", + "start_time": "2021-04-15T13:34:22.550308Z" + }, + "hidden": true + }, + "source": [ + "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2021-04-15T13:30:14.475412Z", + "start_time": "2021-04-15T13:30:14.427933Z" }, "hidden": true, "id": "krnGoA6Kjsr0" }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'init_onnx_model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0minit_onnx_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'{files_dir}/model.onnx'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'{files_dir}/en.wav'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# get speech timestamps from full audio file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mspeech_timestamps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_speech_ts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_function\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidate_onnx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'init_onnx_model' is not defined" + ] + } + ], "source": [ "model = init_onnx_model(f'{files_dir}/model.onnx')\n", "wav = read_audio(f'{files_dir}/en.wav')\n", @@ -508,6 +640,60 @@ "Audio('only_speech.wav')" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/model.onnx')\n", + "wav = read_audio(f'{files_dir}/en.wav')\n", + "\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_ts_adaptive(wav, model, run_function=validate_onnx) \n", + "pprint(speech_timestamps)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2021-04-15T13:34:41.375446Z", + "start_time": "2021-04-15T13:34:41.368055Z" + }, + "hidden": true + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'save_audio' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# merge all speech chunks to one audio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msave_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'only_speech.wav'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcollect_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspeech_timestamps\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m16000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mAudio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'only_speech.wav'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'save_audio' is not defined" + ] + } + ], + "source": [ + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), 16000)\n", + "Audio('only_speech.wav')" + ] + }, { "cell_type": "markdown", "metadata": { @@ -519,6 +705,15 @@ "### Single Audio Stream" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + ] + }, { "cell_type": "code", "execution_count": null, @@ -554,6 +749,40 @@ " pprint(batch)" ] }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "**Experimental 
Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/model.onnx')\n", + "wav = f'{files_dir}/en.wav'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true + }, + "outputs": [], + "source": [ + "for batch in single_audio_stream(model, wav, iterator_type='adaptive', run_function=validate_onnx):\n", + " if batch:\n", + " pprint(batch)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -604,7 +833,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## Number detector" @@ -753,7 +983,8 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true + "heading_collapsed": true, + "hidden": true }, "source": [ "## Language detector" @@ -819,7 +1050,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, "hidden": true, "id": "5JHErdB7jsr0" }, @@ -863,7 +1093,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.8.8" }, "toc": { "base_numbering": 1, From ec16a93fc451a14ec3c584769d9225e458a57076 Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Thu, 15 Apr 2021 14:08:49 +0000 Subject: [PATCH 2/6] fx notebook --- silero-vad.ipynb | 459 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 371 insertions(+), 88 deletions(-) mode change 100755 => 100644 silero-vad.ipynb diff --git a/silero-vad.ipynb b/silero-vad.ipynb old mode 100755 new mode 100644 index c8235cf..89babc3 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -14,7 +14,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "FpMplOCA2Fwp" }, "source": [ "## VAD" @@ -24,7 +25,8 @@ "cell_type": "markdown", "metadata": { 
"heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "62A6F_072Fwq" }, "source": [ "### Install Dependencies" @@ -32,15 +34,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-12-30T17:35:43.397137Z", "start_time": "2020-12-30T17:33:10.962078Z" }, - "hidden": true + "colab": { + "base_uri": "https://localhost:8080/" + }, + "hidden": true, + "id": "5w5AkskZ2Fwr", + "outputId": "545c0988-965d-4462-eb06-d4c5a48d8969" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25l\r\u001b[K |▏ | 10kB 16.5MB/s eta 0:00:01\r\u001b[K |▍ | 20kB 20.7MB/s eta 0:00:01\r\u001b[K |▌ | 30kB 23.6MB/s eta 0:00:01\r\u001b[K |▊ | 40kB 24.5MB/s eta 0:00:01\r\u001b[K |▉ | 51kB 25.9MB/s eta 0:00:01\r\u001b[K |█ | 61kB 23.6MB/s eta 0:00:01\r\u001b[K |█▏ | 71kB 19.5MB/s eta 0:00:01\r\u001b[K |█▍ | 81kB 20.3MB/s eta 0:00:01\r\u001b[K |█▌ | 92kB 18.4MB/s eta 0:00:01\r\u001b[K |█▊ | 102kB 17.6MB/s eta 0:00:01\r\u001b[K |█▉ | 112kB 17.6MB/s eta 0:00:01\r\u001b[K |██ | 122kB 17.6MB/s eta 0:00:01\r\u001b[K |██▏ | 133kB 17.6MB/s eta 0:00:01\r\u001b[K |██▍ | 143kB 17.6MB/s eta 0:00:01\r\u001b[K |██▌ | 153kB 17.6MB/s eta 0:00:01\r\u001b[K |██▊ | 163kB 17.6MB/s eta 0:00:01\r\u001b[K |██▉ | 174kB 17.6MB/s eta 0:00:01\r\u001b[K |███ | 184kB 17.6MB/s eta 0:00:01\r\u001b[K |███▏ | 194kB 17.6MB/s eta 0:00:01\r\u001b[K |███▍ | 204kB 17.6MB/s eta 0:00:01\r\u001b[K |███▌ | 215kB 17.6MB/s eta 0:00:01\r\u001b[K |███▊ | 225kB 17.6MB/s eta 0:00:01\r\u001b[K |███▉ | 235kB 17.6MB/s eta 0:00:01\r\u001b[K |████ | 245kB 17.6MB/s eta 0:00:01\r\u001b[K |████▏ | 256kB 17.6MB/s eta 0:00:01\r\u001b[K |████▍ | 266kB 17.6MB/s eta 0:00:01\r\u001b[K |████▌ | 276kB 17.6MB/s eta 0:00:01\r\u001b[K |████▊ | 286kB 17.6MB/s eta 0:00:01\r\u001b[K |█████ | 296kB 17.6MB/s eta 0:00:01\r\u001b[K |█████ | 307kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▎ | 317kB 17.6MB/s eta 0:00:01\r\u001b[K 
|█████▍ | 327kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▋ | 337kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▊ | 348kB 17.6MB/s eta 0:00:01\r\u001b[K |██████ | 358kB 17.6MB/s eta 0:00:01\r\u001b[K |██████ | 368kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▎ | 378kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▍ | 389kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▋ | 399kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▊ | 409kB 17.6MB/s eta 0:00:01\r\u001b[K |███████ | 419kB 17.6MB/s eta 0:00:01\r\u001b[K |███████ | 430kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▎ | 440kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▍ | 450kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▋ | 460kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▊ | 471kB 17.6MB/s eta 0:00:01\r\u001b[K |████████ | 481kB 17.6MB/s eta 0:00:01\r\u001b[K |████████ | 491kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▎ | 501kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▍ | 512kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▋ | 522kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▊ | 532kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████ | 542kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████ | 552kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▎ | 563kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▌ | 573kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▋ | 583kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▉ | 593kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████ | 604kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▏ | 614kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▎ | 624kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▌ | 634kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▋ | 645kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▉ | 655kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████ | 665kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▏ | 675kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▎ | 686kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▌ | 696kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▋ | 706kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▉ | 716kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████ | 727kB 17.6MB/s eta 
0:00:01\r\u001b[K |████████████▏ | 737kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▎ | 747kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▌ | 757kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▋ | 768kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▉ | 778kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████ | 788kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 798kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▎ | 808kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 819kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 829kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▉ | 839kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████ | 849kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▏ | 860kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▍ | 870kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▌ | 880kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 890kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▉ | 901kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████ | 911kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▏ | 921kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▍ | 931kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▌ | 942kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 952kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▉ | 962kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████ | 972kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▏ | 983kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 993kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▊ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▍ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▌ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▊ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▉ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K 
|██████████████████ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▌ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▎ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▋ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▎ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▋ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▊ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▍ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▊ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▍ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▋ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▎ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K 
|███████████████████████▌ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▋ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▎ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▋ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▏ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▌ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▉ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▏ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▎ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▋ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▉ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▋ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▍ | 1.7MB 17.6MB/s 
eta 0:00:01\r\u001b[K |████████████████████████████▌ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▊ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▉ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▏ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▌ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▊ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▉ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▏ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▌ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▉ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▍| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▊| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 1.9MB 17.6MB/s \n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /root/.cache/torch/hub/master.zip\n" + ] + } + ], "source": [ "#@title Install and Import Dependencies\n", "\n", @@ -83,7 +106,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "dY2Us3_Q2Fws" }, "source": [ 
"**Classic way of getting speech chunks, you may need to select the tresholds yourself**" @@ -91,16 +115,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-12-30T17:35:44.362860Z", "start_time": "2020-12-30T17:35:43.398441Z" }, + "colab": { + "base_uri": "https://localhost:8080/" + }, "hidden": true, - "id": "aI_eydBPjsrx" + "id": "aI_eydBPjsrx", + "outputId": "17d317e6-ec8c-46a2-c5ec-682c1391e58d" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py:889: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:639.)\n", + " result = self.forward(*input, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'end': 35000, 'start': 0},\n", + " {'end': 112000, 'start': 35000},\n", + " {'end': 124000, 'start': 112000},\n", + " {'end': 320000, 'start': 143000},\n", + " {'end': 628000, 'start': 319000},\n", + " {'end': 752000, 'start': 632000},\n", + " {'end': 801000, 'start': 775000},\n", + " {'end': 960000, 'start': 811000}]\n" + ] + } + ], "source": [ "wav = read_audio(f'{files_dir}/en.wav')\n", "# get speech timestamps from full audio file\n", @@ -131,7 +182,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "n8plzbJU2Fws" }, "source": [ "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" @@ -139,11 +191,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { - "hidden": true + "colab": { + "base_uri": "https://localhost:8080/" + }, + "hidden": true, + "id": "SQOtu2Vl2Fwt", + "outputId": "3a560cf3-a882-4db7-ad7e-0ab9bf1a9698" }, - 
"outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'end': 35000, 'start': 0},\n", + " {'end': 112000, 'start': 35500},\n", + " {'end': 246000, 'start': 142500},\n", + " {'end': 288500, 'start': 251500},\n", + " {'end': 315500, 'start': 289500},\n", + " {'end': 603500, 'start': 318000},\n", + " {'end': 623000, 'start': 606500},\n", + " {'end': 713000, 'start': 631000},\n", + " {'end': 728500, 'start': 712000},\n", + " {'end': 748500, 'start': 726500},\n", + " {'end': 798500, 'start': 775000},\n", + " {'end': 899500, 'start': 811000},\n", + " {'end': 914000, 'start': 897000},\n", + " {'end': 962000, 'start': 913000}]\n" + ] + } + ], "source": [ "wav = read_audio(f'{files_dir}/en.wav')\n", "# get speech timestamps from full audio file\n", @@ -155,7 +233,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "Lr6zCGXh2Fwt" }, "outputs": [], "source": [ @@ -183,7 +262,8 @@ "end_time": "2021-04-15T13:29:04.224833Z", "start_time": "2021-04-15T13:29:04.220588Z" }, - "hidden": true + "hidden": true, + "id": "xCM-HrUR2Fwu" }, "source": [ "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" @@ -191,16 +271,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:59.199321Z", "start_time": "2020-12-15T13:09:59.196823Z" }, + "colab": { + "base_uri": "https://localhost:8080/" + }, "hidden": true, - "id": "q-lql_2Wjsry" + "id": "q-lql_2Wjsry", + "outputId": "ada632d4-eaba-475e-b00c-fa8238411792" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{4000: 'start'}]\n", + "[{39000: 'end'}]\n", + "[{43000: 'start'}]\n", + "[{115500: 'end'}]\n", + "[{121500: 'start'}]\n", + "[{127500: 'end'}]\n", + "[{150500: 'start'}]\n", + "[{291000: 'end'}]\n", + "[{295000: 'start'}]\n", + "[{322000: 'end'}]\n", + "[{326500: 
'start'}]\n", + "[{631500: 'end'}]\n", + "[{640500: 'start'}]\n", + "[{755000: 'end'}]\n", + "[{782500: 'start'}]\n", + "[{804500: 'end'}]\n", + "[{818500: 'start'}]\n" + ] + } + ], "source": [ "wav = f'{files_dir}/en.wav'\n", "\n", @@ -212,7 +320,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "t8TXtnvk2Fwv" }, "source": [ "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" @@ -220,11 +329,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { - "hidden": true + "colab": { + "base_uri": "https://localhost:8080/" + }, + "hidden": true, + "id": "BX3UgwwB2Fwv", + "outputId": "8d704639-6f3e-4520-d6ac-7ac988265286" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{2000: 'start'}]\n", + "[{40000: 'end'}]\n", + "[{44000: 'start'}]\n", + "[{115500: 'end'}]\n", + "[{151000: 'start'}]\n", + "[{251000: 'end'}]\n", + "[{260000: 'start'}]\n", + "[{291500: 'end'}]\n", + "[{298000: 'start'}]\n", + "[{320500: 'end'}]\n", + "[{326500: 'start'}]\n", + "[{612500: 'end'}]\n", + "[{615000: 'start'}]\n", + "[{628000: 'end'}]\n", + "[{639500: 'start'}]\n", + "[{718500: 'end'}]\n", + "[{720500: 'start'}]\n", + "[{755500: 'end'}]\n", + "[{783500: 'start'}]\n", + "[{805000: 'end'}]\n", + "[{819500: 'start'}]\n", + "[{902000: 'end'}]\n", + "[{905500: 'start'}]\n", + "[{921000: 'start'}]\n" + ] + } + ], "source": [ "wav = f'{files_dir}/en.wav'\n", "\n", @@ -283,7 +428,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "36jY0niD2Fww" }, "source": [ "## Number detector" @@ -293,7 +439,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "scd1DlS42Fwx" }, "source": [ "### Install Dependencies" @@ -303,7 +450,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": 
true + "hidden": true, + "id": "Kq5gQuYq2Fwx" }, "outputs": [], "source": [ @@ -336,7 +484,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "qhPa30ij2Fwy" }, "source": [ "### Full audio" @@ -346,7 +495,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "EXpau6xq2Fwy" }, "outputs": [], "source": [ @@ -360,7 +510,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "u-KfXRhZ2Fwy" }, "outputs": [], "source": [ @@ -375,7 +526,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "iwYEC4aZ2Fwy" }, "outputs": [], "source": [ @@ -389,7 +541,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "fHaYejX12Fwy" }, "outputs": [], "source": [ @@ -403,7 +556,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "PnKtJKbq2Fwz" }, "source": [ "## Language detector" @@ -413,7 +567,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "F5cAmMbP2Fwz" }, "source": [ "### Install Dependencies" @@ -423,7 +578,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "Zu9D0t6n2Fwz" }, "outputs": [], "source": [ @@ -453,7 +609,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "iC696eMX2Fwz" }, "source": [ "### Full audio" @@ -463,7 +620,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "c8UYnYBF2Fw0" }, "outputs": [], "source": [ @@ -486,7 +644,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "hEhnfORV2Fw0" }, "source": [ "## VAD" @@ -505,26 +664,34 @@ }, { 
"cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2021-04-15T13:30:22.938755Z", "start_time": "2021-04-15T13:30:20.970574Z" }, "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, "hidden": true, - "id": "Q4QIfSpprnkI" + "id": "Q4QIfSpprnkI", + "outputId": "119e85c9-bb9a-43bb-ae23-7d197b470096" }, "outputs": [ { - "ename": "NameError", - "evalue": "name 'torch' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m\u001b[0m", - "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mIPython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdisplay\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m _, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'silero_vad'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m force_reload=True)\n", - "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K |████████████████████████████████| 4.1MB 19.5MB/s \n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /root/.cache/torch/hub/master.zip\n" ] } ], @@ -583,7 +750,8 @@ "end_time": "2021-04-15T13:34:22.554010Z", "start_time": "2021-04-15T13:34:22.550308Z" }, - "hidden": true + "hidden": true, + "id": "TNEtK5zi2Fw2" }, "source": [ "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" @@ -591,25 +759,32 @@ }, { 
"cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2021-04-15T13:30:14.475412Z", "start_time": "2021-04-15T13:30:14.427933Z" }, + "colab": { + "base_uri": "https://localhost:8080/" + }, "hidden": true, - "id": "krnGoA6Kjsr0" + "id": "krnGoA6Kjsr0", + "outputId": "edab010a-e066-42a0-9b4c-2ab2579b6b47" }, "outputs": [ { - "ename": "NameError", - "evalue": "name 'init_onnx_model' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m\u001b[0m", - "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minit_onnx_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'{files_dir}/model.onnx'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'{files_dir}/en.wav'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# get speech timestamps from full audio file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mspeech_timestamps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_speech_ts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_steps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_function\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidate_onnx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'init_onnx_model' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + 
"[{'end': 33000, 'start': 0},\n", + " {'end': 112000, 'start': 35000},\n", + " {'end': 287000, 'start': 143000},\n", + " {'end': 317000, 'start': 287000},\n", + " {'end': 623000, 'start': 319000},\n", + " {'end': 752000, 'start': 632000},\n", + " {'end': 801000, 'start': 775000},\n", + " {'end': 960000, 'start': 811000}]\n" ] } ], @@ -643,7 +818,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "21RE8KEC2Fw2" }, "source": [ "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" @@ -651,11 +827,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { - "hidden": true + "colab": { + "base_uri": "https://localhost:8080/" + }, + "hidden": true, + "id": "uIVs56rb2Fw2", + "outputId": "50ce9117-17d8-4bef-eb53-7204c56c4b7b" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'end': 35000, 'start': 0},\n", + " {'end': 112500, 'start': 34500},\n", + " {'end': 245000, 'start': 140000},\n", + " {'end': 286500, 'start': 251500},\n", + " {'end': 315000, 'start': 285000},\n", + " {'end': 527500, 'start': 316500},\n", + " {'end': 603500, 'start': 524500},\n", + " {'end': 623500, 'start': 606500},\n", + " {'end': 713000, 'start': 629500},\n", + " {'end': 738500, 'start': 711500},\n", + " {'end': 751000, 'start': 735000},\n", + " {'end': 797500, 'start': 772500},\n", + " {'end': 883000, 'start': 809000},\n", + " {'end': 914500, 'start': 897000},\n", + " {'end': 962000, 'start': 911500}]\n" + ] + } + ], "source": [ "model = init_onnx_model(f'{files_dir}/model.onnx')\n", "wav = read_audio(f'{files_dir}/en.wav')\n", @@ -667,13 +870,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-04-15T13:34:41.375446Z", "start_time": "2021-04-15T13:34:41.368055Z" }, - "hidden": true + "hidden": true, + "id": "cox6oumC2Fw3", + "outputId": 
"1c18d7b1-ae80-42cb-c2e1-b6494104e5f7" }, "outputs": [ { @@ -708,7 +913,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "i8EZwtaA2Fw3" }, "source": [ "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" @@ -716,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:09.606031Z", @@ -733,16 +939,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:11.453171Z", "start_time": "2020-12-15T13:09:09.633435Z" }, + "colab": { + "base_uri": "https://localhost:8080/" + }, "hidden": true, - "id": "NC6Jim0hjsr1" + "id": "NC6Jim0hjsr1", + "outputId": "4c48843d-8510-4d26-c546-220e22a85361" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{4000: 'start'}]\n", + "[{37000: 'end'}]\n", + "[{43000: 'start'}]\n", + "[{115500: 'end'}]\n", + "[{150500: 'start'}]\n", + "[{291000: 'end'}]\n", + "[{294500: 'start'}]\n", + "[{321000: 'end'}]\n", + "[{326500: 'start'}]\n", + "[{627000: 'end'}]\n", + "[{639000: 'start'}]\n", + "[{718000: 'end'}]\n", + "[{721000: 'start'}]\n", + "[{755500: 'end'}]\n", + "[{783000: 'start'}]\n", + "[{804000: 'end'}]\n", + "[{818500: 'start'}]\n" + ] + } + ], "source": [ "for batch in single_audio_stream(model, wav, run_function=validate_onnx):\n", " if batch:\n", @@ -752,7 +986,8 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "hidden": true, + "id": "0pSKslpz2Fw3" }, "source": [ "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" @@ -760,9 +995,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { - "hidden": true + "hidden": true, + "id": "RZwc-Khk2Fw4" }, "outputs": [], "source": [ @@ -772,11 +1008,53 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 12, "metadata": { - "hidden": true + "colab": { + "base_uri": "https://localhost:8080/" + }, + "hidden": true, + "id": "Z4lzFPs02Fw4", + "outputId": "8d2f9cb4-dbc7-4c7c-dde1-ff3b3297aa07" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{0: 'start'}]\n", + "[{38000: 'end'}]\n", + "[{43000: 'start'}]\n", + "[{115000: 'end'}]\n", + "[{148500: 'start'}]\n", + "[{250500: 'end'}]\n", + "[{260000: 'start'}]\n", + "[{292000: 'end'}]\n", + "[{293500: 'start'}]\n", + "[{320000: 'end'}]\n", + "[{325000: 'start'}]\n", + "[{548000: 'end'}]\n", + "[{547500: 'start'}]\n", + "[{613000: 'end'}]\n", + "[{615000: 'start'}]\n", + "[{626500: 'end'}]\n", + "[{638000: 'start'}]\n", + "[{697500: 'start'}]\n", + "[{718000: 'end'}]\n", + "[{720000: 'start'}]\n", + "[{756000: 'end'}]\n", + "[{781000: 'start'}]\n", + "[{804500: 'end'}]\n", + "[{817500: 'start'}]\n", + "[{872000: 'end'}]\n", + "[{871000: 'start'}]\n", + "[{902000: 'end'}]\n", + "[{905500: 'start'}]\n", + "[{920500: 'end'}]\n", + "[{920000: 'start'}]\n" + ] + } + ], "source": [ "for batch in single_audio_stream(model, wav, iterator_type='adaptive', run_function=validate_onnx):\n", " if batch:\n", @@ -834,7 +1112,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "7QMvUvpg2Fw4" }, "source": [ "## Number detector" @@ -845,7 +1124,7 @@ "metadata": { "heading_collapsed": true, "hidden": true, - "id": "bL4kn4KJrlyL" + "id": "tBPDkpHr2Fw4" }, "source": [ "### Install Dependencies" @@ -861,7 +1140,7 @@ }, "cellView": "form", "hidden": true, - "id": "Q4QIfSpprnkI" + "id": "PdjGd56R2Fw5" }, "outputs": [], "source": [ @@ -905,7 +1184,7 @@ "metadata": { "heading_collapsed": true, "hidden": true, - "id": "5JHErdB7jsr0" + "id": "I9QWSFZh2Fw5" }, "source": [ "### Full Audio" @@ -920,7 +1199,7 @@ "start_time": "2020-12-15T13:09:06.473386Z" }, "hidden": true, - "id": "krnGoA6Kjsr0" + 
"id": "_r6QZiwu2Fw5" }, "outputs": [], "source": [ @@ -936,7 +1215,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "FN4aDwLV2Fw5" }, "outputs": [], "source": [ @@ -956,7 +1236,7 @@ "start_time": "2020-12-15T13:09:08.820014Z" }, "hidden": true, - "id": "B176Lzfnjsr1" + "id": "JnvS6WTK2Fw5" }, "outputs": [], "source": [ @@ -970,7 +1250,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "yUxOcOFG2Fw6" }, "outputs": [], "source": [ @@ -984,7 +1265,8 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true + "hidden": true, + "id": "SR8Bgcd52Fw6" }, "source": [ "## Language detector" @@ -995,7 +1277,7 @@ "metadata": { "heading_collapsed": true, "hidden": true, - "id": "bL4kn4KJrlyL" + "id": "PBnXPtKo2Fw6" }, "source": [ "### Install Dependencies" @@ -1011,7 +1293,7 @@ }, "cellView": "form", "hidden": true, - "id": "Q4QIfSpprnkI" + "id": "iNkDWJ3H2Fw6" }, "outputs": [], "source": [ @@ -1051,7 +1333,7 @@ "cell_type": "markdown", "metadata": { "hidden": true, - "id": "5JHErdB7jsr0" + "id": "G8N8oP4q2Fw6" }, "source": [ "### Full Audio" @@ -1061,7 +1343,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true + "hidden": true, + "id": "WHXnh9IV2Fw6" }, "outputs": [], "source": [ @@ -1110,5 +1393,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } From ac86c5bc529f79977f0cb845105b68384a1e0296 Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Thu, 15 Apr 2021 14:12:30 +0000 Subject: [PATCH 3/6] clear out --- silero-vad.ipynb | 350 +++++------------------------------------------ 1 file changed, 34 insertions(+), 316 deletions(-) diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 89babc3..7d40db1 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -34,36 +34,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": 
"2020-12-30T17:35:43.397137Z", "start_time": "2020-12-30T17:33:10.962078Z" }, - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "5w5AkskZ2Fwr", - "outputId": "545c0988-965d-4462-eb06-d4c5a48d8969" + "id": "5w5AkskZ2Fwr" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[?25l\r\u001b[K |▏ | 10kB 16.5MB/s eta 0:00:01\r\u001b[K |▍ | 20kB 20.7MB/s eta 0:00:01\r\u001b[K |▌ | 30kB 23.6MB/s eta 0:00:01\r\u001b[K |▊ | 40kB 24.5MB/s eta 0:00:01\r\u001b[K |▉ | 51kB 25.9MB/s eta 0:00:01\r\u001b[K |█ | 61kB 23.6MB/s eta 0:00:01\r\u001b[K |█▏ | 71kB 19.5MB/s eta 0:00:01\r\u001b[K |█▍ | 81kB 20.3MB/s eta 0:00:01\r\u001b[K |█▌ | 92kB 18.4MB/s eta 0:00:01\r\u001b[K |█▊ | 102kB 17.6MB/s eta 0:00:01\r\u001b[K |█▉ | 112kB 17.6MB/s eta 0:00:01\r\u001b[K |██ | 122kB 17.6MB/s eta 0:00:01\r\u001b[K |██▏ | 133kB 17.6MB/s eta 0:00:01\r\u001b[K |██▍ | 143kB 17.6MB/s eta 0:00:01\r\u001b[K |██▌ | 153kB 17.6MB/s eta 0:00:01\r\u001b[K |██▊ | 163kB 17.6MB/s eta 0:00:01\r\u001b[K |██▉ | 174kB 17.6MB/s eta 0:00:01\r\u001b[K |███ | 184kB 17.6MB/s eta 0:00:01\r\u001b[K |███▏ | 194kB 17.6MB/s eta 0:00:01\r\u001b[K |███▍ | 204kB 17.6MB/s eta 0:00:01\r\u001b[K |███▌ | 215kB 17.6MB/s eta 0:00:01\r\u001b[K |███▊ | 225kB 17.6MB/s eta 0:00:01\r\u001b[K |███▉ | 235kB 17.6MB/s eta 0:00:01\r\u001b[K |████ | 245kB 17.6MB/s eta 0:00:01\r\u001b[K |████▏ | 256kB 17.6MB/s eta 0:00:01\r\u001b[K |████▍ | 266kB 17.6MB/s eta 0:00:01\r\u001b[K |████▌ | 276kB 17.6MB/s eta 0:00:01\r\u001b[K |████▊ | 286kB 17.6MB/s eta 0:00:01\r\u001b[K |█████ | 296kB 17.6MB/s eta 0:00:01\r\u001b[K |█████ | 307kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▎ | 317kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▍ | 327kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▋ | 337kB 17.6MB/s eta 0:00:01\r\u001b[K |█████▊ | 348kB 17.6MB/s eta 0:00:01\r\u001b[K |██████ | 358kB 17.6MB/s eta 0:00:01\r\u001b[K |██████ | 368kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▎ | 378kB 17.6MB/s eta 0:00:01\r\u001b[K 
|██████▍ | 389kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▋ | 399kB 17.6MB/s eta 0:00:01\r\u001b[K |██████▊ | 409kB 17.6MB/s eta 0:00:01\r\u001b[K |███████ | 419kB 17.6MB/s eta 0:00:01\r\u001b[K |███████ | 430kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▎ | 440kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▍ | 450kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▋ | 460kB 17.6MB/s eta 0:00:01\r\u001b[K |███████▊ | 471kB 17.6MB/s eta 0:00:01\r\u001b[K |████████ | 481kB 17.6MB/s eta 0:00:01\r\u001b[K |████████ | 491kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▎ | 501kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▍ | 512kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▋ | 522kB 17.6MB/s eta 0:00:01\r\u001b[K |████████▊ | 532kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████ | 542kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████ | 552kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▎ | 563kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▌ | 573kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▋ | 583kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████▉ | 593kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████ | 604kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▏ | 614kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▎ | 624kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▌ | 634kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▋ | 645kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████▉ | 655kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████ | 665kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▏ | 675kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▎ | 686kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▌ | 696kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▋ | 706kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████▉ | 716kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████ | 727kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▏ | 737kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▎ | 747kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▌ | 757kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▋ | 768kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████▉ | 778kB 17.6MB/s eta 0:00:01\r\u001b[K 
|█████████████ | 788kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 798kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▎ | 808kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 819kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 829kB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████▉ | 839kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████ | 849kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▏ | 860kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▍ | 870kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▌ | 880kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 890kB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████▉ | 901kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████ | 911kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▏ | 921kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▍ | 931kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▌ | 942kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 952kB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████▉ | 962kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████ | 972kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▏ | 983kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 993kB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▊ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 1.0MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▍ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▌ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▊ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████▉ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▌ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 1.1MB 17.6MB/s eta 
0:00:01\r\u001b[K |███████████████████ | 1.1MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▎ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▋ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▎ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▋ | 1.2MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████▊ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▍ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▊ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 1.3MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▍ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▋ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▎ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▌ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▋ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 1.4MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 1.5MB 
17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▎ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▋ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▏ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▌ | 1.5MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▉ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▏ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▎ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▋ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████▉ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 1.6MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▋ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▍ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▊ | 1.7MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████▉ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 1.8MB 17.6MB/s eta 
0:00:01\r\u001b[K |█████████████████████████████▏ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▌ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▊ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▉ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▏ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 1.8MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▌ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▉ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████ | 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▍| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▊| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 1.9MB 17.6MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 1.9MB 17.6MB/s \n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /root/.cache/torch/hub/master.zip\n" - ] - } - ], + "outputs": [], "source": [ "#@title Install and Import Dependencies\n", "\n", @@ -115,43 +95,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-30T17:35:44.362860Z", "start_time": "2020-12-30T17:35:43.398441Z" }, - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "aI_eydBPjsrx", - "outputId": "17d317e6-ec8c-46a2-c5ec-682c1391e58d" + "id": "aI_eydBPjsrx" }, - "outputs": 
[ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py:889: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:639.)\n", - " result = self.forward(*input, **kwargs)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'end': 35000, 'start': 0},\n", - " {'end': 112000, 'start': 35000},\n", - " {'end': 124000, 'start': 112000},\n", - " {'end': 320000, 'start': 143000},\n", - " {'end': 628000, 'start': 319000},\n", - " {'end': 752000, 'start': 632000},\n", - " {'end': 801000, 'start': 775000},\n", - " {'end': 960000, 'start': 811000}]\n" - ] - } - ], + "outputs": [], "source": [ "wav = read_audio(f'{files_dir}/en.wav')\n", "# get speech timestamps from full audio file\n", @@ -191,37 +144,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "SQOtu2Vl2Fwt", - "outputId": "3a560cf3-a882-4db7-ad7e-0ab9bf1a9698" + "id": "SQOtu2Vl2Fwt" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'end': 35000, 'start': 0},\n", - " {'end': 112000, 'start': 35500},\n", - " {'end': 246000, 'start': 142500},\n", - " {'end': 288500, 'start': 251500},\n", - " {'end': 315500, 'start': 289500},\n", - " {'end': 603500, 'start': 318000},\n", - " {'end': 623000, 'start': 606500},\n", - " {'end': 713000, 'start': 631000},\n", - " {'end': 728500, 'start': 712000},\n", - " {'end': 748500, 'start': 726500},\n", - " {'end': 798500, 'start': 775000},\n", - " {'end': 899500, 'start': 811000},\n", - " {'end': 914000, 'start': 897000},\n", - " {'end': 962000, 'start': 913000}]\n" - ] - } - ], + "outputs": [], "source": [ "wav = 
read_audio(f'{files_dir}/en.wav')\n", "# get speech timestamps from full audio file\n", @@ -271,44 +199,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:59.199321Z", "start_time": "2020-12-15T13:09:59.196823Z" }, - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "q-lql_2Wjsry", - "outputId": "ada632d4-eaba-475e-b00c-fa8238411792" + "id": "q-lql_2Wjsry" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{4000: 'start'}]\n", - "[{39000: 'end'}]\n", - "[{43000: 'start'}]\n", - "[{115500: 'end'}]\n", - "[{121500: 'start'}]\n", - "[{127500: 'end'}]\n", - "[{150500: 'start'}]\n", - "[{291000: 'end'}]\n", - "[{295000: 'start'}]\n", - "[{322000: 'end'}]\n", - "[{326500: 'start'}]\n", - "[{631500: 'end'}]\n", - "[{640500: 'start'}]\n", - "[{755000: 'end'}]\n", - "[{782500: 'start'}]\n", - "[{804500: 'end'}]\n", - "[{818500: 'start'}]\n" - ] - } - ], + "outputs": [], "source": [ "wav = f'{files_dir}/en.wav'\n", "\n", @@ -329,47 +229,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "BX3UgwwB2Fwv", - "outputId": "8d704639-6f3e-4520-d6ac-7ac988265286" + "id": "BX3UgwwB2Fwv" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{2000: 'start'}]\n", - "[{40000: 'end'}]\n", - "[{44000: 'start'}]\n", - "[{115500: 'end'}]\n", - "[{151000: 'start'}]\n", - "[{251000: 'end'}]\n", - "[{260000: 'start'}]\n", - "[{291500: 'end'}]\n", - "[{298000: 'start'}]\n", - "[{320500: 'end'}]\n", - "[{326500: 'start'}]\n", - "[{612500: 'end'}]\n", - "[{615000: 'start'}]\n", - "[{628000: 'end'}]\n", - "[{639500: 'start'}]\n", - "[{718500: 'end'}]\n", - "[{720500: 'start'}]\n", - "[{755500: 'end'}]\n", - "[{783500: 'start'}]\n", - "[{805000: 'end'}]\n", - "[{819500: 'start'}]\n", - 
"[{902000: 'end'}]\n", - "[{905500: 'start'}]\n", - "[{921000: 'start'}]\n" - ] - } - ], + "outputs": [], "source": [ "wav = f'{files_dir}/en.wav'\n", "\n", @@ -664,37 +529,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-04-15T13:30:22.938755Z", "start_time": "2021-04-15T13:30:20.970574Z" }, "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "Q4QIfSpprnkI", - "outputId": "119e85c9-bb9a-43bb-ae23-7d197b470096" + "id": "Q4QIfSpprnkI" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[K |████████████████████████████████| 4.1MB 19.5MB/s \n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /root/.cache/torch/hub/master.zip\n" - ] - } - ], + "outputs": [], "source": [ "#@title Install and Import Dependencies\n", "\n", @@ -759,35 +604,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2021-04-15T13:30:14.475412Z", "start_time": "2021-04-15T13:30:14.427933Z" }, - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "krnGoA6Kjsr0", - "outputId": "edab010a-e066-42a0-9b4c-2ab2579b6b47" + "id": "krnGoA6Kjsr0" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'end': 33000, 'start': 0},\n", - " {'end': 112000, 'start': 35000},\n", - " {'end': 287000, 'start': 143000},\n", - " {'end': 317000, 'start': 287000},\n", - " {'end': 623000, 'start': 319000},\n", - " {'end': 752000, 'start': 632000},\n", - " {'end': 801000, 'start': 775000},\n", - " {'end': 960000, 'start': 811000}]\n" - ] - } - ], + "outputs": [], "source": [ "model = init_onnx_model(f'{files_dir}/model.onnx')\n", "wav = read_audio(f'{files_dir}/en.wav')\n", @@ -827,38 +653,12 @@ }, { 
"cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "uIVs56rb2Fw2", - "outputId": "50ce9117-17d8-4bef-eb53-7204c56c4b7b" + "id": "uIVs56rb2Fw2" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'end': 35000, 'start': 0},\n", - " {'end': 112500, 'start': 34500},\n", - " {'end': 245000, 'start': 140000},\n", - " {'end': 286500, 'start': 251500},\n", - " {'end': 315000, 'start': 285000},\n", - " {'end': 527500, 'start': 316500},\n", - " {'end': 603500, 'start': 524500},\n", - " {'end': 623500, 'start': 606500},\n", - " {'end': 713000, 'start': 629500},\n", - " {'end': 738500, 'start': 711500},\n", - " {'end': 751000, 'start': 735000},\n", - " {'end': 797500, 'start': 772500},\n", - " {'end': 883000, 'start': 809000},\n", - " {'end': 914500, 'start': 897000},\n", - " {'end': 962000, 'start': 911500}]\n" - ] - } - ], + "outputs": [], "source": [ "model = init_onnx_model(f'{files_dir}/model.onnx')\n", "wav = read_audio(f'{files_dir}/en.wav')\n", @@ -877,22 +677,9 @@ "start_time": "2021-04-15T13:34:41.368055Z" }, "hidden": true, - "id": "cox6oumC2Fw3", - "outputId": "1c18d7b1-ae80-42cb-c2e1-b6494104e5f7" + "id": "cox6oumC2Fw3" }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'save_audio' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m\u001b[0m", - "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# merge all speech chunks to one audio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msave_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'only_speech.wav'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcollect_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspeech_timestamps\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m16000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mAudio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'only_speech.wav'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'save_audio' is not defined" - ] - } - ], + "outputs": [], "source": [ "# merge all speech chunks to one audio\n", "save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), 16000)\n", @@ -922,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:09.606031Z", @@ -939,44 +726,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:11.453171Z", "start_time": "2020-12-15T13:09:09.633435Z" }, - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "NC6Jim0hjsr1", - "outputId": "4c48843d-8510-4d26-c546-220e22a85361" + "id": "NC6Jim0hjsr1" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{4000: 'start'}]\n", - "[{37000: 'end'}]\n", - "[{43000: 'start'}]\n", - "[{115500: 'end'}]\n", - "[{150500: 'start'}]\n", - "[{291000: 'end'}]\n", - "[{294500: 'start'}]\n", - "[{321000: 'end'}]\n", - "[{326500: 'start'}]\n", - "[{627000: 'end'}]\n", - "[{639000: 'start'}]\n", - "[{718000: 'end'}]\n", - "[{721000: 'start'}]\n", - "[{755500: 'end'}]\n", - "[{783000: 'start'}]\n", - "[{804000: 'end'}]\n", - "[{818500: 'start'}]\n" - ] - } - ], + "outputs": [], "source": [ "for batch in single_audio_stream(model, wav, run_function=validate_onnx):\n", " if batch:\n", @@ -995,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "hidden": true, "id": "RZwc-Khk2Fw4" @@ -1008,53 +767,12 @@ }, { 
"cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, "hidden": true, - "id": "Z4lzFPs02Fw4", - "outputId": "8d2f9cb4-dbc7-4c7c-dde1-ff3b3297aa07" + "id": "Z4lzFPs02Fw4" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{0: 'start'}]\n", - "[{38000: 'end'}]\n", - "[{43000: 'start'}]\n", - "[{115000: 'end'}]\n", - "[{148500: 'start'}]\n", - "[{250500: 'end'}]\n", - "[{260000: 'start'}]\n", - "[{292000: 'end'}]\n", - "[{293500: 'start'}]\n", - "[{320000: 'end'}]\n", - "[{325000: 'start'}]\n", - "[{548000: 'end'}]\n", - "[{547500: 'start'}]\n", - "[{613000: 'end'}]\n", - "[{615000: 'start'}]\n", - "[{626500: 'end'}]\n", - "[{638000: 'start'}]\n", - "[{697500: 'start'}]\n", - "[{718000: 'end'}]\n", - "[{720000: 'start'}]\n", - "[{756000: 'end'}]\n", - "[{781000: 'start'}]\n", - "[{804500: 'end'}]\n", - "[{817500: 'start'}]\n", - "[{872000: 'end'}]\n", - "[{871000: 'start'}]\n", - "[{902000: 'end'}]\n", - "[{905500: 'start'}]\n", - "[{920500: 'end'}]\n", - "[{920000: 'start'}]\n" - ] - } - ], + "outputs": [], "source": [ "for batch in single_audio_stream(model, wav, iterator_type='adaptive', run_function=validate_onnx):\n", " if batch:\n", From 1f741e2bec648542966bf12b72962296b058153b Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Thu, 15 Apr 2021 14:27:20 +0000 Subject: [PATCH 4/6] fx --- README.md | 8 +-- silero-vad.ipynb | 142 +++-------------------------------------------- 2 files changed, 13 insertions(+), 137 deletions(-) diff --git a/README.md b/README.md index 3f13f39..ee4cdd0 100644 --- a/README.md +++ b/README.md @@ -351,7 +351,7 @@ We use random 250 ms audio chunks for validation. Speech to non-speech ratio amo Since our VAD (only VAD, other networks are more flexible) was trained on chunks of the same length, model's output is just one float from 0 to 1 - **speech probability**. 
We use speech probabilities as thresholds for precision-recall curve. This can be extended to 100 - 150 ms. Less than 100 - 150 ms cannot be distinguished as speech with confidence. -[Webrtc](https://github.com/wiseman/py-webrtcvad) splits audio into frames, each frame has corresponding number (0 **or** 1). We use 30ms frames for webrtc, so each 250 ms chunk is split into 8 frames, their **mean** value is used as a treshold for plot. +[Webrtc](https://github.com/wiseman/py-webrtcvad) splits audio into frames, each frame has corresponding number (0 **or** 1). We use 30ms frames for webrtc, so each 250 ms chunk is split into 8 frames, their **mean** value is used as a threshold for plot. [Auditok](https://github.com/amsehili/auditok) - logic same as Webrtc, but we use 50ms frames. @@ -363,7 +363,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks #### **Classic way** -**This is straightforward classic method `get_speech_ts` where tresholds (`trig_sum` and `neg_trig_sum`) are specified by users** +**This is straightforward classic method `get_speech_ts` where thresholds (`trig_sum` and `neg_trig_sum`) are specified by users** - Among others, we provide several [utils](https://github.com/snakers4/silero-vad/blob/8b28767292b424e3e505c55f15cd3c4b91e4804b/utils.py#L52-L59) to simplify working with VAD; - We provide sensible basic hyper-parameters that work for us, but your case can be different; - `trig_sum` - overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state); @@ -384,7 +384,7 @@ speech_timestamps = get_speech_ts(wav, model, #### **Adaptive way** -**Adaptive algorythm (`get_speech_ts_adaptive`) automatically selects tresholds (`trig_sum` and `neg_trig_sum`) based on median speech probabilities over whole audio, SOME ARGUMENTS VARY FROM CLASSIC WAY FUNCTION ARGUMENTS** +**Adaptive algorithm (`get_speech_ts_adaptive`) automatically 
selects thresholds (`trig_sum` and `neg_trig_sum`) based on median speech probabilities over the whole audio, SOME ARGUMENTS VARY FROM THE CLASSIC WAY FUNCTION ARGUMENTS** - `batch_size` - batch size to feed to silero VAD (default - `200`) - `step` - step size in samples, (default - `500`) (`num_samples_per_window` / `num_steps` from classic method) - `num_samples_per_window` - number of samples in each window, our models were trained using `4000` samples (250 ms) per window, so this is preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434)); @@ -425,7 +425,7 @@ Please see [Quality Metrics](#quality-metrics) ### How Number Detector Works - It is recommended to split long audio into short ones (< 15s) and apply model on each of them; -- Number Detector can classify if whole audio contains a number, or if each audio frame contains a number; +- Number Detector can classify if the whole audio contains a number, or if each audio frame contains a number; - Audio is splitted into frames in a certain way, so, having a per-frame output, we can restore timing bounds for a numbers with an accuracy of about 0.2s; ### How Language Classifier Works diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 7d40db1..d15661f 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -3,7 +3,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, "id": "sVNOuHQQjsrp" }, "source": [ @@ -13,8 +12,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "FpMplOCA2Fwp" }, "source": [ @@ -25,7 +22,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "62A6F_072Fwq" }, "source": [ @@ -36,10 +32,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-30T17:35:43.397137Z", - "start_time": "2020-12-30T17:33:10.962078Z" - }, "hidden": true, "id": "5w5AkskZ2Fwr" }, @@ -75,8 +67,6 @@ { 
"cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "fXbbaUO3jsrw" }, "source": [ @@ -86,22 +76,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "dY2Us3_Q2Fws" }, "source": [ - "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + "**Classic way of getting speech chunks, you may need to select the thresholds yourself**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-30T17:35:44.362860Z", - "start_time": "2020-12-30T17:35:43.398441Z" - }, - "hidden": true, "id": "aI_eydBPjsrx" }, "outputs": [], @@ -117,11 +101,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-30T17:35:44.419280Z", - "start_time": "2020-12-30T17:35:44.364175Z" - }, - "hidden": true, "id": "OuEobLchjsry" }, "outputs": [], @@ -135,18 +114,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "n8plzbJU2Fws" }, "source": [ - "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + "**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "SQOtu2Vl2Fwt" }, "outputs": [], @@ -161,7 +138,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "Lr6zCGXh2Fwt" }, "outputs": [], @@ -175,8 +151,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "iDKQbVr8jsry" }, "source": [ @@ -186,26 +160,16 @@ { "cell_type": "markdown", "metadata": { - "ExecuteTime": { - "end_time": "2021-04-15T13:29:04.224833Z", - "start_time": "2021-04-15T13:29:04.220588Z" - }, - "hidden": true, "id": "xCM-HrUR2Fwu" }, "source": [ - "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + "**Classic way of 
getting speech chunks, you may need to select the thresholds yourself**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:59.199321Z", - "start_time": "2020-12-15T13:09:59.196823Z" - }, - "hidden": true, "id": "q-lql_2Wjsry" }, "outputs": [], @@ -220,18 +184,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "t8TXtnvk2Fwv" }, "source": [ - "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + "**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "BX3UgwwB2Fwv" }, "outputs": [], @@ -247,7 +209,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "KBDVybJCjsrz" }, "source": [ @@ -258,10 +219,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:10:03.590358Z", - "start_time": "2020-12-15T13:10:03.587071Z" - }, "hidden": true, "id": "BK4tGfWgjsrz" }, @@ -275,10 +232,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:10:15.762491Z", - "start_time": "2020-12-15T13:10:03.591388Z" - }, "hidden": true, "id": "v1l8sam1jsrz" }, @@ -293,7 +246,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "36jY0niD2Fww" }, "source": [ @@ -421,7 +373,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "PnKtJKbq2Fwz" }, "source": [ @@ -498,7 +449,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, "id": "57avIBd6jsrz" }, "source": [ @@ -508,8 +458,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "hEhnfORV2Fw0" }, "source": [ @@ -520,7 +468,6 @@ "cell_type": "markdown", "metadata": { 
"heading_collapsed": true, - "hidden": true, "id": "bL4kn4KJrlyL" }, "source": [ @@ -531,10 +478,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2021-04-15T13:30:22.938755Z", - "start_time": "2021-04-15T13:30:20.970574Z" - }, "cellView": "form", "hidden": true, "id": "Q4QIfSpprnkI" @@ -580,8 +523,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "5JHErdB7jsr0" }, "source": [ @@ -591,26 +532,16 @@ { "cell_type": "markdown", "metadata": { - "ExecuteTime": { - "end_time": "2021-04-15T13:34:22.554010Z", - "start_time": "2021-04-15T13:34:22.550308Z" - }, - "hidden": true, "id": "TNEtK5zi2Fw2" }, "source": [ - "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + "**Classic way of getting speech chunks, you may need to select the thresholds yourself**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2021-04-15T13:30:14.475412Z", - "start_time": "2021-04-15T13:30:14.427933Z" - }, - "hidden": true, "id": "krnGoA6Kjsr0" }, "outputs": [], @@ -627,11 +558,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:08.862421Z", - "start_time": "2020-12-15T13:09:08.820014Z" - }, - "hidden": true, "id": "B176Lzfnjsr1" }, "outputs": [], @@ -644,18 +570,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "21RE8KEC2Fw2" }, "source": [ - "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + "**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "uIVs56rb2Fw2" }, "outputs": [], @@ -672,11 +596,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2021-04-15T13:34:41.375446Z", - 
"start_time": "2021-04-15T13:34:41.368055Z" - }, - "hidden": true, "id": "cox6oumC2Fw3" }, "outputs": [], @@ -689,8 +608,6 @@ { "cell_type": "markdown", "metadata": { - "heading_collapsed": true, - "hidden": true, "id": "Rio9W50gjsr1" }, "source": [ @@ -700,22 +617,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "i8EZwtaA2Fw3" }, "source": [ - "**Classic way of getting speech chunks, you may need to select the tresholds yourself**" + "**Classic way of getting speech chunks, you may need to select the thresholds yourself**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:09.606031Z", - "start_time": "2020-12-15T13:09:09.504239Z" - }, - "hidden": true, "id": "IPkl8Yy1jsr1" }, "outputs": [], @@ -728,11 +639,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:11.453171Z", - "start_time": "2020-12-15T13:09:09.633435Z" - }, - "hidden": true, "id": "NC6Jim0hjsr1" }, "outputs": [], @@ -745,18 +651,16 @@ { "cell_type": "markdown", "metadata": { - "hidden": true, "id": "0pSKslpz2Fw3" }, "source": [ - "**Experimental Adaptive method, algorythm selects tresholds itself (see readme for more information)**" + "**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "RZwc-Khk2Fw4" }, "outputs": [], @@ -769,7 +673,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "hidden": true, "id": "Z4lzFPs02Fw4" }, "outputs": [], @@ -783,7 +686,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "WNZ42u0ajsr1" }, "source": [ @@ -794,10 +696,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:11.540423Z", - "start_time": "2020-12-15T13:09:11.455706Z" - }, "hidden": true, "id": 
"XjhGQGppjsr1" }, @@ -812,10 +710,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:19.565434Z", - "start_time": "2020-12-15T13:09:11.552097Z" - }, "hidden": true, "id": "QI7-arlqjsr2" }, @@ -830,7 +724,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "7QMvUvpg2Fw4" }, "source": [ @@ -852,10 +745,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-30T17:25:19.107534Z", - "start_time": "2020-12-30T17:24:51.853293Z" - }, "cellView": "form", "hidden": true, "id": "PdjGd56R2Fw5" @@ -912,10 +801,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:06.643812Z", - "start_time": "2020-12-15T13:09:06.473386Z" - }, "hidden": true, "id": "_r6QZiwu2Fw5" }, @@ -949,10 +834,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:08.862421Z", - "start_time": "2020-12-15T13:09:08.820014Z" - }, "hidden": true, "id": "JnvS6WTK2Fw5" }, @@ -983,7 +864,6 @@ "cell_type": "markdown", "metadata": { "heading_collapsed": true, - "hidden": true, "id": "SR8Bgcd52Fw6" }, "source": [ @@ -1005,10 +885,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-30T17:25:19.107534Z", - "start_time": "2020-12-30T17:24:51.853293Z" - }, "cellView": "form", "hidden": true, "id": "iNkDWJ3H2Fw6" @@ -1111,5 +987,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From fe77c7b79c283eb2a1c6ba55042fafbd49cc0856 Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Thu, 15 Apr 2021 14:31:00 +0000 Subject: [PATCH 5/6] collapse --- silero-vad.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/silero-vad.ipynb b/silero-vad.ipynb index d15661f..f841184 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -987,5 +987,5 @@ } }, "nbformat": 4, - 
"nbformat_minor": 1 + "nbformat_minor": 0 } From 2c487a237a699c0215afad039f1632df07221b1e Mon Sep 17 00:00:00 2001 From: Dimitrii Voronin <36505480+adamnsandle@users.noreply.github.com> Date: Mon, 19 Apr 2021 16:05:03 +0300 Subject: [PATCH 6/6] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee4cdd0..dd71db6 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', force_reload=True) (get_speech_ts, - get_speech_ts_adaptive + get_speech_ts_adaptive, _, read_audio, _, _, _) = utils @@ -202,7 +202,7 @@ _, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', force_reload=True) (get_speech_ts, - get_speech_ts_adaptive + get_speech_ts_adaptive, _, read_audio, _, _, _) = utils