From 6572cad0faec75e0259e18b0e2dea2522f7381f1 Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Tue, 15 Dec 2020 14:05:12 +0000 Subject: [PATCH 1/2] fx --- hubconf.py | 3 +- silero-vad.ipynb | 308 ++++++++++++++++++++++++----------------------- 2 files changed, 162 insertions(+), 149 deletions(-) diff --git a/hubconf.py b/hubconf.py index 006fcde..fc1d39e 100644 --- a/hubconf.py +++ b/hubconf.py @@ -14,7 +14,8 @@ def silero_vad(**kwargs): Returns a model with a set of utils Please see https://github.com/snakers4/silero-vad for usage examples """ - model = init_jit_model(model_path='files/model.jit') + hub_dir = torch.hub.get_dir() + model = init_jit_model(model_path=f'{hub_dir}/files/model.jit') utils = (get_speech_ts, save_audio, read_audio, diff --git a/silero-vad.ipynb b/silero-vad.ipynb index cf83b5b..77acd3e 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -2,41 +2,62 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "sVNOuHQQjsrp" + }, "source": [ - "# Jit example" + "# PyTorch Example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ZTzCtc5kYVg" + }, + "source": [ + "## Install Dependencies" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q torchaudio\n", - "!pip install -q ipython # For jupyter audio display" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-12-15T13:09:54.623434Z", - "start_time": "2020-12-15T13:09:54.241855Z" - } + "end_time": "2020-12-15T14:00:15.701867Z", + "start_time": "2020-12-15T14:00:09.512876Z" + }, + "cellView": "form", + "id": "rllMjjsekbjt" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /home/keras/.cache/torch/hub/master.zip\n", + "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", + " warnings.warn(\n", + "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "# dependencies\n", + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "#!pip install -q torchaudio soundfile\n", + "\n", "import glob\n", "import torch\n", "torch.set_num_threads(1)\n", + "\n", "from IPython.display import Audio\n", "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_vad')\n", "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", "\n", "(get_speech_ts,\n", " save_audio,\n", @@ -46,9 +67,36 @@ " collect_speeches) = utils" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-12-15T14:03:38.006309Z", + "start_time": "2020-12-15T14:03:38.002613Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/keras/.cache/torch/hub'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.hub.get_dir()" + ] + }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "fXbbaUO3jsrw" + }, "source": [ "## Full audio" ] @@ -60,39 +108,16 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:56.879818Z", "start_time": "2020-12-15T13:09:56.864765Z" - } + }, + "id": "aI_eydBPjsrx" }, "outputs": [], "source": [ - "wav = read_audio('files/en.wav')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:58.876034Z", - "start_time": "2020-12-15T13:09:57.139254Z" - } - }, - "outputs": [], - "source": [ - "speech_timestamps = get_speech_ts(wav, model, num_steps=4) # get speech timestamps from full audio file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:58.885802Z", - "start_time": "2020-12-15T13:09:58.877327Z" - } - }, - "outputs": [], - "source": [ - "speech_timestamps" + "wav = read_audio('files/en.wav')\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_ts(wav, model,\n", + " num_steps=4)\n", + "print(speech_timestamps)" ] }, { @@ -102,19 +127,24 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:58.941063Z", "start_time": "2020-12-15T13:09:58.887006Z" - } + }, + "id": "OuEobLchjsry" }, "outputs": [], "source": [ - "save_audio('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000) # merge all speech chunks to one audio\n", + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav',\n", + " collect_speeches(speech_timestamps, wav), 16000) \n", "Audio('only_speech.wav')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "iDKQbVr8jsry" + }, "source": [ - "## Single audio stream" + "## Single Audio Stream" ] }, { @@ -124,24 +154,13 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:59.199321Z", "start_time": "2020-12-15T13:09:59.196823Z" - } - }, - "outputs": [], - "source": [ - "wav = 'files/en.wav'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:10:03.585644Z", - "start_time": "2020-12-15T13:09:59.429757Z" - } + }, + "id": "q-lql_2Wjsry" }, "outputs": [], "source": [ + "wav = 'files/en.wav'\n", + "\n", "for batch in single_audio_stream(model, wav):\n", " if batch:\n", " print(batch)" @@ -149,9 +168,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "KBDVybJCjsrz" + }, "source": [ - "## Multiple audio stream" + "## Multiple Audio Streams" ] }, { @@ -161,7 +182,8 @@ "ExecuteTime": { "end_time": "2020-12-15T13:10:03.590358Z", "start_time": "2020-12-15T13:10:03.587071Z" - } + }, + "id": "BK4tGfWgjsrz" }, "outputs": [], "source": [ @@ -176,7 +198,8 @@ "ExecuteTime": { "end_time": "2020-12-15T13:10:15.762491Z", "start_time": "2020-12-15T13:10:03.591388Z" - } + }, + "id": "v1l8sam1jsrz" }, "outputs": [], "source": [ @@ -187,43 +210,52 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "57avIBd6jsrz" + }, "source": [ - "# Onnx example" + "# ONNX Example" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "id": "bL4kn4KJrlyL" + }, "source": [ - "!pip install -q ipython # For jupyter audio display\n", - "!pip install -q onnxruntime" + "## Install Dependencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:05.932256Z", - "start_time": "2020-12-15T13:09:05.043659Z" - } + "id": "Q4QIfSpprnkI" }, "outputs": [], "source": [ - "# dependencies\n", + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile onnxruntime\n", + "\n", "import glob\n", - "import torch\n", - "from IPython.display import Audio\n", - "torch.set_num_threads(1)\n", "import onnxruntime\n", "\n", - "from utils import (get_speech_ts, save_audio, read_audio, \n", - " state_generator, single_audio_stream, collect_speeches)\n", + "from IPython.display import Audio\n", "\n", - "def init_onnx_model(model_path: str):\n", + "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", + "\n", + "(get_speech_ts,\n", + " save_audio,\n", + " read_audio,\n", + " state_generator,\n", + " single_audio_stream,\n", + " collect_speeches) = utils\n", + "\n", + " def init_onnx_model(model_path: str):\n", " return onnxruntime.InferenceSession(model_path)\n", "\n", "def validate_onnx(model, inputs):\n", @@ -236,7 +268,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "5JHErdB7jsr0" + }, "source": [ "## Full audio" ] @@ -248,40 +282,17 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:06.643812Z", "start_time": "2020-12-15T13:09:06.473386Z" - } + }, + "id": "krnGoA6Kjsr0" }, "outputs": [], "source": [ "model = init_onnx_model('files/model.onnx')\n", - "wav = read_audio('files/en.wav')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:08.094414Z", - "start_time": "2020-12-15T13:09:07.073253Z" - } - }, - "outputs": [], - "source": [ - "speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx) # get speech timestamps from full audio file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:08.107584Z", - "start_time": "2020-12-15T13:09:08.096550Z" - } - }, - "outputs": [], - "source": [ - "speech_timestamps" + "wav = read_audio('files/en.wav')\n", + "\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx) \n", + "print(speech_timestamps)" ] }, { @@ -291,17 +302,21 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:08.862421Z", "start_time": "2020-12-15T13:09:08.820014Z" - } + }, + "id": "B176Lzfnjsr1" }, "outputs": [], "source": [ - "save_audio('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000) # merge all speech chunks to one audio\n", + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n", "Audio('only_speech.wav')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "Rio9W50gjsr1" + }, "source": [ "## Single audio stream" ] @@ -313,7 +328,8 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:09.606031Z", "start_time": "2020-12-15T13:09:09.504239Z" - } + }, + "id": "IPkl8Yy1jsr1" }, "outputs": [], "source": [ @@ -328,7 +344,8 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:11.453171Z", "start_time": "2020-12-15T13:09:09.633435Z" - } + }, + "id": "NC6Jim0hjsr1" }, "outputs": [], "source": [ @@ -339,7 +356,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "WNZ42u0ajsr1" + }, "source": [ "## Multiple audio stream" ] @@ -351,26 +370,14 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:11.540423Z", "start_time": "2020-12-15T13:09:11.455706Z" - } - }, - "outputs": [], - "source": [ - "model = init_onnx_model('files/model.onnx')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-15T13:09:11.550815Z", - "start_time": "2020-12-15T13:09:11.542954Z" - } + }, + "id": "XjhGQGppjsr1" }, "outputs": [], "source": [ + "model = init_onnx_model('files/model.onnx')\n", "audios_for_stream = glob.glob('files/*.wav')\n", - "len(audios_for_stream) # total 4 audios" + "print(len(audios_for_stream)) # total 4 audios" ] }, { @@ -380,7 +387,8 @@ "ExecuteTime": { "end_time": "2020-12-15T13:09:19.565434Z", "start_time": "2020-12-15T13:09:11.552097Z" - } + }, + "id": "QI7-arlqjsr2" }, "outputs": [], "source": [ @@ -391,6 +399,10 @@ } ], "metadata": { + "colab": { + "name": "silero-vad.ipynb", + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -423,5 +435,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 1 } From f892e22b0dc831294e06c53c4f6f520c9da39e6e Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Tue, 15 Dec 2020 14:08:28 +0000 Subject: [PATCH 2/2] fx --- hubconf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hubconf.py b/hubconf.py index fc1d39e..006fcde 100644 --- a/hubconf.py +++ b/hubconf.py @@ -14,8 +14,7 @@ def silero_vad(**kwargs): Returns a model with a set of utils Please see https://github.com/snakers4/silero-vad for usage examples """ - hub_dir = torch.hub.get_dir() - model = init_jit_model(model_path=f'{hub_dir}/files/model.jit') + model = init_jit_model(model_path='files/model.jit') utils = (get_speech_ts, save_audio, read_audio,