{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "sVNOuHQQjsrp" }, "source": [ "# PyTorch Example" ] }, { "cell_type": "markdown", "metadata": { "id": "9ZTzCtc5kYVg" }, "source": [ "## Install Dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T14:00:15.701867Z", "start_time": "2020-12-15T14:00:09.512876Z" }, "cellView": "form", "id": "rllMjjsekbjt" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /home/keras/.cache/torch/hub/master.zip\n", "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", " warnings.warn(\n", "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", " warnings.warn(\n" ] } ], "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "#!pip install -q torchaudio soundfile\n", "\n", "import glob\n", "import torch\n", "torch.set_num_threads(1)\n", "\n", "from IPython.display import Audio\n", "\n", "\n", "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_vad',\n", " force_reload=True)\n", "\n", "(get_speech_ts,\n", " save_audio,\n", " read_audio,\n", " state_generator,\n", " single_audio_stream,\n", " collect_speeches) = utils" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T14:03:38.006309Z", "start_time": "2020-12-15T14:03:38.002613Z" } }, "outputs": [ { "data": { "text/plain": [ "'/home/keras/.cache/torch/hub'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torch.hub.get_dir()" ] }, { "cell_type": "markdown", "metadata": { "id": "fXbbaUO3jsrw" }, "source": [ "## Full audio" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:56.879818Z", "start_time": "2020-12-15T13:09:56.864765Z" }, "id": "aI_eydBPjsrx" }, "outputs": [], "source": [ "wav = read_audio('files/en.wav')\n", "# get speech timestamps from full audio file\n", "speech_timestamps = get_speech_ts(wav, model,\n", " num_steps=4)\n", "print(speech_timestamps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:58.941063Z", "start_time": "2020-12-15T13:09:58.887006Z" }, "id": "OuEobLchjsry" }, "outputs": [], "source": [ "# merge all speech chunks to one audio\n", "save_audio('only_speech.wav',\n", " collect_speeches(speech_timestamps, wav), 16000) \n", "Audio('only_speech.wav')" ] }, { "cell_type": "markdown", "metadata": { "id": "iDKQbVr8jsry" }, "source": [ "## Single Audio Stream" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:59.199321Z", "start_time": 
"2020-12-15T13:09:59.196823Z" }, "id": "q-lql_2Wjsry" }, "outputs": [], "source": [ "wav = 'files/en.wav'\n", "\n", "for batch in single_audio_stream(model, wav):\n", " if batch:\n", " print(batch)" ] }, { "cell_type": "markdown", "metadata": { "id": "KBDVybJCjsrz" }, "source": [ "## Multiple Audio Streams" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:10:03.590358Z", "start_time": "2020-12-15T13:10:03.587071Z" }, "id": "BK4tGfWgjsrz" }, "outputs": [], "source": [ "audios_for_stream = glob.glob('files/*.wav')\n", "len(audios_for_stream) # total 4 audios" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:10:15.762491Z", "start_time": "2020-12-15T13:10:03.591388Z" }, "id": "v1l8sam1jsrz" }, "outputs": [], "source": [ "for batch in state_generator(model, audios_for_stream, audios_in_stream=2): # 2 audio stream\n", " if batch:\n", " print(batch)" ] }, { "cell_type": "markdown", "metadata": { "id": "57avIBd6jsrz" }, "source": [ "# ONNX Example" ] }, { "cell_type": "markdown", "metadata": { "id": "bL4kn4KJrlyL" }, "source": [ "## Install Dependencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q4QIfSpprnkI" }, "outputs": [], "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "!pip install -q torchaudio soundfile onnxruntime\n", "\n", "import glob\n", "import onnxruntime\n", "\n", "from IPython.display import Audio\n", "\n", "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_vad',\n", " force_reload=True)\n", "\n", "(get_speech_ts,\n", " save_audio,\n", " read_audio,\n", " state_generator,\n", " single_audio_stream,\n", " collect_speeches) = utils\n", "\n", " def init_onnx_model(model_path: str):\n", " return onnxruntime.InferenceSession(model_path)\n", "\n", "def validate_onnx(model, inputs):\n", " with torch.no_grad():\n", " ort_inputs = {'input': inputs.cpu().numpy()}\n", " outs = model.run(None, ort_inputs)\n", " outs = [torch.Tensor(x) for x in outs]\n", " return outs" ] }, { "cell_type": "markdown", "metadata": { "id": "5JHErdB7jsr0" }, "source": [ "## Full audio" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:06.643812Z", "start_time": "2020-12-15T13:09:06.473386Z" }, "id": "krnGoA6Kjsr0" }, "outputs": [], "source": [ "model = init_onnx_model('files/model.onnx')\n", "wav = read_audio('files/en.wav')\n", "\n", "# get speech timestamps from full audio file\n", "speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx) \n", "print(speech_timestamps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:08.862421Z", "start_time": "2020-12-15T13:09:08.820014Z" }, "id": "B176Lzfnjsr1" }, "outputs": [], "source": [ "# merge all speech chunks to one audio\n", "save_audio('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n", "Audio('only_speech.wav')" ] }, { "cell_type": "markdown", "metadata": { "id": "Rio9W50gjsr1" }, "source": [ "## Single audio stream" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:09.606031Z", "start_time": "2020-12-15T13:09:09.504239Z" }, "id": "IPkl8Yy1jsr1" }, "outputs": [], "source": [ "model = init_onnx_model('files/model.onnx')\n", "wav = 'files/en.wav'" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:11.453171Z", "start_time": "2020-12-15T13:09:09.633435Z" }, "id": "NC6Jim0hjsr1" }, "outputs": [], "source": [ "for batch in single_audio_stream(model, wav, run_function=validate_onnx):\n", " if batch:\n", " print(batch)" ] }, { "cell_type": "markdown", "metadata": { "id": "WNZ42u0ajsr1" }, "source": [ "## Multiple audio stream" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:11.540423Z", "start_time": "2020-12-15T13:09:11.455706Z" }, "id": "XjhGQGppjsr1" }, "outputs": [], "source": [ "model = init_onnx_model('files/model.onnx')\n", "audios_for_stream = glob.glob('files/*.wav')\n", "print(len(audios_for_stream)) # total 4 audios" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-12-15T13:09:19.565434Z", "start_time": "2020-12-15T13:09:11.552097Z" }, "id": "QI7-arlqjsr2" }, "outputs": [], "source": [ "for batch in state_generator(model, audios_for_stream, audios_in_stream=2, run_function=validate_onnx): # 2 audio stream\n", " if batch:\n", " print(batch)" ] } ], "metadata": { "colab": { "name": "silero-vad.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }