From f638c475956f7b84ef74fbd3ff0a651513ae24ac Mon Sep 17 00:00:00 2001 From: adamnsandle Date: Tue, 7 Dec 2021 10:54:50 +0000 Subject: [PATCH] collab fx --- silero-vad.ipynb | 1320 +++++++++++++++++++++++----------------------- utils_vad.py | 1 - 2 files changed, 660 insertions(+), 661 deletions(-) diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 8a84e81..9fb5d8a 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -1,662 +1,662 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "silero-vad.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sVNOuHQQjsrp" + }, + "source": [ + "# PyTorch Examples" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "sVNOuHQQjsrp" - }, - "source": [ - "# PyTorch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FpMplOCA2Fwp" - }, - "source": [ - "## VAD" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "62A6F_072Fwq" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "5w5AkskZ2Fwr" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "SAMPLE_RATE = 16000\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_vad',\n", - " force_reload=True)\n", - "\n", - "(get_speech_timestamps,\n", - " save_audio,\n", - " read_audio,\n", - " VADIterator,\n", - " collect_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fXbbaUO3jsrw" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RAfJPb_a-Auj" - }, - "source": [ - "**Speech timestapms from full audio**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aI_eydBPjsrx" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "# get speech timestamps from full audio file\n", - "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n", - "pprint(speech_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "OuEobLchjsry" - }, - "source": [ - "# merge all speech chunks to one audio\n", - "save_audio('only_speech.wav',\n", - " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n", - "Audio('only_speech.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iDKQbVr8jsry" - }, - "source": [ - "**Stream imitation example**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q-lql_2Wjsry" - }, - "source": [ - "## using VADIterator class\n", - "\n", - "vad_iterator = VADiterator(double_model)\n", - "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "\n", - "window_size_samples = 1536 # number of samples in a single audio chunk\n", - "for i in range(0, len(wav), window_size_samples):\n", - " speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n", - " if speech_dict:\n", - " print(speech_dict, end=' ')\n", - "vad_iterator.reset_states() # reset model states after each audio" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BX3UgwwB2Fwv" - }, - "source": [ - "## just probabilities\n", - "\n", - "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "speech_probs = []\n", - "window_size_samples = 1536\n", - "for i in range(0, len(wav), window_size_samples):\n", - " speech_prob = model(wav[i: i+ window_size_samples], SAMPLE_RATE).item()\n", - " speech_probs.append(speech_prob)\n", - "\n", - "pprint(speech_probs[:100])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "36jY0niD2Fww" - }, - "source": [ - "## Number detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "scd1DlS42Fwx" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "Kq5gQuYq2Fwx" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_number_detector',\n", - " force_reload=True)\n", - "\n", - "(get_number_ts,\n", - " save_audio,\n", - " read_audio,\n", - " collect_chunks,\n", - " drop_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "qhPa30ij2Fwy" - }, - "source": [ - "### Full audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "EXpau6xq2Fwy" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en_num.wav')\n", - "# get number timestamps from full audio file\n", - "number_timestamps = get_number_ts(wav, model)\n", - "pprint(number_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "u-KfXRhZ2Fwy" - }, - "source": [ - "sample_rate = 16000\n", - "# convert ms in timestamps to samples\n", - "for timestamp in number_timestamps:\n", - " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", - " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "iwYEC4aZ2Fwy" - }, - "source": [ - "# merge all number chunks to one audio\n", - "save_audio('only_numbers.wav',\n", - " collect_chunks(number_timestamps, wav), sample_rate) \n", - "Audio('only_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "fHaYejX12Fwy" - }, - "source": [ - "# drop all number chunks from audio\n", - "save_audio('no_numbers.wav',\n", - " drop_chunks(number_timestamps, wav), sample_rate) \n", - "Audio('no_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "PnKtJKbq2Fwz" - }, - "source": [ - "## Language detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "F5cAmMbP2Fwz" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "Zu9D0t6n2Fwz" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_lang_detector',\n", - " force_reload=True)\n", - "\n", - "(get_language,\n", - " read_audio) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "iC696eMX2Fwz" - }, - "source": [ - "### Full audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "c8UYnYBF2Fw0" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en.wav')\n", - "lang = get_language(wav, model)\n", - "print(lang)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "57avIBd6jsrz" - }, - "source": [ - "# ONNX Example" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hEhnfORV2Fw0" - }, - "source": [ - "## VAD" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cy7y-NAyALSe" - }, - "source": [ - "**TO BE DONE**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "7QMvUvpg2Fw4" - }, - "source": [ - "## Number detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "tBPDkpHr2Fw4" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "hidden": true, - "id": "PdjGd56R2Fw5" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile onnxruntime\n", - "\n", - "import glob\n", - "import torch\n", - "import onnxruntime\n", - "from pprint import pprint\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_number_detector',\n", - " force_reload=True)\n", - "\n", - "(get_number_ts,\n", - " save_audio,\n", - " read_audio,\n", - " collect_chunks,\n", - " drop_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", - "\n", - "def init_onnx_model(model_path: str):\n", - " return onnxruntime.InferenceSession(model_path)\n", - "\n", - "def validate_onnx(model, inputs):\n", - " with torch.no_grad():\n", - " ort_inputs = {'input': inputs.cpu().numpy()}\n", - " outs = model.run(None, ort_inputs)\n", - " outs = [torch.Tensor(x) for x in outs]\n", - " return outs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "I9QWSFZh2Fw5" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "_r6QZiwu2Fw5" - }, - "source": [ - "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", - "wav = read_audio(f'{files_dir}/en_num.wav')\n", - "\n", - "# get number timestamps from full audio file\n", - "number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n", - "pprint(number_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "FN4aDwLV2Fw5" - }, - "source": [ - "sample_rate = 16000\n", - "# convert ms in timestamps to samples\n", - "for timestamp in number_timestamps:\n", - " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", - " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "JnvS6WTK2Fw5" - }, - "source": [ - "# merge all number chunks to one audio\n", - "save_audio('only_numbers.wav',\n", - " collect_chunks(number_timestamps, wav), 16000) \n", - "Audio('only_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "yUxOcOFG2Fw6" - }, - "source": [ - "# drop all number chunks from audio\n", - "save_audio('no_numbers.wav',\n", - " drop_chunks(number_timestamps, wav), 16000) \n", - "Audio('no_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "SR8Bgcd52Fw6" - }, - "source": [ - "## Language detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "PBnXPtKo2Fw6" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "hidden": true, - "id": "iNkDWJ3H2Fw6" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile onnxruntime\n", - "\n", - "import glob\n", - "import torch\n", - "import onnxruntime\n", - "from pprint import pprint\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_lang_detector',\n", - " force_reload=True)\n", - "\n", - "(get_language,\n", - " read_audio) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", - "\n", - "def init_onnx_model(model_path: str):\n", - " return onnxruntime.InferenceSession(model_path)\n", - "\n", - "def validate_onnx(model, inputs):\n", - " with torch.no_grad():\n", - " ort_inputs = {'input': inputs.cpu().numpy()}\n", - " outs = model.run(None, ort_inputs)\n", - " outs = [torch.Tensor(x) for x in outs]\n", - " return outs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true, - "id": "G8N8oP4q2Fw6" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "WHXnh9IV2Fw6" - }, - "source": [ - "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", - "wav = read_audio(f'{files_dir}/en.wav')\n", - "\n", - "lang = get_language(wav, model, run_function=validate_onnx)\n", - "print(lang)" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "FpMplOCA2Fwp" + }, + "source": [ + "## VAD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "62A6F_072Fwq" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "5w5AkskZ2Fwr" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio\n", + "\n", + "SAMPLE_RATE = 16000\n", + "\n", + "import glob\n", + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", + "\n", + "(get_speech_timestamps,\n", + " save_audio,\n", + " read_audio,\n", + " VADIterator,\n", + " collect_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fXbbaUO3jsrw" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RAfJPb_a-Auj" + }, + "source": [ + "**Speech timestapms from full audio**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aI_eydBPjsrx" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n", + "pprint(speech_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OuEobLchjsry" + }, + "outputs": [], + "source": [ + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav',\n", + " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n", + "Audio('only_speech.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iDKQbVr8jsry" + }, + "source": [ + "### Stream imitation example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "q-lql_2Wjsry" + }, + "outputs": [], + "source": [ + "## using VADIterator class\n", + "\n", + "vad_iterator = VADIterator(model)\n", + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "\n", + "window_size_samples = 1536 # number of samples in a single audio chunk\n", + "for i in range(0, len(wav), window_size_samples):\n", + " speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n", + " if speech_dict:\n", + " print(speech_dict, end=' ')\n", + "vad_iterator.reset_states() # reset model states after each audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BX3UgwwB2Fwv" + }, + "outputs": [], + "source": [ + "## just probabilities\n", + "\n", + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "speech_probs = []\n", + "window_size_samples = 1536\n", + "for i in range(0, len(wav), window_size_samples):\n", + " speech_prob = model(wav[i: i+ window_size_samples], SAMPLE_RATE).item()\n", + " speech_probs.append(speech_prob)\n", + "\n", + "pprint(speech_probs[:100])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "36jY0niD2Fww" + }, + "source": [ + "## Number detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "scd1DlS42Fwx" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "Kq5gQuYq2Fwx" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile\n", + "\n", + "import glob\n", + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_number_detector',\n", + " force_reload=True)\n", + "\n", + "(get_number_ts,\n", + " save_audio,\n", + " read_audio,\n", + " collect_chunks,\n", + " drop_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "qhPa30ij2Fwy" + }, + "source": [ + "### Full audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "EXpau6xq2Fwy" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en_num.wav')\n", + "# get number timestamps from full audio file\n", + "number_timestamps = get_number_ts(wav, model)\n", + "pprint(number_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "u-KfXRhZ2Fwy" + }, + "outputs": [], + "source": [ + "sample_rate = 16000\n", + "# convert ms in timestamps to samples\n", + "for timestamp in number_timestamps:\n", + " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", + " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "iwYEC4aZ2Fwy" + }, + "outputs": [], + "source": [ + "# merge all number chunks to one audio\n", + "save_audio('only_numbers.wav',\n", + " collect_chunks(number_timestamps, wav), sample_rate) \n", + "Audio('only_numbers.wav')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "fHaYejX12Fwy" + }, + "outputs": [], + "source": [ + "# drop all number chunks from audio\n", + "save_audio('no_numbers.wav',\n", + " drop_chunks(number_timestamps, wav), sample_rate) \n", + "Audio('no_numbers.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "PnKtJKbq2Fwz" + }, + "source": [ + "## Language detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "F5cAmMbP2Fwz" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "Zu9D0t6n2Fwz" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile\n", + "\n", + "import glob\n", + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_lang_detector',\n", + " force_reload=True)\n", + "\n", + "(get_language,\n", + " read_audio) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "iC696eMX2Fwz" + }, + "source": [ + "### Full audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "c8UYnYBF2Fw0" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en.wav')\n", + "lang = get_language(wav, model)\n", + "print(lang)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "57avIBd6jsrz" + }, + "source": [ + "# ONNX Example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hEhnfORV2Fw0" + }, + "source": [ + "## VAD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Cy7y-NAyALSe" + }, + "source": [ + "**TO BE DONE**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "7QMvUvpg2Fw4" + }, + "source": [ + "## Number detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "tBPDkpHr2Fw4" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "hidden": true, + "id": "PdjGd56R2Fw5" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile onnxruntime\n", + "\n", + "import glob\n", + "import torch\n", + "import onnxruntime\n", + "from pprint import pprint\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_number_detector',\n", + " force_reload=True)\n", + "\n", + "(get_number_ts,\n", + " save_audio,\n", + " read_audio,\n", + " collect_chunks,\n", + " drop_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", + "\n", + "def init_onnx_model(model_path: str):\n", + " return onnxruntime.InferenceSession(model_path)\n", + "\n", + "def validate_onnx(model, inputs):\n", + " with torch.no_grad():\n", + " ort_inputs = {'input': inputs.cpu().numpy()}\n", + " outs = model.run(None, ort_inputs)\n", + " outs = [torch.Tensor(x) for x in outs]\n", + " return outs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "I9QWSFZh2Fw5" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "_r6QZiwu2Fw5" + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", + "wav = read_audio(f'{files_dir}/en_num.wav')\n", + "\n", + "# get number timestamps from full audio file\n", + "number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n", + "pprint(number_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "FN4aDwLV2Fw5" + }, + "outputs": [], + "source": [ + "sample_rate = 16000\n", + "# convert ms in timestamps to samples\n", + "for timestamp in number_timestamps:\n", + " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", + " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "JnvS6WTK2Fw5" + }, + "outputs": [], + "source": [ + "# merge all number chunks to one audio\n", + "save_audio('only_numbers.wav',\n", + " collect_chunks(number_timestamps, wav), 16000) \n", + "Audio('only_numbers.wav')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "yUxOcOFG2Fw6" + }, + "outputs": [], + "source": [ + "# drop all number chunks from audio\n", + "save_audio('no_numbers.wav',\n", + " drop_chunks(number_timestamps, wav), 16000) \n", + "Audio('no_numbers.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "SR8Bgcd52Fw6" + }, + "source": [ + "## Language detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "PBnXPtKo2Fw6" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "hidden": true, + "id": "iNkDWJ3H2Fw6" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile onnxruntime\n", + "\n", + "import glob\n", + "import torch\n", + "import onnxruntime\n", + "from pprint import pprint\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_lang_detector',\n", + " force_reload=True)\n", + "\n", + "(get_language,\n", + " read_audio) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", + "\n", + "def init_onnx_model(model_path: str):\n", + " return onnxruntime.InferenceSession(model_path)\n", + "\n", + "def validate_onnx(model, inputs):\n", + " with torch.no_grad():\n", + " ort_inputs = {'input': inputs.cpu().numpy()}\n", + " outs = model.run(None, ort_inputs)\n", + " outs = [torch.Tensor(x) for x in outs]\n", + " return outs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true, + "id": "G8N8oP4q2Fw6" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "WHXnh9IV2Fw6" + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", + "wav = read_audio(f'{files_dir}/en.wav')\n", + "\n", + "lang = get_language(wav, model, run_function=validate_onnx)\n", + "print(lang)" + ] + } + ], + "metadata": { + "colab": { + "name": "silero-vad.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/utils_vad.py b/utils_vad.py index aed2a0e..2211f06 100644 --- a/utils_vad.py +++ b/utils_vad.py @@ -20,7 +20,6 @@ def validate(model, def read_audio(path: str, sampling_rate: int = 16000): - assert torchaudio.get_audio_backend() == 'soundfile' wav, sr = torchaudio.load(path) if wav.size(0) > 1: