{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "silero-vad.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "sVNOuHQQjsrp" }, "source": [ "# PyTorch Examples" ] }, { "cell_type": "markdown", "metadata": { "id": "FpMplOCA2Fwp" }, "source": [ "## VAD" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "id": "62A6F_072Fwq" }, "source": [ "### Install Dependencies" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "5w5AkskZ2Fwr" }, "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "!pip install -q torchaudio soundfile\n", "\n", "SAMPLE_RATE = 16000\n", "\n", "import glob\n", "import torch\n", "torch.set_num_threads(1)\n", "\n", "from IPython.display import Audio\n", "from pprint import pprint\n", "\n", "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_vad',\n", " force_reload=True)\n", "\n", "(get_speech_timestamps,\n", " save_audio,\n", " read_audio,\n", " VADIterator,\n", " collect_chunks) = utils\n", "\n", "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "fXbbaUO3jsrw" }, "source": [ "### Full Audio" ] }, { "cell_type": "markdown", "metadata": { "id": 
"RAfJPb_a-Auj" }, "source": [ "**Speech timestapms from full audio**" ] }, { "cell_type": "code", "metadata": { "id": "aI_eydBPjsrx" }, "source": [ "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", "# get speech timestamps from full audio file\n", "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n", "pprint(speech_timestamps)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OuEobLchjsry" }, "source": [ "# merge all speech chunks to one audio\n", "save_audio('only_speech.wav',\n", " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n", "Audio('only_speech.wav')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "iDKQbVr8jsry" }, "source": [ "**Stream imitation example**" ] }, { "cell_type": "code", "metadata": { "id": "q-lql_2Wjsry" }, "source": [ "## using VADIterator class\n", "\n", "vad_iterator = VADiterator(double_model)\n", "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", "\n", "window_size_samples = 1536 # number of samples in a single audio chunk\n", "for i in range(0, len(wav), window_size_samples):\n", " speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n", " if speech_dict:\n", " print(speech_dict, end=' ')\n", "vad_iterator.reset_states() # reset model states after each audio" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "BX3UgwwB2Fwv" }, "source": [ "## just probabilities\n", "\n", "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", "speech_probs = []\n", "window_size_samples = 1536\n", "for i in range(0, len(wav), window_size_samples):\n", " speech_prob = model(wav[i: i+ window_size_samples], SAMPLE_RATE).item()\n", " speech_probs.append(speech_prob)\n", "\n", "pprint(speech_probs[:100])" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { 
"heading_collapsed": true, "id": "36jY0niD2Fww" }, "source": [ "## Number detector" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "scd1DlS42Fwx" }, "source": [ "### Install Dependencies" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "Kq5gQuYq2Fwx" }, "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "!pip install -q torchaudio soundfile\n", "\n", "import glob\n", "import torch\n", "torch.set_num_threads(1)\n", "\n", "from IPython.display import Audio\n", "from pprint import pprint\n", "\n", "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_number_detector',\n", " force_reload=True)\n", "\n", "(get_number_ts,\n", " save_audio,\n", " read_audio,\n", " collect_chunks,\n", " drop_chunks) = utils\n", "\n", "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "qhPa30ij2Fwy" }, "source": [ "### Full audio" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "EXpau6xq2Fwy" }, "source": [ "wav = read_audio(f'{files_dir}/en_num.wav')\n", "# get number timestamps from full audio file\n", "number_timestamps = get_number_ts(wav, model)\n", "pprint(number_timestamps)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "u-KfXRhZ2Fwy" }, "source": [ "sample_rate = 16000\n", "# convert ms in timestamps to samples\n", "for timestamp in number_timestamps:\n", " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "iwYEC4aZ2Fwy" }, "source": [ "# merge all number chunks to one audio\n", 
"save_audio('only_numbers.wav',\n", " collect_chunks(number_timestamps, wav), sample_rate) \n", "Audio('only_numbers.wav')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "fHaYejX12Fwy" }, "source": [ "# drop all number chunks from audio\n", "save_audio('no_numbers.wav',\n", " drop_chunks(number_timestamps, wav), sample_rate) \n", "Audio('no_numbers.wav')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "id": "PnKtJKbq2Fwz" }, "source": [ "## Language detector" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "F5cAmMbP2Fwz" }, "source": [ "### Install Dependencies" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "Zu9D0t6n2Fwz" }, "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "!pip install -q torchaudio soundfile\n", "\n", "import glob\n", "import torch\n", "torch.set_num_threads(1)\n", "\n", "from IPython.display import Audio\n", "from pprint import pprint\n", "\n", "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_lang_detector',\n", " force_reload=True)\n", "\n", "(get_language,\n", " read_audio) = utils\n", "\n", "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "iC696eMX2Fwz" }, "source": [ "### Full audio" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "c8UYnYBF2Fw0" }, "source": [ "wav = read_audio(f'{files_dir}/en.wav')\n", "lang = get_language(wav, model)\n", "print(lang)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "57avIBd6jsrz" }, "source": [ "# ONNX Example" ] }, { "cell_type": "markdown", "metadata": { "id": "hEhnfORV2Fw0" }, 
"source": [ "## VAD" ] }, { "cell_type": "markdown", "metadata": { "id": "Cy7y-NAyALSe" }, "source": [ "**TO BE DONE**" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "id": "7QMvUvpg2Fw4" }, "source": [ "## Number detector" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "tBPDkpHr2Fw4" }, "source": [ "### Install Dependencies" ] }, { "cell_type": "code", "metadata": { "cellView": "form", "hidden": true, "id": "PdjGd56R2Fw5" }, "source": [ "#@title Install and Import Dependencies\n", "\n", "# this assumes that you have a relevant version of PyTorch installed\n", "!pip install -q torchaudio soundfile onnxruntime\n", "\n", "import glob\n", "import torch\n", "import onnxruntime\n", "from pprint import pprint\n", "\n", "from IPython.display import Audio\n", "\n", "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", " model='silero_number_detector',\n", " force_reload=True)\n", "\n", "(get_number_ts,\n", " save_audio,\n", " read_audio,\n", " collect_chunks,\n", " drop_chunks) = utils\n", "\n", "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", "\n", "def init_onnx_model(model_path: str):\n", " return onnxruntime.InferenceSession(model_path)\n", "\n", "def validate_onnx(model, inputs):\n", " with torch.no_grad():\n", " ort_inputs = {'input': inputs.cpu().numpy()}\n", " outs = model.run(None, ort_inputs)\n", " outs = [torch.Tensor(x) for x in outs]\n", " return outs" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "I9QWSFZh2Fw5" }, "source": [ "### Full Audio" ] }, { "cell_type": "code", "metadata": { "hidden": true, "id": "_r6QZiwu2Fw5" }, "source": [ "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", "wav = read_audio(f'{files_dir}/en_num.wav')\n", "\n", "# get number timestamps from full audio file\n", "number_timestamps = get_number_ts(wav, model, 
run_function=validate_onnx)\n", "pprint(number_timestamps)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "hidden": true, "id": "FN4aDwLV2Fw5" }, "source": [
  "sample_rate = 16000\n",
  "# convert ms in timestamps to samples; a new list is built (instead of mutating\n",
  "# number_timestamps in place) so re-running this cell does not convert twice\n",
  "number_timestamps_samples = [\n",
  "    {**ts,\n",
  "     'start': int(ts['start'] * sample_rate / 1000),\n",
  "     'end': int(ts['end'] * sample_rate / 1000)}\n",
  "    for ts in number_timestamps\n",
  "]"
 ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "hidden": true, "id": "JnvS6WTK2Fw5" }, "source": [
  "# merge all number chunks to one audio\n",
  "save_audio('only_numbers.wav',\n",
  "           collect_chunks(number_timestamps_samples, wav), sample_rate)\n",
  "Audio('only_numbers.wav')"
 ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "hidden": true, "id": "yUxOcOFG2Fw6" }, "source": [
  "# drop all number chunks from audio\n",
  "save_audio('no_numbers.wav',\n",
  "           drop_chunks(number_timestamps_samples, wav), sample_rate)\n",
  "Audio('no_numbers.wav')"
 ], "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "heading_collapsed": true, "id": "SR8Bgcd52Fw6" }, "source": [ "## Language detector" ] },
{ "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true, "id": "PBnXPtKo2Fw6" }, "source": [ "### Install Dependencies" ] },
{ "cell_type": "code", "metadata": { "cellView": "form", "hidden": true, "id": "iNkDWJ3H2Fw6" }, "source": [
  "#@title Install and Import Dependencies\n",
  "\n",
  "# this assumes that you have a relevant version of PyTorch installed\n",
  "# %pip (rather than !pip) installs into the environment of the running kernel\n",
  "%pip install -q torchaudio soundfile onnxruntime\n",
  "\n",
  "import glob\n",
  "import torch\n",
  "import onnxruntime\n",
  "from pprint import pprint\n",
  "\n",
  "from IPython.display import Audio\n",
  "\n",
  "# only the helper utils are needed here; the model itself is run through onnxruntime\n",
  "_, utils = torch.hub.load(\n",
  "    repo_or_dir='snakers4/silero-vad',\n",
  "    model='silero_lang_detector',\n",
  "    force_reload=True,\n",
  ")\n",
  "\n",
  "(get_language,\n",
  " read_audio) = utils\n",
  "\n",
  "files_dir = 
torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n",
  "\n",
  "def init_onnx_model(model_path: str):\n",
  "    \"\"\"Create an onnxruntime InferenceSession for the model file at model_path.\"\"\"\n",
  "    return onnxruntime.InferenceSession(model_path)\n",
  "\n",
  "def validate_onnx(model, inputs):\n",
  "    \"\"\"Run inputs through the ONNX session and return its outputs as torch tensors.\n",
  "\n",
  "    model  -- session object exposing run(), as returned by init_onnx_model\n",
  "    inputs -- torch tensor; moved to CPU and fed under the input name 'input'\n",
  "    Returns a list with one torch.Tensor per session output.\n",
  "    \"\"\"\n",
  "    with torch.no_grad():\n",
  "        ort_inputs = {'input': inputs.cpu().numpy()}\n",
  "        # None requests every output of the session\n",
  "        outs = model.run(None, ort_inputs)\n",
  "        outs = [torch.Tensor(x) for x in outs]\n",
  "    return outs"
 ], "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "hidden": true, "id": "G8N8oP4q2Fw6" }, "source": [ "### Full Audio" ] },
{ "cell_type": "code", "metadata": { "hidden": true, "id": "WHXnh9IV2Fw6" }, "source": [
  "# NOTE(review): this loads number_detector.onnx for language detection -\n",
  "# confirm that this is the intended model file for get_language\n",
  "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n",
  "wav = read_audio(f'{files_dir}/en.wav')\n",
  "\n",
  "lang = get_language(wav, model, run_function=validate_onnx)\n",
  "print(lang)"
 ], "execution_count": null, "outputs": [] }
 ] }