silero-vad/silero-vad.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:14:25.443732Z",
     "start_time": "2020-12-11T14:14:24.835612Z"
    }
   },
   "outputs": [],
   "source": [
    "import glob\n",
    "import torch\n",
    "import numpy as np  # use only torch?\n",
    "import soundfile as sf\n",
    "# import torch.nn.functional as F\n",
    "from IPython.display import Audio\n",
    "torch.set_num_threads(1)\n",
    "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
    "extractor = STFTExtractor()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Full audio example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:19:25.895033Z",
     "start_time": "2020-12-11T14:19:25.891112Z"
    }
   },
   "outputs": [],
   "source": [
    "def collect_speeches(tss, wav):\n",
    "    speech_chunks = []\n",
    "    for i in tss:\n",
    "        speech_chunks.append(wav[i['start']: i['end']])\n",
    "    return np.concatenate(speech_chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:19:41.758975Z",
     "start_time": "2020-12-11T14:19:41.522818Z"
    }
   },
   "outputs": [],
   "source": [
    "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')  # from yml file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:19:52.024425Z",
     "start_time": "2020-12-11T14:19:51.978279Z"
    }
   },
   "outputs": [],
   "source": [
    "Audio('files/test_audio_6.wav')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:20:12.363579Z",
     "start_time": "2020-12-11T14:20:12.346354Z"
    }
   },
   "outputs": [],
   "source": [
    "wav = read_audio('files/test_audio_6.wav')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:20:49.910862Z",
     "start_time": "2020-12-11T14:20:49.906902Z"
    }
   },
   "outputs": [],
   "source": [
    "torch.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:20:42.130546Z",
     "start_time": "2020-12-11T14:20:42.122245Z"
    }
   },
   "outputs": [],
   "source": [
    "torch.vstack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:20:28.888271Z",
     "start_time": "2020-12-11T14:20:28.787459Z"
    }
   },
   "outputs": [],
   "source": [
    "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)  # kill extractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T13:32:11.698816Z",
     "start_time": "2020-12-11T13:32:11.671735Z"
    }
   },
   "outputs": [],
   "source": [
    "sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
    "Audio('only_speech.wav')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stream example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T14:22:54.451814Z",
     "start_time": "2020-12-11T14:22:54.211738Z"
    }
   },
   "outputs": [],
   "source": [
    "!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T13:31:34.137062Z",
     "start_time": "2020-12-11T13:31:33.957092Z"
    }
   },
   "outputs": [],
   "source": [
    "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T13:31:36.332200Z",
     "start_time": "2020-12-11T13:31:36.328087Z"
    }
   },
   "outputs": [],
   "source": [
    "audios_for_stream = glob.glob('files/test*.wav')\n",
    "len(audios_for_stream)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-11T13:31:52.668041Z",
     "start_time": "2020-12-11T13:31:37.357340Z"
    }
   },
   "outputs": [],
   "source": [
    "for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
    "    if i:\n",
    "        print(i)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}