{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:30:32.615246Z", "start_time": "2020-12-11T13:30:32.126553Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", " warnings.warn(\n", "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", " warnings.warn(\n" ] } ], "source": [ "import torch\n", "import numpy as np\n", "import glob\n", "import torch.nn.functional as F\n", "import soundfile as sf\n", "from IPython.display import Audio\n", "torch.set_num_threads(1)\n", "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n", "extractor = STFTExtractor()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Full audio example" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:32:01.978079Z", "start_time": "2020-12-11T13:32:01.974912Z" } }, "outputs": [], "source": [ "def collect_speeches(tss, wav):\n", " speech_chunks = []\n", " for i in tss:\n", " speech_chunks.append(wav[i['start']: i['end']])\n", " return np.concatenate(speech_chunks)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:55.255097Z", "start_time": "2020-12-11T13:31:55.020705Z" } }, "outputs": [], "source": [ "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:32:10.391589Z", "start_time": "2020-12-11T13:32:10.387109Z" } }, "outputs": [], "source": [ "wav = read_audio('files/test_audio_6.wav')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:32:11.670091Z", "start_time": "2020-12-11T13:32:10.814378Z" } }, "outputs": [], "source": [ "speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:32:11.698816Z", "start_time": "2020-12-11T13:32:11.671735Z" } }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n", "Audio('only_speech.wav')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stream example" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:34.137062Z", "start_time": "2020-12-11T13:31:33.957092Z" } }, "outputs": [], "source": [ "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:36.332200Z", "start_time": "2020-12-11T13:31:36.328087Z" } }, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audios_for_stream = glob.glob('files/test*.wav')\n", "len(audios_for_stream)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-12-11T13:31:52.668041Z", "start_time": "2020-12-11T13:31:37.357340Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done initial Loading\n", "[({106500: 'start'}, 'files/test_audio_1.wav')]\n", "[({174000: 'start'}, 'files/test_audio_3.wav')]\n", "[({261000: 'end'}, 'files/test_audio_1.wav')]\n", "Loading next wav: files/test_audio_7.wav\n", "[({134000: 'start'}, 'files/test_audio_7.wav')]\n", "[({147500: 'end'}, 'files/test_audio_7.wav')]\n", "[({442000: 'end'}, 'files/test_audio_3.wav')]\n", "[({450500: 'start'}, 'files/test_audio_3.wav')]\n", "[({209500: 'start'}, 'files/test_audio_7.wav')]\n", "[({519500: 'end'}, 'files/test_audio_3.wav')]\n", "[({533500: 'start'}, 'files/test_audio_3.wav')]\n", "[({599904: 'end'}, 'files/test_audio_3.wav')]\n", "Loading next wav: files/test_audio_6.wav\n", "[({183500: 'start'}, 'files/test_audio_6.wav')]\n", "[({503500: 'end'}, 'files/test_audio_7.wav')]\n", "[({202500: 'end'}, 'files/test_audio_6.wav')]\n", "[({537500: 'start'}, 'files/test_audio_7.wav')]\n", "[({226500: 'start'}, 'files/test_audio_6.wav')]\n", "[({283500: 'end'}, 'files/test_audio_6.wav')]\n", "[({616500: 'end'}, 'files/test_audio_7.wav')]\n", "[({337500: 'start'}, 'files/test_audio_6.wav')]\n", "[({661500: 'start'}, 'files/test_audio_7.wav')]\n", "[({785000: 'end'}, 'files/test_audio_7.wav')]\n", "[({503000: 'end'}, 'files/test_audio_6.wav')]\n", "[({507500: 'start'}, 'files/test_audio_6.wav')]\n", "[({851500: 'start'}, 'files/test_audio_7.wav')]\n", "[({919000: 'end'}, 'files/test_audio_7.wav')]\n", "Loading next wav: files/test_audio_5.wav\n", "[({627500: 'end'}, 'files/test_audio_6.wav')]\n", "[({631500: 'start'}, 'files/test_audio_6.wav')]\n", "[({151000: 'start'}, 'files/test_audio_5.wav')]\n", "[({169000: 'end'}, 'files/test_audio_5.wav')]\n", "[({211000: 'start'}, 'files/test_audio_5.wav')]\n", "[({221500: 'end'}, 'files/test_audio_5.wav')]\n", "Loading next wav: files/test_audio_2.wav\n", "[({927488: 'end'}, 'files/test_audio_6.wav')]\n", "Loading next wav: files/test_audio_8.wav\n", "[({228000: 'start'}, 'files/test_audio_2.wav')]\n", "[({179500: 'start'}, 'files/test_audio_8.wav')]\n", "[({241500: 'end'}, 'files/test_audio_2.wav')]\n", "[({279000: 'start'}, 'files/test_audio_2.wav')]\n", "[({274500: 'end'}, 'files/test_audio_8.wav')]\n", "[({300500: 'start'}, 'files/test_audio_8.wav')]\n", "[({369500: 'end'}, 'files/test_audio_2.wav')]\n", "[({378500: 'start'}, 'files/test_audio_2.wav')]\n", "[({436500: 'end'}, 'files/test_audio_2.wav')]\n", "[({423000: 'end'}, 'files/test_audio_8.wav')]\n", "[({488500: 'start'}, 'files/test_audio_2.wav')]\n", "[({458500: 'start'}, 'files/test_audio_8.wav')]\n", "[({599904: 'end'}, 'files/test_audio_2.wav')]\n", "Loading next wav: files/test_audio_4.wav\n", "[({583500: 'end'}, 'files/test_audio_8.wav')]\n", "[({599500: 'start'}, 'files/test_audio_8.wav')]\n", "[({632500: 'end'}, 'files/test_audio_8.wav')]\n", "[({660000: 'start'}, 'files/test_audio_8.wav')]\n", "[({737000: 'end'}, 'files/test_audio_8.wav')]\n", "[({761000: 'start'}, 'files/test_audio_8.wav')]\n", "[({249500: 'start'}, 'files/test_audio_4.wav')]\n", "[({257168: 'end'}, 'files/test_audio_4.wav')]\n", "Loading next wav: files/test_audio_9.wav\n", "[({843000: 'end'}, 'files/test_audio_8.wav')]\n", "Loading next wav: files/test_audio_0.wav\n", "[({133000: 'start'}, 'files/test_audio_9.wav')]\n", "[({143500: 'end'}, 'files/test_audio_9.wav')]\n", "[({272000: 'start'}, 'files/test_audio_9.wav')]\n", "[({256500: 'start'}, 'files/test_audio_0.wav')]\n", "[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n", "[({406500: 'start'}, 'files/test_audio_9.wav')]\n", "[({460000: 'end'}, 'files/test_audio_9.wav')]\n", "[({476000: 'start'}, 'files/test_audio_9.wav')]\n", "[({494500: 'end'}, 'files/test_audio_9.wav')]\n", "[({544500: 'start'}, 'files/test_audio_9.wav')]\n", "[({564500: 'end'}, 'files/test_audio_9.wav')]\n", "[({595000: 'start'}, 'files/test_audio_9.wav')]\n", "[({682000: 'end'}, 'files/test_audio_9.wav')]\n", "[({728500: 'start'}, 'files/test_audio_9.wav')]\n", "[({786000: 'end'}, 'files/test_audio_9.wav')]\n", "[({814000: 'start'}, 'files/test_audio_9.wav')]\n", "[({826000: 'end'}, 'files/test_audio_9.wav')]\n" ] } ], "source": [ "for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n", " if i:\n", " print(i)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }