{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:30:32.615246Z",
"start_time": "2020-12-11T13:30:32.126553Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
" warnings.warn(\n",
"/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import torch\n",
"import numpy as np\n",
"import glob\n",
"import torch.nn.functional as F\n",
"import soundfile as sf\n",
"from IPython.display import Audio\n",
"torch.set_num_threads(1)\n",
"from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
"extractor = STFTExtractor()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Full audio example"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:32:01.978079Z",
"start_time": "2020-12-11T13:32:01.974912Z"
}
},
"outputs": [],
"source": [
"def collect_speeches(tss, wav):\n",
" speech_chunks = []\n",
" for i in tss:\n",
" speech_chunks.append(wav[i['start']: i['end']])\n",
" return np.concatenate(speech_chunks)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:55.255097Z",
"start_time": "2020-12-11T13:31:55.020705Z"
}
},
"outputs": [],
"source": [
"model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:32:10.391589Z",
"start_time": "2020-12-11T13:32:10.387109Z"
}
},
"outputs": [],
"source": [
"wav = read_audio('files/test_audio_6.wav')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:32:11.670091Z",
"start_time": "2020-12-11T13:32:10.814378Z"
}
},
"outputs": [],
"source": [
"speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:32:11.698816Z",
"start_time": "2020-12-11T13:32:11.671735Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
"Audio('only_speech.wav')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stream example"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:34.137062Z",
"start_time": "2020-12-11T13:31:33.957092Z"
}
},
"outputs": [],
"source": [
"model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:36.332200Z",
"start_time": "2020-12-11T13:31:36.328087Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audios_for_stream = glob.glob('files/test*.wav')\n",
"len(audios_for_stream)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-12-11T13:31:52.668041Z",
"start_time": "2020-12-11T13:31:37.357340Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done initial Loading\n",
"[({106500: 'start'}, 'files/test_audio_1.wav')]\n",
"[({174000: 'start'}, 'files/test_audio_3.wav')]\n",
"[({261000: 'end'}, 'files/test_audio_1.wav')]\n",
"Loading next wav: files/test_audio_7.wav\n",
"[({134000: 'start'}, 'files/test_audio_7.wav')]\n",
"[({147500: 'end'}, 'files/test_audio_7.wav')]\n",
"[({442000: 'end'}, 'files/test_audio_3.wav')]\n",
"[({450500: 'start'}, 'files/test_audio_3.wav')]\n",
"[({209500: 'start'}, 'files/test_audio_7.wav')]\n",
"[({519500: 'end'}, 'files/test_audio_3.wav')]\n",
"[({533500: 'start'}, 'files/test_audio_3.wav')]\n",
"[({599904: 'end'}, 'files/test_audio_3.wav')]\n",
"Loading next wav: files/test_audio_6.wav\n",
"[({183500: 'start'}, 'files/test_audio_6.wav')]\n",
"[({503500: 'end'}, 'files/test_audio_7.wav')]\n",
"[({202500: 'end'}, 'files/test_audio_6.wav')]\n",
"[({537500: 'start'}, 'files/test_audio_7.wav')]\n",
"[({226500: 'start'}, 'files/test_audio_6.wav')]\n",
"[({283500: 'end'}, 'files/test_audio_6.wav')]\n",
"[({616500: 'end'}, 'files/test_audio_7.wav')]\n",
"[({337500: 'start'}, 'files/test_audio_6.wav')]\n",
"[({661500: 'start'}, 'files/test_audio_7.wav')]\n",
"[({785000: 'end'}, 'files/test_audio_7.wav')]\n",
"[({503000: 'end'}, 'files/test_audio_6.wav')]\n",
"[({507500: 'start'}, 'files/test_audio_6.wav')]\n",
"[({851500: 'start'}, 'files/test_audio_7.wav')]\n",
"[({919000: 'end'}, 'files/test_audio_7.wav')]\n",
"Loading next wav: files/test_audio_5.wav\n",
"[({627500: 'end'}, 'files/test_audio_6.wav')]\n",
"[({631500: 'start'}, 'files/test_audio_6.wav')]\n",
"[({151000: 'start'}, 'files/test_audio_5.wav')]\n",
"[({169000: 'end'}, 'files/test_audio_5.wav')]\n",
"[({211000: 'start'}, 'files/test_audio_5.wav')]\n",
"[({221500: 'end'}, 'files/test_audio_5.wav')]\n",
"Loading next wav: files/test_audio_2.wav\n",
"[({927488: 'end'}, 'files/test_audio_6.wav')]\n",
"Loading next wav: files/test_audio_8.wav\n",
"[({228000: 'start'}, 'files/test_audio_2.wav')]\n",
"[({179500: 'start'}, 'files/test_audio_8.wav')]\n",
"[({241500: 'end'}, 'files/test_audio_2.wav')]\n",
"[({279000: 'start'}, 'files/test_audio_2.wav')]\n",
"[({274500: 'end'}, 'files/test_audio_8.wav')]\n",
"[({300500: 'start'}, 'files/test_audio_8.wav')]\n",
"[({369500: 'end'}, 'files/test_audio_2.wav')]\n",
"[({378500: 'start'}, 'files/test_audio_2.wav')]\n",
"[({436500: 'end'}, 'files/test_audio_2.wav')]\n",
"[({423000: 'end'}, 'files/test_audio_8.wav')]\n",
"[({488500: 'start'}, 'files/test_audio_2.wav')]\n",
"[({458500: 'start'}, 'files/test_audio_8.wav')]\n",
"[({599904: 'end'}, 'files/test_audio_2.wav')]\n",
"Loading next wav: files/test_audio_4.wav\n",
"[({583500: 'end'}, 'files/test_audio_8.wav')]\n",
"[({599500: 'start'}, 'files/test_audio_8.wav')]\n",
"[({632500: 'end'}, 'files/test_audio_8.wav')]\n",
"[({660000: 'start'}, 'files/test_audio_8.wav')]\n",
"[({737000: 'end'}, 'files/test_audio_8.wav')]\n",
"[({761000: 'start'}, 'files/test_audio_8.wav')]\n",
"[({249500: 'start'}, 'files/test_audio_4.wav')]\n",
"[({257168: 'end'}, 'files/test_audio_4.wav')]\n",
"Loading next wav: files/test_audio_9.wav\n",
"[({843000: 'end'}, 'files/test_audio_8.wav')]\n",
"Loading next wav: files/test_audio_0.wav\n",
"[({133000: 'start'}, 'files/test_audio_9.wav')]\n",
"[({143500: 'end'}, 'files/test_audio_9.wav')]\n",
"[({272000: 'start'}, 'files/test_audio_9.wav')]\n",
"[({256500: 'start'}, 'files/test_audio_0.wav')]\n",
"[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n",
"[({406500: 'start'}, 'files/test_audio_9.wav')]\n",
"[({460000: 'end'}, 'files/test_audio_9.wav')]\n",
"[({476000: 'start'}, 'files/test_audio_9.wav')]\n",
"[({494500: 'end'}, 'files/test_audio_9.wav')]\n",
"[({544500: 'start'}, 'files/test_audio_9.wav')]\n",
"[({564500: 'end'}, 'files/test_audio_9.wav')]\n",
"[({595000: 'start'}, 'files/test_audio_9.wav')]\n",
"[({682000: 'end'}, 'files/test_audio_9.wav')]\n",
"[({728500: 'start'}, 'files/test_audio_9.wav')]\n",
"[({786000: 'end'}, 'files/test_audio_9.wav')]\n",
"[({814000: 'start'}, 'files/test_audio_9.wav')]\n",
"[({826000: 'end'}, 'files/test_audio_9.wav')]\n"
]
}
],
"source": [
"for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
" if i:\n",
" print(i)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}