mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-04 17:39:22 +08:00
251 lines
5.4 KiB
Plaintext
251 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:14:25.443732Z",
|
|
"start_time": "2020-12-11T14:14:24.835612Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import glob\n",
|
|
"import torch\n",
|
|
"import numpy as np  # TODO: used only by collect_speeches; consider switching to torch-only ops\n",
|
|
"import soundfile as sf\n",
|
|
"# import torch.nn.functional as F\n",
|
|
"from IPython.display import Audio\n",
|
|
"torch.set_num_threads(1)\n",
|
|
"from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
|
|
"extractor = STFTExtractor()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
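|
"## Full audio example\n",
"\n",
"Run `get_speech_ts` over a whole audio file, then write only the detected speech segments to `only_speech.wav`."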
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:19:25.895033Z",
|
|
"start_time": "2020-12-11T14:19:25.891112Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def collect_speeches(tss, wav):\n",
"    \"\"\"Concatenate the speech segments of ``wav`` described by ``tss``.\n",
"\n",
"    Args:\n",
"        tss: iterable of dicts; each dict's 'start' and 'end' values are\n",
"            used as slice bounds into ``wav``.\n",
"        wav: sliceable audio sequence (presumably the output of\n",
"            ``read_audio`` -- confirm the exact type against utils).\n",
"\n",
"    Returns:\n",
"        The slices ``wav[start:end]`` joined in order with\n",
"        ``np.concatenate``, or an empty array when ``tss`` is empty.\n",
"    \"\"\"\n",
"    speech_chunks = [wav[ts['start']: ts['end']] for ts in tss]\n",
"    if not speech_chunks:\n",
"        # np.concatenate raises ValueError on an empty list, so return\n",
"        # zero samples instead of crashing when no speech was detected.\n",
"        return np.asarray(wav[:0])\n",
"    return np.concatenate(speech_chunks)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:19:41.758975Z",
|
|
"start_time": "2020-12-11T14:19:41.522818Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu') # from yml file"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:19:52.024425Z",
|
|
"start_time": "2020-12-11T14:19:51.978279Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"Audio('files/test_audio_6.wav')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:20:12.363579Z",
|
|
"start_time": "2020-12-11T14:20:12.346354Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"wav = read_audio('files/test_audio_6.wav')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:20:49.910862Z",
|
|
"start_time": "2020-12-11T14:20:49.906902Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"torch.__version__"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:20:42.130546Z",
|
|
"start_time": "2020-12-11T14:20:42.122245Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"torch.vstack"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:20:28.888271Z",
|
|
"start_time": "2020-12-11T14:20:28.787459Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"speech_timestamps = get_speech_ts(wav, model, extractor, num_steps=4)  # TODO: remove the need to pass extractor (confirm intent)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T13:32:11.698816Z",
|
|
"start_time": "2020-12-11T13:32:11.671735Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
|
|
"Audio('only_speech.wav')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
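|
"## Stream example\n",
"\n",
"Feed every `files/test*.wav` file through `state_generator` (with `audios_in_stream=2`) and print each non-empty result it yields."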
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T14:22:54.451814Z",
|
|
"start_time": "2020-12-11T14:22:54.211738Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T13:31:34.137062Z",
|
|
"start_time": "2020-12-11T13:31:33.957092Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T13:31:36.332200Z",
|
|
"start_time": "2020-12-11T13:31:36.328087Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"audios_for_stream = glob.glob('files/test*.wav')\n",
|
|
"len(audios_for_stream)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2020-12-11T13:31:52.668041Z",
|
|
"start_time": "2020-12-11T13:31:37.357340Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
|
|
" if i:\n",
|
|
" print(i)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.7"
|
|
},
|
|
"toc": {
|
|
"base_numbering": 1,
|
|
"nav_menu": {},
|
|
"number_sections": true,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "Table of Contents",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": false,
|
|
"toc_position": {},
|
|
"toc_section_display": true,
|
|
"toc_window_display": false
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|