Merge pull request #128 from snakers4/adamnsandle

Adamnsandle
Authored by Alexander Veysov, committed by GitHub
2021-12-07 14:16:28 +03:00
2 changed files with 680 additions and 681 deletions

silero-vad.ipynb

@@ -1,42 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "silero-vad.ipynb",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.8.8"
-    },
-    "toc": {
-      "base_numbering": 1,
-      "nav_menu": {},
-      "number_sections": true,
-      "sideBar": true,
-      "skip_h1_title": false,
-      "title_cell": "Table of Contents",
-      "title_sidebar": "Contents",
-      "toc_cell": false,
-      "toc_position": {},
-      "toc_section_display": true,
-      "toc_window_display": false
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
@@ -68,15 +30,17 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "5w5AkskZ2Fwr"
       },
+      "outputs": [],
       "source": [
         "#@title Install and Import Dependencies\n",
         "\n",
         "# this assumes that you have a relevant version of PyTorch installed\n",
-        "!pip install -q torchaudio soundfile\n",
+        "!pip install -q torchaudio\n",
         "\n",
         "SAMPLE_RATE = 16000\n",
         "\n",
@@ -98,9 +62,7 @@
" collect_chunks) = utils\n", " collect_chunks) = utils\n",
"\n", "\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -122,31 +84,31 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "aI_eydBPjsrx"
       },
+      "outputs": [],
       "source": [
         "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n",
         "# get speech timestamps from full audio file\n",
         "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n",
         "pprint(speech_timestamps)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "OuEobLchjsry"
       },
+      "outputs": [],
       "source": [
         "# merge all speech chunks to one audio\n",
         "save_audio('only_speech.wav',\n",
         " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n",
         "Audio('only_speech.wav')"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -154,19 +116,21 @@
"id": "iDKQbVr8jsry" "id": "iDKQbVr8jsry"
}, },
"source": [ "source": [
"**Stream imitation example**" "### Stream imitation example"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "q-lql_2Wjsry" "id": "q-lql_2Wjsry"
}, },
"outputs": [],
"source": [ "source": [
"## using VADIterator class\n", "## using VADIterator class\n",
"\n", "\n",
"vad_iterator = VADiterator(double_model)\n", "vad_iterator = VADIterator(model)\n",
"wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n",
"\n", "\n",
"window_size_samples = 1536 # number of samples in a single audio chunk\n", "window_size_samples = 1536 # number of samples in a single audio chunk\n",
"for i in range(0, len(wav), window_size_samples):\n", "for i in range(0, len(wav), window_size_samples):\n",
@@ -174,19 +138,19 @@
" if speech_dict:\n", " if speech_dict:\n",
" print(speech_dict, end=' ')\n", " print(speech_dict, end=' ')\n",
"vad_iterator.reset_states() # reset model states after each audio" "vad_iterator.reset_states() # reset model states after each audio"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "BX3UgwwB2Fwv" "id": "BX3UgwwB2Fwv"
}, },
"outputs": [],
"source": [ "source": [
"## just probabilities\n", "## just probabilities\n",
"\n", "\n",
"wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n",
"speech_probs = []\n", "speech_probs = []\n",
"window_size_samples = 1536\n", "window_size_samples = 1536\n",
"for i in range(0, len(wav), window_size_samples):\n", "for i in range(0, len(wav), window_size_samples):\n",
@@ -194,9 +158,7 @@
" speech_probs.append(speech_prob)\n", " speech_probs.append(speech_prob)\n",
"\n", "\n",
"pprint(speech_probs[:100])" "pprint(speech_probs[:100])"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -221,10 +183,12 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "Kq5gQuYq2Fwx"
       },
+      "outputs": [],
       "source": [
         "#@title Install and Import Dependencies\n",
         "\n",
@@ -249,9 +213,7 @@
" drop_chunks) = utils\n", " drop_chunks) = utils\n",
"\n", "\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -266,64 +228,64 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "EXpau6xq2Fwy"
       },
+      "outputs": [],
       "source": [
         "wav = read_audio(f'{files_dir}/en_num.wav')\n",
         "# get number timestamps from full audio file\n",
         "number_timestamps = get_number_ts(wav, model)\n",
         "pprint(number_timestamps)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "u-KfXRhZ2Fwy"
       },
+      "outputs": [],
       "source": [
         "sample_rate = 16000\n",
         "# convert ms in timestamps to samples\n",
         "for timestamp in number_timestamps:\n",
         "    timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n",
         "    timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "iwYEC4aZ2Fwy"
       },
+      "outputs": [],
       "source": [
         "# merge all number chunks to one audio\n",
         "save_audio('only_numbers.wav',\n",
         " collect_chunks(number_timestamps, wav), sample_rate) \n",
         "Audio('only_numbers.wav')"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "fHaYejX12Fwy"
       },
+      "outputs": [],
       "source": [
         "# drop all number chunks from audio\n",
         "save_audio('no_numbers.wav',\n",
         " drop_chunks(number_timestamps, wav), sample_rate) \n",
         "Audio('no_numbers.wav')"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -348,10 +310,12 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "Zu9D0t6n2Fwz"
       },
+      "outputs": [],
       "source": [
         "#@title Install and Import Dependencies\n",
         "\n",
@@ -373,9 +337,7 @@
" read_audio) = utils\n", " read_audio) = utils\n",
"\n", "\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -390,17 +352,17 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "c8UYnYBF2Fw0"
       },
+      "outputs": [],
       "source": [
         "wav = read_audio(f'{files_dir}/en.wav')\n",
         "lang = get_language(wav, model)\n",
         "print(lang)"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
@@ -452,11 +414,13 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "cellView": "form",
         "hidden": true,
         "id": "PdjGd56R2Fw5"
       },
+      "outputs": [],
       "source": [
         "#@title Install and Import Dependencies\n",
         "\n",
@@ -491,9 +455,7 @@
" outs = model.run(None, ort_inputs)\n", " outs = model.run(None, ort_inputs)\n",
" outs = [torch.Tensor(x) for x in outs]\n", " outs = [torch.Tensor(x) for x in outs]\n",
" return outs" " return outs"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -508,10 +470,12 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "_r6QZiwu2Fw5"
       },
+      "outputs": [],
       "source": [
         "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n",
         "wav = read_audio(f'{files_dir}/en_num.wav')\n",
@@ -519,55 +483,53 @@
"# get number timestamps from full audio file\n", "# get number timestamps from full audio file\n",
"number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n", "number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n",
"pprint(number_timestamps)" "pprint(number_timestamps)"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"hidden": true, "hidden": true,
"id": "FN4aDwLV2Fw5" "id": "FN4aDwLV2Fw5"
}, },
"outputs": [],
"source": [ "source": [
"sample_rate = 16000\n", "sample_rate = 16000\n",
"# convert ms in timestamps to samples\n", "# convert ms in timestamps to samples\n",
"for timestamp in number_timestamps:\n", "for timestamp in number_timestamps:\n",
" timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n",
" timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"hidden": true, "hidden": true,
"id": "JnvS6WTK2Fw5" "id": "JnvS6WTK2Fw5"
}, },
"outputs": [],
"source": [ "source": [
"# merge all number chunks to one audio\n", "# merge all number chunks to one audio\n",
"save_audio('only_numbers.wav',\n", "save_audio('only_numbers.wav',\n",
" collect_chunks(number_timestamps, wav), 16000) \n", " collect_chunks(number_timestamps, wav), 16000) \n",
"Audio('only_numbers.wav')" "Audio('only_numbers.wav')"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"hidden": true, "hidden": true,
"id": "yUxOcOFG2Fw6" "id": "yUxOcOFG2Fw6"
}, },
"outputs": [],
"source": [ "source": [
"# drop all number chunks from audio\n", "# drop all number chunks from audio\n",
"save_audio('no_numbers.wav',\n", "save_audio('no_numbers.wav',\n",
" drop_chunks(number_timestamps, wav), 16000) \n", " drop_chunks(number_timestamps, wav), 16000) \n",
"Audio('no_numbers.wav')" "Audio('no_numbers.wav')"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -592,11 +554,13 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "cellView": "form",
         "hidden": true,
         "id": "iNkDWJ3H2Fw6"
       },
+      "outputs": [],
       "source": [
         "#@title Install and Import Dependencies\n",
         "\n",
@@ -628,9 +592,7 @@
" outs = model.run(None, ort_inputs)\n", " outs = model.run(None, ort_inputs)\n",
" outs = [torch.Tensor(x) for x in outs]\n", " outs = [torch.Tensor(x) for x in outs]\n",
" return outs" " return outs"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -644,19 +606,57 @@
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "hidden": true,
         "id": "WHXnh9IV2Fw6"
       },
+      "outputs": [],
       "source": [
         "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n",
         "wav = read_audio(f'{files_dir}/en.wav')\n",
         "\n",
         "lang = get_language(wav, model, run_function=validate_onnx)\n",
         "print(lang)"
-      ],
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "name": "silero-vad.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.8"
+    },
+    "toc": {
+      "base_numbering": 1,
+      "nav_menu": {},
+      "number_sections": true,
+      "sideBar": true,
+      "skip_h1_title": false,
+      "title_cell": "Table of Contents",
+      "title_sidebar": "Contents",
+      "toc_cell": false,
+      "toc_position": {},
+      "toc_section_display": true,
+      "toc_window_display": false
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

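For quick reference, here is the corrected usage pattern from the notebook cells above, assembled into one standalone script. This is a minimal sketch, not part of the diff: it assumes the snakers4/silero-vad torch.hub entry point and the utils tuple order shown in the notebook's setup cell, and it folds in the two fixes made here (the VADIterator(model) constructor and the unbalanced parenthesis in read_audio).

import torch
from pprint import pprint

SAMPLE_RATE = 16000

# Load the model and helper functions via torch.hub (assumed entry point,
# matching the notebook's setup cell).
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

# Full-audio speech timestamps, using the sampling_rate keyword.
wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)
pprint(speech_timestamps)

# Stream imitation with the corrected constructor: VADIterator(model).
vad_iterator = VADIterator(model)
window_size_samples = 1536  # number of samples in a single audio chunk
for i in range(0, len(wav), window_size_samples):
    speech_dict = vad_iterator(wav[i: i + window_size_samples], return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states()  # reset model states after each audio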
utils_vad.py

@@ -20,7 +20,6 @@ def validate(model,
 def read_audio(path: str,
                sampling_rate: int = 16000):
-    assert torchaudio.get_audio_backend() == 'soundfile'
     wav, sr = torchaudio.load(path)

     if wav.size(0) > 1:
@@ -63,7 +62,7 @@ def make_visualization(probs, step):
 def get_speech_timestamps(audio: torch.Tensor,
                           model,
                           threshold: float = 0.5,
-                          sample_rate: int = 16000,
+                          sampling_rate: int = 16000,
                           min_speech_duration_ms: int = 250,
                           min_silence_duration_ms: int = 100,
                           window_size_samples: int = 1536,
@@ -85,7 +84,7 @@ def get_speech_timestamps(audio: torch.Tensor,
         Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
         It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

-    sample_rate: int (default - 16000)
+    sampling_rate: int (default - 16000)
         Currently silero VAD models support 8000 and 16000 sample rates

     min_speech_duration_ms: int (default - 250 milliseconds)
@@ -126,15 +125,15 @@ def get_speech_timestamps(audio: torch.Tensor,
     if len(audio.shape) > 1:
         raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")

-    if sample_rate == 8000 and window_size_samples > 768:
-        warnings.warn('window_size_samples is too big for 8000 sample_rate! Better set window_size_samples to 256, 512 or 1536 for 8000 sample rate!')
+    if sampling_rate == 8000 and window_size_samples > 768:
+        warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 1536 for 8000 sample rate!')
     if window_size_samples not in [256, 512, 768, 1024, 1536]:
-        warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sample_rate\n - [256, 512, 768] for 8000 sample_rate')
+        warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')

     model.reset_states()
-    min_speech_samples = sample_rate * min_speech_duration_ms / 1000
-    min_silence_samples = sample_rate * min_silence_duration_ms / 1000
-    speech_pad_samples = sample_rate * speech_pad_ms / 1000
+    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
+    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    speech_pad_samples = sampling_rate * speech_pad_ms / 1000

     audio_length_samples = len(audio)
@@ -143,7 +142,7 @@ def get_speech_timestamps(audio: torch.Tensor,
         chunk = audio[current_start_sample: current_start_sample + window_size_samples]
         if len(chunk) < window_size_samples:
             chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
-        speech_prob = model(chunk, sample_rate).item()
+        speech_prob = model(chunk, sampling_rate).item()
         speech_probs.append(speech_prob)

     triggered = False
@@ -194,11 +193,11 @@ def get_speech_timestamps(audio: torch.Tensor,
     if return_seconds:
         for speech_dict in speeches:
-            speech_dict['start'] = round(speech_dict['start'] / sample_rate, 1)
-            speech_dict['end'] = round(speech_dict['end'] / sample_rate, 1)
+            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
+            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)

     if visualize_probs:
-        make_visualization(speech_probs, window_size_samples / sample_rate)
+        make_visualization(speech_probs, window_size_samples / sampling_rate)

     return speeches
@@ -276,7 +275,7 @@ class VADIterator:
     def __init__(self,
                  model,
                  threshold: float = 0.5,
-                 sample_rate: int = 16000,
+                 sampling_rate: int = 16000,
                  min_silence_duration_ms: int = 100,
                  speech_pad_ms: int = 30
                  ):
@@ -292,7 +291,7 @@ class VADIterator:
             Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
             It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

-        sample_rate: int (default - 16000)
+        sampling_rate: int (default - 16000)
             Currently silero VAD models support 8000 and 16000 sample rates

         min_silence_duration_ms: int (default - 100 milliseconds)
@@ -304,9 +303,9 @@ class VADIterator:
         self.model = model
         self.threshold = threshold
-        self.sample_rate = sample_rate
-        self.min_silence_samples = sample_rate * min_silence_duration_ms / 1000
-        self.speech_pad_samples = sample_rate * speech_pad_ms / 1000
+        self.sampling_rate = sampling_rate
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()

     def reset_states(self):
@@ -327,7 +326,7 @@ class VADIterator:
         window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
         self.current_sample += window_size_samples

-        speech_prob = self.model(x, self.sample_rate).item()
+        speech_prob = self.model(x, self.sampling_rate).item()

         if (speech_prob >= self.threshold) and self.temp_end:
             self.temp_end = 0
@@ -335,7 +334,7 @@ class VADIterator:
         if (speech_prob >= self.threshold) and not self.triggered:
             self.triggered = True
             speech_start = self.current_sample - self.speech_pad_samples
-            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sample_rate, 1)}
+            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

         if (speech_prob < self.threshold - 0.15) and self.triggered:
             if not self.temp_end:
@@ -346,7 +345,7 @@ class VADIterator:
                 speech_end = self.temp_end + self.speech_pad_samples
                 self.temp_end = 0
                 self.triggered = False
-                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sample_rate, 1)}
+                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}

         return None
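The practical effect of the rename in this file falls on keyword call sites: every sample_rate= keyword becomes sampling_rate=, while positional calls are unaffected. Below is a minimal sketch of the updated call sites; it assumes the model and helpers are obtained through the torch.hub entry point as in the notebook, and the audio path is a placeholder.

import torch

# Assumed way to obtain the model and helpers (matches the notebook's setup cell).
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

wav = read_audio('audio.wav', sampling_rate=8000)  # 'audio.wav' is a placeholder path

speech_timestamps = get_speech_timestamps(
    wav, model,
    threshold=0.5,               # tune per dataset; 0.5 is a reasonable default
    sampling_rate=8000,          # renamed keyword (was sample_rate)
    min_speech_duration_ms=250,
    min_silence_duration_ms=100,
    window_size_samples=512,     # 256, 512 or 768 are the sizes suggested for 8000 Hz
    return_seconds=True,
)
print(speech_timestamps)

# The same rename applies to the iterator's constructor.
vad_iterator = VADIterator(model, sampling_rate=8000)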