diff --git a/silero-vad.ipynb b/silero-vad.ipynb
index ec9f8b1..aefc8b5 100644
--- a/silero-vad.ipynb
+++ b/silero-vad.ipynb
@@ -2,14 +2,25 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:14:25.443732Z",
- "start_time": "2020-12-11T14:14:24.835612Z"
+ "end_time": "2020-12-11T15:10:52.128138Z",
+ "start_time": "2020-12-11T15:10:51.548322Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:53: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
+ " warnings.warn(\n",
+ "/opt/conda/lib/python3.8/site-packages/torchaudio/backend/utils.py:63: UserWarning: The interface of \"soundfile\" backend is planned to change in 0.8.0 to match that of \"sox_io\" backend and the current interface will be removed in 0.9.0. To use the new interface, do `torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False` before setting the backend to \"soundfile\". Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
"source": [
"import glob\n",
"import torch\n",
@@ -18,7 +29,7 @@
"# import torch.nn.functional as F\n",
"from IPython.display import Audio\n",
"torch.set_num_threads(1)\n",
- "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator\n",
+ "from utils import init_jit_model, STFTExtractor, get_speech_ts, read_audio, state_generator, single_audio_stream\n",
"extractor = STFTExtractor()"
]
},
@@ -26,16 +37,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Full audio example"
+ "# Full audio example"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:19:25.895033Z",
- "start_time": "2020-12-11T14:19:25.891112Z"
+ "end_time": "2020-12-11T14:25:05.274301Z",
+ "start_time": "2020-12-11T14:25:05.271313Z"
}
},
"outputs": [],
@@ -49,11 +60,11 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:19:41.758975Z",
- "start_time": "2020-12-11T14:19:41.522818Z"
+ "end_time": "2020-12-11T14:25:06.395183Z",
+ "start_time": "2020-12-11T14:25:06.082595Z"
}
},
"outputs": [],
@@ -63,67 +74,58 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:19:52.024425Z",
- "start_time": "2020-12-11T14:19:51.978279Z"
+ "end_time": "2020-12-11T14:25:25.523423Z",
+ "start_time": "2020-12-11T14:25:25.493581Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "Audio('files/test_audio_6.wav')"
+ "Audio('files/test_audio_8.wav')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:20:12.363579Z",
- "start_time": "2020-12-11T14:20:12.346354Z"
+ "end_time": "2020-12-11T14:25:43.023784Z",
+ "start_time": "2020-12-11T14:25:43.017360Z"
}
},
"outputs": [],
"source": [
- "wav = read_audio('files/test_audio_6.wav')"
+ "wav = read_audio('files/test_audio_8.wav')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:20:49.910862Z",
- "start_time": "2020-12-11T14:20:49.906902Z"
- }
- },
- "outputs": [],
- "source": [
- "torch.__version__"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-11T14:20:42.130546Z",
- "start_time": "2020-12-11T14:20:42.122245Z"
- }
- },
- "outputs": [],
- "source": [
- "torch.vstack"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-11T14:20:28.888271Z",
- "start_time": "2020-12-11T14:20:28.787459Z"
+ "end_time": "2020-12-11T14:25:45.083872Z",
+ "start_time": "2020-12-11T14:25:43.371366Z"
}
},
"outputs": [],
@@ -133,47 +135,145 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:32:11.698816Z",
- "start_time": "2020-12-11T13:32:11.671735Z"
+ "end_time": "2020-12-11T14:25:45.130371Z",
+ "start_time": "2020-12-11T14:25:45.091010Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"sf.write('only_speech.wav', collect_speeches(speech_timestamps, wav), 16000)\n",
"Audio('only_speech.wav')"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
- "source": [
- "## Stream example"
- ]
+ "outputs": [],
+ "source": []
},
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Single stream example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T14:22:54.451814Z",
- "start_time": "2020-12-11T14:22:54.211738Z"
+ "end_time": "2020-12-11T15:10:55.789272Z",
+ "start_time": "2020-12-11T15:10:55.543652Z"
}
},
"outputs": [],
"source": [
- "!ls -laht files/joint_VAD_just_RU_jit_cut_q.pth.tar"
+ "model = init_jit_model('files/joint_VAD_just_RU_jit_cut_q.pth.tar', 'cpu')\n",
+ "audio = 'files/test_audio_6.wav'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-11T15:10:59.503301Z",
+ "start_time": "2020-12-11T15:10:55.790671Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.8/site-packages/torch/functional.py:515: UserWarning: stft will require the return_complex parameter be explicitly specified in a future PyTorch release. Use return_complex=False to preserve the current behavior or return_complex=True to return a complex output. (Triggered internally at /opt/conda/conda-bld/pytorch_1603729096996/work/aten/src/ATen/native/SpectralOps.cpp:653.)\n",
+ " return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore\n",
+ "/opt/conda/lib/python3.8/site-packages/torch/functional.py:515: UserWarning: The function torch.rfft is deprecated and will be removed in a future PyTorch release. Use the new torch.fft module functions, instead, by importing torch.fft and calling torch.fft.fft or torch.fft.rfft. (Triggered internally at /opt/conda/conda-bld/pytorch_1603729096996/work/aten/src/ATen/native/SpectralOps.cpp:590.)\n",
+ " return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{183500: 'start'}]\n",
+ "[{202500: 'end'}]\n",
+ "[{226500: 'start'}]\n",
+ "[{283500: 'end'}]\n",
+ "[{337500: 'start'}]\n",
+ "[{503000: 'end'}]\n",
+ "[{507500: 'start'}]\n",
+ "[{627500: 'end'}]\n",
+ "[{631500: 'start'}]\n",
+ "[{927488: 'end'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "for i in single_audio_stream(model, audio, extractor):\n",
+ " if i:\n",
+ " print(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Multiple stream example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:31:34.137062Z",
- "start_time": "2020-12-11T13:31:33.957092Z"
+ "end_time": "2020-12-11T14:28:09.649303Z",
+ "start_time": "2020-12-11T14:28:09.373634Z"
}
},
"outputs": [],
@@ -183,14 +283,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:31:36.332200Z",
- "start_time": "2020-12-11T13:31:36.328087Z"
+ "end_time": "2020-12-11T14:28:12.273951Z",
+ "start_time": "2020-12-11T14:28:12.269729Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"audios_for_stream = glob.glob('files/test*.wav')\n",
"len(audios_for_stream)"
@@ -198,19 +309,113 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"metadata": {
"ExecuteTime": {
- "end_time": "2020-12-11T13:31:52.668041Z",
- "start_time": "2020-12-11T13:31:37.357340Z"
+ "end_time": "2020-12-11T14:28:32.459872Z",
+ "start_time": "2020-12-11T14:28:14.502871Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done initial Loading\n",
+ "[({106500: 'start'}, 'files/test_audio_1.wav')]\n",
+ "[({174000: 'start'}, 'files/test_audio_3.wav')]\n",
+ "[({261000: 'end'}, 'files/test_audio_1.wav')]\n",
+ "Loading next wav: files/test_audio_7.wav\n",
+ "[({134000: 'start'}, 'files/test_audio_7.wav')]\n",
+ "[({147500: 'end'}, 'files/test_audio_7.wav')]\n",
+ "[({442000: 'end'}, 'files/test_audio_3.wav')]\n",
+ "[({450500: 'start'}, 'files/test_audio_3.wav')]\n",
+ "[({209500: 'start'}, 'files/test_audio_7.wav')]\n",
+ "[({519500: 'end'}, 'files/test_audio_3.wav')]\n",
+ "[({533500: 'start'}, 'files/test_audio_3.wav')]\n",
+ "[({599904: 'end'}, 'files/test_audio_3.wav')]\n",
+ "Loading next wav: files/test_audio_6.wav\n",
+ "[({183500: 'start'}, 'files/test_audio_6.wav')]\n",
+ "[({503500: 'end'}, 'files/test_audio_7.wav')]\n",
+ "[({202500: 'end'}, 'files/test_audio_6.wav')]\n",
+ "[({537500: 'start'}, 'files/test_audio_7.wav')]\n",
+ "[({226500: 'start'}, 'files/test_audio_6.wav')]\n",
+ "[({283500: 'end'}, 'files/test_audio_6.wav')]\n",
+ "[({616500: 'end'}, 'files/test_audio_7.wav')]\n",
+ "[({337500: 'start'}, 'files/test_audio_6.wav')]\n",
+ "[({661500: 'start'}, 'files/test_audio_7.wav')]\n",
+ "[({785000: 'end'}, 'files/test_audio_7.wav')]\n",
+ "[({503000: 'end'}, 'files/test_audio_6.wav')]\n",
+ "[({507500: 'start'}, 'files/test_audio_6.wav')]\n",
+ "[({851500: 'start'}, 'files/test_audio_7.wav')]\n",
+ "[({919000: 'end'}, 'files/test_audio_7.wav')]\n",
+ "Loading next wav: files/test_audio_5.wav\n",
+ "[({627500: 'end'}, 'files/test_audio_6.wav')]\n",
+ "[({631500: 'start'}, 'files/test_audio_6.wav')]\n",
+ "[({151000: 'start'}, 'files/test_audio_5.wav')]\n",
+ "[({169000: 'end'}, 'files/test_audio_5.wav')]\n",
+ "[({211000: 'start'}, 'files/test_audio_5.wav')]\n",
+ "[({221500: 'end'}, 'files/test_audio_5.wav')]\n",
+ "Loading next wav: files/test_audio_2.wav\n",
+ "[({927488: 'end'}, 'files/test_audio_6.wav')]\n",
+ "Loading next wav: files/test_audio_8.wav\n",
+ "[({228000: 'start'}, 'files/test_audio_2.wav')]\n",
+ "[({179500: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({241500: 'end'}, 'files/test_audio_2.wav')]\n",
+ "[({279000: 'start'}, 'files/test_audio_2.wav')]\n",
+ "[({274500: 'end'}, 'files/test_audio_8.wav')]\n",
+ "[({300500: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({369500: 'end'}, 'files/test_audio_2.wav')]\n",
+ "[({378500: 'start'}, 'files/test_audio_2.wav')]\n",
+ "[({436500: 'end'}, 'files/test_audio_2.wav')]\n",
+ "[({423000: 'end'}, 'files/test_audio_8.wav')]\n",
+ "[({488500: 'start'}, 'files/test_audio_2.wav')]\n",
+ "[({458500: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({599904: 'end'}, 'files/test_audio_2.wav')]\n",
+ "Loading next wav: files/test_audio_4.wav\n",
+ "[({583500: 'end'}, 'files/test_audio_8.wav')]\n",
+ "[({599500: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({632500: 'end'}, 'files/test_audio_8.wav')]\n",
+ "[({660000: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({737000: 'end'}, 'files/test_audio_8.wav')]\n",
+ "[({761000: 'start'}, 'files/test_audio_8.wav')]\n",
+ "[({249500: 'start'}, 'files/test_audio_4.wav')]\n",
+ "[({257168: 'end'}, 'files/test_audio_4.wav')]\n",
+ "Loading next wav: files/test_audio_9.wav\n",
+ "[({843000: 'end'}, 'files/test_audio_8.wav')]\n",
+ "Loading next wav: files/test_audio_0.wav\n",
+ "[({133000: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({143500: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({272000: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({256500: 'start'}, 'files/test_audio_0.wav')]\n",
+ "[({336500: 'end'}, 'files/test_audio_9.wav'), ({281232: 'end'}, 'files/test_audio_0.wav')]\n",
+ "[({406500: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({460000: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({476000: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({494500: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({544500: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({564500: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({595000: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({682000: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({728500: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({786000: 'end'}, 'files/test_audio_9.wav')]\n",
+ "[({814000: 'start'}, 'files/test_audio_9.wav')]\n",
+ "[({826000: 'end'}, 'files/test_audio_9.wav')]\n"
+ ]
+ }
+ ],
"source": [
"for i in state_generator(model, audios_for_stream, extractor, audios_in_stream=2):\n",
" if i:\n",
" print(i)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -229,7 +434,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.7"
+ "version": "3.8.3"
},
"toc": {
"base_numbering": 1,
diff --git a/utils.py b/utils.py
index 30ee9a5..a4e05ed 100644
--- a/utils.py
+++ b/utils.py
@@ -77,15 +77,16 @@ def get_speech_ts(wav, model, extractor,
trig_sum=0.25, neg_trig_sum=0.01,
num_steps=8, batch_size=200):
- assert 4000 % num_steps == 0
- step = int(4000 / num_steps) # stride / hop
+ num_samples = 4000
+ assert num_samples % num_steps == 0
+ step = num_samples // num_steps # stride / hop
outs = []
to_concat = []
for i in range(0, len(wav), step):
- chunk = wav[i: i+4000]
- if len(chunk) < 4000:
- chunk = F.pad(chunk, (0, 4000 - len(chunk)))
+ chunk = wav[i: i+num_samples]
+ if len(chunk) < num_samples:
+ chunk = F.pad(chunk, (0, num_samples - len(chunk)))
to_concat.append(chunk)
if len(to_concat) >= batch_size:
chunks = torch.Tensor(torch.vstack(to_concat))
@@ -107,7 +108,8 @@ def get_speech_ts(wav, model, extractor,
speeches = []
current_speech = {}
- for i, predict in enumerate(outs[:, 1]): # add name
+ speech_probs = outs[:, 1]
+ for i, predict in enumerate(speech_probs): # TODO: add name
buffer.append(predict)
if (np.mean(buffer) >= trig_sum) and not triggered:
triggered = True
@@ -158,44 +160,46 @@ class VADiterator:
def __init__(self,
trig_sum=0.26, neg_trig_sum=0.01,
num_steps=8):
+ self.num_samples = 4000
self.num_steps = num_steps
- assert 4000 % num_steps == 0
- self.step = int(4000 / num_steps)
- self.prev = torch.zeros(4000)
+ assert self.num_samples % num_steps == 0
+ self.step = self.num_samples // num_steps
+ self.prev = torch.zeros(self.num_samples)
self.last = False
self.triggered = False
- self.buffer = deque(maxlen=8)
+ self.buffer = deque(maxlen=num_steps)
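+ # rolling window over the last num_steps speech probabilities;
+ # its mean is compared against trig_sum / neg_trig_sum in state()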
self.num_frames = 0
self.trig_sum = trig_sum
self.neg_trig_sum = neg_trig_sum
self.current_name = ''
def refresh(self):
- self.prev = torch.zeros(4000)
+ self.prev = torch.zeros(self.num_samples)
self.last = False
self.triggered = False
- self.buffer = deque(maxlen=8)
+ self.buffer = deque(maxlen=self.num_steps)
self.num_frames = 0
def prepare_batch(self, wav_chunk, name=None):
if (name is not None) and (name != self.current_name):
self.refresh()
self.current_name = name
- assert len(wav_chunk) <= 4000
+ assert len(wav_chunk) <= self.num_samples
self.num_frames += len(wav_chunk)
- if len(wav_chunk) < 4000:
- wav_chunk = F.pad(wav_chunk, (0, 4000 - len(wav_chunk))) # assume that short chunk means end of the audio
+ if len(wav_chunk) < self.num_samples:
+ wav_chunk = F.pad(wav_chunk, (0, self.num_samples - len(wav_chunk))) # assume a short chunk signals the end of the audio
self.last = True
stacked = torch.hstack([self.prev, wav_chunk])
self.prev = wav_chunk
- overlap_chunks = [stacked[i:i+4000] for i in range(self.step, 4001, self.step)] # 500 step is good enough
+ overlap_chunks = [stacked[i:i+self.num_samples] for i in range(self.step, self.num_samples+1, self.step)] # num_steps overlapping windows with a hop of self.step (500 samples at the defaults)
return torch.vstack(overlap_chunks)
def state(self, model_out):
current_speech = {}
- for i, predict in enumerate(model_out[:, 1]): # add name
+ speech_probs = model_out[:, 1]
+ for i, predict in enumerate(speech_probs): # TODO: add name
self.buffer.append(predict)
if (np.mean(self.buffer) >= self.trig_sum) and not self.triggered:
self.triggered = True
@@ -236,14 +240,15 @@ def state_generator(model, audios, extractor,
yield states
-def stream_imitator(stereo, audios_in_stream):
- stereo_iter = iter(stereo)
+def stream_imitator(audios, audios_in_stream):
+ audio_iter = iter(audios)
iterators = []
+ num_samples = 4000
# initial wavs
for i in range(audios_in_stream):
- next_wav = next(stereo_iter)
+ next_wav = next(audio_iter)
wav = read_audio(next_wav)
- wav_chunks = iter([(wav[i:i+4000], next_wav) for i in range(0, len(wav), 4000)])
+ wav_chunks = iter([(wav[i:i+num_samples], next_wav) for i in range(0, len(wav), num_samples)])
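+ # each file is pre-split into non-overlapping num_samples chunks,
+ # every chunk tagged with its source filename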
iterators.append(wav_chunks)
print('Done initial Loading')
good_iters = audios_in_stream
@@ -254,16 +259,40 @@ def stream_imitator(stereo, audios_in_stream):
out, wav_name = next(it)
except StopIteration:
try:
- next_wav = next(stereo_iter)
+ next_wav = next(audio_iter)
print('Loading next wav: ', next_wav)
wav = read_audio(next_wav)
- iterators[i] = iter([(wav[i:i+4000], next_wav) for i in range(0, len(wav), 4000)])
+ iterators[i] = iter([(wav[i:i+num_samples], next_wav) for i in range(0, len(wav), num_samples)])
out, wav_name = next(iterators[i])
except StopIteration:
good_iters -= 1
- iterators[i] = repeat((torch.zeros(4000), 'junk'))
+ iterators[i] = repeat((torch.zeros(num_samples), 'junk'))
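+ # pad exhausted streams with silent 'junk' chunks so the batch
+ # keeps a constant shape until every remaining stream finishes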
out, wav_name = next(iterators[i])
if good_iters == 0:
return
values.append((out, wav_name))
yield values
+
+def single_audio_stream(model, audio, extractor, onnx=False, trig_sum=0.26,
+ neg_trig_sum=0.01, num_steps=8):
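+ """Emulate streaming over a single file: read the audio, feed the
+ model num_samples-sized chunks and yield speech start/end events."""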
+ num_samples = 4000
+ VADiter = VADiterator(trig_sum, neg_trig_sum, num_steps)
+ wav = read_audio(audio)
+ wav_chunks = iter([wav[i:i+num_samples] for i in range(0, len(wav), num_samples)])
+ for chunk in wav_chunks:
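+ # stack the previous chunk with the current one and slice num_steps
+ # overlapping windows (short tails are zero-padded inside prepare_batch)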
+ batch = VADiter.prepare_batch(chunk)
+
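+ # run inference without tracking gradients; both the TorchScript and
+ # ONNX paths expose the VAD head as the second-to-last model output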
+ with torch.no_grad():
+ if onnx:
+ ort_inputs = {'input': to_numpy(extractor(batch))}
+ ort_outs = model.run(None, ort_inputs)
+ vad_outs = ort_outs[-2]
+ else:
+ outs = model(extractor(batch))
+ vad_outs = outs[-2]
+
+ states = []
+ state = VADiter.state(vad_outs)
+ if state[0]:
+ states.append(state[0])
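+ # an empty list is yielded when no event fired on this chunk;
+ # callers filter falsy results (the notebook uses `if i: print(i)`)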
+ yield states