Merge branch 'adamnsandle' of github.com:snakers4/silero-vad into adamnsandle

adamnsandle
2021-02-10 18:25:04 +00:00
2 changed files with 18 additions and 5 deletions

View File

@@ -346,6 +346,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks
 - `num_steps` - number of overlapping windows to split the audio chunk into (we recommend 4 or 8);
 - `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferred value (smaller values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
 - `min_speech_samples` - minimum speech chunk duration in samples;
+- `min_silence_samples` - minimum silence duration in samples between two separate speech chunks.
 Optimal parameters may vary per domain, but we provide a small tool to help find the best ones. You can invoke `get_speech_ts` with `visualize_probs=True` (`pandas` required), as sketched below:
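
For illustration, a minimal usage sketch (an editorial addition, not part of this commit); it assumes `model` and `wav` have already been loaded as described earlier in the README, and the models' 16 kHz sampling rate:

```python
# Hypothetical call -- the values are the recommendations from the list above,
# not defaults taken from this commit.
speech_timestamps = get_speech_ts(
    wav, model,                   # assumes wav/model are loaded per the README
    num_steps=4,                  # 4 or 8 overlapping windows per chunk
    num_samples_per_window=4000,  # 250 ms windows at 16 kHz (training value)
    min_speech_samples=10000,     # discard speech chunks shorter than ~0.62 s
    min_silence_samples=8000,     # require >= 0.5 s of silence to split chunks
    visualize_probs=True,         # plot smoothed probabilities (needs pandas)
)
```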

View File

@@ -60,6 +60,7 @@ def get_speech_ts(wav: torch.Tensor,
                   batch_size: int = 200,
                   num_samples_per_window: int = 4000,
                   min_speech_samples: int = 10000,  # samples
+                  min_silence_samples: int = 8000,  # samples
                   run_function=validate,
                   visualize_probs=False):
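
All of these thresholds are measured in samples, not seconds, so the new `min_silence_samples` default of `8000` corresponds to half a second of audio. A quick conversion sketch (assuming the models' 16 kHz sampling rate; the constant name is illustrative):

```python
SAMPLE_RATE = 16000                       # assumed model sampling rate
min_silence_samples = 8000
print(min_silence_samples / SAMPLE_RATE)  # 0.5 -> silences shorter than 0.5 s no longer split speech
```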
@@ -95,20 +96,31 @@ def get_speech_ts(wav: torch.Tensor,
     smoothed_probs = []
     speech_probs = outs[:, 1]  # per-window probability of the speech class
+    temp_end = 0
     for i, prob in enumerate(speech_probs):
         buffer.append(prob)
         smoothed_prob = (sum(buffer) / len(buffer))
         if visualize_probs:
             smoothed_probs.append(float(smoothed_prob))
+        if (smoothed_prob >= trig_sum) and temp_end:
+            temp_end = 0
         if (smoothed_prob >= trig_sum) and not triggered:
             triggered = True
             current_speech['start'] = step * max(0, i - num_steps)
             continue
         if (smoothed_prob < neg_trig_sum) and triggered:
-            current_speech['end'] = step * i
-            if (current_speech['end'] - current_speech['start']) > min_speech_samples:
-                speeches.append(current_speech)
-            current_speech = {}
-            triggered = False
+            if not temp_end:
+                temp_end = step * i
+            if step * i - temp_end < min_silence_samples:
+                continue
+            else:
+                current_speech['end'] = temp_end
+                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                    speeches.append(current_speech)
+                temp_end = 0
+                current_speech = {}
+                triggered = False
+                continue
     if current_speech:
         current_speech['end'] = len(wav)
         speeches.append(current_speech)
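
To make the new hysteresis logic easier to follow, here is a self-contained toy re-implementation (an editorial sketch with made-up names and toy values, not code from the repo). The essential change in this commit is in the last branch: instead of closing a speech chunk on the first window below `neg_trig_sum`, it waits until at least `min_silence_samples` of silence have accumulated, so brief pauses no longer split speech:

```python
from collections import deque

def toy_speech_ts(probs, step=1000, num_steps=4,
                  trig_sum=0.25, neg_trig_sum=0.07,
                  min_speech_samples=3000, min_silence_samples=2000):
    buffer = deque(maxlen=num_steps)      # sliding window for smoothing
    triggered, temp_end = False, 0
    current, speeches = {}, []
    for i, prob in enumerate(probs):
        buffer.append(prob)
        smoothed = sum(buffer) / len(buffer)
        if smoothed >= trig_sum and temp_end:
            temp_end = 0                  # speech resumed inside the silence gap
        if smoothed >= trig_sum and not triggered:
            triggered = True
            current['start'] = step * max(0, i - num_steps)
            continue
        if smoothed < neg_trig_sum and triggered:
            if not temp_end:
                temp_end = step * i       # remember where the silence began
            if step * i - temp_end < min_silence_samples:
                continue                  # silence still too short to split
            current['end'] = temp_end
            if current['end'] - current['start'] > min_speech_samples:
                speeches.append(current)
            temp_end, current, triggered = 0, {}, False
    if current:                           # audio ended mid-speech
        current['end'] = step * len(probs)
        speeches.append(current)
    return speeches

# A short dip in probability is smoothed away and does not split the chunk:
print(toy_speech_ts([0.9] * 6 + [0.0] + [0.9] * 4 + [0.0] * 5))
# -> [{'start': 0, 'end': 16000}]
```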