mirror of https://github.com/snakers4/silero-vad.git (synced 2026-02-04 17:39:22 +08:00)
utils_vad max duration
utils_vad.py (42 changed lines)
@@ -121,6 +121,7 @@ def get_speech_timestamps(audio: torch.Tensor,
                           threshold: float = 0.5,
                           sampling_rate: int = 16000,
                           min_speech_duration_ms: int = 250,
+                          max_speech_duration_s: float = float('inf'),
                           min_silence_duration_ms: int = 100,
                           window_size_samples: int = 1536,
                           speech_pad_ms: int = 30,
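For context, a minimal usage sketch of the extended signature, assuming the standard silero-vad torch.hub entry points; the input file name and the 15-second cap are illustrative, not part of the commit:

import torch

# load the model and the bundled utilities (standard silero-vad hub entry point)
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
(get_speech_timestamps, _, read_audio, _, _) = utils

wav = read_audio('example.wav', sampling_rate=16000)  # hypothetical input file
timestamps = get_speech_timestamps(
    wav, model,
    sampling_rate=16000,
    max_speech_duration_s=15.0,  # new parameter: cap speech chunks at ~15 s
)
print(timestamps)  # [{'start': ..., 'end': ...}, ...] in samples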
@@ -147,6 +148,11 @@ def get_speech_timestamps(audio: torch.Tensor,
     min_speech_duration_ms: int (default - 250 milliseconds)
         Final speech chunks shorter than min_speech_duration_ms are thrown out
 
+    max_speech_duration_s: float (default - inf)
+        Maximum duration of speech chunks in seconds
+        Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
+
     min_silence_duration_ms: int (default - 100 milliseconds)
         At the end of each speech chunk, wait for min_silence_duration_ms before separating it
@@ -197,8 +203,10 @@ def get_speech_timestamps(audio: torch.Tensor,
 
     model.reset_states()
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+    max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
+    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
 
     audio_length_samples = len(audio)
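To make the derived sample counts concrete, here is the same arithmetic evaluated at 16 kHz with an assumed 15-second cap (illustrative values, not part of the commit):

sampling_rate = 16000
window_size_samples = 1536
speech_pad_ms = 30
max_speech_duration_s = 15.0  # assumed cap for this example

speech_pad_samples = sampling_rate * speech_pad_ms / 1000              # 480.0
max_speech_samples = (sampling_rate * max_speech_duration_s
                      - window_size_samples - 2 * speech_pad_samples)  # 237504.0
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000          # 1568.0, ~98 ms

# 1568 samples is just over one 1536-sample window, so a silence must span
# more than one window before it is recorded as a candidate split point.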
@@ -214,28 +222,56 @@ def get_speech_timestamps(audio: torch.Tensor,
     speeches = []
     current_speech = {}
     neg_threshold = threshold - 0.15
-    temp_end = 0
+    temp_end = 0  # to save potential segment end (and tolerate some silence)
+    prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
 
     for i, speech_prob in enumerate(speech_probs):
         if (speech_prob >= threshold) and temp_end:
             temp_end = 0
+            if next_start < prev_end:
+                next_start = window_size_samples * i
 
         if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech['start'] = window_size_samples * i
             continue
 
+        if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+            if prev_end:
+                current_speech['end'] = prev_end
+                # print("st", current_speech['start'], 'end', current_speech['end'],
+                #       'dur', current_speech['end'] - current_speech['start'])
+                speeches.append(current_speech)
+                current_speech = {}
+                if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                    triggered = False
+                else:
+                    current_speech['start'] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                # print("strict cut at", window_size_samples * i / sampling_rate)
+                current_speech['end'] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
+
 
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
+                # print(window_size_samples * i / sampling_rate)
                 temp_end = window_size_samples * i
+            if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                prev_end = temp_end
             if (window_size_samples * i) - temp_end < min_silence_samples:
                 continue
             else:
                 current_speech['end'] = temp_end
                 if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                     speeches.append(current_speech)
-                temp_end = 0
                 current_speech = {}
+                prev_end = next_start = temp_end = 0
                 triggered = False
                 continue
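A standalone sketch of the new splitting behaviour, condensed from the loop above. This is a simplified re-implementation for illustration, not the library code: it drops the min_silence/min_speech end-of-chunk handling, flushes the tail so the demo returns complete chunks, and all thresholds and sizes are assumptions:

def split_long_speech(speech_probs, threshold=0.5, sampling_rate=16000,
                      window_size_samples=1536, max_speech_duration_s=5.0):
    # Remember the last silence longer than ~98 ms (prev_end) and cut there
    # when a chunk would exceed max_speech_samples; with no such silence,
    # cut hard at the limit.
    neg_threshold = threshold - 0.15
    max_speech_samples = sampling_rate * max_speech_duration_s
    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
    speeches, current, triggered = [], {}, False
    temp_end = prev_end = next_start = 0
    for i, prob in enumerate(speech_probs):
        pos = window_size_samples * i
        if prob >= threshold and temp_end:
            temp_end = 0
            if next_start < prev_end:
                next_start = pos              # speech resumed after the silence
        if prob >= threshold and not triggered:
            triggered = True
            current['start'] = pos
            continue
        if triggered and pos - current['start'] > max_speech_samples:
            if prev_end:                      # split at the remembered silence
                current['end'] = prev_end
                speeches.append(current)
                if next_start < prev_end:     # still inside that silence
                    current, triggered = {}, False
                else:                         # speech already restarted
                    current = {'start': next_start}
            else:                             # no usable silence: hard cut
                current['end'] = pos
                speeches.append(current)
                current, triggered = {}, False
            prev_end = next_start = temp_end = 0
            continue
        if prob < neg_threshold and triggered:
            if not temp_end:
                temp_end = pos
            if pos - temp_end > min_silence_samples_at_max_speech:
                prev_end = temp_end           # silence long enough to cut at
    if triggered and current:                 # flush the tail for the demo
        current['end'] = window_size_samples * len(speech_probs)
        speeches.append(current)
    return speeches

# ~10 s of confident "speech" with a ~0.3 s dip around 3.9 s:
probs = [0.9] * 104
for j in (40, 41, 42):
    probs[j] = 0.1
print(split_long_speech(probs, max_speech_duration_s=5.0))
# [{'start': 0, 'end': 61440}, {'start': 66048, 'end': 147456},
#  {'start': 148992, 'end': 159744}]

The first chunk ends at the dip (sample 61440, ~3.84 s) rather than at the hard 5 s limit; the second stretch contains no qualifying silence, so it falls back to a hard cut just past the cap.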