mirror of https://github.com/snakers4/silero-vad.git (synced 2026-02-04 17:39:22 +08:00)
utils_vad max duration
utils_vad.py (42 changed lines)
@@ -121,6 +121,7 @@ def get_speech_timestamps(audio: torch.Tensor,
                           threshold: float = 0.5,
                           sampling_rate: int = 16000,
                           min_speech_duration_ms: int = 250,
+                          max_speech_duration_s: float = float('inf'),
                           min_silence_duration_ms: int = 100,
                           window_size_samples: int = 1536,
                           speech_pad_ms: int = 30,
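For context, a minimal usage sketch of the extended signature, assuming the standard silero-vad torch.hub entry points; the input file name and the 15-second cap are illustrative, not part of the commit:

import torch

# load the model and the bundled utilities (standard silero-vad hub entry point)
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
(get_speech_timestamps, _, read_audio, _, _) = utils

wav = read_audio('example.wav', sampling_rate=16000)  # hypothetical input file
timestamps = get_speech_timestamps(
    wav, model,
    sampling_rate=16000,
    max_speech_duration_s=15.0,  # new parameter: cap speech chunks at ~15 s
)
print(timestamps)  # [{'start': ..., 'end': ...}, ...] in samples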
@@ -147,6 +148,11 @@ def get_speech_timestamps(audio: torch.Tensor,
     min_speech_duration_ms: int (default - 250 milliseconds)
         Final speech chunks shorter than min_speech_duration_ms are thrown out
 
+    max_speech_duration_s: float (default - inf)
+        Maximum duration of speech chunks in seconds
+        Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
+
     min_silence_duration_ms: int (default - 100 milliseconds)
         At the end of each speech chunk, wait for min_silence_duration_ms before separating it
@@ -197,8 +203,10 @@ def get_speech_timestamps(audio: torch.Tensor,
 
     model.reset_states()
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+    max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
+    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
 
     audio_length_samples = len(audio)
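To make the derived sample counts concrete, here is the same arithmetic evaluated at 16 kHz with an assumed 15-second cap (illustrative values, not part of the commit):

sampling_rate = 16000
window_size_samples = 1536
speech_pad_ms = 30
max_speech_duration_s = 15.0  # assumed cap for this example

speech_pad_samples = sampling_rate * speech_pad_ms / 1000              # 480.0
max_speech_samples = (sampling_rate * max_speech_duration_s
                      - window_size_samples - 2 * speech_pad_samples)  # 237504.0
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000          # 1568.0, ~98 ms

# 1568 samples is just over one 1536-sample window, so a silence must span
# more than one window before it is recorded as a candidate split point.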
@@ -214,28 +222,56 @@ def get_speech_timestamps(audio: torch.Tensor,
     speeches = []
     current_speech = {}
     neg_threshold = threshold - 0.15
-    temp_end = 0
+    temp_end = 0  # to save potential segment end (and tolerate some silence)
+    prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
 
     for i, speech_prob in enumerate(speech_probs):
         if (speech_prob >= threshold) and temp_end:
             temp_end = 0
+            if next_start < prev_end:
+                next_start = window_size_samples * i
 
         if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech['start'] = window_size_samples * i
             continue
 
+        if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+            if prev_end:
+                current_speech['end'] = prev_end
+                # print("st", current_speech['start'], 'end', current_speech['end'],
+                #       'dur', current_speech['end'] - current_speech['start'])
+                speeches.append(current_speech)
+                current_speech = {}
+                if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                    triggered = False
+                else:
+                    current_speech['start'] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                # print("strict cut at", window_size_samples * i / sampling_rate)
+                current_speech['end'] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
+
 
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
+                # print(window_size_samples * i / sampling_rate)
                 temp_end = window_size_samples * i
+            if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                prev_end = temp_end
             if (window_size_samples * i) - temp_end < min_silence_samples:
                 continue
             else:
                 current_speech['end'] = temp_end
                 if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                     speeches.append(current_speech)
-                temp_end = 0
                 current_speech = {}
+                prev_end = next_start = temp_end = 0
                 triggered = False
                 continue
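A standalone sketch of the new splitting behaviour, condensed from the loop above. This is a simplified re-implementation for illustration, not the library code: it drops the min_silence/min_speech end-of-chunk handling, flushes the tail so the demo returns complete chunks, and all thresholds and sizes are assumptions:

def split_long_speech(speech_probs, threshold=0.5, sampling_rate=16000,
                      window_size_samples=1536, max_speech_duration_s=5.0):
    # Remember the last silence longer than ~98 ms (prev_end) and cut there
    # when a chunk would exceed max_speech_samples; with no such silence,
    # cut hard at the limit.
    neg_threshold = threshold - 0.15
    max_speech_samples = sampling_rate * max_speech_duration_s
    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
    speeches, current, triggered = [], {}, False
    temp_end = prev_end = next_start = 0
    for i, prob in enumerate(speech_probs):
        pos = window_size_samples * i
        if prob >= threshold and temp_end:
            temp_end = 0
            if next_start < prev_end:
                next_start = pos              # speech resumed after the silence
        if prob >= threshold and not triggered:
            triggered = True
            current['start'] = pos
            continue
        if triggered and pos - current['start'] > max_speech_samples:
            if prev_end:                      # split at the remembered silence
                current['end'] = prev_end
                speeches.append(current)
                if next_start < prev_end:     # still inside that silence
                    current, triggered = {}, False
                else:                         # speech already restarted
                    current = {'start': next_start}
            else:                             # no usable silence: hard cut
                current['end'] = pos
                speeches.append(current)
                current, triggered = {}, False
            prev_end = next_start = temp_end = 0
            continue
        if prob < neg_threshold and triggered:
            if not temp_end:
                temp_end = pos
            if pos - temp_end > min_silence_samples_at_max_speech:
                prev_end = temp_end           # silence long enough to cut at
    if triggered and current:                 # flush the tail for the demo
        current['end'] = window_size_samples * len(speech_probs)
        speeches.append(current)
    return speeches

# ~10 s of confident "speech" with a ~0.3 s dip around 3.9 s:
probs = [0.9] * 104
for j in (40, 41, 42):
    probs[j] = 0.1
print(split_long_speech(probs, max_speech_duration_s=5.0))
# [{'start': 0, 'end': 61440}, {'start': 66048, 'end': 147456},
#  {'start': 148992, 'end': 159744}]

The first chunk ends at the dip (sample 61440, ~3.84 s) rather than at the hard 5 s limit; the second stretch contains no qualifying silence, so it falls back to a hard cut just past the cap.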