From 324bc74a58cefe41dc5d16556ae2d3c8d98c0bad Mon Sep 17 00:00:00 2001
From: Mohamed Bouaziz
Date: Fri, 4 Nov 2022 21:42:22 +0100
Subject: [PATCH] utils_vad max duration

---
 utils_vad.py | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/utils_vad.py b/utils_vad.py
index 80db3c9..5f9fc02 100644
--- a/utils_vad.py
+++ b/utils_vad.py
@@ -121,6 +121,7 @@ def get_speech_timestamps(audio: torch.Tensor,
                           threshold: float = 0.5,
                           sampling_rate: int = 16000,
                           min_speech_duration_ms: int = 250,
+                          max_speech_duration_s: float = float('inf'),
                           min_silence_duration_ms: int = 100,
                           window_size_samples: int = 1536,
                           speech_pad_ms: int = 30,
@@ -147,6 +148,11 @@ def get_speech_timestamps(audio: torch.Tensor,
     min_speech_duration_ms: int (default - 250 milliseconds)
         Final speech chunks shorter min_speech_duration_ms are thrown out
 
+    max_speech_duration_s: float (default - inf)
+        Maximum duration of speech chunks in seconds
+        Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 98 ms (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
+
     min_silence_duration_ms: int (default - 100 milliseconds)
         In the end of each speech chunk wait for min_silence_duration_ms before separating it
 
@@ -197,8 +203,10 @@ def get_speech_timestamps(audio: torch.Tensor,
     model.reset_states()
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+    max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
+    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
 
     audio_length_samples = len(audio)
 
@@ -214,28 +222,56 @@ def get_speech_timestamps(audio: torch.Tensor,
     speeches = []
     current_speech = {}
     neg_threshold = threshold - 0.15
-    temp_end = 0
+    temp_end = 0  # to save potential segment end (and tolerate some silence)
+    prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
 
     for i, speech_prob in enumerate(speech_probs):
         if (speech_prob >= threshold) and temp_end:
             temp_end = 0
+            if next_start < prev_end:
+                next_start = window_size_samples * i
 
         if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech['start'] = window_size_samples * i
             continue
 
+        if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+            if prev_end:
+                current_speech['end'] = prev_end
+                #print("st", current_speech['start'], 'end', current_speech['end'],
+                #      'dur', current_speech['end'] - current_speech['start'])
+                speeches.append(current_speech)
+                current_speech = {}
+                if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                    triggered = False
+                else:
+                    current_speech['start'] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                #print("strict cut at ", window_size_samples * i / sampling_rate)
+                current_speech['end'] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
+
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
+                #print(window_size_samples * i / sampling_rate)
                 temp_end = window_size_samples * i
+            if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                prev_end = temp_end
             if (window_size_samples * i) - temp_end < min_silence_samples:
                 continue
             else:
                 current_speech['end'] = temp_end
                 if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                     speeches.append(current_speech)
-                temp_end = 0
                 current_speech = {}
+                prev_end = next_start = temp_end = 0
                 triggered = False
                 continue
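
A short, illustrative way to exercise the new max_speech_duration_s argument once the patch is applied. This is a sketch, not part of the patch: the torch.hub entry point, the 'example.wav' path, and the 15-second cap are assumptions chosen for the demo.

# Illustrative usage sketch (not part of the patch). Assumes the public
# silero-vad torch.hub entry point and a local 16 kHz file 'example.wav'.
import torch

model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
get_speech_timestamps, _, read_audio, _, _ = utils

wav = read_audio('example.wav', sampling_rate=16000)

# Cap speech chunks at roughly 15 s: chunks are split at the last silence of
# at least 98 ms when one exists, otherwise hard-cut just before the limit.
speech_timestamps = get_speech_timestamps(wav,
                                          model,
                                          sampling_rate=16000,
                                          max_speech_duration_s=15)
print(speech_timestamps)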