From ab54e692c3fb46249e7b481536d6419bb93e0367 Mon Sep 17 00:00:00 2001
From: adamnsandle
Date: Fri, 5 Feb 2021 15:08:35 +0000
Subject: [PATCH 1/2] add min_silence_duration

---
 utils_vad.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/utils_vad.py b/utils_vad.py
index a8f8c60..258001b 100644
--- a/utils_vad.py
+++ b/utils_vad.py
@@ -60,6 +60,7 @@ def get_speech_ts(wav: torch.Tensor,
                   batch_size: int = 200,
                   num_samples_per_window: int = 4000,
                   min_speech_samples: int = 10000,  #samples
+                  min_silence_samples: int = 8000,
                   run_function=validate,
                   visualize_probs=False):
@@ -95,20 +96,31 @@ def get_speech_ts(wav: torch.Tensor,
     smoothed_probs = []
     speech_probs = outs[:, 1]  # this is very misleading
+    temp_end = 0
     for i, predict in enumerate(speech_probs):  # add name
         buffer.append(predict)
         smoothed_prob = (sum(buffer) / len(buffer))
         if visualize_probs:
             smoothed_probs.append(float(smoothed_prob))
+        if (smoothed_prob >= trig_sum) and temp_end:
+            temp_end = 0
         if (smoothed_prob >= trig_sum) and not triggered:
             triggered = True
             current_speech['start'] = step * max(0, i-num_steps)
+            continue
         if (smoothed_prob < neg_trig_sum) and triggered:
-            current_speech['end'] = step * i
-            if (current_speech['end'] - current_speech['start']) > min_speech_samples:
-                speeches.append(current_speech)
-            current_speech = {}
-            triggered = False
+            if not temp_end:
+                temp_end = step * i
+            if step * i - temp_end < min_silence_samples:
+                continue
+            else:
+                current_speech['end'] = temp_end
+                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                    speeches.append(current_speech)
+                temp_end = 0
+                current_speech = {}
+                triggered = False
+                continue
     if current_speech:
         current_speech['end'] = len(wav)
         speeches.append(current_speech)

From bdb0da895932411103a92620de1f90b0917cb4ed Mon Sep 17 00:00:00 2001
From: Dimitrii Voronin <36505480+adamnsandle@users.noreply.github.com>
Date: Fri, 5 Feb 2021 17:13:18 +0200
Subject: [PATCH 2/2] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 8814f68..1fd8277 100644
--- a/README.md
+++ b/README.md
@@ -328,6 +328,7 @@ Since our VAD (only VAD, other networks are more flexible) was trained on chunks
 - `num_steps` - number of overlapping windows to split audio chunk into (we recommend 4 or 8)
 - `num_samples_per_window` - number of samples in each window, our models were trained using `4000` samples (250 ms) per window, so this is the preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
 - `min_speech_samples` - minimum speech chunk duration in samples
+- `min_silence_samples` - minimum silence duration in samples between two separate speech chunks

 Optimal parameters may vary per domain, but we provided a tiny tool to learn the best parameters. You can invoke `speech_timestamps` with visualize_probs=True (`pandas` required):
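
For context (not part of the patches themselves), a minimal usage sketch of the new `min_silence_samples` parameter follows. The file paths, the `torch.jit.load` model-loading call, and importing `read_audio` from `utils_vad.py` are assumptions about the repository layout at the time, not something these diffs specify; adjust them to your checkout.

```python
# Minimal usage sketch for min_silence_samples (illustrative, not part of the patch).
# Assumptions: utils_vad.py is importable, a JIT-scripted VAD model is available
# at 'files/model.jit', and 'example.wav' is a 16 kHz mono file.
import torch
from utils_vad import get_speech_ts, read_audio

model = torch.jit.load('files/model.jit')  # assumed model path
model.eval()

wav = read_audio('example.wav')  # waveform tensor expected by the VAD

# With min_silence_samples=8000 (~0.5 s at 16 kHz), a speech chunk is only
# closed after the smoothed probability has stayed below neg_trig_sum for at
# least that many samples, so short pauses inside an utterance no longer split
# it into several chunks.
speech_timestamps = get_speech_ts(wav, model,
                                  num_steps=4,
                                  min_silence_samples=8000)
print(speech_timestamps)  # list of {'start': ..., 'end': ...} dicts, in samples
```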