Merge pull request #710 from Purfview/patch-3

Fixes and refines - use_max_poss_sil_at_max_speech arg
Dimitrii Voronin
2025-10-29 12:36:58 +03:00
committed by GitHub

@@ -346,53 +346,70 @@ def get_speech_timestamps(audio: torch.Tensor,
     possible_ends = []
     for i, speech_prob in enumerate(speech_probs):
-        if (speech_prob >= threshold) and temp_end:
-            if temp_end != 0:
-                sil_dur = (window_size_samples * i) - temp_end
-                if sil_dur > min_silence_samples_at_max_speech:
-                    possible_ends.append((temp_end, sil_dur))
-            temp_end = 0
-            if next_start < prev_end:
-                next_start = window_size_samples * i
+        cur_sample = window_size_samples * i
+
+        # If speech returns after a temp_end, record candidate silence if long enough and clear temp_end
+        if (speech_prob >= threshold) and temp_end:
+            sil_dur = cur_sample - temp_end
+            if sil_dur > min_silence_samples_at_max_speech:
+                possible_ends.append((temp_end, sil_dur))
+            temp_end = 0
+            if next_start < prev_end:
+                next_start = cur_sample

+        # Start of speech
         if (speech_prob >= threshold) and not triggered:
             triggered = True
-            current_speech['start'] = window_size_samples * i
+            current_speech['start'] = cur_sample
             continue

-        if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
-            if possible_ends:
-                if use_max_poss_sil_at_max_speech:
-                    prev_end, dur = max(possible_ends, key=lambda x: x[1])  # use the longest possible silence segment in the current speech chunk
-                else:
-                    prev_end, dur = possible_ends[-1]  # use the last possible silence segment
+        # Max speech length reached: decide where to cut
+        if triggered and (cur_sample - current_speech['start'] > max_speech_samples):
+            if use_max_poss_sil_at_max_speech and possible_ends:
+                prev_end, dur = max(possible_ends, key=lambda x: x[1])  # use the longest possible silence segment in the current speech chunk
                 current_speech['end'] = prev_end
                 speeches.append(current_speech)
                 current_speech = {}
                 next_start = prev_end + dur
-                if next_start < prev_end + window_size_samples * i:  # previously reached silence (< neg_thres) and is still not speech (< thres)
-                    #triggered = False
+                if next_start < prev_end + cur_sample:  # previously reached silence (< neg_thres) and is still not speech (< thres)
                     current_speech['start'] = next_start
                 else:
                     triggered = False
-                    #current_speech['start'] = next_start
                 prev_end = next_start = temp_end = 0
                 possible_ends = []
             else:
-                current_speech['end'] = window_size_samples * i
-                speeches.append(current_speech)
-                current_speech = {}
-                prev_end = next_start = temp_end = 0
-                triggered = False
-                possible_ends = []
-                continue
+                # Legacy max-speech cut (use_max_poss_sil_at_max_speech=False): prefer last valid silence (prev_end) if available
+                if prev_end:
+                    current_speech['end'] = prev_end
+                    speeches.append(current_speech)
+                    current_speech = {}
+                    if next_start < prev_end:
+                        triggered = False
+                    else:
+                        current_speech['start'] = next_start
+                    prev_end = next_start = temp_end = 0
+                    possible_ends = []
+                else:
+                    # No prev_end -> fallback to cutting at current sample
+                    current_speech['end'] = cur_sample
+                    speeches.append(current_speech)
+                    current_speech = {}
+                    prev_end = next_start = temp_end = 0
+                    triggered = False
+                    possible_ends = []
+                continue

+        # Silence detection while in speech
         if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
-                temp_end = window_size_samples * i
-            # if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
-            #     prev_end = temp_end
-            if (window_size_samples * i) - temp_end < min_silence_samples:
+                temp_end = cur_sample
+            sil_dur_now = cur_sample - temp_end
+            if not use_max_poss_sil_at_max_speech and sil_dur_now > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                prev_end = temp_end
+            if sil_dur_now < min_silence_samples:
                 continue
             else:
                 current_speech['end'] = temp_end
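
The argument refined by this PR is use_max_poss_sil_at_max_speech on get_speech_timestamps. Below is a minimal usage sketch, not part of this commit: it assumes the silero_vad pip package entry points (load_silero_vad, read_audio, get_speech_timestamps), a placeholder audio path, and that the flag is exposed as a keyword argument whose default and exact behaviour may differ between releases.

# Minimal usage sketch; see assumptions above, values are placeholders.
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

model = load_silero_vad()
wav = read_audio('example.wav', sampling_rate=16000)  # placeholder path

# When a speech chunk exceeds max_speech_duration_s:
#  - use_max_poss_sil_at_max_speech=True cuts at the longest candidate silence
#    recorded so far (possible_ends);
#  - False prefers the last valid silence (prev_end), falling back to the
#    current sample if none was seen.
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    sampling_rate=16000,
    max_speech_duration_s=30,
    use_max_poss_sil_at_max_speech=True,
)
print(speech_timestamps)  # list of {'start': ..., 'end': ...} in samples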