mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-04 09:29:22 +08:00
250 lines
8.8 KiB
C#
250 lines
8.8 KiB
C#
using NAudio.Wave;
|
|
using VADdotnet;
|
|
|
|
namespace VadDotNet;
|
|
|
|
public class SileroVadDetector
|
|
{
|
|
private readonly SileroVadOnnxModel _model;
|
|
private readonly float _threshold;
|
|
private readonly float _negThreshold;
|
|
private readonly int _samplingRate;
|
|
private readonly int _windowSizeSample;
|
|
private readonly float _minSpeechSamples;
|
|
private readonly float _speechPadSamples;
|
|
private readonly float _maxSpeechSamples;
|
|
private readonly float _minSilenceSamples;
|
|
private readonly float _minSilenceSamplesAtMaxSpeech;
|
|
private int _audioLengthSamples;
|
|
private const float THRESHOLD_GAP = 0.15f;
|
|
// ReSharper disable once InconsistentNaming
|
|
private const int SAMPLING_RATE_8K = 8000;
|
|
// ReSharper disable once InconsistentNaming
|
|
private const int SAMPLING_RATE_16K = 16000;
|
|
|
|
public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
|
|
int minSpeechDurationMs, float maxSpeechDurationSeconds,
|
|
int minSilenceDurationMs, int speechPadMs)
|
|
{
|
|
if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
|
|
{
|
|
throw new ArgumentException("Sampling rate not support, only available for [8000, 16000]");
|
|
}
|
|
|
|
this._model = new SileroVadOnnxModel(onnxModelPath);
|
|
this._samplingRate = samplingRate;
|
|
this._threshold = threshold;
|
|
this._negThreshold = threshold - THRESHOLD_GAP;
|
|
this._windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;
|
|
this._minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
|
|
this._speechPadSamples = samplingRate * speechPadMs / 1000f;
|
|
this._maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
|
|
this._minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
|
|
this._minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
|
|
this.Reset();
|
|
}
|
|
|
|
public void Reset()
|
|
{
|
|
_model.ResetStates();
|
|
}
|
|
|
|
public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
|
|
{
|
|
Reset();
|
|
|
|
using (var audioFile = new AudioFileReader(wavFile.FullName))
|
|
{
|
|
List<float> speechProbList = new List<float>();
|
|
this._audioLengthSamples = (int)(audioFile.Length / 2);
|
|
float[] buffer = new float[this._windowSizeSample];
|
|
|
|
while (audioFile.Read(buffer, 0, buffer.Length) > 0)
|
|
{
|
|
float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
|
|
speechProbList.Add(speechProb);
|
|
}
|
|
|
|
return CalculateProb(speechProbList);
|
|
}
|
|
}
|
|
|
|
private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
|
|
{
|
|
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
|
|
bool triggered = false;
|
|
int tempEnd = 0, prevEnd = 0, nextStart = 0;
|
|
SileroSpeechSegment segment = new SileroSpeechSegment();
|
|
|
|
for (int i = 0; i < speechProbList.Count; i++)
|
|
{
|
|
float speechProb = speechProbList[i];
|
|
if (speechProb >= _threshold && (tempEnd != 0))
|
|
{
|
|
tempEnd = 0;
|
|
if (nextStart < prevEnd)
|
|
{
|
|
nextStart = _windowSizeSample * i;
|
|
}
|
|
}
|
|
|
|
if (speechProb >= _threshold && !triggered)
|
|
{
|
|
triggered = true;
|
|
segment.StartOffset = _windowSizeSample * i;
|
|
continue;
|
|
}
|
|
|
|
if (triggered && (_windowSizeSample * i) - segment.StartOffset > _maxSpeechSamples)
|
|
{
|
|
if (prevEnd != 0)
|
|
{
|
|
segment.EndOffset = prevEnd;
|
|
result.Add(segment);
|
|
segment = new SileroSpeechSegment();
|
|
if (nextStart < prevEnd)
|
|
{
|
|
triggered = false;
|
|
}
|
|
else
|
|
{
|
|
segment.StartOffset = nextStart;
|
|
}
|
|
|
|
prevEnd = 0;
|
|
nextStart = 0;
|
|
tempEnd = 0;
|
|
}
|
|
else
|
|
{
|
|
segment.EndOffset = _windowSizeSample * i;
|
|
result.Add(segment);
|
|
segment = new SileroSpeechSegment();
|
|
prevEnd = 0;
|
|
nextStart = 0;
|
|
tempEnd = 0;
|
|
triggered = false;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (speechProb < _negThreshold && triggered)
|
|
{
|
|
if (tempEnd == 0)
|
|
{
|
|
tempEnd = _windowSizeSample * i;
|
|
}
|
|
|
|
if (((_windowSizeSample * i) - tempEnd) > _minSilenceSamplesAtMaxSpeech)
|
|
{
|
|
prevEnd = tempEnd;
|
|
}
|
|
|
|
if ((_windowSizeSample * i) - tempEnd < _minSilenceSamples)
|
|
{
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
segment.EndOffset = tempEnd;
|
|
if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
|
|
{
|
|
result.Add(segment);
|
|
}
|
|
|
|
segment = new SileroSpeechSegment();
|
|
prevEnd = 0;
|
|
nextStart = 0;
|
|
tempEnd = 0;
|
|
triggered = false;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
|
|
{
|
|
segment.EndOffset = _audioLengthSamples;
|
|
result.Add(segment);
|
|
}
|
|
|
|
for (int i = 0; i < result.Count; i++)
|
|
{
|
|
SileroSpeechSegment item = result[i];
|
|
if (i == 0)
|
|
{
|
|
item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
|
|
}
|
|
|
|
if (i != result.Count - 1)
|
|
{
|
|
SileroSpeechSegment nextItem = result[i + 1];
|
|
int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
|
|
if (silenceDuration < 2 * _speechPadSamples)
|
|
{
|
|
item.EndOffset = item.EndOffset + (silenceDuration / 2);
|
|
nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
|
|
}
|
|
else
|
|
{
|
|
item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
|
|
nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
|
|
}
|
|
}
|
|
|
|
return MergeListAndCalculateSecond(result, _samplingRate);
|
|
}
|
|
|
|
private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
|
|
{
|
|
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
|
|
if (original == null || original.Count == 0)
|
|
{
|
|
return result;
|
|
}
|
|
|
|
int left = original[0].StartOffset.Value;
|
|
int right = original[0].EndOffset.Value;
|
|
if (original.Count > 1)
|
|
{
|
|
original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
|
|
for (int i = 1; i < original.Count; i++)
|
|
{
|
|
SileroSpeechSegment segment = original[i];
|
|
|
|
if (segment.StartOffset > right)
|
|
{
|
|
result.Add(new SileroSpeechSegment(left, right,
|
|
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
|
|
left = segment.StartOffset.Value;
|
|
right = segment.EndOffset.Value;
|
|
}
|
|
else
|
|
{
|
|
right = Math.Max(right, segment.EndOffset.Value);
|
|
}
|
|
}
|
|
|
|
result.Add(new SileroSpeechSegment(left, right,
|
|
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
|
|
}
|
|
else
|
|
{
|
|
result.Add(new SileroSpeechSegment(left, right,
|
|
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
private float CalculateSecondByOffset(int offset, int samplingRate)
|
|
{
|
|
float secondValue = offset * 1.0f / samplingRate;
|
|
return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
|
|
}
|
|
} |