mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 18:09:22 +08:00
add csharp example
This commit is contained in:
250
examples/csharp/SileroVadDetector.cs
Normal file
250
examples/csharp/SileroVadDetector.cs
Normal file
@@ -0,0 +1,250 @@
|
||||
using NAudio.Wave;
|
||||
using VADdotnet;
|
||||
|
||||
namespace VadDotNet;
|
||||
|
||||
/// <summary>
/// Runs the Silero VAD ONNX model over an audio file window by window and turns the
/// per-window speech probabilities into a list of speech segments.
/// </summary>
public class SileroVadDetector
{
    private readonly SileroVadOnnxModel _model;

    /// <summary>Probability at or above which a window counts as speech.</summary>
    private readonly float _threshold;

    /// <summary>Probability below which a window counts as silence while inside a segment.</summary>
    private readonly float _negThreshold;

    private readonly int _samplingRate;

    /// <summary>Number of samples fed to the model per call (512 at 16 kHz, 256 at 8 kHz).</summary>
    private readonly int _windowSizeSample;

    private readonly float _minSpeechSamples;
    private readonly float _speechPadSamples;
    private readonly float _maxSpeechSamples;
    private readonly float _minSilenceSamples;

    /// <summary>
    /// Silence length after which the start of that silence is remembered as a split
    /// point, used when a segment grows past <see cref="_maxSpeechSamples"/>.
    /// </summary>
    private readonly float _minSilenceSamplesAtMaxSpeech;

    /// <summary>Length, in samples, of the audio processed by the last detection run.</summary>
    private int _audioLengthSamples;

    private const float THRESHOLD_GAP = 0.15f;

    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_8K = 8000;

    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_16K = 16000;

    // ReSharper disable once InconsistentNaming
    private const int MIN_SILENCE_MS_AT_MAX_SPEECH = 98;

    /// <summary>
    /// Creates a detector and loads the ONNX model.
    /// </summary>
    /// <param name="onnxModelPath">Path of the Silero VAD ONNX model file.</param>
    /// <param name="threshold">Speech probability threshold; the silence threshold is this value minus 0.15.</param>
    /// <param name="samplingRate">Sampling rate of the audio; must be 8000 or 16000.</param>
    /// <param name="minSpeechDurationMs">Segments closed by silence that are not longer than this are dropped.</param>
    /// <param name="maxSpeechDurationSeconds">Upper bound on a segment's length before it is force-split.</param>
    /// <param name="minSilenceDurationMs">Silence length required to close a segment.</param>
    /// <param name="speechPadMs">Padding added on each side of every segment.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="samplingRate"/> is neither 8000 nor 16000.</exception>
    public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
        int minSpeechDurationMs, float maxSpeechDurationSeconds,
        int minSilenceDurationMs, int speechPadMs)
    {
        if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
        {
            throw new ArgumentException("Sampling rate not support, only available for [8000, 16000]");
        }

        _model = new SileroVadOnnxModel(onnxModelPath);
        _samplingRate = samplingRate;
        _threshold = threshold;
        _negThreshold = threshold - THRESHOLD_GAP;
        _windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;

        // All durations are converted to sample counts once, up front.
        _minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
        _speechPadSamples = samplingRate * speechPadMs / 1000f;
        _maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
        _minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        _minSilenceSamplesAtMaxSpeech = samplingRate * MIN_SILENCE_MS_AT_MAX_SPEECH / 1000f;

        Reset();
    }

    /// <summary>
    /// Resets the internal state of the underlying model.
    /// </summary>
    public void Reset()
    {
        _model.ResetStates();
    }

    /// <summary>
    /// Reads <paramref name="wavFile"/> in fixed-size windows, scores every window with the
    /// model and returns the detected speech segments.
    /// </summary>
    /// <remarks>
    /// No resampling is performed here: the samples are handed to the model together with
    /// the sampling rate given to the constructor.
    /// </remarks>
    /// <param name="wavFile">Audio file to analyse.</param>
    /// <returns>Merged speech segments with sample offsets and second values.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="wavFile"/> is null.</exception>
    public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
    {
        if (wavFile is null)
        {
            throw new ArgumentNullException(nameof(wavFile));
        }

        Reset();

        using (var audioFile = new AudioFileReader(wavFile.FullName))
        {
            List<float> speechProbList = new List<float>();
            float[] buffer = new float[_windowSizeSample];
            int totalSamplesRead = 0;
            int samplesRead;

            while ((samplesRead = audioFile.Read(buffer, 0, buffer.Length)) > 0)
            {
                // The buffer is reused, so a short (final) read would otherwise leave
                // samples of the previous window in the tail; pad with silence instead.
                if (samplesRead < buffer.Length)
                {
                    Array.Clear(buffer, samplesRead, buffer.Length - samplesRead);
                }

                totalSamplesRead += samplesRead;

                float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
                speechProbList.Add(speechProb);
            }

            // Counted from the samples actually read so that it is expressed in the same
            // unit as the window offsets. The reader's byte length is not used: dividing it
            // by 2 assumes 16-bit samples, while the reader is consumed here as float samples.
            _audioLengthSamples = totalSamplesRead;

            return CalculateProb(speechProbList);
        }
    }

    /// <summary>
    /// Converts per-window speech probabilities into padded, merged speech segments.
    /// </summary>
    /// <param name="speechProbList">One probability per window, in stream order.</param>
    /// <returns>Speech segments with sample offsets and second values.</returns>
    private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
    {
        List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
        bool triggered = false;

        // tempEnd:   start of the silence run currently being observed (0 = none).
        // prevEnd:   start of the last silence run long enough to serve as a split point.
        // nextStart: first speech window seen after prevEnd.
        int tempEnd = 0, prevEnd = 0, nextStart = 0;
        SileroSpeechSegment segment = new SileroSpeechSegment();

        for (int i = 0; i < speechProbList.Count; i++)
        {
            float speechProb = speechProbList[i];
            int windowStart = _windowSizeSample * i;

            // Speech resumed after a silence run: forget the tentative end.
            if (speechProb >= _threshold && tempEnd != 0)
            {
                tempEnd = 0;
                if (nextStart < prevEnd)
                {
                    nextStart = windowStart;
                }
            }

            // First speech window: open a segment.
            if (speechProb >= _threshold && !triggered)
            {
                triggered = true;
                segment.StartOffset = windowStart;
                continue;
            }

            // Segment exceeded the maximum length: split it.
            if (triggered && windowStart - segment.StartOffset > _maxSpeechSamples)
            {
                if (prevEnd != 0)
                {
                    // Cut at the last remembered silence.
                    segment.EndOffset = prevEnd;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();

                    if (nextStart < prevEnd)
                    {
                        triggered = false;
                    }
                    else
                    {
                        // Speech already resumed after the cut: continue from there.
                        segment.StartOffset = nextStart;
                    }

                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                }
                else
                {
                    // No silence to cut at: end the segment at the current window.
                    segment.EndOffset = windowStart;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }

            // Silence while inside a segment.
            if (speechProb < _negThreshold && triggered)
            {
                if (tempEnd == 0)
                {
                    tempEnd = windowStart;
                }

                if (windowStart - tempEnd > _minSilenceSamplesAtMaxSpeech)
                {
                    prevEnd = tempEnd;
                }

                if (windowStart - tempEnd < _minSilenceSamples)
                {
                    // Silence is still too short to close the segment.
                    continue;
                }

                segment.EndOffset = tempEnd;
                if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
                {
                    result.Add(segment);
                }

                segment = new SileroSpeechSegment();
                prevEnd = 0;
                nextStart = 0;
                tempEnd = 0;
                triggered = false;
            }
        }

        // A segment still open at the end of the audio runs up to the last sample.
        if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
        {
            segment.EndOffset = _audioLengthSamples;
            result.Add(segment);
        }

        PadSegments(result);

        return MergeListAndCalculateSecond(result, _samplingRate);
    }

    /// <summary>
    /// Widens every segment by the configured padding, in place. When the gap between two
    /// neighbours is smaller than twice the padding, the gap is split between them instead.
    /// </summary>
    /// <param name="segments">Segments in stream order; their offsets are modified.</param>
    private void PadSegments(List<SileroSpeechSegment> segments)
    {
        for (int i = 0; i < segments.Count; i++)
        {
            SileroSpeechSegment item = segments[i];

            // Only the first segment pads its own start; every later start is padded
            // while its predecessor is being handled.
            if (i == 0)
            {
                item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
            }

            if (i != segments.Count - 1)
            {
                SileroSpeechSegment nextItem = segments[i + 1];
                int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;

                if (silenceDuration < 2 * _speechPadSamples)
                {
                    item.EndOffset = item.EndOffset + (silenceDuration / 2);
                    nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
                }
                else
                {
                    item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
                    nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
                }
            }
            else
            {
                item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
            }
        }
    }

    /// <summary>
    /// Merges overlapping or touching segments and fills in their second values.
    /// </summary>
    /// <param name="original">Segments with sample offsets set; sorted in place by start offset.</param>
    /// <param name="samplingRate">Sampling rate used to convert offsets to seconds.</param>
    /// <returns>New list of merged segments; empty when <paramref name="original"/> is null or empty.</returns>
    private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
    {
        List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
        if (original == null || original.Count == 0)
        {
            return result;
        }

        SileroSpeechSegment ToSegment(int start, int end) =>
            new SileroSpeechSegment(start, end,
                CalculateSecondByOffset(start, samplingRate), CalculateSecondByOffset(end, samplingRate));

        // Sort before picking the first interval; otherwise the merge would be seeded
        // with whatever element happened to be first in an unsorted list.
        original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));

        int left = original[0].StartOffset.Value;
        int right = original[0].EndOffset.Value;

        for (int i = 1; i < original.Count; i++)
        {
            SileroSpeechSegment segment = original[i];

            if (segment.StartOffset > right)
            {
                // Gap found: emit the interval collected so far and start a new one.
                result.Add(ToSegment(left, right));
                left = segment.StartOffset.Value;
                right = segment.EndOffset.Value;
            }
            else
            {
                right = Math.Max(right, segment.EndOffset.Value);
            }
        }

        result.Add(ToSegment(left, right));

        return result;
    }

    /// <summary>
    /// Converts a sample offset to seconds, rounded down to three decimal places.
    /// </summary>
    /// <param name="offset">Offset in samples.</param>
    /// <param name="samplingRate">Samples per second.</param>
    /// <returns>The offset in seconds, floored to three decimal places.</returns>
    private static float CalculateSecondByOffset(int offset, int samplingRate)
    {
        float secondValue = offset * 1.0f / samplingRate;
        return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
    }
}
|
||||
Reference in New Issue
Block a user