diff --git a/examples/csharp/Program.cs b/examples/csharp/Program.cs
new file mode 100644
index 0000000..131a51d
--- /dev/null
+++ b/examples/csharp/Program.cs
@@ -0,0 +1,34 @@
+using System.Text;
+
+namespace VadDotNet;
+
+/// <summary>
+/// Console example: runs Silero VAD over a WAV file and prints the detected speech segments.
+/// </summary>
+class Program
+{
+    private const string MODEL_PATH = "./resources/silero_vad.onnx";
+    private const string EXAMPLE_WAV_FILE = "./resources/example.wav";
+    private const int SAMPLE_RATE = 16000;
+    private const float THRESHOLD = 0.5f;
+    private const int MIN_SPEECH_DURATION_MS = 250;
+    private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
+    private const int MIN_SILENCE_DURATION_MS = 100;
+    private const int SPEECH_PAD_MS = 30;
+
+    public static void Main(string[] args)
+    {
+        // Dispose the detector so the native ONNX Runtime session is released.
+        using var vadDetector = new SileroVadDetector(MODEL_PATH, THRESHOLD, SAMPLE_RATE,
+            MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
+        List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));
+
+        StringBuilder sb = new StringBuilder();
+        foreach (var speechSegment in speechTimeList)
+        {
+            sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");
+        }
+
+        Console.WriteLine(sb.ToString());
+    }
+}
diff --git a/examples/csharp/SileroSpeechSegment.cs b/examples/csharp/SileroSpeechSegment.cs
new file mode 100644
index 0000000..f9d85be
--- /dev/null
+++ b/examples/csharp/SileroSpeechSegment.cs
@@ -0,0 +1,33 @@
+namespace VadDotNet;
+
+/// <summary>
+/// One detected speech region, expressed both as sample offsets and as seconds.
+/// </summary>
+public class SileroSpeechSegment
+{
+    /// <summary>Offset of the start of the segment in samples; null until assigned.</summary>
+    public int? StartOffset { get; set; }
+
+    /// <summary>Offset of the end of the segment in samples; null until assigned.</summary>
+    public int? EndOffset { get; set; }
+
+    /// <summary>Start of the segment in seconds; null until assigned.</summary>
+    public float? StartSecond { get; set; }
+
+    /// <summary>End of the segment in seconds; null until assigned.</summary>
+    public float? EndSecond { get; set; }
+
+    /// <summary>Creates an empty segment with every field unset.</summary>
+    public SileroSpeechSegment()
+    {
+    }
+
+    /// <summary>Creates a segment with all four fields populated from the arguments.</summary>
+    public SileroSpeechSegment(int startOffset, int? endOffset, float? startSecond, float? endSecond)
+    {
+        StartOffset = startOffset;
+        EndOffset = endOffset;
+        StartSecond = startSecond;
+        EndSecond = endSecond;
+    }
+}
\ No newline at end of file
diff --git a/examples/csharp/SileroVadDetector.cs b/examples/csharp/SileroVadDetector.cs
new file mode 100644
index 0000000..8aec4ab
--- /dev/null
+++ b/examples/csharp/SileroVadDetector.cs
@@ -0,0 +1,304 @@
+using NAudio.Wave;
+using VADdotnet;
+
+namespace VadDotNet;
+
+/// <summary>
+/// Splits a WAV file into speech segments using the per-window speech probabilities
+/// returned by <see cref="SileroVadOnnxModel"/>.
+/// </summary>
+public class SileroVadDetector : IDisposable
+{
+    private readonly SileroVadOnnxModel _model;
+    private readonly float _threshold;
+    private readonly float _negThreshold;
+    private readonly int _samplingRate;
+    private readonly int _windowSizeSample;
+    private readonly float _minSpeechSamples;
+    private readonly float _speechPadSamples;
+    private readonly float _maxSpeechSamples;
+    private readonly float _minSilenceSamples;
+    private readonly float _minSilenceSamplesAtMaxSpeech;
+    private int _audioLengthSamples;
+    private const float THRESHOLD_GAP = 0.15f;
+    // ReSharper disable once InconsistentNaming
+    private const int SAMPLING_RATE_8K = 8000;
+    // ReSharper disable once InconsistentNaming
+    private const int SAMPLING_RATE_16K = 16000;
+
+    /// <summary>
+    /// Loads the ONNX model and converts the duration settings into sample counts.
+    /// </summary>
+    /// <param name="onnxModelPath">Path of the Silero VAD ONNX model file.</param>
+    /// <param name="threshold">Probability at or above which a window counts as speech.</param>
+    /// <param name="samplingRate">Sampling rate of the audio; must be 8000 or 16000.</param>
+    /// <param name="minSpeechDurationMs">Minimum length of a segment that is closed by silence.</param>
+    /// <param name="maxSpeechDurationSeconds">Length after which a running segment is split.</param>
+    /// <param name="minSilenceDurationMs">Silence needed before a running segment is closed.</param>
+    /// <param name="speechPadMs">Padding applied around the detected segments.</param>
+    /// <exception cref="ArgumentException">The sampling rate is neither 8000 nor 16000.</exception>
+    public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
+        int minSpeechDurationMs, float maxSpeechDurationSeconds,
+        int minSilenceDurationMs, int speechPadMs)
+    {
+        if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
+        {
+            throw new ArgumentException("Sampling rate not support, only available for [8000, 16000]");
+        }
+
+        this._model = new SileroVadOnnxModel(onnxModelPath);
+        this._samplingRate = samplingRate;
+        this._threshold = threshold;
+        // A window only counts as silence once its probability drops below this lower bound.
+        this._negThreshold = threshold - THRESHOLD_GAP;
+        this._windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;
+        this._minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
+        this._speechPadSamples = samplingRate * speechPadMs / 1000f;
+        this._maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
+        this._minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
+        // Silence longer than 98 ms is remembered as a split point for over-long segments.
+        this._minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
+        this.Reset();
+    }
+
+    /// <summary>Resets the state kept by the underlying model.</summary>
+    public void Reset()
+    {
+        _model.ResetStates();
+    }
+
+    /// <summary>Disposes the underlying ONNX model.</summary>
+    public void Dispose()
+    {
+        _model.Dispose();
+        GC.SuppressFinalize(this);
+    }
+
+    /// <summary>
+    /// Runs the model over <paramref name="wavFile"/> window by window and returns the speech segments.
+    /// </summary>
+    /// <param name="wavFile">Audio file to analyse.</param>
+    /// <returns>Merged speech segments with offsets in samples and times in seconds.</returns>
+    public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
+    {
+        Reset();
+
+        using (var audioFile = new AudioFileReader(wavFile.FullName))
+        {
+            List<float> speechProbList = new List<float>();
+            float[] buffer = new float[this._windowSizeSample];
+            int totalSamples = 0;
+            int samplesRead;
+
+            while ((samplesRead = audioFile.Read(buffer, 0, buffer.Length)) > 0)
+            {
+                if (samplesRead < buffer.Length)
+                {
+                    // Zero-pad a short final window so samples left over from the
+                    // previous window are not fed to the model again.
+                    Array.Clear(buffer, samplesRead, buffer.Length - samplesRead);
+                }
+
+                totalSamples += samplesRead;
+                float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
+                speechProbList.Add(speechProb);
+            }
+
+            // Count the samples actually delivered by the reader. AudioFileReader.Length is a
+            // byte count of the 32-bit float output, so "Length / 2" overstated the length.
+            this._audioLengthSamples = totalSamples;
+            return CalculateProb(speechProbList);
+        }
+    }
+
+    /// <summary>
+    /// Turns per-window speech probabilities into padded, merged speech segments.
+    /// </summary>
+    /// <param name="speechProbList">One probability per window, in file order.</param>
+    /// <returns>The detected segments.</returns>
+    private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
+    {
+        List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
+        bool triggered = false;
+        int tempEnd = 0, prevEnd = 0, nextStart = 0;
+        SileroSpeechSegment segment = new SileroSpeechSegment();
+
+        for (int i = 0; i < speechProbList.Count; i++)
+        {
+            float speechProb = speechProbList[i];
+
+            // Speech resumed after a tentative end: drop the tentative end.
+            if (speechProb >= _threshold && (tempEnd != 0))
+            {
+                tempEnd = 0;
+                if (nextStart < prevEnd)
+                {
+                    nextStart = _windowSizeSample * i;
+                }
+            }
+
+            // The first speech window opens a new segment.
+            if (speechProb >= _threshold && !triggered)
+            {
+                triggered = true;
+                segment.StartOffset = _windowSizeSample * i;
+                continue;
+            }
+
+            // The running segment exceeded the maximum length: split it.
+            if (triggered && (_windowSizeSample * i) - segment.StartOffset > _maxSpeechSamples)
+            {
+                if (prevEnd != 0)
+                {
+                    // Split at the last sufficiently long silence seen inside the segment.
+                    segment.EndOffset = prevEnd;
+                    result.Add(segment);
+                    segment = new SileroSpeechSegment();
+                    if (nextStart < prevEnd)
+                    {
+                        triggered = false;
+                    }
+                    else
+                    {
+                        segment.StartOffset = nextStart;
+                    }
+
+                    prevEnd = 0;
+                    nextStart = 0;
+                    tempEnd = 0;
+                }
+                else
+                {
+                    // No usable silence was recorded: cut at the current window.
+                    segment.EndOffset = _windowSizeSample * i;
+                    result.Add(segment);
+                    segment = new SileroSpeechSegment();
+                    prevEnd = 0;
+                    nextStart = 0;
+                    tempEnd = 0;
+                    triggered = false;
+                    continue;
+                }
+            }
+
+            if (speechProb < _negThreshold && triggered)
+            {
+                if (tempEnd == 0)
+                {
+                    tempEnd = _windowSizeSample * i;
+                }
+
+                if (((_windowSizeSample * i) - tempEnd) > _minSilenceSamplesAtMaxSpeech)
+                {
+                    prevEnd = tempEnd;
+                }
+
+                if ((_windowSizeSample * i) - tempEnd < _minSilenceSamples)
+                {
+                    continue;
+                }
+
+                // Silence lasted long enough: close the segment at the tentative end.
+                segment.EndOffset = tempEnd;
+                if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
+                {
+                    result.Add(segment);
+                }
+
+                segment = new SileroSpeechSegment();
+                prevEnd = 0;
+                nextStart = 0;
+                tempEnd = 0;
+                triggered = false;
+            }
+        }
+
+        // A segment still open at the end of the audio runs to the last sample.
+        if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
+        {
+            segment.EndOffset = _audioLengthSamples;
+            result.Add(segment);
+        }
+
+        // Pad every segment; neighbours closer than two pads share the gap equally.
+        for (int i = 0; i < result.Count; i++)
+        {
+            SileroSpeechSegment item = result[i];
+            if (i == 0)
+            {
+                item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
+            }
+
+            if (i != result.Count - 1)
+            {
+                SileroSpeechSegment nextItem = result[i + 1];
+                int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
+                if (silenceDuration < 2 * _speechPadSamples)
+                {
+                    item.EndOffset = item.EndOffset + (silenceDuration / 2);
+                    nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
+                }
+                else
+                {
+                    item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
+                    nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
+                }
+            }
+            else
+            {
+                item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
+            }
+        }
+
+        return MergeListAndCalculateSecond(result, _samplingRate);
+    }
+
+    /// <summary>
+    /// Merges overlapping or touching segments and fills in their start/end times in seconds.
+    /// </summary>
+    /// <param name="original">Segments to merge; the list is sorted in place by start offset.</param>
+    /// <param name="samplingRate">Sampling rate used to convert offsets to seconds.</param>
+    /// <returns>A new list of non-overlapping segments ordered by start offset.</returns>
+    private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
+    {
+        List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
+        if (original == null || original.Count == 0)
+        {
+            return result;
+        }
+
+        // Sort before taking the first segment as the merge seed; seeding first would
+        // start from the wrong segment whenever the input is not already ordered.
+        original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
+
+        int left = original[0].StartOffset.Value;
+        int right = original[0].EndOffset.Value;
+        for (int i = 1; i < original.Count; i++)
+        {
+            SileroSpeechSegment segment = original[i];
+
+            if (segment.StartOffset > right)
+            {
+                result.Add(new SileroSpeechSegment(left, right,
+                    CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
+                left = segment.StartOffset.Value;
+                right = segment.EndOffset.Value;
+            }
+            else
+            {
+                right = Math.Max(right, segment.EndOffset.Value);
+            }
+        }
+
+        result.Add(new SileroSpeechSegment(left, right,
+            CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
+        return result;
+    }
+
+    /// <summary>Converts a sample offset to seconds, rounded down to whole milliseconds.</summary>
+    private float CalculateSecondByOffset(int offset, int samplingRate)
+    {
+        float secondValue = offset * 1.0f / samplingRate;
+        return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
+    }
+}
\ No newline at end of file
diff --git a/examples/csharp/SileroVadOnnxModel.cs b/examples/csharp/SileroVadOnnxModel.cs
new file mode 100644
index 0000000..f44ebe6
--- /dev/null
+++ b/examples/csharp/SileroVadOnnxModel.cs
@@ -0,0 +1,251 @@
+using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntime.Tensors;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace VADdotnet;
+
+/// <summary>
+/// Wrapper around the Silero VAD ONNX model that carries the model state and the
+/// trailing audio context from one call to the next.
+/// </summary>
+public class SileroVadOnnxModel : IDisposable
+{
+    private readonly InferenceSession session;
+    private float[][][] state;
+    private float[][] context;
+    private int lastSr = 0;
+    private int lastBatchSize = 0;
+    private static readonly List<int> SAMPLE_RATES = new List<int> { 8000, 16000 };
+
+    /// <summary>Creates an inference session for the model file with both thread counts set to 1.</summary>
+    /// <param name="modelPath">Path of the ONNX model file.</param>
+    public SileroVadOnnxModel(string modelPath)
+    {
+        var sessionOptions = new SessionOptions();
+        sessionOptions.InterOpNumThreads = 1;
+        sessionOptions.IntraOpNumThreads = 1;
+        sessionOptions.EnableCpuMemArena = true;
+
+        session = new InferenceSession(modelPath, sessionOptions);
+        ResetStates();
+    }
+
+    /// <summary>Zeroes the model state and forgets the context, sample rate and batch size.</summary>
+    public void ResetStates()
+    {
+        state = new float[2][][];
+        state[0] = new float[1][];
+        state[1] = new float[1][];
+        state[0][0] = new float[128];
+        state[1][0] = new float[128];
+        context = Array.Empty<float[]>();
+        lastSr = 0;
+        lastBatchSize = 0;
+    }
+
+    /// <summary>Disposes the inference session.</summary>
+    public void Dispose()
+    {
+        session?.Dispose();
+    }
+
+    /// <summary>Audio and sample rate after validation (and decimation, if any).</summary>
+    public class ValidationResult
+    {
+        public float[][] X { get; }
+        public int Sr { get; }
+
+        public ValidationResult(float[][] x, int sr)
+        {
+            X = x;
+            Sr = sr;
+        }
+    }
+
+    /// <summary>
+    /// Checks the batch size and sample rate. Rates that are a larger multiple of 16000
+    /// are reduced to 16000 by keeping every n-th sample.
+    /// </summary>
+    /// <exception cref="ArgumentException">
+    /// The batch is empty or too large, the sample rate is unsupported, or the audio is too short.
+    /// </exception>
+    private ValidationResult ValidateInput(float[][] x, int sr)
+    {
+        if (x.Length == 0)
+        {
+            // Guard the x[0] accesses below, which would otherwise fail with an index error.
+            throw new ArgumentException("Input audio batch is empty");
+        }
+
+        if (x.Length > 2)
+        {
+            throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}");
+        }
+
+        // Require sr > 16000 so that 0 or negative rates cannot produce a zero or negative step.
+        if (sr > 16000 && (sr % 16000 == 0))
+        {
+            int step = sr / 16000;
+            float[][] reducedX = new float[x.Length][];
+
+            for (int i = 0; i < x.Length; i++)
+            {
+                float[] current = x[i];
+                float[] newArr = new float[(current.Length + step - 1) / step];
+
+                for (int j = 0, index = 0; j < current.Length; j += step, index++)
+                {
+                    newArr[index] = current[j];
+                }
+
+                reducedX[i] = newArr;
+            }
+
+            x = reducedX;
+            sr = 16000;
+        }
+
+        if (!SAMPLE_RATES.Contains(sr))
+        {
+            throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
+        }
+
+        if (((float)sr) / x[0].Length > 31.25)
+        {
+            throw new ArgumentException("Input audio is too short");
+        }
+
+        return new ValidationResult(x, sr);
+    }
+
+    /// <summary>Appends the columns of <paramref name="b"/> to those of <paramref name="a"/>, row by row.</summary>
+    private static float[][] Concatenate(float[][] a, float[][] b)
+    {
+        if (a.Length != b.Length)
+        {
+            throw new ArgumentException("The number of rows in both arrays must be the same.");
+        }
+
+        int rows = a.Length;
+        int colsA = a[0].Length;
+        int colsB = b[0].Length;
+        float[][] result = new float[rows][];
+
+        for (int i = 0; i < rows; i++)
+        {
+            result[i] = new float[colsA + colsB];
+            Array.Copy(a[i], 0, result[i], 0, colsA);
+            Array.Copy(b[i], 0, result[i], colsA, colsB);
+        }
+
+        return result;
+    }
+
+    /// <summary>Returns a copy of the last <paramref name="contextSize"/> columns of every row.</summary>
+    private static float[][] GetLastColumns(float[][] array, int contextSize)
+    {
+        int rows = array.Length;
+        int cols = array[0].Length;
+
+        if (contextSize > cols)
+        {
+            throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
+        }
+
+        float[][] result = new float[rows][];
+
+        for (int i = 0; i < rows; i++)
+        {
+            result[i] = new float[contextSize];
+            Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
+        }
+
+        return result;
+    }
+
+    /// <summary>
+    /// Runs the model on one window per batch row, prefixed with the context kept from the previous call.
+    /// </summary>
+    /// <param name="x">Batch of audio windows: 512 samples per row at 16000 Hz, 256 at 8000 Hz.</param>
+    /// <param name="sr">Sample rate of <paramref name="x"/>.</param>
+    /// <returns>The contents of the model's "output" tensor as a flat array.</returns>
+    /// <exception cref="ArgumentException">The input fails validation or has the wrong window size.</exception>
+    public float[] Call(float[][] x, int sr)
+    {
+        var result = ValidateInput(x, sr);
+        x = result.X;
+        sr = result.Sr;
+        int numberSamples = sr == 16000 ? 512 : 256;
+
+        if (x[0].Length != numberSamples)
+        {
+            throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
+        }
+
+        int batchSize = x.Length;
+        int contextSize = sr == 16000 ? 64 : 32;
+
+        // Start from a clean state on the first call and whenever the rate or batch size changes.
+        if (lastBatchSize == 0)
+        {
+            ResetStates();
+        }
+
+        if (lastSr != 0 && lastSr != sr)
+        {
+            ResetStates();
+        }
+
+        if (lastBatchSize != 0 && lastBatchSize != batchSize)
+        {
+            ResetStates();
+        }
+
+        if (context.Length == 0)
+        {
+            context = new float[batchSize][];
+            for (int i = 0; i < batchSize; i++)
+            {
+                context[i] = new float[contextSize];
+            }
+        }
+
+        x = Concatenate(context, x);
+
+        var inputs = new List<NamedOnnxValue>
+        {
+            NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), new[] { x.Length, x[0].Length })),
+            NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, new[] { 1 })),
+            NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), new[] { state.Length, state[0].Length, state[0][0].Length }))
+        };
+
+        using (var outputs = session.Run(inputs))
+        {
+            var output = outputs.First(o => o.Name == "output").AsTensor<float>();
+            var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();
+
+            context = GetLastColumns(x, contextSize);
+            lastSr = sr;
+            lastBatchSize = batchSize;
+
+            // Copy the returned state tensor back into the jagged array for the next call.
+            state = new float[newState.Dimensions[0]][][];
+            for (int i = 0; i < newState.Dimensions[0]; i++)
+            {
+                state[i] = new float[newState.Dimensions[1]][];
+                for (int j = 0; j < newState.Dimensions[1]; j++)
+                {
+                    state[i][j] = new float[newState.Dimensions[2]];
+                    for (int k = 0; k < newState.Dimensions[2]; k++)
+                    {
+                        state[i][j][k] = newState[i, j, k];
+                    }
+                }
+            }
+
+            return output.ToArray();
+        }
+    }
+}
diff --git a/examples/csharp/VadDotNet.csproj b/examples/csharp/VadDotNet.csproj
new file mode 100644
index 0000000..538fbf9
--- /dev/null
+++ b/examples/csharp/VadDotNet.csproj
@@ -0,0 +1,26 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <!-- NOTE(review): the XML markup of this file was lost in the scraped patch and has been reconstructed; confirm the package versions and item globs. -->
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.18.1" />
+    <PackageReference Include="NAudio" Version="2.2.1" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Folder Include="resources\" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Content Include="resources\**">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
+  </ItemGroup>
+
+</Project>
diff --git a/examples/csharp/resources/put_model_here.txt b/examples/csharp/resources/put_model_here.txt
new file mode 100644
index 0000000..eb915d6
--- /dev/null
+++ b/examples/csharp/resources/put_model_here.txt
@@ -0,0 +1 @@
+place onnx model file and example.wav file in this folder