add csharp example

nick.ganju
2024-07-20 22:59:18 -04:00
parent 7af8628a27
commit cb25c0c047
6 changed files with 552 additions and 0 deletions


@@ -0,0 +1,35 @@
using System.Text;
namespace VadDotNet;
class Program
{
private const string MODEL_PATH = "./resources/silero_vad.onnx";
private const string EXAMPLE_WAV_FILE = "./resources/example.wav";
private const int SAMPLE_RATE = 16000;
private const float THRESHOLD = 0.5f;
private const int MIN_SPEECH_DURATION_MS = 250;
private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
private const int MIN_SILENCE_DURATION_MS = 100;
private const int SPEECH_PAD_MS = 30;
public static void Main(string[] args)
{
var vadDetector = new SileroVadDetector(MODEL_PATH, THRESHOLD, SAMPLE_RATE,
MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));
//Console.WriteLine(speechTimeList.ToJson());
StringBuilder sb = new StringBuilder();
foreach (var speechSegment in speechTimeList)
{
sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");
}
Console.WriteLine(sb.ToString());
}
}
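
A note on the commented-out ToJson() call above: no such helper ships in this commit. Below is a minimal sketch of equivalent JSON output using System.Text.Json; the SegmentJson class and its ToJson extension are hypothetical names, not part of the commit.

using System.Collections.Generic;
using System.Text.Json;

namespace VadDotNet;

// Hypothetical helper (not in this commit): serialize detected segments to indented JSON.
public static class SegmentJson
{
    public static string ToJson(this IEnumerable<SileroSpeechSegment> segments) =>
        JsonSerializer.Serialize(segments, new JsonSerializerOptions { WriteIndented = true });
}

With such a helper in scope, the commented-out Console.WriteLine(speechTimeList.ToJson()) line would compile and print the segments as JSON instead of the plain text built above.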


@@ -0,0 +1,21 @@
namespace VadDotNet;
public class SileroSpeechSegment
{
public int? StartOffset { get; set; }
public int? EndOffset { get; set; }
public float? StartSecond { get; set; }
public float? EndSecond { get; set; }
public SileroSpeechSegment()
{
}
public SileroSpeechSegment(int startOffset, int? endOffset, float? startSecond, float? endSecond)
{
StartOffset = startOffset;
EndOffset = endOffset;
StartSecond = startSecond;
EndSecond = endSecond;
}
}
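
StartOffset and EndOffset are sample indices into the source audio, while StartSecond and EndSecond express the same boundaries in seconds. Below is a minimal sketch of cutting a segment's audio out of an in-memory sample buffer using those offsets; the SpeechSegmentExtensions class is illustrative, not part of the commit.

using System;

namespace VadDotNet;

// Illustrative extension (not in this commit): slice a segment's samples out of a buffer.
public static class SpeechSegmentExtensions
{
    public static float[] ExtractSamples(this SileroSpeechSegment segment, float[] samples)
    {
        // Offsets are the sample indices produced by SileroVadDetector; clamp them to the buffer.
        int start = Math.Clamp(segment.StartOffset ?? 0, 0, samples.Length);
        int end = Math.Clamp(segment.EndOffset ?? samples.Length, start, samples.Length);
        return samples[start..end];
    }
}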


@@ -0,0 +1,250 @@
using NAudio.Wave;
namespace VadDotNet;
public class SileroVadDetector
{
private readonly SileroVadOnnxModel _model;
private readonly float _threshold;
private readonly float _negThreshold;
private readonly int _samplingRate;
private readonly int _windowSizeSample;
private readonly float _minSpeechSamples;
private readonly float _speechPadSamples;
private readonly float _maxSpeechSamples;
private readonly float _minSilenceSamples;
private readonly float _minSilenceSamplesAtMaxSpeech;
private int _audioLengthSamples;
private const float THRESHOLD_GAP = 0.15f;
// ReSharper disable once InconsistentNaming
private const int SAMPLING_RATE_8K = 8000;
// ReSharper disable once InconsistentNaming
private const int SAMPLING_RATE_16K = 16000;
public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
int minSpeechDurationMs, float maxSpeechDurationSeconds,
int minSilenceDurationMs, int speechPadMs)
{
if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
{
throw new ArgumentException("Sampling rate not supported; only 8000 and 16000 are available.");
}
this._model = new SileroVadOnnxModel(onnxModelPath);
this._samplingRate = samplingRate;
this._threshold = threshold;
this._negThreshold = threshold - THRESHOLD_GAP;
this._windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;
this._minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
this._speechPadSamples = samplingRate * speechPadMs / 1000f;
this._maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
this._minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
this._minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
this.Reset();
}
public void Reset()
{
_model.ResetStates();
}
public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
{
Reset();
using (var audioFile = new AudioFileReader(wavFile.FullName))
{
List<float> speechProbList = new List<float>();
// AudioFileReader reports Length in bytes of the converted 32-bit float stream,
// so divide by 4 to get the sample count (mono audio assumed).
this._audioLengthSamples = (int)(audioFile.Length / 4);
float[] buffer = new float[this._windowSizeSample];
while (audioFile.Read(buffer, 0, buffer.Length) > 0)
{
float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
speechProbList.Add(speechProb);
}
return CalculateProb(speechProbList);
}
}
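// Turns the per-window speech probabilities into segments with a hysteresis state machine:
// a probability at or above _threshold opens a segment, and the segment closes only after
// the probability has stayed below _negThreshold for at least _minSilenceSamples
// (with extra handling to split segments that exceed _maxSpeechSamples).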
private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
{
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
bool triggered = false;
int tempEnd = 0, prevEnd = 0, nextStart = 0;
SileroSpeechSegment segment = new SileroSpeechSegment();
for (int i = 0; i < speechProbList.Count; i++)
{
float speechProb = speechProbList[i];
if (speechProb >= _threshold && (tempEnd != 0))
{
tempEnd = 0;
if (nextStart < prevEnd)
{
nextStart = _windowSizeSample * i;
}
}
if (speechProb >= _threshold && !triggered)
{
triggered = true;
segment.StartOffset = _windowSizeSample * i;
continue;
}
if (triggered && (_windowSizeSample * i) - segment.StartOffset > _maxSpeechSamples)
{
if (prevEnd != 0)
{
segment.EndOffset = prevEnd;
result.Add(segment);
segment = new SileroSpeechSegment();
if (nextStart < prevEnd)
{
triggered = false;
}
else
{
segment.StartOffset = nextStart;
}
prevEnd = 0;
nextStart = 0;
tempEnd = 0;
}
else
{
segment.EndOffset = _windowSizeSample * i;
result.Add(segment);
segment = new SileroSpeechSegment();
prevEnd = 0;
nextStart = 0;
tempEnd = 0;
triggered = false;
continue;
}
}
if (speechProb < _negThreshold && triggered)
{
if (tempEnd == 0)
{
tempEnd = _windowSizeSample * i;
}
if (((_windowSizeSample * i) - tempEnd) > _minSilenceSamplesAtMaxSpeech)
{
prevEnd = tempEnd;
}
if ((_windowSizeSample * i) - tempEnd < _minSilenceSamples)
{
continue;
}
else
{
segment.EndOffset = tempEnd;
if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
{
result.Add(segment);
}
segment = new SileroSpeechSegment();
prevEnd = 0;
nextStart = 0;
tempEnd = 0;
triggered = false;
continue;
}
}
}
if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
{
segment.EndOffset = _audioLengthSamples;
result.Add(segment);
}
for (int i = 0; i < result.Count; i++)
{
SileroSpeechSegment item = result[i];
if (i == 0)
{
item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
}
if (i != result.Count - 1)
{
SileroSpeechSegment nextItem = result[i + 1];
int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
if (silenceDuration < 2 * _speechPadSamples)
{
item.EndOffset = item.EndOffset + (silenceDuration / 2);
nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
}
else
{
item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
}
}
else
{
item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
}
}
return MergeListAndCalculateSecond(result, _samplingRate);
}
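// Sorts the padded segments, merges any that overlap or touch, and fills in the
// corresponding start/end times in seconds.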
private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
{
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
if (original == null || original.Count == 0)
{
return result;
}
int left = original[0].StartOffset.Value;
int right = original[0].EndOffset.Value;
if (original.Count > 1)
{
original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
for (int i = 1; i < original.Count; i++)
{
SileroSpeechSegment segment = original[i];
if (segment.StartOffset > right)
{
result.Add(new SileroSpeechSegment(left, right,
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
left = segment.StartOffset.Value;
right = segment.EndOffset.Value;
}
else
{
right = Math.Max(right, segment.EndOffset.Value);
}
}
result.Add(new SileroSpeechSegment(left, right,
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
}
else
{
result.Add(new SileroSpeechSegment(left, right,
CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
}
return result;
}
private float CalculateSecondByOffset(int offset, int samplingRate)
{
float secondValue = offset * 1.0f / samplingRate;
return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
}
}
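
GetSpeechSegmentList above reads the WAV file in fixed windows (512 samples at 16 kHz, 256 at 8 kHz) and collects one speech probability per window. For callers that already hold PCM float samples in memory, a minimal sketch of the same windowing loop run directly against SileroVadOnnxModel follows; the VadWindowing helper is illustrative, not part of the commit.

using System;
using System.Collections.Generic;
using VadDotNet;

// Illustrative helper (not in this commit): score in-memory samples window by window.
public static class VadWindowing
{
    public static List<float> GetSpeechProbabilities(SileroVadOnnxModel model, float[] samples, int sampleRate = 16000)
    {
        // Same window sizes as SileroVadDetector: 512 samples at 16 kHz, 256 at 8 kHz.
        int windowSize = sampleRate == 16000 ? 512 : 256;
        var probabilities = new List<float>();
        model.ResetStates();
        for (int offset = 0; offset + windowSize <= samples.Length; offset += windowSize)
        {
            float[] window = new float[windowSize];
            Array.Copy(samples, offset, window, 0, windowSize);
            // Call takes a batch of windows and returns one probability per window;
            // a trailing partial window is skipped here.
            probabilities.Add(model.Call(new[] { window }, sampleRate)[0]);
        }
        return probabilities;
    }
}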


@@ -0,0 +1,220 @@
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Collections.Generic;
using System.Linq;
namespace VadDotNet;
public class SileroVadOnnxModel : IDisposable
{
private readonly InferenceSession session;
private float[][][] state;
private float[][] context;
private int lastSr = 0;
private int lastBatchSize = 0;
private static readonly List<int> SAMPLE_RATES = new List<int> { 8000, 16000 };
public SileroVadOnnxModel(string modelPath)
{
var sessionOptions = new SessionOptions();
sessionOptions.InterOpNumThreads = 1;
sessionOptions.IntraOpNumThreads = 1;
sessionOptions.EnableCpuMemArena = true;
session = new InferenceSession(modelPath, sessionOptions);
ResetStates();
}
public void ResetStates()
{
state = new float[2][][];
state[0] = new float[1][];
state[1] = new float[1][];
state[0][0] = new float[128];
state[1][0] = new float[128];
context = Array.Empty<float[]>();
lastSr = 0;
lastBatchSize = 0;
}
public void Dispose()
{
session?.Dispose();
}
public class ValidationResult
{
public float[][] X { get; }
public int Sr { get; }
public ValidationResult(float[][] x, int sr)
{
X = x;
Sr = sr;
}
}
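// Normalizes the input before inference: downsamples multiples of 16 kHz by simple
// decimation, and rejects unsupported sample rates or chunks shorter than 32 ms.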
private ValidationResult ValidateInput(float[][] x, int sr)
{
if (x.Length == 1)
{
x = new float[][] { x[0] };
}
if (x.Length > 2)
{
throw new ArgumentException($"Incorrect audio data dimension: {x.Length}");
}
if (sr != 16000 && (sr % 16000 == 0))
{
int step = sr / 16000;
float[][] reducedX = new float[x.Length][];
for (int i = 0; i < x.Length; i++)
{
float[] current = x[i];
float[] newArr = new float[(current.Length + step - 1) / step];
for (int j = 0, index = 0; j < current.Length; j += step, index++)
{
newArr[index] = current[j];
}
reducedX[i] = newArr;
}
x = reducedX;
sr = 16000;
}
if (!SAMPLE_RATES.Contains(sr))
{
throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
}
if (((float)sr) / x[0].Length > 31.25)
{
throw new ArgumentException("Input audio is too short");
}
return new ValidationResult(x, sr);
}
private static float[][] Concatenate(float[][] a, float[][] b)
{
if (a.Length != b.Length)
{
throw new ArgumentException("The number of rows in both arrays must be the same.");
}
int rows = a.Length;
int colsA = a[0].Length;
int colsB = b[0].Length;
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[colsA + colsB];
Array.Copy(a[i], 0, result[i], 0, colsA);
Array.Copy(b[i], 0, result[i], colsA, colsB);
}
return result;
}
private static float[][] GetLastColumns(float[][] array, int contextSize)
{
int rows = array.Length;
int cols = array[0].Length;
if (contextSize > cols)
{
throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
}
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[contextSize];
Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
}
return result;
}
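// Runs one inference step: prepends the saved context (the last 64 samples at 16 kHz,
// 32 at 8 kHz, of the previous window), feeds input, sr and state to the ONNX session,
// and keeps the returned state and new context for the next call.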
public float[] Call(float[][] x, int sr)
{
var result = ValidateInput(x, sr);
x = result.X;
sr = result.Sr;
int numberSamples = sr == 16000 ? 512 : 256;
if (x[0].Length != numberSamples)
{
throw new ArgumentException($"Provided number of samples is {x[0].Length} (supported: 256 at 8000 Hz, 512 at 16000 Hz)");
}
int batchSize = x.Length;
int contextSize = sr == 16000 ? 64 : 32;
if (lastBatchSize == 0)
{
ResetStates();
}
if (lastSr != 0 && lastSr != sr)
{
ResetStates();
}
if (lastBatchSize != 0 && lastBatchSize != batchSize)
{
ResetStates();
}
if (context.Length == 0)
{
context = new float[batchSize][];
for (int i = 0; i < batchSize; i++)
{
context[i] = new float[contextSize];
}
}
x = Concatenate(context, x);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), new[] { x.Length, x[0].Length })),
NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, new[] { 1 })),
NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), new[] { state.Length, state[0].Length, state[0][0].Length }))
};
using (var outputs = session.Run(inputs))
{
var output = outputs.First(o => o.Name == "output").AsTensor<float>();
var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();
context = GetLastColumns(x, contextSize);
lastSr = sr;
lastBatchSize = batchSize;
state = new float[newState.Dimensions[0]][][];
for (int i = 0; i < newState.Dimensions[0]; i++)
{
state[i] = new float[newState.Dimensions[1]][];
for (int j = 0; j < newState.Dimensions[1]; j++)
{
state[i][j] = new float[newState.Dimensions[2]];
for (int k = 0; k < newState.Dimensions[2]; k++)
{
state[i][j][k] = newState[i, j, k];
}
}
}
return output.ToArray();
}
}
}
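
SileroVadOnnxModel owns an InferenceSession and implements IDisposable, but SileroVadDetector above never disposes the model it creates. When the model is constructed directly, a using block releases the session deterministically; a minimal sketch, with an empty window standing in for real audio:

using System;
using VadDotNet;

// Illustrative snippet (not in this commit): construct, use, and dispose the model directly.
using (var model = new SileroVadOnnxModel("./resources/silero_vad.onnx"))
{
    float[] window = new float[512]; // one 32 ms window of 16 kHz audio; fill with real samples in practice
    float probability = model.Call(new[] { window }, 16000)[0];
    Console.WriteLine($"speech probability: {probability:F3}");
}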


@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.18.1" />
<PackageReference Include="NAudio" Version="2.2.1" />
</ItemGroup>
<ItemGroup>
<Folder Include="resources\" />
</ItemGroup>
<ItemGroup>
<Content Include="resources\**">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
</Project>
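
The Content glob copies everything under resources\ into the output directory, so the relative ./resources/... paths in Program.cs resolve both when running from the project directory and next to the built binary. Assuming the model and WAV file are in place, the example can be restored and run with the standard .NET CLI from the project directory:

dotnet restore
dotnet run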


@@ -0,0 +1 @@
Place the silero_vad.onnx model file and an example.wav file in this folder; Program.cs loads them from ./resources/silero_vad.onnx and ./resources/example.wav.