From 7b0aaa1c4c2ecc9caeb6efc8a6685a1ea6831077 Mon Sep 17 00:00:00 2001 From: dongfp Date: Mon, 10 Nov 2025 15:58:20 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8DCalculateProb=E6=96=B9?= =?UTF-8?q?=E6=B3=95=E8=AE=A1=E7=AE=97=E5=8F=A5=E5=AD=90EndOffset=E7=9A=84?= =?UTF-8?q?bug=20=E4=BF=AE=E6=94=B9=E8=AF=AD=E6=B3=95=E6=8F=90=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/csharp/Program.cs | 2 +- examples/csharp/SileroVadDetector.cs | 40 +-- examples/csharp/SileroVadOnnxModel.cs | 377 +++++++++++++------------- examples/csharp/VadDotNet.sln | 25 ++ 4 files changed, 233 insertions(+), 211 deletions(-) create mode 100644 examples/csharp/VadDotNet.sln diff --git a/examples/csharp/Program.cs b/examples/csharp/Program.cs index 131a51d..13a27ca 100644 --- a/examples/csharp/Program.cs +++ b/examples/csharp/Program.cs @@ -21,7 +21,7 @@ class Program MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS); List speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE)); //Console.WriteLine(speechTimeList.ToJson()); - StringBuilder sb = new StringBuilder(); + StringBuilder sb = new(); foreach (var speechSegment in speechTimeList) { sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n"); diff --git a/examples/csharp/SileroVadDetector.cs b/examples/csharp/SileroVadDetector.cs index 8aec4ab..051e5a8 100644 --- a/examples/csharp/SileroVadDetector.cs +++ b/examples/csharp/SileroVadDetector.cs @@ -53,28 +53,26 @@ public class SileroVadDetector { Reset(); - using (var audioFile = new AudioFileReader(wavFile.FullName)) + using var audioFile = new AudioFileReader(wavFile.FullName); + List speechProbList = []; + this._audioLengthSamples = (int)(audioFile.Length / 2); + float[] buffer = new float[this._windowSizeSample]; + + while (audioFile.Read(buffer, 0, buffer.Length) > 0) { - List speechProbList = new List(); - this._audioLengthSamples = (int)(audioFile.Length / 2); - float[] buffer = new float[this._windowSizeSample]; - - while (audioFile.Read(buffer, 0, buffer.Length) > 0) - { - float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0]; - speechProbList.Add(speechProb); - } - - return CalculateProb(speechProbList); + float speechProb = _model.Call([buffer], _samplingRate)[0]; + speechProbList.Add(speechProb); } + + return CalculateProb(speechProbList); } private List CalculateProb(List speechProbList) { - List result = new List(); + List result = []; bool triggered = false; int tempEnd = 0, prevEnd = 0, nextStart = 0; - SileroSpeechSegment segment = new SileroSpeechSegment(); + SileroSpeechSegment segment = new(); for (int i = 0; i < speechProbList.Count; i++) { @@ -164,7 +162,8 @@ public class SileroVadDetector if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples) { - segment.EndOffset = _audioLengthSamples; + //segment.EndOffset = _audioLengthSamples; + segment.EndOffset = speechProbList.Count * _windowSizeSample; result.Add(segment); } @@ -182,7 +181,7 @@ public class SileroVadDetector int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value; if (silenceDuration < 2 * _speechPadSamples) { - item.EndOffset = item.EndOffset + (silenceDuration / 2); + item.EndOffset += (silenceDuration / 2); nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2)); } else @@ -200,9 +199,9 @@ public class SileroVadDetector return MergeListAndCalculateSecond(result, _samplingRate); } - private List MergeListAndCalculateSecond(List original, int samplingRate) + private static List MergeListAndCalculateSecond(List original, int samplingRate) { - List result = new List(); + List result = []; if (original == null || original.Count == 0) { return result; @@ -216,7 +215,10 @@ public class SileroVadDetector for (int i = 1; i < original.Count; i++) { SileroSpeechSegment segment = original[i]; + if (i == 235) + { + } if (segment.StartOffset > right) { result.Add(new SileroSpeechSegment(left, right, @@ -242,7 +244,7 @@ public class SileroVadDetector return result; } - private float CalculateSecondByOffset(int offset, int samplingRate) + private static float CalculateSecondByOffset(int offset, int samplingRate) { float secondValue = offset * 1.0f / samplingRate; return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f; diff --git a/examples/csharp/SileroVadOnnxModel.cs b/examples/csharp/SileroVadOnnxModel.cs index f44ebe6..b57d464 100644 --- a/examples/csharp/SileroVadOnnxModel.cs +++ b/examples/csharp/SileroVadOnnxModel.cs @@ -1,5 +1,6 @@ using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime.Tensors; + using System; using System.Collections.Generic; using System.Linq; @@ -7,214 +8,208 @@ using System.Linq; namespace VADdotnet; - public class SileroVadOnnxModel : IDisposable +public class SileroVadOnnxModel : IDisposable +{ + private readonly InferenceSession session; + private float[][][] state; + private float[][] context; + private int lastSr = 0; + private int lastBatchSize = 0; + private static readonly List SAMPLE_RATES = [8000, 16000]; + + public SileroVadOnnxModel(string modelPath) { - private readonly InferenceSession session; - private float[][][] state; - private float[][] context; - private int lastSr = 0; - private int lastBatchSize = 0; - private static readonly List SAMPLE_RATES = new List { 8000, 16000 }; - - public SileroVadOnnxModel(string modelPath) + var sessionOptions = new SessionOptions { - var sessionOptions = new SessionOptions(); - sessionOptions.InterOpNumThreads = 1; - sessionOptions.IntraOpNumThreads = 1; - sessionOptions.EnableCpuMemArena = true; + InterOpNumThreads = 1, + IntraOpNumThreads = 1, + EnableCpuMemArena = true + }; - session = new InferenceSession(modelPath, sessionOptions); + session = new InferenceSession(modelPath, sessionOptions); + ResetStates(); + } + + public void ResetStates() + { + state = new float[2][][]; + state[0] = new float[1][]; + state[1] = new float[1][]; + state[0][0] = new float[128]; + state[1][0] = new float[128]; + context = []; + lastSr = 0; + lastBatchSize = 0; + } + + public void Dispose() + { + GC.SuppressFinalize(this); + } + + public class ValidationResult(float[][] x, int sr) + { + public float[][] X { get; } = x; + public int Sr { get; } = sr; + } + + private static ValidationResult ValidateInput(float[][] x, int sr) + { + if (x.Length == 1) + { + x = [x[0]]; + } + if (x.Length > 2) + { + throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}"); + } + + if (sr != 16000 && (sr % 16000 == 0)) + { + int step = sr / 16000; + float[][] reducedX = new float[x.Length][]; + + for (int i = 0; i < x.Length; i++) + { + float[] current = x[i]; + float[] newArr = new float[(current.Length + step - 1) / step]; + + for (int j = 0, index = 0; j < current.Length; j += step, index++) + { + newArr[index] = current[j]; + } + + reducedX[i] = newArr; + } + + x = reducedX; + sr = 16000; + } + + if (!SAMPLE_RATES.Contains(sr)) + { + throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)"); + } + + if (((float)sr) / x[0].Length > 31.25) + { + throw new ArgumentException("Input audio is too short"); + } + + return new ValidationResult(x, sr); + } + + private static float[][] Concatenate(float[][] a, float[][] b) + { + if (a.Length != b.Length) + { + throw new ArgumentException("The number of rows in both arrays must be the same."); + } + + int rows = a.Length; + int colsA = a[0].Length; + int colsB = b[0].Length; + float[][] result = new float[rows][]; + + for (int i = 0; i < rows; i++) + { + result[i] = new float[colsA + colsB]; + Array.Copy(a[i], 0, result[i], 0, colsA); + Array.Copy(b[i], 0, result[i], colsA, colsB); + } + + return result; + } + + private static float[][] GetLastColumns(float[][] array, int contextSize) + { + int rows = array.Length; + int cols = array[0].Length; + + if (contextSize > cols) + { + throw new ArgumentException("contextSize cannot be greater than the number of columns in the array."); + } + + float[][] result = new float[rows][]; + + for (int i = 0; i < rows; i++) + { + result[i] = new float[contextSize]; + Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize); + } + + return result; + } + + public float[] Call(float[][] x, int sr) + { + var result = ValidateInput(x, sr); + x = result.X; + sr = result.Sr; + int numberSamples = sr == 16000 ? 512 : 256; + + if (x[0].Length != numberSamples) + { + throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)"); + } + + int batchSize = x.Length; + int contextSize = sr == 16000 ? 64 : 32; + + if (lastBatchSize == 0) + { + ResetStates(); + } + if (lastSr != 0 && lastSr != sr) + { + ResetStates(); + } + if (lastBatchSize != 0 && lastBatchSize != batchSize) + { ResetStates(); } - public void ResetStates() + if (context.Length == 0) { - state = new float[2][][]; - state[0] = new float[1][]; - state[1] = new float[1][]; - state[0][0] = new float[128]; - state[1][0] = new float[128]; - context = Array.Empty(); - lastSr = 0; - lastBatchSize = 0; - } - - public void Dispose() - { - session?.Dispose(); - } - - public class ValidationResult - { - public float[][] X { get; } - public int Sr { get; } - - public ValidationResult(float[][] x, int sr) + context = new float[batchSize][]; + for (int i = 0; i < batchSize; i++) { - X = x; - Sr = sr; + context[i] = new float[contextSize]; } } - private ValidationResult ValidateInput(float[][] x, int sr) - { - if (x.Length == 1) + x = Concatenate(context, x); + + var inputs = new List { - x = new float[][] { x[0] }; - } - if (x.Length > 2) - { - throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}"); - } - - if (sr != 16000 && (sr % 16000 == 0)) - { - int step = sr / 16000; - float[][] reducedX = new float[x.Length][]; - - for (int i = 0; i < x.Length; i++) - { - float[] current = x[i]; - float[] newArr = new float[(current.Length + step - 1) / step]; - - for (int j = 0, index = 0; j < current.Length; j += step, index++) - { - newArr[index] = current[j]; - } - - reducedX[i] = newArr; - } - - x = reducedX; - sr = 16000; - } - - if (!SAMPLE_RATES.Contains(sr)) - { - throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)"); - } - - if (((float)sr) / x[0].Length > 31.25) - { - throw new ArgumentException("Input audio is too short"); - } - - return new ValidationResult(x, sr); - } - - private static float[][] Concatenate(float[][] a, float[][] b) - { - if (a.Length != b.Length) - { - throw new ArgumentException("The number of rows in both arrays must be the same."); - } - - int rows = a.Length; - int colsA = a[0].Length; - int colsB = b[0].Length; - float[][] result = new float[rows][]; - - for (int i = 0; i < rows; i++) - { - result[i] = new float[colsA + colsB]; - Array.Copy(a[i], 0, result[i], 0, colsA); - Array.Copy(b[i], 0, result[i], colsA, colsB); - } - - return result; - } - - private static float[][] GetLastColumns(float[][] array, int contextSize) - { - int rows = array.Length; - int cols = array[0].Length; - - if (contextSize > cols) - { - throw new ArgumentException("contextSize cannot be greater than the number of columns in the array."); - } - - float[][] result = new float[rows][]; - - for (int i = 0; i < rows; i++) - { - result[i] = new float[contextSize]; - Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize); - } - - return result; - } - - public float[] Call(float[][] x, int sr) - { - var result = ValidateInput(x, sr); - x = result.X; - sr = result.Sr; - int numberSamples = sr == 16000 ? 512 : 256; - - if (x[0].Length != numberSamples) - { - throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)"); - } - - int batchSize = x.Length; - int contextSize = sr == 16000 ? 64 : 32; - - if (lastBatchSize == 0) - { - ResetStates(); - } - if (lastSr != 0 && lastSr != sr) - { - ResetStates(); - } - if (lastBatchSize != 0 && lastBatchSize != batchSize) - { - ResetStates(); - } - - if (context.Length == 0) - { - context = new float[batchSize][]; - for (int i = 0; i < batchSize; i++) - { - context[i] = new float[contextSize]; - } - } - - x = Concatenate(context, x); - - var inputs = new List - { - NamedOnnxValue.CreateFromTensor("input", new DenseTensor(x.SelectMany(a => a).ToArray(), new[] { x.Length, x[0].Length })), - NamedOnnxValue.CreateFromTensor("sr", new DenseTensor(new[] { (long)sr }, new[] { 1 })), - NamedOnnxValue.CreateFromTensor("state", new DenseTensor(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), new[] { state.Length, state[0].Length, state[0][0].Length })) + NamedOnnxValue.CreateFromTensor("input", new DenseTensor(x.SelectMany(a => a).ToArray(), [x.Length, x[0].Length])), + NamedOnnxValue.CreateFromTensor("sr", new DenseTensor(new[] { (long)sr }, [1])), + NamedOnnxValue.CreateFromTensor("state", new DenseTensor(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), [state.Length, state[0].Length, state[0][0].Length])) }; - using (var outputs = session.Run(inputs)) + using var outputs = session.Run(inputs); + var output = outputs.First(o => o.Name == "output").AsTensor(); + var newState = outputs.First(o => o.Name == "stateN").AsTensor(); + + context = GetLastColumns(x, contextSize); + lastSr = sr; + lastBatchSize = batchSize; + + state = new float[newState.Dimensions[0]][][]; + for (int i = 0; i < newState.Dimensions[0]; i++) + { + state[i] = new float[newState.Dimensions[1]][]; + for (int j = 0; j < newState.Dimensions[1]; j++) { - var output = outputs.First(o => o.Name == "output").AsTensor(); - var newState = outputs.First(o => o.Name == "stateN").AsTensor(); - - context = GetLastColumns(x, contextSize); - lastSr = sr; - lastBatchSize = batchSize; - - state = new float[newState.Dimensions[0]][][]; - for (int i = 0; i < newState.Dimensions[0]; i++) + state[i][j] = new float[newState.Dimensions[2]]; + for (int k = 0; k < newState.Dimensions[2]; k++) { - state[i] = new float[newState.Dimensions[1]][]; - for (int j = 0; j < newState.Dimensions[1]; j++) - { - state[i][j] = new float[newState.Dimensions[2]]; - for (int k = 0; k < newState.Dimensions[2]; k++) - { - state[i][j][k] = newState[i, j, k]; - } - } + state[i][j][k] = newState[i, j, k]; } - - return output.ToArray(); } } + + return [.. output]; } +} diff --git a/examples/csharp/VadDotNet.sln b/examples/csharp/VadDotNet.sln new file mode 100644 index 0000000..430b66a --- /dev/null +++ b/examples/csharp/VadDotNet.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36616.10 d17.14 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadDotNet", "VadDotNet.csproj", "{F36E1741-EDDB-90C7-7501-4911058F8996}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {DFC4CEE8-1034-46B4-A5F4-D1649B3543E6} + EndGlobalSection +EndGlobal