修复CalculateProb方法计算句子EndOffset的bug

修改语法提示
This commit is contained in:
dongfp
2025-11-10 15:58:20 +08:00
parent be95df9152
commit 7b0aaa1c4c
4 changed files with 233 additions and 211 deletions

View File

@@ -21,7 +21,7 @@ class Program
MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));
//Console.WriteLine(speechTimeList.ToJson());
StringBuilder sb = new StringBuilder();
StringBuilder sb = new();
foreach (var speechSegment in speechTimeList)
{
sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");

View File

@@ -53,28 +53,26 @@ public class SileroVadDetector
{
Reset();
using (var audioFile = new AudioFileReader(wavFile.FullName))
using var audioFile = new AudioFileReader(wavFile.FullName);
List<float> speechProbList = [];
this._audioLengthSamples = (int)(audioFile.Length / 2);
float[] buffer = new float[this._windowSizeSample];
while (audioFile.Read(buffer, 0, buffer.Length) > 0)
{
List<float> speechProbList = new List<float>();
this._audioLengthSamples = (int)(audioFile.Length / 2);
float[] buffer = new float[this._windowSizeSample];
while (audioFile.Read(buffer, 0, buffer.Length) > 0)
{
float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
speechProbList.Add(speechProb);
}
return CalculateProb(speechProbList);
float speechProb = _model.Call([buffer], _samplingRate)[0];
speechProbList.Add(speechProb);
}
return CalculateProb(speechProbList);
}
private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
{
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
List<SileroSpeechSegment> result = [];
bool triggered = false;
int tempEnd = 0, prevEnd = 0, nextStart = 0;
SileroSpeechSegment segment = new SileroSpeechSegment();
SileroSpeechSegment segment = new();
for (int i = 0; i < speechProbList.Count; i++)
{
@@ -164,7 +162,8 @@ public class SileroVadDetector
if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
{
segment.EndOffset = _audioLengthSamples;
//segment.EndOffset = _audioLengthSamples;
segment.EndOffset = speechProbList.Count * _windowSizeSample;
result.Add(segment);
}
@@ -182,7 +181,7 @@ public class SileroVadDetector
int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
if (silenceDuration < 2 * _speechPadSamples)
{
item.EndOffset = item.EndOffset + (silenceDuration / 2);
item.EndOffset += (silenceDuration / 2);
nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
}
else
@@ -200,9 +199,9 @@ public class SileroVadDetector
return MergeListAndCalculateSecond(result, _samplingRate);
}
private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
private static List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
{
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>();
List<SileroSpeechSegment> result = [];
if (original == null || original.Count == 0)
{
return result;
@@ -216,7 +215,10 @@ public class SileroVadDetector
for (int i = 1; i < original.Count; i++)
{
SileroSpeechSegment segment = original[i];
if (i == 235)
{
}
if (segment.StartOffset > right)
{
result.Add(new SileroSpeechSegment(left, right,
@@ -242,7 +244,7 @@ public class SileroVadDetector
return result;
}
private float CalculateSecondByOffset(int offset, int samplingRate)
private static float CalculateSecondByOffset(int offset, int samplingRate)
{
float secondValue = offset * 1.0f / samplingRate;
return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;

View File

@@ -1,5 +1,6 @@
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Collections.Generic;
using System.Linq;
@@ -7,214 +8,208 @@ using System.Linq;
namespace VADdotnet;
public class SileroVadOnnxModel : IDisposable
public class SileroVadOnnxModel : IDisposable
{
private readonly InferenceSession session;
private float[][][] state;
private float[][] context;
private int lastSr = 0;
private int lastBatchSize = 0;
private static readonly List<int> SAMPLE_RATES = [8000, 16000];
public SileroVadOnnxModel(string modelPath)
{
private readonly InferenceSession session;
private float[][][] state;
private float[][] context;
private int lastSr = 0;
private int lastBatchSize = 0;
private static readonly List<int> SAMPLE_RATES = new List<int> { 8000, 16000 };
public SileroVadOnnxModel(string modelPath)
var sessionOptions = new SessionOptions
{
var sessionOptions = new SessionOptions();
sessionOptions.InterOpNumThreads = 1;
sessionOptions.IntraOpNumThreads = 1;
sessionOptions.EnableCpuMemArena = true;
InterOpNumThreads = 1,
IntraOpNumThreads = 1,
EnableCpuMemArena = true
};
session = new InferenceSession(modelPath, sessionOptions);
session = new InferenceSession(modelPath, sessionOptions);
ResetStates();
}
public void ResetStates()
{
state = new float[2][][];
state[0] = new float[1][];
state[1] = new float[1][];
state[0][0] = new float[128];
state[1][0] = new float[128];
context = [];
lastSr = 0;
lastBatchSize = 0;
}
public void Dispose()
{
GC.SuppressFinalize(this);
}
public class ValidationResult(float[][] x, int sr)
{
public float[][] X { get; } = x;
public int Sr { get; } = sr;
}
private static ValidationResult ValidateInput(float[][] x, int sr)
{
if (x.Length == 1)
{
x = [x[0]];
}
if (x.Length > 2)
{
throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}");
}
if (sr != 16000 && (sr % 16000 == 0))
{
int step = sr / 16000;
float[][] reducedX = new float[x.Length][];
for (int i = 0; i < x.Length; i++)
{
float[] current = x[i];
float[] newArr = new float[(current.Length + step - 1) / step];
for (int j = 0, index = 0; j < current.Length; j += step, index++)
{
newArr[index] = current[j];
}
reducedX[i] = newArr;
}
x = reducedX;
sr = 16000;
}
if (!SAMPLE_RATES.Contains(sr))
{
throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
}
if (((float)sr) / x[0].Length > 31.25)
{
throw new ArgumentException("Input audio is too short");
}
return new ValidationResult(x, sr);
}
private static float[][] Concatenate(float[][] a, float[][] b)
{
if (a.Length != b.Length)
{
throw new ArgumentException("The number of rows in both arrays must be the same.");
}
int rows = a.Length;
int colsA = a[0].Length;
int colsB = b[0].Length;
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[colsA + colsB];
Array.Copy(a[i], 0, result[i], 0, colsA);
Array.Copy(b[i], 0, result[i], colsA, colsB);
}
return result;
}
private static float[][] GetLastColumns(float[][] array, int contextSize)
{
int rows = array.Length;
int cols = array[0].Length;
if (contextSize > cols)
{
throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
}
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[contextSize];
Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
}
return result;
}
public float[] Call(float[][] x, int sr)
{
var result = ValidateInput(x, sr);
x = result.X;
sr = result.Sr;
int numberSamples = sr == 16000 ? 512 : 256;
if (x[0].Length != numberSamples)
{
throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
}
int batchSize = x.Length;
int contextSize = sr == 16000 ? 64 : 32;
if (lastBatchSize == 0)
{
ResetStates();
}
if (lastSr != 0 && lastSr != sr)
{
ResetStates();
}
if (lastBatchSize != 0 && lastBatchSize != batchSize)
{
ResetStates();
}
public void ResetStates()
if (context.Length == 0)
{
state = new float[2][][];
state[0] = new float[1][];
state[1] = new float[1][];
state[0][0] = new float[128];
state[1][0] = new float[128];
context = Array.Empty<float[]>();
lastSr = 0;
lastBatchSize = 0;
}
public void Dispose()
{
session?.Dispose();
}
public class ValidationResult
{
public float[][] X { get; }
public int Sr { get; }
public ValidationResult(float[][] x, int sr)
context = new float[batchSize][];
for (int i = 0; i < batchSize; i++)
{
X = x;
Sr = sr;
context[i] = new float[contextSize];
}
}
private ValidationResult ValidateInput(float[][] x, int sr)
{
if (x.Length == 1)
x = Concatenate(context, x);
var inputs = new List<NamedOnnxValue>
{
x = new float[][] { x[0] };
}
if (x.Length > 2)
{
throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}");
}
if (sr != 16000 && (sr % 16000 == 0))
{
int step = sr / 16000;
float[][] reducedX = new float[x.Length][];
for (int i = 0; i < x.Length; i++)
{
float[] current = x[i];
float[] newArr = new float[(current.Length + step - 1) / step];
for (int j = 0, index = 0; j < current.Length; j += step, index++)
{
newArr[index] = current[j];
}
reducedX[i] = newArr;
}
x = reducedX;
sr = 16000;
}
if (!SAMPLE_RATES.Contains(sr))
{
throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
}
if (((float)sr) / x[0].Length > 31.25)
{
throw new ArgumentException("Input audio is too short");
}
return new ValidationResult(x, sr);
}
private static float[][] Concatenate(float[][] a, float[][] b)
{
if (a.Length != b.Length)
{
throw new ArgumentException("The number of rows in both arrays must be the same.");
}
int rows = a.Length;
int colsA = a[0].Length;
int colsB = b[0].Length;
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[colsA + colsB];
Array.Copy(a[i], 0, result[i], 0, colsA);
Array.Copy(b[i], 0, result[i], colsA, colsB);
}
return result;
}
private static float[][] GetLastColumns(float[][] array, int contextSize)
{
int rows = array.Length;
int cols = array[0].Length;
if (contextSize > cols)
{
throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
}
float[][] result = new float[rows][];
for (int i = 0; i < rows; i++)
{
result[i] = new float[contextSize];
Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
}
return result;
}
public float[] Call(float[][] x, int sr)
{
var result = ValidateInput(x, sr);
x = result.X;
sr = result.Sr;
int numberSamples = sr == 16000 ? 512 : 256;
if (x[0].Length != numberSamples)
{
throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
}
int batchSize = x.Length;
int contextSize = sr == 16000 ? 64 : 32;
if (lastBatchSize == 0)
{
ResetStates();
}
if (lastSr != 0 && lastSr != sr)
{
ResetStates();
}
if (lastBatchSize != 0 && lastBatchSize != batchSize)
{
ResetStates();
}
if (context.Length == 0)
{
context = new float[batchSize][];
for (int i = 0; i < batchSize; i++)
{
context[i] = new float[contextSize];
}
}
x = Concatenate(context, x);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), new[] { x.Length, x[0].Length })),
NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, new[] { 1 })),
NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), new[] { state.Length, state[0].Length, state[0][0].Length }))
NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), [x.Length, x[0].Length])),
NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, [1])),
NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), [state.Length, state[0].Length, state[0][0].Length]))
};
using (var outputs = session.Run(inputs))
using var outputs = session.Run(inputs);
var output = outputs.First(o => o.Name == "output").AsTensor<float>();
var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();
context = GetLastColumns(x, contextSize);
lastSr = sr;
lastBatchSize = batchSize;
state = new float[newState.Dimensions[0]][][];
for (int i = 0; i < newState.Dimensions[0]; i++)
{
state[i] = new float[newState.Dimensions[1]][];
for (int j = 0; j < newState.Dimensions[1]; j++)
{
var output = outputs.First(o => o.Name == "output").AsTensor<float>();
var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();
context = GetLastColumns(x, contextSize);
lastSr = sr;
lastBatchSize = batchSize;
state = new float[newState.Dimensions[0]][][];
for (int i = 0; i < newState.Dimensions[0]; i++)
state[i][j] = new float[newState.Dimensions[2]];
for (int k = 0; k < newState.Dimensions[2]; k++)
{
state[i] = new float[newState.Dimensions[1]][];
for (int j = 0; j < newState.Dimensions[1]; j++)
{
state[i][j] = new float[newState.Dimensions[2]];
for (int k = 0; k < newState.Dimensions[2]; k++)
{
state[i][j][k] = newState[i, j, k];
}
}
state[i][j][k] = newState[i, j, k];
}
return output.ToArray();
}
}
return [.. output];
}
}

View File

@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36616.10 d17.14
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadDotNet", "VadDotNet.csproj", "{F36E1741-EDDB-90C7-7501-4911058F8996}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {DFC4CEE8-1034-46B4-A5F4-D1649B3543E6}
EndGlobalSection
EndGlobal