修复CalculateProb方法计算句子EndOffset的bug

修改语法提示
This commit is contained in:
dongfp
2025-11-10 15:58:20 +08:00
parent be95df9152
commit 7b0aaa1c4c
4 changed files with 233 additions and 211 deletions

View File

@@ -21,7 +21,7 @@ class Program
MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS); MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE)); List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));
//Console.WriteLine(speechTimeList.ToJson()); //Console.WriteLine(speechTimeList.ToJson());
StringBuilder sb = new StringBuilder(); StringBuilder sb = new();
foreach (var speechSegment in speechTimeList) foreach (var speechSegment in speechTimeList)
{ {
sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n"); sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");

View File

@@ -53,28 +53,26 @@ public class SileroVadDetector
{ {
Reset(); Reset();
using (var audioFile = new AudioFileReader(wavFile.FullName)) using var audioFile = new AudioFileReader(wavFile.FullName);
{ List<float> speechProbList = [];
List<float> speechProbList = new List<float>();
this._audioLengthSamples = (int)(audioFile.Length / 2); this._audioLengthSamples = (int)(audioFile.Length / 2);
float[] buffer = new float[this._windowSizeSample]; float[] buffer = new float[this._windowSizeSample];
while (audioFile.Read(buffer, 0, buffer.Length) > 0) while (audioFile.Read(buffer, 0, buffer.Length) > 0)
{ {
float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0]; float speechProb = _model.Call([buffer], _samplingRate)[0];
speechProbList.Add(speechProb); speechProbList.Add(speechProb);
} }
return CalculateProb(speechProbList); return CalculateProb(speechProbList);
} }
}
private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList) private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
{ {
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>(); List<SileroSpeechSegment> result = [];
bool triggered = false; bool triggered = false;
int tempEnd = 0, prevEnd = 0, nextStart = 0; int tempEnd = 0, prevEnd = 0, nextStart = 0;
SileroSpeechSegment segment = new SileroSpeechSegment(); SileroSpeechSegment segment = new();
for (int i = 0; i < speechProbList.Count; i++) for (int i = 0; i < speechProbList.Count; i++)
{ {
@@ -164,7 +162,8 @@ public class SileroVadDetector
if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples) if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
{ {
segment.EndOffset = _audioLengthSamples; //segment.EndOffset = _audioLengthSamples;
segment.EndOffset = speechProbList.Count * _windowSizeSample;
result.Add(segment); result.Add(segment);
} }
@@ -182,7 +181,7 @@ public class SileroVadDetector
int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value; int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
if (silenceDuration < 2 * _speechPadSamples) if (silenceDuration < 2 * _speechPadSamples)
{ {
item.EndOffset = item.EndOffset + (silenceDuration / 2); item.EndOffset += (silenceDuration / 2);
nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2)); nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
} }
else else
@@ -200,9 +199,9 @@ public class SileroVadDetector
return MergeListAndCalculateSecond(result, _samplingRate); return MergeListAndCalculateSecond(result, _samplingRate);
} }
private List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate) private static List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
{ {
List<SileroSpeechSegment> result = new List<SileroSpeechSegment>(); List<SileroSpeechSegment> result = [];
if (original == null || original.Count == 0) if (original == null || original.Count == 0)
{ {
return result; return result;
@@ -216,7 +215,10 @@ public class SileroVadDetector
for (int i = 1; i < original.Count; i++) for (int i = 1; i < original.Count; i++)
{ {
SileroSpeechSegment segment = original[i]; SileroSpeechSegment segment = original[i];
if (i == 235)
{
}
if (segment.StartOffset > right) if (segment.StartOffset > right)
{ {
result.Add(new SileroSpeechSegment(left, right, result.Add(new SileroSpeechSegment(left, right,
@@ -242,7 +244,7 @@ public class SileroVadDetector
return result; return result;
} }
private float CalculateSecondByOffset(int offset, int samplingRate) private static float CalculateSecondByOffset(int offset, int samplingRate)
{ {
float secondValue = offset * 1.0f / samplingRate; float secondValue = offset * 1.0f / samplingRate;
return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f; return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;

View File

@@ -1,5 +1,6 @@
using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors; using Microsoft.ML.OnnxRuntime.Tensors;
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
@@ -14,14 +15,16 @@ namespace VADdotnet;
private float[][] context; private float[][] context;
private int lastSr = 0; private int lastSr = 0;
private int lastBatchSize = 0; private int lastBatchSize = 0;
private static readonly List<int> SAMPLE_RATES = new List<int> { 8000, 16000 }; private static readonly List<int> SAMPLE_RATES = [8000, 16000];
public SileroVadOnnxModel(string modelPath) public SileroVadOnnxModel(string modelPath)
{ {
var sessionOptions = new SessionOptions(); var sessionOptions = new SessionOptions
sessionOptions.InterOpNumThreads = 1; {
sessionOptions.IntraOpNumThreads = 1; InterOpNumThreads = 1,
sessionOptions.EnableCpuMemArena = true; IntraOpNumThreads = 1,
EnableCpuMemArena = true
};
session = new InferenceSession(modelPath, sessionOptions); session = new InferenceSession(modelPath, sessionOptions);
ResetStates(); ResetStates();
@@ -34,33 +37,27 @@ namespace VADdotnet;
state[1] = new float[1][]; state[1] = new float[1][];
state[0][0] = new float[128]; state[0][0] = new float[128];
state[1][0] = new float[128]; state[1][0] = new float[128];
context = Array.Empty<float[]>(); context = [];
lastSr = 0; lastSr = 0;
lastBatchSize = 0; lastBatchSize = 0;
} }
public void Dispose() public void Dispose()
{ {
session?.Dispose(); GC.SuppressFinalize(this);
} }
public class ValidationResult public class ValidationResult(float[][] x, int sr)
{ {
public float[][] X { get; } public float[][] X { get; } = x;
public int Sr { get; } public int Sr { get; } = sr;
public ValidationResult(float[][] x, int sr)
{
X = x;
Sr = sr;
}
} }
private ValidationResult ValidateInput(float[][] x, int sr) private static ValidationResult ValidateInput(float[][] x, int sr)
{ {
if (x.Length == 1) if (x.Length == 1)
{ {
x = new float[][] { x[0] }; x = [x[0]];
} }
if (x.Length > 2) if (x.Length > 2)
{ {
@@ -186,13 +183,12 @@ namespace VADdotnet;
var inputs = new List<NamedOnnxValue> var inputs = new List<NamedOnnxValue>
{ {
NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), new[] { x.Length, x[0].Length })), NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), [x.Length, x[0].Length])),
NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, new[] { 1 })), NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, [1])),
NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), new[] { state.Length, state[0].Length, state[0][0].Length })) NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), [state.Length, state[0].Length, state[0][0].Length]))
}; };
using (var outputs = session.Run(inputs)) using var outputs = session.Run(inputs);
{
var output = outputs.First(o => o.Name == "output").AsTensor<float>(); var output = outputs.First(o => o.Name == "output").AsTensor<float>();
var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>(); var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();
@@ -214,7 +210,6 @@ namespace VADdotnet;
} }
} }
return output.ToArray(); return [.. output];
}
} }
} }

View File

@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36616.10 d17.14
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadDotNet", "VadDotNet.csproj", "{F36E1741-EDDB-90C7-7501-4911058F8996}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {DFC4CEE8-1034-46B4-A5F4-D1649B3543E6}
EndGlobalSection
EndGlobal