mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 09:59:20 +08:00
Changed some source.
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
This is the source code for Silero-VAD 5.1 in C++, based on LibTorch.
|
This is the source code for Silero-VAD V5 in C++, based on LibTorch.
|
||||||
The primary implementation is the CPU version, and you should compare its results with the Python version.
|
The primary implementation is the CPU version, and you should compare its results with the Python version.
|
||||||
|
I only checked the 16kHz results.
|
||||||
|
|
||||||
In addition, Batch and CUDA inference options are also available if you want to explore further.
|
In addition, Batch and CUDA inference options are also available if you want to explore further.
|
||||||
Note that when using batch inference, the speech probabilities might slightly differ from the standard version, likely due to differences in caching.
|
Note that when using batch inference, the speech probabilities might slightly differ from the standard version, likely due to differences in caching.
|
||||||
@@ -12,24 +13,24 @@ GCC 11.4.0 (GCC >= 5.1)
|
|||||||
LibTorch 1.13.0(Other versions are also acceptable)
|
LibTorch 1.13.0(Other versions are also acceptable)
|
||||||
|
|
||||||
#Download Libtorch:
|
#Download Libtorch:
|
||||||
#cpu
|
*cpu
|
||||||
$wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
|
$wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
|
||||||
$unzip libtorch-shared-with-deps-1.13.0+cpu.zip
|
$unzip libtorch-shared-with-deps-1.13.0+cpu.zip
|
||||||
|
|
||||||
#cuda
|
*cuda
|
||||||
$wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
|
$wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
|
||||||
$unzip libtorch-shared-with-deps-1.13.0+cu116.zip
|
$unzip libtorch-shared-with-deps-1.13.0+cu116.zip
|
||||||
|
|
||||||
#compile:
|
#compile:
|
||||||
#cpu
|
*cpu
|
||||||
$g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
|
$g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
|
||||||
#cuda
|
*cuda
|
||||||
$g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
|
$g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
|
||||||
|
|
||||||
#option to add
|
*option to add
|
||||||
-DUSE_BATCH
|
-DUSE_BATCH
|
||||||
-DUSE_GPU
|
-DUSE_GPU
|
||||||
|
|
||||||
# Run:
|
# Run:
|
||||||
./silero aepyx.wav 0.5 #The sample file 'aepyx.wav' is part of the Voxconverse dataset.
|
./silero aepyx.wav 16000 0.5 #The sample file 'aepyx.wav' is part of the Voxconverse dataset.
|
||||||
#aepyx.wav : 16kHz, 16-bit
|
#aepyx.wav : 16kHz, 16-bit
|
||||||
@@ -4,25 +4,28 @@
|
|||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
|
|
||||||
if(argc != 3){
|
if(argc != 4){
|
||||||
std::cerr<<"Usage : "<<argv[0]<<" <wav.path> threshold"<<std::endl;
|
std::cerr<<"Usage : "<<argv[0]<<" <wav.path> <SampleRate> <Threshold>"<<std::endl;
|
||||||
std::cerr<<"Usage : "<<argv[0]<<" sample.wav 0.38"<<std::endl;
|
std::cerr<<"Usage : "<<argv[0]<<" sample.wav 16000 0.5"<<std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string wav_path = argv[1];
|
std::string wav_path = argv[1];
|
||||||
float threshold = std::stof(argv[2]);
|
float sample_rate = std::stof(argv[2]);
|
||||||
|
float threshold = std::stof(argv[3]);
|
||||||
|
|
||||||
|
|
||||||
//Load Model
|
//Load Model
|
||||||
std::string model_path = "../../src/silero_vad/data/silero_vad.jit";
|
std::string model_path = "../../src/silero_vad/data/silero_vad.jit";
|
||||||
silero::VadIterator vad(model_path);
|
silero::VadIterator vad(model_path);
|
||||||
vad.threshold=threshold;
|
|
||||||
vad.min_speech_duration_ms=255;
|
vad.threshold=threshold; //(Default:0.5)
|
||||||
vad.max_duration_merge_ms=300;
|
vad.sample_rate=sample_rate; //16000Hz,8000Hz. (Default:16000)
|
||||||
vad.print_as_samples=true; //if true, it prints time-stamp with sample numbers.
|
vad.print_as_samples=true; //if true, it prints time-stamp with samples. otherwise, in seconds
|
||||||
//(Default:false)
|
//(Default:false)
|
||||||
|
|
||||||
|
vad.SetVariables();
|
||||||
|
|
||||||
// Read wav
|
// Read wav
|
||||||
wav::WavReader wav_reader(wav_path);
|
wav::WavReader wav_reader(wav_path);
|
||||||
std::vector<float> input_wav(wav_reader.num_samples());
|
std::vector<float> input_wav(wav_reader.num_samples());
|
||||||
@@ -34,7 +37,7 @@ int main(int argc, char* argv[]) {
|
|||||||
|
|
||||||
vad.SpeechProbs(input_wav);
|
vad.SpeechProbs(input_wav);
|
||||||
|
|
||||||
std::vector<silero::Interval> speeches = vad.GetSpeechTimestamps();
|
std::vector<silero::SpeechSegment> speeches = vad.GetSpeechTimestamps();
|
||||||
for(const auto& speech : speeches){
|
for(const auto& speech : speeches){
|
||||||
if(vad.print_as_samples){
|
if(vad.print_as_samples){
|
||||||
std::cout<<"{'start': "<<static_cast<int>(speech.start)<<", 'end': "<<static_cast<int>(speech.end)<<"}"<<std::endl;
|
std::cout<<"{'start': "<<static_cast<int>(speech.start)<<", 'end': "<<static_cast<int>(speech.end)<<"}"<<std::endl;
|
||||||
|
|||||||
BIN
examples/cpp_libtorch/silero
Executable file
BIN
examples/cpp_libtorch/silero
Executable file
Binary file not shown.
@@ -2,7 +2,6 @@
|
|||||||
//Created On : 2024-11-18
|
//Created On : 2024-11-18
|
||||||
//Description : silero 5.1 system for torch-script(c++).
|
//Description : silero 5.1 system for torch-script(c++).
|
||||||
//Version : 1.0
|
//Version : 1.0
|
||||||
//Contact : junghan4242@gmail.com
|
|
||||||
|
|
||||||
|
|
||||||
#include "silero_torch.h"
|
#include "silero_torch.h"
|
||||||
@@ -10,10 +9,10 @@
|
|||||||
namespace silero {
|
namespace silero {
|
||||||
|
|
||||||
VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate, int window_size_ms, int speech_pad_ms, int min_silence_duration_ms, int min_speech_duration_ms, int max_duration_merge_ms, bool print_as_samples)
|
VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate, int window_size_ms, int speech_pad_ms, int min_silence_duration_ms, int min_speech_duration_ms, int max_duration_merge_ms, bool print_as_samples)
|
||||||
:sample_rate(sample_rate), threshold(threshold), speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms), min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms), print_as_samples(print_as_samples)
|
:sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms), speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms), min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms), print_as_samples(print_as_samples)
|
||||||
{
|
{
|
||||||
init_torch_model(model_path);
|
init_torch_model(model_path);
|
||||||
init_engine(window_size_ms);
|
//init_engine(window_size_ms);
|
||||||
}
|
}
|
||||||
VadIterator::~VadIterator(){
|
VadIterator::~VadIterator(){
|
||||||
}
|
}
|
||||||
@@ -117,14 +116,14 @@ namespace silero {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::vector<Interval> VadIterator::GetSpeechTimestamps() {
|
std::vector<SpeechSegment> VadIterator::GetSpeechTimestamps() {
|
||||||
std::vector<Interval> speeches = DoVad();
|
std::vector<SpeechSegment> speeches = DoVad();
|
||||||
|
|
||||||
#ifdef USE_BATCH
|
#ifdef USE_BATCH
|
||||||
//When using BATCH inference, it is better to use the 'mergeSpeeches' function to arrange the time stamps.
|
//When using BATCH inference, it is better to use the 'mergeSpeeches' function to arrange the time stamps.
|
||||||
//Merging may give more reasonable output, because the probs can be distorted.
|
//Merging may give more reasonable output, because the probs can be distorted.
|
||||||
duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
|
duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
|
||||||
std::vector<Interval> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
|
std::vector<SpeechSegment> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
|
||||||
if(!print_as_samples){
|
if(!print_as_samples){
|
||||||
for (auto& speech : speeches_merge) { //samples to second
|
for (auto& speech : speeches_merge) { //samples to second
|
||||||
speech.start /= sample_rate;
|
speech.start /= sample_rate;
|
||||||
@@ -147,6 +146,9 @@ namespace silero {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
void VadIterator::SetVariables(){
|
||||||
|
init_engine(window_size_ms);
|
||||||
|
}
|
||||||
|
|
||||||
void VadIterator::init_engine(int window_size_ms) {
|
void VadIterator::init_engine(int window_size_ms) {
|
||||||
min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
|
min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
|
||||||
@@ -186,8 +188,8 @@ namespace silero {
|
|||||||
total_sample_size = 0;
|
total_sample_size = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Interval> VadIterator::DoVad() {
|
std::vector<SpeechSegment> VadIterator::DoVad() {
|
||||||
std::vector<Interval> speeches;
|
std::vector<SpeechSegment> speeches;
|
||||||
|
|
||||||
for (size_t i = 0; i < outputs_prob.size(); ++i) {
|
for (size_t i = 0; i < outputs_prob.size(); ++i) {
|
||||||
float speech_prob = outputs_prob[i];
|
float speech_prob = outputs_prob[i];
|
||||||
@@ -202,7 +204,7 @@ namespace silero {
|
|||||||
|
|
||||||
if (speech_prob >= threshold && !triggered) {
|
if (speech_prob >= threshold && !triggered) {
|
||||||
triggered = true;
|
triggered = true;
|
||||||
Interval segment;
|
SpeechSegment segment;
|
||||||
segment.start = std::max(static_cast<int>(0), current_sample - speech_pad_samples - window_size_samples);
|
segment.start = std::max(static_cast<int>(0), current_sample - speech_pad_samples - window_size_samples);
|
||||||
speeches.push_back(segment);
|
speeches.push_back(segment);
|
||||||
continue;
|
continue;
|
||||||
@@ -216,7 +218,7 @@ namespace silero {
|
|||||||
if (current_sample - temp_end < min_silence_samples) {
|
if (current_sample - temp_end < min_silence_samples) {
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
Interval& segment = speeches.back();
|
SpeechSegment& segment = speeches.back();
|
||||||
segment.end = temp_end + speech_pad_samples - window_size_samples;
|
segment.end = temp_end + speech_pad_samples - window_size_samples;
|
||||||
temp_end = 0;
|
temp_end = 0;
|
||||||
triggered = false;
|
triggered = false;
|
||||||
@@ -226,7 +228,7 @@ namespace silero {
|
|||||||
|
|
||||||
if (triggered) { //만약 낮은 확률을 보이다가 마지막프레임 prbos만 딱 확률이 높게 나오면 위에서 triggerd = true 메핑과 동시에 segment start가 돼서 문제가 될것 같은데? start = end 같은값? 후처리가 있으니 문제가 없으려나?
|
if (triggered) { //만약 낮은 확률을 보이다가 마지막프레임 prbos만 딱 확률이 높게 나오면 위에서 triggerd = true 메핑과 동시에 segment start가 돼서 문제가 될것 같은데? start = end 같은값? 후처리가 있으니 문제가 없으려나?
|
||||||
std::cout<<"when last triggered is keep working until last Probs"<<std::endl;
|
std::cout<<"when last triggered is keep working until last Probs"<<std::endl;
|
||||||
Interval& segment = speeches.back();
|
SpeechSegment& segment = speeches.back();
|
||||||
segment.end = total_sample_size; // 현재 샘플을 마지막 구간의 종료 시간으로 설정
|
segment.end = total_sample_size; // 현재 샘플을 마지막 구간의 종료 시간으로 설정
|
||||||
triggered = false; // VAD 상태 초기화
|
triggered = false; // VAD 상태 초기화
|
||||||
}
|
}
|
||||||
@@ -235,7 +237,7 @@ namespace silero {
|
|||||||
std::remove_if(
|
std::remove_if(
|
||||||
speeches.begin(),
|
speeches.begin(),
|
||||||
speeches.end(),
|
speeches.end(),
|
||||||
[this](const Interval& speech) {
|
[this](const SpeechSegment& speech) {
|
||||||
return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
|
return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
|
||||||
//min_speech_samples is 4000samples(0.25sec)
|
//min_speech_samples is 4000samples(0.25sec)
|
||||||
//여기서 포인트!! 계산 할때는 start,end sample에'speech_pad_samples' 사이즈를 추가한후 길이를 측정함.
|
//여기서 포인트!! 계산 할때는 start,end sample에'speech_pad_samples' 사이즈를 추가한후 길이를 측정함.
|
||||||
@@ -252,15 +254,15 @@ namespace silero {
|
|||||||
return speeches;
|
return speeches;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Interval> VadIterator::mergeSpeeches(const std::vector<Interval>& speeches, int duration_merge_samples) {
|
std::vector<SpeechSegment> VadIterator::mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples) {
|
||||||
std::vector<Interval> mergedSpeeches;
|
std::vector<SpeechSegment> mergedSpeeches;
|
||||||
|
|
||||||
if (speeches.empty()) {
|
if (speeches.empty()) {
|
||||||
return mergedSpeeches; // 빈 벡터 반환
|
return mergedSpeeches; // 빈 벡터 반환
|
||||||
}
|
}
|
||||||
|
|
||||||
// 첫 번째 구간으로 초기화
|
// 첫 번째 구간으로 초기화
|
||||||
Interval currentSegment = speeches[0];
|
SpeechSegment currentSegment = speeches[0];
|
||||||
|
|
||||||
for (size_t i = 1; i < speeches.size(); ++i) { //첫번째 start,end 정보 건너뛰기. 그래서 i=1부터
|
for (size_t i = 1; i < speeches.size(); ++i) { //첫번째 start,end 정보 건너뛰기. 그래서 i=1부터
|
||||||
// 두 구간의 차이가 threshold(duration_merge_samples)보다 작은 경우, 합침
|
// 두 구간의 차이가 threshold(duration_merge_samples)보다 작은 경우, 합침
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
//Created On : 2024-11-18
|
//Created On : 2024-11-18
|
||||||
//Description : silero 5.1 system for torch-script(c++).
|
//Description : silero 5.1 system for torch-script(c++).
|
||||||
//Version : 1.0
|
//Version : 1.0
|
||||||
//Contact : junghan4242@gmail.com
|
|
||||||
|
|
||||||
#ifndef SILERO_TORCH_H
|
#ifndef SILERO_TORCH_H
|
||||||
#define SILERO_TORCH_H
|
#define SILERO_TORCH_H
|
||||||
@@ -27,11 +26,6 @@ namespace silero{
|
|||||||
int end;
|
int end;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Interval {
|
|
||||||
float start;
|
|
||||||
float end;
|
|
||||||
};
|
|
||||||
|
|
||||||
class VadIterator{
|
class VadIterator{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@@ -42,10 +36,12 @@ namespace silero{
|
|||||||
|
|
||||||
|
|
||||||
void SpeechProbs(std::vector<float>& input_wav);
|
void SpeechProbs(std::vector<float>& input_wav);
|
||||||
std::vector<silero::Interval> GetSpeechTimestamps();
|
std::vector<silero::SpeechSegment> GetSpeechTimestamps();
|
||||||
|
void SetVariables();
|
||||||
|
|
||||||
float threshold;
|
float threshold;
|
||||||
int sample_rate;
|
int sample_rate;
|
||||||
|
int window_size_ms;
|
||||||
int min_speech_duration_ms;
|
int min_speech_duration_ms;
|
||||||
int max_duration_merge_ms;
|
int max_duration_merge_ms;
|
||||||
bool print_as_samples;
|
bool print_as_samples;
|
||||||
@@ -70,8 +66,8 @@ namespace silero{
|
|||||||
void init_engine(int window_size_ms);
|
void init_engine(int window_size_ms);
|
||||||
void init_torch_model(const std::string& model_path);
|
void init_torch_model(const std::string& model_path);
|
||||||
void reset_states();
|
void reset_states();
|
||||||
std::vector<Interval> DoVad();
|
std::vector<SpeechSegment> DoVad();
|
||||||
std::vector<Interval> mergeSpeeches(const std::vector<Interval>& speeches, int duration_merge_samples);
|
std::vector<SpeechSegment> mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user