diff --git a/examples/c++/README.md b/examples/c++/README.md
new file mode 100644
index 0000000..6a469c8
--- /dev/null
+++ b/examples/c++/README.md
@@ -0,0 +1,49 @@
+# Silero-VAD V6 in C++ (LibTorch & ONNX Runtime)
+
+This is the source code for Silero-VAD V6 in C++, utilizing LibTorch & Onnxruntime.
+You should compare its results with the Python version.
+Results at 16 kHz and 8 kHz have been tested. Batch and CUDA inference options are deprecated.
+
+
+## Requirements
+- GCC 11.4.0 (GCC >= 5.1)
+- Onnxruntime 1.11.0 (other versions are also acceptable)
+- LibTorch 1.13.0 (other versions are also acceptable)
+
+## Download Onnxruntime & LibTorch
+
+```bash
+# Onnxruntime
+$ wget https://github.com/microsoft/onnxruntime/releases/download/v1.11.1/onnxruntime-linux-x64-1.11.1.tgz
+$ tar -xvf onnxruntime-linux-x64-1.11.1.tgz
+$ ln -s onnxruntime-linux-x64-1.11.1 onnxruntime-linux  # soft link
+
+# LibTorch
+$ wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
+$ unzip libtorch-shared-with-deps-1.13.0+cpu.zip
+```
+
+## Compilation
+
+```bash
+# ONNX build
+$ g++ main.cc silero.cc -I ./onnxruntime-linux/include/ -L ./onnxruntime-linux/lib/ -lonnxruntime -Wl,-rpath,./onnxruntime-linux/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_ONNX
+
+# Torch build
+$ g++ main.cc silero.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_TORCH
+```
+
+## Optional Compilation Flags
+- `-DUSE_TORCH`
+- `-DUSE_ONNX`
+
+## Run the Program
+To run the program, use the following command:
+
+`./silero <wav_path> <sample_rate> <threshold>`
+`./silero aepyx.wav 16000 0.5`
+`./silero aepyx_8k.wav 8000 0.5`
+
+The sample file aepyx.wav is part of the VoxConverse dataset.
+File details: aepyx.wav is a 16 kHz, 16-bit audio file.
+File details: aepyx_8k.wav is an 8 kHz, 16-bit audio file.
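Beyond the CLI above, the `VadIterator` class can also be embedded directly in other programs. A minimal sketch, assuming an ONNX build; the model path (`silero_vad.onnx`) and input file are placeholders, and all constructor values mirror the defaults declared in `silero.h` below:

```cpp
#include <iostream>
#include <vector>

#include "silero.h"
#include "wav.h"

int main() {
  // All values below are the silero.h defaults; only the model path is a placeholder.
  silero::VadIterator vad("silero_vad.onnx",
                          /*threshold=*/0.5f,
                          /*sample_rate=*/16000,
                          /*window_size_ms=*/32,
                          /*speech_pad_ms=*/30,
                          /*min_silence_duration_ms=*/100,
                          /*min_speech_duration_ms=*/250,
                          /*max_duration_merge_ms=*/300,
                          /*print_as_samples=*/false);
  vad.SetVariables();  // derive sample-based thresholds from the ms-based knobs

  wav::WavReader reader("aepyx.wav");
  std::vector<float> samples(reader.data(), reader.data() + reader.num_samples());

  vad.SpeechProbs(samples);
  for (const auto& seg : vad.GetSpeechTimestamps())
    std::cout << seg.start << " .. " << seg.end << " (s)" << std::endl;
  return 0;
}
```

With `print_as_samples = false`, `GetSpeechTimestamps()` divides the sample indices by the sample rate, so the printed values are seconds.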
diff --git a/examples/c++/aepyx.wav b/examples/c++/aepyx.wav
new file mode 100644
index 0000000..e7cc293
Binary files /dev/null and b/examples/c++/aepyx.wav differ
diff --git a/examples/c++/aepyx_8k.wav b/examples/c++/aepyx_8k.wav
new file mode 100644
index 0000000..53297cd
Binary files /dev/null and b/examples/c++/aepyx_8k.wav differ
diff --git a/examples/c++/main.cc b/examples/c++/main.cc
new file mode 100644
index 0000000..ec6203f
--- /dev/null
+++ b/examples/c++/main.cc
@@ -0,0 +1,61 @@
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "silero.h"
+#include "wav.h"
+
+int main(int argc, char* argv[]) {
+
+  if (argc != 4) {
+    std::cerr << "Usage : " << argv[0] << " <wav_path> <sample_rate> <threshold>" << std::endl;
+    return -1;
+  }
+
+  const std::string wav_path = argv[1];
+  const int sample_rate = std::atoi(argv[2]);
+  const float threshold = static_cast<float>(std::atof(argv[3]));
+
+#ifdef USE_ONNX
+  const std::string model_path = "silero_vad.onnx";  // adjust to your ONNX model file
+#else
+  const std::string model_path = "silero_vad.jit";   // adjust to your TorchScript model file
+#endif
+
+  silero::VadIterator vad(model_path, threshold, sample_rate);
+  vad.SetVariables();
+
+  wav::WavReader wav_reader(wav_path);
+  std::vector<float> input_wav(wav_reader.num_samples());
+
+  for (int i = 0; i < wav_reader.num_samples(); i++)
+  {
+    input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
+  }
+
+  vad.SpeechProbs(input_wav);
+
+  std::vector<silero::Interval> speeches = vad.GetSpeechTimestamps();
+  for (const auto& speech : speeches) {
+    if (vad.print_as_samples) {
+      std::cout << "{'start': " << static_cast<int>(speech.start)
+                << ", 'end': " << static_cast<int>(speech.end) << "}" << std::endl;
+    } else {
+      std::cout << "{'start': " << speech.start
+                << ", 'end': " << speech.end << "}" << std::endl;
+    }
+  }
+
+  return 0;
+}
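Before silero.cc itself, it may help to see the chunk-size arithmetic that its `init_engine` performs. A self-contained recap (illustration only, not part of the build):

```cpp
#include <iostream>

int main() {
  const int sample_rate = 16000;
  const int window_size_ms = 32;

  // One inference window, in samples: 512 at 16 kHz, 256 at 8 kHz.
  const int window_size_samples = sample_rate / 1000 * window_size_ms;

  // ONNX path only: 1/8 of a window is carried over as left context,
  // so the tensor fed to the model is window + context samples wide.
  const int context_samples = window_size_samples / 8;                      // 64
  const int effective_window_size = window_size_samples + context_samples;  // 576

  std::cout << window_size_samples << " " << context_samples << " "
            << effective_window_size << std::endl;
  return 0;
}
```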
"stateN"}), + state_node_dims{2, 1, 128}, sr_node_dims{1} + + { + init_onnx_model(model_path); + } + VadIterator::~VadIterator(){ + } + + void VadIterator::init_onnx_model(const std::string& model_path) { + int inter_threads=1; + int intra_threads=1; + session_options.SetIntraOpNumThreads(intra_threads); + session_options.SetInterOpNumThreads(inter_threads); + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + session = std::make_shared(env, model_path.c_str(), session_options); + std::cout<<"Silero onnx-Model loaded successfully"<& data_chunk) { + // _context와 현재 청크를 결합하여 입력 데이터 구성 + std::vector new_data(effective_window_size, 0.0f); + std::copy(_context.begin(), _context.end(), new_data.begin()); + std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples); + input = new_data; + + Ort::Value input_ort = Ort::Value::CreateTensor( + memory_info, input.data(), input.size(), input_node_dims, 2); + Ort::Value state_ort = Ort::Value::CreateTensor( + memory_info, _state.data(), _state.size(), state_node_dims, 3); + Ort::Value sr_ort = Ort::Value::CreateTensor( + memory_info, sr.data(), sr.size(), sr_node_dims, 1); + ort_inputs.clear(); + ort_inputs.push_back(std::move(input_ort)); + ort_inputs.push_back(std::move(state_ort)); + ort_inputs.push_back(std::move(sr_ort)); + + ort_outputs = session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), ort_inputs.data(), ort_inputs.size(), + output_node_names.data(), output_node_names.size()); + + float speech_prob = ort_outputs[0].GetTensorMutableData()[0]; // ONNX 출력: 첫 번째 값이 음성 확률 + + float* stateN = ort_outputs[1].GetTensorMutableData(); // 두 번째 출력값: 상태 업데이트 + std::memcpy(_state.data(), stateN, size_state * sizeof(float)); + + std::copy(new_data.end() - context_samples, new_data.end(), _context.begin()); + // _context 업데이트: new_data의 마지막 context_samples 유지 + + return speech_prob; + } + void VadIterator::SpeechProbs(std::vector& input_wav) { + reset_states(); + total_sample_size = static_cast(input_wav.size()); + for (size_t j = 0; j < static_cast(total_sample_size); j += window_size_samples) { + if (j + window_size_samples > static_cast(total_sample_size)) + break; + std::vector chunk(input_wav.begin() + j, input_wav.begin() + j + window_size_samples); + float speech_prob = predict(chunk); + outputs_prob.push_back(speech_prob); + } + } + +#endif + + void VadIterator::reset_states() { + triggered = false; + current_sample = 0; + temp_end = 0; + outputs_prob.clear(); + total_sample_size = 0; + +#ifdef USE_TORCH + model.run_method("reset_states"); // Reset model states if applicable +#elif USE_ONNX + std::memset(_state.data(), 0, _state.size() * sizeof(float)); + std::fill(_context.begin(), _context.end(), 0.0f); +#endif + } + + std::vector VadIterator::GetSpeechTimestamps() { + std::vector speeches = DoVad(); + if(!print_as_samples){ + for (auto& speech : speeches) { + speech.start /= sample_rate; + speech.end /= sample_rate; + } + } + return speeches; + } + + void VadIterator::SetVariables(){ + // Initialize internal engine parameters + init_engine(window_size_ms); + } + + void VadIterator::init_engine(int window_size_ms) { + min_silence_samples = sample_rate * min_silence_duration_ms / 1000; + speech_pad_samples = sample_rate * speech_pad_ms / 1000; + window_size_samples = sample_rate / 1000 * window_size_ms; + min_speech_samples = sample_rate * min_speech_duration_ms / 1000; +#ifdef USE_ONNX + //for ONNX + context_samples=window_size_samples / 8; + 
diff --git a/examples/c++/silero.h b/examples/c++/silero.h
new file mode 100644
index 0000000..8d4a11e
--- /dev/null
+++ b/examples/c++/silero.h
@@ -0,0 +1,123 @@
+#ifndef SILERO_H
+#define SILERO_H
+
+// silero.h
+// Author      : NathanJHLee
+// Created On  : 2025-11-10
+// Description : silero 6.2 system for onnx-runtime (C++) and torch-script (C++)
+// Version     : 1.3
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#ifdef USE_TORCH
+#include <torch/script.h>
+#include <torch/torch.h>
+#elif USE_ONNX
+#include "onnxruntime_cxx_api.h"
+#endif
+
+namespace silero {
+
+  struct Interval {
+    float start;
+    float end;
+    int numberOfSubseg;
+
+    void initialize() {
+      start = 0;
+      end = 0;
+      numberOfSubseg = 0;
+    }
+  };
+
+  class VadIterator {
+  public:
+    VadIterator(const std::string &model_path,
+                float threshold = 0.5,
+                int sample_rate = 16000,
+                int window_size_ms = 32,
+                int speech_pad_ms = 30,
+                int min_silence_duration_ms = 100,
+                int min_speech_duration_ms = 250,
+                int max_duration_merge_ms = 300,
+                bool print_as_samples = false);
+    ~VadIterator();
+
+    // Batch (non-streaming) interface (for backward compatibility)
+    void SpeechProbs(std::vector<float>& input_wav);
+    std::vector<Interval> GetSpeechTimestamps();
+    void SetVariables();
+
+    // Public parameters (can be modified by user)
+    float threshold;
+    int sample_rate;
+    int window_size_ms;
+    int min_speech_duration_ms;
+    int max_duration_merge_ms;
+    bool print_as_samples;
+
+  private:
+#ifdef USE_TORCH
+    torch::jit::script::Module model;
+    void init_torch_model(const std::string& model_path);
+#elif USE_ONNX
+    Ort::Env env;                                  // ONNX environment
+    Ort::SessionOptions session_options;           // session options
+    std::shared_ptr<Ort::Session> session;         // ONNX session
+    Ort::AllocatorWithDefaultOptions allocator;    // default allocator
+    Ort::MemoryInfo memory_info;                   // memory info (CPU)
+
+    void init_onnx_model(const std::string& model_path);
+    float predict(const std::vector<float>& data_chunk);
+
+    //const int context_samples;                   // e.g., 64 samples
+    int context_samples;                           // e.g., 64 samples
+    std::vector<float> _context;                   // initialized to all zeros
+    int effective_window_size;
+
+    // ONNX input/output buffers and node names
+    std::vector<Ort::Value> ort_inputs;
+    std::vector<const char*> input_node_names;
+    std::vector<float> input;
+    unsigned int size_state;                       // fixed size: 2*1*128
+    std::vector<float> _state;
+    std::vector<int64_t> sr;
+    int64_t input_node_dims[2];                    // [1, effective_window_size]
+    const int64_t state_node_dims[3];              // [2, 1, 128]
+    const int64_t sr_node_dims[1];                 // [1]
+    std::vector<Ort::Value> ort_outputs;
+    std::vector<const char*> output_node_names;    // default: ["output", "stateN"]
+#endif
+    std::vector<float> outputs_prob;               // used in batch mode
+    int min_silence_samples;
+    int min_speech_samples;
+    int speech_pad_samples;
+    int window_size_samples;
+    int duration_merge_samples;
+    int current_sample = 0;
+    int total_sample_size = 0;
+    int min_silence_duration_ms;
+    int speech_pad_ms;
+    bool triggered = false;
+    int temp_end = 0;
+    int global_end = 0;
+    int erase_tail_count = 0;
+
+    void init_engine(int window_size_ms);
+    void reset_states();
+    std::vector<Interval> DoVad();
+  };
+
+} // namespace silero
+
+#endif // SILERO_H
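`silero.h` reserves `max_duration_merge_ms` and `duration_merge_samples`, but no merge pass appears in the visible parts of `silero.cc`. For readers who want one, here is a hypothetical sketch of how close segments could be fused; the function name and semantics are assumptions, not part of the shipped code:

```cpp
#include <iostream>
#include <vector>

struct Interval { float start; float end; int numberOfSubseg; };  // mirrors silero::Interval

// Hypothetical merge pass: fuse adjacent segments whose gap is smaller than
// duration_merge_samples. Shown only as one plausible use of max_duration_merge_ms.
std::vector<Interval> MergeClose(const std::vector<Interval>& in, int duration_merge_samples) {
  std::vector<Interval> out;
  for (const Interval& seg : in) {
    if (!out.empty() && seg.start - out.back().end < duration_merge_samples) {
      out.back().end = seg.end;        // extend the previous segment
      out.back().numberOfSubseg += 1;  // track how many pieces were fused
    } else {
      out.push_back({seg.start, seg.end, 1});
    }
  }
  return out;
}

int main() {
  std::vector<Interval> segs = {{0, 8000, 1}, {8800, 16000, 1}, {40000, 48000, 1}};
  for (const auto& s : MergeClose(segs, 4800))  // 300 ms at 16 kHz
    std::cout << s.start << " .. " << s.end << " (" << s.numberOfSubseg << " subseg)" << std::endl;
  return 0;
}
```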
diff --git a/examples/c++/wav.h b/examples/c++/wav.h
new file mode 100644
index 0000000..d567ee6
--- /dev/null
+++ b/examples/c++/wav.h
@@ -0,0 +1,237 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+
+#include <string>
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+  char riff[4];  // "RIFF"
+  unsigned int size;
+  char wav[4];  // "WAVE"
+  char fmt[4];  // "fmt "
+  unsigned int fmt_size;
+  uint16_t format;
+  uint16_t channels;
+  unsigned int sample_rate;
+  unsigned int bytes_per_second;
+  uint16_t block_size;
+  uint16_t bit;
+  char data[4];  // "data"
+  unsigned int data_size;
+};
+
+class WavReader {
+ public:
+  WavReader() : data_(nullptr) {}
+  explicit WavReader(const std::string& filename) { Open(filename); }
+
+  bool Open(const std::string& filename) {
+    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for reading
+    if (NULL == fp) {
+      std::cout << "Error in read " << filename << std::endl;
+      return false;
+    }
+
+    WavHeader header;
+    fread(&header, 1, sizeof(header), fp);
+    if (header.fmt_size < 16) {
+      printf("WaveData: expect PCM format data "
+             "to have fmt chunk of at least size 16.\n");
+      return false;
+    } else if (header.fmt_size > 16) {
+      int offset = 44 - 8 + header.fmt_size - 16;
+      fseek(fp, offset, SEEK_SET);
+      fread(header.data, 8, sizeof(char), fp);
+    }
+    // check "riff" "WAVE" "fmt " "data"
+
+    // Skip any sub-chunks between "fmt" and "data". Usually there will
+    // be a single "fact" sub chunk, but on Windows there can also be a
+    // "list" sub chunk.
+    while (0 != strncmp(header.data, "data", 4)) {
+      // We will just ignore the data in these chunks.
+      fseek(fp, header.data_size, SEEK_CUR);
+      // read next sub chunk
+      fread(header.data, 8, sizeof(char), fp);
+    }
+
+    if (header.data_size == 0) {
+      int offset = ftell(fp);
+      fseek(fp, 0, SEEK_END);
+      header.data_size = ftell(fp) - offset;
+      fseek(fp, offset, SEEK_SET);
+    }
+
+    num_channel_ = header.channels;
+    sample_rate_ = header.sample_rate;
+    bits_per_sample_ = header.bit;
+    int num_data = header.data_size / (bits_per_sample_ / 8);
+    data_ = new float[num_data];  // Create 1-dim array
+    num_samples_ = num_data / num_channel_;
+
+    std::cout << "num_channel_    :" << num_channel_ << std::endl;
+    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
+    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+    std::cout << "num_samples     :" << num_data << std::endl;
+    std::cout << "num_data_size   :" << header.data_size << std::endl;
+
+    switch (bits_per_sample_) {
+      case 8: {
+        char sample;
+        for (int i = 0; i < num_data; ++i) {
+          fread(&sample, 1, sizeof(char), fp);
+          data_[i] = static_cast<float>(sample) / 32768;
+        }
+        break;
+      }
+      case 16: {
+        int16_t sample;
+        for (int i = 0; i < num_data; ++i) {
+          fread(&sample, 1, sizeof(int16_t), fp);
+          data_[i] = static_cast<float>(sample) / 32768;
+        }
+        break;
+      }
+      case 32:
+      {
+        if (header.format == 1)  // S32
+        {
+          int sample;
+          for (int i = 0; i < num_data; ++i) {
+            fread(&sample, 1, sizeof(int), fp);
+            data_[i] = static_cast<float>(sample) / 32768;
+          }
+        }
+        else if (header.format == 3)  // IEEE-float
+        {
+          float sample;
+          for (int i = 0; i < num_data; ++i) {
+            fread(&sample, 1, sizeof(float), fp);
+            data_[i] = static_cast<float>(sample);
+          }
+        }
+        else {
+          printf("unsupported quantization bits\n");
+        }
+        break;
+      }
+      default:
+        printf("unsupported quantization bits\n");
+        break;
+    }
+
+    fclose(fp);
+    return true;
+  }
+
+  int num_channel() const { return num_channel_; }
+  int sample_rate() const { return sample_rate_; }
+  int bits_per_sample() const { return bits_per_sample_; }
+  int num_samples() const { return num_samples_; }
+
+  ~WavReader() {
+    delete[] data_;
+  }
+
+  const float* data() const { return data_; }
+
+ private:
+  int num_channel_;
+  int sample_rate_;
+  int bits_per_sample_;
+  int num_samples_;  // sample points per channel
+  float* data_;
+};
+
+class WavWriter {
+ public:
+  WavWriter(const float* data, int num_samples, int num_channel,
+            int sample_rate, int bits_per_sample)
+      : data_(data),
+        num_samples_(num_samples),
+        num_channel_(num_channel),
+        sample_rate_(sample_rate),
+        bits_per_sample_(bits_per_sample) {}
+
+  void Write(const std::string& filename) {
+    FILE* fp = fopen(filename.c_str(), "wb");  // binary mode for portability
+    // init char 'riff' 'WAVE' 'fmt ' 'data'
+    WavHeader header;
+    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+    memcpy(&header, wav_header, sizeof(header));
+    header.channels = num_channel_;
+    header.bit = bits_per_sample_;
+    header.sample_rate = sample_rate_;
+    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+    header.size = sizeof(header) - 8 + header.data_size;
+    header.bytes_per_second =
+        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+    header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+    fwrite(&header, 1, sizeof(header), fp);
+
+    for (int i = 0; i < num_samples_; ++i) {
+      for (int j = 0; j < num_channel_; ++j) {
+        switch (bits_per_sample_) {
+          case 8: {
+            char sample = static_cast<char>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+          case 16: {
+            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+          case 32: {
+            int sample = static_cast<int>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+        }
+      }
+    }
+    fclose(fp);
+  }
+
+ private:
+  const float* data_;
+  int num_samples_;  // total float points in data_
+  int num_channel_;
+  int sample_rate_;
+  int bits_per_sample_;
+};
+
+}  // namespace wav
+
+#endif  // FRONTEND_WAV_H_
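A small round-trip exercise for the `wav.h` helpers above. Note the scaling asymmetry: `WavReader` divides 16-bit samples by 32768 on read, while `WavWriter` casts the floats straight back to `int16_t`, so values must be rescaled before writing. File names are placeholders:

```cpp
#include <vector>

#include "wav.h"

int main() {
  wav::WavReader reader("in.wav");  // placeholder input file
  const int total = reader.num_samples() * reader.num_channel();
  std::vector<float> samples(reader.data(), reader.data() + total);

  // Undo WavReader's /32768 normalization; WavWriter casts to int16_t as-is.
  for (float& s : samples) s *= 32768.0f;

  wav::WavWriter writer(samples.data(), reader.num_samples(), reader.num_channel(),
                        reader.sample_rate(), reader.bits_per_sample());
  writer.Write("out.wav");  // placeholder output file
  return 0;
}
```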