diff --git a/examples/cpp_libtorch_deprecated/README.md b/examples/cpp_libtorch_deprecated/README.md
new file mode 100644
index 0000000..be18cee
--- /dev/null
+++ b/examples/cpp_libtorch_deprecated/README.md
@@ -0,0 +1,45 @@
+# Silero-VAD V5 in C++ (based on LibTorch)
+
+This is the source code for Silero-VAD V5 in C++, using LibTorch. The primary implementation is CPU-based, and its results should be compared against the Python version. Only results at 16kHz have been tested.
+
+Batch and CUDA inference options are also available if you want to explore further. Note that with batch inference the speech probabilities may differ slightly from the standard version, most likely because of differences in caching: unlike chunk-by-chunk processing, batch inference does not reuse the model state from previous chunks. In exchange, batch inference is significantly faster. For best results, consider adjusting the threshold when using batch inference.
+
+## Requirements
+
+- GCC 11.4.0 (any version >= 5.1 should work)
+- LibTorch 1.13.0 (other versions should also work)
+
+## Download LibTorch
+
+```bash
+# CPU version
+wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
+unzip libtorch-shared-with-deps-1.13.0+cpu.zip
+
+# CUDA version
+wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
+unzip libtorch-shared-with-deps-1.13.0+cu116.zip
+```
+
+## Compilation
+
+```bash
+# CPU version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
+
+# CUDA version
+g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
+```
+
+## Optional Compilation Flags
+
+- `-DUSE_BATCH`: enable batch inference
+- `-DUSE_GPU`: use the GPU for inference
+
+## Run the Program
+
+To run the program, use the following command:
+
+`./silero aepyx.wav 16000 0.5`
+
+The sample file `aepyx.wav` comes from the VoxConverse dataset; it is a 16kHz, 16-bit audio file.
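+
+## Library Usage (sketch)
+
+A minimal sketch of driving `silero::VadIterator` directly, mirroring `main.cc`. The model filename `silero_vad.jit` is an assumption here; point it at wherever your TorchScript model file lives.
+
+```cpp
+#include <iostream>
+#include <vector>
+#include "silero_torch.h"
+#include "wav.h"
+
+int main() {
+  // "silero_vad.jit" is a placeholder path; 0.5 and 16000 match the run example above.
+  silero::VadIterator vad("silero_vad.jit", 0.5f, 16000);
+
+  wav::WavReader reader("aepyx.wav");  // WavReader normalizes samples to [-1, 1]
+  std::vector<float> samples(reader.num_samples());
+  for (int i = 0; i < reader.num_samples(); ++i) samples[i] = reader.data()[i];
+
+  vad.SpeechProbs(samples);  // run the model chunk by chunk
+  for (const auto& s : vad.GetSpeechTimestamps())
+    std::cout << "{'start': " << s.start << ", 'end': " << s.end << "}\n";
+  return 0;
+}
+```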
diff --git a/examples/cpp_libtorch_deprecated/aepyx.wav b/examples/cpp_libtorch_deprecated/aepyx.wav
new file mode 100644
index 0000000..b8b46de
Binary files /dev/null and b/examples/cpp_libtorch_deprecated/aepyx.wav differ
diff --git a/examples/cpp_libtorch_deprecated/main.cc b/examples/cpp_libtorch_deprecated/main.cc
new file mode 100644
index 0000000..3f774e7
--- /dev/null
+++ b/examples/cpp_libtorch_deprecated/main.cc
@@ -0,0 +1,54 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "silero_torch.h"
+#include "wav.h"
+
+int main(int argc, char* argv[]) {
+
+  if (argc != 4) {
+    std::cerr << "Usage : " << argv[0] << " <wav_path> <sample_rate> <threshold>" << std::endl;
+    return -1;
+  }
+
+  std::string wav_path = argv[1];
+  int sample_rate = std::atoi(argv[2]);
+  float threshold = std::atof(argv[3]);
+
+  // TorchScript model path (adjust to where your silero_vad.jit lives)
+  silero::VadIterator vad("silero_vad.jit", threshold, sample_rate);
+
+  wav::WavReader wav_reader(wav_path);
+  std::vector<float> input_wav(wav_reader.num_samples());
+
+  for (int i = 0; i < wav_reader.num_samples(); i++)
+  {
+    input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
+  }
+
+  vad.SpeechProbs(input_wav);
+
+  std::vector<silero::SpeechSegment> speeches = vad.GetSpeechTimestamps();
+  for (const auto& speech : speeches) {
+    if (vad.print_as_samples) {
+      std::cout << "{'start': " << static_cast<int>(speech.start) << ", 'end': " << static_cast<int>(speech.end) << "}" << std::endl;
+    } else {
+      std::cout << "{'start': " << speech.start << ", 'end': " << speech.end << "}" << std::endl;
+    }
+  }
+
+  return 0;
+}
diff --git a/examples/cpp_libtorch_deprecated/silero_torch.cc b/examples/cpp_libtorch_deprecated/silero_torch.cc
new file mode 100644
--- /dev/null
+++ b/examples/cpp_libtorch_deprecated/silero_torch.cc
+#include "silero_torch.h"
+
+namespace silero {
+
+  VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate,
+                           int window_size_ms, int speech_pad_ms, int min_silence_duration_ms,
+                           int min_speech_duration_ms, int max_duration_merge_ms, bool print_as_samples)
+      : threshold(threshold), sample_rate(sample_rate), window_size_ms(window_size_ms),
+        min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms),
+        print_as_samples(print_as_samples), min_silence_duration_ms(min_silence_duration_ms),
+        speech_pad_ms(speech_pad_ms) {
+    init_engine(window_size_ms);
+    init_torch_model(model_path);
+  }
+
+  VadIterator::~VadIterator() {}
+
+  void VadIterator::SpeechProbs(std::vector<float>& input_wav) {
+    // Process the waveform in chunks of window_size_samples
+    // (512 samples at 16 kHz; the sample rate must match the model's expected rate).
+    int num_samples = input_wav.size();
+    int num_chunks = num_samples / window_size_samples;
+    int remainder_samples = num_samples % window_size_samples;
+
+    total_sample_size += num_samples;
+
+    std::vector<torch::Tensor> chunks;
+
+    for (int i = 0; i < num_chunks; i++) {
+      float* chunk_start = input_wav.data() + i * window_size_samples;
+      torch::Tensor chunk = torch::from_blob(chunk_start, {1, window_size_samples}, torch::kFloat32);
+      //std::cout << "chunk size : " << chunk.sizes() << std::endl;
+      chunks.push_back(chunk);
+    }
+
+    if (remainder_samples > 0) { // last chunk with leftover samples: zero-pad up to the window size
+      int remaining_samples = num_samples - num_chunks * window_size_samples;
+      //std::cout << "Remainder size : " << remaining_samples << std::endl;
+      float* chunk_start = input_wav.data() + num_chunks * window_size_samples;
+      torch::Tensor chunk = torch::from_blob(chunk_start, {1, remaining_samples}, torch::kFloat32);
+      chunk = torch::constant_pad_nd(chunk, {0, window_size_samples - remaining_samples}, 0);
+      chunks.push_back(chunk);
+    }
+
+#ifdef USE_BATCH
+    torch::Tensor batched_chunks = torch::stack(chunks);
+#ifdef USE_GPU
+    batched_chunks = batched_chunks.to(at::kCUDA);
+#endif
+    std::vector<torch::jit::IValue> inputs;
+    inputs.push_back(batched_chunks); // batch of chunks
+    inputs.push_back(sample_rate);    // the model also takes the sample rate as input
+
+    // Run inference on the whole batch at once
+    torch::NoGradGuard no_grad;
+    torch::Tensor output = model.forward(inputs).toTensor();
+#ifdef USE_GPU
+    output = output.to(at::kCPU); // move the output back to the CPU once
+#endif
+    // Collect the output probabilities
+    for (size_t i = 0; i < chunks.size(); i++) {
+      float output_f = output[i].item<float>();
+      outputs_prob.push_back(output_f);
+      //std::cout << "Chunk " << i << " prob: " << output_f << "\n";
+    }
+#else
+    std::vector<torch::Tensor> outputs;
+    torch::Tensor batched_chunks = torch::stack(chunks);
+#ifdef USE_GPU
+    batched_chunks = batched_chunks.to(at::kCUDA);
+#endif
+    for (size_t i = 0; i < chunks.size(); i++) {
+      torch::NoGradGuard no_grad;
+      std::vector<torch::jit::IValue> inputs;
+      inputs.push_back(batched_chunks[i]);
+      inputs.push_back(sample_rate);
+
+      torch::Tensor output = model.forward(inputs).toTensor();
+      outputs.push_back(output);
+    }
+    torch::Tensor all_outputs = torch::stack(outputs);
+#ifdef USE_GPU
+    all_outputs = all_outputs.to(at::kCPU);
+#endif
+    for (size_t i = 0; i < chunks.size(); i++) {
+      float output_f = all_outputs[i].item<float>();
+      outputs_prob.push_back(output_f);
+    }
+#endif
+  }
+
+  std::vector<SpeechSegment> VadIterator::GetSpeechTimestamps() {
+    std::vector<SpeechSegment> speeches = DoVad();
+
+#ifdef USE_BATCH
+    // With batch inference it is better to merge nearby segments with mergeSpeeches();
+    // batch probabilities can be slightly distorted, so merging gives more reasonable timestamps.
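+    // For example, with the default parameters from silero_torch.h (max_duration_merge_ms = 300,
+    // sample_rate = 16000), duration_merge_samples below is 16000 * 300 / 1000 = 4800, so
+    // segments separated by fewer than 4800 samples (0.3 s) are merged into one.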
+    duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
+    std::vector<SpeechSegment> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
+    if (!print_as_samples) {
+      for (auto& speech : speeches_merge) { // samples to seconds
+        speech.start /= sample_rate;
+        speech.end /= sample_rate;
+      }
+    }
+
+    return speeches_merge;
+#else
+    if (!print_as_samples) {
+      for (auto& speech : speeches) { // samples to seconds
+        speech.start /= sample_rate;
+        speech.end /= sample_rate;
+      }
+    }
+
+    return speeches;
+#endif
+  }
+
+  void VadIterator::SetVariables() {
+    init_engine(window_size_ms);
+  }
+
+  void VadIterator::init_engine(int window_size_ms) {
+    min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
+    speech_pad_samples = sample_rate * speech_pad_ms / 1000;
+    window_size_samples = sample_rate / 1000 * window_size_ms;
+    min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
+  }
+
+  void VadIterator::init_torch_model(const std::string& model_path) {
+    at::set_num_threads(1);
+    model = torch::jit::load(model_path);
+
+#ifdef USE_GPU
+    if (!torch::cuda::is_available()) {
+      std::cout << "CUDA is not available! Please check your GPU settings" << std::endl;
+      exit(-1);
+    }
+    model.to(at::kCUDA);
+#endif
+    model.eval();
+  }
+
+  void VadIterator::reset_states() {
+    triggered = false;
+    temp_end = 0;
+    current_sample = 0;
+    outputs_prob.clear();
+  }
+
+  std::vector<SpeechSegment> VadIterator::DoVad() {
+    std::vector<SpeechSegment> speeches;
+
+    for (size_t i = 0; i < outputs_prob.size(); ++i) {
+      float speech_prob = outputs_prob[i];
+      //std::cout << "Chunk " << i << " Prob: " << speech_prob << "\n";
+      current_sample += window_size_samples;
+
+      if (speech_prob >= threshold && temp_end != 0) {
+        temp_end = 0;
+      }
+
+      if (speech_prob >= threshold && !triggered) {
+        triggered = true;
+        SpeechSegment segment;
+        segment.start = std::max(static_cast<int>(0), current_sample - speech_pad_samples - window_size_samples);
+        speeches.push_back(segment);
+        continue;
+      }
+
+      if (speech_prob < threshold - 0.15f && triggered) {
+        if (temp_end == 0) {
+          temp_end = current_sample;
+        }
+
+        if (current_sample - temp_end < min_silence_samples) {
+          continue;
+        } else {
+          SpeechSegment& segment = speeches.back();
+          segment.end = temp_end + speech_pad_samples - window_size_samples;
+          temp_end = 0;
+          triggered = false;
+        }
+      }
+    }
+
+    if (triggered) {
+      // Speech was still active when the probabilities ran out, so close the last
+      // segment at the end of the input. (If only the very last frame crossed the
+      // threshold, start and end can nearly coincide; the minimum-length filter
+      // below removes such degenerate segments.)
+      std::cout << "still triggered at the last frame; closing the final segment" << std::endl;
+      SpeechSegment& segment = speeches.back();
+      segment.end = total_sample_size;
+      triggered = false;
+    }
+
+    // Drop segments shorter than min_speech_samples (4000 samples = 0.25 s at 16 kHz).
+    // Note: speech_pad_samples is added to both start and end before measuring the length.
+    speeches.erase(
+      std::remove_if(speeches.begin(), speeches.end(),
+        [this](const SpeechSegment& speech) {
+          return ((speech.end + this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
+        }
+      ),
+      speeches.end()
+    );
+
+    return speeches;
+  }
+
+  std::vector<SpeechSegment> VadIterator::mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples) {
+    std::vector<SpeechSegment> mergedSpeeches;
+
+    if (speeches.empty()) {
+      return mergedSpeeches; // return an empty vector
+    }
+
+    // Initialize with the first segment
+    SpeechSegment currentSegment = speeches[0];
+
+    for (size_t i = 1; i < speeches.size(); ++i) { // the first segment seeds currentSegment, so start at i = 1
+      // Merge when the gap between the two segments is smaller than duration_merge_samples
+      if (speeches[i].start - currentSegment.end < duration_merge_samples) {
+        // Extend the end of the current segment
+        currentSegment.end = speeches[i].end;
+      } else {
+        // Otherwise store the current segment and start a new one
+        mergedSpeeches.push_back(currentSegment);
+        currentSegment = speeches[i];
+      }
+    }
+
+    // Append the final segment
+    mergedSpeeches.push_back(currentSegment);
+
+    return mergedSpeeches;
+  }
+
+}
diff --git a/examples/cpp_libtorch_deprecated/silero_torch.h b/examples/cpp_libtorch_deprecated/silero_torch.h
new file mode 100644
index 0000000..d8d3bc7
--- /dev/null
+++ b/examples/cpp_libtorch_deprecated/silero_torch.h
@@ -0,0 +1,75 @@
+// Author      : Nathan Lee
+// Created On  : 2024-11-18
+// Description : Silero 5.1 VAD system for TorchScript (C++).
+// Version     : 1.0
+
+#ifndef SILERO_TORCH_H
+#define SILERO_TORCH_H
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <torch/script.h>
+#include <torch/torch.h>
+
+namespace silero {
+
+  struct SpeechSegment {
+    int start;
+    int end;
+  };
+
+  class VadIterator {
+   public:
+    VadIterator(const std::string &model_path, float threshold = 0.5, int sample_rate = 16000,
+                int window_size_ms = 32, int speech_pad_ms = 30, int min_silence_duration_ms = 100,
+                int min_speech_duration_ms = 250, int max_duration_merge_ms = 300, bool print_as_samples = false);
+    ~VadIterator();
+
+    void SpeechProbs(std::vector<float>& input_wav);
+    std::vector<SpeechSegment> GetSpeechTimestamps();
+    void SetVariables();
+
+    float threshold;
+    int sample_rate;
+    int window_size_ms;
+    int min_speech_duration_ms;
+    int max_duration_merge_ms;
+    bool print_as_samples;
+
+   private:
+    torch::jit::script::Module model;
+    std::vector<float> outputs_prob;
+    int min_silence_samples;
+    int min_speech_samples;
+    int speech_pad_samples;
+    int window_size_samples;
+    int duration_merge_samples;
+    int current_sample = 0;
+
+    int total_sample_size = 0;
+
+    int min_silence_duration_ms;
+    int speech_pad_ms;
+    bool triggered = false;
+    int temp_end = 0;
+
+    void init_engine(int window_size_ms);
+    void init_torch_model(const std::string& model_path);
+    void reset_states();
+    std::vector<SpeechSegment> DoVad();
+    std::vector<SpeechSegment> mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples);
+  };
+
+}
+#endif // SILERO_TORCH_H
diff --git a/examples/cpp_libtorch_deprecated/wav.h b/examples/cpp_libtorch_deprecated/wav.h
new file mode 100644
index 0000000..249d7e3
--- /dev/null
+++ b/examples/cpp_libtorch_deprecated/wav.h
@@ -0,0 +1,235 @@
+// Copyright (c) 2016 Personal (Binbin Zhang)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
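+
+// Minimal WAV reader/writer, apparently adapted from WeNet's frontend wav.h.
+// WavReader normalizes PCM samples to floats in [-1, 1], which is the range
+// the Silero model expects.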
+
+#ifndef FRONTEND_WAV_H_
+#define FRONTEND_WAV_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+
+// #include "utils/log.h"
+
+namespace wav {
+
+struct WavHeader {
+  char riff[4];  // "RIFF"
+  unsigned int size;
+  char wav[4];  // "WAVE"
+  char fmt[4];  // "fmt "
+  unsigned int fmt_size;
+  uint16_t format;
+  uint16_t channels;
+  unsigned int sample_rate;
+  unsigned int bytes_per_second;
+  uint16_t block_size;
+  uint16_t bit;
+  char data[4];  // "data"
+  unsigned int data_size;
+};
+
+class WavReader {
+ public:
+  WavReader() : data_(nullptr) {}
+  explicit WavReader(const std::string& filename) { Open(filename); }
+
+  bool Open(const std::string& filename) {
+    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for binary reading
+    if (NULL == fp) {
+      std::cout << "Error in reading " << filename << std::endl;
+      return false;
+    }
+
+    WavHeader header;
+    fread(&header, 1, sizeof(header), fp);
+    if (header.fmt_size < 16) {
+      printf("WaveData: expect PCM format data "
+             "to have fmt chunk of at least size 16.\n");
+      return false;
+    } else if (header.fmt_size > 16) {
+      int offset = 44 - 8 + header.fmt_size - 16;
+      fseek(fp, offset, SEEK_SET);
+      fread(header.data, 8, sizeof(char), fp);
+    }
+    // check "riff" "WAVE" "fmt " "data"
+
+    // Skip any sub-chunks between "fmt" and "data". Usually there will
+    // be a single "fact" sub chunk, but on Windows there can also be a
+    // "list" sub chunk.
+    while (0 != strncmp(header.data, "data", 4)) {
+      // We will just ignore the data in these chunks.
+      fseek(fp, header.data_size, SEEK_CUR);
+      // read next sub chunk
+      fread(header.data, 8, sizeof(char), fp);
+    }
+
+    if (header.data_size == 0) {
+      int offset = ftell(fp);
+      fseek(fp, 0, SEEK_END);
+      header.data_size = ftell(fp) - offset;
+      fseek(fp, offset, SEEK_SET);
+    }
+
+    num_channel_ = header.channels;
+    sample_rate_ = header.sample_rate;
+    bits_per_sample_ = header.bit;
+    int num_data = header.data_size / (bits_per_sample_ / 8);
+    data_ = new float[num_data];  // 1-dim array holding all channels interleaved
+    num_samples_ = num_data / num_channel_;
+
+    std::cout << "num_channel_    :" << num_channel_ << std::endl;
+    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
+    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
+    std::cout << "num_samples     :" << num_data << std::endl;
+    std::cout << "num_data_size   :" << header.data_size << std::endl;
+
+    switch (bits_per_sample_) {
+      case 8: {
+        uint8_t sample;
+        for (int i = 0; i < num_data; ++i) {
+          fread(&sample, 1, sizeof(uint8_t), fp);
+          // 8-bit WAV data is unsigned with a 128 offset; normalize to [-1, 1]
+          data_[i] = (static_cast<float>(sample) - 128) / 128;
+        }
+        break;
+      }
+      case 16: {
+        int16_t sample;
+        for (int i = 0; i < num_data; ++i) {
+          fread(&sample, 1, sizeof(int16_t), fp);
+          data_[i] = static_cast<float>(sample) / 32768;
+        }
+        break;
+      }
+      case 32: {
+        if (header.format == 1) {  // S32 PCM
+          int sample;
+          for (int i = 0; i < num_data; ++i) {
+            fread(&sample, 1, sizeof(int), fp);
+            data_[i] = static_cast<float>(sample) / 2147483648.0f;
+          }
+        } else if (header.format == 3) {  // IEEE float, already in [-1, 1]
+          float sample;
+          for (int i = 0; i < num_data; ++i) {
+            fread(&sample, 1, sizeof(float), fp);
+            data_[i] = sample;
+          }
+        } else {
+          printf("unsupported quantization bits\n");
+        }
+        break;
+      }
+      default:
+        printf("unsupported quantization bits\n");
+        break;
+    }
+
+    fclose(fp);
+    return true;
+  }
+
+  int num_channel() const { return num_channel_; }
+  int sample_rate() const { return sample_rate_; }
+  int bits_per_sample() const { return bits_per_sample_; }
+  int num_samples() const { return num_samples_; }
+
+  ~WavReader() {
+    delete[] data_;
+  }
+
+  const float* data() const { return data_; }
+
+ private:
+  int num_channel_;
+  int sample_rate_;
+  int bits_per_sample_;
+  int num_samples_;  // sample points per channel
+  float* data_;
+};
+
+class WavWriter {
+ public:
+  WavWriter(const float* data, int num_samples, int num_channel,
+            int sample_rate, int bits_per_sample)
+      : data_(data),
+        num_samples_(num_samples),
+        num_channel_(num_channel),
+        sample_rate_(sample_rate),
+        bits_per_sample_(bits_per_sample) {}
+
+  void Write(const std::string& filename) {
+    FILE* fp = fopen(filename.c_str(), "wb");  // binary mode so header bytes are written verbatim
+    // init char 'RIFF' 'WAVE' 'fmt ' 'data'
+    WavHeader header;
+    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
+                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
+                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
+    memcpy(&header, wav_header, sizeof(header));
+    header.channels = num_channel_;
+    header.bit = bits_per_sample_;
+    header.sample_rate = sample_rate_;
+    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
+    header.size = sizeof(header) - 8 + header.data_size;
+    header.bytes_per_second =
+        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
+    header.block_size = num_channel_ * (bits_per_sample_ / 8);
+
+    fwrite(&header, 1, sizeof(header), fp);
+
+    for (int i = 0; i < num_samples_; ++i) {
+      for (int j = 0; j < num_channel_; ++j) {
+        switch (bits_per_sample_) {
+          case 8: {
+            char sample = static_cast<char>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+          case 16: {
+            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+          case 32: {
+            int sample = static_cast<int>(data_[i * num_channel_ + j]);
+            fwrite(&sample, 1, sizeof(sample), fp);
+            break;
+          }
+        }
+      }
+    }
+    fclose(fp);
+  }
+
+ private:
+  const float* data_;
+  int num_samples_;  // total float points in data_
+  int num_channel_;
+  int sample_rate_;
+  int bits_per_sample_;
+};
+
+}  // namespace wav
+
+#endif  // FRONTEND_WAV_H_