13 Commits

Author SHA1 Message Date
Dimitrii Voronin
e8850d2b9b Create python-publish.yml 2024-07-09 12:41:49 +03:00
Dimitrii Voronin
3888946c0c Merge pull request #489 from streamer45/update-golang-example
Update Golang example to support model v5
2024-07-08 09:03:12 +03:00
streamer45
24f51645d0 Update to support model v5 2024-07-08 07:43:42 +02:00
Dimitrii Voronin
fdbb0a3a81 Merge pull request #482 from filtercodes/v5_cpp_support
cpp example
2024-07-01 19:17:44 +03:00
Stefan Miletic
60ae7abfb7 v5 model cpp example 2024-07-01 15:32:40 +01:00
Stefan Miletic
0b3d43d432 cpp example v5 model 2024-07-01 15:04:48 +01:00
Dimitrii Voronin
a395853982 Merge pull request #475 from eltociear/patch-1
Update microphone_and_webRTC_integration.py
2024-07-01 12:09:08 +03:00
Dimitrii Voronin
78958b6fb6 Merge pull request #481 from snakers4/adamnsandle
Adamnsandle
2024-07-01 12:02:50 +03:00
adamnsandle
902cfc9248 fx dtype bug 2024-07-01 09:00:59 +00:00
adamnsandle
89e66a3474 Merge branch 'master' of github.com:snakers4/silero-vad into adamnsandle 2024-07-01 08:54:27 +00:00
Alexander Veysov
a3bdebed16 Update README.md 2024-07-01 10:21:20 +03:00
Ikko Eltociear Ashimine
4bdcf31d17 Update microphone_and_webRTC_integration.py
nubmer -> number
2024-06-30 02:10:59 +09:00
adamnsandle
136cdcdf5b tst 2024-06-28 14:13:18 +00:00
8 changed files with 68 additions and 31 deletions

40
.github/workflows/python-publish.yml vendored Normal file
View File

@@ -0,0 +1,40 @@
# This workflow builds the Python package with `python -m build` and publishes
# it through pypa/gh-action-pypi-publish whenever a tag is pushed.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

# Runs on every pushed tag (any tag name matches '*').
on:
  push:
    tags:
      - '*'

# The job only needs to read the repository contents.
permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        # v5 runs on a supported Node runtime; v3 relied on the deprecated Node 16.
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build package
        run: python -m build
      - name: Publish package
        # Third-party action referenced by full commit SHA rather than a tag.
        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
        with:
          # API-token authentication: the user name is the literal `__token__`
          # and the token itself is read from the PYPI_API_TOKEN repository secret.
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}

View File

@@ -121,4 +121,6 @@ Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for releva
- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp) - Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web - Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) examples

View File

@@ -120,8 +120,7 @@ private:
void reset_states() void reset_states()
{ {
// Call reset before each audio start // Call reset before each audio start
std::memset(_h.data(), 0.0f, _h.size() * sizeof(float)); std::memset(_state.data(), 0.0f, _state.size() * sizeof(float));
std::memset(_c.data(), 0.0f, _c.size() * sizeof(float));
triggered = false; triggered = false;
temp_end = 0; temp_end = 0;
current_sample = 0; current_sample = 0;
@@ -139,19 +138,16 @@ private:
input.assign(data.begin(), data.end()); input.assign(data.begin(), data.end());
Ort::Value input_ort = Ort::Value::CreateTensor<float>( Ort::Value input_ort = Ort::Value::CreateTensor<float>(
memory_info, input.data(), input.size(), input_node_dims, 2); memory_info, input.data(), input.size(), input_node_dims, 2);
Ort::Value state_ort = Ort::Value::CreateTensor<float>(
memory_info, _state.data(), _state.size(), state_node_dims, 3);
Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>( Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
memory_info, sr.data(), sr.size(), sr_node_dims, 1); memory_info, sr.data(), sr.size(), sr_node_dims, 1);
Ort::Value h_ort = Ort::Value::CreateTensor<float>(
memory_info, _h.data(), _h.size(), hc_node_dims, 3);
Ort::Value c_ort = Ort::Value::CreateTensor<float>(
memory_info, _c.data(), _c.size(), hc_node_dims, 3);
// Clear and add inputs // Clear and add inputs
ort_inputs.clear(); ort_inputs.clear();
ort_inputs.emplace_back(std::move(input_ort)); ort_inputs.emplace_back(std::move(input_ort));
ort_inputs.emplace_back(std::move(state_ort));
ort_inputs.emplace_back(std::move(sr_ort)); ort_inputs.emplace_back(std::move(sr_ort));
ort_inputs.emplace_back(std::move(h_ort));
ort_inputs.emplace_back(std::move(c_ort));
// Infer // Infer
ort_outputs = session->Run( ort_outputs = session->Run(
@@ -161,10 +157,8 @@ private:
// Output probability & update h,c recursively // Output probability & update h,c recursively
float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0]; float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
float *hn = ort_outputs[1].GetTensorMutableData<float>(); float *stateN = ort_outputs[1].GetTensorMutableData<float>();
std::memcpy(_h.data(), hn, size_hc * sizeof(float)); std::memcpy(_state.data(), stateN, size_state * sizeof(float));
float *cn = ort_outputs[2].GetTensorMutableData<float>();
std::memcpy(_c.data(), cn, size_hc * sizeof(float));
// Push forward sample index // Push forward sample index
current_sample += window_size_samples; current_sample += window_size_samples;
@@ -376,27 +370,26 @@ private:
// Inputs // Inputs
std::vector<Ort::Value> ort_inputs; std::vector<Ort::Value> ort_inputs;
std::vector<const char *> input_node_names = {"input", "sr", "h", "c"}; std::vector<const char *> input_node_names = {"input", "state", "sr"};
std::vector<float> input; std::vector<float> input;
unsigned int size_state = 2 * 1 * 128; // It's FIXED.
std::vector<float> _state;
std::vector<int64_t> sr; std::vector<int64_t> sr;
unsigned int size_hc = 2 * 1 * 64; // It's FIXED.
std::vector<float> _h;
std::vector<float> _c;
int64_t input_node_dims[2] = {}; int64_t input_node_dims[2] = {};
const int64_t state_node_dims[3] = {2, 1, 128};
const int64_t sr_node_dims[1] = {1}; const int64_t sr_node_dims[1] = {1};
const int64_t hc_node_dims[3] = {2, 1, 64};
// Outputs // Outputs
std::vector<Ort::Value> ort_outputs; std::vector<Ort::Value> ort_outputs;
std::vector<const char *> output_node_names = {"output", "hn", "cn"}; std::vector<const char *> output_node_names = {"output", "stateN"};
public: public:
// Construction // Construction
VadIterator(const std::wstring ModelPath, VadIterator(const std::wstring ModelPath,
int Sample_rate = 16000, int windows_frame_size = 64, int Sample_rate = 16000, int windows_frame_size = 32,
float Threshold = 0.5, int min_silence_duration_ms = 0, float Threshold = 0.5, int min_silence_duration_ms = 0,
int speech_pad_ms = 64, int min_speech_duration_ms = 64, int speech_pad_ms = 32, int min_speech_duration_ms = 32,
float max_speech_duration_s = std::numeric_limits<float>::infinity()) float max_speech_duration_s = std::numeric_limits<float>::infinity())
{ {
init_onnx_model(ModelPath); init_onnx_model(ModelPath);
@@ -422,8 +415,7 @@ public:
input_node_dims[0] = 1; input_node_dims[0] = 1;
input_node_dims[1] = window_size_samples; input_node_dims[1] = window_size_samples;
_h.resize(size_hc); _state.resize(size_state);
_c.resize(size_hc);
sr.resize(1); sr.resize(1);
sr[0] = sample_rate; sr[0] = sample_rate;
}; };

View File

@@ -13,7 +13,6 @@ func main() {
sd, err := speech.NewDetector(speech.DetectorConfig{ sd, err := speech.NewDetector(speech.DetectorConfig{
ModelPath: "../../files/silero_vad.onnx", ModelPath: "../../files/silero_vad.onnx",
SampleRate: 16000, SampleRate: 16000,
WindowSize: 1536,
Threshold: 0.5, Threshold: 0.5,
MinSilenceDurationMs: 0, MinSilenceDurationMs: 0,
SpeechPadMs: 0, SpeechPadMs: 0,
@@ -22,6 +21,10 @@ func main() {
log.Fatalf("failed to create speech detector: %s", err) log.Fatalf("failed to create speech detector: %s", err)
} }
if len(os.Args) != 2 {
log.Fatalf("invalid arguments provided: expecting one file path")
}
f, err := os.Open(os.Args[1]) f, err := os.Open(os.Args[1])
if err != nil { if err != nil {
log.Fatalf("failed to open sample audio file: %s", err) log.Fatalf("failed to open sample audio file: %s", err)

View File

@@ -4,7 +4,7 @@ go 1.21.4
require ( require (
github.com/go-audio/wav v1.1.0 github.com/go-audio/wav v1.1.0
github.com/streamer45/silero-vad-go v0.1.0 github.com/streamer45/silero-vad-go v0.2.0
) )
require ( require (

View File

@@ -8,8 +8,8 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4= github.com/streamer45/silero-vad-go v0.2.0 h1:bbRTa6cQuc7VI88y0qicx375UyWoxE6wlVOF+mUg0+g=
github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs= github.com/streamer45/silero-vad-go v0.2.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=

View File

@@ -186,7 +186,7 @@ if __name__ == '__main__':
help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)") help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
parser.add_argument('-N', '--num_steps', type=int, default=8, parser.add_argument('-N', '--num_steps', type=int, default=8,
help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)") help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")
parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000, parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)") help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
@@ -198,4 +198,4 @@ if __name__ == '__main__':
help=" minimum silence duration in samples between to separate speech chunks") help=" minimum silence duration in samples between to separate speech chunks")
ARGS = parser.parse_args() ARGS = parser.parse_args()
ARGS.rate=DEFAULT_SAMPLE_RATE ARGS.rate=DEFAULT_SAMPLE_RATE
main(ARGS) main(ARGS)

View File

@@ -72,7 +72,7 @@ class OnnxWrapper():
x = torch.cat([self._context, x], dim=1) x = torch.cat([self._context, x], dim=1)
if sr in [8000, 16000]: if sr in [8000, 16000]:
ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr)} ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
ort_outs = self.session.run(None, ort_inputs) ort_outs = self.session.run(None, ort_inputs)
out, state = ort_outs out, state = ort_outs
self._state = torch.from_numpy(state) self._state = torch.from_numpy(state)