Create python-publish.yml

Merge pull request #489 from streamer45/update-golang-example
Update Golang example to support model v5
2026-02-04 17:39:22 +08:00 · 2024-07-09 12:41:49 +03:00 · 2024-07-08 09:03:12 +03:00 · 2024-07-08 07:43:42 +02:00 · 2024-07-01 19:17:44 +03:00 · 2024-07-01 15:32:40 +01:00
8 changed files with 68 additions and 31 deletions
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,40 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  push:
+    tags:        
+      - '*'   
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
--- a/README.md
+++ b/README.md
@@ -121,4 +121,6 @@ Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for releva

 - Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp) 

- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web 
+- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
+
+- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) examples
--- a/examples/cpp/silero-vad-onnx.cpp
+++ b/examples/cpp/silero-vad-onnx.cpp
@@ -120,8 +120,7 @@ private:
    void reset_states()
    {
        // Call reset before each audio start
-        std::memset(_h.data(), 0.0f, _h.size() * sizeof(float));
-        std::memset(_c.data(), 0.0f, _c.size() * sizeof(float));
+        std::memset(_state.data(), 0.0f, _state.size() * sizeof(float));
        triggered = false;
        temp_end = 0;
        current_sample = 0;
@@ -139,19 +138,16 @@ private:
        input.assign(data.begin(), data.end());
        Ort::Value input_ort = Ort::Value::CreateTensor<float>(
            memory_info, input.data(), input.size(), input_node_dims, 2);
+        Ort::Value state_ort = Ort::Value::CreateTensor<float>(
+            memory_info, _state.data(), _state.size(), state_node_dims, 3);
        Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
            memory_info, sr.data(), sr.size(), sr_node_dims, 1);
-        Ort::Value h_ort = Ort::Value::CreateTensor<float>(
-            memory_info, _h.data(), _h.size(), hc_node_dims, 3);
-        Ort::Value c_ort = Ort::Value::CreateTensor<float>(
-            memory_info, _c.data(), _c.size(), hc_node_dims, 3);

        // Clear and add inputs
        ort_inputs.clear();
        ort_inputs.emplace_back(std::move(input_ort));
+        ort_inputs.emplace_back(std::move(state_ort));
        ort_inputs.emplace_back(std::move(sr_ort));
-        ort_inputs.emplace_back(std::move(h_ort));
-        ort_inputs.emplace_back(std::move(c_ort));

        // Infer
        ort_outputs = session->Run(
@@ -161,10 +157,8 @@ private:

        // Output probability & update the recurrent state for the next window
        float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
-        float *hn = ort_outputs[1].GetTensorMutableData<float>();
-        std::memcpy(_h.data(), hn, size_hc * sizeof(float));
-        float *cn = ort_outputs[2].GetTensorMutableData<float>();
-        std::memcpy(_c.data(), cn, size_hc * sizeof(float));
+        float *stateN = ort_outputs[1].GetTensorMutableData<float>();
+        std::memcpy(_state.data(), stateN, size_state * sizeof(float));

        // Push forward sample index
        current_sample += window_size_samples;
@@ -376,27 +370,26 @@ private:
    // Inputs
    std::vector<Ort::Value> ort_inputs;
    
-    std::vector<const char *> input_node_names = {"input", "sr", "h", "c"};
+    std::vector<const char *> input_node_names = {"input", "state", "sr"};
    std::vector<float> input;
+    unsigned int size_state = 2 * 1 * 128; // It's FIXED.
+    std::vector<float> _state;
    std::vector<int64_t> sr;
-    unsigned int size_hc = 2 * 1 * 64; // It's FIXED.
-    std::vector<float> _h;
-    std::vector<float> _c;

-    int64_t input_node_dims[2] = {}; 
+    int64_t input_node_dims[2] = {};
+    const int64_t state_node_dims[3] = {2, 1, 128}; 
    const int64_t sr_node_dims[1] = {1};
-    const int64_t hc_node_dims[3] = {2, 1, 64};

    // Outputs
    std::vector<Ort::Value> ort_outputs;
-    std::vector<const char *> output_node_names = {"output", "hn", "cn"};
+    std::vector<const char *> output_node_names = {"output", "stateN"};

 public:
    // Construction
    VadIterator(const std::wstring ModelPath,
-        int Sample_rate = 16000, int windows_frame_size = 64,
+        int Sample_rate = 16000, int windows_frame_size = 32,
        float Threshold = 0.5, int min_silence_duration_ms = 0,
-        int speech_pad_ms = 64, int min_speech_duration_ms = 64,
+        int speech_pad_ms = 32, int min_speech_duration_ms = 32,
        float max_speech_duration_s = std::numeric_limits<float>::infinity())
    {
        init_onnx_model(ModelPath);
@@ -422,8 +415,7 @@ public:
        input_node_dims[0] = 1;
        input_node_dims[1] = window_size_samples;

-        _h.resize(size_hc);
-        _c.resize(size_hc);
+        _state.resize(size_state);
        sr.resize(1);
        sr[0] = sample_rate;
    };
--- a/examples/go/cmd/main.go
+++ b/examples/go/cmd/main.go
@@ -13,7 +13,6 @@ func main() {
 	sd, err := speech.NewDetector(speech.DetectorConfig{
 		ModelPath:            "../../files/silero_vad.onnx",
 		SampleRate:           16000,
-		WindowSize:           1536,
 		Threshold:            0.5,
 		MinSilenceDurationMs: 0,
 		SpeechPadMs:          0,
@@ -22,6 +21,10 @@ func main() {
 		log.Fatalf("failed to create speech detector: %s", err)
 	}

+	if len(os.Args) != 2 {
+		log.Fatalf("invalid arguments provided: expecting one file path")
+	}
+
 	f, err := os.Open(os.Args[1])
 	if err != nil {
 		log.Fatalf("failed to open sample audio file: %s", err)
--- a/examples/go/go.mod
+++ b/examples/go/go.mod
@@ -4,7 +4,7 @@ go 1.21.4

 require (
 	github.com/go-audio/wav v1.1.0
-	github.com/streamer45/silero-vad-go v0.1.0
+	github.com/streamer45/silero-vad-go v0.2.0
 )

 require (
--- a/examples/go/go.sum
+++ b/examples/go/go.sum
@@ -8,8 +8,8 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
-github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
+github.com/streamer45/silero-vad-go v0.2.0 h1:bbRTa6cQuc7VI88y0qicx375UyWoxE6wlVOF+mUg0+g=
+github.com/streamer45/silero-vad-go v0.2.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
 github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
 github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
--- a/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py
+++ b/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py
@@ -186,7 +186,7 @@ if __name__ == '__main__':
                        help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")

    parser.add_argument('-N', '--num_steps', type=int, default=8,
-                        help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
+                        help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")

    parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
                        help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
@@ -198,4 +198,4 @@ if __name__ == '__main__':
                        help=" minimum silence duration in samples between to separate speech chunks")
    ARGS = parser.parse_args()
    ARGS.rate=DEFAULT_SAMPLE_RATE
-    main(ARGS)
+    main(ARGS)
--- a/utils_vad.py
+++ b/utils_vad.py
@@ -72,7 +72,7 @@ class OnnxWrapper():

        x = torch.cat([self._context, x], dim=1)
        if sr in [8000, 16000]:
-            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr)}
+            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = torch.from_numpy(state)
Author	SHA1	Message	Date
Dimitrii Voronin	e8850d2b9b	Create python-publish.yml	2024-07-09 12:41:49 +03:00
Dimitrii Voronin	3888946c0c	Merge pull request #489 from streamer45/update-golang-example Update Golang example to support model v5	2024-07-08 09:03:12 +03:00
streamer45	24f51645d0	Update to support model v5	2024-07-08 07:43:42 +02:00
Dimitrii Voronin	fdbb0a3a81	Merge pull request #482 from filtercodes/v5_cpp_support cpp example	2024-07-01 19:17:44 +03:00
Stefan Miletic	60ae7abfb7	v5 model cpp example	2024-07-01 15:32:40 +01:00
Stefan Miletic	0b3d43d432	cpp example v5 model	2024-07-01 15:04:48 +01:00
Dimitrii Voronin	a395853982	Merge pull request #475 from eltociear/patch-1 Update microphone_and_webRTC_integration.py	2024-07-01 12:09:08 +03:00
Dimitrii Voronin	78958b6fb6	Merge pull request #481 from snakers4/adamnsandle Adamnsandle	2024-07-01 12:02:50 +03:00
adamnsandle	902cfc9248	fx dtype bug	2024-07-01 09:00:59 +00:00
adamnsandle	89e66a3474	Merge branch 'master' of github.com:snakers4/silero-vad into adamnsandle	2024-07-01 08:54:27 +00:00
Alexander Veysov	a3bdebed16	Update README.md	2024-07-01 10:21:20 +03:00
Ikko Eltociear Ashimine	4bdcf31d17	Update microphone_and_webRTC_integration.py nubmer -> number	2024-06-30 02:10:59 +09:00
adamnsandle	136cdcdf5b	tst	2024-06-28 14:13:18 +00:00