mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 01:49:22 +08:00
Compare commits
13 Commits
v5.0
...
adamnsandl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e8850d2b9b | ||
|
|
3888946c0c | ||
|
|
24f51645d0 | ||
|
|
fdbb0a3a81 | ||
|
|
60ae7abfb7 | ||
|
|
0b3d43d432 | ||
|
|
a395853982 | ||
|
|
78958b6fb6 | ||
|
|
902cfc9248 | ||
|
|
89e66a3474 | ||
|
|
a3bdebed16 | ||
|
|
4bdcf31d17 | ||
|
|
136cdcdf5b |
40
.github/workflows/python-publish.yml
vendored
Normal file
40
.github/workflows/python-publish.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# This workflow will upload a Python Package using Twine when a release is created
|
||||||
|
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
||||||
|
|
||||||
|
# This workflow uses actions that are not certified by GitHub.
|
||||||
|
# They are provided by a third-party and are governed by
|
||||||
|
# separate terms of service, privacy policy, and support
|
||||||
|
# documentation.
|
||||||
|
|
||||||
|
name: Upload Python Package
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- '*'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
deploy:
|
||||||
|
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v3
|
||||||
|
with:
|
||||||
|
python-version: '3.x'
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install build
|
||||||
|
- name: Build package
|
||||||
|
run: python -m build
|
||||||
|
- name: Publish package
|
||||||
|
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
|
||||||
|
with:
|
||||||
|
user: __token__
|
||||||
|
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||||
@@ -121,4 +121,6 @@ Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for releva
|
|||||||
|
|
||||||
- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
|
- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
|
||||||
|
|
||||||
- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
|
- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
|
||||||
|
|
||||||
|
- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) examples
|
||||||
|
|||||||
@@ -120,8 +120,7 @@ private:
|
|||||||
void reset_states()
|
void reset_states()
|
||||||
{
|
{
|
||||||
// Call reset before each audio start
|
// Call reset before each audio start
|
||||||
std::memset(_h.data(), 0.0f, _h.size() * sizeof(float));
|
std::memset(_state.data(), 0.0f, _state.size() * sizeof(float));
|
||||||
std::memset(_c.data(), 0.0f, _c.size() * sizeof(float));
|
|
||||||
triggered = false;
|
triggered = false;
|
||||||
temp_end = 0;
|
temp_end = 0;
|
||||||
current_sample = 0;
|
current_sample = 0;
|
||||||
@@ -139,19 +138,16 @@ private:
|
|||||||
input.assign(data.begin(), data.end());
|
input.assign(data.begin(), data.end());
|
||||||
Ort::Value input_ort = Ort::Value::CreateTensor<float>(
|
Ort::Value input_ort = Ort::Value::CreateTensor<float>(
|
||||||
memory_info, input.data(), input.size(), input_node_dims, 2);
|
memory_info, input.data(), input.size(), input_node_dims, 2);
|
||||||
|
Ort::Value state_ort = Ort::Value::CreateTensor<float>(
|
||||||
|
memory_info, _state.data(), _state.size(), state_node_dims, 3);
|
||||||
Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
|
Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
|
||||||
memory_info, sr.data(), sr.size(), sr_node_dims, 1);
|
memory_info, sr.data(), sr.size(), sr_node_dims, 1);
|
||||||
Ort::Value h_ort = Ort::Value::CreateTensor<float>(
|
|
||||||
memory_info, _h.data(), _h.size(), hc_node_dims, 3);
|
|
||||||
Ort::Value c_ort = Ort::Value::CreateTensor<float>(
|
|
||||||
memory_info, _c.data(), _c.size(), hc_node_dims, 3);
|
|
||||||
|
|
||||||
// Clear and add inputs
|
// Clear and add inputs
|
||||||
ort_inputs.clear();
|
ort_inputs.clear();
|
||||||
ort_inputs.emplace_back(std::move(input_ort));
|
ort_inputs.emplace_back(std::move(input_ort));
|
||||||
|
ort_inputs.emplace_back(std::move(state_ort));
|
||||||
ort_inputs.emplace_back(std::move(sr_ort));
|
ort_inputs.emplace_back(std::move(sr_ort));
|
||||||
ort_inputs.emplace_back(std::move(h_ort));
|
|
||||||
ort_inputs.emplace_back(std::move(c_ort));
|
|
||||||
|
|
||||||
// Infer
|
// Infer
|
||||||
ort_outputs = session->Run(
|
ort_outputs = session->Run(
|
||||||
@@ -161,10 +157,8 @@ private:
|
|||||||
|
|
||||||
// Output probability & update h,c recursively
|
// Output probability & update h,c recursively
|
||||||
float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
|
float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
|
||||||
float *hn = ort_outputs[1].GetTensorMutableData<float>();
|
float *stateN = ort_outputs[1].GetTensorMutableData<float>();
|
||||||
std::memcpy(_h.data(), hn, size_hc * sizeof(float));
|
std::memcpy(_state.data(), stateN, size_state * sizeof(float));
|
||||||
float *cn = ort_outputs[2].GetTensorMutableData<float>();
|
|
||||||
std::memcpy(_c.data(), cn, size_hc * sizeof(float));
|
|
||||||
|
|
||||||
// Push forward sample index
|
// Push forward sample index
|
||||||
current_sample += window_size_samples;
|
current_sample += window_size_samples;
|
||||||
@@ -376,27 +370,26 @@ private:
|
|||||||
// Inputs
|
// Inputs
|
||||||
std::vector<Ort::Value> ort_inputs;
|
std::vector<Ort::Value> ort_inputs;
|
||||||
|
|
||||||
std::vector<const char *> input_node_names = {"input", "sr", "h", "c"};
|
std::vector<const char *> input_node_names = {"input", "state", "sr"};
|
||||||
std::vector<float> input;
|
std::vector<float> input;
|
||||||
|
unsigned int size_state = 2 * 1 * 128; // It's FIXED.
|
||||||
|
std::vector<float> _state;
|
||||||
std::vector<int64_t> sr;
|
std::vector<int64_t> sr;
|
||||||
unsigned int size_hc = 2 * 1 * 64; // It's FIXED.
|
|
||||||
std::vector<float> _h;
|
|
||||||
std::vector<float> _c;
|
|
||||||
|
|
||||||
int64_t input_node_dims[2] = {};
|
int64_t input_node_dims[2] = {};
|
||||||
|
const int64_t state_node_dims[3] = {2, 1, 128};
|
||||||
const int64_t sr_node_dims[1] = {1};
|
const int64_t sr_node_dims[1] = {1};
|
||||||
const int64_t hc_node_dims[3] = {2, 1, 64};
|
|
||||||
|
|
||||||
// Outputs
|
// Outputs
|
||||||
std::vector<Ort::Value> ort_outputs;
|
std::vector<Ort::Value> ort_outputs;
|
||||||
std::vector<const char *> output_node_names = {"output", "hn", "cn"};
|
std::vector<const char *> output_node_names = {"output", "stateN"};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Construction
|
// Construction
|
||||||
VadIterator(const std::wstring ModelPath,
|
VadIterator(const std::wstring ModelPath,
|
||||||
int Sample_rate = 16000, int windows_frame_size = 64,
|
int Sample_rate = 16000, int windows_frame_size = 32,
|
||||||
float Threshold = 0.5, int min_silence_duration_ms = 0,
|
float Threshold = 0.5, int min_silence_duration_ms = 0,
|
||||||
int speech_pad_ms = 64, int min_speech_duration_ms = 64,
|
int speech_pad_ms = 32, int min_speech_duration_ms = 32,
|
||||||
float max_speech_duration_s = std::numeric_limits<float>::infinity())
|
float max_speech_duration_s = std::numeric_limits<float>::infinity())
|
||||||
{
|
{
|
||||||
init_onnx_model(ModelPath);
|
init_onnx_model(ModelPath);
|
||||||
@@ -422,8 +415,7 @@ public:
|
|||||||
input_node_dims[0] = 1;
|
input_node_dims[0] = 1;
|
||||||
input_node_dims[1] = window_size_samples;
|
input_node_dims[1] = window_size_samples;
|
||||||
|
|
||||||
_h.resize(size_hc);
|
_state.resize(size_state);
|
||||||
_c.resize(size_hc);
|
|
||||||
sr.resize(1);
|
sr.resize(1);
|
||||||
sr[0] = sample_rate;
|
sr[0] = sample_rate;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ func main() {
|
|||||||
sd, err := speech.NewDetector(speech.DetectorConfig{
|
sd, err := speech.NewDetector(speech.DetectorConfig{
|
||||||
ModelPath: "../../files/silero_vad.onnx",
|
ModelPath: "../../files/silero_vad.onnx",
|
||||||
SampleRate: 16000,
|
SampleRate: 16000,
|
||||||
WindowSize: 1536,
|
|
||||||
Threshold: 0.5,
|
Threshold: 0.5,
|
||||||
MinSilenceDurationMs: 0,
|
MinSilenceDurationMs: 0,
|
||||||
SpeechPadMs: 0,
|
SpeechPadMs: 0,
|
||||||
@@ -22,6 +21,10 @@ func main() {
|
|||||||
log.Fatalf("failed to create speech detector: %s", err)
|
log.Fatalf("failed to create speech detector: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(os.Args) != 2 {
|
||||||
|
log.Fatalf("invalid arguments provided: expecting one file path")
|
||||||
|
}
|
||||||
|
|
||||||
f, err := os.Open(os.Args[1])
|
f, err := os.Open(os.Args[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("failed to open sample audio file: %s", err)
|
log.Fatalf("failed to open sample audio file: %s", err)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ go 1.21.4
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-audio/wav v1.1.0
|
github.com/go-audio/wav v1.1.0
|
||||||
github.com/streamer45/silero-vad-go v0.1.0
|
github.com/streamer45/silero-vad-go v0.2.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
|
|||||||
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
|
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
|
github.com/streamer45/silero-vad-go v0.2.0 h1:bbRTa6cQuc7VI88y0qicx375UyWoxE6wlVOF+mUg0+g=
|
||||||
github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
|
github.com/streamer45/silero-vad-go v0.2.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
|
||||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
|||||||
@@ -186,7 +186,7 @@ if __name__ == '__main__':
|
|||||||
help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
|
help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
|
||||||
|
|
||||||
parser.add_argument('-N', '--num_steps', type=int, default=8,
|
parser.add_argument('-N', '--num_steps', type=int, default=8,
|
||||||
help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
|
help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")
|
||||||
|
|
||||||
parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
|
parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
|
||||||
help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
|
help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
|
||||||
@@ -198,4 +198,4 @@ if __name__ == '__main__':
|
|||||||
help=" minimum silence duration in samples between to separate speech chunks")
|
help=" minimum silence duration in samples between to separate speech chunks")
|
||||||
ARGS = parser.parse_args()
|
ARGS = parser.parse_args()
|
||||||
ARGS.rate=DEFAULT_SAMPLE_RATE
|
ARGS.rate=DEFAULT_SAMPLE_RATE
|
||||||
main(ARGS)
|
main(ARGS)
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ class OnnxWrapper():
|
|||||||
|
|
||||||
x = torch.cat([self._context, x], dim=1)
|
x = torch.cat([self._context, x], dim=1)
|
||||||
if sr in [8000, 16000]:
|
if sr in [8000, 16000]:
|
||||||
ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr)}
|
ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
|
||||||
ort_outs = self.session.run(None, ort_inputs)
|
ort_outs = self.session.run(None, ort_inputs)
|
||||||
out, state = ort_outs
|
out, state = ort_outs
|
||||||
self._state = torch.from_numpy(state)
|
self._state = torch.from_numpy(state)
|
||||||
|
|||||||
Reference in New Issue
Block a user