mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-04 17:39:22 +08:00
Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
84768cefdf | ||
|
|
6de3660f25 | ||
|
|
d9a6941852 | ||
|
|
dfdc9a484e | ||
|
|
f2e3a23d96 | ||
|
|
2b97f61160 | ||
|
|
e8850d2b9b | ||
|
|
657dac8736 | ||
|
|
412a478e29 | ||
|
|
9adf6d2192 | ||
|
|
8a2a73c14f | ||
|
|
3e0305559d | ||
|
|
f0d880d79c | ||
|
|
3888946c0c | ||
|
|
24f51645d0 | ||
|
|
fdbb0a3a81 | ||
|
|
60ae7abfb7 | ||
|
|
0b3d43d432 | ||
|
|
a395853982 | ||
|
|
78958b6fb6 | ||
|
|
902cfc9248 | ||
|
|
89e66a3474 | ||
|
|
a3bdebed16 | ||
|
|
4bdcf31d17 | ||
|
|
136cdcdf5b |
40
.github/workflows/python-publish.yml
vendored
Normal file
40
.github/workflows/python-publish.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# This workflow will upload a Python Package using Twine when a release is created
|
||||
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
||||
|
||||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '*'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v3
|
||||
with:
|
||||
python-version: '3.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
|
||||
with:
|
||||
user: __token__
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
46
README.md
46
README.md
@@ -25,6 +25,34 @@ https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-
|
||||
</details>
|
||||
|
||||
<br/>
|
||||
|
||||
<h2 align="center">Fast start</h2>
|
||||
<br/>
|
||||
|
||||
**Using pip**:
|
||||
`pip install silero-vad`
|
||||
|
||||
```python3
|
||||
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
|
||||
model = load_silero_vad()
|
||||
wav = read_audio('path_to_audio_file') # backend (sox, soundfile, or ffmpeg) required!
|
||||
speech_timestamps = get_speech_timestamps(wav, model)
|
||||
```
|
||||
|
||||
**Using torch.hub**:
|
||||
```python3
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
|
||||
(get_speech_timestamps, _, read_audio, _, _) = utils
|
||||
|
||||
wav = read_audio('path_to_audio_file') # backend (sox, soundfile, or ffmpeg) required!
|
||||
speech_timestamps = get_speech_timestamps(wav, model)
|
||||
```
|
||||
|
||||
<br/>
|
||||
|
||||
<h2 align="center">Key Features</h2>
|
||||
<br/>
|
||||
|
||||
@@ -57,21 +85,7 @@ https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-
|
||||
Published under permissive license (MIT) Silero VAD has zero strings attached - no telemetry, no keys, no registration, no built-in expiration, no keys or vendor lock.
|
||||
|
||||
<br/>
|
||||
<h2 align="center">Fast start</h2>
|
||||
<br/>
|
||||
|
||||
```python3
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
|
||||
(get_speech_timestamps, _, read_audio, _, _) = utils
|
||||
|
||||
wav = read_audio('path_to_audio_file')
|
||||
speech_timestamps = get_speech_timestamps(wav, model)
|
||||
```
|
||||
|
||||
<br/>
|
||||
<h2 align="center">Typical Use Cases</h2>
|
||||
<br/>
|
||||
|
||||
@@ -121,4 +135,6 @@ Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for releva
|
||||
|
||||
- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
|
||||
|
||||
- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
|
||||
- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
|
||||
|
||||
- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) examples
|
||||
|
||||
@@ -120,8 +120,7 @@ private:
|
||||
void reset_states()
|
||||
{
|
||||
// Call reset before each audio start
|
||||
std::memset(_h.data(), 0.0f, _h.size() * sizeof(float));
|
||||
std::memset(_c.data(), 0.0f, _c.size() * sizeof(float));
|
||||
std::memset(_state.data(), 0.0f, _state.size() * sizeof(float));
|
||||
triggered = false;
|
||||
temp_end = 0;
|
||||
current_sample = 0;
|
||||
@@ -139,19 +138,16 @@ private:
|
||||
input.assign(data.begin(), data.end());
|
||||
Ort::Value input_ort = Ort::Value::CreateTensor<float>(
|
||||
memory_info, input.data(), input.size(), input_node_dims, 2);
|
||||
Ort::Value state_ort = Ort::Value::CreateTensor<float>(
|
||||
memory_info, _state.data(), _state.size(), state_node_dims, 3);
|
||||
Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
|
||||
memory_info, sr.data(), sr.size(), sr_node_dims, 1);
|
||||
Ort::Value h_ort = Ort::Value::CreateTensor<float>(
|
||||
memory_info, _h.data(), _h.size(), hc_node_dims, 3);
|
||||
Ort::Value c_ort = Ort::Value::CreateTensor<float>(
|
||||
memory_info, _c.data(), _c.size(), hc_node_dims, 3);
|
||||
|
||||
// Clear and add inputs
|
||||
ort_inputs.clear();
|
||||
ort_inputs.emplace_back(std::move(input_ort));
|
||||
ort_inputs.emplace_back(std::move(state_ort));
|
||||
ort_inputs.emplace_back(std::move(sr_ort));
|
||||
ort_inputs.emplace_back(std::move(h_ort));
|
||||
ort_inputs.emplace_back(std::move(c_ort));
|
||||
|
||||
// Infer
|
||||
ort_outputs = session->Run(
|
||||
@@ -161,10 +157,8 @@ private:
|
||||
|
||||
// Output probability & update h,c recursively
|
||||
float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
|
||||
float *hn = ort_outputs[1].GetTensorMutableData<float>();
|
||||
std::memcpy(_h.data(), hn, size_hc * sizeof(float));
|
||||
float *cn = ort_outputs[2].GetTensorMutableData<float>();
|
||||
std::memcpy(_c.data(), cn, size_hc * sizeof(float));
|
||||
float *stateN = ort_outputs[1].GetTensorMutableData<float>();
|
||||
std::memcpy(_state.data(), stateN, size_state * sizeof(float));
|
||||
|
||||
// Push forward sample index
|
||||
current_sample += window_size_samples;
|
||||
@@ -376,27 +370,26 @@ private:
|
||||
// Inputs
|
||||
std::vector<Ort::Value> ort_inputs;
|
||||
|
||||
std::vector<const char *> input_node_names = {"input", "sr", "h", "c"};
|
||||
std::vector<const char *> input_node_names = {"input", "state", "sr"};
|
||||
std::vector<float> input;
|
||||
unsigned int size_state = 2 * 1 * 128; // It's FIXED.
|
||||
std::vector<float> _state;
|
||||
std::vector<int64_t> sr;
|
||||
unsigned int size_hc = 2 * 1 * 64; // It's FIXED.
|
||||
std::vector<float> _h;
|
||||
std::vector<float> _c;
|
||||
|
||||
int64_t input_node_dims[2] = {};
|
||||
int64_t input_node_dims[2] = {};
|
||||
const int64_t state_node_dims[3] = {2, 1, 128};
|
||||
const int64_t sr_node_dims[1] = {1};
|
||||
const int64_t hc_node_dims[3] = {2, 1, 64};
|
||||
|
||||
// Outputs
|
||||
std::vector<Ort::Value> ort_outputs;
|
||||
std::vector<const char *> output_node_names = {"output", "hn", "cn"};
|
||||
std::vector<const char *> output_node_names = {"output", "stateN"};
|
||||
|
||||
public:
|
||||
// Construction
|
||||
VadIterator(const std::wstring ModelPath,
|
||||
int Sample_rate = 16000, int windows_frame_size = 64,
|
||||
int Sample_rate = 16000, int windows_frame_size = 32,
|
||||
float Threshold = 0.5, int min_silence_duration_ms = 0,
|
||||
int speech_pad_ms = 64, int min_speech_duration_ms = 64,
|
||||
int speech_pad_ms = 32, int min_speech_duration_ms = 32,
|
||||
float max_speech_duration_s = std::numeric_limits<float>::infinity())
|
||||
{
|
||||
init_onnx_model(ModelPath);
|
||||
@@ -422,8 +415,7 @@ public:
|
||||
input_node_dims[0] = 1;
|
||||
input_node_dims[1] = window_size_samples;
|
||||
|
||||
_h.resize(size_hc);
|
||||
_c.resize(size_hc);
|
||||
_state.resize(size_state);
|
||||
sr.resize(1);
|
||||
sr[0] = sample_rate;
|
||||
};
|
||||
|
||||
@@ -13,7 +13,6 @@ func main() {
|
||||
sd, err := speech.NewDetector(speech.DetectorConfig{
|
||||
ModelPath: "../../files/silero_vad.onnx",
|
||||
SampleRate: 16000,
|
||||
WindowSize: 1536,
|
||||
Threshold: 0.5,
|
||||
MinSilenceDurationMs: 0,
|
||||
SpeechPadMs: 0,
|
||||
@@ -22,6 +21,10 @@ func main() {
|
||||
log.Fatalf("failed to create speech detector: %s", err)
|
||||
}
|
||||
|
||||
if len(os.Args) != 2 {
|
||||
log.Fatalf("invalid arguments provided: expecting one file path")
|
||||
}
|
||||
|
||||
f, err := os.Open(os.Args[1])
|
||||
if err != nil {
|
||||
log.Fatalf("failed to open sample audio file: %s", err)
|
||||
|
||||
@@ -4,7 +4,7 @@ go 1.21.4
|
||||
|
||||
require (
|
||||
github.com/go-audio/wav v1.1.0
|
||||
github.com/streamer45/silero-vad-go v0.1.0
|
||||
github.com/streamer45/silero-vad-go v0.2.0
|
||||
)
|
||||
|
||||
require (
|
||||
|
||||
@@ -8,8 +8,8 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
|
||||
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
|
||||
github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
|
||||
github.com/streamer45/silero-vad-go v0.2.0 h1:bbRTa6cQuc7VI88y0qicx375UyWoxE6wlVOF+mUg0+g=
|
||||
github.com/streamer45/silero-vad-go v0.2.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
|
||||
@@ -186,7 +186,7 @@ if __name__ == '__main__':
|
||||
help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
|
||||
|
||||
parser.add_argument('-N', '--num_steps', type=int, default=8,
|
||||
help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
|
||||
help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")
|
||||
|
||||
parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
|
||||
help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
|
||||
@@ -198,4 +198,4 @@ if __name__ == '__main__':
|
||||
help=" minimum silence duration in samples between to separate speech chunks")
|
||||
ARGS = parser.parse_args()
|
||||
ARGS.rate=DEFAULT_SAMPLE_RATE
|
||||
main(ARGS)
|
||||
main(ARGS)
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
{"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"}
|
||||
@@ -1 +0,0 @@
|
||||
{"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], "42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]}
|
||||
21
hubconf.py
21
hubconf.py
@@ -1,16 +1,15 @@
|
||||
dependencies = ['torch', 'torchaudio']
|
||||
import torch
|
||||
import json
|
||||
import os
|
||||
from utils_vad import (init_jit_model,
|
||||
get_speech_timestamps,
|
||||
save_audio,
|
||||
read_audio,
|
||||
VADIterator,
|
||||
collect_chunks,
|
||||
drop_chunks,
|
||||
Validator,
|
||||
OnnxWrapper)
|
||||
import sys
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
||||
from silero_vad.utils_vad import (init_jit_model,
|
||||
get_speech_timestamps,
|
||||
save_audio,
|
||||
read_audio,
|
||||
VADIterator,
|
||||
collect_chunks,
|
||||
OnnxWrapper)
|
||||
|
||||
|
||||
def versiontuple(v):
|
||||
@@ -36,7 +35,7 @@ def silero_vad(onnx=False, force_onnx_cpu=False):
|
||||
if versiontuple(installed_version) < versiontuple(supported_version):
|
||||
raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)')
|
||||
|
||||
model_dir = os.path.join(os.path.dirname(__file__), 'files')
|
||||
model_dir = os.path.join(os.path.dirname(__file__), 'src', 'silero_vad', 'data')
|
||||
if onnx:
|
||||
model = OnnxWrapper(os.path.join(model_dir, 'silero_vad.onnx'), force_onnx_cpu)
|
||||
else:
|
||||
|
||||
35
pyproject.toml
Normal file
35
pyproject.toml
Normal file
@@ -0,0 +1,35 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
[project]
|
||||
name = "silero-vad"
|
||||
version = "5.1"
|
||||
authors = [
|
||||
{name="Silero Team", email="hello@silero.ai"},
|
||||
]
|
||||
description = "Voice Activity Detector (VAD) by Silero"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: Developers",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Scientific/Engineering",
|
||||
]
|
||||
dependencies = [
|
||||
"torch>=1.12.0",
|
||||
"torchaudio>=0.12.0",
|
||||
"onnxruntime>=1.18.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/snakers4/silero-vad"
|
||||
Issues = "https://github.com/snakers4/silero-vad/issues"
|
||||
@@ -43,20 +43,30 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"USE_PIP = True # download model using pip package or torch.hub\n",
|
||||
"USE_ONNX = False # change this to True if you want to test onnx model\n",
|
||||
"if USE_ONNX:\n",
|
||||
" !pip install -q onnxruntime\n",
|
||||
"if USE_PIP:\n",
|
||||
" !pip install -q silero-vad\n",
|
||||
" from silero_vad import (load_silero_vad,\n",
|
||||
" read_audio,\n",
|
||||
" get_speech_timestamps,\n",
|
||||
" save_audio,\n",
|
||||
" VADIterator,\n",
|
||||
" collect_chunks)\n",
|
||||
" model = load_silero_vad(onnx=USE_ONNX)\n",
|
||||
"else:\n",
|
||||
" model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
||||
" model='silero_vad',\n",
|
||||
" force_reload=True,\n",
|
||||
" onnx=USE_ONNX)\n",
|
||||
"\n",
|
||||
"model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
||||
" model='silero_vad',\n",
|
||||
" force_reload=True,\n",
|
||||
" onnx=USE_ONNX)\n",
|
||||
"\n",
|
||||
"(get_speech_timestamps,\n",
|
||||
" save_audio,\n",
|
||||
" read_audio,\n",
|
||||
" VADIterator,\n",
|
||||
" collect_chunks) = utils"
|
||||
" (get_speech_timestamps,\n",
|
||||
" save_audio,\n",
|
||||
" read_audio,\n",
|
||||
" VADIterator,\n",
|
||||
" collect_chunks) = utils"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
12
src/silero_vad/__init__.py
Normal file
12
src/silero_vad/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from importlib.metadata import version
|
||||
try:
|
||||
__version__ = version(__name__)
|
||||
except:
|
||||
pass
|
||||
|
||||
from silero_vad.model import load_silero_vad
|
||||
from silero_vad.utils_vad import (get_speech_timestamps,
|
||||
save_audio,
|
||||
read_audio,
|
||||
VADIterator,
|
||||
collect_chunks)
|
||||
0
src/silero_vad/data/__init__.py
Normal file
0
src/silero_vad/data/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
25
src/silero_vad/model.py
Normal file
25
src/silero_vad/model.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from .utils_vad import init_jit_model, OnnxWrapper
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
def load_silero_vad(onnx=False):
|
||||
model_name = 'silero_vad.onnx' if onnx else 'silero_vad.jit'
|
||||
package_path = "silero_vad.data"
|
||||
|
||||
try:
|
||||
import importlib_resources as impresources
|
||||
model_file_path = str(impresources.files(package_path).joinpath(model_name))
|
||||
except:
|
||||
from importlib import resources as impresources
|
||||
try:
|
||||
with impresources.path(package_path, model_name) as f:
|
||||
model_file_path = f
|
||||
except:
|
||||
model_file_path = str(impresources.files(package_path).joinpath(model_name))
|
||||
|
||||
if onnx:
|
||||
model = OnnxWrapper(model_file_path, force_onnx_cpu=True)
|
||||
else:
|
||||
model = init_jit_model(model_file_path)
|
||||
|
||||
return model
|
||||
@@ -72,7 +72,7 @@ class OnnxWrapper():
|
||||
|
||||
x = torch.cat([self._context, x], dim=1)
|
||||
if sr in [8000, 16000]:
|
||||
ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr)}
|
||||
ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
|
||||
ort_outs = self.session.run(None, ort_inputs)
|
||||
out, state = ort_outs
|
||||
self._state = torch.from_numpy(state)
|
||||
@@ -132,18 +132,19 @@ class Validator():
|
||||
|
||||
def read_audio(path: str,
|
||||
sampling_rate: int = 16000):
|
||||
list_backends = torchaudio.list_audio_backends()
|
||||
|
||||
assert len(list_backends) > 0, 'The list of available backends is empty, please install backend manually. \
|
||||
\n Recommendations: \n \tSox (UNIX OS) \n \tSoundfile (Windows OS, UNIX OS) \n \tffmpeg (Windows OS, UNIX OS)'
|
||||
|
||||
sox_backends = set(['sox', 'sox_io'])
|
||||
audio_backends = torchaudio.list_audio_backends()
|
||||
|
||||
if len(sox_backends.intersection(audio_backends)) > 0:
|
||||
try:
|
||||
effects = [
|
||||
['channels', '1'],
|
||||
['rate', str(sampling_rate)]
|
||||
]
|
||||
|
||||
wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)
|
||||
else:
|
||||
except:
|
||||
wav, sr = torchaudio.load(path)
|
||||
|
||||
if wav.size(0) > 1:
|
||||
Reference in New Issue
Block a user