Mirror of https://github.com/snakers4/silero-vad.git (synced 2026-02-05 01:49:22 +08:00)

Compare commits: v2.0-legac...master (364 commits)
.github/workflows/python-publish.yml (vendored, new file, +40 lines)

```yaml
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  push:
    tags:
      - '*'

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}
```
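The workflow above publishes whatever `python -m build` produces whenever any tag is pushed. A minimal local dry run of its build steps, useful before tagging a release (a sketch, assuming only that the `build` package is installable; nothing here is part of the repository itself):

```python
# Reproduce the workflow's build steps locally before pushing a tag.
import subprocess
import sys

# Mirrors "python -m pip install --upgrade pip" + "pip install build".
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip", "build"])
# Mirrors "python -m build": writes an sdist and a wheel into dist/.
subprocess.check_call([sys.executable, "-m", "build"])
```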
.github/workflows/test.yml (vendored, new file, +40 lines)

```yaml
name: Test Package

on:
  workflow_dispatch:  # manual trigger

jobs:
  test:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]

    steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build hatchling pytest soundfile
        pip install .[test]

    - name: Build package
      run: python -m build --wheel --outdir dist

    - name: Install package
      run: |
        import glob, subprocess, sys
        whl = glob.glob("dist/*.whl")[0]
        subprocess.check_call([sys.executable, "-m", "pip", "install", whl])
      shell: python

    - name: Run tests
      run: pytest tests
```
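Note that the "Install package" step above runs its `run:` block with `shell: python`, so the wheel-path glob behaves identically on Ubuntu, Windows and macOS runners. The same logic as a standalone local script (a sketch, assuming a wheel was already built into `dist/`):

```python
# Install the freshly built wheel, whatever platform tag it carries.
import glob
import subprocess
import sys

wheels = glob.glob("dist/*.whl")
if not wheels:
    raise SystemExit("No wheel found; run `python -m build --wheel --outdir dist` first.")
subprocess.check_call([sys.executable, "-m", "pip", "install", wheels[0]])
```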
CITATION.cff (new file, +20 lines)

```yaml
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
title: "Silero VAD"
authors:
  - family-names: "Silero Team"
    email: "hello@silero.ai"
type: software
repository-code: "https://github.com/snakers4/silero-vad"
license: MIT
abstract: "Pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier"
preferred-citation:
  type: software
  authors:
    - family-names: "Silero Team"
      email: "hello@silero.ai"
  title: "Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier"
  year: 2024
  publisher: "GitHub"
  journal: "GitHub repository"
  howpublished: "https://github.com/snakers4/silero-vad"
```
README.md (669 changed lines: 623 removed, 164 added)

````diff
@@ -1,623 +1,164 @@
-[](mailto:hello@silero.ai) [](https://t.me/silero_speech) [](https://github.com/snakers4/silero-vad/blob/master/LICENSE)
+[](mailto:hello@silero.ai) [](https://t.me/silero_speech) [](https://github.com/snakers4/silero-vad/blob/master/LICENSE) [](https://pypi.org/project/silero-vad/)
 
-[](https://pytorch.org/hub/snakers4_silero-vad_vad/)
-[](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)
+[](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) [](https://github.com/snakers4/silero-vad/actions/workflows/test.yml) [](https://pypi.org/project/silero-vad/) [](https://pypi.org/project/silero-vad)
 
-- [Silero VAD](#silero-vad)
-- [TLDR](#tldr)
-- [Live Demonstration](#live-demonstration)
-- [Getting Started](#getting-started)
-- [Pre-trained Models](#pre-trained-models)
-- [Version History](#version-history)
-- [PyTorch](#pytorch)
-- [VAD](#vad)
-- [Number Detector](#number-detector)
-- [Language Classifier](#language-classifier)
-- [ONNX](#onnx)
-- [VAD](#vad-1)
-- [Number Detector](#number-detector-1)
-- [Language Classifier](#language-classifier-1)
-- [Metrics](#metrics)
-- [Performance Metrics](#performance-metrics)
-- [Streaming Latency](#streaming-latency)
-- [Full Audio Throughput](#full-audio-throughput)
-- [VAD Quality Metrics](#vad-quality-metrics)
-- [FAQ](#faq)
-- [VAD Parameter Fine Tuning](#vad-parameter-fine-tuning)
-- [Classic way](#classic-way)
-- [Adaptive way](#adaptive-way)
-- [How VAD Works](#how-vad-works)
-- [VAD Quality Metrics Methodology](#vad-quality-metrics-methodology)
-- [How Number Detector Works](#how-number-detector-works)
-- [How Language Classifier Works](#how-language-classifier-works)
-- [Contact](#contact)
-- [Get in Touch](#get-in-touch)
-- [Commercial Inquiries](#commercial-inquiries)
-- [Further reading](#further-reading)
-- [Citations](#citations)
+<br/>
+<h1 align="center">Silero VAD</h1>
+<br/>
+
+**Silero VAD** - pre-trained enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)).
+
+<p align="center">
+<img src="https://github.com/user-attachments/assets/f2940867-0a51-4bdb-8c14-1129d3c44e64" />
+</p>
 
-# Silero VAD
-
-## TLDR
-
-**Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier.**
-Enterprise-grade Speech Products made refreshingly simple (also see our [STT](https://github.com/snakers4/silero-models) models).
-
-Currently, there are hardly any high-quality / modern / free / public voice activity detectors except for the WebRTC Voice Activity Detector ([link](https://github.com/wiseman/py-webrtcvad)). WebRTC, though, starts to show its age and suffers from many false positives.
-
-Also in some cases it is crucial to be able to anonymize large-scale spoken corpora (i.e. remove personal data). Typically personal data is considered private / sensitive if it contains (i) a name or (ii) some private ID. Name recognition is a highly subjective matter and depends on locale and business case, but Voice Activity and Number Detection are quite general tasks.
+<details>
+<summary>Real Time Example</summary>
+
+https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4
+
+Please note that the video loads only if you are logged into your GitHub account.
+
+</details>
 
-**Key features:**
-
-- Modern, portable;
-- Low memory footprint;
-- Superior metrics to WebRTC;
-- Trained on huge spoken corpora and noise / sound libraries;
-- Slower than WebRTC, but fast enough for IoT / edge / mobile applications;
-- Unlike WebRTC (which mostly tells silence from voice), our VAD can tell voice from noise / music / silence;
-
-**Typical use cases:**
-
-- Spoken corpora anonymization;
-- Can be used together with WebRTC;
-- Voice activity detection for IoT / edge / mobile use cases;
-- Data cleaning and preparation, number and voice detection in general;
-- PyTorch and ONNX can be used with a wide variety of deployment options and backends in mind;
+<br/>
+<h2 align="center">Fast start</h2>
+<br/>
 
-### Live Demonstration
-
-For more information, please see [examples](https://github.com/snakers4/silero-vad/tree/master/examples).
-
-https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4
-
-https://user-images.githubusercontent.com/8079748/117580455-4622dd00-b0f8-11eb-858d-e6368ed4eada.mp4
+<details>
+<summary>Dependencies</summary>
+
+System requirements to run python examples on `x86-64` systems:
+
+- `python 3.8+`;
+- 1G+ RAM;
+- A modern CPU with AVX, AVX2, AVX-512 or AMX instruction sets.
+
+Dependencies:
+
+- `torch>=1.12.0`;
+- `torchaudio>=0.12.0` (for I/O only);
+- `onnxruntime>=1.16.1` (for ONNX model usage).
+
+Silero VAD uses the torchaudio library for audio I/O (`torchaudio.info`, `torchaudio.load`, and `torchaudio.save`), so a proper audio backend is required:
+
+- Option №1 - [**FFmpeg**](https://www.ffmpeg.org/) backend. `conda install -c conda-forge 'ffmpeg<7'`;
+- Option №2 - [**sox_io**](https://pypi.org/project/sox/) backend. `apt-get install sox`, TorchAudio is tested on libsox 14.4.2;
+- Option №3 - [**soundfile**](https://pypi.org/project/soundfile/) backend. `pip install soundfile`.
+
+If you are planning to run the VAD using solely the `onnx-runtime`, it will run on any other system architectures where onnx-runtime is [supported](https://onnxruntime.ai/getting-started). In this case please note that:
+
+- You will have to implement the I/O;
+- You will have to adapt the existing wrappers / examples / post-processing for your use-case.
+
+</details>
 
-## Getting Started
-
-The models are small enough to be included directly into this repository. Newer models will supersede older models directly.
-
-### Pre-trained Models
-
-**Currently we provide the following endpoints:**
-
-| model= | Params | Model type | Streaming | Languages | PyTorch | ONNX | Colab |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| `'silero_vad'` | 1.1M | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_vad_micro'` | 10K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_vad_micro_8k'` | 10K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_vad_mini'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_vad_mini_8k'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_number_detector'` | 1.1M | Number Detector | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| `'silero_lang_detector'` | 1.1M | Language Classifier | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-| ~~`'silero_lang_detector_116'`~~ | ~~1.7M~~ | ~~Language Classifier~~ | | | | | |
-| `'silero_lang_detector_95'` | 4.7M | Language Classifier | No | [95 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_95.json) | :heavy_check_mark: | :heavy_check_mark: | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
-
-(*) Though explicitly trained on these languages, VAD should work on any Germanic, Romance or Slavic languages out of the box.
-
-What the models do:
-
-- VAD - detects speech;
-- Number Detector - detects spoken numbers (i.e. thirty five);
-- Language Classifier - classifies utterances between languages;
-- Language Classifier 95 - classifies among 95 languages as well as 58 language groups (mutually intelligible languages -> same group)
+**Using pip**:
+`pip install silero-vad`
+
+```python3
+from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
+
+model = load_silero_vad()
+wav = read_audio('path_to_audio_file')
+speech_timestamps = get_speech_timestamps(
+  wav,
+  model,
+  return_seconds=True,  # Return speech timestamps in seconds (default is samples)
+)
+```
 
-### Version History
-
-**Version history:**
-
-| Version | Date | Comment |
-| --- | --- | --- |
-| `v1` | 2020-12-15 | Initial release |
-| `v1.1` | 2020-12-24 | Better VAD models compatible with chunks shorter than 250 ms |
-| `v1.2` | 2020-12-30 | Number Detector added |
-| `v2` | 2021-01-11 | Add Language Classifier heads (en, ru, de, es) |
-| `v2.1` | 2021-02-11 | Add micro (10k params) VAD models |
-| `v2.2` | 2021-03-22 | Add micro 8000 sample rate VAD models |
-| `v2.3` | 2021-04-12 | Add mini (100k params) VAD models (8k and 16k sample rate) + **new** adaptive utils for full audio and single audio stream |
-| `v2.4` | 2021-07-09 | Add 116 languages classifier and group classifier |
-| `v2.4` | 2021-07-09 | Deleted 116 language classifier, added 95 language classifier instead (got rid of low-spoken languages for quality improvement) |
+**Using torch.hub**:
+
+```python3
+import torch
+torch.set_num_threads(1)
+
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+(get_speech_timestamps, _, read_audio, _, _) = utils
+
+wav = read_audio('path_to_audio_file')
+speech_timestamps = get_speech_timestamps(
+  wav,
+  model,
+  return_seconds=True,  # Return speech timestamps in seconds (default is samples)
+)
+```
 
-### PyTorch
-
-[](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)
-
-We are keeping the colab examples up-to-date, but you can manually manage your dependencies:
-
-- `pytorch` >= 1.7.1 (there were breaking changes in `torch.hub` introduced in 1.7);
-- `torchaudio` >= 0.7.2 (used only for IO and resampling, can be easily replaced);
-- `soundfile` >= 0.10.3 (used as a default backend for torchaudio, can be replaced);
-
-All of the dependencies except for PyTorch are superficial and for utils / examples only. You can use any libraries / pipelines that read files and resample into 16 kHz.
-
-#### VAD
-
-[](https://pytorch.org/hub/snakers4_silero-vad_vad/)
-
-```python
-import torch
-torch.set_num_threads(1)
-from pprint import pprint
-
-model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                              model='silero_vad',
-                              force_reload=True)
-
-(get_speech_ts,
- get_speech_ts_adaptive,
- _, read_audio,
- _, _, _) = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-wav = read_audio(f'{files_dir}/en.wav')
-
-# get speech timestamps from full audio file
-
-# classic way
-speech_timestamps = get_speech_ts(wav, model,
-                                  num_steps=4)
-pprint(speech_timestamps)
-
-# adaptive way
-speech_timestamps = get_speech_ts_adaptive(wav, model)
-pprint(speech_timestamps)
-```
+<br/>
+<h2 align="center">Key Features</h2>
+<br/>
 
-#### Number Detector
-
-[](https://pytorch.org/hub/snakers4_silero-vad_number/)
-
-```python
-import torch
-torch.set_num_threads(1)
-from pprint import pprint
-
-model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                              model='silero_number_detector',
-                              force_reload=True)
-
-(get_number_ts,
- _, read_audio,
- _, _) = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-wav = read_audio(f'{files_dir}/en_num.wav')
-
-# get number timestamps from full audio file
-number_timestamps = get_number_ts(wav, model)
-
-pprint(number_timestamps)
-```
+- **Stellar accuracy**
+
+  Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.
+
+- **Fast**
+
+  One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to be processed on a single CPU thread. Using batching or GPU can also improve performance considerably. Under certain conditions ONNX may even run up to 4-5x faster.
+
+- **Lightweight**
+
+  The JIT model is around two megabytes in size.
+
+- **General**
+
+  Silero VAD was trained on huge corpora that include over **6000** languages and it performs well on audios from different domains with various background noise and quality levels.
+
+- **Flexible sampling rate**
+
+  Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).
+
+- **Highly Portable**
+
+  Silero VAD reaps benefits from the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available.
+
+- **No Strings Attached**
+
+  Published under a permissive license (MIT), Silero VAD has zero strings attached - no telemetry, no registration, no built-in expiration, no keys or vendor lock.
 
-#### Language Classifier
-
-##### 4 languages
-
-[](https://pytorch.org/hub/snakers4_silero-vad_language/)
-
-```python
-import torch
-torch.set_num_threads(1)
-from pprint import pprint
-
-model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                              model='silero_lang_detector',
-                              force_reload=True)
-
-get_language, read_audio = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-wav = read_audio(f'{files_dir}/de.wav')
-language = get_language(wav, model)
-
-pprint(language)
-```
-
-##### 95 languages
-
-[](https://pytorch.org/hub/snakers4_silero-vad_language/)
-
-```python
-import torch
-torch.set_num_threads(1)
-from pprint import pprint
-
-model, lang_dict, lang_group_dict, utils = torch.hub.load(
-    repo_or_dir='snakers4/silero-vad',
-    model='silero_lang_detector_95',
-    force_reload=True)
-
-get_language_and_group, read_audio = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-wav = read_audio(f'{files_dir}/de.wav')
-languages, language_groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=2)
-
-for i in languages:
-    pprint(f'Language: {i[0]} with prob {i[-1]}')
-
-for i in language_groups:
-    pprint(f'Language group: {i[0]} with prob {i[-1]}')
-```
+<br/>
+<h2 align="center">Typical Use Cases</h2>
+<br/>
+
+- Voice activity detection for IoT / edge / mobile use cases
+- Data cleaning and preparation, voice detection in general
+- Telephony and call-center automation, voice bots
+- Voice interfaces
 
-### ONNX
-
-[](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)
-
-You can run our models everywhere where you can import the ONNX model or run the ONNX runtime.
-
-#### VAD
-
-```python
-import torch
-import onnxruntime
-from pprint import pprint
-
-_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                          model='silero_vad',
-                          force_reload=True)
-
-(get_speech_ts,
- get_speech_ts_adaptive,
- _, read_audio,
- _, _, _) = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-def init_onnx_model(model_path: str):
-    return onnxruntime.InferenceSession(model_path)
-
-def validate_onnx(model, inputs):
-    with torch.no_grad():
-        ort_inputs = {'input': inputs.cpu().numpy()}
-        outs = model.run(None, ort_inputs)
-        outs = [torch.Tensor(x) for x in outs]
-    return outs[0]
-
-model = init_onnx_model(f'{files_dir}/model.onnx')
-wav = read_audio(f'{files_dir}/en.wav')
-
-# get speech timestamps from full audio file
-
-# classic way
-speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx)
-pprint(speech_timestamps)
-
-# adaptive way
-speech_timestamps = get_speech_ts_adaptive(wav, model, run_function=validate_onnx)
-pprint(speech_timestamps)
-```
-
-#### Number Detector
-
-```python
-import torch
-import onnxruntime
-from pprint import pprint
-
-model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                              model='silero_number_detector',
-                              force_reload=True)
-
-(get_number_ts,
- _, read_audio,
- _, _) = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-def init_onnx_model(model_path: str):
-    return onnxruntime.InferenceSession(model_path)
-
-def validate_onnx(model, inputs):
-    with torch.no_grad():
-        ort_inputs = {'input': inputs.cpu().numpy()}
-        outs = model.run(None, ort_inputs)
-        outs = [torch.Tensor(x) for x in outs]
-    return outs
-
-model = init_onnx_model(f'{files_dir}/number_detector.onnx')
-wav = read_audio(f'{files_dir}/en_num.wav')
-
-# get number timestamps from full audio file
-number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)
-pprint(number_timestamps)
-```
-
-#### Language Classifier
-
-##### 4 languages
-
-```python
-import torch
-import onnxruntime
-from pprint import pprint
-
-model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                              model='silero_lang_detector',
-                              force_reload=True)
-
-get_language, read_audio = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-def init_onnx_model(model_path: str):
-    return onnxruntime.InferenceSession(model_path)
-
-def validate_onnx(model, inputs):
-    with torch.no_grad():
-        ort_inputs = {'input': inputs.cpu().numpy()}
-        outs = model.run(None, ort_inputs)
-        outs = [torch.Tensor(x) for x in outs]
-    return outs
-
-model = init_onnx_model(f'{files_dir}/number_detector.onnx')
-wav = read_audio(f'{files_dir}/de.wav')
-
-language = get_language(wav, model, run_function=validate_onnx)
-print(language)
-```
-
-##### 95 languages
-
-```python
-import torch
-import onnxruntime
-from pprint import pprint
-
-model, lang_dict, lang_group_dict, utils = torch.hub.load(
-    repo_or_dir='snakers4/silero-vad',
-    model='silero_lang_detector_95',
-    force_reload=True)
-
-get_language_and_group, read_audio = utils
-
-files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
-
-def init_onnx_model(model_path: str):
-    return onnxruntime.InferenceSession(model_path)
-
-def validate_onnx(model, inputs):
-    with torch.no_grad():
-        ort_inputs = {'input': inputs.cpu().numpy()}
-        outs = model.run(None, ort_inputs)
-        outs = [torch.Tensor(x) for x in outs]
-    return outs
-
-model = init_onnx_model(f'{files_dir}/lang_classifier_95.onnx')
-wav = read_audio(f'{files_dir}/de.wav')
-
-languages, language_groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=2, run_function=validate_onnx)
-
-for i in languages:
-    pprint(f'Language: {i[0]} with prob {i[-1]}')
-
-for i in language_groups:
-    pprint(f'Language group: {i[0]} with prob {i[-1]}')
-```
-
-[](https://pytorch.org/hub/snakers4_silero-vad_language/)
+<br/>
+<h2 align="center">Links</h2>
+<br/>
+
+- [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies)
+- [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics)
+- [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics)
+- [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models)
+- [Further reading](https://github.com/snakers4/silero-models#further-reading)
+- [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ)
 
-## Metrics
-
-### Performance Metrics
-
-All speed tests were run on an AMD Ryzen Threadripper 3960X using only 1 thread:
-
-```
-torch.set_num_threads(1)  # pytorch
-ort_session.intra_op_num_threads = 1  # onnx
-ort_session.inter_op_num_threads = 1  # onnx
-```
-
-#### Streaming Latency
-
-Streaming latency depends on 2 variables:
-
-- **num_steps** - number of windows to split each audio chunk into. Our post-processing class keeps the previous chunk in memory (250 ms), so the new chunk (also 250 ms) is appended to it. The resulting big chunk (500 ms) is split into **num_steps** overlapping windows, each 250 ms long.
-- **number of audio streams**
-
-So the **batch size** for streaming is **num_steps * number of audio streams**. The time between receiving new audio chunks and getting results is shown in the table below:
-
-| Batch size | PyTorch model time, ms | ONNX model time, ms |
-| :--------: | :--------------------: | :-----------------: |
-| **2** | 9 | 2 |
-| **4** | 11 | 4 |
-| **8** | 14 | 7 |
-| **16** | 19 | 12 |
-| **40** | 36 | 29 |
-| **80** | 64 | 55 |
-| **120** | 96 | 85 |
-| **200** | 157 | 137 |
-
-#### Full Audio Throughput
-
-**RTS** (seconds of audio processed per second, real time speed, or 1 / RTF) for full audio processing depends on **num_steps** (see previous paragraph) and **batch size** (bigger is better).
-
-| Batch size | num_steps | PyTorch model RTS | ONNX model RTS |
-| :--------: | :-------: | :---------------: | :------------: |
-| **40** | **4** | 68 | 86 |
-| **40** | **8** | 34 | 43 |
-| **80** | **4** | 78 | 91 |
-| **80** | **8** | 39 | 45 |
-| **120** | **4** | 78 | 88 |
-| **120** | **8** | 39 | 44 |
-| **200** | **4** | 80 | 91 |
-| **200** | **8** | 40 | 46 |
-
-### VAD Quality Metrics
-
-We use random 250 ms audio chunks for validation. The speech to non-speech ratio among chunks is about ~50/50 (i.e. balanced). Speech chunks are sampled from real audios in four different languages (English, Russian, Spanish, German), then random background noise is added to some of them (~40%).
-
-Since our VAD (only VAD, other networks are more flexible) was trained on chunks of the same length, the model's output is just one float from 0 to 1 - the **speech probability**. We use speech probabilities as thresholds for the precision-recall curve. This can be extended to 100 - 150 ms. Less than 100 - 150 ms cannot be distinguished as speech with confidence.
-
-[Webrtc](https://github.com/wiseman/py-webrtcvad) splits audio into frames, each frame has a corresponding number (0 **or** 1). We use 30 ms frames for webrtc, so each 250 ms chunk is split into 8 frames; their **mean** value is used as a threshold for the plot.
-
-[Auditok](https://github.com/amsehili/auditok) - same logic as Webrtc, but we use 50 ms frames.
-
-## FAQ
-
-### VAD Parameter Fine Tuning
-
-#### Classic way
-
-**This is the straightforward classic method `get_speech_ts` where thresholds (`trig_sum` and `neg_trig_sum`) are specified by users.**
-
-- Among others, we provide several [utils](https://github.com/snakers4/silero-vad/blob/8b28767292b424e3e505c55f15cd3c4b91e4804b/utils.py#L52-L59) to simplify working with VAD;
-- We provide sensible basic hyper-parameters that work for us, but your case can be different;
-- `trig_sum` - overlapping windows are used for each audio chunk; trig sum defines the average probability among those windows for switching into the triggered state (speech state);
-- `neg_trig_sum` - same as `trig_sum`, but for switching from the triggered to the non-triggered state (non-speech);
-- `num_steps` - number of overlapping windows to split an audio chunk into (we recommend 4 or 8);
-- `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
-- `min_speech_samples` - minimum speech chunk duration in samples;
-- `min_silence_samples` - minimum silence duration in samples needed to separate two speech chunks.
-
-Optimal parameters may vary per domain, but we provide a tiny tool to learn the best parameters. You can invoke `get_speech_ts` with `visualize_probs=True` (`pandas` required):
-
-```
-speech_timestamps = get_speech_ts(wav, model,
-                                  num_samples_per_window=4000,
-                                  num_steps=4,
-                                  visualize_probs=True)
-```
-
-#### Adaptive way
-
-**The adaptive algorithm (`get_speech_ts_adaptive`) automatically selects thresholds (`trig_sum` and `neg_trig_sum`) based on median speech probabilities over the whole audio. SOME ARGUMENTS VARY FROM THE CLASSIC WAY FUNCTION ARGUMENTS.**
-
-- `batch_size` - batch size to feed to silero VAD (default - `200`);
-- `step` - step size in samples (default - `500`) (`num_samples_per_window` / `num_steps` from the classic method);
-- `num_samples_per_window` - number of samples in each window; our models were trained using `4000` samples (250 ms) per window, so this is the preferable value (lesser values reduce [quality](https://github.com/snakers4/silero-vad/issues/2#issuecomment-750840434));
-- `min_speech_samples` - minimum speech chunk duration in samples (default - `10000`);
-- `min_silence_samples` - minimum silence duration in samples needed to separate two speech chunks (default - `4000`);
-- `speech_pad_samples` - widen speech by this amount of samples on each side (default - `2000`).
-
-```
-speech_timestamps = get_speech_ts_adaptive(wav, model,
-                                           num_samples_per_window=4000,
-                                           step=500,
-                                           visualize_probs=True)
-```
-
-The chart should look something like this:
-
-With this particular example you can try shorter chunks (`num_samples_per_window=1600`), but this results in too much noise:
-
-### How VAD Works
-
-- Audio is split into 250 ms chunks (you can choose any chunk size, but quality with chunks shorter than 100 ms will suffer and there will be more false positives and "unnatural" pauses);
-- VAD keeps a record of the previous chunk (or zeros at the beginning of the stream);
-- Then this 500 ms of audio (250 ms + 250 ms) is split into N (typically 4 or 8) windows and the model is applied to this window batch. Each window is 250 ms long (naturally, windows overlap);
-- Then the probability is averaged across these windows;
-- Though typically pauses in speech are 300 ms or longer (pauses shorter than 200-300 ms are typically not meaningful), it is hard to confidently classify speech vs noise / music on very short chunks (i.e. 30 - 50 ms);
-- ~~We are working on lifting this limitation, so that you can use 100 - 125 ms windows~~;
-
-### VAD Quality Metrics Methodology
-
-Please see [VAD Quality Metrics](#vad-quality-metrics).
-
-### How Number Detector Works
-
-- It is recommended to split long audio into short ones (< 15 s) and apply the model on each of them;
-- Number Detector can classify whether the whole audio contains a number, or whether each audio frame contains a number;
-- Audio is split into frames in a certain way, so, having a per-frame output, we can restore timing bounds for numbers with an accuracy of about 0.2 s;
-
-### How Language Classifier Works
-
-- **99%** validation accuracy
-- The language classifier was trained using audio samples in 4 languages: **Russian**, **English**, **Spanish**, **German**
-- More languages TBD
-- Arbitrary audio length can be used, although the network was trained using audio shorter than 15 seconds
-
-### How Language Classifier 95 Works
-
-- **85%** validation accuracy among 95 languages, **90%** validation accuracy among [58 language groups](https://github.com/snakers4/silero-vad/blob/master/files/lang_group_dict_95.json)
-- Language classifier 95 was trained using audio samples in [95 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_95.json)
-- Arbitrary audio length can be used, although the network was trained using audio shorter than 20 seconds
-
-## Contact
-
-### Get in Touch
+<br/>
+<h2 align="center">Get In Touch</h2>
+<br/>
 
 Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, read our [news](https://t.me/silero_news).
 
-### Commercial Inquiries
-
-Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers](https://github.com/snakers4/silero-models/wiki/Licensing-and-Tiers) for relevant information and [email](mailto:hello@silero.ai) us directly.
+Please see our [wiki](https://github.com/snakers4/silero-models/wiki) for relevant information and [email](mailto:hello@silero.ai) us directly.
 
-## Further reading
-
-### General
-
-- Silero-models - https://github.com/snakers4/silero-models
-- Nice [thread](https://github.com/snakers4/silero-vad/discussions/16#discussioncomment-305830) in discussions
-
-### English
-
-- STT:
-  - Towards an Imagenet Moment For Speech-To-Text - [link](https://thegradient.pub/towards-an-imagenet-moment-for-speech-to-text/)
-  - A Speech-To-Text Practitioner's Criticisms of Industry and Academia - [link](https://thegradient.pub/a-speech-to-text-practitioners-criticisms-of-industry-and-academia/)
-  - Modern Google-level STT Models Released - [link](https://habr.com/ru/post/519562/)
-- TTS:
-  - High-Quality Text-to-Speech Made Accessible, Simple and Fast - [link](https://habr.com/ru/post/549482/)
-- VAD:
-  - Modern Portable Voice Activity Detector Released - [link](https://habr.com/ru/post/537276/)
-- Text Enhancement:
-  - We have published a model for text repunctuation and recapitalization for four languages - [link](https://habr.com/ru/post/581960/)
-
-### Chinese
-
-- STT:
-  - 迈向语音识别领域的 ImageNet 时刻 - [link](https://www.infoq.cn/article/4u58WcFCs0RdpoXev1E2)
-  - 语音领域学术界和工业界的七宗罪 - [link](https://www.infoq.cn/article/lEe6GCRjF1CNToVITvNw)
-
-### Russian
-
-- STT:
-  - Последние обновления моделей распознавания речи из Silero Models - [link](https://habr.com/ru/post/577630/)
-  - Сжимаем трансформеры: простые, универсальные и прикладные способы cделать их компактными и быстрыми - [link](https://habr.com/ru/post/563778/)
-  - Ультимативное сравнение систем распознавания речи: Ashmanov, Google, Sber, Silero, Tinkoff, Yandex - [link](https://habr.com/ru/post/559640/)
-  - Мы опубликовали современные STT модели сравнимые по качеству с Google - [link](https://habr.com/ru/post/519564/)
-  - Понижаем барьеры на вход в распознавание речи - [link](https://habr.com/ru/post/494006/)
-  - Огромный открытый датасет русской речи версия 1.0 - [link](https://habr.com/ru/post/474462/)
-  - Насколько Быстрой Можно Сделать Систему STT? - [link](https://habr.com/ru/post/531524/)
-  - Наша система Speech-To-Text - [link](https://www.silero.ai/tag/our-speech-to-text/)
-  - Speech To Text - [link](https://www.silero.ai/tag/speech-to-text/)
-- TTS:
-  - Мы сделали наш публичный синтез речи еще лучше - [link](https://habr.com/ru/post/563484/)
-  - Мы Опубликовали Качественный, Простой, Доступный и Быстрый Синтез Речи - [link](https://habr.com/ru/post/549480/)
-- VAD:
-  - Модели для Детекции Речи, Чисел и Распознавания Языков - [link](https://www.silero.ai/vad-lang-classifier-number-detector/)
-  - Мы опубликовали современный Voice Activity Detector и не только - [link](https://habr.com/ru/post/537274/)
-- Text Enhancement:
-  - Мы опубликовали модель, расставляющую знаки препинания и заглавные буквы в тексте на четырех языках - [link](https://habr.com/ru/post/581946/)
-
-## Citations
+**Citations**
 
 ```
 @misc{Silero VAD,
   author = {Silero Team},
   title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
-  year = {2021},
+  year = {2024},
   publisher = {GitHub},
   journal = {GitHub repository},
  howpublished = {\url{https://github.com/snakers4/silero-vad}},
@@ -625,3 +166,13 @@
  email = {hello@silero.ai}
 }
 ```
+
+<br/>
+<h2 align="center">Examples and VAD-based Community Apps</h2>
+<br/>
+
+- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
+- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
+- [Rust](https://github.com/snakers4/silero-vad/tree/master/examples/rust-example), [Go](https://github.com/snakers4/silero-vad/tree/master/examples/go), [Java](https://github.com/snakers4/silero-vad/tree/master/examples/java-example), [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp), [C#](https://github.com/snakers4/silero-vad/tree/master/examples/csharp) and [other](https://github.com/snakers4/silero-vad/tree/master/examples) community examples
````
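As the new README's quickstart shows, `get_speech_timestamps` returns a list of `{'start': ..., 'end': ...}` dicts (in seconds when `return_seconds=True`). A minimal post-processing sketch on top of that documented return format; the 0.3 s merge gap is an arbitrary illustration, not a library default:

```python
def merge_close_segments(timestamps, max_gap=0.3):
    """Merge speech segments separated by pauses shorter than max_gap seconds."""
    merged = []
    for seg in timestamps:
        if merged and seg['start'] - merged[-1]['end'] < max_gap:
            merged[-1]['end'] = seg['end']  # absorb the short pause
        else:
            merged.append(dict(seg))
    return merged

# Usage: merge_close_segments(get_speech_timestamps(wav, model, return_seconds=True))
```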
datasets/README.md (new file, +84 lines)

# Silero-VAD Dataset

> The dataset was created with the support of the Innovation Promotion Fund (Фонд содействия инновациям) under the federal project "Artificial Intelligence" of the national program "Digital Economy of the Russian Federation".

The links below point to `.feather` files containing open audio datasets annotated with Silero VAD, together with a short description of each dataset and loading examples. `.feather` files can be opened with the `pandas` library:

```python3
import pandas as pd
dataframe = pd.read_feather(PATH_TO_FEATHER_FILE)
```

Each annotation `.feather` file contains the following columns:

- `speech_timings` - the annotation of the given audio. This is a list of dictionaries of the form `{'start': START_SECOND, 'end': END_SECOND}`, where `START_SECOND` and `END_SECOND` are the start and end times of speech in seconds. The number of such dictionaries equals the number of speech segments found in the audio;
- `language` - the ISO language code of the audio.

The columns describing how to download the audio files differ between datasets and are documented for each dataset below.

**All data was annotated with a time step of ~30 milliseconds (`num_samples` - 512).**
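For example, the total amount of annotated speech per recording can be computed directly from the `speech_timings` column; a minimal sketch assuming only the `.feather` layout described above (`PATH_TO_FEATHER_FILE` is the same placeholder as before):

```python
import pandas as pd

df = pd.read_feather(PATH_TO_FEATHER_FILE)
# Sum the durations of the annotated speech segments (times are in seconds).
df['speech_seconds'] = df['speech_timings'].apply(
    lambda timings: sum(t['end'] - t['start'] for t in timings)
)
print(df.groupby('language')['speech_seconds'].sum().sort_values(ascending=False).head())
```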
| Name | Hours | Languages | Link | License | md5sum |
|------|-------|-----------|------|---------|--------|
| **Bible.is** | 53,138 | 1,596 | [URL](https://live.bible.is/) | [Custom](https://live.bible.is/terms) | ea404eeaf2cd283b8223f63002be11f9 |
| **globalrecordings.net** | 9,743 | 6,171[^1] | [URL](https://globalrecordings.net/en) | CC BY-NC-SA 4.0 | 3c5c0f31b0abd9fe94ddbe8b1e2eb326 |
| **VoxLingua107** | 6,628 | 107 | [URL](https://bark.phon.ioc.ee/voxlingua107/) | CC BY 4.0 | 5dfef33b4d091b6d399cfaf3d05f2140 |
| **Common Voice** | 30,329 | 120 | [URL](https://commonvoice.mozilla.org/en/datasets) | CC0 | 5e30a85126adf74a5fd1496e6ac8695d |
| **MLS** | 50,709 | 8 | [URL](https://www.openslr.org/94/) | CC BY 4.0 | a339d0e94bdf41bba3c003756254ac4e |
| **Total** | **150,547** | **6,171+** | | | |

## Bible.is

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/BibleIs.feather)

- The `audio_link` column contains links to the individual audio files.

## globalrecordings.net

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/globalrecordings.feather)

- The `folder_link` column contains links for downloading the `.zip` archive for a given language. Note! Archive links are repeated, since each archive may contain many audio files.
- The `audio_path` column contains the paths to individual audio files after unpacking the corresponding archive from the `folder_link` column.

``The number of unique ISO codes in this dataset does not match the actual number of languages represented, since some closely related languages may share the same ISO code.``

## VoxLingua107

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/VoxLingua107.feather)

- The `folder_link` column contains links for downloading the `.zip` archive for a given language. Note! Archive links are repeated, since each archive may contain many audio files.
- The `audio_path` column contains the paths to individual audio files after unpacking the corresponding archive from the `folder_link` column.

## Common Voice

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/common_voice.feather)

This dataset cannot be downloaded via static links. To obtain it, follow the [link](https://commonvoice.mozilla.org/en/datasets) and, after being granted access through the corresponding form, download the archives for every available language. Note! The provided annotation corresponds to the source dataset version `Common Voice Corpus 16.1`.

- The `audio_path` column contains the unique names of the `.mp3` files obtained after downloading the corresponding dataset.

## MLS

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/MLS.feather)

- The `folder_link` column contains links for downloading the `.zip` archive for a given language. Note! Archive links are repeated, since each archive may contain many audio files.
- The `audio_path` column contains the paths to individual audio files after unpacking the corresponding archive from the `folder_link` column.

## License

This dataset is distributed under the [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en) license.

## Citation

```
@misc{Silero VAD Dataset,
  author = {Silero Team},
  title = {Silero-VAD Dataset: a large public Internet-scale dataset for voice activity detection for 6000+ languages},
  year = {2024},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/snakers4/silero-vad/datasets/README.md}},
  email = {hello@silero.ai}
}
```

[^1]: ``The number of unique ISO codes in this dataset does not match the actual number of languages represented, since some closely related languages may share the same ISO code.``
49
examples/c++/README.md
Normal file
49
examples/c++/README.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Silero-VAD V6 in C++ (based on LibTorch)
|
||||||
|
|
||||||
|
This is the source code for Silero-VAD V6 in C++, utilizing LibTorch & Onnxruntime.
|
||||||
|
You should compare its results with the Python version.
|
||||||
|
Results at 16 and 8kHz have been tested. Batch and CUDA inference options are deprecated.
|
||||||
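For the comparison with Python, a minimal sketch along the following lines can produce reference timestamps (this assumes the standard `snakers4/silero-vad` torch.hub entry point and the usual ordering of its `utils` tuple; adjust to the version you have installed):

```python
import torch

# load the model and its helper utilities from torch.hub
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
get_speech_timestamps, _, read_audio, _, _ = utils

# same inputs as the C++ binary: wav file, sample rate, threshold
wav = read_audio('aepyx.wav', sampling_rate=16000)
print(get_speech_timestamps(model, wav, threshold=0.5))  # sample-based {'start', 'end'} dicts
```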

## Requirements

- GCC 11.4.0 (GCC >= 5.1)
- ONNX Runtime 1.11.0 (other versions are also acceptable)
- LibTorch 1.13.0 (other versions are also acceptable)

## Download Dependencies

```bash
# ONNX Runtime
wget https://github.com/microsoft/onnxruntime/releases/download/v1.11.1/onnxruntime-linux-x64-1.11.1.tgz
tar -xvf onnxruntime-linux-x64-1.11.1.tgz
ln -s onnxruntime-linux-x64-1.11.1 onnxruntime-linux  # soft link used by the build commands below

# LibTorch
wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
unzip libtorch-shared-with-deps-1.13.0+cpu.zip
```

## Compilation

```bash
# ONNX build
g++ main.cc silero.cc -I ./onnxruntime-linux/include/ -L ./onnxruntime-linux/lib/ -lonnxruntime -Wl,-rpath,./onnxruntime-linux/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_ONNX

# Torch build
g++ main.cc silero.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_TORCH
```

## Optional Compilation Flags

- `-DUSE_TORCH`
- `-DUSE_ONNX`

## Run the Program

To run the program, use the following command:

`./silero <sample.wav> <SampleRate> <threshold>`

`./silero aepyx.wav 16000 0.5`

`./silero aepyx_8k.wav 8000 0.5`

The sample file `aepyx.wav` is part of the VoxConverse dataset.
File details: `aepyx.wav` is a 16 kHz, 16-bit audio file; `aepyx_8k.wav` is an 8 kHz, 16-bit audio file.
examples/c++/aepyx.wav (new binary file, not shown)
examples/c++/aepyx_8k.wav (new binary file, not shown)

examples/c++/main.cc (new file, 61 lines)
@@ -0,0 +1,61 @@

#include <iostream>
#include "silero.h"
#include "wav.h"

int main(int argc, char* argv[]) {

    if (argc != 4) {
        std::cerr << "Usage : " << argv[0] << " <wav.path> <SampleRate> <Threshold>" << std::endl;
        std::cerr << "Usage : " << argv[0] << " sample.wav 16000 0.5" << std::endl;
        return 1;
    }

    std::string wav_path = argv[1];
    float sample_rate = std::stof(argv[2]);
    float threshold = std::stof(argv[3]);

    if (sample_rate != 16000 && sample_rate != 8000) {
        std::cerr << "Unsupported sample rate (only 16000 or 8000)." << std::endl;
        return 1;
    }

    // Load model
#ifdef USE_TORCH
    std::string model_path = "../../src/silero_vad/data/silero_vad.jit";
#elif USE_ONNX
    std::string model_path = "../../src/silero_vad/data/silero_vad.onnx";
#endif
    silero::VadIterator vad(model_path);

    vad.threshold = threshold;        // (default: 0.5)
    vad.sample_rate = sample_rate;    // 16000 Hz or 8000 Hz (default: 16000)
    vad.print_as_samples = false;     // if true, timestamps are printed in samples, otherwise in seconds (default: false)

    vad.SetVariables();

    // Read wav
    wav::WavReader wav_reader(wav_path);
    std::vector<float> input_wav(wav_reader.num_samples());

    for (int i = 0; i < wav_reader.num_samples(); i++) {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }

    vad.SpeechProbs(input_wav);

    std::vector<silero::Interval> speeches = vad.GetSpeechTimestamps();
    for (const auto& speech : speeches) {
        if (vad.print_as_samples) {
            std::cout << "{'start': " << static_cast<int>(speech.start) << ", 'end': " << static_cast<int>(speech.end) << "}" << std::endl;
        }
        else {
            std::cout << "{'start': " << speech.start << ", 'end': " << speech.end << "}" << std::endl;
        }
    }

    return 0;
}

examples/c++/silero.cc (new file, 273 lines)
@@ -0,0 +1,273 @@

// silero.cc
// Author      : NathanJHLee
// Created On  : 2025-11-10
// Description : silero 6.2 system for onnx-runtime (c++) and torch-script (c++)
// Version     : 1.3

#include "silero.h"

namespace silero {

#ifdef USE_TORCH
VadIterator::VadIterator(const std::string &model_path,
                         float threshold,
                         int sample_rate,
                         int window_size_ms,
                         int speech_pad_ms,
                         int min_silence_duration_ms,
                         int min_speech_duration_ms,
                         int max_duration_merge_ms,
                         bool print_as_samples)
    : threshold(threshold), sample_rate(sample_rate), window_size_ms(window_size_ms),
      speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms),
      min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms),
      print_as_samples(print_as_samples)
{
    init_torch_model(model_path);
}

VadIterator::~VadIterator() {
}

void VadIterator::init_torch_model(const std::string& model_path) {
    at::set_num_threads(1);
    model = torch::jit::load(model_path);

    model.eval();
    torch::NoGradGuard no_grad;
    std::cout << "Silero libtorch model loaded successfully" << std::endl;
}

void VadIterator::SpeechProbs(std::vector<float>& input_wav) {
    int num_samples = input_wav.size();
    int num_chunks = num_samples / window_size_samples;
    int remainder_samples = num_samples % window_size_samples;
    total_sample_size += num_samples;

    std::vector<torch::Tensor> chunks;

    for (int i = 0; i < num_chunks; i++) {
        float* chunk_start = input_wav.data() + i * window_size_samples;
        torch::Tensor chunk = torch::from_blob(chunk_start, {1, window_size_samples}, torch::kFloat32);
        chunks.push_back(chunk);

        if (i == num_chunks - 1 && remainder_samples > 0) {
            // Zero-pad the trailing partial chunk up to a full window.
            int remaining_samples = num_samples - num_chunks * window_size_samples;
            float* chunk_start_remainder = input_wav.data() + num_chunks * window_size_samples;
            torch::Tensor remainder_chunk = torch::from_blob(chunk_start_remainder, {1, remaining_samples}, torch::kFloat32);
            torch::Tensor padded_chunk = torch::cat({remainder_chunk, torch::zeros({1, window_size_samples - remaining_samples}, torch::kFloat32)}, 1);
            chunks.push_back(padded_chunk);
        }
    }

    if (!chunks.empty()) {
        std::vector<torch::Tensor> outputs;
        torch::Tensor batched_chunks = torch::stack(chunks);
        for (size_t i = 0; i < chunks.size(); i++) {
            torch::NoGradGuard no_grad;
            std::vector<torch::jit::IValue> inputs;
            inputs.push_back(batched_chunks[i]);
            inputs.push_back(sample_rate);
            torch::Tensor output = model.forward(inputs).toTensor();
            outputs.push_back(output);
        }
        torch::Tensor all_outputs = torch::stack(outputs);
        for (size_t i = 0; i < chunks.size(); i++) {
            float output_f = all_outputs[i].item<float>();
            outputs_prob.push_back(output_f);
            // To print per-chunk probabilities from libtorch:
            //std::cout << "Chunk " << i << " prob: " << output_f << "\n";
        }
    }
}

#elif USE_ONNX

VadIterator::VadIterator(const std::string &model_path,
                         float threshold,
                         int sample_rate,
                         int window_size_ms,
                         int speech_pad_ms,
                         int min_silence_duration_ms,
                         int min_speech_duration_ms,
                         int max_duration_merge_ms,
                         bool print_as_samples)
    : sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms),
      speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms),
      min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms),
      print_as_samples(print_as_samples),
      env(ORT_LOGGING_LEVEL_ERROR, "Vad"), session_options(), session(nullptr), allocator(),
      memory_info(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU)), context_samples(64),
      _context(64, 0.0f), current_sample(0), size_state(2 * 1 * 128),
      input_node_names({"input", "state", "sr"}), output_node_names({"output", "stateN"}),
      state_node_dims{2, 1, 128}, sr_node_dims{1}
{
    init_onnx_model(model_path);
}

VadIterator::~VadIterator() {
}

void VadIterator::init_onnx_model(const std::string& model_path) {
    int inter_threads = 1;
    int intra_threads = 1;
    session_options.SetIntraOpNumThreads(intra_threads);
    session_options.SetInterOpNumThreads(inter_threads);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
    std::cout << "Silero onnx model loaded successfully" << std::endl;
}

float VadIterator::predict(const std::vector<float>& data_chunk) {
    // Build the input by prepending _context to the current chunk.
    std::vector<float> new_data(effective_window_size, 0.0f);
    std::copy(_context.begin(), _context.end(), new_data.begin());
    std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
    input = new_data;

    Ort::Value input_ort = Ort::Value::CreateTensor<float>(
        memory_info, input.data(), input.size(), input_node_dims, 2);
    Ort::Value state_ort = Ort::Value::CreateTensor<float>(
        memory_info, _state.data(), _state.size(), state_node_dims, 3);
    Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
        memory_info, sr.data(), sr.size(), sr_node_dims, 1);
    ort_inputs.clear();
    ort_inputs.push_back(std::move(input_ort));
    ort_inputs.push_back(std::move(state_ort));
    ort_inputs.push_back(std::move(sr_ort));

    ort_outputs = session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
        output_node_names.data(), output_node_names.size());

    float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0]; // first ONNX output: speech probability

    float* stateN = ort_outputs[1].GetTensorMutableData<float>(); // second output: updated RNN state
    std::memcpy(_state.data(), stateN, size_state * sizeof(float));

    // Update _context: keep the last context_samples of new_data.
    std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());

    return speech_prob;
}

void VadIterator::SpeechProbs(std::vector<float>& input_wav) {
    reset_states();
    total_sample_size = static_cast<int>(input_wav.size());
    for (size_t j = 0; j < static_cast<size_t>(total_sample_size); j += window_size_samples) {
        if (j + window_size_samples > static_cast<size_t>(total_sample_size))
            break;
        std::vector<float> chunk(input_wav.begin() + j, input_wav.begin() + j + window_size_samples);
        float speech_prob = predict(chunk);
        outputs_prob.push_back(speech_prob);
    }
}

#endif

void VadIterator::reset_states() {
    triggered = false;
    current_sample = 0;
    temp_end = 0;
    outputs_prob.clear();
    total_sample_size = 0;

#ifdef USE_TORCH
    model.run_method("reset_states"); // Reset model states if applicable
#elif USE_ONNX
    std::memset(_state.data(), 0, _state.size() * sizeof(float));
    std::fill(_context.begin(), _context.end(), 0.0f);
#endif
}

std::vector<Interval> VadIterator::GetSpeechTimestamps() {
    std::vector<Interval> speeches = DoVad();
    if (!print_as_samples) {
        for (auto& speech : speeches) {
            speech.start /= sample_rate;
            speech.end /= sample_rate;
        }
    }
    return speeches;
}

void VadIterator::SetVariables() {
    // Initialize internal engine parameters
    init_engine(window_size_ms);
}

void VadIterator::init_engine(int window_size_ms) {
    min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
    speech_pad_samples = sample_rate * speech_pad_ms / 1000;
    window_size_samples = sample_rate / 1000 * window_size_ms;
    min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
#ifdef USE_ONNX
    // ONNX-only buffers
    context_samples = window_size_samples / 8;
    _context.assign(context_samples, 0.0f);

    effective_window_size = window_size_samples + context_samples; // e.g., 512 + 64 = 576 samples
    input_node_dims[0] = 1;
    input_node_dims[1] = effective_window_size;
    _state.resize(size_state);
    sr.resize(1);
    sr[0] = sample_rate;
#endif
}

std::vector<Interval> VadIterator::DoVad() {
    std::vector<Interval> speeches;
    for (size_t i = 0; i < outputs_prob.size(); ++i) {
        float speech_prob = outputs_prob[i];
        current_sample += window_size_samples;
        if (speech_prob >= threshold && temp_end != 0) {
            temp_end = 0;
        }

        if (speech_prob >= threshold) {
            if (!triggered) {
                triggered = true;
                Interval segment;
                segment.start = std::max(0, current_sample - speech_pad_samples - window_size_samples);
                speeches.push_back(segment);
            }
        } else {
            if (triggered) {
                if (speech_prob < threshold - 0.15f) {
                    if (temp_end == 0) {
                        temp_end = current_sample;
                    }
                    if (current_sample - temp_end >= min_silence_samples) {
                        Interval& segment = speeches.back();
                        segment.end = temp_end + speech_pad_samples - window_size_samples;
                        temp_end = 0;
                        triggered = false;
                    }
                }
            }
        }
    }

    if (triggered) {
        std::cout << "Finalizing active speech segment at stream end." << std::endl;
        Interval& segment = speeches.back();
        segment.end = total_sample_size;
        triggered = false;
    }
    // Drop segments shorter than min_speech_samples (net of padding).
    speeches.erase(std::remove_if(speeches.begin(), speeches.end(),
        [this](const Interval& speech) {
            return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
        }), speeches.end());

    reset_states();
    return speeches;
}

} // namespace silero

examples/c++/silero.h (new file, 123 lines)
@@ -0,0 +1,123 @@

#ifndef SILERO_H
#define SILERO_H

// silero.h
// Author      : NathanJHLee
// Created On  : 2025-11-10
// Description : silero 6.2 system for onnx-runtime (c++) and torch-script (c++)
// Version     : 1.3

#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <chrono>
#include <algorithm>
#include <cstring>
#include <memory>   // std::shared_ptr

#ifdef USE_TORCH
#include <torch/torch.h>
#include <torch/script.h>
#elif USE_ONNX
#include "onnxruntime_cxx_api.h"
#endif

namespace silero {

struct Interval {
    float start;
    float end;
    int numberOfSubseg;

    void initialize() {
        start = 0;
        end = 0;
        numberOfSubseg = 0;
    }
};

class VadIterator {
public:
    VadIterator(const std::string &model_path,
                float threshold = 0.5,
                int sample_rate = 16000,
                int window_size_ms = 32,
                int speech_pad_ms = 30,
                int min_silence_duration_ms = 100,
                int min_speech_duration_ms = 250,
                int max_duration_merge_ms = 300,
                bool print_as_samples = false);
    ~VadIterator();

    // Batch (non-streaming) interface (for backward compatibility)
    void SpeechProbs(std::vector<float>& input_wav);
    std::vector<Interval> GetSpeechTimestamps();
    void SetVariables();

    // Public parameters (can be modified by user)
    float threshold;
    int sample_rate;
    int window_size_ms;
    int min_speech_duration_ms;
    int max_duration_merge_ms;
    bool print_as_samples;

private:
#ifdef USE_TORCH
    torch::jit::script::Module model;
    void init_torch_model(const std::string& model_path);
#elif USE_ONNX
    Ort::Env env;                                // ORT environment
    Ort::SessionOptions session_options;         // session options
    std::shared_ptr<Ort::Session> session;       // ONNX session
    Ort::AllocatorWithDefaultOptions allocator;  // default allocator
    Ort::MemoryInfo memory_info;                 // memory info (CPU)

    void init_onnx_model(const std::string& model_path);
    float predict(const std::vector<float>& data_chunk);

    int context_samples;                         // e.g., 64 samples
    std::vector<float> _context;                 // initialized to all zeros
    int effective_window_size;

    // ONNX input/output buffers and node names
    std::vector<Ort::Value> ort_inputs;
    std::vector<const char*> input_node_names;
    std::vector<float> input;
    unsigned int size_state;                     // fixed size: 2 * 1 * 128
    std::vector<float> _state;
    std::vector<int64_t> sr;
    int64_t input_node_dims[2];                  // [1, effective_window_size]
    const int64_t state_node_dims[3];            // [2, 1, 128]
    const int64_t sr_node_dims[1];               // [1]
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names;  // defaults: ["output", "stateN"]
#endif
    std::vector<float> outputs_prob;             // used in batch mode
    int min_silence_samples;
    int min_speech_samples;
    int speech_pad_samples;
    int window_size_samples;
    int duration_merge_samples;
    int current_sample = 0;
    int total_sample_size = 0;
    int min_silence_duration_ms;
    int speech_pad_ms;
    bool triggered = false;
    int temp_end = 0;
    int global_end = 0;
    int erase_tail_count = 0;

    void init_engine(int window_size_ms);
    void reset_states();
    std::vector<Interval> DoVad();
};

} // namespace silero

#endif // SILERO_H

examples/c++/wav.h (new file, 237 lines)
@@ -0,0 +1,237 @@

// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <iostream>

// #include "utils/log.h"

namespace wav {

struct WavHeader {
  char riff[4];  // "riff"
  unsigned int size;
  char wav[4];   // "WAVE"
  char fmt[4];   // "fmt "
  unsigned int fmt_size;
  uint16_t format;
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;
  char data[4];  // "data"
  unsigned int data_size;
};

class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  explicit WavReader(const std::string& filename) { Open(filename); }

  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for reading
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    fread(&header, 1, sizeof(header), fp);
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      return false;
    } else if (header.fmt_size > 16) {
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      fread(header.data, 8, sizeof(char), fp);
    }
    // check "riff" "WAVE" "fmt " "data"

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      // We will just ignore the data in these chunks.
      fseek(fp, header.data_size, SEEK_CUR);
      // read next sub chunk
      fread(header.data, 8, sizeof(char), fp);
    }

    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    data_ = new float[num_data];  // Create 1-dim array
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_    :" << num_channel_ << std::endl;
    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples     :" << num_data << std::endl;
    std::cout << "num_data_size   :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        char sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(char), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // S32
          int sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(int), fp);
            data_[i] = static_cast<float>(sample) / 32768;
          }
        } else if (header.format == 3) {  // IEEE float
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;
};

class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "wb");  // "wb": write in binary mode
    // init char 'riff' 'WAVE' 'fmt ' 'data'
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;
  int num_samples_;  // total float points in data_
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_

examples/colab_record_example.ipynb (new file, 237 lines)
@@ -0,0 +1,237 @@

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bccAucKjnPHm"
      },
      "source": [
        "### Dependencies and inputs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cSih95WFmwgi"
      },
      "outputs": [],
      "source": [
        "#!apt install ffmpeg\n",
        "!pip -q install pydub\n",
        "from google.colab import output\n",
        "from base64 import b64decode, b64encode\n",
        "from io import BytesIO\n",
        "import numpy as np\n",
        "from pydub import AudioSegment\n",
        "from IPython.display import HTML, display\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "import moviepy.editor as mpe\n",
        "from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
        "import matplotlib\n",
        "matplotlib.use('Agg')\n",
        "\n",
        "torch.set_num_threads(1)\n",
        "\n",
        "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
        "                          model='silero_vad',\n",
        "                          force_reload=True)\n",
        "\n",
        "def int2float(audio):\n",
        "    samples = audio.get_array_of_samples()\n",
        "    new_sound = audio._spawn(samples)\n",
        "    arr = np.array(samples).astype(np.float32)\n",
        "    arr = arr / np.abs(arr).max()\n",
        "    return arr\n",
        "\n",
        "AUDIO_HTML = \"\"\"\n",
        "<script>\n",
        "var my_div = document.createElement(\"DIV\");\n",
        "var my_p = document.createElement(\"P\");\n",
        "var my_btn = document.createElement(\"BUTTON\");\n",
        "var t = document.createTextNode(\"Press to start recording\");\n",
        "\n",
        "my_btn.appendChild(t);\n",
        "//my_p.appendChild(my_btn);\n",
        "my_div.appendChild(my_btn);\n",
        "document.body.appendChild(my_div);\n",
        "\n",
        "var base64data = 0;\n",
        "var reader;\n",
        "var recorder, gumStream;\n",
        "var recordButton = my_btn;\n",
        "\n",
        "var handleSuccess = function(stream) {\n",
        "  gumStream = stream;\n",
        "  var options = {\n",
        "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
        "    mimeType : 'audio/webm;codecs=opus'\n",
        "    //mimeType : 'audio/webm;codecs=pcm'\n",
        "  };\n",
        "  //recorder = new MediaRecorder(stream, options);\n",
        "  recorder = new MediaRecorder(stream);\n",
        "  recorder.ondataavailable = function(e) {\n",
        "    var url = URL.createObjectURL(e.data);\n",
        "    // var preview = document.createElement('audio');\n",
        "    // preview.controls = true;\n",
        "    // preview.src = url;\n",
        "    // document.body.appendChild(preview);\n",
        "\n",
        "    reader = new FileReader();\n",
        "    reader.readAsDataURL(e.data);\n",
        "    reader.onloadend = function() {\n",
        "      base64data = reader.result;\n",
        "      //console.log(\"Inside FileReader:\" + base64data);\n",
        "    }\n",
        "  };\n",
        "  recorder.start();\n",
        "};\n",
        "\n",
        "recordButton.innerText = \"Recording... press to stop\";\n",
        "\n",
        "navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n",
        "\n",
        "\n",
        "function toggleRecording() {\n",
        "  if (recorder && recorder.state == \"recording\") {\n",
        "    recorder.stop();\n",
        "    gumStream.getAudioTracks()[0].stop();\n",
        "    recordButton.innerText = \"Saving recording...\"\n",
        "  }\n",
        "}\n",
        "\n",
        "// https://stackoverflow.com/a/951057\n",
        "function sleep(ms) {\n",
        "  return new Promise(resolve => setTimeout(resolve, ms));\n",
        "}\n",
        "\n",
        "var data = new Promise(resolve=>{\n",
        "//recordButton.addEventListener(\"click\", toggleRecording);\n",
        "recordButton.onclick = ()=>{\n",
        "toggleRecording()\n",
        "\n",
        "sleep(2000).then(() => {\n",
        "  // wait 2000ms for the data to be available...\n",
        "  // ideally this should use something like await...\n",
        "  //console.log(\"Inside data:\" + base64data)\n",
        "  resolve(base64data.toString())\n",
        "\n",
        "});\n",
        "\n",
        "}\n",
        "});\n",
        "\n",
        "</script>\n",
        "\"\"\"\n",
        "\n",
        "def record(sec=10):\n",
        "    display(HTML(AUDIO_HTML))\n",
        "    s = output.eval_js(\"data\")\n",
        "    b = b64decode(s.split(',')[1])\n",
        "    audio = AudioSegment.from_file(BytesIO(b))\n",
        "    audio.export('test.mp3', format='mp3')\n",
        "    audio = audio.set_channels(1)\n",
        "    audio = audio.set_frame_rate(16000)\n",
        "    audio_float = int2float(audio)\n",
        "    audio_tens = torch.tensor(audio_float)\n",
        "    return audio_tens\n",
        "\n",
        "def make_animation(probs, audio_duration, interval=40):\n",
        "    fig = plt.figure(figsize=(16, 9))\n",
        "    ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
        "    line, = ax.plot([], [], lw=2)\n",
        "    x = [i / 16000 * 512 for i in range(len(probs))]\n",
        "    plt.xlabel('Time, seconds', fontsize=16)\n",
        "    plt.ylabel('Speech Probability', fontsize=16)\n",
        "\n",
        "    def init():\n",
        "        plt.fill_between(x, probs, color='#064273')\n",
        "        line.set_data([], [])\n",
        "        line.set_color('#990000')\n",
        "        return line,\n",
        "\n",
        "    def animate(i):\n",
        "        x = i * interval / 1000 - 0.04\n",
        "        y = np.linspace(0, 1.02, 2)\n",
        "\n",
        "        line.set_data(x, y)\n",
        "        line.set_color('#990000')\n",
        "        return line,\n",
        "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
        "\n",
        "    f = r\"animation.mp4\"\n",
        "    writervideo = FFMpegWriter(fps=1000/interval)\n",
        "    anim.save(f, writer=writervideo)\n",
        "    plt.close('all')\n",
        "\n",
        "def combine_audio(vidname, audname, outname, fps=25):\n",
        "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
        "    audio_background = mpe.AudioFileClip(audname)\n",
        "    final_clip = my_clip.set_audio(audio_background)\n",
        "    final_clip.write_videofile(outname,fps=fps,verbose=False)\n",
        "\n",
        "def record_make_animation():\n",
        "    tensor = record()\n",
        "    print('Calculating probabilities...')\n",
        "    speech_probs = []\n",
        "    window_size_samples = 512\n",
        "    speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
        "    model.reset_states()\n",
        "    print('Making animation...')\n",
        "    make_animation(speech_probs, len(tensor) / 16000)\n",
        "\n",
        "    print('Merging your voice with animation...')\n",
        "    combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n",
        "    print('Done!')\n",
        "    mp4 = open('merged.mp4','rb').read()\n",
        "    data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
        "    display(HTML(\"\"\"\n",
        "    <video width=800 controls>\n",
        "      <source src=\"%s\" type=\"video/mp4\">\n",
        "    </video>\n",
        "    \"\"\" % data_url))\n",
        "\n",
        "    return speech_probs"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IFVs3GvTnpB1"
      },
      "source": [
        "## Record example"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5EBjrTwiqAaQ"
      },
      "outputs": [],
      "source": [
        "speech_probs = record_make_animation()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "bccAucKjnPHm"
      ],
      "name": "Untitled2.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}

examples/cpp/README.md (new file, 43 lines)
@@ -0,0 +1,43 @@

# Stream example in C++

Here's a simple example of running the VAD model in C++ with ONNX Runtime.

## Requirements

The code was tested in the environment below; feel free to try others.

- WSL2 + Debian-bullseye (docker)
- gcc 12.2.0
- onnxruntime-linux-x64-1.12.1

## Usage

1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye`

2. Install onnxruntime-linux-x64-1.12.1

   - Download the onnxruntime library:

     `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz`

   - Unpack it. Assume the path is `/root/onnxruntime-linux-x64-1.12.1`

3. Modify the wav path & test configs in the main function

   `wav::WavReader wav_reader("${path_to_your_wav_file}");`

   Test sample rate, frame length in ms, threshold...

4. Build with gcc and run

```bash
# Build
g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test

# Run
./test
```

examples/cpp/silero-vad-onnx.cpp (new file, 367 lines)
@@ -0,0 +1,367 @@

#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <iostream>
#include <vector>
#include <sstream>
#include <cstring>
#include <limits>
#include <chrono>
#include <iomanip>
#include <memory>
#include <string>
#include <stdexcept>
#include <cstdio>
#include <cstdarg>
#include <cmath>   // for std::rint
#if __cplusplus < 201703L
#include <memory>
#endif

//#define __DEBUG_SPEECH_PROB___

#include "onnxruntime_cxx_api.h"
#include "wav.h"   // For reading WAV files

// timestamp_t class: stores the start and end (in samples) of a speech segment.
class timestamp_t {
public:
    int start;
    int end;

    timestamp_t(int start = -1, int end = -1)
        : start(start), end(end) { }

    timestamp_t& operator=(const timestamp_t& a) {
        start = a.start;
        end = a.end;
        return *this;
    }

    bool operator==(const timestamp_t& a) const {
        return (start == a.start && end == a.end);
    }

    // Returns a formatted string of the timestamp.
    std::string c_str() const {
        return format("{start:%08d, end:%08d}", start, end);
    }
private:
    // Helper function for formatting.
    std::string format(const char* fmt, ...) const {
        char buf[256];
        va_list args;
        va_start(args, fmt);
        const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        if (r < 0)
            return {};
        const size_t len = r;
        if (len < sizeof(buf))
            return std::string(buf, len);
#if __cplusplus >= 201703L
        std::string s(len, '\0');
        va_start(args, fmt);
        std::vsnprintf(s.data(), len + 1, fmt, args);
        va_end(args);
        return s;
#else
        auto vbuf = std::unique_ptr<char[]>(new char[len + 1]);
        va_start(args, fmt);
        std::vsnprintf(vbuf.get(), len + 1, fmt, args);
        va_end(args);
        return std::string(vbuf.get(), len);
#endif
    }
};

// VadIterator class: uses ONNX Runtime to detect speech segments.
class VadIterator {
private:
    // ONNX Runtime resources
    Ort::Env env;
    Ort::SessionOptions session_options;
    std::shared_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);

    // ----- Context-related additions -----
    const int context_samples = 64;  // For 16kHz, 64 samples are added as context.
    std::vector<float> _context;     // Holds the last 64 samples from the previous chunk (initialized to zero).

    // Original window size (e.g., 32ms corresponds to 512 samples)
    int window_size_samples;
    // Effective window size = window_size_samples + context_samples
    int effective_window_size;

    // Additional declaration: samples per millisecond
    int sr_per_ms;

    // ONNX Runtime input/output buffers
    std::vector<Ort::Value> ort_inputs;
    std::vector<const char*> input_node_names = { "input", "state", "sr" };
    std::vector<float> input;
    unsigned int size_state = 2 * 1 * 128;
    std::vector<float> _state;
    std::vector<int64_t> sr;
    int64_t input_node_dims[2] = {};
    const int64_t state_node_dims[3] = { 2, 1, 128 };
    const int64_t sr_node_dims[1] = { 1 };
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names = { "output", "stateN" };

    // Model configuration parameters
    int sample_rate;
    float threshold;
    int min_silence_samples;
    int min_silence_samples_at_max_speech;
    int min_speech_samples;
    float max_speech_samples;
    int speech_pad_samples;
    int audio_length_samples;

    // State management
    bool triggered = false;
    unsigned int temp_end = 0;
    unsigned int current_sample = 0;
    int prev_end;
    int next_start = 0;
    std::vector<timestamp_t> speeches;
    timestamp_t current_speech;

    // Loads the ONNX model.
    void init_onnx_model(const std::wstring& model_path) {
        init_engine_threads(1, 1);
        session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
    }

    // Initializes threading settings.
    void init_engine_threads(int inter_threads, int intra_threads) {
        session_options.SetIntraOpNumThreads(intra_threads);
        session_options.SetInterOpNumThreads(inter_threads);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    }

    // Resets internal state (_state, _context, etc.)
    void reset_states() {
        std::memset(_state.data(), 0, _state.size() * sizeof(float));
        triggered = false;
        temp_end = 0;
        current_sample = 0;
        prev_end = next_start = 0;
        speeches.clear();
        current_speech = timestamp_t();
        std::fill(_context.begin(), _context.end(), 0.0f);
    }

    // Inference: runs inference on one chunk of input data.
    // data_chunk is expected to have window_size_samples samples.
    void predict(const std::vector<float>& data_chunk) {
        // Build new input: first context_samples from _context, followed by the current chunk (window_size_samples).
        std::vector<float> new_data(effective_window_size, 0.0f);
        std::copy(_context.begin(), _context.end(), new_data.begin());
        std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
        input = new_data;

        // Create input tensor (input_node_dims[1] is already set to effective_window_size).
        Ort::Value input_ort = Ort::Value::CreateTensor<float>(
            memory_info, input.data(), input.size(), input_node_dims, 2);
        Ort::Value state_ort = Ort::Value::CreateTensor<float>(
            memory_info, _state.data(), _state.size(), state_node_dims, 3);
        Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
            memory_info, sr.data(), sr.size(), sr_node_dims, 1);
        ort_inputs.clear();
        ort_inputs.emplace_back(std::move(input_ort));
        ort_inputs.emplace_back(std::move(state_ort));
        ort_inputs.emplace_back(std::move(sr_ort));

        // Run inference.
        ort_outputs = session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
            output_node_names.data(), output_node_names.size());

        float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
        float* stateN = ort_outputs[1].GetTensorMutableData<float>();
        std::memcpy(_state.data(), stateN, size_state * sizeof(float));
        current_sample += static_cast<unsigned int>(window_size_samples);  // Advance by the original window size.

        // If speech is detected (probability >= threshold)
        if (speech_prob >= threshold) {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples;
            printf("{ start: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif
            if (temp_end != 0) {
                temp_end = 0;
                if (next_start < prev_end)
                    next_start = current_sample - window_size_samples;
            }
            if (!triggered) {
                triggered = true;
                current_speech.start = current_sample - window_size_samples;
            }
            // Update context: copy the last context_samples from new_data.
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        // If the speech segment becomes too long.
        if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
            if (prev_end > 0) {
                current_speech.end = prev_end;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                if (next_start < prev_end)
                    triggered = false;
                else
                    current_speech.start = next_start;
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
            }
            else {
                current_speech.end = current_sample;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
                triggered = false;
            }
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
            // When the speech probability temporarily drops but is still in speech, update context without changing state.
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }

        if (speech_prob < (threshold - 0.15)) {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples - speech_pad_samples;
            printf("{ end: %.3f s (%.3f) %08d}\n", 1.0f * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif
            if (triggered) {
                if (temp_end == 0)
                    temp_end = current_sample;
                if (current_sample - temp_end > min_silence_samples_at_max_speech)
                    prev_end = temp_end;
                if ((current_sample - temp_end) >= min_silence_samples) {
                    current_speech.end = temp_end;
                    if (current_speech.end - current_speech.start > min_speech_samples) {
                        speeches.push_back(current_speech);
                        current_speech = timestamp_t();
                        prev_end = 0;
                        next_start = 0;
                        temp_end = 0;
                        triggered = false;
                    }
                }
            }
            std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
            return;
        }
    }

public:
    // Process the entire audio input.
    void process(const std::vector<float>& input_wav) {
        reset_states();
        audio_length_samples = static_cast<int>(input_wav.size());
        // Process audio in chunks of window_size_samples (e.g., 512 samples)
        for (size_t j = 0; j < static_cast<size_t>(audio_length_samples); j += static_cast<size_t>(window_size_samples)) {
            if (j + static_cast<size_t>(window_size_samples) > static_cast<size_t>(audio_length_samples))
                break;
            std::vector<float> chunk(&input_wav[j], &input_wav[j] + window_size_samples);
            predict(chunk);
        }
        if (current_speech.start >= 0) {
            current_speech.end = audio_length_samples;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
            triggered = false;
        }
    }

    // Returns the detected speech timestamps.
    const std::vector<timestamp_t> get_speech_timestamps() const {
        return speeches;
    }

    // Public method to reset the internal state.
    void reset() {
        reset_states();
    }

public:
    // Constructor: sets model path, sample rate, window size (ms), and other parameters.
    // The parameters are set to match the Python version.
    VadIterator(const std::wstring ModelPath,
        int Sample_rate = 16000, int windows_frame_size = 32,
        float Threshold = 0.5, int min_silence_duration_ms = 100,
        int speech_pad_ms = 30, int min_speech_duration_ms = 250,
        float max_speech_duration_s = std::numeric_limits<float>::infinity())
        : sample_rate(Sample_rate), threshold(Threshold), speech_pad_samples(speech_pad_ms), prev_end(0)
    {
        sr_per_ms = sample_rate / 1000;  // e.g., 16000 / 1000 = 16
        window_size_samples = windows_frame_size * sr_per_ms;  // e.g., 32ms * 16 = 512 samples
        effective_window_size = window_size_samples + context_samples;  // e.g., 512 + 64 = 576 samples
        input_node_dims[0] = 1;
        input_node_dims[1] = effective_window_size;
        _state.resize(size_state);
        sr.resize(1);
        sr[0] = sample_rate;
        _context.assign(context_samples, 0.0f);
        min_speech_samples = sr_per_ms * min_speech_duration_ms;
        max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
        min_silence_samples = sr_per_ms * min_silence_duration_ms;
        min_silence_samples_at_max_speech = sr_per_ms * 98;
        init_onnx_model(ModelPath);
    }
};

int main() {
    // Read the WAV file (expects 16000 Hz, mono, PCM).
    wav::WavReader wav_reader("audio/recorder.wav");  // File located in the "audio" folder.
    int numSamples = wav_reader.num_samples();
    std::vector<float> input_wav(static_cast<size_t>(numSamples));
    for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }

    // Set the ONNX model path (file located in the "model" folder).
    std::wstring model_path = L"model/silero_vad.onnx";

    // Initialize the VadIterator.
    VadIterator vad(model_path);

    // Process the audio.
    vad.process(input_wav);

    // Retrieve the speech timestamps (in samples).
    std::vector<timestamp_t> stamps = vad.get_speech_timestamps();

    // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
    const float sample_rate_float = 16000.0f;
    for (size_t i = 0; i < stamps.size(); i++) {
        float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
        float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
        std::cout << "Speech detected from "
                  << std::fixed << std::setprecision(1) << start_sec
                  << " s to "
                  << std::fixed << std::setprecision(1) << end_sec
                  << " s" << std::endl;
    }

    // Optionally, reset the internal state.
    vad.reset();

    return 0;
}
237
examples/cpp/wav.h
Normal file
237
examples/cpp/wav.h
Normal file
@@ -0,0 +1,237 @@
|
|||||||
|
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>

#include <iostream>

// #include "utils/log.h"

namespace wav {

struct WavHeader {
  char riff[4];  // "RIFF"
  unsigned int size;
  char wav[4];   // "WAVE"
  char fmt[4];   // "fmt "
  unsigned int fmt_size;
  uint16_t format;
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;
  char data[4];  // "data"
  unsigned int data_size;
};

class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  explicit WavReader(const std::string& filename) { Open(filename); }

  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for binary reading
    if (NULL == fp) {
      std::cout << "Error in reading " << filename << std::endl;
      return false;
    }

    WavHeader header;
    fread(&header, 1, sizeof(header), fp);
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      return false;
    } else if (header.fmt_size > 16) {
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      fread(header.data, 8, sizeof(char), fp);
    }
    // check "riff" "WAVE" "fmt " "data"

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      // We will just ignore the data in these chunks.
      fseek(fp, header.data_size, SEEK_CUR);
      // read next sub chunk
      fread(header.data, 8, sizeof(char), fp);
    }

    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    data_ = new float[num_data];  // 1-dim array holding all channels, interleaved
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_    :" << num_channel_ << std::endl;
    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples     :" << num_data << std::endl;
    std::cout << "num_data_size   :" << header.data_size << std::endl;

    // Note: all integer formats below are scaled by 1/32768 (the int16 range),
    // regardless of their native bit depth, as in the original code.
    switch (bits_per_sample_) {
      case 8: {
        char sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(char), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // S32 (32-bit signed integer PCM)
          int sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(int), fp);
            data_[i] = static_cast<float>(sample) / 32768;
          }
        } else if (header.format == 3) {  // IEEE float
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;
};

class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "wb");  // "wb": binary mode matters on Windows
    // Prebuilt 44-byte header containing the 'RIFF' 'WAVE' 'fmt ' 'data' magics
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;
  int num_samples_;  // total float points in data_
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_
45
examples/cpp_libtorch/README.md
Normal file
@@ -0,0 +1,45 @@
# Silero-VAD V5 in C++ (based on LibTorch)

This is the source code for Silero-VAD V5 in C++, utilizing LibTorch. The primary implementation is CPU-based, and its results should be compared against the Python version. Only results at 16 kHz have been tested.

Additionally, batch and CUDA inference options are available if you want to explore further. Note that with batch inference the speech probabilities may differ slightly from the standard version, likely due to differences in caching: unlike sequential processing of individual inputs, batch inference may not use the cache from previous chunks. Despite this, batch inference offers significantly faster processing. For optimal results, consider adjusting the threshold when using batch inference.

## Requirements

- GCC 11.4.0 (GCC >= 5.1)
- LibTorch 1.13.0 (other versions are also acceptable)

## Download LibTorch

```bash
# CPU version
wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
unzip libtorch-shared-with-deps-1.13.0+cpu.zip

# CUDA version
wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
unzip libtorch-shared-with-deps-1.13.0+cu116.zip
```

## Compilation

```bash
# CPU version
g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0

# CUDA version
g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
```

## Optional Compilation Flags

- `-DUSE_BATCH`: enable batch inference
- `-DUSE_GPU`: use GPU for inference
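Both flags can be combined; for example, a batch-enabled GPU build is simply the CUDA compile line above with `-DUSE_BATCH` appended:

```bash
g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include \
    -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ \
    -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU -DUSE_BATCH
```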

## Run the Program

To run the program, use the following command:

`./silero aepyx.wav 16000 0.5`

The sample file aepyx.wav is part of the VoxConverse dataset.
File details: aepyx.wav is a 16 kHz, 16-bit audio file.
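Since `main.cc` sets `print_as_samples = true`, the program prints one segment per line as sample indices. The output has the following shape (values are illustrative, not actual results for aepyx.wav):

```
{'start': 31744, 'end': 126464}
{'start': 163328, 'end': 244224}
```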
BIN
examples/cpp_libtorch/aepyx.wav
Normal file
Binary file not shown.
54
examples/cpp_libtorch/main.cc
Normal file
@@ -0,0 +1,54 @@
#include <iostream>
#include "silero_torch.h"
#include "wav.h"

int main(int argc, char* argv[]) {

  if (argc != 4) {
    std::cerr << "Usage: " << argv[0] << " <wav.path> <SampleRate> <Threshold>" << std::endl;
    std::cerr << "Usage: " << argv[0] << " sample.wav 16000 0.5" << std::endl;
    return 1;
  }

  std::string wav_path = argv[1];
  float sample_rate = std::stof(argv[2]);
  float threshold = std::stof(argv[3]);

  // Load model
  std::string model_path = "../../src/silero_vad/data/silero_vad.jit";
  silero::VadIterator vad(model_path);

  vad.threshold = threshold;      // (default: 0.5)
  vad.sample_rate = sample_rate;  // 16000 Hz or 8000 Hz (default: 16000)
  vad.print_as_samples = true;    // if true, timestamps are printed in samples;
                                  // otherwise, in seconds (default: false)

  vad.SetVariables();

  // Read wav
  wav::WavReader wav_reader(wav_path);
  std::vector<float> input_wav(wav_reader.num_samples());

  for (int i = 0; i < wav_reader.num_samples(); i++) {
    input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
  }

  vad.SpeechProbs(input_wav);

  std::vector<silero::SpeechSegment> speeches = vad.GetSpeechTimestamps();
  for (const auto& speech : speeches) {
    if (vad.print_as_samples) {
      std::cout << "{'start': " << static_cast<int>(speech.start) << ", 'end': " << static_cast<int>(speech.end) << "}" << std::endl;
    } else {
      std::cout << "{'start': " << speech.start << ", 'end': " << speech.end << "}" << std::endl;
    }
  }

  return 0;
}
BIN
examples/cpp_libtorch/silero
Executable file
Binary file not shown.
285
examples/cpp_libtorch/silero_torch.cc
Normal file
@@ -0,0 +1,285 @@
// Author      : Nathan Lee
// Created On  : 2024-11-18
// Description : silero 5.1 system for torch-script (c++).
// Version     : 1.0

#include "silero_torch.h"

namespace silero {

VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate, int window_size_ms, int speech_pad_ms, int min_silence_duration_ms, int min_speech_duration_ms, int max_duration_merge_ms, bool print_as_samples)
  : sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms), speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms), min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms), print_as_samples(print_as_samples)
{
  init_torch_model(model_path);
  //init_engine(window_size_ms);  // derived sizes are set later via SetVariables()
}

VadIterator::~VadIterator() {
}

void VadIterator::SpeechProbs(std::vector<float>& input_wav) {
  // The sample rate must match the model's expected sample rate.
  // Process the waveform in chunks of window_size_samples (512 samples at 16 kHz).
  int num_samples = input_wav.size();
  int num_chunks = num_samples / window_size_samples;
  int remainder_samples = num_samples % window_size_samples;

  total_sample_size += num_samples;

  std::vector<torch::Tensor> chunks;

  for (int i = 0; i < num_chunks; i++) {
    float* chunk_start = input_wav.data() + i * window_size_samples;
    torch::Tensor chunk = torch::from_blob(chunk_start, {1, window_size_samples}, torch::kFloat32);
    chunks.push_back(chunk);

    if (i == num_chunks - 1 && remainder_samples > 0) {  // last chunk && leftover samples exist
      int remaining_samples = num_samples - num_chunks * window_size_samples;
      float* chunk_start_remainder = input_wav.data() + num_chunks * window_size_samples;

      torch::Tensor remainder_chunk = torch::from_blob(chunk_start_remainder, {1, remaining_samples},
                                                       torch::kFloat32);
      // Pad the remainder chunk with zeros to match window_size_samples
      torch::Tensor padded_chunk = torch::cat({remainder_chunk, torch::zeros({1, window_size_samples
                                              - remaining_samples}, torch::kFloat32)}, 1);
      chunks.push_back(padded_chunk);
    }
  }

  if (!chunks.empty()) {

#ifdef USE_BATCH
    torch::Tensor batched_chunks = torch::stack(chunks);  // Stack all chunks into a single tensor
    batched_chunks = torch::cat({batched_chunks.squeeze(1)});

#ifdef USE_GPU
    batched_chunks = batched_chunks.to(at::kCUDA);  // Move the entire batch to GPU once
#endif
    // Prepare input for the model
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(batched_chunks);  // Batch of chunks
    inputs.push_back(sample_rate);     // Assuming sample_rate is a valid input for the model

    // Run inference on the batch
    torch::NoGradGuard no_grad;
    torch::Tensor output = model.forward(inputs).toTensor();
#ifdef USE_GPU
    output = output.to(at::kCPU);  // Move the output back to CPU once
#endif
    // Collect output probabilities
    for (int i = 0; i < chunks.size(); i++) {
      float output_f = output[i].item<float>();
      outputs_prob.push_back(output_f);
    }
#else
    std::vector<torch::Tensor> outputs;
    torch::Tensor batched_chunks = torch::stack(chunks);
#ifdef USE_GPU
    batched_chunks = batched_chunks.to(at::kCUDA);
#endif
    for (int i = 0; i < chunks.size(); i++) {
      torch::NoGradGuard no_grad;
      std::vector<torch::jit::IValue> inputs;
      inputs.push_back(batched_chunks[i]);
      inputs.push_back(sample_rate);

      torch::Tensor output = model.forward(inputs).toTensor();
      outputs.push_back(output);
    }
    torch::Tensor all_outputs = torch::stack(outputs);
#ifdef USE_GPU
    all_outputs = all_outputs.to(at::kCPU);
#endif
    for (int i = 0; i < chunks.size(); i++) {
      float output_f = all_outputs[i].item<float>();
      outputs_prob.push_back(output_f);
    }
#endif
  }
}

std::vector<SpeechSegment> VadIterator::GetSpeechTimestamps() {
  std::vector<SpeechSegment> speeches = DoVad();

#ifdef USE_BATCH
  // With batch inference, prefer mergeSpeeches() to arrange the timestamps;
  // it yields more reasonable output given the distorted probabilities.
  duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
  std::vector<SpeechSegment> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
  if (!print_as_samples) {
    for (auto& speech : speeches_merge) {  // samples to seconds
      speech.start /= sample_rate;
      speech.end /= sample_rate;
    }
  }

  return speeches_merge;
#else
  if (!print_as_samples) {
    for (auto& speech : speeches) {  // samples to seconds
      speech.start /= sample_rate;
      speech.end /= sample_rate;
    }
  }

  return speeches;
#endif
}

void VadIterator::SetVariables() {
  init_engine(window_size_ms);
}

void VadIterator::init_engine(int window_size_ms) {
  min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
  speech_pad_samples = sample_rate * speech_pad_ms / 1000;
  window_size_samples = sample_rate / 1000 * window_size_ms;
  min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
}

void VadIterator::init_torch_model(const std::string& model_path) {
  at::set_num_threads(1);
  model = torch::jit::load(model_path);

#ifdef USE_GPU
  if (!torch::cuda::is_available()) {
    std::cout << "CUDA is not available! Please check your GPU settings" << std::endl;
    throw std::runtime_error("CUDA is not available!");
  } else {
    std::cout << "CUDA available! Running on the 0th GPU" << std::endl;
    model.to(at::Device(at::kCUDA, 0));  // select the 0th device
  }
#endif

  model.eval();
  torch::NoGradGuard no_grad;
  std::cout << "Model loaded successfully" << std::endl;
}

void VadIterator::reset_states() {
  triggered = false;
  current_sample = 0;
  temp_end = 0;
  outputs_prob.clear();
  model.run_method("reset_states");
  total_sample_size = 0;
}

std::vector<SpeechSegment> VadIterator::DoVad() {
  std::vector<SpeechSegment> speeches;

  for (size_t i = 0; i < outputs_prob.size(); ++i) {
    float speech_prob = outputs_prob[i];
    current_sample += window_size_samples;

    if (speech_prob >= threshold && temp_end != 0) {
      temp_end = 0;
    }

    if (speech_prob >= threshold && !triggered) {
      triggered = true;
      SpeechSegment segment;
      segment.start = std::max(0, current_sample - speech_pad_samples - window_size_samples);
      speeches.push_back(segment);
      continue;
    }

    if (speech_prob < threshold - 0.15f && triggered) {
      if (temp_end == 0) {
        temp_end = current_sample;
      }

      if (current_sample - temp_end < min_silence_samples) {
        continue;
      } else {
        SpeechSegment& segment = speeches.back();
        segment.end = temp_end + speech_pad_samples - window_size_samples;
        temp_end = 0;
        triggered = false;
      }
    }
  }

  // If probabilities stay low and only the very last frame scores high, `triggered`
  // is set above and a segment starts on that same frame, so start may equal end;
  // the minimum-length filtering below removes such degenerate segments.
  if (triggered) {
    std::cout << "speech still active at the last frame; closing the final segment" << std::endl;
    SpeechSegment& segment = speeches.back();
    segment.end = total_sample_size;  // use the final sample as the end of the last segment
    triggered = false;                // reset the VAD state
  }

  speeches.erase(
      std::remove_if(
          speeches.begin(),
          speeches.end(),
          [this](const SpeechSegment& speech) {
            return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
            // min_speech_samples is 4000 samples (0.25 s at 16 kHz)
            // Note: the padding is stripped from both ends before the length is measured.
          }
      ),
      speeches.end()
  );

  reset_states();
  return speeches;
}

std::vector<SpeechSegment> VadIterator::mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples) {
  std::vector<SpeechSegment> mergedSpeeches;

  if (speeches.empty()) {
    return mergedSpeeches;  // return an empty vector
  }

  // Initialize with the first segment
  SpeechSegment currentSegment = speeches[0];

  for (size_t i = 1; i < speeches.size(); ++i) {  // the first segment is already in currentSegment, so start at i = 1
    // If the gap between two segments is smaller than duration_merge_samples, merge them
    if (speeches[i].start - currentSegment.end < duration_merge_samples) {
      // Extend the current segment's end
      currentSegment.end = speeches[i].end;
    } else {
      // Otherwise store the current segment and start a new one
      mergedSpeeches.push_back(currentSegment);
      currentSegment = speeches[i];
    }
  }

  // Append the last segment
  mergedSpeeches.push_back(currentSegment);

  return mergedSpeeches;
}

}  // namespace silero
75
examples/cpp_libtorch/silero_torch.h
Normal file
@@ -0,0 +1,75 @@
// Author      : Nathan Lee
// Created On  : 2024-11-18
// Description : silero 5.1 system for torch-script (c++).
// Version     : 1.0

#ifndef SILERO_TORCH_H
#define SILERO_TORCH_H

#include <string>
#include <memory>
#include <stdexcept>
#include <iostream>
#include <vector>
#include <fstream>
#include <chrono>

#include <torch/torch.h>
#include <torch/script.h>

namespace silero {

struct SpeechSegment {
  int start;
  int end;
};

class VadIterator {
 public:
  VadIterator(const std::string &model_path, float threshold = 0.5, int sample_rate = 16000,
              int window_size_ms = 32, int speech_pad_ms = 30, int min_silence_duration_ms = 100,
              int min_speech_duration_ms = 250, int max_duration_merge_ms = 300, bool print_as_samples = false);
  ~VadIterator();

  void SpeechProbs(std::vector<float>& input_wav);
  std::vector<silero::SpeechSegment> GetSpeechTimestamps();
  void SetVariables();

  float threshold;
  int sample_rate;
  int window_size_ms;
  int min_speech_duration_ms;
  int max_duration_merge_ms;
  bool print_as_samples;

 private:
  torch::jit::script::Module model;
  std::vector<float> outputs_prob;
  int min_silence_samples;
  int min_speech_samples;
  int speech_pad_samples;
  int window_size_samples;
  int duration_merge_samples;
  int current_sample = 0;

  int total_sample_size = 0;

  int min_silence_duration_ms;
  int speech_pad_ms;
  bool triggered = false;
  int temp_end = 0;

  void init_engine(int window_size_ms);
  void init_torch_model(const std::string& model_path);
  void reset_states();
  std::vector<SpeechSegment> DoVad();
  std::vector<SpeechSegment> mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples);
};

}  // namespace silero

#endif  // SILERO_TORCH_H
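For orientation, a minimal usage sketch of this interface, mirroring `main.cc` above (the model path is a placeholder). Note that `SetVariables()` must be called after adjusting the public fields, since it derives the sample counts used internally:

```cpp
#include <iostream>
#include "silero_torch.h"

int main() {
  silero::VadIterator vad("silero_vad.jit");  // defaults: threshold 0.5, 16000 Hz, 32 ms window
  vad.print_as_samples = true;
  vad.SetVariables();  // recompute derived sample counts before processing

  std::vector<float> wav(16000, 0.0f);  // placeholder: 1 s of silence at 16 kHz
  vad.SpeechProbs(wav);
  for (const auto& s : vad.GetSpeechTimestamps())
    std::cout << "{'start': " << s.start << ", 'end': " << s.end << "}\n";
}
```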
235
examples/cpp_libtorch/wav.h
Normal file
45
examples/cpp_libtorch_deprecated/README.md
Normal file
45
examples/cpp_libtorch_deprecated/README.md
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Silero-VAD V5 in C++ (based on LibTorch)
|
||||||
|
|
||||||
|
This is the source code for Silero-VAD V5 in C++, utilizing LibTorch. The primary implementation is CPU-based, and you should compare its results with the Python version. Only results at 16kHz have been tested.
|
||||||
|
|
||||||
|
Additionally, batch and CUDA inference options are available if you want to explore further. Note that when using batch inference, the speech probabilities may slightly differ from the standard version, likely due to differences in caching. Unlike individual input processing, batch inference may not use the cache from previous chunks. Despite this, batch inference offers significantly faster processing. For optimal performance, consider adjusting the threshold when using batch inference.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- GCC 11.4.0 (GCC >= 5.1)
|
||||||
|
- LibTorch 1.13.0 (other versions are also acceptable)
|
||||||
|
|
||||||
|
## Download LibTorch
|
||||||
|
|
||||||
|
```bash
|
||||||
|
-CPU Version
|
||||||
|
wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip
|
||||||
|
unzip libtorch-shared-with-deps-1.13.0+cpu.zip'
|
||||||
|
|
||||||
|
-CUDA Version
|
||||||
|
wget https://download.pytorch.org/libtorch/cu116/libtorch-shared-with-deps-1.13.0%2Bcu116.zip
|
||||||
|
unzip libtorch-shared-with-deps-1.13.0+cu116.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
## Compilation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
-CPU Version
|
||||||
|
g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0
|
||||||
|
|
||||||
|
-CUDA Version
|
||||||
|
g++ main.cc silero_torch.cc -I ./libtorch/include/ -I ./libtorch/include/torch/csrc/api/include -L ./libtorch/lib/ -ltorch -ltorch_cuda -ltorch_cpu -lc10 -Wl,-rpath,./libtorch/lib/ -o silero -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=0 -DUSE_GPU
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Optional Compilation Flags
|
||||||
|
-DUSE_BATCH: Enable batch inference
|
||||||
|
-DUSE_GPU: Use GPU for inference
|
||||||
|
|
||||||
|
## Run the Program
|
||||||
|
To run the program, use the following command:
|
||||||
|
|
||||||
|
`./silero aepyx.wav 16000 0.5`
|
||||||
|
|
||||||
|
The sample file aepyx.wav is part of the Voxconverse dataset.
|
||||||
|
File details: aepyx.wav is a 16kHz, 16-bit audio file.
|
||||||
BIN
examples/cpp_libtorch_deprecated/aepyx.wav
Normal file
BIN
examples/cpp_libtorch_deprecated/aepyx.wav
Normal file
Binary file not shown.
54
examples/cpp_libtorch_deprecated/main.cc
Normal file
54
examples/cpp_libtorch_deprecated/main.cc
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include "silero_torch.h"
|
||||||
|
#include "wav.h"
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
|
||||||
|
if(argc != 4){
|
||||||
|
std::cerr<<"Usage : "<<argv[0]<<" <wav.path> <SampleRate> <Threshold>"<<std::endl;
|
||||||
|
std::cerr<<"Usage : "<<argv[0]<<" sample.wav 16000 0.5"<<std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string wav_path = argv[1];
|
||||||
|
float sample_rate = std::stof(argv[2]);
|
||||||
|
float threshold = std::stof(argv[3]);
|
||||||
|
|
||||||
|
|
||||||
|
//Load Model
|
||||||
|
std::string model_path = "../../src/silero_vad/data/silero_vad.jit";
|
||||||
|
silero::VadIterator vad(model_path);
|
||||||
|
|
||||||
|
vad.threshold=threshold; //(Default:0.5)
|
||||||
|
vad.sample_rate=sample_rate; //16000Hz,8000Hz. (Default:16000)
|
||||||
|
vad.print_as_samples=true; //if true, it prints time-stamp with samples. otherwise, in seconds
|
||||||
|
//(Default:false)
|
||||||
|
|
||||||
|
vad.SetVariables();
|
||||||
|
|
||||||
|
// Read wav
|
||||||
|
wav::WavReader wav_reader(wav_path);
|
||||||
|
std::vector<float> input_wav(wav_reader.num_samples());
|
||||||
|
|
||||||
|
for (int i = 0; i < wav_reader.num_samples(); i++)
|
||||||
|
{
|
||||||
|
input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.SpeechProbs(input_wav);
|
||||||
|
|
||||||
|
std::vector<silero::SpeechSegment> speeches = vad.GetSpeechTimestamps();
|
||||||
|
for(const auto& speech : speeches){
|
||||||
|
if(vad.print_as_samples){
|
||||||
|
std::cout<<"{'start': "<<static_cast<int>(speech.start)<<", 'end': "<<static_cast<int>(speech.end)<<"}"<<std::endl;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
std::cout<<"{'start': "<<speech.start<<", 'end': "<<speech.end<<"}"<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
BIN
examples/cpp_libtorch_deprecated/silero
Executable file
BIN
examples/cpp_libtorch_deprecated/silero
Executable file
Binary file not shown.
285
examples/cpp_libtorch_deprecated/silero_torch.cc
Normal file
285
examples/cpp_libtorch_deprecated/silero_torch.cc
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
//Author : Nathan Lee
|
||||||
|
//Created On : 2024-11-18
|
||||||
|
//Description : silero 5.1 system for torch-script(c++).
|
||||||
|
//Version : 1.0
|
||||||
|
|
||||||
|
|
||||||
|
#include "silero_torch.h"
|
||||||
|
|
||||||
|
namespace silero {
|
||||||
|
|
||||||
|
VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate, int window_size_ms, int speech_pad_ms, int min_silence_duration_ms, int min_speech_duration_ms, int max_duration_merge_ms, bool print_as_samples)
|
||||||
|
:sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms), speech_pad_ms(speech_pad_ms), min_silence_duration_ms(min_silence_duration_ms), min_speech_duration_ms(min_speech_duration_ms), max_duration_merge_ms(max_duration_merge_ms), print_as_samples(print_as_samples)
|
||||||
|
{
|
||||||
|
init_torch_model(model_path);
|
||||||
|
//init_engine(window_size_ms);
|
||||||
|
}
|
||||||
|
VadIterator::~VadIterator(){
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void VadIterator::SpeechProbs(std::vector<float>& input_wav){
|
||||||
|
// Set the sample rate (must match the model's expected sample rate)
|
||||||
|
// Process the waveform in chunks of 512 samples
|
||||||
|
int num_samples = input_wav.size();
|
||||||
|
int num_chunks = num_samples / window_size_samples;
|
||||||
|
int remainder_samples = num_samples % window_size_samples;
|
||||||
|
|
||||||
|
total_sample_size += num_samples;
|
||||||
|
|
||||||
|
torch::Tensor output;
|
||||||
|
std::vector<torch::Tensor> chunks;
|
||||||
|
|
||||||
|
for (int i = 0; i < num_chunks; i++) {
|
||||||
|
|
||||||
|
float* chunk_start = input_wav.data() + i *window_size_samples;
|
||||||
|
torch::Tensor chunk = torch::from_blob(chunk_start, {1,window_size_samples}, torch::kFloat32);
|
||||||
|
//std::cout<<"chunk size : "<<chunk.sizes()<<std::endl;
|
||||||
|
chunks.push_back(chunk);
|
||||||
|
|
||||||
|
|
||||||
|
if(i==num_chunks-1 && remainder_samples>0){//마지막 chunk && 나머지가 존재
|
||||||
|
int remaining_samples = num_samples - num_chunks * window_size_samples;
|
||||||
|
//std::cout<<"Remainder size : "<<remaining_samples;
|
||||||
|
float* chunk_start_remainder = input_wav.data() + num_chunks *window_size_samples;
|
||||||
|
|
||||||
|
torch::Tensor remainder_chunk = torch::from_blob(chunk_start_remainder, {1,remaining_samples},
|
||||||
|
torch::kFloat32);
|
||||||
|
// Pad the remainder chunk to match window_size_samples
|
||||||
|
torch::Tensor padded_chunk = torch::cat({remainder_chunk, torch::zeros({1, window_size_samples
|
||||||
|
- remaining_samples}, torch::kFloat32)}, 1);
|
||||||
|
//std::cout<<", padded_chunk size : "<<padded_chunk.size(1)<<std::endl;
|
||||||
|
|
||||||
|
chunks.push_back(padded_chunk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!chunks.empty()) {
|
||||||
|
|
||||||
|
#ifdef USE_BATCH
|
||||||
|
torch::Tensor batched_chunks = torch::stack(chunks); // Stack all chunks into a single tensor
|
||||||
|
//batched_chunks = batched_chunks.squeeze(1);
|
||||||
|
batched_chunks = torch::cat({batched_chunks.squeeze(1)});
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
batched_chunks = batched_chunks.to(at::kCUDA); // Move the entire batch to GPU once
|
||||||
|
#endif
|
||||||
|
// Prepare input for model
|
||||||
|
std::vector<torch::jit::IValue> inputs;
|
||||||
|
inputs.push_back(batched_chunks); // Batch of chunks
|
||||||
|
inputs.push_back(sample_rate); // Assuming sample_rate is a valid input for the model
|
||||||
|
|
||||||
|
// Run inference on the batch
|
||||||
|
torch::NoGradGuard no_grad;
|
||||||
|
torch::Tensor output = model.forward(inputs).toTensor();
|
||||||
|
#ifdef USE_GPU
|
||||||
|
output = output.to(at::kCPU); // Move the output back to CPU once
|
||||||
|
#endif
|
||||||
|
// Collect output probabilities
|
||||||
|
for (int i = 0; i < chunks.size(); i++) {
|
||||||
|
float output_f = output[i].item<float>();
|
||||||
|
outputs_prob.push_back(output_f);
|
||||||
|
//std::cout << "Chunk " << i << " prob: " << output_f<< "\n";
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
|
std::vector<torch::Tensor> outputs;
|
||||||
|
torch::Tensor batched_chunks = torch::stack(chunks);
|
||||||
|
#ifdef USE_GPU
|
||||||
|
batched_chunks = batched_chunks.to(at::kCUDA);
|
||||||
|
#endif
|
||||||
|
for (int i = 0; i < chunks.size(); i++) {
|
||||||
|
torch::NoGradGuard no_grad;
|
||||||
|
std::vector<torch::jit::IValue> inputs;
|
||||||
|
inputs.push_back(batched_chunks[i]);
|
||||||
|
inputs.push_back(sample_rate);
|
||||||
|
|
||||||
|
torch::Tensor output = model.forward(inputs).toTensor();
|
||||||
|
outputs.push_back(output);
|
||||||
|
}
|
||||||
|
torch::Tensor all_outputs = torch::stack(outputs);
|
||||||
|
#ifdef USE_GPU
|
||||||
|
all_outputs = all_outputs.to(at::kCPU);
|
||||||
|
#endif
|
||||||
|
for (int i = 0; i < chunks.size(); i++) {
|
||||||
|
float output_f = all_outputs[i].item<float>();
|
||||||
|
outputs_prob.push_back(output_f);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::vector<SpeechSegment> VadIterator::GetSpeechTimestamps() {
|
||||||
|
std::vector<SpeechSegment> speeches = DoVad();
|
||||||
|
|
||||||
|
#ifdef USE_BATCH
|
||||||
|
//When you use BATCH inference. You would better use 'mergeSpeeches' function to arrage time stamp.
|
||||||
|
//It could be better get reasonable output because of distorted probs.
|
||||||
|
duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
|
||||||
|
std::vector<SpeechSegment> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
|
||||||
|
if(!print_as_samples){
|
||||||
|
for (auto& speech : speeches_merge) { //samples to second
|
||||||
|
speech.start /= sample_rate;
|
||||||
|
speech.end /= sample_rate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return speeches_merge;
|
||||||
|
#else
|
||||||
|
|
||||||
|
if(!print_as_samples){
|
||||||
|
for (auto& speech : speeches) { //samples to second
|
||||||
|
speech.start /= sample_rate;
|
||||||
|
speech.end /= sample_rate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return speeches;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
void VadIterator::SetVariables(){
|
||||||
|
init_engine(window_size_ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
void VadIterator::init_engine(int window_size_ms) {
|
||||||
|
min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
|
||||||
|
speech_pad_samples = sample_rate * speech_pad_ms / 1000;
|
||||||
|
window_size_samples = sample_rate / 1000 * window_size_ms;
|
||||||
|
min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VadIterator::init_torch_model(const std::string& model_path) {
|
||||||
|
at::set_num_threads(1);
|
||||||
|
model = torch::jit::load(model_path);
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
if (!torch::cuda::is_available()) {
|
||||||
|
std::cout<<"CUDA is not available! Please check your GPU settings"<<std::endl;
|
||||||
|
throw std::runtime_error("CUDA is not available!");
|
||||||
|
model.to(at::Device(at::kCPU));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
std::cout<<"CUDA available! Running on '0'th GPU"<<std::endl;
|
||||||
|
model.to(at::Device(at::kCUDA, 0)); //select 0'th machine
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
model.eval();
|
||||||
|
torch::NoGradGuard no_grad;
|
||||||
|
std::cout << "Model loaded successfully"<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VadIterator::reset_states() {
|
||||||
|
triggered = false;
|
||||||
|
current_sample = 0;
|
||||||
|
temp_end = 0;
|
||||||
|
outputs_prob.clear();
|
||||||
|
model.run_method("reset_states");
|
||||||
|
total_sample_size = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<SpeechSegment> VadIterator::DoVad() {
|
||||||
|
std::vector<SpeechSegment> speeches;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < outputs_prob.size(); ++i) {
|
||||||
|
float speech_prob = outputs_prob[i];
|
||||||
|
//std::cout << speech_prob << std::endl;
|
||||||
|
//std::cout << "Chunk " << i << " Prob: " << speech_prob << "\n";
|
||||||
|
//std::cout << speech_prob << " ";
|
||||||
|
current_sample += window_size_samples;
|
||||||
|
|
||||||
|
if (speech_prob >= threshold && temp_end != 0) {
|
||||||
|
temp_end = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (speech_prob >= threshold && !triggered) {
|
||||||
|
triggered = true;
|
||||||
|
SpeechSegment segment;
|
||||||
|
segment.start = std::max(static_cast<int>(0), current_sample - speech_pad_samples - window_size_samples);
|
||||||
|
speeches.push_back(segment);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (speech_prob < threshold - 0.15f && triggered) {
|
||||||
|
if (temp_end == 0) {
|
||||||
|
temp_end = current_sample;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_sample - temp_end < min_silence_samples) {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
SpeechSegment& segment = speeches.back();
|
||||||
|
segment.end = temp_end + speech_pad_samples - window_size_samples;
|
||||||
|
temp_end = 0;
|
||||||
|
triggered = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (triggered) { //만약 낮은 확률을 보이다가 마지막프레임 prbos만 딱 확률이 높게 나오면 위에서 triggerd = true 메핑과 동시에 segment start가 돼서 문제가 될것 같은데? start = end 같은값? 후처리가 있으니 문제가 없으려나?
|
||||||
|
std::cout<<"when last triggered is keep working until last Probs"<<std::endl;
|
||||||
|
SpeechSegment& segment = speeches.back();
|
||||||
|
segment.end = total_sample_size; // 현재 샘플을 마지막 구간의 종료 시간으로 설정
|
||||||
|
triggered = false; // VAD 상태 초기화
|
||||||
|
}
|
||||||
|
|
||||||
|
speeches.erase(
|
||||||
|
std::remove_if(
|
||||||
|
speeches.begin(),
|
||||||
|
speeches.end(),
|
||||||
|
[this](const SpeechSegment& speech) {
|
||||||
|
return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
|
||||||
|
//min_speech_samples is 4000samples(0.25sec)
|
||||||
|
//여기서 포인트!! 계산 할때는 start,end sample에'speech_pad_samples' 사이즈를 추가한후 길이를 측정함.
|
||||||
|
}
|
||||||
|
),
|
||||||
|
speeches.end()
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
//std::cout<<std::endl;
|
||||||
|
//std::cout<<"outputs_prob.size : "<<outputs_prob.size()<<std::endl;
|
||||||
|
|
||||||
|
reset_states();
|
||||||
|
return speeches;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<SpeechSegment> VadIterator::mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples) {
|
||||||
|
std::vector<SpeechSegment> mergedSpeeches;
|
||||||
|
|
||||||
|
if (speeches.empty()) {
|
||||||
|
return mergedSpeeches; // 빈 벡터 반환
|
||||||
|
}
|
||||||
|
|
||||||
|
// 첫 번째 구간으로 초기화
|
||||||
|
SpeechSegment currentSegment = speeches[0];
|
||||||
|
|
||||||
|
for (size_t i = 1; i < speeches.size(); ++i) { //첫번째 start,end 정보 건너뛰기. 그래서 i=1부터
|
||||||
|
// 두 구간의 차이가 threshold(duration_merge_samples)보다 작은 경우, 합침
|
||||||
|
if (speeches[i].start - currentSegment.end < duration_merge_samples) {
|
||||||
|
// 현재 구간의 끝점을 업데이트
|
||||||
|
currentSegment.end = speeches[i].end;
|
||||||
|
} else {
|
||||||
|
// 차이가 threshold(duration_merge_samples) 이상이면 현재 구간을 저장하고 새로운 구간 시작
|
||||||
|
mergedSpeeches.push_back(currentSegment);
|
||||||
|
currentSegment = speeches[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 마지막 구간 추가
|
||||||
|
mergedSpeeches.push_back(currentSegment);
|
||||||
|
|
||||||
|
return mergedSpeeches;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
75
examples/cpp_libtorch_deprecated/silero_torch.h
Normal file
75
examples/cpp_libtorch_deprecated/silero_torch.h
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
//Author : Nathan Lee
|
||||||
|
//Created On : 2024-11-18
|
||||||
|
//Description : silero 5.1 system for torch-script(c++).
|
||||||
|
//Version : 1.0
|
||||||
|
|
||||||
|
#ifndef SILERO_TORCH_H
|
||||||
|
#define SILERO_TORCH_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <memory>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
#include <fstream>
|
||||||
|
#include <chrono>
|
||||||
|
|
||||||
|
#include <torch/torch.h>
|
||||||
|
#include <torch/script.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace silero{
|
||||||
|
|
||||||
|
struct SpeechSegment{
|
||||||
|
int start;
|
||||||
|
int end;
|
||||||
|
};
|
||||||
|
|
||||||
|
class VadIterator{
|
||||||
|
public:
|
||||||
|
|
||||||
|
VadIterator(const std::string &model_path, float threshold = 0.5, int sample_rate = 16000,
|
||||||
|
int window_size_ms = 32, int speech_pad_ms = 30, int min_silence_duration_ms = 100,
|
||||||
|
int min_speech_duration_ms = 250, int max_duration_merge_ms = 300, bool print_as_samples = false);
|
||||||
|
~VadIterator();
|
||||||
|
|
||||||
|
|
||||||
|
void SpeechProbs(std::vector<float>& input_wav);
|
||||||
|
std::vector<silero::SpeechSegment> GetSpeechTimestamps();
|
||||||
|
void SetVariables();
|
||||||
|
|
||||||
|
float threshold;
|
||||||
|
int sample_rate;
|
||||||
|
int window_size_ms;
|
||||||
|
int min_speech_duration_ms;
|
||||||
|
int max_duration_merge_ms;
|
||||||
|
bool print_as_samples;
|
||||||
|
|
||||||
|
private:
|
||||||
|
torch::jit::script::Module model;
|
||||||
|
std::vector<float> outputs_prob;
|
||||||
|
int min_silence_samples;
|
||||||
|
int min_speech_samples;
|
||||||
|
int speech_pad_samples;
|
||||||
|
int window_size_samples;
|
||||||
|
int duration_merge_samples;
|
||||||
|
int current_sample = 0;
|
||||||
|
|
||||||
|
int total_sample_size=0;
|
||||||
|
|
||||||
|
int min_silence_duration_ms;
|
||||||
|
int speech_pad_ms;
|
||||||
|
bool triggered = false;
|
||||||
|
int temp_end = 0;
|
||||||
|
|
||||||
|
void init_engine(int window_size_ms);
|
||||||
|
void init_torch_model(const std::string& model_path);
|
||||||
|
void reset_states();
|
||||||
|
std::vector<SpeechSegment> DoVad();
|
||||||
|
std::vector<SpeechSegment> mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif // SILERO_TORCH_H
|
||||||
235
examples/cpp_libtorch_deprecated/wav.h
Normal file
235
examples/cpp_libtorch_deprecated/wav.h
Normal file
@@ -0,0 +1,235 @@
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>  // required by the std::cout diagnostics below
#include <string>

// #include "utils/log.h"

namespace wav {

struct WavHeader {
  char riff[4];  // "riff"
  unsigned int size;
  char wav[4];   // "WAVE"
  char fmt[4];   // "fmt "
  unsigned int fmt_size;
  uint16_t format;
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;
  char data[4];  // "data"
  unsigned int data_size;
};

class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  explicit WavReader(const std::string& filename) { Open(filename); }

  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for reading
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    fread(&header, 1, sizeof(header), fp);
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      return false;
    } else if (header.fmt_size > 16) {
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      fread(header.data, 8, sizeof(char), fp);
    }
    // check "riff" "WAVE" "fmt " "data"

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      // We will just ignore the data in these chunks.
      fseek(fp, header.data_size, SEEK_CUR);
      // read next sub chunk
      fread(header.data, 8, sizeof(char), fp);
    }

    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    data_ = new float[num_data];  // Create 1-dim array
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_    :" << num_channel_ << std::endl;
    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples     :" << num_data << std::endl;
    std::cout << "num_data_size   :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        char sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(char), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // S32
          int sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(int), fp);
            data_[i] = static_cast<float>(sample) / 32768;
          }
        } else if (header.format == 3) {  // IEEE float
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;
};

class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "w");
    // init char 'riff' 'WAVE' 'fmt ' 'data'
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;
  int num_samples_;  // total float points in data_
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_
35
examples/csharp/Program.cs
Normal file
@@ -0,0 +1,35 @@
using System.Text;

namespace VadDotNet;


class Program
{
    private const string MODEL_PATH = "./resources/silero_vad.onnx";
    private const string EXAMPLE_WAV_FILE = "./resources/example.wav";
    private const int SAMPLE_RATE = 16000;
    private const float THRESHOLD = 0.5f;
    private const int MIN_SPEECH_DURATION_MS = 250;
    private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
    private const int MIN_SILENCE_DURATION_MS = 100;
    private const int SPEECH_PAD_MS = 30;

    public static void Main(string[] args)
    {
        var vadDetector = new SileroVadDetector(MODEL_PATH, THRESHOLD, SAMPLE_RATE,
            MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
        List<SileroSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(new FileInfo(EXAMPLE_WAV_FILE));
        //Console.WriteLine(speechTimeList.ToJson());
        StringBuilder sb = new();
        foreach (var speechSegment in speechTimeList)
        {
            sb.Append($"start second: {speechSegment.StartSecond}, end second: {speechSegment.EndSecond}\n");
        }
        Console.WriteLine(sb.ToString());
    }
}
21
examples/csharp/SileroSpeechSegment.cs
Normal file
@@ -0,0 +1,21 @@
namespace VadDotNet;

public class SileroSpeechSegment
{
    public int? StartOffset { get; set; }
    public int? EndOffset { get; set; }
    public float? StartSecond { get; set; }
    public float? EndSecond { get; set; }

    public SileroSpeechSegment()
    {
    }

    public SileroSpeechSegment(int startOffset, int? endOffset, float? startSecond, float? endSecond)
    {
        StartOffset = startOffset;
        EndOffset = endOffset;
        StartSecond = startSecond;
        EndSecond = endSecond;
    }
}
249
examples/csharp/SileroVadDetector.cs
Normal file
@@ -0,0 +1,249 @@
using NAudio.Wave;
using VADdotnet;

namespace VadDotNet;

public class SileroVadDetector
{
    private readonly SileroVadOnnxModel _model;
    private readonly float _threshold;
    private readonly float _negThreshold;
    private readonly int _samplingRate;
    private readonly int _windowSizeSample;
    private readonly float _minSpeechSamples;
    private readonly float _speechPadSamples;
    private readonly float _maxSpeechSamples;
    private readonly float _minSilenceSamples;
    private readonly float _minSilenceSamplesAtMaxSpeech;
    private int _audioLengthSamples;
    private const float THRESHOLD_GAP = 0.15f;
    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_8K = 8000;
    // ReSharper disable once InconsistentNaming
    private const int SAMPLING_RATE_16K = 16000;

    public SileroVadDetector(string onnxModelPath, float threshold, int samplingRate,
        int minSpeechDurationMs, float maxSpeechDurationSeconds,
        int minSilenceDurationMs, int speechPadMs)
    {
        if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
        {
            throw new ArgumentException("Sampling rate not supported, only available for [8000, 16000]");
        }

        this._model = new SileroVadOnnxModel(onnxModelPath);
        this._samplingRate = samplingRate;
        this._threshold = threshold;
        this._negThreshold = threshold - THRESHOLD_GAP;
        this._windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;
        this._minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
        this._speechPadSamples = samplingRate * speechPadMs / 1000f;
        this._maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
        this._minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        this._minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
        this.Reset();
    }

    public void Reset()
    {
        _model.ResetStates();
    }

    public List<SileroSpeechSegment> GetSpeechSegmentList(FileInfo wavFile)
    {
        Reset();

        using var audioFile = new AudioFileReader(wavFile.FullName);
        List<float> speechProbList = [];
        this._audioLengthSamples = (int)(audioFile.Length / 2);
        float[] buffer = new float[this._windowSizeSample];

        while (audioFile.Read(buffer, 0, buffer.Length) > 0)
        {
            float speechProb = _model.Call([buffer], _samplingRate)[0];
            speechProbList.Add(speechProb);
        }

        return CalculateProb(speechProbList);
    }

    private List<SileroSpeechSegment> CalculateProb(List<float> speechProbList)
    {
        List<SileroSpeechSegment> result = [];
        bool triggered = false;
        int tempEnd = 0, prevEnd = 0, nextStart = 0;
        SileroSpeechSegment segment = new();

        for (int i = 0; i < speechProbList.Count; i++)
        {
            float speechProb = speechProbList[i];
            if (speechProb >= _threshold && (tempEnd != 0))
            {
                tempEnd = 0;
                if (nextStart < prevEnd)
                {
                    nextStart = _windowSizeSample * i;
                }
            }

            if (speechProb >= _threshold && !triggered)
            {
                triggered = true;
                segment.StartOffset = _windowSizeSample * i;
                continue;
            }

            if (triggered && (_windowSizeSample * i) - segment.StartOffset > _maxSpeechSamples)
            {
                if (prevEnd != 0)
                {
                    segment.EndOffset = prevEnd;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();
                    if (nextStart < prevEnd)
                    {
                        triggered = false;
                    }
                    else
                    {
                        segment.StartOffset = nextStart;
                    }

                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                }
                else
                {
                    segment.EndOffset = _windowSizeSample * i;
                    result.Add(segment);
                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }

            if (speechProb < _negThreshold && triggered)
            {
                if (tempEnd == 0)
                {
                    tempEnd = _windowSizeSample * i;
                }

                if (((_windowSizeSample * i) - tempEnd) > _minSilenceSamplesAtMaxSpeech)
                {
                    prevEnd = tempEnd;
                }

                if ((_windowSizeSample * i) - tempEnd < _minSilenceSamples)
                {
                    continue;
                }
                else
                {
                    segment.EndOffset = tempEnd;
                    if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
                    {
                        result.Add(segment);
                    }

                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }
        }

        if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
        {
            //segment.EndOffset = _audioLengthSamples;
            segment.EndOffset = speechProbList.Count * _windowSizeSample;
            result.Add(segment);
        }

        for (int i = 0; i < result.Count; i++)
        {
            SileroSpeechSegment item = result[i];
            if (i == 0)
            {
                item.StartOffset = (int)Math.Max(0, item.StartOffset.Value - _speechPadSamples);
            }

            if (i != result.Count - 1)
            {
                SileroSpeechSegment nextItem = result[i + 1];
                int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
                if (silenceDuration < 2 * _speechPadSamples)
                {
                    item.EndOffset += (silenceDuration / 2);
                    nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (silenceDuration / 2));
                }
                else
                {
                    item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
                    nextItem.StartOffset = (int)Math.Max(0, nextItem.StartOffset.Value - _speechPadSamples);
                }
            }
            else
            {
                item.EndOffset = (int)Math.Min(_audioLengthSamples, item.EndOffset.Value + _speechPadSamples);
            }
        }

        return MergeListAndCalculateSecond(result, _samplingRate);
    }

    private static List<SileroSpeechSegment> MergeListAndCalculateSecond(List<SileroSpeechSegment> original, int samplingRate)
    {
        List<SileroSpeechSegment> result = [];
        if (original == null || original.Count == 0)
        {
            return result;
        }

        int left = original[0].StartOffset.Value;
        int right = original[0].EndOffset.Value;
        if (original.Count > 1)
        {
            original.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
            for (int i = 1; i < original.Count; i++)
            {
                SileroSpeechSegment segment = original[i];

                if (segment.StartOffset > right)
                {
                    result.Add(new SileroSpeechSegment(left, right,
                        CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
                    left = segment.StartOffset.Value;
                    right = segment.EndOffset.Value;
                }
                else
                {
                    right = Math.Max(right, segment.EndOffset.Value);
                }
            }

            result.Add(new SileroSpeechSegment(left, right,
                CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
        }
        else
        {
            result.Add(new SileroSpeechSegment(left, right,
                CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
        }

        return result;
    }

    private static float CalculateSecondByOffset(int offset, int samplingRate)
    {
        float secondValue = offset * 1.0f / samplingRate;
        return (float)Math.Floor(secondValue * 1000.0f) / 1000.0f;
    }
}
215
examples/csharp/SileroVadOnnxModel.cs
Normal file
@@ -0,0 +1,215 @@
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;

using System;
using System.Collections.Generic;
using System.Linq;

namespace VADdotnet;


public class SileroVadOnnxModel : IDisposable
{
    private readonly InferenceSession session;
    private float[][][] state;
    private float[][] context;
    private int lastSr = 0;
    private int lastBatchSize = 0;
    private static readonly List<int> SAMPLE_RATES = [8000, 16000];

    public SileroVadOnnxModel(string modelPath)
    {
        var sessionOptions = new SessionOptions
        {
            InterOpNumThreads = 1,
            IntraOpNumThreads = 1,
            EnableCpuMemArena = true
        };

        session = new InferenceSession(modelPath, sessionOptions);
        ResetStates();
    }

    public void ResetStates()
    {
        state = new float[2][][];
        state[0] = new float[1][];
        state[1] = new float[1][];
        state[0][0] = new float[128];
        state[1][0] = new float[128];
        context = [];
        lastSr = 0;
        lastBatchSize = 0;
    }

    public void Dispose()
    {
        GC.SuppressFinalize(this);
    }

    public class ValidationResult(float[][] x, int sr)
    {
        public float[][] X { get; } = x;
        public int Sr { get; } = sr;
    }

    private static ValidationResult ValidateInput(float[][] x, int sr)
    {
        if (x.Length == 1)
        {
            x = [x[0]];
        }
        if (x.Length > 2)
        {
            throw new ArgumentException($"Incorrect audio data dimension: {x.Length}");
        }

        if (sr != 16000 && (sr % 16000 == 0))
        {
            int step = sr / 16000;
            float[][] reducedX = new float[x.Length][];

            for (int i = 0; i < x.Length; i++)
            {
                float[] current = x[i];
                float[] newArr = new float[(current.Length + step - 1) / step];

                for (int j = 0, index = 0; j < current.Length; j += step, index++)
                {
                    newArr[index] = current[j];
                }

                reducedX[i] = newArr;
            }

            x = reducedX;
            sr = 16000;
        }

        if (!SAMPLE_RATES.Contains(sr))
        {
            throw new ArgumentException($"Only supports sample rates {string.Join(", ", SAMPLE_RATES)} (or multiples of 16000)");
        }

        if (((float)sr) / x[0].Length > 31.25)
        {
            throw new ArgumentException("Input audio is too short");
        }

        return new ValidationResult(x, sr);
    }

    private static float[][] Concatenate(float[][] a, float[][] b)
    {
        if (a.Length != b.Length)
        {
            throw new ArgumentException("The number of rows in both arrays must be the same.");
        }

        int rows = a.Length;
        int colsA = a[0].Length;
        int colsB = b[0].Length;
        float[][] result = new float[rows][];

        for (int i = 0; i < rows; i++)
        {
            result[i] = new float[colsA + colsB];
            Array.Copy(a[i], 0, result[i], 0, colsA);
            Array.Copy(b[i], 0, result[i], colsA, colsB);
        }

        return result;
    }

    private static float[][] GetLastColumns(float[][] array, int contextSize)
    {
        int rows = array.Length;
        int cols = array[0].Length;

        if (contextSize > cols)
        {
            throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
        }

        float[][] result = new float[rows][];

        for (int i = 0; i < rows; i++)
        {
            result[i] = new float[contextSize];
            Array.Copy(array[i], cols - contextSize, result[i], 0, contextSize);
        }

        return result;
    }

    public float[] Call(float[][] x, int sr)
    {
        var result = ValidateInput(x, sr);
        x = result.X;
        sr = result.Sr;
        int numberSamples = sr == 16000 ? 512 : 256;

        if (x[0].Length != numberSamples)
        {
            throw new ArgumentException($"Provided number of samples is {x[0].Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
        }

        int batchSize = x.Length;
        int contextSize = sr == 16000 ? 64 : 32;

        if (lastBatchSize == 0)
        {
            ResetStates();
        }
        if (lastSr != 0 && lastSr != sr)
        {
            ResetStates();
        }
        if (lastBatchSize != 0 && lastBatchSize != batchSize)
        {
            ResetStates();
        }

        if (context.Length == 0)
        {
            context = new float[batchSize][];
            for (int i = 0; i < batchSize; i++)
            {
                context[i] = new float[contextSize];
            }
        }

        x = Concatenate(context, x);

        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("input", new DenseTensor<float>(x.SelectMany(a => a).ToArray(), [x.Length, x[0].Length])),
            NamedOnnxValue.CreateFromTensor("sr", new DenseTensor<long>(new[] { (long)sr }, [1])),
            NamedOnnxValue.CreateFromTensor("state", new DenseTensor<float>(state.SelectMany(a => a.SelectMany(b => b)).ToArray(), [state.Length, state[0].Length, state[0][0].Length]))
        };

        using var outputs = session.Run(inputs);
        var output = outputs.First(o => o.Name == "output").AsTensor<float>();
        var newState = outputs.First(o => o.Name == "stateN").AsTensor<float>();

        context = GetLastColumns(x, contextSize);
        lastSr = sr;
        lastBatchSize = batchSize;

        state = new float[newState.Dimensions[0]][][];
        for (int i = 0; i < newState.Dimensions[0]; i++)
        {
            state[i] = new float[newState.Dimensions[1]][];
            for (int j = 0; j < newState.Dimensions[1]; j++)
            {
                state[i][j] = new float[newState.Dimensions[2]];
                for (int k = 0; k < newState.Dimensions[2]; k++)
                {
                    state[i][j][k] = newState[i, j, k];
                }
            }
        }

        return [.. output];
    }
}
25
examples/csharp/VadDotNet.csproj
Normal file
@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.18.1" />
    <PackageReference Include="NAudio" Version="2.2.1" />
  </ItemGroup>

  <ItemGroup>
    <Folder Include="resources\" />
  </ItemGroup>

  <ItemGroup>
    <Content Include="resources\**">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
  </ItemGroup>

</Project>
25
examples/csharp/VadDotNet.sln
Normal file
@@ -0,0 +1,25 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36616.10
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadDotNet", "VadDotNet.csproj", "{F36E1741-EDDB-90C7-7501-4911058F8996}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{F36E1741-EDDB-90C7-7501-4911058F8996}.Debug|Any CPU.Build.0 = Debug|Any CPU
		{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.ActiveCfg = Release|Any CPU
		{F36E1741-EDDB-90C7-7501-4911058F8996}.Release|Any CPU.Build.0 = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {DFC4CEE8-1034-46B4-A5F4-D1649B3543E6}
	EndGlobalSection
EndGlobal
1
examples/csharp/resources/put_model_here.txt
Normal file
@@ -0,0 +1 @@
Place the ONNX model file and an example.wav file in this folder.
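A possible way to populate the folder and run the C# example, assuming you run from `examples/csharp` and already have a suitable `example.wav` (the source path for the WAV file is a placeholder):

```sh
# The ONNX model ships in this repository's source tree.
cp ../../src/silero_vad/data/silero_vad.onnx ./resources/
cp /path/to/your/example.wav ./resources/example.wav
dotnet run
```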
19
examples/go/README.md
Normal file
@@ -0,0 +1,19 @@
## Golang Example

This is a sample program showing how to run speech detection with `silero-vad` from Golang (CGO + ONNX Runtime).

### Requirements

- Golang >= v1.21
- ONNX Runtime

### Usage

```sh
go run ./cmd/main.go test.wav
```

> **_Note_**
>
> Make sure you have the ONNX Runtime library and C headers installed in your path; one way to point CGO at them is sketched below.
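If the build or the program cannot locate ONNX Runtime, you can tell CGO and the dynamic loader where it lives explicitly. A minimal sketch, assuming an install under `/usr/local` (the prefix is an assumption; adjust it to your system):

```sh
# Hypothetical install prefix; change to wherever ONNX Runtime is installed.
export CGO_CFLAGS="-I/usr/local/include"
export CGO_LDFLAGS="-L/usr/local/lib -lonnxruntime"
export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"

go run ./cmd/main.go test.wav
```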
63
examples/go/cmd/main.go
Normal file
@@ -0,0 +1,63 @@
package main

import (
	"log"
	"os"

	"github.com/streamer45/silero-vad-go/speech"

	"github.com/go-audio/wav"
)

func main() {
	sd, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:            "../../src/silero_vad/data/silero_vad.onnx",
		SampleRate:           16000,
		Threshold:            0.5,
		MinSilenceDurationMs: 100,
		SpeechPadMs:          30,
	})
	if err != nil {
		log.Fatalf("failed to create speech detector: %s", err)
	}

	if len(os.Args) != 2 {
		log.Fatalf("invalid arguments provided: expecting one file path")
	}

	f, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatalf("failed to open sample audio file: %s", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)

	if ok := dec.IsValidFile(); !ok {
		log.Fatalf("invalid WAV file")
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		log.Fatalf("failed to get PCM buffer")
	}

	pcmBuf := buf.AsFloat32Buffer()

	segments, err := sd.Detect(pcmBuf.Data)
	if err != nil {
		log.Fatalf("Detect failed: %s", err)
	}

	for _, s := range segments {
		log.Printf("speech starts at %0.2fs", s.SpeechStartAt)
		if s.SpeechEndAt > 0 {
			log.Printf("speech ends at %0.2fs", s.SpeechEndAt)
		}
	}

	err = sd.Destroy()
	if err != nil {
		log.Fatalf("failed to destroy detector: %s", err)
	}
}
13
examples/go/go.mod
Normal file
@@ -0,0 +1,13 @@
module silero

go 1.21.4

require (
	github.com/go-audio/wav v1.1.0
	github.com/streamer45/silero-vad-go v0.2.1
)

require (
	github.com/go-audio/audio v1.0.0 // indirect
	github.com/go-audio/riff v1.0.0 // indirect
)
18
examples/go/go.sum
Normal file
@@ -0,0 +1,18 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/streamer45/silero-vad-go v0.2.0 h1:bbRTa6cQuc7VI88y0qicx375UyWoxE6wlVOF+mUg0+g=
github.com/streamer45/silero-vad-go v0.2.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/streamer45/silero-vad-go v0.2.1 h1:Li1/tTC4H/3cyw6q4weX+U8GWwEL3lTekK/nYa1Cvuk=
github.com/streamer45/silero-vad-go v0.2.1/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
13
examples/haskell/README.md
Normal file
@@ -0,0 +1,13 @@
# Haskell example

To run the example, make sure you put an ``example.wav`` in this directory, and then run the following:
```bash
stack run
```

The ``example.wav`` file must meet the following requirements:
- Must be 16 kHz sample rate.
- Must be mono channel.
- Must be 16-bit audio.

If your file does not meet these requirements, see the conversion sketch below.

This uses the [silero-vad](https://hackage.haskell.org/package/silero-vad) package, a Haskell implementation based on the C# example.
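The constraints above can be satisfied with a standard conversion tool. A minimal sketch using ffmpeg (the input file name is a placeholder):

```bash
# Resample to 16 kHz, downmix to mono, and encode as 16-bit PCM.
# "input.wav" stands for whatever source audio you have.
ffmpeg -i input.wav -ar 16000 -ac 1 -c:a pcm_s16le example.wav
```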
22
examples/haskell/app/Main.hs
Normal file
@@ -0,0 +1,22 @@
module Main (main) where

import qualified Data.Vector.Storable as Vector
import Data.WAVE
import Data.Function
import Silero

main :: IO ()
main =
  withModel $ \model -> do
    wav <- getWAVEFile "example.wav"
    let samples =
          concat (waveSamples wav)
            & Vector.fromList
            & Vector.map (realToFrac . sampleToDouble)
    let vad =
          (defaultVad model)
            { startThreshold = 0.5
            , endThreshold = 0.35
            }
    segments <- detectSegments vad samples
    print segments
23
examples/haskell/example.cabal
Normal file
@@ -0,0 +1,23 @@
cabal-version: 1.12

-- This file has been generated from package.yaml by hpack version 0.37.0.
--
-- see: https://github.com/sol/hpack

name:           example
version:        0.1.0.0
build-type:     Simple

executable example-exe
  main-is: Main.hs
  other-modules:
      Paths_example
  hs-source-dirs:
      app
  ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
  build-depends:
      WAVE
    , base >=4.7 && <5
    , silero-vad
    , vector
  default-language: Haskell2010
28
examples/haskell/package.yaml
Normal file
@@ -0,0 +1,28 @@
name: example
version: 0.1.0.0

dependencies:
- base >= 4.7 && < 5
- silero-vad
- WAVE
- vector

ghc-options:
- -Wall
- -Wcompat
- -Widentities
- -Wincomplete-record-updates
- -Wincomplete-uni-patterns
- -Wmissing-export-lists
- -Wmissing-home-modules
- -Wpartial-fields
- -Wredundant-constraints

executables:
  example-exe:
    main: Main.hs
    source-dirs: app
    ghc-options:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
11
examples/haskell/stack.yaml
Normal file
@@ -0,0 +1,11 @@
snapshot:
  url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/20/26.yaml

packages:
- .

extra-deps:
- silero-vad-0.1.0.4@sha256:2bff95be978a2782915b250edc795760d4cf76838e37bb7d4a965dc32566eb0f,5476
- WAVE-0.1.6@sha256:f744ff68f5e3a0d1f84fab373ea35970659085d213aef20860357512d0458c5c,1016
- derive-storable-0.3.1.0@sha256:bd1c51c155a00e2be18325d553d6764dd678904a85647d6ba952af998e70aa59,2313
- vector-0.13.2.0@sha256:98f5cb3080a3487527476e3c272dcadaba1376539f2aa0646f2f19b3af6b2f67,8481
41
examples/haskell/stack.yaml.lock
Normal file
@@ -0,0 +1,41 @@
# This file was autogenerated by Stack.
# You should not edit this file by hand.
# For more information, please see the documentation at:
#   https://docs.haskellstack.org/en/stable/lock_files

packages:
- completed:
    hackage: silero-vad-0.1.0.4@sha256:2bff95be978a2782915b250edc795760d4cf76838e37bb7d4a965dc32566eb0f,5476
    pantry-tree:
      sha256: a62e813f978d32c87769796fded981d25fcf2875bb2afdf60ed6279f931ccd7f
      size: 1391
  original:
    hackage: silero-vad-0.1.0.4@sha256:2bff95be978a2782915b250edc795760d4cf76838e37bb7d4a965dc32566eb0f,5476
- completed:
    hackage: WAVE-0.1.6@sha256:f744ff68f5e3a0d1f84fab373ea35970659085d213aef20860357512d0458c5c,1016
    pantry-tree:
      sha256: ee5ccd70fa7fe6ffc360ebd762b2e3f44ae10406aa27f3842d55b8cbd1a19498
      size: 405
  original:
    hackage: WAVE-0.1.6@sha256:f744ff68f5e3a0d1f84fab373ea35970659085d213aef20860357512d0458c5c,1016
- completed:
    hackage: derive-storable-0.3.1.0@sha256:bd1c51c155a00e2be18325d553d6764dd678904a85647d6ba952af998e70aa59,2313
    pantry-tree:
      sha256: 48e35a72d1bb593173890616c8d7efd636a650a306a50bb3e1513e679939d27e
      size: 902
  original:
    hackage: derive-storable-0.3.1.0@sha256:bd1c51c155a00e2be18325d553d6764dd678904a85647d6ba952af998e70aa59,2313
- completed:
    hackage: vector-0.13.2.0@sha256:98f5cb3080a3487527476e3c272dcadaba1376539f2aa0646f2f19b3af6b2f67,8481
    pantry-tree:
      sha256: 2176fd677a02a4c47337f7dca5aeca2745dbb821a6ea5c7099b3a991ecd7f4f0
      size: 4478
  original:
    hackage: vector-0.13.2.0@sha256:98f5cb3080a3487527476e3c272dcadaba1376539f2aa0646f2f19b3af6b2f67,8481
snapshots:
- completed:
    sha256: 5a59b2a405b3aba3c00188453be172b85893cab8ebc352b1ef58b0eae5d248a2
    size: 650475
    url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/20/26.yaml
  original:
    url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/20/26.yaml
31
examples/java-example/pom.xml
Normal file
@@ -0,0 +1,31 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>java-example</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>sliero-vad-example</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.microsoft.onnxruntime/onnxruntime -->
    <dependency>
      <groupId>com.microsoft.onnxruntime</groupId>
      <artifactId>onnxruntime</artifactId>
      <version>1.23.1</version>
    </dependency>
  </dependencies>
</project>
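With the POM above, one way to compile and run the example is via the exec-maven-plugin resolved by its plugin prefix (this invocation is an assumption; the project does not declare the plugin explicitly):

```sh
mvn -q compile exec:java -Dexec.mainClass=org.example.App
```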
264
examples/java-example/src/main/java/org/example/App.java
Normal file
@@ -0,0 +1,264 @@
package org.example;

import ai.onnxruntime.OrtException;
import javax.sound.sampled.*;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Silero VAD Java Example
 * Voice Activity Detection using ONNX model
 *
 * @author VvvvvGH
 */
public class App {

    // ONNX model path - using the model file from the project
    private static final String MODEL_PATH = "../../src/silero_vad/data/silero_vad.onnx";
    // Test audio file path
    private static final String AUDIO_FILE_PATH = "../../en_example.wav";
    // Sampling rate
    private static final int SAMPLE_RATE = 16000;
    // Speech threshold (consistent with Python default)
    private static final float THRESHOLD = 0.5f;
    // Negative threshold (used to determine speech end)
    private static final float NEG_THRESHOLD = 0.35f; // threshold - 0.15
    // Minimum speech duration (milliseconds)
    private static final int MIN_SPEECH_DURATION_MS = 250;
    // Minimum silence duration (milliseconds)
    private static final int MIN_SILENCE_DURATION_MS = 100;
    // Speech padding (milliseconds)
    private static final int SPEECH_PAD_MS = 30;
    // Window size (samples) - 512 samples for 16kHz
    private static final int WINDOW_SIZE_SAMPLES = 512;

    public static void main(String[] args) {
        System.out.println("=".repeat(60));
        System.out.println("Silero VAD Java ONNX Example");
        System.out.println("=".repeat(60));

        // Load ONNX model
        SlieroVadOnnxModel model;
        try {
            System.out.println("Loading ONNX model: " + MODEL_PATH);
            model = new SlieroVadOnnxModel(MODEL_PATH);
            System.out.println("Model loaded successfully!");
        } catch (OrtException e) {
            System.err.println("Failed to load model: " + e.getMessage());
            e.printStackTrace();
            return;
        }

        // Read WAV file
        float[] audioData;
        try {
            System.out.println("\nReading audio file: " + AUDIO_FILE_PATH);
            audioData = readWavFileAsFloatArray(AUDIO_FILE_PATH);
            System.out.println("Audio file read successfully, samples: " + audioData.length);
            System.out.println("Audio duration: " + String.format("%.2f", (audioData.length / (float) SAMPLE_RATE)) + " seconds");
        } catch (Exception e) {
            System.err.println("Failed to read audio file: " + e.getMessage());
            e.printStackTrace();
            return;
        }

        // Get speech timestamps (batch mode, consistent with Python's get_speech_timestamps)
        System.out.println("\nDetecting speech segments...");
        List<Map<String, Integer>> speechTimestamps;
        try {
            speechTimestamps = getSpeechTimestamps(
                    audioData,
                    model,
                    THRESHOLD,
                    SAMPLE_RATE,
                    MIN_SPEECH_DURATION_MS,
                    MIN_SILENCE_DURATION_MS,
                    SPEECH_PAD_MS,
                    NEG_THRESHOLD
            );
        } catch (OrtException e) {
            System.err.println("Failed to detect speech timestamps: " + e.getMessage());
            e.printStackTrace();
            return;
        }

        // Output detection results
        System.out.println("\nDetected speech timestamps (in samples):");
        for (Map<String, Integer> timestamp : speechTimestamps) {
            System.out.println(timestamp);
        }

        // Output summary
        System.out.println("\n" + "=".repeat(60));
        System.out.println("Detection completed!");
        System.out.println("Total detected " + speechTimestamps.size() + " speech segments");
        System.out.println("=".repeat(60));

        // Close model
        try {
            model.close();
        } catch (OrtException e) {
            System.err.println("Error closing model: " + e.getMessage());
        }
    }

    /**
     * Get speech timestamps
     * Implements the same logic as Python's get_speech_timestamps
     *
     * @param audio                Audio data (float array)
     * @param model                ONNX model
     * @param threshold            Speech threshold
     * @param samplingRate         Sampling rate
     * @param minSpeechDurationMs  Minimum speech duration (milliseconds)
     * @param minSilenceDurationMs Minimum silence duration (milliseconds)
     * @param speechPadMs          Speech padding (milliseconds)
     * @param negThreshold         Negative threshold (used to determine speech end)
     * @return List of speech timestamps
     */
    private static List<Map<String, Integer>> getSpeechTimestamps(
            float[] audio,
            SlieroVadOnnxModel model,
            float threshold,
            int samplingRate,
            int minSpeechDurationMs,
            int minSilenceDurationMs,
            int speechPadMs,
            float negThreshold) throws OrtException {

        // Reset model states
        model.resetStates();

        // Calculate parameters
        int minSpeechSamples = samplingRate * minSpeechDurationMs / 1000;
        int speechPadSamples = samplingRate * speechPadMs / 1000;
        int minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
        int windowSizeSamples = samplingRate == 16000 ? 512 : 256;
        int audioLengthSamples = audio.length;

        // Calculate speech probabilities for all audio chunks
        List<Float> speechProbs = new ArrayList<>();
        for (int currentStart = 0; currentStart < audioLengthSamples; currentStart += windowSizeSamples) {
            float[] chunk = new float[windowSizeSamples];
            int chunkLength = Math.min(windowSizeSamples, audioLengthSamples - currentStart);
            System.arraycopy(audio, currentStart, chunk, 0, chunkLength);

            // Pad with zeros if chunk is shorter than window size
            if (chunkLength < windowSizeSamples) {
                for (int i = chunkLength; i < windowSizeSamples; i++) {
                    chunk[i] = 0.0f;
                }
            }

            float speechProb = model.call(new float[][]{chunk}, samplingRate)[0];
            speechProbs.add(speechProb);
        }

        // Detect speech segments using the same algorithm as Python
        boolean triggered = false;
        List<Map<String, Integer>> speeches = new ArrayList<>();
        Map<String, Integer> currentSpeech = null;
        int tempEnd = 0;

        for (int i = 0; i < speechProbs.size(); i++) {
            float speechProb = speechProbs.get(i);

            // Reset temporary end if speech probability exceeds threshold
            if (speechProb >= threshold && tempEnd != 0) {
                tempEnd = 0;
            }

            // Detect speech start
            if (speechProb >= threshold && !triggered) {
                triggered = true;
                currentSpeech = new HashMap<>();
                currentSpeech.put("start", windowSizeSamples * i);
                continue;
            }

            // Detect speech end
            if (speechProb < negThreshold && triggered) {
                if (tempEnd == 0) {
                    tempEnd = windowSizeSamples * i;
                }
                if (windowSizeSamples * i - tempEnd < minSilenceSamples) {
                    continue;
                } else {
                    currentSpeech.put("end", tempEnd);
                    if (currentSpeech.get("end") - currentSpeech.get("start") > minSpeechSamples) {
                        speeches.add(currentSpeech);
                    }
                    currentSpeech = null;
                    tempEnd = 0;
                    triggered = false;
                }
            }
        }

        // Handle the last speech segment
        if (currentSpeech != null &&
                (audioLengthSamples - currentSpeech.get("start")) > minSpeechSamples) {
            currentSpeech.put("end", audioLengthSamples);
            speeches.add(currentSpeech);
        }

        // Add speech padding - same logic as Python
        for (int i = 0; i < speeches.size(); i++) {
            Map<String, Integer> speech = speeches.get(i);
            if (i == 0) {
                speech.put("start", Math.max(0, speech.get("start") - speechPadSamples));
            }
            if (i != speeches.size() - 1) {
                int silenceDuration = speeches.get(i + 1).get("start") - speech.get("end");
                if (silenceDuration < 2 * speechPadSamples) {
                    speech.put("end", speech.get("end") + silenceDuration / 2);
                    speeches.get(i + 1).put("start",
                            Math.max(0, speeches.get(i + 1).get("start") - silenceDuration / 2));
                } else {
                    speech.put("end", Math.min(audioLengthSamples, speech.get("end") + speechPadSamples));
                    speeches.get(i + 1).put("start",
                            Math.max(0, speeches.get(i + 1).get("start") - speechPadSamples));
                }
            } else {
                speech.put("end", Math.min(audioLengthSamples, speech.get("end") + speechPadSamples));
            }
        }

        return speeches;
    }

    /**
     * Read WAV file and return as float array
     *
     * @param filePath WAV file path
     * @return Audio data as float array (normalized to -1.0 to 1.0)
     */
    private static float[] readWavFileAsFloatArray(String filePath)
            throws UnsupportedAudioFileException, IOException {
        File audioFile = new File(filePath);
        AudioInputStream audioStream = AudioSystem.getAudioInputStream(audioFile);

        // Get audio format information
        AudioFormat format = audioStream.getFormat();
        System.out.println("Audio format: " + format);

        // Read all audio data
        byte[] audioBytes = audioStream.readAllBytes();
        audioStream.close();

        // Convert to float array
        float[] audioData = new float[audioBytes.length / 2];
        for (int i = 0; i < audioData.length; i++) {
            // 16-bit PCM: two bytes per sample (little-endian)
            short sample = (short) ((audioBytes[i * 2] & 0xff) | (audioBytes[i * 2 + 1] << 8));
            audioData[i] = sample / 32768.0f; // Normalize to -1.0 to 1.0
        }

        return audioData;
    }

}
156
examples/java-example/src/main/java/org/example/SlieroVadDetector.java
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
package org.example;
|
||||||
|
|
||||||
|
import ai.onnxruntime.OrtException;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.math.RoundingMode;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Silero VAD Detector
|
||||||
|
* Real-time voice activity detection
|
||||||
|
*
|
||||||
|
* @author VvvvvGH
|
||||||
|
*/
|
||||||
|
public class SlieroVadDetector {
|
||||||
|
// ONNX model for speech processing
|
||||||
|
private final SlieroVadOnnxModel model;
|
||||||
|
// Speech start threshold
|
||||||
|
private final float startThreshold;
|
||||||
|
// Speech end threshold
|
||||||
|
private final float endThreshold;
|
||||||
|
// Sampling rate
|
||||||
|
private final int samplingRate;
|
||||||
|
// Minimum silence samples to determine speech end
|
||||||
|
private final float minSilenceSamples;
|
||||||
|
// Speech padding samples for calculating speech boundaries
|
||||||
|
private final float speechPadSamples;
|
||||||
|
// Triggered state (whether speech is being detected)
|
||||||
|
private boolean triggered;
|
||||||
|
// Temporary speech end sample position
|
||||||
|
private int tempEnd;
|
||||||
|
// Current sample position
|
||||||
|
private int currentSample;
|
||||||
|
|
||||||
|
|
||||||
|
public SlieroVadDetector(String modelPath,
|
||||||
|
float startThreshold,
|
||||||
|
float endThreshold,
|
||||||
|
int samplingRate,
|
||||||
|
int minSilenceDurationMs,
|
||||||
|
int speechPadMs) throws OrtException {
|
||||||
|
// Validate sampling rate
|
||||||
|
        if (samplingRate != 8000 && samplingRate != 16000) {
            throw new IllegalArgumentException("Does not support sampling rates other than [8000, 16000]");
        }

        // Initialize parameters
        this.model = new SlieroVadOnnxModel(modelPath);
        this.startThreshold = startThreshold;
        this.endThreshold = endThreshold;
        this.samplingRate = samplingRate;
        this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        this.speechPadSamples = samplingRate * speechPadMs / 1000f;
        // Reset state
        reset();
    }

    /**
     * Reset detector state
     */
    public void reset() {
        model.resetStates();
        triggered = false;
        tempEnd = 0;
        currentSample = 0;
    }

    /**
     * Process audio data and detect speech events
     *
     * @param data          Audio data as byte array
     * @param returnSeconds Whether to return timestamps in seconds
     * @return Speech event (start or end) or empty map if no event
     */
    public Map<String, Double> apply(byte[] data, boolean returnSeconds) {

        // Convert byte array to float array
        float[] audioData = new float[data.length / 2];
        for (int i = 0; i < audioData.length; i++) {
            audioData[i] = ((data[i * 2] & 0xff) | (data[i * 2 + 1] << 8)) / 32767.0f;
        }

        // Get window size from audio data length
        int windowSizeSamples = audioData.length;
        // Update current sample position
        currentSample += windowSizeSamples;

        // Get speech probability from model
        float speechProb = 0;
        try {
            speechProb = model.call(new float[][]{audioData}, samplingRate)[0];
        } catch (OrtException e) {
            throw new RuntimeException(e);
        }

        // Reset temporary end if speech probability exceeds threshold
        if (speechProb >= startThreshold && tempEnd != 0) {
            tempEnd = 0;
        }

        // Detect speech start
        if (speechProb >= startThreshold && !triggered) {
            triggered = true;
            int speechStart = (int) (currentSample - speechPadSamples);
            speechStart = Math.max(speechStart, 0);
            Map<String, Double> result = new HashMap<>();
            // Return in seconds or samples based on returnSeconds parameter
            if (returnSeconds) {
                double speechStartSeconds = speechStart / (double) samplingRate;
                double roundedSpeechStart = BigDecimal.valueOf(speechStartSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
                result.put("start", roundedSpeechStart);
            } else {
                result.put("start", (double) speechStart);
            }

            return result;
        }

        // Detect speech end
        if (speechProb < endThreshold && triggered) {
            // Initialize or update temporary end position
            if (tempEnd == 0) {
                tempEnd = currentSample;
            }
            // Wait for minimum silence duration before confirming speech end
            if (currentSample - tempEnd < minSilenceSamples) {
                return Collections.emptyMap();
            } else {
                // Calculate speech end time and reset state
                int speechEnd = (int) (tempEnd + speechPadSamples);
                tempEnd = 0;
                triggered = false;
                Map<String, Double> result = new HashMap<>();

                if (returnSeconds) {
                    double speechEndSeconds = speechEnd / (double) samplingRate;
                    double roundedSpeechEnd = BigDecimal.valueOf(speechEndSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
                    result.put("end", roundedSpeechEnd);
                } else {
                    result.put("end", (double) speechEnd);
                }
                return result;
            }
        }

        // No speech event detected
        return Collections.emptyMap();
    }

    public void close() throws OrtException {
        reset();
        model.close();
    }
}
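For orientation, a minimal sketch of how the streaming apply() loop above is typically driven. This is an illustration, not part of the diff: the enclosing class name SlieroVadDetector, the helper name streamFile, and the 16 kHz / 512-sample window are assumptions; apply() expects one window of 16-bit little-endian PCM per call.

import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.util.Map;

class StreamingVadExample {
    // Feed a wav file to an already-constructed detector, one window at a time.
    static void streamFile(SlieroVadDetector detector, File wavFile) throws Exception {
        try (AudioInputStream in = AudioSystem.getAudioInputStream(wavFile)) {
            byte[] chunk = new byte[512 * 2]; // 512 samples of 16-bit PCM at 16 kHz
            while (in.read(chunk) == chunk.length) {
                Map<String, Double> event = detector.apply(chunk, true);
                if (!event.isEmpty()) {
                    System.out.println(event); // e.g. {start=1.2}, later {end=3.4}
                }
            }
        }
        detector.close();
    }
}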
@@ -0,0 +1,224 @@
package org.example;

import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Silero VAD ONNX Model Wrapper
 *
 * @author VvvvvGH
 */
public class SlieroVadOnnxModel {
    // ONNX runtime session
    private final OrtSession session;
    // Model state - dimensions: [2, batch_size, 128]
    private float[][][] state;
    // Context - stores the tail of the previous audio chunk
    private float[][] context;
    // Last sample rate
    private int lastSr = 0;
    // Last batch size
    private int lastBatchSize = 0;
    // Supported sample rates
    private static final List<Integer> SAMPLE_RATES = Arrays.asList(8000, 16000);

    // Constructor
    public SlieroVadOnnxModel(String modelPath) throws OrtException {
        // Get the ONNX runtime environment
        OrtEnvironment env = OrtEnvironment.getEnvironment();
        // Create ONNX session options
        OrtSession.SessionOptions opts = new OrtSession.SessionOptions();
        // Set InterOp thread count to 1 (for parallel processing of different graph operations)
        opts.setInterOpNumThreads(1);
        // Set IntraOp thread count to 1 (for parallel processing within a single operation)
        opts.setIntraOpNumThreads(1);
        // Enable CPU execution optimization
        opts.addCPU(true);
        // Create ONNX session with the environment, model path, and options
        session = env.createSession(modelPath, opts);
        // Reset states
        resetStates();
    }

    /**
     * Reset states with default batch size
     */
    void resetStates() {
        resetStates(1);
    }

    /**
     * Reset states with specific batch size
     *
     * @param batchSize Batch size for state initialization
     */
    void resetStates(int batchSize) {
        state = new float[2][batchSize][128];
        context = new float[0][]; // Empty context
        lastSr = 0;
        lastBatchSize = 0;
    }

    public void close() throws OrtException {
        session.close();
    }

    /**
     * Inner class for validation result
     */
    public static class ValidationResult {
        public final float[][] x;
        public final int sr;

        public ValidationResult(float[][] x, int sr) {
            this.x = x;
            this.sr = sr;
        }
    }

    /**
     * Validate input data
     *
     * @param x  Audio data array
     * @param sr Sample rate
     * @return Validated input data and sample rate
     */
    private ValidationResult validateInput(float[][] x, int sr) {
        // Ensure input is at least 2D
        if (x.length == 1) {
            x = new float[][]{x[0]};
        }
        // Check if input dimension is valid
        if (x.length > 2) {
            throw new IllegalArgumentException("Incorrect audio data dimension: " + x.length);
        }

        // Downsample if sample rate is a multiple of 16000
        if (sr != 16000 && (sr % 16000 == 0)) {
            int step = sr / 16000;
            float[][] reducedX = new float[x.length][];

            for (int i = 0; i < x.length; i++) {
                float[] current = x[i];
                float[] newArr = new float[(current.length + step - 1) / step];

                for (int j = 0, index = 0; j < current.length; j += step, index++) {
                    newArr[index] = current[j];
                }

                reducedX[i] = newArr;
            }

            x = reducedX;
            sr = 16000;
        }

        // Validate sample rate
        if (!SAMPLE_RATES.contains(sr)) {
            throw new IllegalArgumentException("Only supports sample rates " + SAMPLE_RATES + " (or multiples of 16000)");
        }

        // Check if audio chunk is too short
        if (((float) sr) / x[0].length > 31.25) {
            throw new IllegalArgumentException("Input audio is too short");
        }

        return new ValidationResult(x, sr);
    }

    /**
     * Call the ONNX model for inference
     *
     * @param x  Audio data array
     * @param sr Sample rate
     * @return Speech probability output
     * @throws OrtException If ONNX runtime error occurs
     */
    public float[] call(float[][] x, int sr) throws OrtException {
        ValidationResult result = validateInput(x, sr);
        x = result.x;
        sr = result.sr;

        int batchSize = x.length;
        int numSamples = sr == 16000 ? 512 : 256;
        int contextSize = sr == 16000 ? 64 : 32;

        // Reset states only when sample rate or batch size changes
        if (lastSr != 0 && lastSr != sr) {
            resetStates(batchSize);
        } else if (lastBatchSize != 0 && lastBatchSize != batchSize) {
            resetStates(batchSize);
        } else if (lastBatchSize == 0) {
            // First call - state is already initialized, just set batch size
            lastBatchSize = batchSize;
        }

        // Initialize context if needed
        if (context.length == 0) {
            context = new float[batchSize][contextSize];
        }

        // Concatenate context and input
        float[][] xWithContext = new float[batchSize][contextSize + numSamples];
        for (int i = 0; i < batchSize; i++) {
            // Copy context
            System.arraycopy(context[i], 0, xWithContext[i], 0, contextSize);
            // Copy input
            System.arraycopy(x[i], 0, xWithContext[i], contextSize, numSamples);
        }

        OrtEnvironment env = OrtEnvironment.getEnvironment();

        OnnxTensor inputTensor = null;
        OnnxTensor stateTensor = null;
        OnnxTensor srTensor = null;
        OrtSession.Result ortOutputs = null;

        try {
            // Create input tensors
            inputTensor = OnnxTensor.createTensor(env, xWithContext);
            stateTensor = OnnxTensor.createTensor(env, state);
            srTensor = OnnxTensor.createTensor(env, new long[]{sr});

            Map<String, OnnxTensor> inputs = new HashMap<>();
            inputs.put("input", inputTensor);
            inputs.put("sr", srTensor);
            inputs.put("state", stateTensor);

            // Run ONNX model inference
            ortOutputs = session.run(inputs);
            // Get output results
            float[][] output = (float[][]) ortOutputs.get(0).getValue();
            state = (float[][][]) ortOutputs.get(1).getValue();

            // Update context - save the last contextSize samples from input
            for (int i = 0; i < batchSize; i++) {
                System.arraycopy(xWithContext[i], xWithContext[i].length - contextSize,
                        context[i], 0, contextSize);
            }

            lastSr = sr;
            lastBatchSize = batchSize;
            return output[0];
        } finally {
            if (inputTensor != null) {
                inputTensor.close();
            }
            if (stateTensor != null) {
                stateTensor.close();
            }
            if (srTensor != null) {
                srTensor.close();
            }
            if (ortOutputs != null) {
                ortOutputs.close();
            }
        }
    }
}
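A single direct call to the wrapper can be sketched as follows (illustrative only; the model path and the dummy chunk are placeholders, and the snippet belongs inside a method that declares throws OrtException). For 16 kHz the caller passes exactly 512 new samples per call and the wrapper prepends its 64-sample context internally; for 8 kHz the figures are 256 and 32.

SlieroVadOnnxModel model = new SlieroVadOnnxModel("silero_vad.onnx"); // placeholder path
float[] chunk = new float[512]; // one 16 kHz window, normalized to [-1, 1]
float speechProb = model.call(new float[][]{chunk}, 16000)[0];
System.out.println("speech probability: " + speechProb);
model.close();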
@@ -0,0 +1,37 @@
package org.example;

import ai.onnxruntime.OrtException;
import java.io.File;
import java.util.List;

public class App {

    private static final String MODEL_PATH = "/path/silero_vad.onnx";
    private static final String EXAMPLE_WAV_FILE = "/path/example.wav";
    private static final int SAMPLE_RATE = 16000;
    private static final float THRESHOLD = 0.5f;
    private static final int MIN_SPEECH_DURATION_MS = 250;
    private static final float MAX_SPEECH_DURATION_SECONDS = Float.POSITIVE_INFINITY;
    private static final int MIN_SILENCE_DURATION_MS = 100;
    private static final int SPEECH_PAD_MS = 30;

    public static void main(String[] args) {
        // Initialize the Voice Activity Detector
        SileroVadDetector vadDetector;
        try {
            vadDetector = new SileroVadDetector(MODEL_PATH, THRESHOLD, SAMPLE_RATE,
                    MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
            fromWavFile(vadDetector, new File(EXAMPLE_WAV_FILE));
        } catch (OrtException e) {
            System.err.println("Error initializing the VAD detector: " + e.getMessage());
        }
    }

    public static void fromWavFile(SileroVadDetector vadDetector, File wavFile) {
        List<SileroSpeechSegment> speechTimeList = vadDetector.getSpeechSegmentList(wavFile);
        for (SileroSpeechSegment speechSegment : speechTimeList) {
            System.out.println(String.format("start second: %f, end second: %f",
                    speechSegment.getStartSecond(), speechSegment.getEndSecond()));
        }
    }
}
@@ -0,0 +1,51 @@
package org.example;

public class SileroSpeechSegment {
    private Integer startOffset;
    private Integer endOffset;
    private Float startSecond;
    private Float endSecond;

    public SileroSpeechSegment() {
    }

    public SileroSpeechSegment(Integer startOffset, Integer endOffset, Float startSecond, Float endSecond) {
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.startSecond = startSecond;
        this.endSecond = endSecond;
    }

    public Integer getStartOffset() {
        return startOffset;
    }

    public Integer getEndOffset() {
        return endOffset;
    }

    public Float getStartSecond() {
        return startSecond;
    }

    public Float getEndSecond() {
        return endSecond;
    }

    public void setStartOffset(Integer startOffset) {
        this.startOffset = startOffset;
    }

    public void setEndOffset(Integer endOffset) {
        this.endOffset = endOffset;
    }

    public void setStartSecond(Float startSecond) {
        this.startSecond = startSecond;
    }

    public void setEndSecond(Float endSecond) {
        this.endSecond = endSecond;
    }
}
@@ -0,0 +1,244 @@
package org.example;

import ai.onnxruntime.OrtException;

import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class SileroVadDetector {
    private final SileroVadOnnxModel model;
    private final float threshold;
    private final float negThreshold;
    private final int samplingRate;
    private final int windowSizeSample;
    private final float minSpeechSamples;
    private final float speechPadSamples;
    private final float maxSpeechSamples;
    private final float minSilenceSamples;
    private final float minSilenceSamplesAtMaxSpeech;
    private int audioLengthSamples;
    private static final float THRESHOLD_GAP = 0.15f;
    private static final Integer SAMPLING_RATE_8K = 8000;
    private static final Integer SAMPLING_RATE_16K = 16000;

    /**
     * Constructor
     *
     * @param onnxModelPath            the path of the silero-vad onnx model
     * @param threshold                threshold for speech start
     * @param samplingRate             audio sampling rate; only 8000 and 16000 are supported
     * @param minSpeechDurationMs      minimum speech length in millis; any speech shorter than this is not considered speech
     * @param maxSpeechDurationSeconds maximum speech length in seconds; recommended to be set to Float.POSITIVE_INFINITY
     * @param minSilenceDurationMs     minimum silence length in millis; any silence shorter than this is not considered silence
     * @param speechPadMs              additional padding in millis for speech start and end
     * @throws OrtException if the ONNX model cannot be loaded
     */
    public SileroVadDetector(String onnxModelPath, float threshold, int samplingRate,
                             int minSpeechDurationMs, float maxSpeechDurationSeconds,
                             int minSilenceDurationMs, int speechPadMs) throws OrtException {
        if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K) {
            throw new IllegalArgumentException("Sampling rate not supported, only available for [8000, 16000]");
        }
        this.model = new SileroVadOnnxModel(onnxModelPath);
        this.samplingRate = samplingRate;
        this.threshold = threshold;
        this.negThreshold = threshold - THRESHOLD_GAP;
        if (samplingRate == SAMPLING_RATE_16K) {
            this.windowSizeSample = 512;
        } else {
            this.windowSizeSample = 256;
        }
        this.minSpeechSamples = samplingRate * minSpeechDurationMs / 1000f;
        this.speechPadSamples = samplingRate * speechPadMs / 1000f;
        this.maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - windowSizeSample - 2 * speechPadSamples;
        this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        this.minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
        this.reset();
    }

    /**
     * Method to reset the state
     */
    public void reset() {
        model.resetStates();
    }

    /**
     * Get the speech segment list for a given wav-format file
     *
     * @param wavFile wav file
     * @return list of speech segments
     */
    public List<SileroSpeechSegment> getSpeechSegmentList(File wavFile) {
        reset();
        try (AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(wavFile)) {
            List<Float> speechProbList = new ArrayList<>();
            this.audioLengthSamples = audioInputStream.available() / 2;
            byte[] data = new byte[this.windowSizeSample * 2];
            int numBytesRead = 0;

            while ((numBytesRead = audioInputStream.read(data)) != -1) {
                if (numBytesRead <= 0) {
                    break;
                }
                // Convert the byte array to a float array
                float[] audioData = new float[data.length / 2];
                for (int i = 0; i < audioData.length; i++) {
                    audioData[i] = ((data[i * 2] & 0xff) | (data[i * 2 + 1] << 8)) / 32767.0f;
                }

                float speechProb = model.call(new float[][]{audioData}, samplingRate)[0];
                speechProbList.add(speechProb);
            }
            return calculateProb(speechProbList);
        } catch (Exception e) {
            throw new RuntimeException("SileroVadDetector getSpeechSegmentList with error", e);
        }
    }

    /**
     * Calculate speech segments from the probability list
     *
     * @param speechProbList speech probability list
     * @return list of speech segments
     */
    private List<SileroSpeechSegment> calculateProb(List<Float> speechProbList) {
        List<SileroSpeechSegment> result = new ArrayList<>();
        boolean triggered = false;
        int tempEnd = 0, prevEnd = 0, nextStart = 0;
        SileroSpeechSegment segment = new SileroSpeechSegment();

        for (int i = 0; i < speechProbList.size(); i++) {
            Float speechProb = speechProbList.get(i);
            if (speechProb >= threshold && (tempEnd != 0)) {
                tempEnd = 0;
                if (nextStart < prevEnd) {
                    nextStart = windowSizeSample * i;
                }
            }

            if (speechProb >= threshold && !triggered) {
                triggered = true;
                segment.setStartOffset(windowSizeSample * i);
                continue;
            }

            if (triggered && (windowSizeSample * i) - segment.getStartOffset() > maxSpeechSamples) {
                if (prevEnd != 0) {
                    segment.setEndOffset(prevEnd);
                    result.add(segment);
                    segment = new SileroSpeechSegment();
                    if (nextStart < prevEnd) {
                        triggered = false;
                    } else {
                        segment.setStartOffset(nextStart);
                    }
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                } else {
                    segment.setEndOffset(windowSizeSample * i);
                    result.add(segment);
                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }

            if (speechProb < negThreshold && triggered) {
                if (tempEnd == 0) {
                    tempEnd = windowSizeSample * i;
                }
                if (((windowSizeSample * i) - tempEnd) > minSilenceSamplesAtMaxSpeech) {
                    prevEnd = tempEnd;
                }
                if ((windowSizeSample * i) - tempEnd < minSilenceSamples) {
                    continue;
                } else {
                    segment.setEndOffset(tempEnd);
                    if ((segment.getEndOffset() - segment.getStartOffset()) > minSpeechSamples) {
                        result.add(segment);
                    }
                    segment = new SileroSpeechSegment();
                    prevEnd = 0;
                    nextStart = 0;
                    tempEnd = 0;
                    triggered = false;
                    continue;
                }
            }
        }

        if (segment.getStartOffset() != null && (audioLengthSamples - segment.getStartOffset()) > minSpeechSamples) {
            segment.setEndOffset(audioLengthSamples);
            result.add(segment);
        }

        for (int i = 0; i < result.size(); i++) {
            SileroSpeechSegment item = result.get(i);
            if (i == 0) {
                item.setStartOffset((int) (Math.max(0, item.getStartOffset() - speechPadSamples)));
            }
            if (i != result.size() - 1) {
                SileroSpeechSegment nextItem = result.get(i + 1);
                Integer silenceDuration = nextItem.getStartOffset() - item.getEndOffset();
                if (silenceDuration < 2 * speechPadSamples) {
                    item.setEndOffset(item.getEndOffset() + (silenceDuration / 2));
                    nextItem.setStartOffset(Math.max(0, nextItem.getStartOffset() - (silenceDuration / 2)));
                } else {
                    item.setEndOffset((int) (Math.min(audioLengthSamples, item.getEndOffset() + speechPadSamples)));
                    nextItem.setStartOffset((int) (Math.max(0, nextItem.getStartOffset() - speechPadSamples)));
                }
            } else {
                item.setEndOffset((int) (Math.min(audioLengthSamples, item.getEndOffset() + speechPadSamples)));
            }
        }

        return mergeListAndCalculateSecond(result, samplingRate);
    }

    private List<SileroSpeechSegment> mergeListAndCalculateSecond(List<SileroSpeechSegment> original, Integer samplingRate) {
        List<SileroSpeechSegment> result = new ArrayList<>();
        if (original == null || original.size() == 0) {
            return result;
        }
        Integer left = original.get(0).getStartOffset();
        Integer right = original.get(0).getEndOffset();
        if (original.size() > 1) {
            original.sort(Comparator.comparingLong(SileroSpeechSegment::getStartOffset));
            for (int i = 1; i < original.size(); i++) {
                SileroSpeechSegment segment = original.get(i);

                if (segment.getStartOffset() > right) {
                    result.add(new SileroSpeechSegment(left, right,
                            calculateSecondByOffset(left, samplingRate), calculateSecondByOffset(right, samplingRate)));
                    left = segment.getStartOffset();
                    right = segment.getEndOffset();
                } else {
                    right = Math.max(right, segment.getEndOffset());
                }
            }
            result.add(new SileroSpeechSegment(left, right,
                    calculateSecondByOffset(left, samplingRate), calculateSecondByOffset(right, samplingRate)));
        } else {
            result.add(new SileroSpeechSegment(left, right,
                    calculateSecondByOffset(left, samplingRate), calculateSecondByOffset(right, samplingRate)));
        }
        return result;
    }

    private Float calculateSecondByOffset(Integer offset, Integer samplingRate) {
        float secondValue = offset * 1.0f / samplingRate;
        return (float) Math.floor(secondValue * 1000.0f) / 1000.0f;
    }
}
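A worked example of the padding step above, with illustrative numbers: at 16 kHz with speechPadMs = 30, speechPadSamples is 16000 * 30 / 1000f = 480. Two segments whose boundaries sit 700 samples apart are closer than 2 * 480 = 960, so calculateProb splits the 700-sample gap evenly instead of applying the full pad to each side:

int endA = 32000, startB = 32700;            // adjacent segment boundaries, in samples
float speechPadSamples = 16000 * 30 / 1000f; // 480.0
int silenceDuration = startB - endA;         // 700, which is < 2 * 480
if (silenceDuration < 2 * speechPadSamples) {
    endA += silenceDuration / 2;             // 32350
    startB -= silenceDuration / 2;           // 32350
}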
@@ -0,0 +1,234 @@
package org.example;

import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SileroVadOnnxModel {
    // The ONNX runtime session
    private final OrtSession session;
    private float[][][] state;
    private float[][] context;
    // The last sample rate
    private int lastSr = 0;
    // The last batch size
    private int lastBatchSize = 0;
    // List of supported sample rates
    private static final List<Integer> SAMPLE_RATES = Arrays.asList(8000, 16000);

    // Constructor
    public SileroVadOnnxModel(String modelPath) throws OrtException {
        // Get the ONNX runtime environment
        OrtEnvironment env = OrtEnvironment.getEnvironment();
        // Create an ONNX session options object
        OrtSession.SessionOptions opts = new OrtSession.SessionOptions();
        // Set the InterOp thread count to 1; InterOp threads are used for parallel processing of different computation graph operations
        opts.setInterOpNumThreads(1);
        // Set the IntraOp thread count to 1; IntraOp threads are used for parallel processing within a single operation
        opts.setIntraOpNumThreads(1);
        // Add the CPU execution provider (true enables its arena allocator)
        opts.addCPU(true);
        // Create an ONNX session using the environment, model path, and options
        session = env.createSession(modelPath, opts);
        // Reset states
        resetStates();
    }

    /**
     * Reset states
     */
    void resetStates() {
        state = new float[2][1][128];
        context = new float[0][];
        lastSr = 0;
        lastBatchSize = 0;
    }

    public void close() throws OrtException {
        session.close();
    }

    /**
     * Inner class ValidationResult
     */
    public static class ValidationResult {
        public final float[][] x;
        public final int sr;

        // Constructor
        public ValidationResult(float[][] x, int sr) {
            this.x = x;
            this.sr = sr;
        }
    }

    /**
     * Validate input data
     */
    private ValidationResult validateInput(float[][] x, int sr) {
        // Wrap one-dimensional input as a batch of size 1
        if (x.length == 1) {
            x = new float[][]{x[0]};
        }
        // Throw an exception when the input has more than two dimensions
        if (x.length > 2) {
            throw new IllegalArgumentException("Incorrect audio data dimension: " + x.length);
        }

        // Downsample when the sample rate is not 16000 but a multiple of 16000
        if (sr != 16000 && (sr % 16000 == 0)) {
            int step = sr / 16000;
            float[][] reducedX = new float[x.length][];

            for (int i = 0; i < x.length; i++) {
                float[] current = x[i];
                float[] newArr = new float[(current.length + step - 1) / step];

                for (int j = 0, index = 0; j < current.length; j += step, index++) {
                    newArr[index] = current[j];
                }

                reducedX[i] = newArr;
            }

            x = reducedX;
            sr = 16000;
        }

        // If the sample rate is not in the list of supported sample rates, throw an exception
        if (!SAMPLE_RATES.contains(sr)) {
            throw new IllegalArgumentException("Only supports sample rates " + SAMPLE_RATES + " (or multiples of 16000)");
        }

        // If the input audio chunk is too short, throw an exception
        if (((float) sr) / x[0].length > 31.25) {
            throw new IllegalArgumentException("Input audio is too short");
        }

        // Return the validated result
        return new ValidationResult(x, sr);
    }

    private static float[][] concatenate(float[][] a, float[][] b) {
        if (a.length != b.length) {
            throw new IllegalArgumentException("The number of rows in both arrays must be the same.");
        }

        int rows = a.length;
        int colsA = a[0].length;
        int colsB = b[0].length;
        float[][] result = new float[rows][colsA + colsB];

        for (int i = 0; i < rows; i++) {
            System.arraycopy(a[i], 0, result[i], 0, colsA);
            System.arraycopy(b[i], 0, result[i], colsA, colsB);
        }

        return result;
    }

    private static float[][] getLastColumns(float[][] array, int contextSize) {
        int rows = array.length;
        int cols = array[0].length;

        if (contextSize > cols) {
            throw new IllegalArgumentException("contextSize cannot be greater than the number of columns in the array.");
        }

        float[][] result = new float[rows][contextSize];

        for (int i = 0; i < rows; i++) {
            System.arraycopy(array[i], cols - contextSize, result[i], 0, contextSize);
        }

        return result;
    }

    /**
     * Method to call the ONNX model
     */
    public float[] call(float[][] x, int sr) throws OrtException {
        ValidationResult result = validateInput(x, sr);
        x = result.x;
        sr = result.sr;
        int numberSamples = 256;
        if (sr == 16000) {
            numberSamples = 512;
        }

        if (x[0].length != numberSamples) {
            throw new IllegalArgumentException("Provided number of samples is " + x[0].length + " (Supported values: 256 for 8000 sample rate, 512 for 16000)");
        }

        int batchSize = x.length;

        int contextSize = 32;
        if (sr == 16000) {
            contextSize = 64;
        }

        if (lastBatchSize == 0) {
            resetStates();
        }
        if (lastSr != 0 && lastSr != sr) {
            resetStates();
        }
        if (lastBatchSize != 0 && lastBatchSize != batchSize) {
            resetStates();
        }

        if (context.length == 0) {
            context = new float[batchSize][contextSize];
        }

        x = concatenate(context, x);

        OrtEnvironment env = OrtEnvironment.getEnvironment();

        OnnxTensor inputTensor = null;
        OnnxTensor stateTensor = null;
        OnnxTensor srTensor = null;
        OrtSession.Result ortOutputs = null;

        try {
            // Create input tensors
            inputTensor = OnnxTensor.createTensor(env, x);
            stateTensor = OnnxTensor.createTensor(env, state);
            srTensor = OnnxTensor.createTensor(env, new long[]{sr});

            Map<String, OnnxTensor> inputs = new HashMap<>();
            inputs.put("input", inputTensor);
            inputs.put("sr", srTensor);
            inputs.put("state", stateTensor);

            // Call the ONNX model for inference
            ortOutputs = session.run(inputs);
            // Get the output results
            float[][] output = (float[][]) ortOutputs.get(0).getValue();
            state = (float[][][]) ortOutputs.get(1).getValue();

            context = getLastColumns(x, contextSize);
            lastSr = sr;
            lastBatchSize = batchSize;
            return output[0];
        } finally {
            if (inputTensor != null) {
                inputTensor.close();
            }
            if (stateTensor != null) {
                stateTensor.close();
            }
            if (srTensor != null) {
                srTensor.close();
            }
            if (ortOutputs != null) {
                ortOutputs.close();
            }
        }
    }
}
@@ -186,7 +186,7 @@ if __name__ == '__main__':
                         help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
     parser.add_argument('-N', '--num_steps', type=int, default=8,
-                        help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
+                        help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")
     parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
                         help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
161 examples/parallel_example.ipynb Normal file
@@ -0,0 +1,161 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Install Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install -q torchaudio\n",
    "SAMPLING_RATE = 16000\n",
    "import torch\n",
    "from pprint import pprint\n",
    "import time\n",
    "import shutil\n",
    "\n",
    "torch.set_num_threads(1)\n",
    "NUM_PROCESS = 4  # set to the number of CPU cores in the machine\n",
    "NUM_COPIES = 8\n",
    "# download wav files, make multiple copies\n",
    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example0.wav\")\n",
    "for idx in range(NUM_COPIES - 1):\n",
    "    shutil.copy(f\"en_example0.wav\", f\"en_example{idx+1}.wav\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load VAD model from torch hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True,\n",
    "                              onnx=False)\n",
    "\n",
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define a vad process function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import multiprocessing\n",
    "\n",
    "vad_models = dict()\n",
    "\n",
    "def init_model(model):\n",
    "    pid = multiprocessing.current_process().pid\n",
    "    model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=False,\n",
    "                              onnx=False)\n",
    "    vad_models[pid] = model\n",
    "\n",
    "def vad_process(audio_file: str):\n",
    "\n",
    "    pid = multiprocessing.current_process().pid\n",
    "\n",
    "    with torch.no_grad():\n",
    "        wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)\n",
    "        return get_speech_timestamps(\n",
    "            wav,\n",
    "            vad_models[pid],\n",
    "            0.46,   # speech prob threshold\n",
    "            16000,  # sample rate\n",
    "            300,    # min speech duration in ms\n",
    "            20,     # max speech duration in seconds\n",
    "            600,    # min silence duration\n",
    "            512,    # window size\n",
    "            200,    # speech pad ms\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parallelization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
    "\n",
    "futures = []\n",
    "\n",
    "with ProcessPoolExecutor(max_workers=NUM_PROCESS, initializer=init_model, initargs=(model,)) as ex:\n",
    "    for i in range(NUM_COPIES):\n",
    "        futures.append(ex.submit(vad_process, f\"en_example{i}.wav\"))\n",
    "\n",
    "for finished in as_completed(futures):\n",
    "    pprint(finished.result())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@@ -8,6 +8,8 @@ Currently, the notebook consists of two examples:
 - One that records audio of a predefined length from the microphone, processes it with Silero-VAD, and plots it afterwards.
 - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter.
 
+This example does not work in google colab! For local usage only.
+
 ## Example Video for the Real-Time Visualization
File diff suppressed because one or more lines are too long

2 examples/rust-example/.gitignore vendored Normal file
@@ -0,0 +1,2 @@
target/
recorder.wav

823 examples/rust-example/Cargo.lock generated Normal file
@@ -0,0 +1,823 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

[[package]]
name = "autocfg"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"

[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"

[[package]]
name = "base64ct"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e050f626429857a27ddccb31e0aca21356bfa709c04041aefddac081a8f068a"

[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

[[package]]
name = "bitflags"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"

[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
 "generic-array",
]

[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

[[package]]
name = "bytes"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"

[[package]]
name = "cc"
version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "core-foundation"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
dependencies = [
 "core-foundation-sys",
 "libc",
]

[[package]]
name = "core-foundation-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"

[[package]]
name = "cpufeatures"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
dependencies = [
 "libc",
]

[[package]]
name = "crc32fast"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
 "cfg-if",
]

[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
 "generic-array",
 "typenum",
]

[[package]]
name = "der"
version = "0.7.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
dependencies = [
 "pem-rfc7468",
 "zeroize",
]

[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
 "block-buffer",
 "crypto-common",
]

[[package]]
name = "errno"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
 "libc",
 "windows-sys 0.52.0",
]

[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"

[[package]]
name = "filetime"
version = "0.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
dependencies = [
 "cfg-if",
 "libc",
 "redox_syscall",
 "windows-sys 0.52.0",
]

[[package]]
name = "flate2"
version = "1.0.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
dependencies = [
 "crc32fast",
 "miniz_oxide",
]

[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
 "foreign-types-shared",
]

[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"

[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
 "typenum",
 "version_check",
]

[[package]]
name = "hound"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"

[[package]]
name = "http"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
dependencies = [
 "bytes",
 "itoa",
]

[[package]]
name = "httparse"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"

[[package]]
name = "itoa"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"

[[package]]
name = "libc"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"

[[package]]
name = "linux-raw-sys"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"

[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"

[[package]]
name = "matrixmultiply"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2"
dependencies = [
 "autocfg",
 "rawpointer",
]

[[package]]
name = "miniz_oxide"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
dependencies = [
 "adler",
]

[[package]]
name = "native-tls"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e"
dependencies = [
 "libc",
 "log",
 "openssl",
 "openssl-probe",
 "openssl-sys",
 "schannel",
 "security-framework",
 "security-framework-sys",
 "tempfile",
]

[[package]]
name = "ndarray"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
dependencies = [
 "matrixmultiply",
 "num-complex",
 "num-integer",
 "num-traits",
 "portable-atomic",
 "portable-atomic-util",
 "rawpointer",
]

[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
 "num-traits",
]

[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
 "num-traits",
]

[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
 "autocfg",
]

[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

[[package]]
name = "openssl"
version = "0.10.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328"
dependencies = [
 "bitflags 2.5.0",
 "cfg-if",
 "foreign-types",
 "libc",
 "once_cell",
 "openssl-macros",
 "openssl-sys",
]

[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "openssl-probe"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"

[[package]]
name = "openssl-sys"
version = "0.9.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321"
dependencies = [
 "cc",
 "libc",
 "pkg-config",
 "vcpkg",
]

[[package]]
name = "ort"
version = "2.0.0-rc.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa7e49bd669d32d7bc2a15ec540a527e7764aec722a45467814005725bcd721"
dependencies = [
 "ndarray",
 "ort-sys",
 "smallvec",
 "tracing",
]

[[package]]
name = "ort-sys"
version = "2.0.0-rc.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2aba9f5c7c479925205799216e7e5d07cc1d4fa76ea8058c60a9a30f6a4e890"
dependencies = [
 "flate2",
 "pkg-config",
 "sha2",
 "tar",
 "ureq",
]

[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
 "base64ct",
]

[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

[[package]]
name = "pin-project-lite"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"

[[package]]
name = "pkg-config"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

[[package]]
name = "portable-atomic"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"

[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
 "portable-atomic",
]

[[package]]
name = "proc-macro2"
version = "1.0.84"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"

[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
 "bitflags 1.3.2",
]

[[package]]
name = "rust-example"
version = "0.1.0"
dependencies = [
 "hound",
 "ndarray",
 "ort",
]

[[package]]
name = "rustix"
version = "0.38.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
dependencies = [
 "bitflags 2.5.0",
 "errno",
 "libc",
 "linux-raw-sys",
 "windows-sys 0.52.0",
]

[[package]]
name = "rustls-pki-types"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282"
dependencies = [
 "zeroize",
]

[[package]]
name = "schannel"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
dependencies = [
 "windows-sys 0.61.2",
]

[[package]]
name = "security-framework"
version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0"
dependencies = [
 "bitflags 2.5.0",
 "core-foundation",
 "core-foundation-sys",
 "libc",
 "security-framework-sys",
]

[[package]]
name = "security-framework-sys"
version = "2.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0"
dependencies = [
 "core-foundation-sys",
 "libc",
]

[[package]]
name = "sha2"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
]

[[package]]
name = "smallvec"
version = "2.0.0-alpha.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d44cfb396c3caf6fbfd0ab422af02631b69ddd96d2eff0b0f0724f9024051b"

[[package]]
name = "socks"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
dependencies = [
 "byteorder",
 "libc",
 "winapi",
]

[[package]]
name = "syn"
version = "2.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "tar"
version = "0.4.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
|
||||||
|
dependencies = [
|
||||||
|
"filetime",
|
||||||
|
"libc",
|
||||||
|
"xattr",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"fastrand",
|
||||||
|
"once_cell",
|
||||||
|
"rustix",
|
||||||
|
"windows-sys 0.59.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tracing"
|
||||||
|
version = "0.1.40"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
||||||
|
dependencies = [
|
||||||
|
"pin-project-lite",
|
||||||
|
"tracing-core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tracing-core"
|
||||||
|
version = "0.1.32"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||||
|
dependencies = [
|
||||||
|
"once_cell",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typenum"
|
||||||
|
version = "1.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq"
|
||||||
|
version = "3.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a"
|
||||||
|
dependencies = [
|
||||||
|
"base64",
|
||||||
|
"der",
|
||||||
|
"log",
|
||||||
|
"native-tls",
|
||||||
|
"percent-encoding",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"socks",
|
||||||
|
"ureq-proto",
|
||||||
|
"utf-8",
|
||||||
|
"webpki-root-certs",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq-proto"
|
||||||
|
version = "0.5.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
|
||||||
|
dependencies = [
|
||||||
|
"base64",
|
||||||
|
"http",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf-8"
|
||||||
|
version = "0.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "vcpkg"
|
||||||
|
version = "0.2.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "version_check"
|
||||||
|
version = "0.9.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "webpki-root-certs"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ee3e3b5f5e80bc89f30ce8d0343bf4e5f12341c51f3e26cbeecbc7c85443e85b"
|
||||||
|
dependencies = [
|
||||||
|
"rustls-pki-types",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi"
|
||||||
|
version = "0.3.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-i686-pc-windows-gnu",
|
||||||
|
"winapi-x86_64-pc-windows-gnu",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-i686-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-x86_64-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-link"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.59.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.61.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-targets"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm",
|
||||||
|
"windows_aarch64_msvc",
|
||||||
|
"windows_i686_gnu",
|
||||||
|
"windows_i686_gnullvm",
|
||||||
|
"windows_i686_msvc",
|
||||||
|
"windows_x86_64_gnu",
|
||||||
|
"windows_x86_64_gnullvm",
|
||||||
|
"windows_x86_64_msvc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xattr"
|
||||||
|
version = "1.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"linux-raw-sys",
|
||||||
|
"rustix",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zeroize"
|
||||||
|
version = "1.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
||||||
examples/rust-example/Cargo.toml (new file, 9 lines)
@@ -0,0 +1,9 @@
[package]
name = "rust-example"
version = "0.1.0"
edition = "2021"

[dependencies]
ort = { version = "=2.0.0-rc.10", features = ["ndarray"] }
ndarray = "0.16"
hound = "3"
examples/rust-example/README.md (new file, 19 lines)
@@ -0,0 +1,19 @@
# Stream example in Rust

Modeled after the [C++ stream example](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)

## Dependencies

- To build the Rust crate `ort` you need `cc` installed.

## Usage

Just run

```
cargo run
```

If you run the example outside of this repo, point the environment variable at the model

```
SILERO_MODEL_PATH=/path/to/silero_vad.onnx cargo run
```

To test against a wav file other than `recorder.wav`, pass it as the first argument

```
cargo run -- /path/to/audio/file.wav
```
examples/rust-example/src/main.rs (new file, 36 lines)
@@ -0,0 +1,36 @@
mod silero;
mod utils;
mod vad_iter;

fn main() {
    // The model path defaults to the copy shipped in this repo; override it with SILERO_MODEL_PATH.
    let model_path = std::env::var("SILERO_MODEL_PATH")
        .unwrap_or_else(|_| String::from("../../src/silero_vad/data/silero_vad.onnx"));
    let audio_path = std::env::args()
        .nth(1)
        .unwrap_or_else(|| String::from("recorder.wav"));
    let mut wav_reader = hound::WavReader::open(audio_path).unwrap();
    let sample_rate = match wav_reader.spec().sample_rate {
        8000 => utils::SampleRate::EightkHz,
        16000 => utils::SampleRate::SixteenkHz,
        _ => panic!("Unsupported sample rate. Expect 8 kHz or 16 kHz."),
    };
    if wav_reader.spec().sample_format != hound::SampleFormat::Int {
        panic!("Unsupported sample format. Expect Int.");
    }
    // Collect all decodable i16 PCM samples into memory.
    let content = wav_reader
        .samples()
        .filter_map(|x| x.ok())
        .collect::<Vec<i16>>();
    assert!(!content.is_empty());
    let silero = silero::Silero::new(sample_rate, model_path).unwrap();
    let vad_params = utils::VadParams {
        sample_rate: sample_rate.into(),
        ..Default::default()
    };
    let mut vad_iterator = vad_iter::VadIter::new(silero, vad_params);
    vad_iterator.process(&content).unwrap();
    for timestamp in vad_iterator.speeches() {
        println!("{}", timestamp);
    }
    println!("Finished.");
}
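The timestamps printed above are raw sample indices (see `TimeStamp` in `utils.rs` below). A minimal sketch of converting them to seconds; `to_seconds` is a hypothetical helper and not part of the example crate:

```rust
// Hypothetical helper: convert a TimeStamp's sample indices to seconds.
fn to_seconds(start: i64, end: i64, sample_rate: usize) -> (f64, f64) {
    let sr = sample_rate as f64;
    (start as f64 / sr, end as f64 / sr)
}

fn main() {
    // Samples 8000..24000 at 16 kHz correspond to 0.5 s .. 1.5 s of audio.
    assert_eq!(to_seconds(8000, 24000, 16000), (0.5, 1.5));
}
```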
examples/rust-example/src/silero.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
use crate::utils;
use ndarray::{Array, Array1, Array2, ArrayBase, ArrayD, Dim, IxDynImpl, OwnedRepr};
use ort::session::Session;
use ort::value::Value;
use std::mem::take;
use std::path::Path;

#[derive(Debug)]
pub struct Silero {
    session: Session,
    sample_rate: ArrayBase<OwnedRepr<i64>, Dim<[usize; 1]>>,
    state: ArrayBase<OwnedRepr<f32>, Dim<IxDynImpl>>,
    context: Array1<f32>,
    context_size: usize,
}

impl Silero {
    pub fn new(
        sample_rate: utils::SampleRate,
        model_path: impl AsRef<Path>,
    ) -> Result<Self, ort::Error> {
        let session = Session::builder()?.commit_from_file(model_path)?;
        let state = ArrayD::<f32>::zeros([2, 1, 128].as_slice());
        let sample_rate_val: i64 = sample_rate.into();
        let context_size = if sample_rate_val == 16000 { 64 } else { 32 };
        let context = Array1::<f32>::zeros(context_size);
        let sample_rate = Array::from_shape_vec([1], vec![sample_rate_val]).unwrap();
        Ok(Self {
            session,
            sample_rate,
            state,
            context,
            context_size,
        })
    }

    pub fn reset(&mut self) {
        self.state = ArrayD::<f32>::zeros([2, 1, 128].as_slice());
        self.context = Array1::<f32>::zeros(self.context_size);
    }

    pub fn calc_level(&mut self, audio_frame: &[i16]) -> Result<f32, ort::Error> {
        // Normalize i16 PCM into [-1.0, 1.0]
        let data = audio_frame
            .iter()
            .map(|x| (*x as f32) / (i16::MAX as f32))
            .collect::<Vec<_>>();

        // Concatenate context with input
        let mut input_with_context = Vec::with_capacity(self.context_size + data.len());
        input_with_context.extend_from_slice(self.context.as_slice().unwrap());
        input_with_context.extend_from_slice(&data);

        let frame =
            Array2::<f32>::from_shape_vec([1, input_with_context.len()], input_with_context)
                .unwrap();

        let frame_value = Value::from_array(frame)?;
        // take() leaves a zeroed placeholder in self.state while the session runs
        let state_value = Value::from_array(take(&mut self.state))?;
        let sr_value = Value::from_array(self.sample_rate.clone())?;

        let res = self.session.run([
            (&frame_value).into(),
            (&state_value).into(),
            (&sr_value).into(),
        ])?;

        let (shape, state_data) = res["stateN"].try_extract_tensor::<f32>()?;
        let shape_usize: Vec<usize> = shape.as_ref().iter().map(|&d| d as usize).collect();
        self.state = ArrayD::from_shape_vec(shape_usize.as_slice(), state_data.to_vec()).unwrap();

        // Update context with last context_size samples from the input
        if data.len() >= self.context_size {
            self.context = Array1::from_vec(data[data.len() - self.context_size..].to_vec());
        }

        let prob = *res["output"]
            .try_extract_tensor::<f32>()
            .unwrap()
            .1
            .first()
            .unwrap();
        Ok(prob)
    }
}
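For shape bookkeeping: at 16 kHz the model consumes 64 context samples prepended to each new frame, while the `[2, 1, 128]` recurrent state is carried across calls. A minimal sketch of that arithmetic, assuming the 32 ms / 512-sample frames used elsewhere in this example:

```rust
// Input-shape arithmetic implied by calc_level above (a sketch; the
// 512-sample frame is an assumption taken from this example's defaults).
fn main() {
    let context_size = 64;        // 16 kHz -> 64 samples; 8 kHz -> 32
    let frame_size_samples = 512; // 32 ms at 16 kHz
    // The tensor handed to the ONNX session is shaped [1, context + frame].
    assert_eq!(context_size + frame_size_samples, 576);
}
```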
examples/rust-example/src/utils.rs (new file, 60 lines)
@@ -0,0 +1,60 @@
#[derive(Debug, Clone, Copy)]
pub enum SampleRate {
    EightkHz,
    SixteenkHz,
}

impl From<SampleRate> for i64 {
    fn from(value: SampleRate) -> Self {
        match value {
            SampleRate::EightkHz => 8000,
            SampleRate::SixteenkHz => 16000,
        }
    }
}

impl From<SampleRate> for usize {
    fn from(value: SampleRate) -> Self {
        match value {
            SampleRate::EightkHz => 8000,
            SampleRate::SixteenkHz => 16000,
        }
    }
}

#[derive(Debug)]
pub struct VadParams {
    pub frame_size: usize,
    pub threshold: f32,
    pub min_silence_duration_ms: usize,
    pub speech_pad_ms: usize,
    pub min_speech_duration_ms: usize,
    pub max_speech_duration_s: f32,
    pub sample_rate: usize,
}

impl Default for VadParams {
    fn default() -> Self {
        Self {
            frame_size: 32, // 32ms for 512 samples at 16kHz
            threshold: 0.5,
            min_silence_duration_ms: 0,
            speech_pad_ms: 64,
            min_speech_duration_ms: 64,
            max_speech_duration_s: f32::INFINITY,
            sample_rate: 16000,
        }
    }
}

#[derive(Debug, Default)]
pub struct TimeStamp {
    pub start: i64,
    pub end: i64,
}

impl std::fmt::Display for TimeStamp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[start:{:08}, end:{:08}]", self.start, self.end)
    }
}
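The `frame_size: 32` default is in milliseconds; `vad_iter.rs` (next file) converts it to samples via `sr_per_ms`. A worked instance of that conversion:

```rust
// Frame-size conversion as done in Params::from in vad_iter.rs.
fn main() {
    let sample_rate = 16000usize;
    let frame_size_ms = 32usize;
    let sr_per_ms = sample_rate / 1000; // 16 samples per millisecond
    let frame_size_samples = frame_size_ms * sr_per_ms;
    assert_eq!(frame_size_samples, 512); // matches the model's 16 kHz window
}
```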
examples/rust-example/src/vad_iter.rs (new file, 223 lines)
@@ -0,0 +1,223 @@
use crate::{silero, utils};

const DEBUG_SPEECH_PROB: bool = true;
#[derive(Debug)]
pub struct VadIter {
    silero: silero::Silero,
    params: Params,
    state: State,
}

impl VadIter {
    pub fn new(silero: silero::Silero, params: utils::VadParams) -> Self {
        Self {
            silero,
            params: Params::from(params),
            state: State::new(),
        }
    }

    pub fn process(&mut self, samples: &[i16]) -> Result<(), ort::Error> {
        self.reset_states();
        for audio_frame in samples.chunks_exact(self.params.frame_size_samples) {
            let speech_prob: f32 = self.silero.calc_level(audio_frame)?;
            self.state.update(&self.params, speech_prob);
        }
        self.state.check_for_last_speech(samples.len());
        Ok(())
    }

    pub fn speeches(&self) -> &[utils::TimeStamp] {
        &self.state.speeches
    }
}

impl VadIter {
    fn reset_states(&mut self) {
        self.silero.reset();
        self.state = State::new()
    }
}

#[allow(unused)]
#[derive(Debug)]
struct Params {
    frame_size: usize,
    threshold: f32,
    min_silence_duration_ms: usize,
    speech_pad_ms: usize,
    min_speech_duration_ms: usize,
    max_speech_duration_s: f32,
    sample_rate: usize,
    sr_per_ms: usize,
    frame_size_samples: usize,
    min_speech_samples: usize,
    speech_pad_samples: usize,
    max_speech_samples: f32,
    min_silence_samples: usize,
    min_silence_samples_at_max_speech: usize,
}

impl From<utils::VadParams> for Params {
    fn from(value: utils::VadParams) -> Self {
        let frame_size = value.frame_size;
        let threshold = value.threshold;
        let min_silence_duration_ms = value.min_silence_duration_ms;
        let speech_pad_ms = value.speech_pad_ms;
        let min_speech_duration_ms = value.min_speech_duration_ms;
        let max_speech_duration_s = value.max_speech_duration_s;
        let sample_rate = value.sample_rate;
        let sr_per_ms = sample_rate / 1000;
        let frame_size_samples = frame_size * sr_per_ms;
        let min_speech_samples = sr_per_ms * min_speech_duration_ms;
        let speech_pad_samples = sr_per_ms * speech_pad_ms;
        let max_speech_samples = sample_rate as f32 * max_speech_duration_s
            - frame_size_samples as f32
            - 2.0 * speech_pad_samples as f32;
        let min_silence_samples = sr_per_ms * min_silence_duration_ms;
        let min_silence_samples_at_max_speech = sr_per_ms * 98;
        Self {
            frame_size,
            threshold,
            min_silence_duration_ms,
            speech_pad_ms,
            min_speech_duration_ms,
            max_speech_duration_s,
            sample_rate,
            sr_per_ms,
            frame_size_samples,
            min_speech_samples,
            speech_pad_samples,
            max_speech_samples,
            min_silence_samples,
            min_silence_samples_at_max_speech,
        }
    }
}

#[derive(Debug, Default)]
struct State {
    current_sample: usize,
    temp_end: usize,
    next_start: usize,
    prev_end: usize,
    triggered: bool,
    current_speech: utils::TimeStamp,
    speeches: Vec<utils::TimeStamp>,
}

impl State {
    fn new() -> Self {
        Default::default()
    }

    fn update(&mut self, params: &Params, speech_prob: f32) {
        self.current_sample += params.frame_size_samples;
        if speech_prob > params.threshold {
            if self.temp_end != 0 {
                self.temp_end = 0;
                if self.next_start < self.prev_end {
                    self.next_start = self
                        .current_sample
                        .saturating_sub(params.frame_size_samples)
                }
            }
            if !self.triggered {
                self.debug(speech_prob, params, "start");
                self.triggered = true;
                self.current_speech.start =
                    self.current_sample as i64 - params.frame_size_samples as i64;
            }
            return;
        }
        if self.triggered
            && (self.current_sample as i64 - self.current_speech.start) as f32
                > params.max_speech_samples
        {
            if self.prev_end > 0 {
                self.current_speech.end = self.prev_end as _;
                self.take_speech();
                if self.next_start < self.prev_end {
                    self.triggered = false
                } else {
                    self.current_speech.start = self.next_start as _;
                }
                self.prev_end = 0;
                self.next_start = 0;
                self.temp_end = 0;
            } else {
                self.current_speech.end = self.current_sample as _;
                self.take_speech();
                self.prev_end = 0;
                self.next_start = 0;
                self.temp_end = 0;
                self.triggered = false;
            }
            return;
        }
        if speech_prob >= (params.threshold - 0.15) && (speech_prob < params.threshold) {
            if self.triggered {
                self.debug(speech_prob, params, "speaking")
            } else {
                self.debug(speech_prob, params, "silence")
            }
        }
        if self.triggered && speech_prob < (params.threshold - 0.15) {
            self.debug(speech_prob, params, "end");
            if self.temp_end == 0 {
                self.temp_end = self.current_sample;
            }
            if self.current_sample.saturating_sub(self.temp_end)
                > params.min_silence_samples_at_max_speech
            {
                self.prev_end = self.temp_end;
            }
            if self.current_sample.saturating_sub(self.temp_end) >= params.min_silence_samples {
                self.current_speech.end = self.temp_end as _;
                if self.current_speech.end - self.current_speech.start
                    > params.min_speech_samples as _
                {
                    self.take_speech();
                    self.prev_end = 0;
                    self.next_start = 0;
                    self.temp_end = 0;
                    self.triggered = false;
                }
            }
        }
    }

    fn take_speech(&mut self) {
        self.speeches.push(std::mem::take(&mut self.current_speech)); // current_speech becomes TimeStamp::default() due to take()
    }

    fn check_for_last_speech(&mut self, last_sample: usize) {
        if self.current_speech.start > 0 {
            self.current_speech.end = last_sample as _;
            self.take_speech();
            self.prev_end = 0;
            self.next_start = 0;
            self.temp_end = 0;
            self.triggered = false;
        }
    }

    fn debug(&self, speech_prob: f32, params: &Params, title: &str) {
        if DEBUG_SPEECH_PROB {
            let speech = self.current_sample as f32
                - params.frame_size_samples as f32
                - if title == "end" {
                    params.speech_pad_samples
                } else {
                    0
                } as f32; // minus frame_size_samples to get a precise start time point
            println!(
                "[{:10}: {:.3} s ({:.3}) {:8}]",
                title,
                speech / params.sample_rate as f32,
                speech_prob,
                self.current_sample - params.frame_size_samples,
            );
        }
    }
}
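The `max_speech_samples` expression in `Params::from` is easiest to read with concrete numbers. A sketch assuming a hypothetical 30 s cap (the shipped default is `f32::INFINITY`, which disables the cut entirely):

```rust
// Worked instance of the max_speech_samples formula from Params::from,
// with a hypothetical 30 s max_speech_duration_s instead of the infinite default.
fn main() {
    let sample_rate = 16000.0f32;
    let frame_size_samples = 512.0f32;    // 32 ms frames
    let speech_pad_samples = 64.0 * 16.0; // 64 ms pad -> 1024 samples
    let max_speech_samples =
        sample_rate * 30.0 - frame_size_samples - 2.0 * speech_pad_samples;
    assert_eq!(max_speech_samples, 477_440.0);
}
```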
Binary files changed (contents not shown): files/de.wav, files/en_num.wav, files/es.wav, files/model.jit, files/model.onnx, files/ru.wav, files/ru_num.wav, and several other binary files.

Deleted one-line JSON file (the id-to-language dictionary; judging by the paths referenced in hubconf.py below, presumably files/lang_dict_95.json):
@@ -1 +0,0 @@
{"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"}

Deleted one-line JSON file (the language-group dictionary; presumably files/lang_group_dict_95.json):
@@ -1 +0,0 @@
{"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], "42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]}
hubconf.py (184 changed lines)
@@ -1,154 +1,56 @@
 dependencies = ['torch', 'torchaudio']
 import torch
-import json
-from utils_vad import (init_jit_model,
-                       get_speech_ts,
-                       get_speech_ts_adaptive,
-                       get_number_ts,
-                       get_language,
-                       get_language_and_group,
-                       save_audio,
-                       read_audio,
-                       state_generator,
-                       single_audio_stream,
-                       collect_chunks,
-                       drop_chunks)
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+from silero_vad.utils_vad import (init_jit_model,
+                                  get_speech_timestamps,
+                                  save_audio,
+                                  read_audio,
+                                  VADIterator,
+                                  collect_chunks,
+                                  OnnxWrapper)
 
 
-def silero_vad(**kwargs):
+def versiontuple(v):
+    splitted = v.split('+')[0].split(".")
+    version_list = []
+    for i in splitted:
+        try:
+            version_list.append(int(i))
+        except:
+            version_list.append(0)
+    return tuple(version_list)
+
+
+def silero_vad(onnx=False, force_onnx_cpu=False, opset_version=16):
     """Silero Voice Activity Detector
     Returns a model with a set of utils
     Please see https://github.com/snakers4/silero-vad for usage examples
     """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model.jit')
-    utils = (get_speech_ts,
-             get_speech_ts_adaptive,
+    available_ops = [15, 16]
+    if onnx and opset_version not in available_ops:
+        raise Exception(f'Available ONNX opset_version: {available_ops}')
+
+    if not onnx:
+        installed_version = torch.__version__
+        supported_version = '1.12.0'
+        if versiontuple(installed_version) < versiontuple(supported_version):
+            raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)')
+
+    model_dir = os.path.join(os.path.dirname(__file__), 'src', 'silero_vad', 'data')
+    if onnx:
+        if opset_version == 16:
+            model_name = 'silero_vad.onnx'
+        else:
+            model_name = f'silero_vad_16k_op{opset_version}.onnx'
+        model = OnnxWrapper(os.path.join(model_dir, model_name), force_onnx_cpu)
+    else:
+        model = init_jit_model(os.path.join(model_dir, 'silero_vad.jit'))
+    utils = (get_speech_timestamps,
              save_audio,
              read_audio,
-             state_generator,
-             single_audio_stream,
+             VADIterator,
              collect_chunks)
 
     return model, utils
-
-
-def silero_vad_micro(**kwargs):
-    """Silero Voice Activity Detector
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_micro.jit')
-    utils = (get_speech_ts,
-             get_speech_ts_adaptive,
-             save_audio,
-             read_audio,
-             state_generator,
-             single_audio_stream,
-             collect_chunks)
-
-    return model, utils
-
-
-def silero_vad_micro_8k(**kwargs):
-    """Silero Voice Activity Detector
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_micro_8k.jit')
-    utils = (get_speech_ts,
-             get_speech_ts_adaptive,
-             save_audio,
-             read_audio,
-             state_generator,
-             single_audio_stream,
-             collect_chunks)
-
-    return model, utils
-
-
-def silero_vad_mini(**kwargs):
-    """Silero Voice Activity Detector
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_mini.jit')
-    utils = (get_speech_ts,
-             get_speech_ts_adaptive,
-             save_audio,
-             read_audio,
-             state_generator,
-             single_audio_stream,
-             collect_chunks)
-
-    return model, utils
-
-
-def silero_vad_mini_8k(**kwargs):
-    """Silero Voice Activity Detector
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/model_mini_8k.jit')
-    utils = (get_speech_ts,
-             get_speech_ts_adaptive,
-             save_audio,
-             read_audio,
-             state_generator,
-             single_audio_stream,
-             collect_chunks)
-
-    return model, utils
-
-
-def silero_number_detector(**kwargs):
-    """Silero Number Detector
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/number_detector.jit')
-    utils = (get_number_ts,
-             save_audio,
-             read_audio,
-             collect_chunks,
-             drop_chunks)
-
-    return model, utils
-
-
-def silero_lang_detector(**kwargs):
-    """Silero Language Classifier
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/number_detector.jit')
-    utils = (get_language,
-             read_audio)
-
-    return model, utils
-
-
-def silero_lang_detector_95(**kwargs):
-    """Silero Language Classifier (95 languages)
-    Returns a model with a set of utils
-    Please see https://github.com/snakers4/silero-vad for usage examples
-    """
-
-    hub_dir = torch.hub.get_dir()
-    model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/lang_classifier_95.jit')
-
-    with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_95.json', 'r') as f:
-        lang_dict = json.load(f)
-
-    with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_95.json', 'r') as f:
-        lang_group_dict = json.load(f)
-
-    utils = (get_language_and_group, read_audio)
-
-    return model, lang_dict, lang_group_dict, utils
pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "silero-vad"
version = "6.2.0"
authors = [
  {name="Silero Team", email="hello@silero.ai"},
]
description = "Voice Activity Detector (VAD) by Silero"
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Intended Audience :: Science/Research",
    "Intended Audience :: Developers",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Programming Language :: Python :: 3.15",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering",
]
dependencies = [
    "packaging",
    "torch>=1.12.0",
    "torchaudio>=0.12.0",
    "onnxruntime>=1.16.1",
]

[project.urls]
Homepage = "https://github.com/snakers4/silero-vad"
Issues = "https://github.com/snakers4/silero-vad/issues"

[project.optional-dependencies]
test = [
    "pytest",
    "soundfile",
    "torch<2.9",
]
915
silero-vad.ipynb
915
silero-vad.ipynb
@@ -1,23 +1,5 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"id": "sVNOuHQQjsrp"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# PyTorch Examples"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"id": "FpMplOCA2Fwp"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"## VAD"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@@ -25,7 +7,7 @@
|
|||||||
"id": "62A6F_072Fwq"
|
"id": "62A6F_072Fwq"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"### Install Dependencies"
|
"## Install Dependencies"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -40,28 +22,51 @@
|
|||||||
"#@title Install and Import Dependencies\n",
|
"#@title Install and Import Dependencies\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# this assumes that you have a relevant version of PyTorch installed\n",
|
"# this assumes that you have a relevant version of PyTorch installed\n",
|
||||||
"!pip install -q torchaudio soundfile\n",
|
"!pip install -q torchaudio\n",
|
||||||
|
"\n",
|
||||||
|
"SAMPLING_RATE = 16000\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import glob\n",
|
|
||||||
"import torch\n",
|
"import torch\n",
|
||||||
"torch.set_num_threads(1)\n",
|
"torch.set_num_threads(1)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from IPython.display import Audio\n",
|
"from IPython.display import Audio\n",
|
||||||
"from pprint import pprint\n",
|
"from pprint import pprint\n",
|
||||||
|
"# download example\n",
|
||||||
|
"torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "pSifus5IilRp"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"USE_PIP = True # download model using pip package or torch.hub\n",
|
||||||
|
"USE_ONNX = False # change this to True if you want to test onnx model\n",
|
||||||
|
"if USE_ONNX:\n",
|
||||||
|
" !pip install -q onnxruntime\n",
|
||||||
|
"if USE_PIP:\n",
|
||||||
|
" !pip install -q silero-vad\n",
|
||||||
|
" from silero_vad import (load_silero_vad,\n",
|
||||||
|
" read_audio,\n",
|
||||||
|
" get_speech_timestamps,\n",
|
||||||
|
" save_audio,\n",
|
||||||
|
" VADIterator,\n",
|
||||||
|
" collect_chunks)\n",
|
||||||
|
" model = load_silero_vad(onnx=USE_ONNX)\n",
|
||||||
|
"else:\n",
|
||||||
|
" model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
||||||
|
" model='silero_vad',\n",
|
||||||
|
" force_reload=True,\n",
|
||||||
|
" onnx=USE_ONNX)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
" (get_speech_timestamps,\n",
|
||||||
" model='silero_vad',\n",
|
" save_audio,\n",
|
||||||
" force_reload=True)\n",
|
" read_audio,\n",
|
||||||
"\n",
|
" VADIterator,\n",
|
||||||
"(get_speech_ts,\n",
|
" collect_chunks) = utils"
|
||||||
" get_speech_ts_adaptive,\n",
|
|
||||||
" save_audio,\n",
|
|
||||||
" read_audio,\n",
|
|
||||||
" state_generator,\n",
|
|
||||||
" single_audio_stream,\n",
|
|
||||||
" collect_chunks) = utils\n",
|
|
||||||
"\n",
|
|
||||||
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -70,16 +75,7 @@
|
|||||||
"id": "fXbbaUO3jsrw"
|
"id": "fXbbaUO3jsrw"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"### Full Audio"
|
"## Speech timestapms from full audio"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"id": "dY2Us3_Q2Fws"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"**Classic way of getting speech chunks, you may need to select the thresholds yourself**"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -90,10 +86,9 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"wav = read_audio(f'{files_dir}/en.wav')\n",
|
"wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
|
||||||
"# get speech timestamps from full audio file\n",
|
"# get speech timestamps from full audio file\n",
|
||||||
"speech_timestamps = get_speech_ts(wav, model,\n",
|
"speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n",
|
||||||
" num_steps=4)\n",
|
|
||||||
"pprint(speech_timestamps)"
|
"pprint(speech_timestamps)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -107,45 +102,31 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# merge all speech chunks to one audio\n",
|
"# merge all speech chunks to one audio\n",
|
||||||
"save_audio('only_speech.wav',\n",
|
"save_audio('only_speech.wav',\n",
|
||||||
" collect_chunks(speech_timestamps, wav), 16000) \n",
|
" collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)\n",
|
||||||
"Audio('only_speech.wav')"
|
"Audio('only_speech.wav')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "n8plzbJU2Fws"
|
"id": "zeO1xCqxUC6w"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**"
|
"## Entire audio inference"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "SQOtu2Vl2Fwt"
|
"id": "LjZBcsaTT7Mk"
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"wav = read_audio(f'{files_dir}/en.wav')\n",
|
"wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
|
||||||
"# get speech timestamps from full audio file\n",
|
"# audio is being splitted into 31.25 ms long pieces\n",
|
||||||
"speech_timestamps = get_speech_ts_adaptive(wav, model, step=500, num_samples_per_window=4000)\n",
|
"# so output length equals ceil(input_length * 31.25 / SAMPLING_RATE)\n",
|
||||||
"pprint(speech_timestamps)"
|
"predicts = model.audio_forward(wav, sr=SAMPLING_RATE)"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"id": "Lr6zCGXh2Fwt"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# merge all speech chunks to one audio\n",
|
|
||||||
"save_audio('only_speech.wav',\n",
|
|
||||||
" collect_chunks(speech_timestamps, wav), 16000) \n",
|
|
||||||
"Audio('only_speech.wav')"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -154,16 +135,7 @@
|
|||||||
"id": "iDKQbVr8jsry"
|
"id": "iDKQbVr8jsry"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"### Single Audio Stream"
|
"## Stream imitation example"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"id": "xCM-HrUR2Fwu"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"**Classic way of getting speech chunks, you may need to select the thresholds yourself**"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -174,20 +146,20 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"wav = f'{files_dir}/en.wav'\n",
|
"## using VADIterator class\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for batch in single_audio_stream(model, wav):\n",
|
"vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)\n",
|
||||||
" if batch:\n",
|
"wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)\n",
|
||||||
" print(batch)"
|
"\n",
|
||||||
]
|
"window_size_samples = 512 if SAMPLING_RATE == 16000 else 256\n",
|
||||||
},
|
"for i in range(0, len(wav), window_size_samples):\n",
|
||||||
{
|
" chunk = wav[i: i+ window_size_samples]\n",
|
||||||
"cell_type": "markdown",
|
" if len(chunk) < window_size_samples:\n",
|
||||||
"metadata": {
|
" break\n",
|
||||||
"id": "t8TXtnvk2Fwv"
|
" speech_dict = vad_iterator(chunk, return_seconds=True)\n",
|
||||||
},
|
" if speech_dict:\n",
|
||||||
"source": [
|
" print(speech_dict, end=' ')\n",
|
||||||
"**Experimental Adaptive method, algorithm selects thresholds itself (see readme for more information)**"
|
"vad_iterator.reset_states() # reset model states after each audio"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -198,755 +170,20 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"wav = f'{files_dir}/en.wav'\n",
|
"## just probabilities\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for batch in single_audio_stream(model, wav, iterator_type='adaptive'):\n",
|
"wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
|
||||||
" if batch:\n",
|
"speech_probs = []\n",
|
||||||
" print(batch)"
|
"window_size_samples = 512 if SAMPLING_RATE == 16000 else 256\n",
|
||||||
]
|
"for i in range(0, len(wav), window_size_samples):\n",
|
||||||
},
|
" chunk = wav[i: i+ window_size_samples]\n",
|
||||||
{
|
" if len(chunk) < window_size_samples:\n",
|
||||||
"cell_type": "markdown",
|
" break\n",
|
||||||
"metadata": {
|
" speech_prob = model(chunk, SAMPLING_RATE).item()\n",
|
||||||
"heading_collapsed": true,
|
" speech_probs.append(speech_prob)\n",
|
||||||
"id": "KBDVybJCjsrz"
|
"vad_iterator.reset_states() # reset model states after each audio\n",
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Multiple Audio Streams"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"hidden": true,
|
|
||||||
"id": "BK4tGfWgjsrz"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"audios_for_stream = glob.glob(f'{files_dir}/*.wav')\n",
|
|
||||||
"len(audios_for_stream) # total 4 audios"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"hidden": true,
|
|
||||||
"id": "v1l8sam1jsrz"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"for batch in state_generator(model, audios_for_stream, audios_in_stream=2): # 2 audio stream\n",
|
|
||||||
" if batch:\n",
|
|
||||||
" pprint(batch)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"heading_collapsed": true,
|
|
||||||
"id": "36jY0niD2Fww"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"## Number detector"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"heading_collapsed": true,
|
|
||||||
"hidden": true,
|
|
||||||
"id": "scd1DlS42Fwx"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Install Dependencies"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"hidden": true,
|
|
||||||
"id": "Kq5gQuYq2Fwx"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Install and Import Dependencies\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# this assumes that you have a relevant version of PyTorch installed\n",
|
"print(speech_probs[:10]) # first 10 chunks predicts"
|
||||||
"!pip install -q torchaudio soundfile\n",
|
|
||||||
"\n",
|
|
||||||
"import glob\n",
|
|
||||||
"import torch\n",
|
|
||||||
"torch.set_num_threads(1)\n",
|
|
||||||
"\n",
|
|
||||||
"from IPython.display import Audio\n",
|
|
||||||
"from pprint import pprint\n",
|
|
||||||
"\n",
|
|
||||||
"model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
|
||||||
" model='silero_number_detector',\n",
|
|
||||||
" force_reload=True)\n",
|
|
||||||
"\n",
|
|
||||||
"(get_number_ts,\n",
|
|
||||||
" save_audio,\n",
|
|
||||||
" read_audio,\n",
|
|
||||||
" collect_chunks,\n",
|
|
||||||
" drop_chunks) = utils\n",
|
|
||||||
"\n",
|
|
||||||
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"heading_collapsed": true,
|
|
||||||
"hidden": true,
|
|
||||||
"id": "qhPa30ij2Fwy"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"### Full audio"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"hidden": true,
|
|
||||||
"id": "EXpau6xq2Fwy"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"wav = read_audio(f'{files_dir}/en_num.wav')\n",
|
|
||||||
"# get number timestamps from full audio file\n",
|
|
||||||
"number_timestamps = get_number_ts(wav, model)\n",
|
|
||||||
"pprint(number_timestamps)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"hidden": true,
|
|
||||||
"id": "u-KfXRhZ2Fwy"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"sample_rate = 16000\n",
|
|
||||||
"# convert ms in timestamps to samples\n",
|
|
||||||
"for timestamp in number_timestamps:\n",
|
|
||||||
" timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n",
|
|
||||||
" timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)"
|
|
||||||
]
|
|
||||||
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "iwYEC4aZ2Fwy"
},
"outputs": [],
"source": [
"# merge all number chunks to one audio\n",
"save_audio('only_numbers.wav',\n",
"           collect_chunks(number_timestamps, wav), sample_rate)\n",
"Audio('only_numbers.wav')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "fHaYejX12Fwy"
},
"outputs": [],
"source": [
"# drop all number chunks from audio\n",
"save_audio('no_numbers.wav',\n",
"           drop_chunks(number_timestamps, wav), sample_rate)\n",
"Audio('no_numbers.wav')"
]
},
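{
"cell_type": "markdown",
"metadata": {},
"source": [
"The detect / rescale / cut steps above can be folded into a single helper. A minimal sketch using only the utils unpacked earlier; the helper name `strip_numbers` and the output file name are ours, not part of the repo."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical convenience wrapper around the steps above\n",
"def strip_numbers(wav_path, out_path, sample_rate=16000):\n",
"    wav = read_audio(wav_path)\n",
"    ts = get_number_ts(wav, model)  # timestamps in ms\n",
"    for t in ts:  # rescale ms -> samples\n",
"        t['start'] = int(t['start'] * sample_rate / 1000)\n",
"        t['end'] = int(t['end'] * sample_rate / 1000)\n",
"    save_audio(out_path, drop_chunks(ts, wav), sample_rate)\n",
"\n",
"strip_numbers(f'{files_dir}/en_num.wav', 'no_numbers_again.wav')"
]
},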
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"id": "PnKtJKbq2Fwz"
},
"source": [
"## Language detector"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true,
"id": "F5cAmMbP2Fwz"
},
"source": [
"### Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "Zu9D0t6n2Fwz"
},
"outputs": [],
"source": [
"#@title Install and Import Dependencies\n",
"\n",
"# this assumes that you have a relevant version of PyTorch installed\n",
"!pip install -q torchaudio soundfile\n",
"\n",
"import glob\n",
"import torch\n",
"torch.set_num_threads(1)\n",
"\n",
"from IPython.display import Audio\n",
"from pprint import pprint\n",
"\n",
"model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
"                              model='silero_lang_detector',\n",
"                              force_reload=True)\n",
"\n",
"(get_language,\n",
" read_audio) = utils\n",
"\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true,
"id": "iC696eMX2Fwz"
},
"source": [
"### Full audio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "c8UYnYBF2Fw0"
},
"outputs": [],
"source": [
"wav = read_audio(f'{files_dir}/en.wav')\n",
"lang = get_language(wav, model)\n",
"print(lang)"
]
},
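{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_language` can be mapped over a whole directory in the same way. A minimal sketch over the bundled demo files; the loop is illustrative, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical: detect the language of every bundled demo file\n",
"for path in glob.glob(f'{files_dir}/*.wav'):\n",
"    print(path, get_language(read_audio(path), model))"
]
},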
{
"cell_type": "markdown",
"metadata": {
"id": "57avIBd6jsrz"
},
"source": [
"# ONNX Example"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hEhnfORV2Fw0"
},
"source": [
"## VAD"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"id": "bL4kn4KJrlyL"
},
"source": [
"### Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"hidden": true,
"id": "Q4QIfSpprnkI"
},
"outputs": [],
"source": [
"#@title Install and Import Dependencies\n",
"\n",
"# this assumes that you have a relevant version of PyTorch installed\n",
"!pip install -q torchaudio soundfile onnxruntime\n",
"\n",
"import glob\n",
"import torch\n",
"import onnxruntime\n",
"from pprint import pprint\n",
"\n",
"from IPython.display import Audio\n",
"\n",
"_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
"                          model='silero_vad',\n",
"                          force_reload=True)\n",
"\n",
"(get_speech_ts,\n",
" get_speech_ts_adaptive,\n",
" save_audio,\n",
" read_audio,\n",
" state_generator,\n",
" single_audio_stream,\n",
" collect_speeches) = utils\n",
"\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n",
"\n",
"def init_onnx_model(model_path: str):\n",
"    return onnxruntime.InferenceSession(model_path)\n",
"\n",
"def validate_onnx(model, inputs):\n",
"    with torch.no_grad():\n",
"        ort_inputs = {'input': inputs.cpu().numpy()}\n",
"        outs = model.run(None, ort_inputs)\n",
"        outs = [torch.Tensor(x) for x in outs]\n",
"        return outs[0]"
]
},
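{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running the full pipeline it can help to smoke-test the ONNX session with a dummy chunk. The input name `'input'` comes from `validate_onnx` above, but the `[batch, samples]` layout and the 4000-sample chunk length are our assumptions, not something this notebook pins down."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical smoke test for the ONNX helpers defined above\n",
"test_model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"dummy = torch.randn(1, 4000)  # assumed [batch, samples] chunk shape\n",
"print(validate_onnx(test_model, dummy).shape)"
]
},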
{
"cell_type": "markdown",
"metadata": {
"id": "5JHErdB7jsr0"
},
"source": [
"### Full Audio"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TNEtK5zi2Fw2"
},
"source": [
"**Classic way of getting speech chunks; you may need to select the thresholds yourself**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "krnGoA6Kjsr0"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"wav = read_audio(f'{files_dir}/en.wav')\n",
"\n",
"# get speech timestamps from full audio file\n",
"speech_timestamps = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx)\n",
"pprint(speech_timestamps)"
]
},
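{
"cell_type": "markdown",
"metadata": {},
"source": [
"One quick way to judge the chosen thresholds is to look at how much audio they keep. A minimal sketch, assuming the timestamps are sample indices at 16 kHz as in the surrounding cells."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical: summarize what the detector kept\n",
"sample_rate = 16000\n",
"kept = sum(t['end'] - t['start'] for t in speech_timestamps)\n",
"print(f'{len(speech_timestamps)} segments, {kept / sample_rate:.2f} s of speech')"
]
},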
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B176Lzfnjsr1"
},
"outputs": [],
"source": [
"# merge all speech chunks to one audio\n",
"save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), 16000)\n",
"Audio('only_speech.wav')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "21RE8KEC2Fw2"
},
"source": [
"**Experimental adaptive method; the algorithm selects the thresholds itself (see the readme for more information)**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uIVs56rb2Fw2"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"wav = read_audio(f'{files_dir}/en.wav')\n",
"\n",
"# get speech timestamps from full audio file\n",
"speech_timestamps = get_speech_ts_adaptive(wav, model, run_function=validate_onnx)\n",
"pprint(speech_timestamps)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cox6oumC2Fw3"
},
"outputs": [],
"source": [
"# merge all speech chunks to one audio\n",
"save_audio('only_speech.wav', collect_chunks(speech_timestamps, wav), 16000)\n",
"Audio('only_speech.wav')"
]
},
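{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since both methods were just run on the same file, their segmentations can be compared directly. A minimal sketch; the side-by-side count is our own illustration, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical: compare the classic and adaptive segmentations\n",
"classic_ts = get_speech_ts(wav, model, num_steps=4, run_function=validate_onnx)\n",
"adaptive_ts = get_speech_ts_adaptive(wav, model, run_function=validate_onnx)\n",
"print(len(classic_ts), 'classic segments vs.', len(adaptive_ts), 'adaptive segments')"
]
},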
{
"cell_type": "markdown",
"metadata": {
"id": "Rio9W50gjsr1"
},
"source": [
"### Single Audio Stream"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "i8EZwtaA2Fw3"
},
"source": [
"**Classic way of getting speech chunks; you may need to select the thresholds yourself**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IPkl8Yy1jsr1"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"wav = f'{files_dir}/en.wav'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NC6Jim0hjsr1"
},
"outputs": [],
"source": [
"for batch in single_audio_stream(model, wav, run_function=validate_onnx):\n",
"    if batch:\n",
"        pprint(batch)"
]
},
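{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `single_audio_stream` yields results incrementally, they can be accumulated as they arrive instead of printed. A minimal sketch; flattening the batches into one list is our own convention and assumes each non-empty batch is iterable."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical: collect all streamed detections into one list\n",
"stream_results = []\n",
"for batch in single_audio_stream(model, wav, run_function=validate_onnx):\n",
"    if batch:\n",
"        stream_results.extend(batch)\n",
"print(len(stream_results), 'stream events')"
]
},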
{
"cell_type": "markdown",
"metadata": {
"id": "0pSKslpz2Fw3"
},
"source": [
"**Experimental adaptive method; the algorithm selects the thresholds itself (see the readme for more information)**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RZwc-Khk2Fw4"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"wav = f'{files_dir}/en.wav'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Z4lzFPs02Fw4"
},
"outputs": [],
"source": [
"for batch in single_audio_stream(model, wav, iterator_type='adaptive', run_function=validate_onnx):\n",
"    if batch:\n",
"        pprint(batch)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"id": "WNZ42u0ajsr1"
},
"source": [
"### Multiple Audio Streams"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "XjhGQGppjsr1"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/model.onnx')\n",
"audios_for_stream = glob.glob(f'{files_dir}/*.wav')\n",
"pprint(len(audios_for_stream)) # 4 audio files in total"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "QI7-arlqjsr2"
},
"outputs": [],
"source": [
"for batch in state_generator(model, audios_for_stream, audios_in_stream=2, run_function=validate_onnx): # 2 audio streams\n",
"    if batch:\n",
"        pprint(batch)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"id": "7QMvUvpg2Fw4"
},
"source": [
"## Number detector"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true,
"id": "tBPDkpHr2Fw4"
},
"source": [
"### Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"hidden": true,
"id": "PdjGd56R2Fw5"
},
"outputs": [],
"source": [
"#@title Install and Import Dependencies\n",
"\n",
"# this assumes that you have a relevant version of PyTorch installed\n",
"!pip install -q torchaudio soundfile onnxruntime\n",
"\n",
"import glob\n",
"import torch\n",
"import onnxruntime\n",
"from pprint import pprint\n",
"\n",
"from IPython.display import Audio\n",
"\n",
"_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
"                          model='silero_number_detector',\n",
"                          force_reload=True)\n",
"\n",
"(get_number_ts,\n",
" save_audio,\n",
" read_audio,\n",
" collect_chunks,\n",
" drop_chunks) = utils\n",
"\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n",
"\n",
"def init_onnx_model(model_path: str):\n",
"    return onnxruntime.InferenceSession(model_path)\n",
"\n",
"def validate_onnx(model, inputs):\n",
"    with torch.no_grad():\n",
"        ort_inputs = {'input': inputs.cpu().numpy()}\n",
"        outs = model.run(None, ort_inputs)\n",
"        outs = [torch.Tensor(x) for x in outs]\n",
"        return outs"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true,
"id": "I9QWSFZh2Fw5"
},
"source": [
"### Full Audio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "_r6QZiwu2Fw5"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n",
"wav = read_audio(f'{files_dir}/en_num.wav')\n",
"\n",
"# get number timestamps from full audio file\n",
"number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n",
"pprint(number_timestamps)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "FN4aDwLV2Fw5"
},
"outputs": [],
"source": [
"sample_rate = 16000\n",
"# convert ms in timestamps to samples\n",
"for timestamp in number_timestamps:\n",
"    timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n",
"    timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "JnvS6WTK2Fw5"
},
"outputs": [],
"source": [
"# merge all number chunks to one audio\n",
"save_audio('only_numbers.wav',\n",
"           collect_chunks(number_timestamps, wav), 16000)\n",
"Audio('only_numbers.wav')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "yUxOcOFG2Fw6"
},
"outputs": [],
"source": [
"# drop all number chunks from audio\n",
"save_audio('no_numbers.wav',\n",
"           drop_chunks(number_timestamps, wav), 16000)\n",
"Audio('no_numbers.wav')"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"id": "SR8Bgcd52Fw6"
},
"source": [
"## Language detector"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true,
"id": "PBnXPtKo2Fw6"
},
"source": [
"### Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"hidden": true,
"id": "iNkDWJ3H2Fw6"
},
"outputs": [],
"source": [
"#@title Install and Import Dependencies\n",
"\n",
"# this assumes that you have a relevant version of PyTorch installed\n",
"!pip install -q torchaudio soundfile onnxruntime\n",
"\n",
"import glob\n",
"import torch\n",
"import onnxruntime\n",
"from pprint import pprint\n",
"\n",
"from IPython.display import Audio\n",
"\n",
"_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
"                          model='silero_lang_detector',\n",
"                          force_reload=True)\n",
"\n",
"(get_language,\n",
" read_audio) = utils\n",
"\n",
"files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n",
"\n",
"def init_onnx_model(model_path: str):\n",
"    return onnxruntime.InferenceSession(model_path)\n",
"\n",
"def validate_onnx(model, inputs):\n",
"    with torch.no_grad():\n",
"        ort_inputs = {'input': inputs.cpu().numpy()}\n",
"        outs = model.run(None, ort_inputs)\n",
"        outs = [torch.Tensor(x) for x in outs]\n",
"        return outs"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"id": "G8N8oP4q2Fw6"
},
"source": [
"### Full Audio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"hidden": true,
"id": "WHXnh9IV2Fw6"
},
"outputs": [],
"source": [
"model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n",
"wav = read_audio(f'{files_dir}/en.wav')\n",
"\n",
"lang = get_language(wav, model, run_function=validate_onnx)\n",
"print(lang)"
]
},
]
}
],
13 src/silero_vad/__init__.py Normal file
@@ -0,0 +1,13 @@
from importlib.metadata import version
try:
    __version__ = version(__name__)
except:
    pass

from silero_vad.model import load_silero_vad
from silero_vad.utils_vad import (get_speech_timestamps,
                                  save_audio,
                                  read_audio,
                                  VADIterator,
                                  collect_chunks,
                                  drop_chunks)
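
The package entry point above defines the pip-installable API. A minimal usage sketch, assuming the package is installed (e.g. `pip install silero-vad`) and that `load_silero_vad` returns a ready-to-use model; the file name `en.wav` is illustrative:

    from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

    model = load_silero_vad()
    wav = read_audio('en.wav')  # 16 kHz mono audio
    speech_timestamps = get_speech_timestamps(wav, model)
    print(speech_timestamps)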
0 src/silero_vad/data/__init__.py Normal file
BIN src/silero_vad/data/silero_vad.jit Normal file (binary file not shown)
BIN src/silero_vad/data/silero_vad.onnx Normal file (binary file not shown)
BIN src/silero_vad/data/silero_vad_16k.safetensors Executable file (binary file not shown)
BIN src/silero_vad/data/silero_vad_16k_op15.onnx Normal file (binary file not shown)
BIN src/silero_vad/data/silero_vad_half.onnx Normal file (binary file not shown)
BIN src/silero_vad/data/silero_vad_op18_ifless.onnx Normal file (binary file not shown)
Some files were not shown because too many files have changed in this diff.