mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-04 17:39:22 +08:00
Compare commits
10 Commits
snakers4-p
...
v5.1.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
032ca21a70 | ||
|
|
001d57d6ff | ||
|
|
6e6da04e7a | ||
|
|
9c1eff9169 | ||
|
|
36b759d053 | ||
|
|
1a7499607a | ||
|
|
87451b059f | ||
|
|
d23867da10 | ||
|
|
2043282182 | ||
|
|
fa8036ae1c |
@@ -17,6 +17,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!apt install ffmpeg\n",
|
||||
"!pip -q install pydub\n",
|
||||
"from google.colab import output\n",
|
||||
"from base64 import b64decode, b64encode\n",
|
||||
@@ -37,13 +38,12 @@
|
||||
" model='silero_vad',\n",
|
||||
" force_reload=True)\n",
|
||||
"\n",
|
||||
"def int2float(sound):\n",
|
||||
" abs_max = np.abs(sound).max()\n",
|
||||
" sound = sound.astype('float32')\n",
|
||||
" if abs_max > 0:\n",
|
||||
" sound *= 1/32768\n",
|
||||
" sound = sound.squeeze()\n",
|
||||
" return sound\n",
|
||||
"def int2float(audio):\n",
|
||||
" samples = audio.get_array_of_samples()\n",
|
||||
" new_sound = audio._spawn(samples)\n",
|
||||
" arr = np.array(samples).astype(np.float32)\n",
|
||||
" arr = arr / np.abs(arr).max()\n",
|
||||
" return arr\n",
|
||||
"\n",
|
||||
"AUDIO_HTML = \"\"\"\n",
|
||||
"<script>\n",
|
||||
@@ -68,10 +68,10 @@
|
||||
" //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
|
||||
" mimeType : 'audio/webm;codecs=opus'\n",
|
||||
" //mimeType : 'audio/webm;codecs=pcm'\n",
|
||||
" }; \n",
|
||||
" };\n",
|
||||
" //recorder = new MediaRecorder(stream, options);\n",
|
||||
" recorder = new MediaRecorder(stream);\n",
|
||||
" recorder.ondataavailable = function(e) { \n",
|
||||
" recorder.ondataavailable = function(e) {\n",
|
||||
" var url = URL.createObjectURL(e.data);\n",
|
||||
" // var preview = document.createElement('audio');\n",
|
||||
" // preview.controls = true;\n",
|
||||
@@ -79,7 +79,7 @@
|
||||
" // document.body.appendChild(preview);\n",
|
||||
"\n",
|
||||
" reader = new FileReader();\n",
|
||||
" reader.readAsDataURL(e.data); \n",
|
||||
" reader.readAsDataURL(e.data);\n",
|
||||
" reader.onloadend = function() {\n",
|
||||
" base64data = reader.result;\n",
|
||||
" //console.log(\"Inside FileReader:\" + base64data);\n",
|
||||
@@ -121,7 +121,7 @@
|
||||
"\n",
|
||||
"}\n",
|
||||
"});\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"</script>\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
@@ -133,8 +133,8 @@
|
||||
" audio.export('test.mp3', format='mp3')\n",
|
||||
" audio = audio.set_channels(1)\n",
|
||||
" audio = audio.set_frame_rate(16000)\n",
|
||||
" audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
|
||||
" audio_tens = torch.tensor(audio_float )\n",
|
||||
" audio_float = int2float(audio)\n",
|
||||
" audio_tens = torch.tensor(audio_float)\n",
|
||||
" return audio_tens\n",
|
||||
"\n",
|
||||
"def make_animation(probs, audio_duration, interval=40):\n",
|
||||
@@ -154,19 +154,18 @@
|
||||
" def animate(i):\n",
|
||||
" x = i * interval / 1000 - 0.04\n",
|
||||
" y = np.linspace(0, 1.02, 2)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" line.set_data(x, y)\n",
|
||||
" line.set_color('#990000')\n",
|
||||
" return line,\n",
|
||||
" anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
|
||||
"\n",
|
||||
" anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
|
||||
"\n",
|
||||
" f = r\"animation.mp4\" \n",
|
||||
" writervideo = FFMpegWriter(fps=1000/interval) \n",
|
||||
" f = r\"animation.mp4\"\n",
|
||||
" writervideo = FFMpegWriter(fps=1000/interval)\n",
|
||||
" anim.save(f, writer=writervideo)\n",
|
||||
" plt.close('all')\n",
|
||||
"\n",
|
||||
"def combine_audio(vidname, audname, outname, fps=25): \n",
|
||||
"def combine_audio(vidname, audname, outname, fps=25):\n",
|
||||
" my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
|
||||
" audio_background = mpe.AudioFileClip(audname)\n",
|
||||
" final_clip = my_clip.set_audio(audio_background)\n",
|
||||
@@ -174,15 +173,10 @@
|
||||
"\n",
|
||||
"def record_make_animation():\n",
|
||||
" tensor = record()\n",
|
||||
"\n",
|
||||
" print('Calculating probabilities...')\n",
|
||||
" speech_probs = []\n",
|
||||
" window_size_samples = 512\n",
|
||||
" for i in range(0, len(tensor), window_size_samples):\n",
|
||||
" if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
|
||||
" break\n",
|
||||
" speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
|
||||
" speech_probs.append(speech_prob)\n",
|
||||
" speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
|
||||
" model.reset_states()\n",
|
||||
" print('Making animation...')\n",
|
||||
" make_animation(speech_probs, len(tensor) / 16000)\n",
|
||||
@@ -196,7 +190,9 @@
|
||||
" <video width=800 controls>\n",
|
||||
" <source src=\"%s\" type=\"video/mp4\">\n",
|
||||
" </video>\n",
|
||||
" \"\"\" % data_url))"
|
||||
" \"\"\" % data_url))\n",
|
||||
"\n",
|
||||
" return speech_probs"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -216,7 +212,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"record_make_animation()"
|
||||
"speech_probs = record_make_animation()"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -18,17 +17,19 @@
|
||||
"SAMPLING_RATE = 16000\n",
|
||||
"import torch\n",
|
||||
"from pprint import pprint\n",
|
||||
"import time\n",
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"torch.set_num_threads(1)\n",
|
||||
"NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
|
||||
"NUM_COPIES=8\n",
|
||||
"# download wav files, make multiple copies\n",
|
||||
"for idx in range(NUM_COPIES):\n",
|
||||
" torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
|
||||
"torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example0.wav\")\n",
|
||||
"for idx in range(NUM_COPIES-1):\n",
|
||||
" shutil.copy(f\"en_example0.wav\", f\"en_example{idx+1}.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -54,7 +55,6 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -99,7 +99,6 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -127,7 +126,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "diarization",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@@ -141,7 +140,20 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -8,6 +8,8 @@ Currently, the notebook consits of two examples:
|
||||
- One that records audio of a predefined length from the microphone, process it with Silero-VAD, and plots it afterwards.
|
||||
- The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter.
|
||||
|
||||
This example does not work in google colab! For local usage only.
|
||||
|
||||
## Example Video for the Real-Time Visualization
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "62a0cccb",
|
||||
"id": "76aa55ba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pyaudio Microphone Streaming Examples\n",
|
||||
@@ -12,12 +12,14 @@
|
||||
"I created it as an example on how binary data from a stream could be feed into Silero VAD.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required."
|
||||
"Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required.\n",
|
||||
"\n",
|
||||
"This notebook does not work in google colab! For local usage only."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "64cbe1eb",
|
||||
"id": "4a4e15c2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dependencies\n",
|
||||
@@ -26,22 +28,27 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57bc2aac",
|
||||
"metadata": {},
|
||||
"execution_count": 1,
|
||||
"id": "24205cce",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-10-09T08:47:34.056898Z",
|
||||
"start_time": "2024-10-09T08:47:34.053418Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install numpy==2.0.2\n",
|
||||
"#!pip install torch==2.4.1\n",
|
||||
"#!pip install matplotlib==3.9.2\n",
|
||||
"#!pip install torchaudio==2.4.1\n",
|
||||
"#!pip install numpy>=1.24.0\n",
|
||||
"#!pip install torch>=1.12.0\n",
|
||||
"#!pip install matplotlib>=3.6.0\n",
|
||||
"#!pip install torchaudio>=0.12.0\n",
|
||||
"#!pip install soundfile==0.12.1\n",
|
||||
"#!pip install pyaudio==0.2.11"
|
||||
"#!apt install python3-pyaudio (linux) or pip install pyaudio (windows)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "110de761",
|
||||
"id": "cd22818f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports"
|
||||
@@ -49,10 +56,27 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5a647d8d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"execution_count": 2,
|
||||
"id": "994d7f3a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-10-09T08:47:39.005032Z",
|
||||
"start_time": "2024-10-09T08:47:36.489952Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'pyaudio'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[2], line 8\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpylab\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyaudio\u001b[39;00m\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyaudio'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import io\n",
|
||||
"import numpy as np\n",
|
||||
@@ -67,7 +91,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "725d7066",
|
||||
"id": "ac5c52f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -79,7 +103,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1c0b2ea7",
|
||||
"id": "ad5919dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -92,7 +116,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f9112603",
|
||||
"id": "784d1ab6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Helper Methods"
|
||||
@@ -101,7 +125,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5abc6330",
|
||||
"id": "af4bca64",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -124,7 +148,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5124095e",
|
||||
"id": "ca13e514",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pyaudio Set-up"
|
||||
@@ -133,7 +157,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a845356e",
|
||||
"id": "75f99022",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -147,7 +171,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b910c99",
|
||||
"id": "4da7d2ef",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Simple Example\n",
|
||||
@@ -157,7 +181,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9d3d2c10",
|
||||
"id": "6fe77661",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -167,7 +191,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3cb44a4a",
|
||||
"id": "23f4da3e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -207,7 +231,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a3dda982",
|
||||
"id": "fd243e8f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Real Time Visualization\n",
|
||||
@@ -220,7 +244,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05ef4100",
|
||||
"id": "d36980c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -230,7 +254,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d1d4cdd6",
|
||||
"id": "5607b616",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -287,7 +311,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e398009",
|
||||
"id": "dc4f0108",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -311,7 +335,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
|
||||
Reference in New Issue
Block a user