{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "76aa55ba",
   "metadata": {},
   "source": [
    "# PyAudio Microphone Streaming Examples\n",
    "\n",
    "A simple notebook that uses PyAudio to capture microphone audio and then feeds that audio to Silero VAD.\n",
    "\n",
    "I created it as an example of how binary data from a stream can be fed into Silero VAD.\n",
    "\n",
    "It has been tested on Ubuntu 21.04 (x86). Once the dependencies below are installed, no additional setup is required.\n",
    "\n",
    "This notebook does not work in Google Colab! It is for local use only."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a4e15c2",
   "metadata": {},
   "source": [
    "## Dependencies\n",
    "The cell below lists the dependencies and the versions used. Uncomment the lines to install them from within the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "24205cce",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-09T08:47:34.056898Z",
     "start_time": "2024-10-09T08:47:34.053418Z"
    }
   },
   "outputs": [],
   "source": [
    "#!pip install numpy>=1.24.0\n",
    "#!pip install torch>=1.12.0\n",
    "#!pip install matplotlib>=3.6.0\n",
    "#!pip install torchaudio>=0.12.0\n",
    "#!pip install soundfile==0.12.1\n",
    "#!apt install python3-pyaudio  # Linux\n",
    "#!pip install pyaudio          # Windows"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd22818f",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "994d7f3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import numpy as np\n",
    "import torch\n",
    "torch.set_num_threads(1)\n",
    "import torchaudio\n",
    "import matplotlib\n",
    "import matplotlib.pylab as plt\n",
    "import pyaudio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac5c52f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad5919dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
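  {
   "cell_type": "markdown",
   "id": "1f2e3d4c",
   "metadata": {},
   "source": [
    "The rest of this notebook only calls `model` directly, but the unpacked helpers work on their own. The sketch below (added as an aside, not part of the original notebook) assumes a hypothetical 16 kHz mono file `example.wav` and shows both the offline helper and the streaming iterator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a6b7c8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch; assumes a 16 kHz mono file 'example.wav' exists.\n",
    "# Offline: speech segments for a whole file at once.\n",
    "wav = read_audio('example.wav', sampling_rate=16000)\n",
    "print(get_speech_timestamps(wav, model, sampling_rate=16000))\n",
    "\n",
    "# Streaming: VADIterator keeps state across 512-sample chunks and\n",
    "# emits {'start': ...} / {'end': ...} events as speech begins and ends.\n",
    "vad_iterator = VADIterator(model, sampling_rate=16000)\n",
    "for i in range(0, len(wav) - 512, 512):\n",
    "    event = vad_iterator(wav[i:i + 512], return_seconds=True)\n",
    "    if event:\n",
    "        print(event)\n",
    "vad_iterator.reset_states()  # reset internal state between audio sources"
   ]
  },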
  {
   "cell_type": "markdown",
   "id": "784d1ab6",
   "metadata": {},
   "source": [
    "### Helper Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af4bca64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Taken from utils_vad.py\n",
    "def validate(model,\n",
    "             inputs: torch.Tensor):\n",
    "    with torch.no_grad():\n",
    "        outs = model(inputs)\n",
    "    return outs\n",
    "\n",
    "# Provided by Alexander Veysov\n",
    "# Converts int16 PCM samples to float32 in the range [-1, 1].\n",
    "def int2float(sound):\n",
    "    abs_max = np.abs(sound).max()\n",
    "    sound = sound.astype('float32')\n",
    "    if abs_max > 0:\n",
    "        sound *= 1/32768\n",
    "    sound = sound.squeeze()  # depends on the use case\n",
    "    return sound"
   ]
  },
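  {
   "cell_type": "markdown",
   "id": "9e8d7c6b",
   "metadata": {},
   "source": [
    "A quick sanity check (added here, not in the original notebook): feed `int2float` a synthetic int16 buffer and push one 512-sample chunk through the model. The printed probability depends on the model version and the input; the point is the dtype/range conversion and the call signature used by the loops below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c4d5e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged smoke test: 512-sample int16 sine wave -> float32 -> model.\n",
    "fake_pcm = (np.sin(np.linspace(0, 20 * np.pi, 512)) * 32767).astype(np.int16)\n",
    "chunk = int2float(fake_pcm)\n",
    "print(chunk.dtype, chunk.min(), chunk.max())  # float32, values within [-1, 1]\n",
    "print(model(torch.from_numpy(chunk), 16000).item())  # speech probability in [0, 1]"
   ]
  },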
  {
   "cell_type": "markdown",
   "id": "ca13e514",
   "metadata": {},
   "source": [
    "## PyAudio Set-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75f99022",
   "metadata": {},
   "outputs": [],
   "source": [
    "FORMAT = pyaudio.paInt16\n",
    "CHANNELS = 1\n",
    "SAMPLE_RATE = 16000\n",
    "CHUNK = int(SAMPLE_RATE / 10)  # 1600 frames = 100 ms per buffer\n",
    "\n",
    "audio = pyaudio.PyAudio()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4da7d2ef",
   "metadata": {},
   "source": [
    "## Simple Example\n",
    "The following example reads 512-sample chunks (32 ms at 16 kHz) from the microphone, converts them to a PyTorch tensor, and collects the model's confidence that each chunk contains speech."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fe77661",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_samples = 512  # chunk size fed to the VAD model per call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23f4da3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "stream = audio.open(format=FORMAT,\n",
    "                    channels=CHANNELS,\n",
    "                    rate=SAMPLE_RATE,\n",
    "                    input=True,\n",
    "                    frames_per_buffer=CHUNK)\n",
    "data = []\n",
    "voiced_confidences = []\n",
    "\n",
    "frames_to_record = 50\n",
    "\n",
    "print(\"Started Recording\")\n",
    "for i in range(frames_to_record):\n",
    "\n",
    "    audio_chunk = stream.read(num_samples)\n",
    "\n",
    "    # in case you want to save the audio later\n",
    "    data.append(audio_chunk)\n",
    "\n",
    "    audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "    audio_float32 = int2float(audio_int16)\n",
    "\n",
    "    # get the confidences and add them to the list to plot them later\n",
    "    new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "    voiced_confidences.append(new_confidence)\n",
    "\n",
    "print(\"Stopped the recording\")\n",
    "\n",
    "# release the microphone\n",
    "stream.stop_stream()\n",
    "stream.close()\n",
    "\n",
    "# plot the confidences for the speech\n",
    "plt.figure(figsize=(20, 6))\n",
    "plt.plot(voiced_confidences)\n",
    "plt.show()"
   ]
  },
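  {
   "cell_type": "markdown",
   "id": "7b8c9d0e",
   "metadata": {},
   "source": [
    "The loop above collects the raw chunks in `data` but never writes them anywhere. As a minimal sketch (added here, not part of the original notebook), the recording can be saved with the standard-library `wave` module; the filename `recording.wav` is hypothetical and the parameters mirror the PyAudio settings above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3e4f5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import wave\n",
    "\n",
    "# Write the collected int16 chunks to a hypothetical 'recording.wav'.\n",
    "with wave.open('recording.wav', 'wb') as wf:\n",
    "    wf.setnchannels(CHANNELS)\n",
    "    wf.setsampwidth(audio.get_sample_size(FORMAT))  # 2 bytes for paInt16\n",
    "    wf.setframerate(SAMPLE_RATE)\n",
    "    wf.writeframes(b''.join(data))"
   ]
  },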
  {
   "cell_type": "markdown",
   "id": "fd243e8f",
   "metadata": {},
   "source": [
    "## Real Time Visualization\n",
    "\n",
    "As an enhancement, the implementation below plots the speech probabilities in real time.\n",
    "In contrast to the simple example, it records audio until you stop the recording by pressing Enter.\n",
    "While looking into good ways to update matplotlib plots in real time, I found a simple library that does the job: https://github.com/lvwerra/jupyterplot. It has some limitations, but it works really well for this use case.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d36980c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install jupyterplot==0.0.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5607b616",
   "metadata": {},
   "outputs": [],
   "source": [
    "from jupyterplot import ProgressPlot\n",
    "import threading\n",
    "\n",
    "continue_recording = True\n",
    "\n",
    "def stop():\n",
    "    input(\"Press Enter to stop the recording:\")\n",
    "    global continue_recording\n",
    "    continue_recording = False\n",
    "\n",
    "def start_recording():\n",
    "\n",
    "    stream = audio.open(format=FORMAT,\n",
    "                        channels=CHANNELS,\n",
    "                        rate=SAMPLE_RATE,\n",
    "                        input=True,\n",
    "                        frames_per_buffer=CHUNK)\n",
    "\n",
    "    data = []\n",
    "    voiced_confidences = []\n",
    "\n",
    "    global continue_recording\n",
    "    continue_recording = True\n",
    "\n",
    "    pp = ProgressPlot(plot_names=[\"Silero VAD\"], line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n",
    "\n",
    "    # listen for Enter in a background thread so the loop below can run\n",
    "    stop_listener = threading.Thread(target=stop)\n",
    "    stop_listener.start()\n",
    "\n",
    "    while continue_recording:\n",
    "\n",
    "        audio_chunk = stream.read(num_samples)\n",
    "\n",
    "        # in case you want to save the audio later\n",
    "        data.append(audio_chunk)\n",
    "\n",
    "        audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "        audio_float32 = int2float(audio_int16)\n",
    "\n",
    "        # get the confidences and plot them in real time\n",
    "        new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "        voiced_confidences.append(new_confidence)\n",
    "\n",
    "        pp.update(new_confidence)\n",
    "\n",
    "    # release the microphone and close the plot\n",
    "    stream.stop_stream()\n",
    "    stream.close()\n",
    "    pp.finalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc4f0108",
   "metadata": {},
   "outputs": [],
   "source": [
    "start_recording()"
   ]
  },
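  {
   "cell_type": "markdown",
   "id": "8f9a0b1c",
   "metadata": {},
   "source": [
    "Once you are completely done with the examples above, releasing PortAudio is good practice (added note, not part of the original notebook). The call is left commented out, in the style of the install cells, because `audio.open()` stops working after termination until a new `pyaudio.PyAudio()` instance is created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e5f6a7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment when finished with all recording cells:\n",
    "#audio.terminate()"
   ]
  }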
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}