{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "76aa55ba",
   "metadata": {},
   "source": [
    "# PyAudio Microphone Streaming Examples\n",
    "\n",
    "A simple notebook that uses PyAudio to capture microphone audio and then feeds that audio to Silero VAD.\n",
    "\n",
    "I created it as an example of how binary data from a stream can be fed into Silero VAD.\n",
    "\n",
    "It has been tested on Ubuntu 21.04 (x86). Once the dependencies below are installed, no additional setup is required.\n",
    "\n",
    "This notebook does not work in Google Colab! It is for local use only."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a4e15c2",
   "metadata": {},
   "source": [
    "## Dependencies\n",
    "The cell below lists the dependencies and the versions used. Uncomment the lines to install them from within the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "24205cce",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-09T08:47:34.056898Z",
     "start_time": "2024-10-09T08:47:34.053418Z"
    }
   },
   "outputs": [],
   "source": [
    "#!pip install numpy>=1.24.0\n",
    "#!pip install torch>=1.12.0\n",
    "#!pip install matplotlib>=3.6.0\n",
    "#!pip install torchaudio>=0.12.0\n",
    "#!pip install soundfile==0.12.1\n",
    "#!apt install python3-pyaudio  # Linux\n",
    "#!pip install pyaudio          # Windows"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd22818f",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "994d7f3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import numpy as np\n",
    "import torch\n",
    "torch.set_num_threads(1)\n",
    "import torchaudio\n",
    "import matplotlib\n",
    "import matplotlib.pylab as plt\n",
    "import pyaudio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac5c52f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad5919dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
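  {
   "cell_type": "markdown",
   "id": "1f2e3d4c",
   "metadata": {},
   "source": [
    "The rest of this notebook only calls `model` directly, but the unpacked helpers work on their own. The sketch below (added as an aside, not part of the original notebook) assumes a hypothetical 16 kHz mono file `example.wav` and shows both the offline helper and the streaming iterator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a6b7c8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch; assumes a 16 kHz mono file 'example.wav' exists.\n",
    "# Offline: speech segments for a whole file at once.\n",
    "wav = read_audio('example.wav', sampling_rate=16000)\n",
    "print(get_speech_timestamps(wav, model, sampling_rate=16000))\n",
    "\n",
    "# Streaming: VADIterator keeps state across 512-sample chunks and\n",
    "# emits {'start': ...} / {'end': ...} events as speech begins and ends.\n",
    "vad_iterator = VADIterator(model, sampling_rate=16000)\n",
    "for i in range(0, len(wav) - 512, 512):\n",
    "    event = vad_iterator(wav[i:i + 512], return_seconds=True)\n",
    "    if event:\n",
    "        print(event)\n",
    "vad_iterator.reset_states()  # reset internal state between audio sources"
   ]
  },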
  {
   "cell_type": "markdown",
   "id": "784d1ab6",
   "metadata": {},
   "source": [
    "### Helper Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af4bca64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Taken from utils_vad.py\n",
    "def validate(model,\n",
    "             inputs: torch.Tensor):\n",
    "    with torch.no_grad():\n",
    "        outs = model(inputs)\n",
    "    return outs\n",
    "\n",
    "# Provided by Alexander Veysov\n",
    "# Converts int16 PCM samples to float32 in the range [-1, 1].\n",
    "def int2float(sound):\n",
    "    abs_max = np.abs(sound).max()\n",
    "    sound = sound.astype('float32')\n",
    "    if abs_max > 0:\n",
    "        sound *= 1/32768\n",
    "    sound = sound.squeeze()  # depends on the use case\n",
    "    return sound"
   ]
  },
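  {
   "cell_type": "markdown",
   "id": "9e8d7c6b",
   "metadata": {},
   "source": [
    "A quick sanity check (added here, not in the original notebook): feed `int2float` a synthetic int16 buffer and push one 512-sample chunk through the model. The printed probability depends on the model version and the input; the point is the dtype/range conversion and the call signature used by the loops below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c4d5e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged smoke test: 512-sample int16 sine wave -> float32 -> model.\n",
    "fake_pcm = (np.sin(np.linspace(0, 20 * np.pi, 512)) * 32767).astype(np.int16)\n",
    "chunk = int2float(fake_pcm)\n",
    "print(chunk.dtype, chunk.min(), chunk.max())  # float32, values within [-1, 1]\n",
    "print(model(torch.from_numpy(chunk), 16000).item())  # speech probability in [0, 1]"
   ]
  },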
  {
   "cell_type": "markdown",
   "id": "ca13e514",
   "metadata": {},
   "source": [
    "## PyAudio Set-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75f99022",
   "metadata": {},
   "outputs": [],
   "source": [
    "FORMAT = pyaudio.paInt16\n",
    "CHANNELS = 1\n",
    "SAMPLE_RATE = 16000\n",
    "CHUNK = int(SAMPLE_RATE / 10)  # 1600 frames = 100 ms per buffer\n",
    "\n",
    "audio = pyaudio.PyAudio()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4da7d2ef",
   "metadata": {},
   "source": [
    "## Simple Example\n",
    "The following example reads 512-sample chunks (32 ms at 16 kHz) from the microphone, converts them to a PyTorch tensor, and collects the model's confidence that each chunk contains speech."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fe77661",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_samples = 512  # chunk size fed to the VAD model per call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23f4da3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "stream = audio.open(format=FORMAT,\n",
    "                    channels=CHANNELS,\n",
    "                    rate=SAMPLE_RATE,\n",
    "                    input=True,\n",
    "                    frames_per_buffer=CHUNK)\n",
    "data = []\n",
    "voiced_confidences = []\n",
    "\n",
    "frames_to_record = 50\n",
    "\n",
    "print(\"Started Recording\")\n",
    "for i in range(frames_to_record):\n",
    "\n",
    "    audio_chunk = stream.read(num_samples)\n",
    "\n",
    "    # in case you want to save the audio later\n",
    "    data.append(audio_chunk)\n",
    "\n",
    "    audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "    audio_float32 = int2float(audio_int16)\n",
    "\n",
    "    # get the confidences and add them to the list to plot them later\n",
    "    new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "    voiced_confidences.append(new_confidence)\n",
    "\n",
    "print(\"Stopped the recording\")\n",
    "\n",
    "# release the microphone\n",
    "stream.stop_stream()\n",
    "stream.close()\n",
    "\n",
    "# plot the confidences for the speech\n",
    "plt.figure(figsize=(20, 6))\n",
    "plt.plot(voiced_confidences)\n",
    "plt.show()"
   ]
  },
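  {
   "cell_type": "markdown",
   "id": "7b8c9d0e",
   "metadata": {},
   "source": [
    "The loop above collects the raw chunks in `data` but never writes them anywhere. As a minimal sketch (added here, not part of the original notebook), the recording can be saved with the standard-library `wave` module; the filename `recording.wav` is hypothetical and the parameters mirror the PyAudio settings above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3e4f5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import wave\n",
    "\n",
    "# Write the collected int16 chunks to a hypothetical 'recording.wav'.\n",
    "with wave.open('recording.wav', 'wb') as wf:\n",
    "    wf.setnchannels(CHANNELS)\n",
    "    wf.setsampwidth(audio.get_sample_size(FORMAT))  # 2 bytes for paInt16\n",
    "    wf.setframerate(SAMPLE_RATE)\n",
    "    wf.writeframes(b''.join(data))"
   ]
  },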
  {
   "cell_type": "markdown",
   "id": "fd243e8f",
   "metadata": {},
   "source": [
    "## Real Time Visualization\n",
    "\n",
    "As an enhancement, the implementation below plots the speech probabilities in real time.\n",
    "In contrast to the simple example, it records audio until you stop the recording by pressing Enter.\n",
    "While looking into good ways to update matplotlib plots in real time, I found a simple library that does the job: https://github.com/lvwerra/jupyterplot. It has some limitations, but it works really well for this use case.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d36980c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install jupyterplot==0.0.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5607b616",
   "metadata": {},
   "outputs": [],
   "source": [
    "from jupyterplot import ProgressPlot\n",
    "import threading\n",
    "\n",
    "continue_recording = True\n",
    "\n",
    "def stop():\n",
    "    input(\"Press Enter to stop the recording:\")\n",
    "    global continue_recording\n",
    "    continue_recording = False\n",
    "\n",
    "def start_recording():\n",
    "\n",
    "    stream = audio.open(format=FORMAT,\n",
    "                        channels=CHANNELS,\n",
    "                        rate=SAMPLE_RATE,\n",
    "                        input=True,\n",
    "                        frames_per_buffer=CHUNK)\n",
    "\n",
    "    data = []\n",
    "    voiced_confidences = []\n",
    "\n",
    "    global continue_recording\n",
    "    continue_recording = True\n",
    "\n",
    "    pp = ProgressPlot(plot_names=[\"Silero VAD\"], line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n",
    "\n",
    "    # listen for Enter in a background thread so the loop below can run\n",
    "    stop_listener = threading.Thread(target=stop)\n",
    "    stop_listener.start()\n",
    "\n",
    "    while continue_recording:\n",
    "\n",
    "        audio_chunk = stream.read(num_samples)\n",
    "\n",
    "        # in case you want to save the audio later\n",
    "        data.append(audio_chunk)\n",
    "\n",
    "        audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "        audio_float32 = int2float(audio_int16)\n",
    "\n",
    "        # get the confidences and plot them in real time\n",
    "        new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "        voiced_confidences.append(new_confidence)\n",
    "\n",
    "        pp.update(new_confidence)\n",
    "\n",
    "    # release the microphone and close the plot\n",
    "    stream.stop_stream()\n",
    "    stream.close()\n",
    "    pp.finalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc4f0108",
   "metadata": {},
   "outputs": [],
   "source": [
    "start_recording()"
   ]
  },
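  {
   "cell_type": "markdown",
   "id": "8f9a0b1c",
   "metadata": {},
   "source": [
    "Once you are completely done with the examples above, releasing PortAudio is good practice (added note, not part of the original notebook). The call is left commented out, in the style of the install cells, because `audio.open()` stops working after termination until a new `pyaudio.PyAudio()` instance is created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e5f6a7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment when finished with all recording cells:\n",
    "#audio.terminate()"
   ]
  }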
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}