Update README.md

2026-02-04 17:39:22 +08:00 · 2024-09-24 15:16:05 +03:00
4 changed files with 65 additions and 99 deletions
--- a/examples/colab_record_example.ipynb
+++ b/examples/colab_record_example.ipynb
@@ -17,7 +17,6 @@
   },
   "outputs": [],
   "source": [
-    "#!apt install ffmpeg\n",
    "!pip -q install pydub\n",
    "from google.colab import output\n",
    "from base64 import b64decode, b64encode\n",
@@ -38,12 +37,13 @@
    "                              model='silero_vad',\n",
    "                              force_reload=True)\n",
    "\n",
-    "def int2float(audio):\n",
-    "    samples = audio.get_array_of_samples()\n",
-    "    new_sound = audio._spawn(samples)\n",
-    "    arr = np.array(samples).astype(np.float32)\n",
-    "    arr = arr / np.abs(arr).max()\n",
-    "    return arr\n",
+    "def int2float(sound):\n",
+    "    # Cast to float32 before any arithmetic: np.abs() on int16 wraps for -32768.\n",
+    "    sound = np.asarray(sound).astype('float32')\n",
+    "    # Fixed 1/32768 scale assumes 16-bit PCM samples - TODO confirm for other widths.\n",
+    "    sound *= 1/32768\n",
+    "    sound = sound.squeeze()\n",
+    "    return sound\n",
    "\n",
    "AUDIO_HTML = \"\"\"\n",
    "<script>\n",
@@ -68,10 +68,10 @@
    "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
    "    mimeType : 'audio/webm;codecs=opus'\n",
    "    //mimeType : 'audio/webm;codecs=pcm'\n",
-    "  };\n",
+    "  };            \n",
    "  //recorder = new MediaRecorder(stream, options);\n",
    "  recorder = new MediaRecorder(stream);\n",
-    "  recorder.ondataavailable = function(e) {\n",
+    "  recorder.ondataavailable = function(e) {            \n",
    "    var url = URL.createObjectURL(e.data);\n",
    "    // var preview = document.createElement('audio');\n",
    "    // preview.controls = true;\n",
@@ -79,7 +79,7 @@
    "    // document.body.appendChild(preview);\n",
    "\n",
    "    reader = new FileReader();\n",
-    "    reader.readAsDataURL(e.data);\n",
+    "    reader.readAsDataURL(e.data); \n",
    "    reader.onloadend = function() {\n",
    "      base64data = reader.result;\n",
    "      //console.log(\"Inside FileReader:\" + base64data);\n",
@@ -121,7 +121,7 @@
    "\n",
    "}\n",
    "});\n",
-    "\n",
+    "      \n",
    "</script>\n",
    "\"\"\"\n",
    "\n",
@@ -133,8 +133,8 @@
    "    audio.export('test.mp3', format='mp3')\n",
    "    audio = audio.set_channels(1)\n",
    "    audio = audio.set_frame_rate(16000)\n",
-    "    audio_float = int2float(audio)\n",
-    "    audio_tens = torch.tensor(audio_float)\n",
+    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
+    "    audio_tens = torch.tensor(audio_float )\n",
    "    return audio_tens\n",
    "\n",
    "def make_animation(probs, audio_duration, interval=40):\n",
@@ -154,18 +154,19 @@
    "    def animate(i):\n",
    "        x = i * interval / 1000 - 0.04\n",
    "        y = np.linspace(0, 1.02, 2)\n",
-    "\n",
+    "        \n",
    "        line.set_data(x, y)\n",
    "        line.set_color('#990000')\n",
    "        return line,\n",
-    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
    "\n",
-    "    f = r\"animation.mp4\"\n",
-    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
+    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))  # save_count must be an int\n",
+    "\n",
+    "    f = r\"animation.mp4\" \n",
+    "    writervideo = FFMpegWriter(fps=1000/interval) \n",
    "    anim.save(f, writer=writervideo)\n",
    "    plt.close('all')\n",
    "\n",
-    "def combine_audio(vidname, audname, outname, fps=25):\n",
+    "def combine_audio(vidname, audname, outname, fps=25): \n",
    "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
    "    audio_background = mpe.AudioFileClip(audname)\n",
    "    final_clip = my_clip.set_audio(audio_background)\n",
@@ -173,10 +174,15 @@
    "\n",
    "def record_make_animation():\n",
    "  tensor = record()\n",
+    "\n",
    "  print('Calculating probabilities...')\n",
    "  speech_probs = []\n",
    "  window_size_samples = 512\n",
-    "  speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
+    "  for i in range(0, len(tensor), window_size_samples):\n",
+    "      if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
+    "        break\n",
+    "      speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
+    "      speech_probs.append(speech_prob)\n",
    "  model.reset_states()\n",
    "  print('Making animation...')\n",
    "  make_animation(speech_probs, len(tensor) / 16000)\n",
@@ -190,9 +196,7 @@
    "  <video width=800 controls>\n",
    "        <source src=\"%s\" type=\"video/mp4\">\n",
    "  </video>\n",
-    "  \"\"\" % data_url))\n",
-    "\n",
-    "  return speech_probs"
+    "  \"\"\" % data_url))"
   ]
  },
  {
@@ -212,7 +216,7 @@
   },
   "outputs": [],
   "source": [
-    "speech_probs = record_make_animation()"
+    "record_make_animation()"
   ]
  }
 ],
--- a/examples/parallel_example.ipynb
+++ b/examples/parallel_example.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -17,19 +18,17 @@
    "SAMPLING_RATE = 16000\n",
    "import torch\n",
    "from pprint import pprint\n",
-    "import time\n",
-    "import shutil\n",
    "\n",
    "torch.set_num_threads(1)\n",
    "NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
    "NUM_COPIES=8\n",
    "# download wav files, make multiple copies\n",
-    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example0.wav\")\n",
-    "for idx in range(NUM_COPIES-1):\n",
-    "    shutil.copy(f\"en_example0.wav\", f\"en_example{idx+1}.wav\")"
+    "for idx in range(NUM_COPIES):\n",
+    "    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -55,6 +54,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -99,6 +99,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -126,7 +127,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "diarization",
   "language": "python",
   "name": "python3"
  },
@@ -140,20 +141,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
-  },
-  "toc": {
-   "base_numbering": 1,
-   "nav_menu": {},
-   "number_sections": true,
-   "sideBar": true,
-   "skip_h1_title": false,
-   "title_cell": "Table of Contents",
-   "title_sidebar": "Contents",
-   "toc_cell": false,
-   "toc_position": {},
-   "toc_section_display": true,
-   "toc_window_display": false
+   "version": "3.9.15"
  }
 },
 "nbformat": 4,
--- a/examples/pyaudio-streaming/README.md
+++ b/examples/pyaudio-streaming/README.md
@@ -8,8 +8,6 @@ Currently, the notebook consists of two examples:
 - One that records audio of a predefined length from the microphone, process it with Silero-VAD, and plots it afterwards.
 - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter.

- This example does not work in google colab! For local usage only.
-
 ## Example Video for the Real-Time Visualization


--- a/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb
+++ b/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "id": "76aa55ba",
+   "id": "62a0cccb",
   "metadata": {},
   "source": [
    "# Pyaudio Microphone Streaming Examples\n",
@@ -12,14 +12,12 @@
    "I created it as an example on how binary data from a stream could be feed into Silero VAD.\n",
    "\n",
    "\n",
-    "Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required.\n",
-    "\n",
-    "This notebook does not work in google colab! For local usage only."
+    "Has been tested on Ubuntu 21.04 (x86). After you have installed the dependencies below, no additional setup is required."
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "4a4e15c2",
+   "id": "64cbe1eb",
   "metadata": {},
   "source": [
    "## Dependencies\n",
@@ -28,27 +26,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "id": "24205cce",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-10-09T08:47:34.056898Z",
-     "start_time": "2024-10-09T08:47:34.053418Z"
-    }
-   },
+   "execution_count": null,
+   "id": "57bc2aac",
+   "metadata": {},
   "outputs": [],
   "source": [
-    "#!pip install numpy>=1.24.0\n",
-    "#!pip install torch>=1.12.0\n",
-    "#!pip install matplotlib>=3.6.0\n",
-    "#!pip install torchaudio>=0.12.0\n",
+    "#!pip install numpy==2.0.2\n",
+    "#!pip install torch==2.4.1\n",
+    "#!pip install matplotlib==3.9.2\n",
+    "#!pip install torchaudio==2.4.1\n",
    "#!pip install soundfile==0.12.1\n",
-    "#!apt install python3-pyaudio (linux) or pip install pyaudio (windows)"
+    "#!pip install pyaudio==0.2.11"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "cd22818f",
+   "id": "110de761",
   "metadata": {},
   "source": [
    "## Imports"
@@ -56,27 +49,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "994d7f3a",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-10-09T08:47:39.005032Z",
-     "start_time": "2024-10-09T08:47:36.489952Z"
-    }
-   },
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'pyaudio'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[2], line 8\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpylab\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyaudio\u001b[39;00m\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyaudio'"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "id": "5a647d8d",
+   "metadata": {},
+   "outputs": [],
   "source": [
    "import io\n",
    "import numpy as np\n",
@@ -91,7 +67,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ac5c52f7",
+   "id": "725d7066",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -103,7 +79,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ad5919dc",
+   "id": "1c0b2ea7",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -116,7 +92,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "784d1ab6",
+   "id": "f9112603",
   "metadata": {},
   "source": [
    "### Helper Methods"
@@ -125,7 +101,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "af4bca64",
+   "id": "5abc6330",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -148,7 +124,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "ca13e514",
+   "id": "5124095e",
   "metadata": {},
   "source": [
    "## Pyaudio Set-up"
@@ -157,7 +133,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "75f99022",
+   "id": "a845356e",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -171,7 +147,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "4da7d2ef",
+   "id": "0b910c99",
   "metadata": {},
   "source": [
    "## Simple Example\n",
@@ -181,7 +157,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "6fe77661",
+   "id": "9d3d2c10",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -191,7 +167,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "23f4da3e",
+   "id": "3cb44a4a",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -231,7 +207,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "fd243e8f",
+   "id": "a3dda982",
   "metadata": {},
   "source": [
    "## Real Time Visualization\n",
@@ -244,7 +220,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "d36980c2",
+   "id": "05ef4100",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -254,7 +230,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "5607b616",
+   "id": "d1d4cdd6",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -311,7 +287,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "dc4f0108",
+   "id": "1e398009",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -335,7 +311,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.9.10"
  },
  "toc": {
   "base_numbering": 1,