Update README.md

2026-02-04 17:39:22 +08:00 · 2024-09-24 15:16:05 +03:00
4 changed files with 65 additions and 99 deletions
--- a/examples/colab_record_example.ipynb
+++ b/examples/colab_record_example.ipynb
@@ -17,7 +17,6 @@
   },
   "outputs": [],
   "source": [
    "#!apt install ffmpeg\n",
    "!pip -q install pydub\n",
    "from google.colab import output\n",
    "from base64 import b64decode, b64encode\n",
@@ -38,12 +37,13 @@
    "                              model='silero_vad',\n",
    "                              force_reload=True)\n",
    "\n",
-    "def int2float(audio):\n",
+    "def int2float(sound):\n",
-    "    samples = audio.get_array_of_samples()\n",
+    "    abs_max = np.abs(sound).max()\n",
-    "    new_sound = audio._spawn(samples)\n",
+    "    sound = sound.astype('float32')\n",
-    "    arr = np.array(samples).astype(np.float32)\n",
+    "    if abs_max > 0:\n",
-    "    arr = arr / np.abs(arr).max()\n",
+    "        sound *= 1/32768\n",
-    "    return arr\n",
+    "    sound = sound.squeeze()\n",
    "    return sound\n",
    "\n",
    "AUDIO_HTML = \"\"\"\n",
    "<script>\n",
@@ -68,10 +68,10 @@
    "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
    "    mimeType : 'audio/webm;codecs=opus'\n",
    "    //mimeType : 'audio/webm;codecs=pcm'\n",
-    "  };\n",
+    "  };            \n",
    "  //recorder = new MediaRecorder(stream, options);\n",
    "  recorder = new MediaRecorder(stream);\n",
-    "  recorder.ondataavailable = function(e) {\n",
+    "  recorder.ondataavailable = function(e) {            \n",
    "    var url = URL.createObjectURL(e.data);\n",
    "    // var preview = document.createElement('audio');\n",
    "    // preview.controls = true;\n",
@@ -79,7 +79,7 @@
    "    // document.body.appendChild(preview);\n",
    "\n",
    "    reader = new FileReader();\n",
-    "    reader.readAsDataURL(e.data);\n",
+    "    reader.readAsDataURL(e.data); \n",
    "    reader.onloadend = function() {\n",
    "      base64data = reader.result;\n",
    "      //console.log(\"Inside FileReader:\" + base64data);\n",
@@ -121,7 +121,7 @@
    "\n",
    "}\n",
    "});\n",
-    "\n",
+    "      \n",
    "</script>\n",
    "\"\"\"\n",
    "\n",
@@ -133,8 +133,8 @@
    "    audio.export('test.mp3', format='mp3')\n",
    "    audio = audio.set_channels(1)\n",
    "    audio = audio.set_frame_rate(16000)\n",
-    "    audio_float = int2float(audio)\n",
+    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
-    "    audio_tens = torch.tensor(audio_float)\n",
+    "    audio_tens = torch.tensor(audio_float )\n",
    "    return audio_tens\n",
    "\n",
    "def make_animation(probs, audio_duration, interval=40):\n",
@@ -154,18 +154,19 @@
    "    def animate(i):\n",
    "        x = i * interval / 1000 - 0.04\n",
    "        y = np.linspace(0, 1.02, 2)\n",
-    "\n",
+    "        \n",
    "        line.set_data(x, y)\n",
    "        line.set_color('#990000')\n",
    "        return line,\n",
    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
    "\n",
-    "    f = r\"animation.mp4\"\n",
+    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
-    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
+    "\n",
    "    f = r\"animation.mp4\" \n",
    "    writervideo = FFMpegWriter(fps=1000/interval) \n",
    "    anim.save(f, writer=writervideo)\n",
    "    plt.close('all')\n",
    "\n",
-    "def combine_audio(vidname, audname, outname, fps=25):\n",
+    "def combine_audio(vidname, audname, outname, fps=25): \n",
    "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
    "    audio_background = mpe.AudioFileClip(audname)\n",
    "    final_clip = my_clip.set_audio(audio_background)\n",
@@ -173,10 +174,15 @@
    "\n",
    "def record_make_animation():\n",
    "  tensor = record()\n",
    "\n",
    "  print('Calculating probabilities...')\n",
    "  speech_probs = []\n",
    "  window_size_samples = 512\n",
-    "  speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
+    "  for i in range(0, len(tensor), window_size_samples):\n",
    "      if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
    "        break\n",
    "      speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
    "      speech_probs.append(speech_prob)\n",
    "  model.reset_states()\n",
    "  print('Making animation...')\n",
    "  make_animation(speech_probs, len(tensor) / 16000)\n",
@@ -190,9 +196,7 @@
    "  <video width=800 controls>\n",
    "        <source src=\"%s\" type=\"video/mp4\">\n",
    "  </video>\n",
-    "  \"\"\" % data_url))\n",
+    "  \"\"\" % data_url))"
    "\n",
    "  return speech_probs"
   ]
  },
  {
@@ -212,7 +216,7 @@
   },
   "outputs": [],
   "source": [
-    "speech_probs = record_make_animation()"
+    "record_make_animation()"
   ]
  }
 ],
--- a/examples/parallel_example.ipynb
+++ b/examples/parallel_example.ipynb
@@ -1,6 +1,7 @@
 {
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -17,19 +18,17 @@
    "SAMPLING_RATE = 16000\n",
    "import torch\n",
    "from pprint import pprint\n",
    "import time\n",
    "import shutil\n",
    "\n",
    "torch.set_num_threads(1)\n",
    "NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
    "NUM_COPIES=8\n",
    "# download wav files, make multiple copies\n",
-    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example0.wav\")\n",
+    "for idx in range(NUM_COPIES):\n",
-    "for idx in range(NUM_COPIES-1):\n",
+    "    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
    "    shutil.copy(f\"en_example0.wav\", f\"en_example{idx+1}.wav\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -55,6 +54,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -99,6 +99,7 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -126,7 +127,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "diarization",
   "language": "python",
   "name": "python3"
  },
@@ -140,20 +141,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.9.15"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
--- a/examples/pyaudio-streaming/README.md
+++ b/examples/pyaudio-streaming/README.md
@@ -8,8 +8,6 @@ Currently, the notebook consists of two examples:
 - One that records audio of a predefined length from the microphone, processes it with Silero-VAD, and plots it afterwards.
 - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter.
 This example does not work in Google Colab! For local usage only.
 ## Example Video for the Real-Time Visualization
--- a/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb
+++ b/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "id": "76aa55ba",
+   "id": "62a0cccb",
   "metadata": {},
   "source": [
    "# Pyaudio Microphone Streaming Examples\n",
@@ -12,14 +12,12 @@
    "I created it as an example of how binary data from a stream could be fed into Silero VAD.\n",
    "\n",
    "\n",
-    "Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required.\n",
+    "Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required."
    "\n",
    "This notebook does not work in google colab! For local usage only."
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "4a4e15c2",
+   "id": "64cbe1eb",
   "metadata": {},
   "source": [
    "## Dependencies\n",
@@ -28,27 +26,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
-   "id": "24205cce",
+   "id": "57bc2aac",
-   "metadata": {
+   "metadata": {},
    "ExecuteTime": {
     "end_time": "2024-10-09T08:47:34.056898Z",
     "start_time": "2024-10-09T08:47:34.053418Z"
    }
   },
   "outputs": [],
   "source": [
-    "#!pip install numpy>=1.24.0\n",
+    "#!pip install numpy==2.0.2\n",
-    "#!pip install torch>=1.12.0\n",
+    "#!pip install torch==2.4.1\n",
-    "#!pip install matplotlib>=3.6.0\n",
+    "#!pip install matplotlib==3.9.2\n",
-    "#!pip install torchaudio>=0.12.0\n",
+    "#!pip install torchaudio==2.4.1\n",
    "#!pip install soundfile==0.12.1\n",
-    "#!apt install python3-pyaudio (linux) or pip install pyaudio (windows)"
+    "#!pip install pyaudio==0.2.11"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "cd22818f",
+   "id": "110de761",
   "metadata": {},
   "source": [
    "## Imports"
@@ -56,27 +49,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
-   "id": "994d7f3a",
+   "id": "5a647d8d",
-   "metadata": {
+   "metadata": {},
-    "ExecuteTime": {
+   "outputs": [],
     "end_time": "2024-10-09T08:47:39.005032Z",
     "start_time": "2024-10-09T08:47:36.489952Z"
    }
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'pyaudio'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 8\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpylab\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyaudio\u001b[39;00m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyaudio'"
     ]
    }
   ],
   "source": [
    "import io\n",
    "import numpy as np\n",
@@ -91,7 +67,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ac5c52f7",
+   "id": "725d7066",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -103,7 +79,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ad5919dc",
+   "id": "1c0b2ea7",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -116,7 +92,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "784d1ab6",
+   "id": "f9112603",
   "metadata": {},
   "source": [
    "### Helper Methods"
@@ -125,7 +101,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "af4bca64",
+   "id": "5abc6330",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -148,7 +124,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "ca13e514",
+   "id": "5124095e",
   "metadata": {},
   "source": [
    "## Pyaudio Set-up"
@@ -157,7 +133,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "75f99022",
+   "id": "a845356e",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -171,7 +147,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "4da7d2ef",
+   "id": "0b910c99",
   "metadata": {},
   "source": [
    "## Simple Example\n",
@@ -181,7 +157,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "6fe77661",
+   "id": "9d3d2c10",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -191,7 +167,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "23f4da3e",
+   "id": "3cb44a4a",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -231,7 +207,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "fd243e8f",
+   "id": "a3dda982",
   "metadata": {},
   "source": [
    "## Real Time Visualization\n",
@@ -244,7 +220,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "d36980c2",
+   "id": "05ef4100",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -254,7 +230,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "5607b616",
+   "id": "d1d4cdd6",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -311,7 +287,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "dc4f0108",
+   "id": "1e398009",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -335,7 +311,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.9.10"
  },
  "toc": {
   "base_numbering": 1,