diff --git a/examples/colab_record_example.ipynb b/examples/colab_record_example.ipynb
index 9b11d5e..4de7e26 100644
--- a/examples/colab_record_example.ipynb
+++ b/examples/colab_record_example.ipynb
@@ -17,6 +17,7 @@
    },
    "outputs": [],
    "source": [
+    "#!apt install ffmpeg\n",
     "!pip -q install pydub\n",
     "from google.colab import output\n",
     "from base64 import b64decode, b64encode\n",
@@ -37,13 +38,12 @@
     "                              model='silero_vad',\n",
     "                              force_reload=True)\n",
     "\n",
-    "def int2float(sound):\n",
-    "    abs_max = np.abs(sound).max()\n",
-    "    sound = sound.astype('float32')\n",
-    "    if abs_max > 0:\n",
-    "        sound *= 1/32768\n",
-    "    sound = sound.squeeze()\n",
-    "    return sound\n",
+    "def int2float(audio):\n",
+    "    samples = audio.get_array_of_samples()\n",
+    "    new_sound = audio._spawn(samples)\n",
+    "    arr = np.array(samples).astype(np.float32)\n",
+    "    arr = arr / np.abs(arr).max()\n",
+    "    return arr\n",
     "\n",
     "AUDIO_HTML = \"\"\"\n",
     "\n",
     "\"\"\"\n",
     "\n",
@@ -133,8 +133,8 @@
     "    audio.export('test.mp3', format='mp3')\n",
     "    audio = audio.set_channels(1)\n",
     "    audio = audio.set_frame_rate(16000)\n",
-    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
-    "    audio_tens = torch.tensor(audio_float )\n",
+    "    audio_float = int2float(audio)\n",
+    "    audio_tens = torch.tensor(audio_float)\n",
     "    return audio_tens\n",
     "\n",
     "def make_animation(probs, audio_duration, interval=40):\n",
@@ -154,19 +154,18 @@
     "    def animate(i):\n",
     "        x = i * interval / 1000 - 0.04\n",
     "        y = np.linspace(0, 1.02, 2)\n",
-    "        \n",
+    "\n",
     "        line.set_data(x, y)\n",
     "        line.set_color('#990000')\n",
     "        return line,\n",
+    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
     "\n",
-    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
-    "\n",
-    "    f = r\"animation.mp4\" \n",
-    "    writervideo = FFMpegWriter(fps=1000/interval) \n",
+    "    f = r\"animation.mp4\"\n",
+    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
     "    anim.save(f, writer=writervideo)\n",
     "    plt.close('all')\n",
     "\n",
-    "def combine_audio(vidname, audname, outname, fps=25): \n",
+    "def combine_audio(vidname, audname, outname, fps=25):\n",
     "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
     "    audio_background = mpe.AudioFileClip(audname)\n",
     "    final_clip = my_clip.set_audio(audio_background)\n",
@@ -174,15 +173,10 @@
     "\n",
     "def record_make_animation():\n",
     "    tensor = record()\n",
-    "\n",
     "    print('Calculating probabilities...')\n",
     "    speech_probs = []\n",
     "    window_size_samples = 512\n",
-    "    for i in range(0, len(tensor), window_size_samples):\n",
-    "        if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
-    "            break\n",
-    "        speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
-    "        speech_probs.append(speech_prob)\n",
+    "    speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
     "    model.reset_states()\n",
     "    print('Making animation...')\n",
     "    make_animation(speech_probs, len(tensor) / 16000)\n",
@@ -196,7 +190,9 @@
     "    \n",
-    "    \"\"\" % data_url))"
+    "    \"\"\" % data_url))\n",
+    "\n",
+    "    return speech_probs"
    ]
   },
   {
@@ -216,7 +212,7 @@
    },
    "outputs": [],
    "source": [
-    "record_make_animation()"
+    "speech_probs = record_make_animation()"
    ]
   }
  ],
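The substantive changes in colab_record_example.ipynb are (a) `int2float` now takes a pydub `AudioSegment` and peak-normalizes it instead of dividing raw int16 samples by 32768, and (b) the manual 512-sample inference loop is replaced by a single `model.audio_forward` call. Below is a minimal sketch, not part of the diff, of why the loop and the batched call are interchangeable; the `torch.hub.load` and model calls mirror the notebook, while the dummy waveform is an illustrative stand-in for the recorded audio.

```python
# Sketch only: compares the old windowed loop with the new batched call.
import torch

torch.set_num_threads(1)
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

wav = torch.rand(16000) * 2 - 1  # stand-in for one second of 16 kHz mono audio

# Old approach: fixed 512-sample windows, dropping the short tail chunk.
window_size_samples = 512
probs_loop = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i:i + window_size_samples]
    if len(chunk) < window_size_samples:
        break
    probs_loop.append(model(chunk, 16000).item())
model.reset_states()

# New approach: one call over the whole tensor; chunking happens internally.
probs_batched = model.audio_forward(wav, sr=16000)[0].tolist()
model.reset_states()
```

Since `audio_forward` pads the final chunk rather than discarding it, the returned list can be one window longer than the loop's, which is harmless for the animation timeline.
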
"markdown", "metadata": {}, "source": [ @@ -18,17 +17,19 @@ "SAMPLING_RATE = 16000\n", "import torch\n", "from pprint import pprint\n", + "import time\n", + "import shutil\n", "\n", "torch.set_num_threads(1)\n", "NUM_PROCESS=4 # set to the number of CPU cores in the machine\n", "NUM_COPIES=8\n", "# download wav files, make multiple copies\n", - "for idx in range(NUM_COPIES):\n", - " torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n" + "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example0.wav\")\n", + "for idx in range(NUM_COPIES-1):\n", + " shutil.copy(f\"en_example0.wav\", f\"en_example{idx+1}.wav\")" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -54,7 +55,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -99,7 +99,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -127,7 +126,7 @@ ], "metadata": { "kernelspec": { - "display_name": "diarization", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -141,7 +140,20 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.14" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false } }, "nbformat": 4,