From 79fdb55f1c2623a77d2156fc710a0cf72197b8ab Mon Sep 17 00:00:00 2001
From: adamnsandle
Date: Fri, 10 Dec 2021 09:18:15 +0000
Subject: [PATCH] add collab record example

---
 examples/colab_record_example.ipynb | 241 ++++++++++++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100644 examples/colab_record_example.ipynb

diff --git a/examples/colab_record_example.ipynb b/examples/colab_record_example.ipynb
new file mode 100644
index 0000000..7658770
--- /dev/null
+++ b/examples/colab_record_example.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bccAucKjnPHm"
+   },
+   "source": [
+    "### Dependencies and inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cSih95WFmwgi"
+   },
+   "outputs": [],
+   "source": [
+    "!pip -q install pydub\n",
+    "from google.colab import output\n",
+    "from base64 import b64decode, b64encode\n",
+    "from io import BytesIO\n",
+    "import numpy as np\n",
+    "from pydub import AudioSegment\n",
+    "from IPython.display import HTML, display\n",
+    "import torch\n",
+    "import matplotlib.pyplot as plt\n",
+    "import moviepy.editor as mpe\n",
+    "from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
+    "import matplotlib\n",
+    "matplotlib.use('Agg')\n",
+    "\n",
+    "torch.set_num_threads(1)\n",
+    "\n",
+    "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
+    "                          model='silero_vad',\n",
+    "                          force_reload=True)\n",
+    "\n",
+    "# convert integer PCM samples to a normalized float32 array\n",
+    "def int2float(sound):\n",
+    "    abs_max = np.abs(sound).max()\n",
+    "    sound = sound.astype('float32')\n",
+    "    if abs_max > 0:\n",
+    "        sound *= 1/abs_max\n",
+    "    sound = sound.squeeze()\n",
+    "    return sound\n",
+    "\n",
+    "AUDIO_HTML = \"\"\"\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "# record audio in the browser, save it to test.mp3 and return a 16 kHz mono float tensor\n",
+    "def record(sec=10):\n",
+    "    display(HTML(AUDIO_HTML))\n",
+    "    s = output.eval_js(\"data\")\n",
+    "    b = b64decode(s.split(',')[1])\n",
+    "    audio = AudioSegment.from_file(BytesIO(b))\n",
+    "    audio.export('test.mp3', format='mp3')\n",
+    "    audio = audio.set_channels(1)\n",
+    "    audio = audio.set_frame_rate(16000)\n",
+    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
+    "    audio_tens = torch.tensor(audio_float)\n",
+    "    return audio_tens\n",
+    "\n",
+    "# render the speech probabilities as a video (animation.mp4) with a moving cursor\n",
+    "def make_animation(probs, audio_duration, interval=40):\n",
+    "    fig = plt.figure(figsize=(16, 9))\n",
+    "    ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
+    "    line, = ax.plot([], [], lw=2)\n",
+    "    x = [i / 16000 * 1536 for i in range(len(probs))]\n",
+    "    plt.xlabel('Time, seconds', fontsize=16)\n",
+    "    plt.ylabel('Speech Probability', fontsize=16)\n",
+    "\n",
+    "    def init():\n",
+    "        plt.fill_between(x, probs, color='#064273')\n",
+    "        line.set_data([], [])\n",
+    "        line.set_color('#990000')\n",
+    "        return line,\n",
+    "\n",
+    "    def animate(i):\n",
+    "        x = i * interval / 1000 - 0.1\n",
+    "        y = np.linspace(0, 1.02, 2)\n",
+    "\n",
+    "        line.set_data(x, y)\n",
+    "        line.set_color('#990000')\n",
+    "        return line,\n",
+    "\n",
+    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
+    "\n",
+    "    f = r\"animation.mp4\"\n",
+    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
+    "    anim.save(f, writer=writervideo)\n",
+    "    plt.close('all')\n",
+    "\n",
+    "# mux the recorded audio with the rendered animation using moviepy\n",
+    "def combine_audio(vidname, audname, outname, fps=25):\n",
+    "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
+    "    audio_background = mpe.AudioFileClip(audname)\n",
+    "    final_clip = my_clip.set_audio(audio_background)\n",
+    "    final_clip.write_videofile(outname, fps=fps, verbose=False)\n",
+    "\n",
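+    "# Optional helper: load an audio file from disk and convert it the same way\n",
+    "# record() does (16 kHz mono float32 tensor for the VAD model); it is not used\n",
+    "# by record_make_animation below, and 'my_audio.wav' is only a placeholder path.\n",
+    "def load_audio_file(path):\n",
+    "    audio = AudioSegment.from_file(path)\n",
+    "    audio = audio.set_channels(1)\n",
+    "    audio = audio.set_frame_rate(16000)\n",
+    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
+    "    return torch.tensor(audio_float)\n",
+    "\n",
+    "# example: tensor = load_audio_file('my_audio.wav')\n",
+    "\n",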
+ "def record_make_animation():\n", + " tensor = record()\n", + "\n", + " print('Calculating probabilities...')\n", + " speech_probs = []\n", + " window_size_samples = 1536\n", + " for i in range(0, len(tensor), window_size_samples):\n", + " if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n", + " break\n", + " speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n", + " speech_probs.append(speech_prob)\n", + " model.reset_states()\n", + " print('Making animation...')\n", + " make_animation(speech_probs, len(tensor) / 16000)\n", + "\n", + " print('Merging your voice with animation...')\n", + " combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n", + " print('Done!')\n", + " mp4 = open('merged.mp4','rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " display(HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IFVs3GvTnpB1" + }, + "source": [ + "## Record example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EBjrTwiqAaQ" + }, + "outputs": [], + "source": [ + "record_make_animation()" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "bccAucKjnPHm" + ], + "name": "Untitled2.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}