{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bccAucKjnPHm"
      },
      "source": [
        "### Dependencies and inputs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cSih95WFmwgi"
      },
      "outputs": [],
      "source": [
"!pip -q install pydub\n",
|
|
"from google.colab import output\n",
|
|
"from base64 import b64decode, b64encode\n",
|
|
"from io import BytesIO\n",
|
|
"import numpy as np\n",
|
|
"from pydub import AudioSegment\n",
|
|
"from IPython.display import HTML, display\n",
|
|
"import torch\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import moviepy.editor as mpe\n",
|
|
"from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
|
|
"import matplotlib\n",
|
|
"matplotlib.use('Agg')\n",
|
|
"\n",
|
|
"torch.set_num_threads(1)\n",
|
|
"\n",
|
|
"model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
|
" model='silero_vad',\n",
|
|
" force_reload=True)\n",
|
|
"\n",
|
|
"def int2float(sound):\n",
|
|
" abs_max = np.abs(sound).max()\n",
|
|
" sound = sound.astype('float32')\n",
|
|
" if abs_max > 0:\n",
|
|
" sound *= 1/32768\n",
|
|
" sound = sound.squeeze()\n",
|
|
" return sound\n",
|
|
"\n",
|
|
"AUDIO_HTML = \"\"\"\n",
|
|
"<script>\n",
|
|
"var my_div = document.createElement(\"DIV\");\n",
|
|
"var my_p = document.createElement(\"P\");\n",
|
|
"var my_btn = document.createElement(\"BUTTON\");\n",
|
|
"var t = document.createTextNode(\"Press to start recording\");\n",
|
|
"\n",
|
|
"my_btn.appendChild(t);\n",
|
|
"//my_p.appendChild(my_btn);\n",
|
|
"my_div.appendChild(my_btn);\n",
|
|
"document.body.appendChild(my_div);\n",
|
|
"\n",
|
|
"var base64data = 0;\n",
|
|
"var reader;\n",
|
|
"var recorder, gumStream;\n",
|
|
"var recordButton = my_btn;\n",
|
|
"\n",
|
|
"var handleSuccess = function(stream) {\n",
|
|
" gumStream = stream;\n",
|
|
" var options = {\n",
|
|
" //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
|
|
" mimeType : 'audio/webm;codecs=opus'\n",
|
|
" //mimeType : 'audio/webm;codecs=pcm'\n",
|
|
" }; \n",
|
|
" //recorder = new MediaRecorder(stream, options);\n",
|
|
" recorder = new MediaRecorder(stream);\n",
|
|
" recorder.ondataavailable = function(e) { \n",
|
|
" var url = URL.createObjectURL(e.data);\n",
|
|
" // var preview = document.createElement('audio');\n",
|
|
" // preview.controls = true;\n",
|
|
" // preview.src = url;\n",
|
|
" // document.body.appendChild(preview);\n",
|
|
"\n",
|
|
" reader = new FileReader();\n",
|
|
" reader.readAsDataURL(e.data); \n",
|
|
" reader.onloadend = function() {\n",
|
|
" base64data = reader.result;\n",
|
|
" //console.log(\"Inside FileReader:\" + base64data);\n",
|
|
" }\n",
|
|
" };\n",
|
|
" recorder.start();\n",
|
|
" };\n",
|
|
"\n",
|
|
"recordButton.innerText = \"Recording... press to stop\";\n",
|
|
"\n",
|
|
"navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n",
|
|
"\n",
|
|
"\n",
|
|
"function toggleRecording() {\n",
|
|
" if (recorder && recorder.state == \"recording\") {\n",
|
|
" recorder.stop();\n",
|
|
" gumStream.getAudioTracks()[0].stop();\n",
|
|
" recordButton.innerText = \"Saving recording...\"\n",
|
|
" }\n",
|
|
"}\n",
|
|
"\n",
|
|
"// https://stackoverflow.com/a/951057\n",
|
|
"function sleep(ms) {\n",
|
|
" return new Promise(resolve => setTimeout(resolve, ms));\n",
|
|
"}\n",
|
|
"\n",
|
|
"var data = new Promise(resolve=>{\n",
|
|
"//recordButton.addEventListener(\"click\", toggleRecording);\n",
|
|
"recordButton.onclick = ()=>{\n",
|
|
"toggleRecording()\n",
|
|
"\n",
|
|
"sleep(2000).then(() => {\n",
|
|
" // wait 2000ms for the data to be available...\n",
|
|
" // ideally this should use something like await...\n",
|
|
" //console.log(\"Inside data:\" + base64data)\n",
|
|
" resolve(base64data.toString())\n",
|
|
"\n",
|
|
"});\n",
|
|
"\n",
|
|
"}\n",
|
|
"});\n",
|
|
" \n",
|
|
"</script>\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"def record(sec=10):\n",
|
|
" display(HTML(AUDIO_HTML))\n",
|
|
" s = output.eval_js(\"data\")\n",
|
|
" b = b64decode(s.split(',')[1])\n",
|
|
" audio = AudioSegment.from_file(BytesIO(b))\n",
|
|
" audio.export('test.mp3', format='mp3')\n",
|
|
" audio = audio.set_channels(1)\n",
|
|
" audio = audio.set_frame_rate(16000)\n",
|
|
" audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
|
|
" audio_tens = torch.tensor(audio_float )\n",
|
|
" return audio_tens\n",
|
|
"\n",
|
|
"def make_animation(probs, audio_duration, interval=40):\n",
|
|
" fig = plt.figure(figsize=(16, 9))\n",
|
|
" ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
|
|
" line, = ax.plot([], [], lw=2)\n",
|
|
" x = [i / 16000 * 512 for i in range(len(probs))]\n",
|
|
" plt.xlabel('Time, seconds', fontsize=16)\n",
|
|
" plt.ylabel('Speech Probability', fontsize=16)\n",
|
|
"\n",
|
|
" def init():\n",
|
|
" plt.fill_between(x, probs, color='#064273')\n",
|
|
" line.set_data([], [])\n",
|
|
" line.set_color('#990000')\n",
|
|
" return line,\n",
|
|
"\n",
|
|
" def animate(i):\n",
|
|
" x = i * interval / 1000 - 0.04\n",
|
|
" y = np.linspace(0, 1.02, 2)\n",
|
|
" \n",
|
|
" line.set_data(x, y)\n",
|
|
" line.set_color('#990000')\n",
|
|
" return line,\n",
|
|
"\n",
|
|
" anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
|
|
"\n",
|
|
" f = r\"animation.mp4\" \n",
|
|
" writervideo = FFMpegWriter(fps=1000/interval) \n",
|
|
" anim.save(f, writer=writervideo)\n",
|
|
" plt.close('all')\n",
|
|
"\n",
|
|
"def combine_audio(vidname, audname, outname, fps=25): \n",
|
|
" my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
|
|
" audio_background = mpe.AudioFileClip(audname)\n",
|
|
" final_clip = my_clip.set_audio(audio_background)\n",
|
|
" final_clip.write_videofile(outname,fps=fps,verbose=False)\n",
|
|
"\n",
|
|
"def record_make_animation():\n",
|
|
" tensor = record()\n",
|
|
"\n",
|
|
" print('Calculating probabilities...')\n",
|
|
" speech_probs = []\n",
|
|
" window_size_samples = 512\n",
|
|
" for i in range(0, len(tensor), window_size_samples):\n",
|
|
" if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
|
|
" break\n",
|
|
" speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
|
|
" speech_probs.append(speech_prob)\n",
|
|
" model.reset_states()\n",
|
|
" print('Making animation...')\n",
|
|
" make_animation(speech_probs, len(tensor) / 16000)\n",
|
|
"\n",
|
|
" print('Merging your voice with animation...')\n",
|
|
" combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n",
|
|
" print('Done!')\n",
|
|
" mp4 = open('merged.mp4','rb').read()\n",
|
|
" data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
|
|
" display(HTML(\"\"\"\n",
|
|
" <video width=800 controls>\n",
|
|
" <source src=\"%s\" type=\"video/mp4\">\n",
|
|
" </video>\n",
|
|
" \"\"\" % data_url))"
|
|
]
|
|
},
|
|
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IFVs3GvTnpB1"
      },
      "source": [
        "## Record example"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5EBjrTwiqAaQ"
      },
      "outputs": [],
      "source": [
        "record_make_animation()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "bccAucKjnPHm"
      ],
      "name": "Untitled2.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}