diff --git a/examples/colab_record_example.ipynb b/examples/colab_record_example.ipynb
index 9b11d5e..4de7e26 100644
--- a/examples/colab_record_example.ipynb
+++ b/examples/colab_record_example.ipynb
@@ -17,6 +17,7 @@
},
"outputs": [],
"source": [
+ "#!apt install ffmpeg\n",
"!pip -q install pydub\n",
"from google.colab import output\n",
"from base64 import b64decode, b64encode\n",
@@ -37,13 +38,12 @@
" model='silero_vad',\n",
" force_reload=True)\n",
"\n",
- "def int2float(sound):\n",
- " abs_max = np.abs(sound).max()\n",
- " sound = sound.astype('float32')\n",
- " if abs_max > 0:\n",
- " sound *= 1/32768\n",
- " sound = sound.squeeze()\n",
- " return sound\n",
+    "def int2float(audio):\n",
+    "    samples = audio.get_array_of_samples()\n",
+    "    arr = np.array(samples).astype(np.float32)\n",
+    "    # peak-normalize to [-1, 1]; `or 1` avoids division by zero on silent (all-zero) audio\n",
+    "    arr = arr / (np.abs(arr).max() or 1)\n",
+    "    return arr\n",
"\n",
"AUDIO_HTML = \"\"\"\n",
"\n",
"\"\"\"\n",
"\n",
@@ -133,8 +133,8 @@
" audio.export('test.mp3', format='mp3')\n",
" audio = audio.set_channels(1)\n",
" audio = audio.set_frame_rate(16000)\n",
- " audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
- " audio_tens = torch.tensor(audio_float )\n",
+ " audio_float = int2float(audio)\n",
+ " audio_tens = torch.tensor(audio_float)\n",
" return audio_tens\n",
"\n",
"def make_animation(probs, audio_duration, interval=40):\n",
@@ -154,19 +154,18 @@
" def animate(i):\n",
" x = i * interval / 1000 - 0.04\n",
" y = np.linspace(0, 1.02, 2)\n",
- " \n",
+ "\n",
" line.set_data(x, y)\n",
" line.set_color('#990000')\n",
" return line,\n",
+ " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
"\n",
- " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
- "\n",
- " f = r\"animation.mp4\" \n",
- " writervideo = FFMpegWriter(fps=1000/interval) \n",
+ " f = r\"animation.mp4\"\n",
+ " writervideo = FFMpegWriter(fps=1000/interval)\n",
" anim.save(f, writer=writervideo)\n",
" plt.close('all')\n",
"\n",
- "def combine_audio(vidname, audname, outname, fps=25): \n",
+ "def combine_audio(vidname, audname, outname, fps=25):\n",
" my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
" audio_background = mpe.AudioFileClip(audname)\n",
" final_clip = my_clip.set_audio(audio_background)\n",
@@ -174,15 +173,10 @@
"\n",
"def record_make_animation():\n",
" tensor = record()\n",
- "\n",
" print('Calculating probabilities...')\n",
" speech_probs = []\n",
" window_size_samples = 512\n",
- " for i in range(0, len(tensor), window_size_samples):\n",
- " if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
- " break\n",
- " speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
- " speech_probs.append(speech_prob)\n",
+ " speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
" model.reset_states()\n",
" print('Making animation...')\n",
" make_animation(speech_probs, len(tensor) / 16000)\n",
@@ -196,7 +190,9 @@
" \n",
- " \"\"\" % data_url))"
+ " \"\"\" % data_url))\n",
+ "\n",
+ " return speech_probs"
]
},
{
@@ -216,7 +212,7 @@
},
"outputs": [],
"source": [
- "record_make_animation()"
+ "speech_probs = record_make_animation()"
]
}
],
diff --git a/examples/parallel_example.ipynb b/examples/parallel_example.ipynb
index 9704291..2c83c07 100644
--- a/examples/parallel_example.ipynb
+++ b/examples/parallel_example.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -18,17 +17,19 @@
"SAMPLING_RATE = 16000\n",
"import torch\n",
"from pprint import pprint\n",
+ "import time\n",
+ "import shutil\n",
"\n",
"torch.set_num_threads(1)\n",
"NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
"NUM_COPIES=8\n",
"# download wav files, make multiple copies\n",
- "for idx in range(NUM_COPIES):\n",
- " torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
+    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', \"en_example0.wav\")\n",
+ "for idx in range(NUM_COPIES-1):\n",
+    "    shutil.copy(\"en_example0.wav\", f\"en_example{idx+1}.wav\")"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -54,7 +55,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -99,7 +99,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -127,7 +126,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "diarization",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -141,7 +140,20 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.15"
+ "version": "3.10.14"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
}
},
"nbformat": 4,