mirror of
https://github.com/snakers4/silero-vad.git
synced 2026-02-05 01:49:22 +08:00
272 lines
37 KiB
Plaintext
272 lines
37 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "62a0cccb",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Pyaudio Microphone Streaming Examples\n",
|
|
"\n",
|
|
"A simple notebook that uses PyAudio to capture microphone audio and then feeds that audio to Silero VAD.\n",
|
|
"\n",
|
|
"I created it as an example of how binary data from a stream can be fed into Silero VAD.\n",
|
|
"\n",
|
|
"\n",
|
|
"It has been tested on Ubuntu 21.04 (x86). After you have installed the dependencies below, no additional setup is required."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "64cbe1eb",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Dependencies\n",
|
|
"The cell below lists all dependencies and the versions used. Uncomment the lines to install them from within the notebook."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "57bc2aac",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#!pip install numpy==1.20.2\n",
|
|
"#!pip install torch==1.8.1\n",
|
|
"#!pip install matplotlib==3.4.2\n",
|
|
"#!pip install torchaudio==0.8.1\n",
|
|
"#!pip install soundfile==0.10.3.post1\n",
|
|
"#!pip install pyaudio==0.2.11"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "110de761",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Imports"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "5a647d8d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import io\n",
|
|
"import numpy as np\n",
|
|
"import torch\n",
|
|
"torch.set_num_threads(1)\n",
|
|
"import torchaudio\n",
|
|
"import matplotlib\n",
|
|
"import matplotlib.pylab as plt\n",
|
|
"torchaudio.set_audio_backend(\"soundfile\")\n",
|
|
"import pyaudio"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "725d7066",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Downloading: \"https://github.com/snakers4/silero-vad/archive/master.zip\" to /home/kaik/.cache/torch/hub/master.zip\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
|
|
" model='silero_vad',\n",
|
|
" force_reload=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "1c0b2ea7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"(get_speech_ts,\n",
|
|
" get_speech_ts_adaptive,\n",
|
|
" save_audio,\n",
|
|
" read_audio,\n",
|
|
" state_generator,\n",
|
|
" single_audio_stream,\n",
|
|
" collect_chunks) = utils"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f9112603",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Helper Methods"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "5abc6330",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Taken from utils_vad.py\n",
|
|
"def validate(model,\n",
|
|
" inputs: torch.Tensor):\n",
|
|
" with torch.no_grad():\n",
|
|
" outs = model(inputs)\n",
|
|
" return outs\n",
|
|
"\n",
|
|
"# Provided by Alexander Veysov\n",
|
|
"def int2float(sound):\n",
|
|
" abs_max = np.abs(sound).max()\n",
|
|
" sound = sound.astype('float32')\n",
|
|
" if abs_max > 0:\n",
|
|
" sound *= 1/abs_max\n",
|
|
" sound = sound.squeeze() # depends on the use case\n",
|
|
" return sound"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5124095e",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Pyaudio Set-up"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "a845356e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"FORMAT = pyaudio.paInt16\n",
|
|
"CHANNELS = 1\n",
|
|
"SAMPLE_RATE = 16000\n",
|
|
"CHUNK = int(SAMPLE_RATE / 10)\n",
|
|
"\n",
|
|
"audio = pyaudio.PyAudio()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0b910c99",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Simple Example\n",
|
|
"The following example reads audio from the microphone in 250 ms chunks, converts each chunk to a PyTorch tensor, and gets the model's probability/confidence that the frame is voiced."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "9d3d2c10",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Configure how long you want to record the audio\n",
|
|
"frames_to_record = 20 # frames_to_record * frame_duration_ms = recording duration\n",
|
|
"frame_duration_ms = 250"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "3cb44a4a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Started Recording\n",
|
|
"Stopped the recording\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 1440x432 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"stream = audio.open(format=FORMAT,\n",
|
|
" channels=CHANNELS,\n",
|
|
" rate=SAMPLE_RATE,\n",
|
|
" input=True,\n",
|
|
" frames_per_buffer=CHUNK)\n",
|
|
"data = []\n",
|
|
"voiced_confidences = []\n",
|
|
"\n",
|
|
"print(\"Started Recording\")\n",
|
|
"for i in range(0, frames_to_record):\n",
|
|
" \n",
|
|
" audio_chunk = stream.read(int(SAMPLE_RATE * frame_duration_ms / 1000.0))\n",
|
|
" \n",
|
|
" # in case you want to save the audio later\n",
|
|
" data.append(audio_chunk)\n",
|
|
" \n",
|
|
" audio_int16 = np.frombuffer(audio_chunk, np.int16);\n",
|
|
"\n",
|
|
" audio_float32 = int2float(audio_int16)\n",
|
|
" \n",
|
|
" # get the confidences and add them to the list to plot them later\n",
|
|
" vad_outs = validate(model, torch.from_numpy(audio_float32))\n",
|
|
" # only keep the confidence for the speech\n",
|
|
" voiced_confidences.append(vad_outs[:,1])\n",
|
|
" \n",
|
|
"print(\"Stopped the recording\")\n",
|
|
"\n",
|
|
"# plot the confidences for the speech\n",
|
|
"plt.figure(figsize=(20,6))\n",
|
|
"plt.plot(voiced_confidences)\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "430a343e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|