README.md updated and code cleaned up

Gianpaolo Bontempo
2021-04-30 23:17:37 +00:00
parent 0671c576cb
commit 9a5f8cc9e0
2 changed files with 64 additions and 74 deletions


@@ -1,6 +1,28 @@
This example integrates the microphone stream with the webRTC VAD and the silero VAD. I used [this](https://github.com/mozilla/DeepSpeech-examples/tree/r0.8/mic_vad_streaming) as a starting point.
Here is a short video presenting the results:
https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4
# Requirements:
The libraries used in this example are:
```
Python == 3.6.9
webrtcvad >= 2.0.10
torchaudio >= 0.8.1
torch >= 1.8.1
halo >= 0.0.31
soundfile >= 0.13.3
```
Using pip3:
```
pip3 install webrtcvad
pip3 install torchaudio
pip3 install torch
pip3 install halo
pip3 install soundfile
```
Moreover, to keep the code simple, the default sample_rate is 16 kHz and no resampling is performed.
This example has been tested on ```Ubuntu 18.04.3 LTS```.
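Since the code does not resample, a microphone that cannot record at 16 kHz directly would need its audio converted before it reaches the two VADs. Below is a minimal sketch of such a conversion with torchaudio; the 44100 Hz capture rate and the one-second dummy buffer are assumptions for illustration, not part of the example.
```
import numpy as np
import torch
import torchaudio

# Hypothetical input: a block of int16 PCM captured at 44100 Hz (illustrative only).
input_rate = 44100
pcm_bytes = b"\x00\x00" * input_rate          # one second of silence as a stand-in

# Convert bytes -> float32 tensor in [-1, 1], then resample to the 16 kHz the VADs expect.
pcm_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
waveform = torch.from_numpy(pcm_int16.astype(np.float32) / 32768.0).unsqueeze(0)
resampler = torchaudio.transforms.Resample(orig_freq=input_rate, new_freq=16000)
resampled = resampler(waveform)               # shape: (1, num_samples_at_16kHz)
print(resampled.shape)
```
Note that webrtcvad itself still expects 16-bit PCM frames, so a resampled float tensor would have to be converted back to int16 bytes before being fed to it.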


@@ -1,17 +1,11 @@
import time, logging
from datetime import datetime
import threading, collections, queue, os, os.path
import collections, queue
import numpy as np
import pyaudio
import wave
import webrtcvad
from halo import Halo
from scipy import signal
import torch
import torchaudio
logging.basicConfig(level=20)
class Audio(object):
"""Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
@@ -21,11 +15,9 @@ class Audio(object):
CHANNELS = 1
BLOCKS_PER_SECOND = 50
def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS, file=None):
def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS):
def proxy_callback(in_data, frame_count, time_info, status):
#pylint: disable=unused-argument
if self.chunk is not None:
in_data = self.wf.readframes(self.chunk)
callback(in_data)
return (None, pyaudio.paContinue)
if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
@@ -50,34 +42,10 @@ class Audio(object):
# if not default device
if self.device:
kwargs['input_device_index'] = self.device
elif file is not None:
self.chunk = 320
self.wf = wave.open(file, 'rb')
self.stream = self.pa.open(**kwargs)
self.stream.start_stream()
def resample(self, data, input_rate):
"""
Microphone may not support our native processing sampling rate, so
resample from input_rate to RATE_PROCESS here for webrtcvad and
deepspeech
Args:
data (binary): Input audio stream
input_rate (int): Input audio rate to resample from
"""
data16 = np.fromstring(string=data, dtype=np.int16)
resample_size = int(len(data16) / self.input_rate * self.RATE_PROCESS)
resample = signal.resample(data16, resample_size)
resample16 = np.array(resample, dtype=np.int16)
return resample16.tostring()
def read_resampled(self):
"""Return a block of audio data resampled to 16000hz, blocking if necessary."""
return self.resample(data=self.buffer_queue.get(),
input_rate=self.input_rate)
def read(self):
"""Return a block of audio data, blocking if necessary."""
return self.buffer_queue.get()
@@ -89,23 +57,12 @@ class Audio(object):
frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)
def write_wav(self, filename, data):
logging.info("write wav %s", filename)
wf = wave.open(filename, 'wb')
wf.setnchannels(self.CHANNELS)
# wf.setsampwidth(self.pa.get_sample_size(FORMAT))
assert self.FORMAT == pyaudio.paInt16
wf.setsampwidth(2)
wf.setframerate(self.sample_rate)
wf.writeframes(data)
wf.close()
class VADAudio(Audio):
"""Filter & segment audio with voice activity detection."""
def __init__(self, aggressiveness=3, device=None, input_rate=None, file=None):
super().__init__(device=device, input_rate=input_rate, file=file)
def __init__(self, aggressiveness=3, device=None, input_rate=None):
super().__init__(device=device, input_rate=input_rate)
self.vad = webrtcvad.Vad(aggressiveness)
def frame_generator(self):
@@ -114,8 +71,7 @@ class VADAudio(Audio):
while True:
yield self.read()
else:
while True:
yield self.read_resampled()
raise Exception("Resampling required")
def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
"""Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
@@ -153,24 +109,20 @@ class VADAudio(Audio):
ring_buffer.clear()
def main(ARGS):
# Start audio with VAD
vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
device=ARGS.device,
input_rate=ARGS.rate,
file=ARGS.file)
input_rate=ARGS.rate)
print("Listening (ctrl-C to exit)...")
frames = vad_audio.vad_collector()
# load silero VAD
torchaudio.set_audio_backend("soundfile")
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
model='silero_vad',
force_reload=True)
(get_speech_ts,get_speech_ts_adaptive,_, read_audio,_, _, _) = utils
model=ARGS.silaro_model_name,
force_reload= ARGS.reload)
(get_speech_ts,_,_, _,_, _, _) = utils
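As an aside, the model returned by torch.hub.load can be sanity-checked offline, independently of the microphone loop. The sketch below is hypothetical: 'speech_sample.wav' is a placeholder for any 16 kHz mono file, and num_steps=4 simply mirrors the value used by the previous version of this script.
```
# Hypothetical offline check of the silero VAD model loaded above (not part of this script).
wav, sr = torchaudio.load('speech_sample.wav')   # placeholder file, assumed 16 kHz mono
assert sr == 16000
speech_timestamps = get_speech_ts(wav.squeeze(0), model, num_steps=4)
print(speech_timestamps)   # list of {'start': ..., 'end': ...} positions in samples
```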
# Stream from microphone through the webRTC VAD; each collected utterance is then checked with the silero VAD
@@ -182,7 +134,6 @@ def main(ARGS):
if frame is not None:
if spinner: spinner.start()
logging.debug("streaming frame")
wav_data.extend(frame)
else:
if spinner: spinner.stop()
@@ -190,11 +141,12 @@ def main(ARGS):
newsound= np.frombuffer(wav_data,np.int16)
audio_float32=Int2Float(newsound)
time_stamps =get_speech_ts(audio_float32, model,num_steps=4)
time_stamps =get_speech_ts(audio_float32, model,num_steps=ARGS.num_steps,trig_sum=ARGS.trig_sum,neg_trig_sum=ARGS.neg_trig_sum,
num_samples_per_window=ARGS.num_samples_per_window,min_speech_samples=ARGS.min_speech_samples,
min_silence_samples=ARGS.min_silence_samples)
if(len(time_stamps)>0):
print("silero VAD has detected a possible speech")
if ARGS.savewav:
vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
else:
print("silero VAD has detected a noise")
print()
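Int2Float is not defined in the hunks shown here; its role is to turn the int16 buffer produced by np.frombuffer into the normalized float32 tensor that get_speech_ts expects. The helper below is a sketch of that conversion under this assumption, not necessarily the exact implementation in the file.
```
def Int2Float(sound):
    """Convert an int16 numpy buffer into a float32 torch tensor scaled to roughly [-1, 1]."""
    abs_max = np.abs(sound).max()
    audio_float32 = sound.astype('float32')
    if abs_max > 0:
        audio_float32 *= 1 / abs_max      # peak-normalize the chunk
    return torch.from_numpy(audio_float32.squeeze())
```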
@@ -216,18 +168,34 @@ if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")
parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3,
help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
parser.add_argument('--nospinner', action='store_true',
help="Disable spinner")
parser.add_argument('-w', '--savewav',
help="Save .wav files of utterences to given directory")
parser.add_argument('-f', '--file',
help="Read from .wav file instead of microphone")
parser.add_argument('-d', '--device', type=int, default=None,
help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad",
help="select the name of the model. You can select between 'silero_vad',''silero_vad_micro','silero_vad_micro_8k','silero_vad_mini','silero_vad_mini_8k'")
parser.add_argument('--reload', action='store_true', help="download the latest version of the silero VAD")
parser.add_argument('-ts', '--trig_sum', type=float, default=0.25,
help="overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state)")
parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07,
help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
parser.add_argument('-N', '--num_steps', type=int, default=8,
help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000,
help="minimum speech chunk duration in samples")
parser.add_argument('-msis', '--min_silence_samples', type=int, default=500,
help=" minimum silence duration in samples between to separate speech chunks")
ARGS = parser.parse_args()
if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
ARGS.rate=DEFAULT_SAMPLE_RATE
main(ARGS)
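For intuition, the sample-based defaults above can be translated into durations at the fixed 16 kHz processing rate; the short arithmetic sketch below is only illustrative and not part of the script.
```
SAMPLE_RATE = 16000  # the fixed processing rate used by this example

defaults = {
    'num_samples_per_window': 4000,   # window fed to the silero model
    'min_speech_samples': 10000,      # shortest chunk reported as speech
    'min_silence_samples': 500,       # shortest gap separating two chunks
}
for name, samples in defaults.items():
    print(f"{name}: {samples} samples = {1000 * samples / SAMPLE_RATE:.1f} ms")
# num_samples_per_window: 4000 samples = 250.0 ms
# min_speech_samples: 10000 samples = 625.0 ms
# min_silence_samples: 500 samples = 31.2 ms
```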