Mirror of https://github.com/snakers4/silero-vad.git (synced 2026-02-05 18:09:22 +08:00)
Integration with Silero VAD added
@@ -1,13 +1,14 @@
 import time, logging
 from datetime import datetime
 import threading, collections, queue, os, os.path
-import deepspeech
 import numpy as np
 import pyaudio
 import wave
 import webrtcvad
 from halo import Halo
 from scipy import signal
+import torch
+import torchaudio
 
 logging.basicConfig(level=20)
 
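The import hunk swaps the ASR dependency for the VAD one: `deepspeech` goes away, and `torch` plus `torchaudio` come in so the pretrained Silero VAD model can be pulled from torch.hub. A minimal sketch of just the loader, using the same calls this commit adds inside main() below (the model is cached under torch.hub's cache directory after the first download):

import torch

# Same loader call the commit adds inside main(); force_reload=True
# re-downloads the repo instead of reusing the torch.hub cache.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)
(get_speech_ts, get_speech_ts_adaptive, _, read_audio, _, _, _) = utils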
@@ -152,18 +153,9 @@ class VADAudio(Audio):
                     ring_buffer.clear()
 
 def main(ARGS):
-    # Load DeepSpeech model
-    if os.path.isdir(ARGS.model):
-        model_dir = ARGS.model
-        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
-        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)
-
-    print('Initializing model...')
-    logging.info("ARGS.model: %s", ARGS.model)
-    model = deepspeech.Model(ARGS.model)
-    if ARGS.scorer:
-        logging.info("ARGS.scorer: %s", ARGS.scorer)
-        model.enableExternalScorer(ARGS.scorer)
-
     # Start audio with VAD
     vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
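With the DeepSpeech model loading removed, main() now goes straight to constructing VADAudio, the webrtcvad-based segmenter that survives as the first pass of the pipeline; the name `model` is reused later for the Silero jit model. A minimal sketch of the frame-level check webrtcvad provides underneath VADAudio (the all-zero frame is illustrative data, not from the diff):

import webrtcvad

vad = webrtcvad.Vad(3)            # aggressiveness 0 (least strict) to 3 (most strict)
sample_rate = 16000
frame = b'\x00\x00' * 480         # 30 ms of silence: 480 samples of 16-bit mono PCM
print(vad.is_speech(frame, sample_rate))  # pure silence should come back False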
@@ -173,36 +165,56 @@ def main(ARGS):
     print("Listening (ctrl-C to exit)...")
     frames = vad_audio.vad_collector()
 
+    # load silero VAD
+    torchaudio.set_audio_backend("soundfile")
+    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                  model='silero_vad',
+                                  force_reload=True)
+    (get_speech_ts,get_speech_ts_adaptive,_, read_audio,_, _, _) = utils
+
 
     # Stream from microphone to DeepSpeech using VAD
     spinner = None
     if not ARGS.nospinner:
         spinner = Halo(spinner='line')
-    stream_context = model.createStream()
     wav_data = bytearray()
     for frame in frames:
         if frame is not None:
             if spinner: spinner.start()
 
             logging.debug("streaming frame")
-            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
-            if ARGS.savewav: wav_data.extend(frame)
+            wav_data.extend(frame)
         else:
             if spinner: spinner.stop()
-            logging.debug("end utterence")
-            if ARGS.savewav:
-                vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
-                wav_data = bytearray()
-            text = stream_context.finishStream()
-            print("Recognized: %s" % text)
-            if ARGS.keyboard:
-                from pyautogui import typewrite
-                typewrite(text)
-            stream_context = model.createStream()
+            print("webRTC has detected a possible speech")
+            newsound= np.frombuffer(wav_data,np.int16)
+            audio_float32=Int2Float(newsound)
+            time_stamps =get_speech_ts(audio_float32, model,num_steps=4)
+            if(len(time_stamps)>0):
+                print("silero VAD has detected a possible speech")
+                if ARGS.savewav:
+                    vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
+            else:
+                print("silero VAD has detected a noise")
+            print()
+            wav_data = bytearray()
+
+
+def Int2Float(sound):
+    _sound = np.copy(sound) #
+    abs_max = np.abs(_sound).max()
+    _sound = _sound.astype('float32')
+    if abs_max > 0:
+        _sound *= 1/abs_max
+    audio_float32 = torch.from_numpy(_sound.squeeze())
+    return audio_float32
+
 
 if __name__ == '__main__':
     DEFAULT_SAMPLE_RATE = 16000
 
     import argparse
-    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
+    parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")
 
     parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                         help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
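The rewritten else: branch turns the loop into a two-stage detector: webrtcvad only delimits a candidate utterance, the buffered bytes are converted with Int2Float, and Silero's get_speech_ts re-scores the whole utterance, so a clip is saved only when at least one speech span is found. Int2Float itself is a peak normalization from int16 PCM to the float32 tensor the model expects; a self-contained sketch of that conversion (the sine tone is synthetic stand-in data for wav_data):

import numpy as np
import torch

def Int2Float(sound):
    # Peak-normalize int16 PCM into a float32 torch tensor in [-1.0, 1.0],
    # mirroring the helper the commit adds.
    _sound = np.copy(sound)
    abs_max = np.abs(_sound).max()
    _sound = _sound.astype('float32')
    if abs_max > 0:
        _sound *= 1/abs_max
    return torch.from_numpy(_sound.squeeze())

# One second of a 440 Hz tone as int16 PCM, standing in for the mic buffer.
t = np.linspace(0, 1, 16000, endpoint=False)
pcm = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
audio_float32 = Int2Float(pcm)
print(audio_float32.dtype, float(audio_float32.abs().max()))  # torch.float32, peak ~1.0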
@@ -212,17 +224,10 @@ if __name__ == '__main__':
                         help="Save .wav files of utterences to given directory")
     parser.add_argument('-f', '--file',
                         help="Read from .wav file instead of microphone")
-
-    parser.add_argument('-m', '--model', required=True,
-                        help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
-    parser.add_argument('-s', '--scorer',
-                        help="Path to the external scorer file.")
     parser.add_argument('-d', '--device', type=int, default=None,
                         help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
     parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
                         help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
-    parser.add_argument('-k', '--keyboard', action='store_true',
-                        help="Type output through system keyboard")
 
     ARGS = parser.parse_args()
     if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
     main(ARGS)
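With the DeepSpeech flags (-m/--model, -s/--scorer, -k/--keyboard) dropped, only audio-side options remain. Because read_audio is unpacked from the same utils tuple, clips saved via --savewav can be re-checked offline with the identical model and settings. A hypothetical sketch (the wav filename is an invented example of the savewav_... pattern; num_steps=4 mirrors the streaming code, and the 7-tuple unpacking matches the silero-vad version this commit targets):

import torch

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
(get_speech_ts, get_speech_ts_adaptive, _, read_audio, _, _, _) = utils

# Re-score a clip that the streaming script saved earlier.
wav = read_audio('savedwavs/savewav_2021-01-01_00-00-00_000000.wav')
time_stamps = get_speech_ts(wav, model, num_steps=4)
print('speech detected' if len(time_stamps) > 0 else 'noise')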