Integration with silero VAD added

Gianpaolo Bontempo
2021-04-30 10:35:46 +00:00
parent 8e4ec7ed6e
commit 1814007ef5
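
The change removes the DeepSpeech model and its transcription loop (along with the --model, --scorer, and --keyboard options) and chains two voice activity detectors instead: webrtcvad segments the microphone stream into candidate utterances, and each completed segment is normalized to a float32 tensor (Int2Float) and passed to silero's get_speech_ts, which either confirms the segment as speech (optionally saving it to .wav) or reports it as noise. A minimal standalone sketch of the silero check follows the diff below.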

@@ -1,13 +1,14 @@
 import time, logging
 from datetime import datetime
 import threading, collections, queue, os, os.path
-import deepspeech
 import numpy as np
 import pyaudio
 import wave
 import webrtcvad
 from halo import Halo
 from scipy import signal
+import torch
+import torchaudio

 logging.basicConfig(level=20)
@@ -152,18 +153,9 @@ class VADAudio(Audio):
                     ring_buffer.clear()

 def main(ARGS):
-    # Load DeepSpeech model
-    if os.path.isdir(ARGS.model):
-        model_dir = ARGS.model
-        ARGS.model = os.path.join(model_dir, 'output_graph.pb')
-        ARGS.scorer = os.path.join(model_dir, ARGS.scorer)
-
-    print('Initializing model...')
-    logging.info("ARGS.model: %s", ARGS.model)
-    model = deepspeech.Model(ARGS.model)
-    if ARGS.scorer:
-        logging.info("ARGS.scorer: %s", ARGS.scorer)
-        model.enableExternalScorer(ARGS.scorer)
-
     # Start audio with VAD
     vad_audio = VADAudio(aggressiveness=ARGS.vad_aggressiveness,
@@ -173,36 +165,56 @@ def main(ARGS):
     print("Listening (ctrl-C to exit)...")
     frames = vad_audio.vad_collector()

+    # Load silero VAD from torch.hub
+    torchaudio.set_audio_backend("soundfile")
+    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                  model='silero_vad',
+                                  force_reload=True)
+    (get_speech_ts, get_speech_ts_adaptive, _, read_audio, _, _, _) = utils
+
     # Stream from microphone to DeepSpeech using VAD
     spinner = None
     if not ARGS.nospinner:
         spinner = Halo(spinner='line')
-    stream_context = model.createStream()
     wav_data = bytearray()
     for frame in frames:
         if frame is not None:
             if spinner: spinner.start()
             logging.debug("streaming frame")
-            stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
-            if ARGS.savewav: wav_data.extend(frame)
+            wav_data.extend(frame)
         else:
             if spinner: spinner.stop()
-            logging.debug("end utterence")
-            if ARGS.savewav:
-                vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
-                wav_data = bytearray()
-            text = stream_context.finishStream()
-            print("Recognized: %s" % text)
-            if ARGS.keyboard:
-                from pyautogui import typewrite
-                typewrite(text)
-            stream_context = model.createStream()
+            print("webRTC has detected a possible speech")
+            # Run the collected utterance through silero VAD to confirm it is speech
+            newsound = np.frombuffer(wav_data, np.int16)
+            audio_float32 = Int2Float(newsound)
+            time_stamps = get_speech_ts(audio_float32, model, num_steps=4)
+            if len(time_stamps) > 0:
+                print("silero VAD has detected a possible speech")
+                if ARGS.savewav:
+                    vad_audio.write_wav(os.path.join(ARGS.savewav, datetime.now().strftime("savewav_%Y-%m-%d_%H-%M-%S_%f.wav")), wav_data)
+            else:
+                print("silero VAD has detected a noise")
+            print()
+            wav_data = bytearray()
+
+def Int2Float(sound):
+    # Convert int16 PCM samples to a float32 tensor normalized to [-1, 1]
+    _sound = np.copy(sound)
+    abs_max = np.abs(_sound).max()
+    _sound = _sound.astype('float32')
+    if abs_max > 0:
+        _sound *= 1 / abs_max
+    audio_float32 = torch.from_numpy(_sound.squeeze())
+    return audio_float32
+
 if __name__ == '__main__':
     DEFAULT_SAMPLE_RATE = 16000

     import argparse
-    parser = argparse.ArgumentParser(description="Stream from microphone to DeepSpeech using VAD")
+    parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")
     parser.add_argument('-v', '--vad_aggressiveness', type=int, default=3,
                         help="Set aggressiveness of VAD: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
@@ -212,17 +224,10 @@ if __name__ == '__main__':
                         help="Save .wav files of utterences to given directory")
     parser.add_argument('-f', '--file',
                         help="Read from .wav file instead of microphone")
-    parser.add_argument('-m', '--model', required=True,
-                        help="Path to the model (protocol buffer binary file, or entire directory containing all standard-named files for model)")
-    parser.add_argument('-s', '--scorer',
-                        help="Path to the external scorer file.")
     parser.add_argument('-d', '--device', type=int, default=None,
                         help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
     parser.add_argument('-r', '--rate', type=int, default=DEFAULT_SAMPLE_RATE,
                         help=f"Input device sample rate. Default: {DEFAULT_SAMPLE_RATE}. Your device may require 44100.")
-    parser.add_argument('-k', '--keyboard', action='store_true',
-                        help="Type output through system keyboard")

     ARGS = parser.parse_args()
     if ARGS.savewav: os.makedirs(ARGS.savewav, exist_ok=True)
     main(ARGS)
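
For reference, a minimal standalone sketch of the second-stage silero check used above, lifted out of the streaming loop. It assumes the same early-2021 torch.hub API the commit uses (a 7-tuple of utils, with get_speech_ts taking a num_steps argument) and a 16 kHz mono int16 recording; the file name utterance.wav is a placeholder:

import numpy as np
import torch
import torchaudio
import wave

# Load silero VAD exactly as the commit does
torchaudio.set_audio_backend("soundfile")
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)
(get_speech_ts, get_speech_ts_adaptive, _, read_audio, _, _, _) = utils

# Read a 16 kHz mono int16 recording (placeholder path)
with wave.open('utterance.wav', 'rb') as w:
    pcm = np.frombuffer(w.readframes(w.getnframes()), np.int16)

# Same normalization as Int2Float above: scale int16 peaks to [-1, 1]
audio_float32 = pcm.astype('float32')
peak = np.abs(audio_float32).max()
if peak > 0:
    audio_float32 /= peak
audio = torch.from_numpy(audio_float32)

# A non-empty timestamp list means silero considers the clip speech
time_stamps = get_speech_ts(audio, model, num_steps=4)
print("speech" if len(time_stamps) > 0 else "noise")

To try the modified script itself, run it with the flags the diff keeps, e.g. python mic_vad_streaming.py -v 3 --savewav wavs (the script file name is hypothetical; it is not shown in this commit view).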