diff --git a/tools/extract_embedding.py b/tools/extract_embedding.py
index cb198cb..de48779 100755
--- a/tools/extract_embedding.py
+++ b/tools/extract_embedding.py
@@ -26,9 +26,9 @@ def single_job(utt):
     if sample_rate != 16000:
         audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
     feat = kaldi.fbank(audio,
-                      num_mel_bins=80,
-                      dither=0,
-                      sample_frequency=16000)
+                       num_mel_bins=80,
+                       dither=0,
+                       sample_frequency=16000)
     feat = feat - feat.mean(dim=0, keepdim=True)
     embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
     return utt, embedding
diff --git a/tools/extract_speech_token.py b/tools/extract_speech_token.py
index 2829624..26aa296 100755
--- a/tools/extract_speech_token.py
+++ b/tools/extract_speech_token.py
@@ -33,7 +33,7 @@ def single_job(utt):
     else:
         feat = whisper.log_mel_spectrogram(audio, n_mels=128)
     speech_token = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
-                                         ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
+                                          ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
     return utt, speech_token