From 3ee6e19c881a1cc0e25ae6cdfe89ccea37c8dfe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 11:43:16 +0800 Subject: [PATCH 1/9] Update webui.py add speed_factor --- webui.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/webui.py b/webui.py index 7fe63ca..03dfca7 100644 --- a/webui.py +++ b/webui.py @@ -28,7 +28,7 @@ import logging logging.getLogger('matplotlib').setLevel(logging.WARNING) from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav +from cosyvoice.utils.file_utils import load_wav,speed_change logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') @@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成 def change_instruction(mode_checkbox_group): return instruct_dict[mode_checkbox_group] -def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed): +def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor): if prompt_wav_upload is not None: prompt_wav = prompt_wav_upload elif prompt_wav_record is not None: @@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro logging.info('get instruct inference request') set_all_random_seed(seed) output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text) - audio_data = output['tts_speech'].numpy().flatten() + + if speed_factor != 1.0: + try: + audio_data , sample_rate = speed_change(output["tts_speech"],target_sr,str(speed_factor)) + audio_data = audio_data.numpy().flatten() + except Exception as e: + print(f"Failed to change speed of audio: \n{e}") + else: + audio_data = output['tts_speech'].numpy().flatten() + return (target_sr, audio_data) def main(): @@ -141,7 +150,7 @@ def main(): gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作") tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。") - + speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="语速调节",value=1.0,interactive=True) with gr.Row(): mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0]) instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5) @@ -162,7 +171,7 @@ def main(): seed_button.click(generate_seed, inputs=[], outputs=seed) generate_button.click(generate_audio, - inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed], + inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor], outputs=[audio_output]) mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text]) demo.queue(max_size=4, default_concurrency_limit=2) From cf43100f66a3583c4cba011a46eb0ed54ffc0e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 11:44:39 +0800 Subject: [PATCH 2/9] Update file_utils.py add speed_change function --- cosyvoice/utils/file_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index 92c448b..7700a9f 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -39,3 +39,15 @@ def load_wav(wav, target_sr): assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) return speech + +def speed_change(waveform,sample_rate,speed_factor:str): + effects = [ + ["tempo",speed_factor], # speed_factor + ["rate", f"{sample_rate}"] + ] + augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( + waveform, + sample_rate, + effects + ) + return augmented_waveform, new_sample_rate From 866207dbf0f1fe0af5e9ca9caea1a6ef7e443c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 11:51:47 +0800 Subject: [PATCH 3/9] Update README.md add speed_change --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c54c473..2a7b735 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ export PYTHONPATH=third_party/Matcha-TTS ``` python from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav +from cosyvoice.utils.file_utils import load_wav,speed_change import torchaudio cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT') @@ -89,6 +89,10 @@ print(cosyvoice.list_avaliable_spks()) output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') torchaudio.save('sft.wav', output['tts_speech'], 22050) +# if you wanna change the speed +audio,sample_rate = speed_change(output["tts_speech"],22050,"1.6") +torchaudio.save('sft_speed_1.6.wav',audio, 22050) + cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M') # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) @@ -156,4 +160,4 @@ You can also scan the QR code to join our official Dingding chat group. 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). ## Disclaimer -The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. \ No newline at end of file +The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. From 9b7c9157606ef59e7e89a37fc5014cf59bdd5ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 11:53:23 +0800 Subject: [PATCH 4/9] Update webui.py add author --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index 03dfca7..d53f418 100644 --- a/webui.py +++ b/webui.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu,Liu Yue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6f6ad85f3091a403bbdd4284ef614656955eff86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 15:58:41 +0800 Subject: [PATCH 5/9] Update README.md --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 2a7b735..4c4fe34 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ export PYTHONPATH=third_party/Matcha-TTS ``` python from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav,speed_change +from cosyvoice.utils.file_utils import load_wav import torchaudio cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT') @@ -89,10 +89,6 @@ print(cosyvoice.list_avaliable_spks()) output = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女') torchaudio.save('sft.wav', output['tts_speech'], 22050) -# if you wanna change the speed -audio,sample_rate = speed_change(output["tts_speech"],22050,"1.6") -torchaudio.save('sft_speed_1.6.wav',audio, 22050) - cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M') # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) From f2939a9a504a5651a6d3d0f124719f11614bafe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 16:02:18 +0800 Subject: [PATCH 6/9] Update file_utils.py fix flake8 --- cosyvoice/utils/file_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index 7700a9f..d4179e1 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -40,14 +40,14 @@ def load_wav(wav, target_sr): speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) return speech -def speed_change(waveform,sample_rate,speed_factor:str): +def speed_change(waveform, sample_rate, speed_factor: str): effects = [ - ["tempo",speed_factor], # speed_factor - ["rate", f"{sample_rate}"] + ["tempo", speed_factor], # speed_factor + ["rate", f"{sample_rate}"] ] augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( - waveform, - sample_rate, + waveform, + sample_rate, effects ) return augmented_waveform, new_sample_rate From cdd9a01a28ecf0861929d5abc47dcda2390682af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 16:05:25 +0800 Subject: [PATCH 7/9] Update webui.py fix flake8 --- webui.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webui.py b/webui.py index d53f418..f145816 100644 --- a/webui.py +++ b/webui.py @@ -28,7 +28,7 @@ import logging logging.getLogger('matplotlib').setLevel(logging.WARNING) from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav,speed_change +from cosyvoice.utils.file_utils import load_wav, speed_change logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') @@ -135,7 +135,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if speed_factor != 1.0: try: - audio_data , sample_rate = speed_change(output["tts_speech"],target_sr,str(speed_factor)) + audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor)) audio_data = audio_data.numpy().flatten() except Exception as e: print(f"Failed to change speed of audio: \n{e}") @@ -171,7 +171,7 @@ def main(): seed_button.click(generate_seed, inputs=[], outputs=seed) generate_button.click(generate_audio, - inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor], + inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor], outputs=[audio_output]) mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text]) demo.queue(max_size=4, default_concurrency_limit=2) From ea160ea59be6ec429b60a187e1de4de7bceaed90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 16:08:36 +0800 Subject: [PATCH 8/9] Update webui.py fix flake8 --- webui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index f145816..af3b658 100644 --- a/webui.py +++ b/webui.py @@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成 def change_instruction(mode_checkbox_group): return instruct_dict[mode_checkbox_group] -def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor): +def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor): if prompt_wav_upload is not None: prompt_wav = prompt_wav_upload elif prompt_wav_record is not None: @@ -150,7 +150,7 @@ def main(): gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作") tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。") - speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="语速调节",value=1.0,interactive=True) + speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True) with gr.Row(): mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0]) instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5) From 4042a65c5b2d7dd543c31148714c30c5e4e3f97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=82=A6?= Date: Mon, 22 Jul 2024 16:09:21 +0800 Subject: [PATCH 9/9] Update webui.py fix flake8 --- webui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webui.py b/webui.py index af3b658..ce90e26 100644 --- a/webui.py +++ b/webui.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu,Liu Yue) +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.