From a96ae1361608f3dfe54fc6a0ddeea8475e575dad Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 23 Apr 2025 15:40:59 +0800 Subject: [PATCH 1/5] fix instruct2 bug --- cosyvoice/cli/cosyvoice.py | 4 ++-- cosyvoice/cli/frontend.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index fc1ea90..d82f66e 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -177,10 +177,10 @@ class CosyVoice2(CosyVoice): def inference_instruct(self, *args, **kwargs): raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!') - def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True): + def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True): assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!' for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)): - model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate) + model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 99cdb18..36dcd18 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -196,8 +196,8 @@ class CosyVoiceFrontEnd: model_input['prompt_text_len'] = instruct_text_token_len return model_input - def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate): - model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate) + def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id): + model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id) del model_input['llm_prompt_speech_token'] del model_input['llm_prompt_speech_token_len'] return model_input From b4c4d848ca6a6645cb7d35a51266b2e053781369 Mon Sep 17 00:00:00 2001 From: hwangsihu <129564966+hwangsihu@users.noreply.github.com> Date: Thu, 1 May 2025 13:28:15 +0900 Subject: [PATCH 2/5] Reorder requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4166dac..e482020 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,8 @@ conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.29.0 +fastapi==0.115.6 +fastapi-cli==0.0.4 gdown==5.1.0 gradio==5.4.0 grpcio==1.57.0 @@ -34,7 +36,5 @@ torch==2.3.1 torchaudio==2.3.1 transformers==4.40.1 uvicorn==0.30.0 -wget==3.2 -fastapi==0.115.6 -fastapi-cli==0.0.4 WeTextProcessing==1.0.3 +wget==3.2 From 97f0bc61cdcabca1c15834c7b37fcea283c69da5 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 6 May 2025 10:41:55 +0800 Subject: [PATCH 3/5] remove unnecessary file --- test1.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 test1.py diff --git a/test1.py b/test1.py deleted file mode 100644 index a1243e4..0000000 --- a/test1.py +++ /dev/null @@ -1,37 +0,0 @@ -import sys 
-sys.path.append('third_party/Matcha-TTS') -from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 -from cosyvoice.utils.file_utils import load_wav -import torchaudio # type: ignore - -cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False) - -# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference -# zero_shot usage -prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000) -for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) - -# save zero_shot spk for future usage -assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True -for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) -cosyvoice.save_spkinfo() - -# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248 -for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)): - torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) - -# instruct usage -for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)): - torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) - -# bistream usage, you can use generator as input, this is useful when using text llm model as input -# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length -def text_generator(): - yield '收到好友从远方寄来的生日礼物,' - yield '那份意外的惊喜与深深的祝福' - yield '让我心中充满了甜蜜的快乐,' - yield '笑容如花儿般绽放。' -for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) \ No newline at end of file From fbab274b6a9abe127be61825092ca3a241bff19f Mon Sep 17 00:00:00 2001 From: burkliu Date: Fri, 25 Apr 2025 10:31:43 +0800 Subject: [PATCH 4/5] [feature] modify pad to trim Conflicts: cosyvoice/dataset/processor.py --- cosyvoice/dataset/processor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 8424ada..a94eb15 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'): def compute_fbank(data, feat_extractor, + token_mel_ratio=0, mode='train'): """ Extract fbank @@ -174,8 +175,13 @@ def compute_fbank(data, assert 'utt' in sample assert 'text_token' in sample waveform = sample['speech'] - mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) - sample['speech_feat'] = mat + feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) + if token_mel_ratio != 0: + # trim to align speech_token and speech_feat + token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0])) + feat = feat[:token_mel_ratio * token_len] + 
sample["speech_token"] = sample["speech_token"][:token_len] + sample['speech_feat'] = feat yield sample From 3660da4a19de6d0b9596bf5513009c8ae353be99 Mon Sep 17 00:00:00 2001 From: Lsnsh Xin Date: Sat, 24 May 2025 04:17:28 +0800 Subject: [PATCH 5/5] docs(README): set Markdown headings for paragraphs to support quick anchor points --- README.md | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index c7a724d..1afc16a 100644 --- a/README.md +++ b/README.md @@ -49,34 +49,34 @@ ## Install -**Clone and install** +### Clone and install - Clone the repo -``` sh -git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git -# If you failed to clone submodule due to network failures, please run following command until success -cd CosyVoice -git submodule update --init --recursive -``` + ``` sh + git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git + # If you failed to clone submodule due to network failures, please run following command until success + cd CosyVoice + git submodule update --init --recursive + ``` - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html - Create Conda env: -``` sh -conda create -n cosyvoice -y python=3.10 -conda activate cosyvoice -# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. -conda install -y -c conda-forge pynini==2.1.5 -pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + ``` sh + conda create -n cosyvoice -y python=3.10 + conda activate cosyvoice + # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. + conda install -y -c conda-forge pynini==2.1.5 + pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com + + # If you encounter sox compatibility issues + # ubuntu + sudo apt-get install sox libsox-dev + # centos + sudo yum install sox sox-devel + ``` -# If you encounter sox compatibility issues -# ubuntu -sudo apt-get install sox libsox-dev -# centos -sudo yum install sox sox-devel -``` - -**Model download** +### Model download We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource. @@ -111,7 +111,7 @@ pip install ttsfrd_dependency-0.1-py3-none-any.whl pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl ``` -**Basic Usage** +### Basic Usage We strongly recommend using `CosyVoice2-0.5B` for better performance. Follow code below for detailed usage of each model. 
@@ -124,7 +124,7 @@ from cosyvoice.utils.file_utils import load_wav import torchaudio ``` -**CosyVoice2 Usage** +#### CosyVoice2 Usage ```python cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False) @@ -159,7 +159,7 @@ for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你 torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) ``` -**CosyVoice Usage** +#### CosyVoice Usage ```python cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False) # sft usage @@ -189,7 +189,7 @@ for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展 torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) ``` -**Start web demo** +#### Start web demo You can use our web demo page to get familiar with CosyVoice quickly. @@ -200,11 +200,11 @@ Please see the demo website for details. python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M ``` -**Advanced Usage** +#### Advanced Usage For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`. -**Build for deployment** +#### Build for deployment Optionally, if you want service deployment, you can run following steps.
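
Usage sketch for PATCH 1/5: the fix threads `zero_shot_spk_id` from `inference_instruct2` through to `frontend_instruct2`, so an instruct2 call can reuse a speaker registered with `add_zero_shot_spk`. This is a minimal sketch, assuming the cached-speaker path accepts an empty string in place of the prompt audio (mirroring the cached zero-shot call in the removed `test1.py`); the file paths and the speaker id `my_zero_shot_spk` are placeholders.

```python
import sys
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)

# Register a zero-shot speaker once and persist it for later runs.
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
cosyvoice.save_spkinfo()

# With the fix, instruct2 can reference the cached speaker instead of re-sending prompt audio.
for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐。',
                                                    '用四川话说这句话', '',
                                                    zero_shot_spk_id='my_zero_shot_spk', stream=False)):
    torchaudio.save('instruct_spk_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```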
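
PATCH 4/5 switches `compute_fbank` from padding to trimming so that `speech_feat` ends up an exact `token_mel_ratio` multiple of `speech_token`. The following standalone sketch reproduces that arithmetic with made-up tensor shapes; `token_mel_ratio=2` and the 101/51 lengths are example values only.

```python
import torch

# Hypothetical inputs: 101 mel frames, 51 speech tokens, 2 mel frames per token.
token_mel_ratio = 2
feat = torch.randn(101, 80)                       # (num_frames, num_mel_bins)
speech_token = torch.zeros(51, dtype=torch.long)  # (num_tokens,)

# Same trim-to-align step as the patch: bound token_len by whichever side is shorter ...
token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))  # -> 50
# ... then cut both tensors so num_frames == token_mel_ratio * num_tokens.
feat = feat[:token_mel_ratio * token_len]         # 100 frames
speech_token = speech_token[:token_len]           # 50 tokens
assert feat.shape[0] == token_mel_ratio * speech_token.shape[0]
```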