From cde3cec6fa2eeda06b4ee1132d9342fa19e348e7 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 10 Dec 2024 14:50:20 +0800 Subject: [PATCH 01/10] add qwen lm --- cosyvoice/llm/llm.py | 125 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index cf9c231..f06b23a 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -15,6 +15,7 @@ from typing import Dict, Optional, Callable, List, Generator import torch from torch import nn import torch.nn.functional as F +from transformers import Qwen2ForCausalLM from torch.nn.utils.rnn import pad_sequence, unpad_sequence from cosyvoice.utils.common import IGNORE_ID from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss @@ -213,3 +214,127 @@ class TransformerLM(torch.nn.Module): out_tokens.append(top_ids) offset += lm_input.size(1) lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + + +class Qwen2Encoder(torch.nn.Module): + def __init__(self, pretrain_path): + super().__init__() + self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path) + + def forward_one_step(self, xs, masks, cache=None): + input_masks = masks[:, -1, :] + outs = self.model( + inputs_embeds=xs, + attention_mask=input_masks, + output_hidden_states=True, + return_dict=True, + use_cache=True, + past_key_values=cache, + ) + xs = outs.hidden_states[-1] + new_cache = outs.past_key_values + return xs, new_cache + + +class Qwen2LM(torch.nn.Module): + def __init__( + self, + llm_input_size: int, + llm_output_size: int, + speech_token_size: int, + llm: torch.nn.Module, + sampling: Callable, + length_normalized_loss: bool = True, + lsm_weight: float = 0.0, + ): + super().__init__() + self.llm_input_size = llm_input_size + self.llm_output_size = llm_output_size + self.speech_token_size = speech_token_size + + # 2. build speech token language model related modules + self.sos_eos = 0 + self.task_id = 1 + self.fill_token = 2 + + self.llm_embedding = torch.nn.Embedding(2, llm_input_size) + self.llm = llm + self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3) + self.criterion_ce = LabelSmoothingLoss( + size=speech_token_size + 3, + padding_idx=IGNORE_ID, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + + # 3. [Optional] build speech token related modules + self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size) + + # 4. sampling method + self.sampling = sampling + + def sampling_ids( + self, + weighted_scores: torch.Tensor, + decoded_tokens: List, + sampling: int, + ignore_eos: bool = True, + ): + while True: + top_ids = self.sampling(weighted_scores, decoded_tokens, sampling) + if (not ignore_eos) or (self.speech_token_size not in top_ids): + break + return top_ids + + @torch.inference_mode() + def inference( + self, + text: torch.Tensor, + text_len: torch.Tensor, + prompt_text: torch.Tensor, + prompt_text_len: torch.Tensor, + prompt_speech_token: torch.Tensor, + prompt_speech_token_len: torch.Tensor, + embedding: torch.Tensor, + sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + ) -> Generator[torch.Tensor, None, None]: + device = text.device + text = torch.concat([prompt_text, text], dim=1) + text_len += prompt_text_len + text = self.llm.model.model.embed_tokens(text) + + # 2. encode embedding + embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device) + + # 3. 
concat llm_input + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device) + lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1) + + # 4. cal min/max_length + min_len = int((text_len - prompt_text_len) * min_token_text_ratio) + max_len = int((text_len - prompt_text_len) * max_token_text_ratio) + + # 5. step by step decode + out_tokens = [] + cache = None + for i in range(max_len): + y_pred, cache = self.llm.forward_one_step(lm_input, + masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool), + cache=cache) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item() + if top_ids == self.speech_token_size: + break + if top_ids > self.speech_token_size: + continue + # in stream mode, yield token one by one + yield top_ids + out_tokens.append(top_ids) + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) \ No newline at end of file From 3e381002d780d1a6eb017402c9662f277528b695 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 11 Dec 2024 16:14:19 +0800 Subject: [PATCH 02/10] add cosyvoice2 --- .gitignore | 3 +- cosyvoice/cli/cosyvoice.py | 34 ++++- cosyvoice/cli/model.py | 178 ++++++++++++++++++++++++- cosyvoice/flow/decoder.py | 100 ++++++++++++-- cosyvoice/flow/flow.py | 80 +++++++++++ cosyvoice/flow/flow_matching.py | 60 +++++++-- cosyvoice/tokenizer/tokenizer.py | 36 +++++ cosyvoice/transformer/encoder_layer.py | 14 +- cosyvoice/utils/common.py | 11 ++ 9 files changed, 484 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 12b53ef..4a9f23f 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,5 @@ compile_commands.json *.pt pretrained_models/* *_pb2_grpc.py -*_pb2.py \ No newline at end of file +*_pb2.py +*.tar \ No newline at end of file diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index bcade73..d4bea1e 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -18,7 +18,7 @@ from hyperpyyaml import load_hyperpyyaml from modelscope import snapshot_download import torch from cosyvoice.cli.frontend import CosyVoiceFrontEnd -from cosyvoice.cli.model import CosyVoiceModel +from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model from cosyvoice.utils.file_utils import logging @@ -118,3 +118,35 @@ class CosyVoice: logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() + +class CosyVoice2(CosyVoice): + + def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True): + instruct = True if '-Instruct' in model_dir else False + self.model_dir = model_dir + if not os.path.exists(model_dir): + model_dir = snapshot_download(model_dir) + with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: + configs = load_hyperpyyaml(f) + self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], + configs['feat_extractor'], + '{}/campplus.onnx'.format(model_dir), + '{}/speech_tokenizer_v2.onnx'.format(model_dir), + '{}/spk2info.pt'.format(model_dir), + instruct, + configs['allowed_special']) + if 
torch.cuda.is_available() is False and (fp16 is True or load_jit is True): + load_jit = False + fp16 = False + logging.warning('cpu do not support fp16 and jit, force set to False') + self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16) + self.model.load('{}/llm.pt'.format(model_dir), + '{}/flow.pt'.format(model_dir), + '{}/hift.pt'.format(model_dir)) + if load_jit: + self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir), + '{}/llm.llm.fp16.zip'.format(model_dir), + '{}/flow.encoder.fp32.zip'.format(model_dir)) + if load_onnx: + self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) + del configs \ No newline at end of file diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 1fcc31f..ce53f04 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -57,15 +57,15 @@ class CosyVoiceModel: self.hift_cache_dict = {} def load(self, llm_model, flow_model, hift_model): - self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False) + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) self.llm.to(self.device).eval() if self.fp16 is True: self.llm.half() - self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=False) + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() # in case hift_model is a hifigan model hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} - self.hift.load_state_dict(hift_state_dict, strict=False) + self.hift.load_state_dict(hift_state_dict, strict=True) self.hift.to(self.device).eval() def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model): @@ -254,3 +254,175 @@ class CosyVoiceModel: self.llm_end_dict.pop(this_uuid) self.mel_overlap_dict.pop(this_uuid) self.hift_cache_dict.pop(this_uuid) + + +class CosyVoice2Model: + + def __init__(self, + llm: torch.nn.Module, + flow: torch.nn.Module, + hift: torch.nn.Module, + fp16: bool): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.llm = llm + self.flow = flow + self.hift = hift + self.fp16 = fp16 + self.token_min_hop_len = 1 * self.flow.input_frame_rate + self.token_max_hop_len = 2 * self.flow.input_frame_rate + self.token_right_context = self.flow.encoder.pre_lookahead_layer.pre_lookahead_len + # hift cache + self.mel_cache_len = 8 + self.source_cache_len = int(self.mel_cache_len * 480) + # speech fade in out + self.speech_window = np.hamming(2 * self.source_cache_len) + # rtf and decoding related + self.stream_scale_factor = 1 + assert self.stream_scale_factor == 1, 'fix stream_scale_factor to 1 as we haven\'t implement cache in flow matching yet, this constraint will be loosen in the future' + self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() + self.lock = threading.Lock() + # dict used to store session related variable + self.tts_speech_token_dict = {} + self.llm_end_dict = {} + self.hift_cache_dict = {} + + def load(self, llm_model, flow_model, hift_model): + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) + self.llm.to(self.device).eval() + if self.fp16 is True: + self.llm.half() + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) + self.flow.to(self.device).eval() + # in case hift_model is a hifigan model + 
hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} + self.hift.load_state_dict(hift_state_dict, strict=True) + self.hift.to(self.device).eval() + + def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model): + assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model" + llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device) + self.llm.text_encoder = llm_text_encoder + llm_llm = torch.jit.load(llm_llm_model, map_location=self.device) + self.llm.llm = llm_llm + flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) + self.flow.encoder = flow_encoder + + def load_onnx(self, flow_decoder_estimator_model): + import onnxruntime + option = onnxruntime.SessionOptions() + option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + option.intra_op_num_threads = 1 + providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider'] + del self.flow.decoder.estimator + self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers) + + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): + if self.fp16 is True: + llm_embedding = llm_embedding.half() + with self.llm_context: + for i in self.llm.inference(text=text.to(self.device), + text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), + prompt_text=prompt_text.to(self.device), + prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), + embedding=llm_embedding.to(self.device)): + self.tts_speech_token_dict[uuid].append(i) + self.llm_end_dict[uuid] = True + + def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0): + tts_mel, _ = self.flow.inference(token=token.to(self.device), + token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), + prompt_token=prompt_token.to(self.device), + prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), + embedding=embedding.to(self.device), + finalize=finalize) + tts_mel = tts_mel[:, :, token_offset * self.flow.encoder.up_layer.stride:] + # append hift cache + if self.hift_cache_dict[uuid] is not None: + hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source'] + tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2) + else: + hift_cache_source = torch.zeros(1, 1, 0) + # keep overlap mel and hift cache + if finalize is False: + tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:], + 'source': tts_source[:, :, -self.source_cache_len:], + 'speech': tts_speech[:, -self.source_cache_len:]} + tts_speech = tts_speech[:, :-self.source_cache_len] + else: + if speed != 1.0: + assert self.hift_cache_dict[uuid] is None, 'speed 
change only support non-stream inference mode' + tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear') + tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + return tts_speech + + def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192), + prompt_text=torch.zeros(1, 0, dtype=torch.int32), + llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs): + # this_uuid is used to track variables related to this inference thread + this_uuid = str(uuid.uuid1()) + with self.lock: + self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False + self.hift_cache_dict[this_uuid] = None + p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) + p.start() + if stream is True: + token_hop_len, token_offset = self.token_min_hop_len, 0 + self.flow.encoder.static_chunk_size = self.token_min_hop_len + self.flow.decoder.estimator.static_chunk_size = self.token_min_hop_len * self.flow.encoder.up_layer.stride + while True: + time.sleep(0.1) + if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= token_hop_len + self.token_right_context: + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + token_hop_len + self.token_right_context]) \ + .unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + token_offset=token_offset, + finalize=False) + token_offset += token_hop_len + yield {'tts_speech': this_tts_speech.cpu()} + # increase token_hop_len for better speech quality + token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor)) + if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < token_hop_len + self.token_right_context: + break + p.join() + # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + token_offset=token_offset, + finalize=True) + yield {'tts_speech': this_tts_speech.cpu()} + else: + # deal with all tokens + p.join() + self.flow.encoder.static_chunk_size = 0 + self.flow.decoder.estimator.static_chunk_size = 0 + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + finalize=True, + speed=speed) + yield {'tts_speech': this_tts_speech.cpu()} + with self.lock: + self.tts_speech_token_dict.pop(this_uuid) + self.llm_end_dict.pop(this_uuid) \ No newline at end of file diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py index b101d72..dfb3c07 100644 --- a/cosyvoice/flow/decoder.py +++ b/cosyvoice/flow/decoder.py 
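The hunk below replaces the decoder's standard convolutions with causal variants so the flow-matching estimator can be evaluated chunk by chunk during streaming. A minimal standalone sketch of the property the new `CausalConv1d` relies on: left-padding by `kernel_size - 1` before a stride-1, unpadded `Conv1d` preserves sequence length and makes each output frame depend only on current and past frames, so a prefix computed from the first frames alone matches the full-sequence result. Channel counts and lengths here are made up for illustration only.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
conv = torch.nn.Conv1d(4, 4, kernel_size=3, stride=1, padding=0)
x = torch.randn(1, 4, 10)

# full-sequence causal output: left-pad by (kernel_size - 1) = 2, then convolve
y_full = conv(F.pad(x, (2, 0)))              # shape (1, 4, 10), same length as x

# recompute the first 6 output frames from only the first 6 input frames
y_prefix = conv(F.pad(x[:, :, :6], (2, 0)))  # shape (1, 4, 6)

assert y_full.shape[-1] == x.shape[-1]             # length preserved
assert torch.allclose(y_full[:, :, :6], y_prefix)  # no dependence on future frames
```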
@@ -13,16 +13,84 @@ # limitations under the License. import torch import torch.nn as nn +import torch.nn.functional as F from einops import pack, rearrange, repeat +from cosyvoice.utils.common import mask_to_bias +from cosyvoice.utils.mask import add_optional_chunk_mask from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D from matcha.models.components.transformer import BasicTransformerBlock +class Transpose(torch.nn.Module): + def __init__(self, dim0: int, dim1: int): + super().__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x: torch.Tensor): + x = torch.transpose(x, self.dim0, self.dim1) + return x + + +class CausalBlock1D(Block1D): + def __init__(self, dim: int, dim_out: int): + super(CausalBlock1D, self).__init__(dim, dim_out) + self.block = torch.nn.Sequential( + CausalConv1d(dim, dim_out, 3), + Transpose(1, 2), + nn.LayerNorm(dim_out), + Transpose(1, 2), + nn.Mish(), + ) + + def forward(self, x: torch.Tensor, mask: torch.Tensor): + output = self.block(x * mask) + return output * mask + + +class CausalResnetBlock1D(ResnetBlock1D): + def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int=8): + super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) + self.block1 = CausalBlock1D(dim, dim_out) + self.block2 = CausalBlock1D(dim_out, dim_out) + + +class CausalConv1d(torch.nn.Conv1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = 'zeros', + device=None, + dtype=None + ) -> None: + super(CausalConv1d, self).__init__(in_channels, out_channels, + kernel_size, stride, + padding=0, dilation=dilation, + groups=groups, bias=bias, + padding_mode=padding_mode, + device=device, dtype=dtype + ) + assert stride == 1 + self.causal_padding = (kernel_size - 1, 0) + + def forward(self, x: torch.Tensor): + x = F.pad(x, self.causal_padding) + x = super(CausalConv1d, self).forward(x) + return x + + class ConditionalDecoder(nn.Module): def __init__( self, in_channels, out_channels, + causal=False, channels=(256, 256), dropout=0.05, attention_head_dim=64, @@ -39,7 +107,7 @@ class ConditionalDecoder(nn.Module): channels = tuple(channels) self.in_channels = in_channels self.out_channels = out_channels - + self.causal = causal self.time_embeddings = SinusoidalPosEmb(in_channels) time_embed_dim = channels[0] * 4 self.time_mlp = TimestepEmbedding( @@ -56,7 +124,7 @@ class ConditionalDecoder(nn.Module): input_channel = output_channel output_channel = channels[i] is_last = i == len(channels) - 1 - resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ BasicTransformerBlock( @@ -70,14 +138,14 @@ class ConditionalDecoder(nn.Module): ] ) downsample = ( - Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) + Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) for _ in range(num_mid_blocks): input_channel = 
channels[-1] out_channels = channels[-1] - resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ @@ -99,7 +167,11 @@ class ConditionalDecoder(nn.Module): input_channel = channels[i] * 2 output_channel = channels[i + 1] is_last = i == len(channels) - 2 - resnet = ResnetBlock1D( + resnet = CausalResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) if self.causal else ResnetBlock1D( dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim, @@ -119,10 +191,10 @@ class ConditionalDecoder(nn.Module): upsample = ( Upsample1D(output_channel, use_conv_transpose=True) if not is_last - else nn.Conv1d(output_channel, output_channel, 3, padding=1) + else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) - self.final_block = Block1D(channels[-1], channels[-1]) + self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1]) self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) self.initialize_weights() @@ -175,7 +247,9 @@ class ConditionalDecoder(nn.Module): mask_down = masks[-1] x = resnet(x, mask_down, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -192,7 +266,9 @@ class ConditionalDecoder(nn.Module): for resnet, transformer_blocks in self.mid_blocks: x = resnet(x, mask_mid, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -207,7 +283,9 @@ class ConditionalDecoder(nn.Module): x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] x = resnet(x, mask_up, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -218,4 +296,4 @@ class ConditionalDecoder(nn.Module): x = upsample(x * mask_up) x = self.final_block(x, mask_up) output = self.final_proj(x * mask_up) - return output * mask + return output * mask \ No newline at end of file diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index eea705b..459e3fc 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -146,3 
+146,83 @@ class MaskedDiffWithXvec(torch.nn.Module): feat = feat[:, :, mel_len1:] assert feat.shape[2] == mel_len2 return feat, flow_cache + + +class CausalMaskedDiffWithXvec(torch.nn.Module): + def __init__(self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, + 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', + 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), + 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, + 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, + mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, + 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) + self.encoder = encoder + self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) + self.decoder = decoder + self.only_mask_loss = only_mask_loss + + @torch.inference_mode() + def inference(self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding, + finalize): + assert token.shape[0] == 1 + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # concat text and prompt_text + token_len1, token_len2 = prompt_token.shape[1], token.shape[1] + token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len + mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + if finalize is False: + h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride] + mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] + h = self.encoder_proj(h) + + # get conditions + conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device) + conds[:, :mel_len1] = prompt_feat + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h) + feat, _ = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10 + ) + feat = feat[:, :, mel_len1:] + assert feat.shape[2] == mel_len2 + return feat, None \ No newline at end of file diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index d011304..1e59f13 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -89,17 +89,25 @@ class ConditionalCFM(BASECFM): sol = [] for step in range(1, len(t_span)): - dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond) # Classifier-Free Guidance inference introduced in VoiceBox if self.inference_cfg_rate > 0: - 
cfg_dphi_dt = self.forward_estimator( - x, mask, - torch.zeros_like(mu), t, - torch.zeros_like(spks) if spks is not None else None, - torch.zeros_like(cond) - ) - dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - - self.inference_cfg_rate * cfg_dphi_dt) + x_in = torch.concat([x, x], dim=0) + mask_in = torch.concat([mask, mask], dim=0) + mu_in = torch.concat([mu, torch.zeros_like(mu).to(x.device)], dim=0) + t_in = torch.concat([t, t], dim=0) + spks_in = torch.concat([spks, torch.zeros_like(spks).to(x.device)], dim=0) if spks is not None else None + cond_in = torch.concat([cond, torch.zeros_like(cond).to(x.device)], dim=0) if cond is not None else None + else: + x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond + dphi_dt = self.forward_estimator( + x_in, mask_in, + mu_in, t_in, + spks_in, + cond_in + ) + if self.inference_cfg_rate > 0: + dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) x = x + dt * dphi_dt t = t + dt sol.append(x) @@ -163,3 +171,37 @@ class ConditionalCFM(BASECFM): pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) return loss, y + + +class CausalConditionalCFM(ConditionalCFM): + def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): + super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) + self.rand_noise = torch.randn([1, 80, 50 * 300]) + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. 
+ shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + + z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature + z[:] = 0 + # fix prompt and overlap part mu and z + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index caecf26..3cbe8b5 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -2,6 +2,8 @@ import base64 import os from functools import lru_cache from typing import Optional +import torch +from transformers import AutoTokenizer from whisper.tokenizer import Tokenizer import tiktoken @@ -234,3 +236,37 @@ def get_tokenizer( return Tokenizer( encoding=encoding, num_languages=num_languages, language=language, task=task ) + + +class QwenTokenizer(): + def __init__(self, token_path, skip_special_tokens=True): + special_tokens = { + 'eos_token': '<|endoftext|>', + 'pad_token': '<|endoftext|>', + 'additional_special_tokens': [ + '<|im_start|>', '<|im_end|>', '<|endofprompt|>', + '[breath]', '', '', '[noise]', + '[laughter]', '[cough]', '[clucking]', '[accent]', + '[quick_breath]', + ] + } + self.tokenizer = AutoTokenizer.from_pretrained(token_path) + self.tokenizer.add_special_tokens(special_tokens) + self.skip_special_tokens = skip_special_tokens + + def encode(self, text, **kwargs): + tokens = self.tokenizer([text], return_tensors="pt") + tokens = tokens["input_ids"][0].cpu().tolist() + return tokens + + def decode(self, tokens): + tokens = torch.tensor(tokens, dtype=torch.int64) + text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0] + return text + +@lru_cache(maxsize=None) +def get_qwen_tokenizer( + token_path: str, + skip_special_tokens: bool +) -> QwenTokenizer: + return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) \ No newline at end of file diff --git a/cosyvoice/transformer/encoder_layer.py b/cosyvoice/transformer/encoder_layer.py index dfd758b..efbb12d 100644 --- a/cosyvoice/transformer/encoder_layer.py +++ b/cosyvoice/transformer/encoder_layer.py @@ -49,8 +49,8 @@ class TransformerEncoderLayer(nn.Module): super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.norm1 = nn.LayerNorm(size, eps=1e-12) + self.norm2 = nn.LayerNorm(size, eps=1e-12) self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before @@ -142,17 +142,17 @@ class ConformerEncoderLayer(nn.Module): self.feed_forward = feed_forward self.feed_forward_macaron = feed_forward_macaron self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module + self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module + self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) + self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12) self.ff_scale = 0.5 else: self.ff_scale = 1.0 if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, 
eps=1e-5) # for the CNN module + self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block + size, eps=1e-12) # for the final output of the block self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 25bc835..2e12ad2 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -153,3 +153,14 @@ def set_all_random_seed(seed): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) + + +def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: + assert mask.dtype == torch.bool + assert dtype in [torch.float32, torch.bfloat16, torch.float16] + mask = mask.to(dtype) + # attention mask bias + # NOTE(Mddct): torch.finfo jit issues + # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min + mask = (1.0 - mask) * torch.finfo(dtype).min + return mask \ No newline at end of file From 0bf706c26f488dd9a06ded13bb93659952424119 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 11 Dec 2024 16:37:15 +0800 Subject: [PATCH 03/10] update --- cosyvoice/transformer/upsample_encoder.py | 322 ++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 cosyvoice/transformer/upsample_encoder.py diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py new file mode 100644 index 0000000..7c64726 --- /dev/null +++ b/cosyvoice/transformer/upsample_encoder.py @@ -0,0 +1,322 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +from typing import Tuple + +import torch +from torch import nn +import torch.utils.checkpoint as ckpt +from torch.nn import functional as F + +from cosyvoice.transformer.convolution import ConvolutionModule +from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer +from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward +from cosyvoice.utils.class_utils import ( + COSYVOICE_EMB_CLASSES, + COSYVOICE_SUBSAMPLE_CLASSES, + COSYVOICE_ATTENTION_CLASSES, + COSYVOICE_ACTIVATION_CLASSES, +) +from cosyvoice.utils.mask import make_pad_mask +from cosyvoice.utils.mask import add_optional_chunk_mask + + +class Upsample1D(nn.Module): + """A 1D upsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + use_conv_transpose (`bool`, default `False`): + option to use a convolution transpose. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. 
+ """ + + def __init__(self, channels: int, out_channels: int, stride: int=2): + super().__init__() + self.channels = channels + self.out_channels = out_channels + self.stride = stride + # In this mode, first repeat interpolate, than conv with stride=1 + self.conv = nn.Conv1d( + self.channels, self.out_channels, stride*2+1, stride=1, + padding=0, + ) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor): + outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest") + outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0) + outputs = self.conv(outputs) + return outputs, input_lengths * self.stride + + +class PreLookaheadLayer(nn.Module): + def __init__(self, channels: int, pre_lookahead_len: int = 1): + super().__init__() + self.channels = channels + self.pre_lookahead_len = pre_lookahead_len + self.conv1 = nn.Conv1d( + channels, channels, + kernel_size=pre_lookahead_len+1, + stride=1, padding=0, + ) + self.conv2 = nn.Conv1d( + channels, channels, + kernel_size=3, stride=1, padding=0, + ) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + inputs: (batch_size, seq_len, channels) + """ + outputs = inputs.transpose(1, 2).contiguous() + # look ahead + outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0) + outputs = F.leaky_relu(self.conv1(outputs)) + # outputs + outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0) + outputs = self.conv2(outputs) + outputs = outputs.transpose(1, 2).contiguous() + + # residual connection + outputs = outputs + inputs + return outputs + + +class UpsampleConformerEncoder(torch.nn.Module): + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + key_bias: bool = True, + gradient_checkpointing: bool = False, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. + """ + super().__init__() + self._output_size = output_size + + self.global_cmvn = global_cmvn + self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + self.gradient_checkpointing = gradient_checkpointing + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + # self-attention module definition + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + key_bias, + ) + # feed-forward module definition + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3) + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(num_blocks) + ]) + self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2) + self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + self.up_encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(4) + ]) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. + https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + # lookahead + conformer encoder + xs = self.pre_lookahead_layer(xs) + xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad) + + # upsample + conformer encoder + xs = xs.transpose(1, 2).contiguous() + xs, xs_lens = self.up_layer(xs, xs_lens) + xs = xs.transpose(1, 2).contiguous() + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + xs, pos_emb, masks = self.up_embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size * self.up_layer.stride, + num_decoding_left_chunks) + xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad) + + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs + + def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.up_encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs From 2345ce6be2548f134c6366d24c2fd69d045ee33a Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 15:43:17 +0800 Subject: [PATCH 04/10] update --- README.md | 10 +++++----- cosyvoice/flow/flow.py | 6 +++++- cosyvoice/tokenizer/tokenizer.py | 5 +++++ requirements.txt | 7 ++++--- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0e01098..9a717cd 100644 --- a/README.md +++ b/README.md @@ -116,27 +116,27 @@ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, loa print(cosyvoice.list_avaliable_spks()) # change stream=True for chunk stream inference for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)): - torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to 
pretrained_models/CosyVoice-300M for 50Hz inference # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # cross_lingual usage prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)): - torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # vc usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)): - torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct') # instruct usage, support [laughter][breath] for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)): - torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) ``` **Start web demo** diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index 459e3fc..d99c495 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -157,6 +157,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): vocab_size: int = 4096, input_frame_rate: int = 50, only_mask_loss: bool = True, + token_mel_ratio: int = 2, + pre_lookahead_len: int = 3, encoder: torch.nn.Module = None, decoder: torch.nn.Module = None, decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, @@ -181,6 +183,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) self.decoder = decoder self.only_mask_loss = only_mask_loss + self.token_mel_ratio = token_mel_ratio + self.pre_lookahead_len = pre_lookahead_len @torch.inference_mode() def inference(self, @@ -206,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): # text encode h, h_lengths = self.encoder(token, token_len) if finalize is False: - h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride] + h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio] mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] h = self.encoder_proj(h) diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index 3cbe8b5..fbe78ff 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -240,6 +240,8 @@ def get_tokenizer( class QwenTokenizer(): def __init__(self, token_path, skip_special_tokens=True): + 
super().__init__() + # NOTE: non-chat model, all these special tokens keep randomly initialized. special_tokens = { 'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', @@ -248,6 +250,9 @@ class QwenTokenizer(): '[breath]', '', '', '[noise]', '[laughter]', '[cough]', '[clucking]', '[accent]', '[quick_breath]', + "", "", + "[hissing]", "[sigh]", "[vocalized-noise]", + "[lipsmack]", "[mn]" ] } self.tokenizer = AutoTokenizer.from_pretrained(token_path) diff --git a/requirements.txt b/requirements.txt index d6b3ca2..da758a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/torch_stable.html conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 @@ -25,8 +25,9 @@ pydantic==2.7.0 rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 -torch==2.0.1 -torchaudio==2.0.2 +tensorrt-cu12==10.0.1 +torch==2.3.1+cu121 +torchaudio==2.3.1+cu121 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From c693039d14e698dc5365eaa8b23b28075bde2d42 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 16:46:28 +0800 Subject: [PATCH 05/10] update --- README.md | 45 +++++++++++++++++++---- cosyvoice/cli/cosyvoice.py | 36 ++++++++++-------- cosyvoice/cli/frontend.py | 16 ++++---- cosyvoice/cli/model.py | 47 +++++++++++------------- cosyvoice/flow/flow_matching.py | 65 +++++++++++++++++++++++++++------ requirements.txt | 7 ++-- 6 files changed, 145 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 9a717cd..634d670 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ # CosyVoice + +## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 +[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)] + ## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 [[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] @@ -6,6 +10,11 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo ## Roadmap +- [x] 2024/12 + + - [x] CosyVoice2-0.5B model release + - [x] CosyVoice2-0.5B streaming inference with no quality degradation + - [x] 2024/07 - [x] Flow matching training support @@ -24,9 +33,8 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo - [ ] TBD - - [ ] 25hz llama based llm model which supports lora finetune - - [ ] Support more instruction mode - - [ ] Music generation + - [ ] CosyVoice2-0.5B bistream inference support + - [ ] CosyVoice2-0.5B training and finetune recipie - [ ] CosyVoice-500M trained with more multi-lingual data - [ ] More... @@ -46,7 +54,7 @@ git submodule update --init --recursive - Create Conda env: ``` sh -conda create -n cosyvoice python=3.8 +conda create -n cosyvoice python=3.10 conda activate cosyvoice # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. 
conda install -y -c conda-forge pynini==2.1.5 @@ -68,6 +76,7 @@ If you are expert in this field, and you are only interested in training your ow ``` python # SDK模型下载 from modelscope import snapshot_download +snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B') snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz') snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') @@ -78,6 +87,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice ``` sh # git模型下载,请确保已安装git lfs mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT @@ -97,9 +107,11 @@ pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl **Basic Usage** -For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model. +For zero_shot/cross_lingual inference, please use `CosyVoice2-0.5B` or `CosyVoice-300M` model. For sft inference, please use `CosyVoice-300M-SFT` model. For instruct inference, please use `CosyVoice-300M-Instruct` model. +We strongly recommend using `CosyVoice2-0.5B` model for better streaming performance. + First, add `third_party/Matcha-TTS` to your `PYTHONPATH`. ``` sh @@ -107,10 +119,18 @@ export PYTHONPATH=third_party/Matcha-TTS ``` ``` python -from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav import torchaudio +## cosyvoice2 usage +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, load_trt=False) +# sft usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) + +## cosyvoice usage cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True) # sft usage print(cosyvoice.list_avaliable_spks()) @@ -189,5 +209,16 @@ You can also scan the QR code to join our official Dingding chat group. 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). +## Citations + +``` bibtex +@article{du2024cosyvoice, + title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens}, + author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others}, + journal={arXiv preprint arXiv:2407.05407}, + year={2024} +} +``` + ## Disclaimer -The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. \ No newline at end of file diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index d4bea1e..c95439a 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -38,6 +38,7 @@ class CosyVoice: '{}/spk2info.pt'.format(model_dir), instruct, configs['allowed_special']) + self.sample_rate = configs['sample_rate'] if torch.cuda.is_available() is False and (fp16 is True or load_jit is True): load_jit = False fp16 = False @@ -64,7 +65,7 @@ class CosyVoice: start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -74,11 +75,11 @@ class CosyVoice: for i in tqdm(self.frontend.text_normalize(tts_text, split=True)): if len(i) < 0.5 * len(prompt_text): logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text)) - model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) + model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -87,11 +88,11 @@ class CosyVoice: if self.frontend.instruct is True: raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) for i in tqdm(self.frontend.text_normalize(tts_text, split=True)): - model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) + model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -105,23 +106,23 @@ class CosyVoice: start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0): - model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k) + model_input = self.frontend.frontend_vc(source_speech_16k, 
prompt_speech_16k, self.sample_rate) start_time = time.time() for model_output in self.model.vc(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() class CosyVoice2(CosyVoice): - def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True): + def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False): instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir if not os.path.exists(model_dir): @@ -135,18 +136,21 @@ class CosyVoice2(CosyVoice): '{}/spk2info.pt'.format(model_dir), instruct, configs['allowed_special']) - if torch.cuda.is_available() is False and (fp16 is True or load_jit is True): + self.sample_rate = configs['sample_rate'] + if torch.cuda.is_available() is False and load_jit is True: load_jit = False - fp16 = False - logging.warning('cpu do not support fp16 and jit, force set to False') - self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16) + logging.warning('cpu do not support jit, force set to False') + self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift']) self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), '{}/hift.pt'.format(model_dir)) if load_jit: - self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir), - '{}/llm.llm.fp16.zip'.format(model_dir), - '{}/flow.encoder.fp32.zip'.format(model_dir)) + self.model.load_jit('{}/flow.encoder.fp32.zip'.format(model_dir)) + if load_trt is True and load_onnx is True: + load_onnx = False + logging.warning('can not set both load_trt and load_onnx to True, force set load_onnx to False') if load_onnx: self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) + if load_trt: + self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir)) del configs \ No newline at end of file diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index ac138b1..f73ba42 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -142,11 +142,11 @@ class CosyVoiceFrontEnd: model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding} return model_input - def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): + def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate): tts_text_token, tts_text_token_len = self._extract_text_token(tts_text) prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text) - prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k) - speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050) + prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k) + speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample) speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k) embedding = self._extract_spk_embedding(prompt_speech_16k) model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, @@ -157,8 +157,8 @@ class CosyVoiceFrontEnd: 'llm_embedding': embedding, 'flow_embedding': embedding} return model_input - def frontend_cross_lingual(self, tts_text, 
prompt_speech_16k): - model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k) + def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate): + model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate) # in cross lingual mode, we remove prompt in llm del model_input['prompt_text'] del model_input['prompt_text_len'] @@ -175,10 +175,10 @@ class CosyVoiceFrontEnd: model_input['prompt_text_len'] = instruct_text_token_len return model_input - def frontend_vc(self, source_speech_16k, prompt_speech_16k): + def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate): prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k) - prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k) - prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050) + prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k) + prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample) embedding = self._extract_spk_embedding(prompt_speech_16k) source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k) model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len, diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index ce53f04..1191e62 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -261,16 +261,15 @@ class CosyVoice2Model: def __init__(self, llm: torch.nn.Module, flow: torch.nn.Module, - hift: torch.nn.Module, - fp16: bool): + hift: torch.nn.Module): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.llm = llm self.flow = flow self.hift = hift - self.fp16 = fp16 - self.token_min_hop_len = 1 * self.flow.input_frame_rate - self.token_max_hop_len = 2 * self.flow.input_frame_rate - self.token_right_context = self.flow.encoder.pre_lookahead_layer.pre_lookahead_len + self.token_hop_len = 2 * self.flow.input_frame_rate + # here we fix flow encoder/decoder decoding_chunk_size, in the future we will send it as arguments, or use cache + self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate + self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio # hift cache self.mel_cache_len = 8 self.source_cache_len = int(self.mel_cache_len * 480) @@ -278,7 +277,6 @@ class CosyVoice2Model: self.speech_window = np.hamming(2 * self.source_cache_len) # rtf and decoding related self.stream_scale_factor = 1 - assert self.stream_scale_factor == 1, 'fix stream_scale_factor to 1 as we haven\'t implement cache in flow matching yet, this constraint will be loosen in the future' self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() self.lock = threading.Lock() # dict used to store session related variable @@ -293,17 +291,13 @@ class CosyVoice2Model: self.llm.half() self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() + self.flow.decoder.fp16 = False # in case hift_model is a hifigan model hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} self.hift.load_state_dict(hift_state_dict, strict=True) self.hift.to(self.device).eval() - def load_jit(self, llm_text_encoder_model, 
llm_llm_model, flow_encoder_model): - assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model" - llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device) - self.llm.text_encoder = llm_text_encoder - llm_llm = torch.jit.load(llm_llm_model, map_location=self.device) - self.llm.llm = llm_llm + def load_jit(self, flow_encoder_model): flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) self.flow.encoder = flow_encoder @@ -316,6 +310,14 @@ class CosyVoice2Model: del self.flow.decoder.estimator self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers) + def load_trt(self, flow_decoder_estimator_model): + del self.flow.decoder.estimator + import tensorrt as trt + with open(flow_decoder_estimator_model, 'rb') as f: + self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read()) + self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() + self.flow.decoder.fp16 = True + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): if self.fp16 is True: llm_embedding = llm_embedding.half() @@ -339,7 +341,7 @@ class CosyVoice2Model: prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), embedding=embedding.to(self.device), finalize=finalize) - tts_mel = tts_mel[:, :, token_offset * self.flow.encoder.up_layer.stride:] + tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:] # append hift cache if self.hift_cache_dict[uuid] is not None: hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source'] @@ -377,13 +379,11 @@ class CosyVoice2Model: p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) p.start() if stream is True: - token_hop_len, token_offset = self.token_min_hop_len, 0 - self.flow.encoder.static_chunk_size = self.token_min_hop_len - self.flow.decoder.estimator.static_chunk_size = self.token_min_hop_len * self.flow.encoder.up_layer.stride + token_offset = 0 while True: time.sleep(0.1) - if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= token_hop_len + self.token_right_context: - this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + token_hop_len + self.token_right_context]) \ + if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len: + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]) \ .unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, prompt_token=flow_prompt_speech_token, @@ -392,11 +392,9 @@ class CosyVoice2Model: uuid=this_uuid, token_offset=token_offset, finalize=False) - token_offset += token_hop_len + token_offset += self.token_hop_len yield {'tts_speech': this_tts_speech.cpu()} - # increase token_hop_len for better speech quality - token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor)) - if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < token_hop_len + self.token_right_context: + if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len: break p.join() 
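The streaming branch in the hunk above only vocodes a new chunk once the LLM has produced one full hop of speech tokens plus the flow encoder's lookahead window, then advances the offset by a single hop. A hedged sketch of that readiness rule follows; the function and argument names are illustrative, mirroring the attributes used in the patch, and are not part of the diff.

``` python
# Illustrative sketch only, not part of the patch: the condition the streaming
# loop above evaluates before synthesizing the next audio chunk.
def ready_for_next_chunk(num_generated_tokens: int, token_offset: int,
                         token_hop_len: int, pre_lookahead_len: int) -> bool:
    # One hop of tokens is consumed per chunk; the extra lookahead tokens are
    # required by the causal flow encoder but are not consumed by this chunk.
    return num_generated_tokens - token_offset >= token_hop_len + pre_lookahead_len

# After a chunk is emitted, the caller advances the offset: token_offset += token_hop_len
```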
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None @@ -412,14 +410,13 @@ class CosyVoice2Model: else: # deal with all tokens p.join() - self.flow.encoder.static_chunk_size = 0 - self.flow.decoder.estimator.static_chunk_size = 0 this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, prompt_token=flow_prompt_speech_token, prompt_feat=prompt_speech_feat, embedding=flow_embedding, uuid=this_uuid, + token_offset=0, finalize=True, speed=speed) yield {'tts_speech': this_tts_speech.cpu()} diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index 1e59f13..1c293f8 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import onnxruntime import torch import torch.nn.functional as F from matcha.models.components.flow_matching import BASECFM @@ -88,15 +89,25 @@ class ConditionalCFM(BASECFM): # Or in future might add like a return_all_steps flag sol = [] + if self.inference_cfg_rate > 0: + # Do not use concat, it may cause memory format changed and trt infer with wrong results! + x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype) + mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + t_in = torch.zeros([2], device=x.device, dtype=x.dtype) + spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype) + cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + else: + x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond for step in range(1, len(t_span)): # Classifier-Free Guidance inference introduced in VoiceBox if self.inference_cfg_rate > 0: - x_in = torch.concat([x, x], dim=0) - mask_in = torch.concat([mask, mask], dim=0) - mu_in = torch.concat([mu, torch.zeros_like(mu).to(x.device)], dim=0) - t_in = torch.concat([t, t], dim=0) - spks_in = torch.concat([spks, torch.zeros_like(spks).to(x.device)], dim=0) if spks is not None else None - cond_in = torch.concat([cond, torch.zeros_like(cond).to(x.device)], dim=0) if cond is not None else None + x_in[:] = x + mask_in[:] = mask + mu_in[0] = mu + t_in[:] = t.unsqueeze(0) + spks_in[0] = spks + cond_in[0] = cond else: x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond dphi_dt = self.forward_estimator( @@ -114,22 +125,53 @@ class ConditionalCFM(BASECFM): if step < len(t_span) - 1: dt = t_span[step + 1] - t - return sol[-1] + return sol[-1].float() def forward_estimator(self, x, mask, mu, t, spks, cond): if isinstance(self.estimator, torch.nn.Module): return self.estimator.forward(x, mask, mu, t, spks, cond) - else: + elif isinstance(self.estimator, onnxruntime.InferenceSession): ort_inputs = { 'x': x.cpu().numpy(), 'mask': mask.cpu().numpy(), 'mu': mu.cpu().numpy(), 't': t.cpu().numpy(), - 'spks': spks.cpu().numpy(), - 'cond': cond.cpu().numpy() + 'spk': spks.cpu().numpy(), + 'cond': cond.cpu().numpy(), + 'mask_rand': torch.randn(1, 1, 1).numpy() } output = self.estimator.run(None, ort_inputs)[0] return torch.tensor(output, dtype=x.dtype, device=x.device) + else: + if not x.is_contiguous(): + x = x.contiguous() + if not mask.is_contiguous(): + mask = mask.contiguous() + if not 
mu.is_contiguous(): + mu = mu.contiguous() + if not t.is_contiguous(): + t = t.contiguous() + if not spks.is_contiguous(): + spks = spks.contiguous() + if not cond.is_contiguous(): + cond = cond.contiguous() + self.estimator.set_input_shape('x', (2, 80, x.size(2))) + self.estimator.set_input_shape('mask', (2, 1, x.size(2))) + self.estimator.set_input_shape('mu', (2, 80, x.size(2))) + self.estimator.set_input_shape('t', (2,)) + self.estimator.set_input_shape('spk', (2, 80)) + self.estimator.set_input_shape('cond', (2, 80, x.size(2))) + self.estimator.set_input_shape('mask_rand', (1, 1, 1)) + # run trt engine + self.estimator.execute_v2([x.data_ptr(), + mask.data_ptr(), + mu.data_ptr(), + t.data_ptr(), + spks.data_ptr(), + cond.data_ptr(), + torch.randn(1, 1, 1).to(x.device).data_ptr(), + x.data_ptr()]) + return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): """Computes diffusion loss @@ -199,7 +241,8 @@ class CausalConditionalCFM(ConditionalCFM): """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature - z[:] = 0 + if self.sp16 is True: + z = z.half() # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': diff --git a/requirements.txt b/requirements.txt index da758a0..8be865a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/torch_stable.html -conformer==0.3.2 +--extra-index-url https://download.pytorch.org/whl/cu121 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 gdown==5.1.0 @@ -26,8 +25,8 @@ rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 tensorrt-cu12==10.0.1 -torch==2.3.1+cu121 -torchaudio==2.3.1+cu121 +torch==2.3.1 +torchaudio==2.3.1 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From 2511a49a72af4971f1eb0c3204c2a9a54a6b6af4 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 18:48:25 +0800 Subject: [PATCH 06/10] update --- README.md | 2 +- cosyvoice/cli/model.py | 4 ---- cosyvoice/flow/flow_matching.py | 37 ++++++++++----------------------- requirements.txt | 8 +++++-- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 634d670..b8c1fd4 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ from cosyvoice.utils.file_utils import load_wav import torchaudio ## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, load_trt=False) +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_onnx=False, load_trt=False) # sft usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 1191e62..a66b463 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -287,8 +287,6 @@ class CosyVoice2Model: def load(self, llm_model, flow_model, hift_model): self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) self.llm.to(self.device).eval() - if self.fp16 is True: - self.llm.half() self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() self.flow.decoder.fp16 = False @@ -319,8 +317,6 @@ class CosyVoice2Model: self.flow.decoder.fp16 = True def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, 
uuid): - if self.fp16 is True: - llm_embedding = llm_embedding.half() with self.llm_context: for i in self.llm.inference(text=text.to(self.device), text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index 1c293f8..d5458d4 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -136,41 +136,26 @@ class ConditionalCFM(BASECFM): 'mask': mask.cpu().numpy(), 'mu': mu.cpu().numpy(), 't': t.cpu().numpy(), - 'spk': spks.cpu().numpy(), - 'cond': cond.cpu().numpy(), - 'mask_rand': torch.randn(1, 1, 1).numpy() + 'spks': spks.cpu().numpy(), + 'cond': cond.cpu().numpy() } output = self.estimator.run(None, ort_inputs)[0] return torch.tensor(output, dtype=x.dtype, device=x.device) else: - if not x.is_contiguous(): - x = x.contiguous() - if not mask.is_contiguous(): - mask = mask.contiguous() - if not mu.is_contiguous(): - mu = mu.contiguous() - if not t.is_contiguous(): - t = t.contiguous() - if not spks.is_contiguous(): - spks = spks.contiguous() - if not cond.is_contiguous(): - cond = cond.contiguous() self.estimator.set_input_shape('x', (2, 80, x.size(2))) self.estimator.set_input_shape('mask', (2, 1, x.size(2))) self.estimator.set_input_shape('mu', (2, 80, x.size(2))) self.estimator.set_input_shape('t', (2,)) - self.estimator.set_input_shape('spk', (2, 80)) + self.estimator.set_input_shape('spks', (2, 80)) self.estimator.set_input_shape('cond', (2, 80, x.size(2))) - self.estimator.set_input_shape('mask_rand', (1, 1, 1)) # run trt engine - self.estimator.execute_v2([x.data_ptr(), - mask.data_ptr(), - mu.data_ptr(), - t.data_ptr(), - spks.data_ptr(), - cond.data_ptr(), - torch.randn(1, 1, 1).to(x.device).data_ptr(), - x.data_ptr()]) + self.estimator.execute_v2([x.contiguous().data_ptr(), + mask.contiguous().data_ptr(), + mu.contiguous().data_ptr(), + t.contiguous().data_ptr(), + spks.contiguous().data_ptr(), + cond.contiguous().data_ptr(), + x.data_ptr()]) return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): @@ -241,7 +226,7 @@ class CausalConditionalCFM(ConditionalCFM): """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature - if self.sp16 is True: + if self.fp16 is True: z = z.half() # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) diff --git a/requirements.txt b/requirements.txt index 8be865a..04c600e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu121 +conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 gdown==5.1.0 @@ -16,8 +17,8 @@ modelscope==1.15.0 networkx==3.1 omegaconf==2.3.0 onnx==1.16.0 -onnxruntime-gpu==1.16.0; sys_platform == 'linux' -onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' +onnxruntime-gpu==1.18.0; sys_platform == 'linux' +onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'windows' openai-whisper==20231117 protobuf==4.25 pydantic==2.7.0 @@ -25,8 +26,11 @@ rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 tensorrt-cu12==10.0.1 +tensorrt-cu12-bindings==10.0.1 +tensorrt-cu12-libs==10.0.1 torch==2.3.1 torchaudio==2.3.1 +transformers==4.40.1 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From 9b8f28aa3200c0c98cff0cd7cdcbea2c535814f4 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 22:48:31 +0800 Subject: [PATCH 07/10] update --- README.md | 2 +- webui.py | 34 +++++++++++++++++----------------- 2 
files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index b8c1fd4..7df61d1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # CosyVoice ## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 -[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)] +[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)] ## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 [[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] diff --git a/webui.py b/webui.py index 0233a94..b402a64 100644 --- a/webui.py +++ b/webui.py @@ -22,7 +22,7 @@ import random import librosa ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) -from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav, logging from cosyvoice.utils.common import set_all_random_seed @@ -51,7 +51,7 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440): ) if speech.abs().max() > max_val: speech = speech / speech.abs().max() * max_val - speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1) + speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1) return speech @@ -71,31 +71,31 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if mode_checkbox_group in ['自然语言控制']: if cosyvoice.frontend.instruct is False: gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text == '': gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if prompt_wav is not None or prompt_text != '': gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language if mode_checkbox_group in ['跨语种复刻']: if cosyvoice.frontend.instruct is True: gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text != '': gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') if prompt_wav is None: gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: if prompt_wav is None: gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if torchaudio.info(prompt_wav).sample_rate < prompt_sr: gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) # sft mode only use sft_dropdown if mode_checkbox_group in ['预训练音色']: if instruct_text != '' or prompt_wav is not None or prompt_text != '': @@ -104,7 +104,7 @@ def 
generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if mode_checkbox_group in ['3s极速复刻']: if prompt_text == '': gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text != '': gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') @@ -112,24 +112,24 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro logging.info('get sft inference request') set_all_random_seed(seed) for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '3s极速复刻': logging.info('get zero_shot inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '跨语种复刻': logging.info('get cross_lingual inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) else: logging.info('get instruct inference request') set_all_random_seed(seed) for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) def main(): @@ -178,11 +178,11 @@ if __name__ == '__main__': default=8000) parser.add_argument('--model_dir', type=str, - default='pretrained_models/CosyVoice-300M', + default='pretrained_models/CosyVoice2-0.5B', help='local path or modelscope repo id') args = parser.parse_args() - cosyvoice = CosyVoice(args.model_dir) + cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir) sft_spk = cosyvoice.list_avaliable_spks() - prompt_sr, target_sr = 16000, 22050 - default_data = np.zeros(target_sr) + prompt_sr = 16000 + default_data = np.zeros(cosyvoice.sample_rate) main() From 6b9cebea14030ac8c76bbd3eeb0f898dfb150448 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Fri, 13 Dec 2024 15:33:36 +0800 Subject: [PATCH 08/10] update --- README.md | 4 ++-- cosyvoice/cli/cosyvoice.py | 2 +- requirements.txt | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7df61d1..65728e2 100644 --- a/README.md +++ b/README.md @@ -124,8 +124,8 @@ from cosyvoice.utils.file_utils import load_wav import torchaudio ## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_onnx=False, load_trt=False) -# sft usage +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_onnx=False, load_trt=False) +# zero_shot usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) diff --git a/cosyvoice/cli/cosyvoice.py 
b/cosyvoice/cli/cosyvoice.py index c95439a..7c2531d 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -128,7 +128,7 @@ class CosyVoice2(CosyVoice): if not os.path.exists(model_dir): model_dir = snapshot_download(model_dir) with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: - configs = load_hyperpyyaml(f) + configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'Qwen2-0.5B-CosyVoice-BlankEN')}) self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], configs['feat_extractor'], '{}/campplus.onnx'.format(model_dir), diff --git a/requirements.txt b/requirements.txt index 04c600e..7fd5596 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu121 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 From 6b5931dc70a9272485b0d931805bbe2accab4cb8 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Mon, 16 Dec 2024 09:42:08 +0800 Subject: [PATCH 09/10] update --- README.md | 89 +++++++++++++++++++-------------------- cosyvoice/cli/frontend.py | 35 +++++++-------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 65728e2..8ccc2c2 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,44 @@ -# CosyVoice +[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners) -## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 -[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)] +## 👉🏻 CosyVoice 👈🏻 +**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_2.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B) -## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 -[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] +**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M) -For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). +## Highlight🔥 + +**CosyVoice 2.0** has been released! Compared to version 1.0, the new version offers more accurate, more stable, faster, and better speech generation capabilities. +### Multilingual +- **Support Language**: Chinese, English, Japanese, Korean, Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjinese, Wuhanese, etc.) +- **Crosslingual & Mixlingual**:Support zero-shot voice cloning for cross-lingual and code-switching scenarios. +### Ultra-Low Latency +- **Bidirectional Streaming Support**: CosyVoice 2.0 integrates offline and streaming modeling technologies. +- **Rapid First Packet Synthesis**: Achieves latency as low as 150ms while maintaining high-quality audio output. 
+### High Accuracy +- **Improved Pronunciation**: Reduces pronunciation errors by 30% to 50% compared to CosyVoice 1.0. +- **Benchmark Achievements**: Attains the lowest character error rate on the hard test set of the Seed-TTS evaluation set. +### Strong Stability +- **Consistency in Timbre**: Ensures reliable voice consistency for zero-shot and cross-language speech synthesis. +- **Cross-language Synthesis**: Marked improvements compared to version 1.0. +### Natural Experience +- **Enhanced Prosody and Sound Quality**: Improved alignment of synthesized audio, raising MOS evaluation scores from 5.4 to 5.53. +- **Emotional and Dialectal Flexibility**: Now supports more granular emotional controls and accent adjustments. ## Roadmap - [x] 2024/12 - - [x] CosyVoice2-0.5B model release - - [x] CosyVoice2-0.5B streaming inference with no quality degradation + - [x] 25hz cosyvoice 2.0 released + +- [x] 2024/09 + + - [x] 25hz cosyvoice base model + - [x] 25hz cosyvoice voice conversion model + +- [x] 2024/08 + + - [x] Repetition Aware Sampling(RAS) inference for llm stability + - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization - [x] 2024/07 @@ -21,22 +46,6 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo - [x] WeTextProcessing support when ttsfrd is not avaliable - [x] Fastapi server and client -- [x] 2024/08 - - - [x] Repetition Aware Sampling(RAS) inference for llm stability - - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization - -- [x] 2024/09 - - - [x] 25hz cosyvoice base model - - [x] 25hz cosyvoice voice conversion model - -- [ ] TBD - - - [ ] CosyVoice2-0.5B bistream inference support - - [ ] CosyVoice2-0.5B training and finetune recipie - - [ ] CosyVoice-500M trained with more multi-lingual data - - [ ] More... ## Install @@ -54,7 +63,7 @@ git submodule update --init --recursive - Create Conda env: ``` sh -conda create -n cosyvoice python=3.10 +conda create -n cosyvoice python=3.8 conda activate cosyvoice # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. conda install -y -c conda-forge pynini==2.1.5 @@ -102,16 +111,16 @@ Notice that this step is not necessary. If you do not install `ttsfrd` package, ``` sh cd pretrained_models/CosyVoice-ttsfrd/ unzip resource.zip -d . -pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +pip install ttsfrd_dependency-0.1-py3-none-any.whl +pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl ``` **Basic Usage** -For zero_shot/cross_lingual inference, please use `CosyVoice2-0.5B` or `CosyVoice-300M` model. +We strongly recommend using `CosyVoice2-0.5B` for better performance. +For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model. For sft inference, please use `CosyVoice-300M-SFT` model. For instruct inference, please use `CosyVoice-300M-Instruct` model. -We strongly recommend using `CosyVoice2-0.5B` model for better streaming performance. - First, add `third_party/Matcha-TTS` to your `PYTHONPATH`. 
``` sh @@ -123,14 +132,15 @@ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav import torchaudio -## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_onnx=False, load_trt=False) +# cosyvoice2 +cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False) + # zero_shot usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) -for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) +for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) -## cosyvoice usage +# cosyvoice cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True) # sft usage print(cosyvoice.list_avaliable_spks()) @@ -209,16 +219,5 @@ You can also scan the QR code to join our official Dingding chat group. 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). -## Citations - -``` bibtex -@article{du2024cosyvoice, - title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens}, - author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others}, - journal={arXiv preprint arXiv:2407.05407}, - year={2024} -} -``` - ## Disclaimer The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. \ No newline at end of file diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index f73ba42..7b09c59 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from functools import partial +import json import onnxruntime import torch import numpy as np @@ -66,9 +67,7 @@ class CosyVoiceFrontEnd: ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \ 'failed to initialize ttsfrd resource' - self.frd.set_lang_type('pinyin') - self.frd.enable_pinyin_mix(True) - self.frd.set_breakmodel_index(1) + self.frd.set_lang_type('pinyinvg') else: self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False) self.en_tn_model = EnNormalizer() @@ -112,26 +111,28 @@ class CosyVoiceFrontEnd: text = text.strip() if contains_chinese(text): if self.use_ttsfrd: - text = self.frd.get_frd_extra_info(text, 'input') + texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] + text = ''.join(texts) else: text = self.zh_tn_model.normalize(text) - text = text.replace("\n", "") - text = replace_blank(text) - text = replace_corner_mark(text) - text = text.replace(".", "。") - text = text.replace(" - ", ",") - text = remove_bracket(text) - text = re.sub(r'[,,、]+$', '。', text) - texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + text = text.replace("\n", "") + text = replace_blank(text) + text = replace_corner_mark(text) + text = text.replace(".", "。") + text = text.replace(" - ", ",") + text = remove_bracket(text) + text = re.sub(r'[,,、]+$', '。', text) + texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, + token_min_n=60, merge_len=20, comma_split=False)) else: if self.use_ttsfrd: - text = self.frd.get_frd_extra_info(text, 'input') + texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] + text = ''.join(texts) else: text = self.en_tn_model.normalize(text) - text = spell_out_number(text, self.inflect_parser) - texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + text = spell_out_number(text, self.inflect_parser) + texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, + token_min_n=60, merge_len=20, comma_split=False)) if split is False: return text return texts From ac705603645e544d11461f879d4c593819c790f1 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Mon, 16 Dec 2024 09:54:24 +0800 Subject: [PATCH 10/10] fix lint --- cosyvoice/cli/cosyvoice.py | 3 ++- cosyvoice/cli/frontend.py | 4 ++-- cosyvoice/cli/model.py | 16 ++++++++-------- cosyvoice/flow/decoder.py | 22 ++++++++++++---------- cosyvoice/flow/flow.py | 5 ++--- cosyvoice/tokenizer/tokenizer.py | 2 +- cosyvoice/transformer/upsample_encoder.py | 11 +++++------ cosyvoice/utils/common.py | 2 +- 8 files changed, 33 insertions(+), 32 deletions(-) diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index 7c2531d..a6dc7a6 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -120,6 +120,7 @@ class CosyVoice: yield model_output start_time = time.time() + class CosyVoice2(CosyVoice): def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False): @@ -153,4 +154,4 @@ class CosyVoice2(CosyVoice): self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) if load_trt: 
self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir)) - del configs \ No newline at end of file + del configs diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 7b09c59..1281971 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -123,7 +123,7 @@ class CosyVoiceFrontEnd: text = remove_bracket(text) text = re.sub(r'[,,、]+$', '。', text) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + token_min_n=60, merge_len=20, comma_split=False)) else: if self.use_ttsfrd: texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] @@ -132,7 +132,7 @@ class CosyVoiceFrontEnd: text = self.en_tn_model.normalize(text) text = spell_out_number(text, self.inflect_parser) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + token_min_n=60, merge_len=20, comma_split=False)) if split is False: return text return texts diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index a66b463..bf962e9 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -330,13 +330,13 @@ class CosyVoice2Model: def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0): tts_mel, _ = self.flow.inference(token=token.to(self.device), - token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), - prompt_token=prompt_token.to(self.device), - prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), - prompt_feat=prompt_feat.to(self.device), - prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), - embedding=embedding.to(self.device), - finalize=finalize) + token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), + prompt_token=prompt_token.to(self.device), + prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), + embedding=embedding.to(self.device), + finalize=finalize) tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:] # append hift cache if self.hift_cache_dict[uuid] is not None: @@ -418,4 +418,4 @@ class CosyVoice2Model: yield {'tts_speech': this_tts_speech.cpu()} with self.lock: self.tts_speech_token_dict.pop(this_uuid) - self.llm_end_dict.pop(this_uuid) \ No newline at end of file + self.llm_end_dict.pop(this_uuid) diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py index dfb3c07..6167a11 100644 --- a/cosyvoice/flow/decoder.py +++ b/cosyvoice/flow/decoder.py @@ -49,7 +49,7 @@ class CausalBlock1D(Block1D): class CausalResnetBlock1D(ResnetBlock1D): - def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int=8): + def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8): super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) self.block1 = CausalBlock1D(dim, dim_out) self.block2 = CausalBlock1D(dim_out, dim_out) @@ -70,12 +70,11 @@ class CausalConv1d(torch.nn.Conv1d): dtype=None ) -> None: super(CausalConv1d, self).__init__(in_channels, out_channels, - kernel_size, stride, - padding=0, dilation=dilation, - groups=groups, bias=bias, - 
padding_mode=padding_mode, - device=device, dtype=dtype - ) + kernel_size, stride, + padding=0, dilation=dilation, + groups=groups, bias=bias, + padding_mode=padding_mode, + device=device, dtype=dtype) assert stride == 1 self.causal_padding = (kernel_size - 1, 0) @@ -124,7 +123,8 @@ class ConditionalDecoder(nn.Module): input_channel = output_channel output_channel = channels[i] is_last = i == len(channels) - 1 - resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal \ + else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ BasicTransformerBlock( @@ -138,14 +138,16 @@ class ConditionalDecoder(nn.Module): ] ) downsample = ( - Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) + Downsample1D(output_channel) if not is_last else \ + CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) for _ in range(num_mid_blocks): input_channel = channels[-1] out_channels = channels[-1] - resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \ + ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index d99c495..c69ded6 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -202,7 +202,6 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): embedding = self.spk_embed_affine_layer(embedding) # concat text and prompt_text - token_len1, token_len2 = prompt_token.shape[1], token.shape[1] token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) token = self.input_embedding(torch.clamp(token, min=0)) * mask @@ -211,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): h, h_lengths = self.encoder(token, token_len) if finalize is False: h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio] - mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] + mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] h = self.encoder_proj(h) # get conditions @@ -229,4 +228,4 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): ) feat = feat[:, :, mel_len1:] assert feat.shape[2] == mel_len2 - return feat, None \ No newline at end of file + return feat, None diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index fbe78ff..8113891 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -274,4 +274,4 @@ def get_qwen_tokenizer( token_path: str, skip_special_tokens: bool ) -> QwenTokenizer: - return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) \ No newline at end of file + 
return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py index 7c64726..05a99cc 100644 --- a/cosyvoice/transformer/upsample_encoder.py +++ b/cosyvoice/transformer/upsample_encoder.py @@ -19,7 +19,6 @@ from typing import Tuple import torch from torch import nn -import torch.utils.checkpoint as ckpt from torch.nn import functional as F from cosyvoice.transformer.convolution import ConvolutionModule @@ -49,14 +48,14 @@ class Upsample1D(nn.Module): number of output channels. Defaults to `channels`. """ - def __init__(self, channels: int, out_channels: int, stride: int=2): + def __init__(self, channels: int, out_channels: int, stride: int = 2): super().__init__() self.channels = channels self.out_channels = out_channels self.stride = stride # In this mode, first repeat interpolate, than conv with stride=1 self.conv = nn.Conv1d( - self.channels, self.out_channels, stride*2+1, stride=1, + self.channels, self.out_channels, stride * 2 + 1, stride = 1, padding=0, ) @@ -74,7 +73,7 @@ class PreLookaheadLayer(nn.Module): self.pre_lookahead_len = pre_lookahead_len self.conv1 = nn.Conv1d( channels, channels, - kernel_size=pre_lookahead_len+1, + kernel_size=pre_lookahead_len + 1, stride=1, padding=0, ) self.conv2 = nn.Conv1d( @@ -315,8 +314,8 @@ class UpsampleConformerEncoder(torch.nn.Module): return xs def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor) -> torch.Tensor: + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: for layer in self.up_encoders: xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) return xs diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 2e12ad2..b356f0c 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -163,4 +163,4 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: # NOTE(Mddct): torch.finfo jit issues # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min mask = (1.0 - mask) * torch.finfo(dtype).min - return mask \ No newline at end of file + return mask
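The `mask_to_bias` helper in the final hunk converts a boolean attention mask into an additive bias that can be added directly to attention scores before the softmax. A minimal, self-contained sketch of that behavior is below; the toy tensors are purely illustrative and not taken from the repository.

``` python
import torch

# True (valid) positions become 0.0, False (padded) positions become the most
# negative finite value, so adding the bias to raw attention scores drives the
# padded positions to ~zero probability after softmax.
mask = torch.tensor([[True, True, False]])
bias = (1.0 - mask.to(torch.float32)) * torch.finfo(torch.float32).min
scores = torch.zeros(1, 3)
attn = torch.softmax(scores + bias, dim=-1)
print(attn)  # tensor([[0.5000, 0.5000, 0.0000]])
```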