From cde3cec6fa2eeda06b4ee1132d9342fa19e348e7 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 10 Dec 2024 14:50:20 +0800 Subject: [PATCH 01/10] add qwen lm --- cosyvoice/llm/llm.py | 125 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index cf9c231..f06b23a 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -15,6 +15,7 @@ from typing import Dict, Optional, Callable, List, Generator import torch from torch import nn import torch.nn.functional as F +from transformers import Qwen2ForCausalLM from torch.nn.utils.rnn import pad_sequence, unpad_sequence from cosyvoice.utils.common import IGNORE_ID from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss @@ -213,3 +214,127 @@ class TransformerLM(torch.nn.Module): out_tokens.append(top_ids) offset += lm_input.size(1) lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + + +class Qwen2Encoder(torch.nn.Module): + def __init__(self, pretrain_path): + super().__init__() + self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path) + + def forward_one_step(self, xs, masks, cache=None): + input_masks = masks[:, -1, :] + outs = self.model( + inputs_embeds=xs, + attention_mask=input_masks, + output_hidden_states=True, + return_dict=True, + use_cache=True, + past_key_values=cache, + ) + xs = outs.hidden_states[-1] + new_cache = outs.past_key_values + return xs, new_cache + + +class Qwen2LM(torch.nn.Module): + def __init__( + self, + llm_input_size: int, + llm_output_size: int, + speech_token_size: int, + llm: torch.nn.Module, + sampling: Callable, + length_normalized_loss: bool = True, + lsm_weight: float = 0.0, + ): + super().__init__() + self.llm_input_size = llm_input_size + self.llm_output_size = llm_output_size + self.speech_token_size = speech_token_size + + # 2. build speech token language model related modules + self.sos_eos = 0 + self.task_id = 1 + self.fill_token = 2 + + self.llm_embedding = torch.nn.Embedding(2, llm_input_size) + self.llm = llm + self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3) + self.criterion_ce = LabelSmoothingLoss( + size=speech_token_size + 3, + padding_idx=IGNORE_ID, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + + # 3. [Optional] build speech token related modules + self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size) + + # 4. sampling method + self.sampling = sampling + + def sampling_ids( + self, + weighted_scores: torch.Tensor, + decoded_tokens: List, + sampling: int, + ignore_eos: bool = True, + ): + while True: + top_ids = self.sampling(weighted_scores, decoded_tokens, sampling) + if (not ignore_eos) or (self.speech_token_size not in top_ids): + break + return top_ids + + @torch.inference_mode() + def inference( + self, + text: torch.Tensor, + text_len: torch.Tensor, + prompt_text: torch.Tensor, + prompt_text_len: torch.Tensor, + prompt_speech_token: torch.Tensor, + prompt_speech_token_len: torch.Tensor, + embedding: torch.Tensor, + sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + ) -> Generator[torch.Tensor, None, None]: + device = text.device + text = torch.concat([prompt_text, text], dim=1) + text_len += prompt_text_len + text = self.llm.model.model.embed_tokens(text) + + # 2. encode embedding + embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device) + + # 3. 
concat llm_input + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device) + lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1) + + # 4. cal min/max_length + min_len = int((text_len - prompt_text_len) * min_token_text_ratio) + max_len = int((text_len - prompt_text_len) * max_token_text_ratio) + + # 5. step by step decode + out_tokens = [] + cache = None + for i in range(max_len): + y_pred, cache = self.llm.forward_one_step(lm_input, + masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool), + cache=cache) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item() + if top_ids == self.speech_token_size: + break + if top_ids > self.speech_token_size: + continue + # in stream mode, yield token one by one + yield top_ids + out_tokens.append(top_ids) + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) \ No newline at end of file From 3e381002d780d1a6eb017402c9662f277528b695 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 11 Dec 2024 16:14:19 +0800 Subject: [PATCH 02/10] add cosyvoice2 --- .gitignore | 3 +- cosyvoice/cli/cosyvoice.py | 34 ++++- cosyvoice/cli/model.py | 178 ++++++++++++++++++++++++- cosyvoice/flow/decoder.py | 100 ++++++++++++-- cosyvoice/flow/flow.py | 80 +++++++++++ cosyvoice/flow/flow_matching.py | 60 +++++++-- cosyvoice/tokenizer/tokenizer.py | 36 +++++ cosyvoice/transformer/encoder_layer.py | 14 +- cosyvoice/utils/common.py | 11 ++ 9 files changed, 484 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 12b53ef..4a9f23f 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,5 @@ compile_commands.json *.pt pretrained_models/* *_pb2_grpc.py -*_pb2.py \ No newline at end of file +*_pb2.py +*.tar \ No newline at end of file diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index bcade73..d4bea1e 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -18,7 +18,7 @@ from hyperpyyaml import load_hyperpyyaml from modelscope import snapshot_download import torch from cosyvoice.cli.frontend import CosyVoiceFrontEnd -from cosyvoice.cli.model import CosyVoiceModel +from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model from cosyvoice.utils.file_utils import logging @@ -118,3 +118,35 @@ class CosyVoice: logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() + +class CosyVoice2(CosyVoice): + + def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True): + instruct = True if '-Instruct' in model_dir else False + self.model_dir = model_dir + if not os.path.exists(model_dir): + model_dir = snapshot_download(model_dir) + with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: + configs = load_hyperpyyaml(f) + self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], + configs['feat_extractor'], + '{}/campplus.onnx'.format(model_dir), + '{}/speech_tokenizer_v2.onnx'.format(model_dir), + '{}/spk2info.pt'.format(model_dir), + instruct, + configs['allowed_special']) + if 
torch.cuda.is_available() is False and (fp16 is True or load_jit is True): + load_jit = False + fp16 = False + logging.warning('cpu do not support fp16 and jit, force set to False') + self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16) + self.model.load('{}/llm.pt'.format(model_dir), + '{}/flow.pt'.format(model_dir), + '{}/hift.pt'.format(model_dir)) + if load_jit: + self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir), + '{}/llm.llm.fp16.zip'.format(model_dir), + '{}/flow.encoder.fp32.zip'.format(model_dir)) + if load_onnx: + self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) + del configs \ No newline at end of file diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 1fcc31f..ce53f04 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -57,15 +57,15 @@ class CosyVoiceModel: self.hift_cache_dict = {} def load(self, llm_model, flow_model, hift_model): - self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False) + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) self.llm.to(self.device).eval() if self.fp16 is True: self.llm.half() - self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=False) + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() # in case hift_model is a hifigan model hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} - self.hift.load_state_dict(hift_state_dict, strict=False) + self.hift.load_state_dict(hift_state_dict, strict=True) self.hift.to(self.device).eval() def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model): @@ -254,3 +254,175 @@ class CosyVoiceModel: self.llm_end_dict.pop(this_uuid) self.mel_overlap_dict.pop(this_uuid) self.hift_cache_dict.pop(this_uuid) + + +class CosyVoice2Model: + + def __init__(self, + llm: torch.nn.Module, + flow: torch.nn.Module, + hift: torch.nn.Module, + fp16: bool): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.llm = llm + self.flow = flow + self.hift = hift + self.fp16 = fp16 + self.token_min_hop_len = 1 * self.flow.input_frame_rate + self.token_max_hop_len = 2 * self.flow.input_frame_rate + self.token_right_context = self.flow.encoder.pre_lookahead_layer.pre_lookahead_len + # hift cache + self.mel_cache_len = 8 + self.source_cache_len = int(self.mel_cache_len * 480) + # speech fade in out + self.speech_window = np.hamming(2 * self.source_cache_len) + # rtf and decoding related + self.stream_scale_factor = 1 + assert self.stream_scale_factor == 1, 'fix stream_scale_factor to 1 as we haven\'t implement cache in flow matching yet, this constraint will be loosen in the future' + self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() + self.lock = threading.Lock() + # dict used to store session related variable + self.tts_speech_token_dict = {} + self.llm_end_dict = {} + self.hift_cache_dict = {} + + def load(self, llm_model, flow_model, hift_model): + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) + self.llm.to(self.device).eval() + if self.fp16 is True: + self.llm.half() + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) + self.flow.to(self.device).eval() + # in case hift_model is a hifigan model + 
hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} + self.hift.load_state_dict(hift_state_dict, strict=True) + self.hift.to(self.device).eval() + + def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model): + assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model" + llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device) + self.llm.text_encoder = llm_text_encoder + llm_llm = torch.jit.load(llm_llm_model, map_location=self.device) + self.llm.llm = llm_llm + flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) + self.flow.encoder = flow_encoder + + def load_onnx(self, flow_decoder_estimator_model): + import onnxruntime + option = onnxruntime.SessionOptions() + option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + option.intra_op_num_threads = 1 + providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider'] + del self.flow.decoder.estimator + self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers) + + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): + if self.fp16 is True: + llm_embedding = llm_embedding.half() + with self.llm_context: + for i in self.llm.inference(text=text.to(self.device), + text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), + prompt_text=prompt_text.to(self.device), + prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), + embedding=llm_embedding.to(self.device)): + self.tts_speech_token_dict[uuid].append(i) + self.llm_end_dict[uuid] = True + + def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0): + tts_mel, _ = self.flow.inference(token=token.to(self.device), + token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), + prompt_token=prompt_token.to(self.device), + prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), + embedding=embedding.to(self.device), + finalize=finalize) + tts_mel = tts_mel[:, :, token_offset * self.flow.encoder.up_layer.stride:] + # append hift cache + if self.hift_cache_dict[uuid] is not None: + hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source'] + tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2) + else: + hift_cache_source = torch.zeros(1, 1, 0) + # keep overlap mel and hift cache + if finalize is False: + tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:], + 'source': tts_source[:, :, -self.source_cache_len:], + 'speech': tts_speech[:, -self.source_cache_len:]} + tts_speech = tts_speech[:, :-self.source_cache_len] + else: + if speed != 1.0: + assert self.hift_cache_dict[uuid] is None, 'speed 
change only support non-stream inference mode' + tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear') + tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + return tts_speech + + def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192), + prompt_text=torch.zeros(1, 0, dtype=torch.int32), + llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs): + # this_uuid is used to track variables related to this inference thread + this_uuid = str(uuid.uuid1()) + with self.lock: + self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False + self.hift_cache_dict[this_uuid] = None + p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) + p.start() + if stream is True: + token_hop_len, token_offset = self.token_min_hop_len, 0 + self.flow.encoder.static_chunk_size = self.token_min_hop_len + self.flow.decoder.estimator.static_chunk_size = self.token_min_hop_len * self.flow.encoder.up_layer.stride + while True: + time.sleep(0.1) + if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= token_hop_len + self.token_right_context: + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + token_hop_len + self.token_right_context]) \ + .unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + token_offset=token_offset, + finalize=False) + token_offset += token_hop_len + yield {'tts_speech': this_tts_speech.cpu()} + # increase token_hop_len for better speech quality + token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor)) + if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < token_hop_len + self.token_right_context: + break + p.join() + # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + token_offset=token_offset, + finalize=True) + yield {'tts_speech': this_tts_speech.cpu()} + else: + # deal with all tokens + p.join() + self.flow.encoder.static_chunk_size = 0 + self.flow.decoder.estimator.static_chunk_size = 0 + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) + this_tts_speech = self.token2wav(token=this_tts_speech_token, + prompt_token=flow_prompt_speech_token, + prompt_feat=prompt_speech_feat, + embedding=flow_embedding, + uuid=this_uuid, + finalize=True, + speed=speed) + yield {'tts_speech': this_tts_speech.cpu()} + with self.lock: + self.tts_speech_token_dict.pop(this_uuid) + self.llm_end_dict.pop(this_uuid) \ No newline at end of file diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py index b101d72..dfb3c07 100644 --- a/cosyvoice/flow/decoder.py +++ b/cosyvoice/flow/decoder.py 
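The hunk below replaces the decoder's standard convolutions with causal variants so the flow-matching estimator can be evaluated chunk by chunk during streaming. A minimal standalone sketch of the property the new `CausalConv1d` relies on: left-padding by `kernel_size - 1` before a stride-1, unpadded `Conv1d` preserves sequence length and makes each output frame depend only on current and past frames, so a prefix computed from the first frames alone matches the full-sequence result. Channel counts and lengths here are made up for illustration only.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
conv = torch.nn.Conv1d(4, 4, kernel_size=3, stride=1, padding=0)
x = torch.randn(1, 4, 10)

# full-sequence causal output: left-pad by (kernel_size - 1) = 2, then convolve
y_full = conv(F.pad(x, (2, 0)))              # shape (1, 4, 10), same length as x

# recompute the first 6 output frames from only the first 6 input frames
y_prefix = conv(F.pad(x[:, :, :6], (2, 0)))  # shape (1, 4, 6)

assert y_full.shape[-1] == x.shape[-1]             # length preserved
assert torch.allclose(y_full[:, :, :6], y_prefix)  # no dependence on future frames
```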
@@ -13,16 +13,84 @@ # limitations under the License. import torch import torch.nn as nn +import torch.nn.functional as F from einops import pack, rearrange, repeat +from cosyvoice.utils.common import mask_to_bias +from cosyvoice.utils.mask import add_optional_chunk_mask from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D from matcha.models.components.transformer import BasicTransformerBlock +class Transpose(torch.nn.Module): + def __init__(self, dim0: int, dim1: int): + super().__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x: torch.Tensor): + x = torch.transpose(x, self.dim0, self.dim1) + return x + + +class CausalBlock1D(Block1D): + def __init__(self, dim: int, dim_out: int): + super(CausalBlock1D, self).__init__(dim, dim_out) + self.block = torch.nn.Sequential( + CausalConv1d(dim, dim_out, 3), + Transpose(1, 2), + nn.LayerNorm(dim_out), + Transpose(1, 2), + nn.Mish(), + ) + + def forward(self, x: torch.Tensor, mask: torch.Tensor): + output = self.block(x * mask) + return output * mask + + +class CausalResnetBlock1D(ResnetBlock1D): + def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int=8): + super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) + self.block1 = CausalBlock1D(dim, dim_out) + self.block2 = CausalBlock1D(dim_out, dim_out) + + +class CausalConv1d(torch.nn.Conv1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = 'zeros', + device=None, + dtype=None + ) -> None: + super(CausalConv1d, self).__init__(in_channels, out_channels, + kernel_size, stride, + padding=0, dilation=dilation, + groups=groups, bias=bias, + padding_mode=padding_mode, + device=device, dtype=dtype + ) + assert stride == 1 + self.causal_padding = (kernel_size - 1, 0) + + def forward(self, x: torch.Tensor): + x = F.pad(x, self.causal_padding) + x = super(CausalConv1d, self).forward(x) + return x + + class ConditionalDecoder(nn.Module): def __init__( self, in_channels, out_channels, + causal=False, channels=(256, 256), dropout=0.05, attention_head_dim=64, @@ -39,7 +107,7 @@ class ConditionalDecoder(nn.Module): channels = tuple(channels) self.in_channels = in_channels self.out_channels = out_channels - + self.causal = causal self.time_embeddings = SinusoidalPosEmb(in_channels) time_embed_dim = channels[0] * 4 self.time_mlp = TimestepEmbedding( @@ -56,7 +124,7 @@ class ConditionalDecoder(nn.Module): input_channel = output_channel output_channel = channels[i] is_last = i == len(channels) - 1 - resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ BasicTransformerBlock( @@ -70,14 +138,14 @@ class ConditionalDecoder(nn.Module): ] ) downsample = ( - Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) + Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) for _ in range(num_mid_blocks): input_channel = 
channels[-1] out_channels = channels[-1] - resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ @@ -99,7 +167,11 @@ class ConditionalDecoder(nn.Module): input_channel = channels[i] * 2 output_channel = channels[i + 1] is_last = i == len(channels) - 2 - resnet = ResnetBlock1D( + resnet = CausalResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) if self.causal else ResnetBlock1D( dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim, @@ -119,10 +191,10 @@ class ConditionalDecoder(nn.Module): upsample = ( Upsample1D(output_channel, use_conv_transpose=True) if not is_last - else nn.Conv1d(output_channel, output_channel, 3, padding=1) + else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) - self.final_block = Block1D(channels[-1], channels[-1]) + self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1]) self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) self.initialize_weights() @@ -175,7 +247,9 @@ class ConditionalDecoder(nn.Module): mask_down = masks[-1] x = resnet(x, mask_down, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -192,7 +266,9 @@ class ConditionalDecoder(nn.Module): for resnet, transformer_blocks in self.mid_blocks: x = resnet(x, mask_mid, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -207,7 +283,9 @@ class ConditionalDecoder(nn.Module): x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] x = resnet(x, mask_up, t) x = rearrange(x, "b c t -> b t c").contiguous() - attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1) + attn_mask = mask_to_bias(attn_mask==1, x.dtype) for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, @@ -218,4 +296,4 @@ class ConditionalDecoder(nn.Module): x = upsample(x * mask_up) x = self.final_block(x, mask_up) output = self.final_proj(x * mask_up) - return output * mask + return output * mask \ No newline at end of file diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index eea705b..459e3fc 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -146,3 
+146,83 @@ class MaskedDiffWithXvec(torch.nn.Module): feat = feat[:, :, mel_len1:] assert feat.shape[2] == mel_len2 return feat, flow_cache + + +class CausalMaskedDiffWithXvec(torch.nn.Module): + def __init__(self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, + 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', + 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), + 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, + 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, + mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, + 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) + self.encoder = encoder + self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) + self.decoder = decoder + self.only_mask_loss = only_mask_loss + + @torch.inference_mode() + def inference(self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding, + finalize): + assert token.shape[0] == 1 + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # concat text and prompt_text + token_len1, token_len2 = prompt_token.shape[1], token.shape[1] + token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len + mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + if finalize is False: + h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride] + mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] + h = self.encoder_proj(h) + + # get conditions + conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device) + conds[:, :mel_len1] = prompt_feat + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h) + feat, _ = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10 + ) + feat = feat[:, :, mel_len1:] + assert feat.shape[2] == mel_len2 + return feat, None \ No newline at end of file diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index d011304..1e59f13 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -89,17 +89,25 @@ class ConditionalCFM(BASECFM): sol = [] for step in range(1, len(t_span)): - dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond) # Classifier-Free Guidance inference introduced in VoiceBox if self.inference_cfg_rate > 0: - 
cfg_dphi_dt = self.forward_estimator( - x, mask, - torch.zeros_like(mu), t, - torch.zeros_like(spks) if spks is not None else None, - torch.zeros_like(cond) - ) - dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - - self.inference_cfg_rate * cfg_dphi_dt) + x_in = torch.concat([x, x], dim=0) + mask_in = torch.concat([mask, mask], dim=0) + mu_in = torch.concat([mu, torch.zeros_like(mu).to(x.device)], dim=0) + t_in = torch.concat([t, t], dim=0) + spks_in = torch.concat([spks, torch.zeros_like(spks).to(x.device)], dim=0) if spks is not None else None + cond_in = torch.concat([cond, torch.zeros_like(cond).to(x.device)], dim=0) if cond is not None else None + else: + x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond + dphi_dt = self.forward_estimator( + x_in, mask_in, + mu_in, t_in, + spks_in, + cond_in + ) + if self.inference_cfg_rate > 0: + dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0) + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) x = x + dt * dphi_dt t = t + dt sol.append(x) @@ -163,3 +171,37 @@ class ConditionalCFM(BASECFM): pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) return loss, y + + +class CausalConditionalCFM(ConditionalCFM): + def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): + super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) + self.rand_noise = torch.randn([1, 80, 50 * 300]) + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. 
+ shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + + z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature + z[:] = 0 + # fix prompt and overlap part mu and z + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index caecf26..3cbe8b5 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -2,6 +2,8 @@ import base64 import os from functools import lru_cache from typing import Optional +import torch +from transformers import AutoTokenizer from whisper.tokenizer import Tokenizer import tiktoken @@ -234,3 +236,37 @@ def get_tokenizer( return Tokenizer( encoding=encoding, num_languages=num_languages, language=language, task=task ) + + +class QwenTokenizer(): + def __init__(self, token_path, skip_special_tokens=True): + special_tokens = { + 'eos_token': '<|endoftext|>', + 'pad_token': '<|endoftext|>', + 'additional_special_tokens': [ + '<|im_start|>', '<|im_end|>', '<|endofprompt|>', + '[breath]', '', '', '[noise]', + '[laughter]', '[cough]', '[clucking]', '[accent]', + '[quick_breath]', + ] + } + self.tokenizer = AutoTokenizer.from_pretrained(token_path) + self.tokenizer.add_special_tokens(special_tokens) + self.skip_special_tokens = skip_special_tokens + + def encode(self, text, **kwargs): + tokens = self.tokenizer([text], return_tensors="pt") + tokens = tokens["input_ids"][0].cpu().tolist() + return tokens + + def decode(self, tokens): + tokens = torch.tensor(tokens, dtype=torch.int64) + text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0] + return text + +@lru_cache(maxsize=None) +def get_qwen_tokenizer( + token_path: str, + skip_special_tokens: bool +) -> QwenTokenizer: + return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) \ No newline at end of file diff --git a/cosyvoice/transformer/encoder_layer.py b/cosyvoice/transformer/encoder_layer.py index dfd758b..efbb12d 100644 --- a/cosyvoice/transformer/encoder_layer.py +++ b/cosyvoice/transformer/encoder_layer.py @@ -49,8 +49,8 @@ class TransformerEncoderLayer(nn.Module): super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.norm1 = nn.LayerNorm(size, eps=1e-12) + self.norm2 = nn.LayerNorm(size, eps=1e-12) self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before @@ -142,17 +142,17 @@ class ConformerEncoderLayer(nn.Module): self.feed_forward = feed_forward self.feed_forward_macaron = feed_forward_macaron self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module + self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module + self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) + self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12) self.ff_scale = 0.5 else: self.ff_scale = 1.0 if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, 
eps=1e-5) # for the CNN module + self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block + size, eps=1e-12) # for the final output of the block self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 25bc835..2e12ad2 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -153,3 +153,14 @@ def set_all_random_seed(seed): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) + + +def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: + assert mask.dtype == torch.bool + assert dtype in [torch.float32, torch.bfloat16, torch.float16] + mask = mask.to(dtype) + # attention mask bias + # NOTE(Mddct): torch.finfo jit issues + # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min + mask = (1.0 - mask) * torch.finfo(dtype).min + return mask \ No newline at end of file From 0bf706c26f488dd9a06ded13bb93659952424119 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 11 Dec 2024 16:37:15 +0800 Subject: [PATCH 03/10] update --- cosyvoice/transformer/upsample_encoder.py | 322 ++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 cosyvoice/transformer/upsample_encoder.py diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py new file mode 100644 index 0000000..7c64726 --- /dev/null +++ b/cosyvoice/transformer/upsample_encoder.py @@ -0,0 +1,322 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +from typing import Tuple + +import torch +from torch import nn +import torch.utils.checkpoint as ckpt +from torch.nn import functional as F + +from cosyvoice.transformer.convolution import ConvolutionModule +from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer +from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward +from cosyvoice.utils.class_utils import ( + COSYVOICE_EMB_CLASSES, + COSYVOICE_SUBSAMPLE_CLASSES, + COSYVOICE_ATTENTION_CLASSES, + COSYVOICE_ACTIVATION_CLASSES, +) +from cosyvoice.utils.mask import make_pad_mask +from cosyvoice.utils.mask import add_optional_chunk_mask + + +class Upsample1D(nn.Module): + """A 1D upsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + use_conv_transpose (`bool`, default `False`): + option to use a convolution transpose. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. 
+ """ + + def __init__(self, channels: int, out_channels: int, stride: int=2): + super().__init__() + self.channels = channels + self.out_channels = out_channels + self.stride = stride + # In this mode, first repeat interpolate, than conv with stride=1 + self.conv = nn.Conv1d( + self.channels, self.out_channels, stride*2+1, stride=1, + padding=0, + ) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor): + outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest") + outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0) + outputs = self.conv(outputs) + return outputs, input_lengths * self.stride + + +class PreLookaheadLayer(nn.Module): + def __init__(self, channels: int, pre_lookahead_len: int = 1): + super().__init__() + self.channels = channels + self.pre_lookahead_len = pre_lookahead_len + self.conv1 = nn.Conv1d( + channels, channels, + kernel_size=pre_lookahead_len+1, + stride=1, padding=0, + ) + self.conv2 = nn.Conv1d( + channels, channels, + kernel_size=3, stride=1, padding=0, + ) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + inputs: (batch_size, seq_len, channels) + """ + outputs = inputs.transpose(1, 2).contiguous() + # look ahead + outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0) + outputs = F.leaky_relu(self.conv1(outputs)) + # outputs + outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0) + outputs = self.conv2(outputs) + outputs = outputs.transpose(1, 2).contiguous() + + # residual connection + outputs = outputs + inputs + return outputs + + +class UpsampleConformerEncoder(torch.nn.Module): + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + key_bias: bool = True, + gradient_checkpointing: bool = False, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. + """ + super().__init__() + self._output_size = output_size + + self.global_cmvn = global_cmvn + self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + self.gradient_checkpointing = gradient_checkpointing + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + # self-attention module definition + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + key_bias, + ) + # feed-forward module definition + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3) + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(num_blocks) + ]) + self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2) + self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + self.up_encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(4) + ]) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. + https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + # lookahead + conformer encoder + xs = self.pre_lookahead_layer(xs) + xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad) + + # upsample + conformer encoder + xs = xs.transpose(1, 2).contiguous() + xs, xs_lens = self.up_layer(xs, xs_lens) + xs = xs.transpose(1, 2).contiguous() + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + xs, pos_emb, masks = self.up_embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size * self.up_layer.stride, + num_decoding_left_chunks) + xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad) + + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs + + def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.up_encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs From 2345ce6be2548f134c6366d24c2fd69d045ee33a Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 15:43:17 +0800 Subject: [PATCH 04/10] update --- README.md | 10 +++++----- cosyvoice/flow/flow.py | 6 +++++- cosyvoice/tokenizer/tokenizer.py | 5 +++++ requirements.txt | 7 ++++--- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0e01098..9a717cd 100644 --- a/README.md +++ b/README.md @@ -116,27 +116,27 @@ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, loa print(cosyvoice.list_avaliable_spks()) # change stream=True for chunk stream inference for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)): - torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to 
pretrained_models/CosyVoice-300M for 50Hz inference # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # cross_lingual usage prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)): - torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # vc usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)): - torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct') # instruct usage, support [laughter][breath] for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)): - torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) ``` **Start web demo** diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index 459e3fc..d99c495 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -157,6 +157,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): vocab_size: int = 4096, input_frame_rate: int = 50, only_mask_loss: bool = True, + token_mel_ratio: int = 2, + pre_lookahead_len: int = 3, encoder: torch.nn.Module = None, decoder: torch.nn.Module = None, decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, @@ -181,6 +183,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) self.decoder = decoder self.only_mask_loss = only_mask_loss + self.token_mel_ratio = token_mel_ratio + self.pre_lookahead_len = pre_lookahead_len @torch.inference_mode() def inference(self, @@ -206,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): # text encode h, h_lengths = self.encoder(token, token_len) if finalize is False: - h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride] + h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio] mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] h = self.encoder_proj(h) diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index 3cbe8b5..fbe78ff 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -240,6 +240,8 @@ def get_tokenizer( class QwenTokenizer(): def __init__(self, token_path, skip_special_tokens=True): + 
super().__init__() + # NOTE: non-chat model, all these special tokens keep randomly initialized. special_tokens = { 'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', @@ -248,6 +250,9 @@ class QwenTokenizer(): '[breath]', '', '', '[noise]', '[laughter]', '[cough]', '[clucking]', '[accent]', '[quick_breath]', + "", "", + "[hissing]", "[sigh]", "[vocalized-noise]", + "[lipsmack]", "[mn]" ] } self.tokenizer = AutoTokenizer.from_pretrained(token_path) diff --git a/requirements.txt b/requirements.txt index d6b3ca2..da758a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/torch_stable.html conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 @@ -25,8 +25,9 @@ pydantic==2.7.0 rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 -torch==2.0.1 -torchaudio==2.0.2 +tensorrt-cu12==10.0.1 +torch==2.3.1+cu121 +torchaudio==2.3.1+cu121 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From c693039d14e698dc5365eaa8b23b28075bde2d42 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 16:46:28 +0800 Subject: [PATCH 05/10] update --- README.md | 45 +++++++++++++++++++---- cosyvoice/cli/cosyvoice.py | 36 ++++++++++-------- cosyvoice/cli/frontend.py | 16 ++++---- cosyvoice/cli/model.py | 47 +++++++++++------------- cosyvoice/flow/flow_matching.py | 65 +++++++++++++++++++++++++++------ requirements.txt | 7 ++-- 6 files changed, 145 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 9a717cd..634d670 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ # CosyVoice + +## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 +[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)] + ## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 [[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] @@ -6,6 +10,11 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo ## Roadmap +- [x] 2024/12 + + - [x] CosyVoice2-0.5B model release + - [x] CosyVoice2-0.5B streaming inference with no quality degradation + - [x] 2024/07 - [x] Flow matching training support @@ -24,9 +33,8 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo - [ ] TBD - - [ ] 25hz llama based llm model which supports lora finetune - - [ ] Support more instruction mode - - [ ] Music generation + - [ ] CosyVoice2-0.5B bistream inference support + - [ ] CosyVoice2-0.5B training and finetune recipie - [ ] CosyVoice-500M trained with more multi-lingual data - [ ] More... @@ -46,7 +54,7 @@ git submodule update --init --recursive - Create Conda env: ``` sh -conda create -n cosyvoice python=3.8 +conda create -n cosyvoice python=3.10 conda activate cosyvoice # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. 
conda install -y -c conda-forge pynini==2.1.5 @@ -68,6 +76,7 @@ If you are expert in this field, and you are only interested in training your ow ``` python # SDK模型下载 from modelscope import snapshot_download +snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B') snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz') snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') @@ -78,6 +87,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice ``` sh # git模型下载,请确保已安装git lfs mkdir -p pretrained_models +git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT @@ -97,9 +107,11 @@ pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl **Basic Usage** -For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model. +For zero_shot/cross_lingual inference, please use `CosyVoice2-0.5B` or `CosyVoice-300M` model. For sft inference, please use `CosyVoice-300M-SFT` model. For instruct inference, please use `CosyVoice-300M-Instruct` model. +We strongly recommend using `CosyVoice2-0.5B` model for better streaming performance. + First, add `third_party/Matcha-TTS` to your `PYTHONPATH`. ``` sh @@ -107,10 +119,18 @@ export PYTHONPATH=third_party/Matcha-TTS ``` ``` python -from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav import torchaudio +## cosyvoice2 usage +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, load_trt=False) +# sft usage +prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) +for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) + +## cosyvoice usage cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True) # sft usage print(cosyvoice.list_avaliable_spks()) @@ -189,5 +209,16 @@ You can also scan the QR code to join our official Dingding chat group. 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). +## Citations + +``` bibtex +@article{du2024cosyvoice, + title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens}, + author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others}, + journal={arXiv preprint arXiv:2407.05407}, + year={2024} +} +``` + ## Disclaimer -The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. 
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. \ No newline at end of file diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index d4bea1e..c95439a 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -38,6 +38,7 @@ class CosyVoice: '{}/spk2info.pt'.format(model_dir), instruct, configs['allowed_special']) + self.sample_rate = configs['sample_rate'] if torch.cuda.is_available() is False and (fp16 is True or load_jit is True): load_jit = False fp16 = False @@ -64,7 +65,7 @@ class CosyVoice: start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -74,11 +75,11 @@ class CosyVoice: for i in tqdm(self.frontend.text_normalize(tts_text, split=True)): if len(i) < 0.5 * len(prompt_text): logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text)) - model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) + model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -87,11 +88,11 @@ class CosyVoice: if self.frontend.instruct is True: raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) for i in tqdm(self.frontend.text_normalize(tts_text, split=True)): - model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) + model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() @@ -105,23 +106,23 @@ class CosyVoice: start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0): - model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k) + model_input = self.frontend.frontend_vc(source_speech_16k, 
prompt_speech_16k, self.sample_rate) start_time = time.time() for model_output in self.model.vc(**model_input, stream=stream, speed=speed): - speech_len = model_output['tts_speech'].shape[1] / 22050 + speech_len = model_output['tts_speech'].shape[1] / self.sample_rate logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len)) yield model_output start_time = time.time() class CosyVoice2(CosyVoice): - def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True): + def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False): instruct = True if '-Instruct' in model_dir else False self.model_dir = model_dir if not os.path.exists(model_dir): @@ -135,18 +136,21 @@ class CosyVoice2(CosyVoice): '{}/spk2info.pt'.format(model_dir), instruct, configs['allowed_special']) - if torch.cuda.is_available() is False and (fp16 is True or load_jit is True): + self.sample_rate = configs['sample_rate'] + if torch.cuda.is_available() is False and load_jit is True: load_jit = False - fp16 = False - logging.warning('cpu do not support fp16 and jit, force set to False') - self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16) + logging.warning('cpu do not support jit, force set to False') + self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift']) self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), '{}/hift.pt'.format(model_dir)) if load_jit: - self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir), - '{}/llm.llm.fp16.zip'.format(model_dir), - '{}/flow.encoder.fp32.zip'.format(model_dir)) + self.model.load_jit('{}/flow.encoder.fp32.zip'.format(model_dir)) + if load_trt is True and load_onnx is True: + load_onnx = False + logging.warning('can not set both load_trt and load_onnx to True, force set load_onnx to False') if load_onnx: self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) + if load_trt: + self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir)) del configs \ No newline at end of file diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index ac138b1..f73ba42 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -142,11 +142,11 @@ class CosyVoiceFrontEnd: model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding} return model_input - def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): + def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate): tts_text_token, tts_text_token_len = self._extract_text_token(tts_text) prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text) - prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k) - speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050) + prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k) + speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample) speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k) embedding = self._extract_spk_embedding(prompt_speech_16k) model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, @@ -157,8 +157,8 @@ class CosyVoiceFrontEnd: 'llm_embedding': embedding, 'flow_embedding': embedding} return model_input - def frontend_cross_lingual(self, tts_text, 
prompt_speech_16k): - model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k) + def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate): + model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate) # in cross lingual mode, we remove prompt in llm del model_input['prompt_text'] del model_input['prompt_text_len'] @@ -175,10 +175,10 @@ class CosyVoiceFrontEnd: model_input['prompt_text_len'] = instruct_text_token_len return model_input - def frontend_vc(self, source_speech_16k, prompt_speech_16k): + def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate): prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k) - prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k) - prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050) + prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k) + prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample) embedding = self._extract_spk_embedding(prompt_speech_16k) source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k) model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len, diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index ce53f04..1191e62 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -261,16 +261,15 @@ class CosyVoice2Model: def __init__(self, llm: torch.nn.Module, flow: torch.nn.Module, - hift: torch.nn.Module, - fp16: bool): + hift: torch.nn.Module): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.llm = llm self.flow = flow self.hift = hift - self.fp16 = fp16 - self.token_min_hop_len = 1 * self.flow.input_frame_rate - self.token_max_hop_len = 2 * self.flow.input_frame_rate - self.token_right_context = self.flow.encoder.pre_lookahead_layer.pre_lookahead_len + self.token_hop_len = 2 * self.flow.input_frame_rate + # here we fix flow encoder/decoder decoding_chunk_size, in the future we will send it as arguments, or use cache + self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate + self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio # hift cache self.mel_cache_len = 8 self.source_cache_len = int(self.mel_cache_len * 480) @@ -278,7 +277,6 @@ class CosyVoice2Model: self.speech_window = np.hamming(2 * self.source_cache_len) # rtf and decoding related self.stream_scale_factor = 1 - assert self.stream_scale_factor == 1, 'fix stream_scale_factor to 1 as we haven\'t implement cache in flow matching yet, this constraint will be loosen in the future' self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() self.lock = threading.Lock() # dict used to store session related variable @@ -293,17 +291,13 @@ class CosyVoice2Model: self.llm.half() self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() + self.flow.decoder.fp16 = False # in case hift_model is a hifigan model hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()} self.hift.load_state_dict(hift_state_dict, strict=True) self.hift.to(self.device).eval() - def load_jit(self, llm_text_encoder_model, 
llm_llm_model, flow_encoder_model): - assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model" - llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device) - self.llm.text_encoder = llm_text_encoder - llm_llm = torch.jit.load(llm_llm_model, map_location=self.device) - self.llm.llm = llm_llm + def load_jit(self, flow_encoder_model): flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device) self.flow.encoder = flow_encoder @@ -316,6 +310,14 @@ class CosyVoice2Model: del self.flow.decoder.estimator self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers) + def load_trt(self, flow_decoder_estimator_model): + del self.flow.decoder.estimator + import tensorrt as trt + with open(flow_decoder_estimator_model, 'rb') as f: + self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read()) + self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context() + self.flow.decoder.fp16 = True + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): if self.fp16 is True: llm_embedding = llm_embedding.half() @@ -339,7 +341,7 @@ class CosyVoice2Model: prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), embedding=embedding.to(self.device), finalize=finalize) - tts_mel = tts_mel[:, :, token_offset * self.flow.encoder.up_layer.stride:] + tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:] # append hift cache if self.hift_cache_dict[uuid] is not None: hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source'] @@ -377,13 +379,11 @@ class CosyVoice2Model: p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)) p.start() if stream is True: - token_hop_len, token_offset = self.token_min_hop_len, 0 - self.flow.encoder.static_chunk_size = self.token_min_hop_len - self.flow.decoder.estimator.static_chunk_size = self.token_min_hop_len * self.flow.encoder.up_layer.stride + token_offset = 0 while True: time.sleep(0.1) - if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= token_hop_len + self.token_right_context: - this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + token_hop_len + self.token_right_context]) \ + if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len: + this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]) \ .unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, prompt_token=flow_prompt_speech_token, @@ -392,11 +392,9 @@ class CosyVoice2Model: uuid=this_uuid, token_offset=token_offset, finalize=False) - token_offset += token_hop_len + token_offset += self.token_hop_len yield {'tts_speech': this_tts_speech.cpu()} - # increase token_hop_len for better speech quality - token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor)) - if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < token_hop_len + self.token_right_context: + if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len: break p.join() 
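The streaming branch in the hunk above only vocodes a new chunk once the LLM has produced one full hop of speech tokens plus the flow encoder's lookahead window, then advances the offset by a single hop. A hedged sketch of that readiness rule follows; the function and argument names are illustrative, mirroring the attributes used in the patch, and are not part of the diff.

``` python
# Illustrative sketch only, not part of the patch: the condition the streaming
# loop above evaluates before synthesizing the next audio chunk.
def ready_for_next_chunk(num_generated_tokens: int, token_offset: int,
                         token_hop_len: int, pre_lookahead_len: int) -> bool:
    # One hop of tokens is consumed per chunk; the extra lookahead tokens are
    # required by the causal flow encoder but are not consumed by this chunk.
    return num_generated_tokens - token_offset >= token_hop_len + pre_lookahead_len

# After a chunk is emitted, the caller advances the offset: token_offset += token_hop_len
```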
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None @@ -412,14 +410,13 @@ class CosyVoice2Model: else: # deal with all tokens p.join() - self.flow.encoder.static_chunk_size = 0 - self.flow.decoder.estimator.static_chunk_size = 0 this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0) this_tts_speech = self.token2wav(token=this_tts_speech_token, prompt_token=flow_prompt_speech_token, prompt_feat=prompt_speech_feat, embedding=flow_embedding, uuid=this_uuid, + token_offset=0, finalize=True, speed=speed) yield {'tts_speech': this_tts_speech.cpu()} diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index 1e59f13..1c293f8 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import onnxruntime import torch import torch.nn.functional as F from matcha.models.components.flow_matching import BASECFM @@ -88,15 +89,25 @@ class ConditionalCFM(BASECFM): # Or in future might add like a return_all_steps flag sol = [] + if self.inference_cfg_rate > 0: + # Do not use concat, it may cause memory format changed and trt infer with wrong results! + x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype) + mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + t_in = torch.zeros([2], device=x.device, dtype=x.dtype) + spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype) + cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype) + else: + x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond for step in range(1, len(t_span)): # Classifier-Free Guidance inference introduced in VoiceBox if self.inference_cfg_rate > 0: - x_in = torch.concat([x, x], dim=0) - mask_in = torch.concat([mask, mask], dim=0) - mu_in = torch.concat([mu, torch.zeros_like(mu).to(x.device)], dim=0) - t_in = torch.concat([t, t], dim=0) - spks_in = torch.concat([spks, torch.zeros_like(spks).to(x.device)], dim=0) if spks is not None else None - cond_in = torch.concat([cond, torch.zeros_like(cond).to(x.device)], dim=0) if cond is not None else None + x_in[:] = x + mask_in[:] = mask + mu_in[0] = mu + t_in[:] = t.unsqueeze(0) + spks_in[0] = spks + cond_in[0] = cond else: x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond dphi_dt = self.forward_estimator( @@ -114,22 +125,53 @@ class ConditionalCFM(BASECFM): if step < len(t_span) - 1: dt = t_span[step + 1] - t - return sol[-1] + return sol[-1].float() def forward_estimator(self, x, mask, mu, t, spks, cond): if isinstance(self.estimator, torch.nn.Module): return self.estimator.forward(x, mask, mu, t, spks, cond) - else: + elif isinstance(self.estimator, onnxruntime.InferenceSession): ort_inputs = { 'x': x.cpu().numpy(), 'mask': mask.cpu().numpy(), 'mu': mu.cpu().numpy(), 't': t.cpu().numpy(), - 'spks': spks.cpu().numpy(), - 'cond': cond.cpu().numpy() + 'spk': spks.cpu().numpy(), + 'cond': cond.cpu().numpy(), + 'mask_rand': torch.randn(1, 1, 1).numpy() } output = self.estimator.run(None, ort_inputs)[0] return torch.tensor(output, dtype=x.dtype, device=x.device) + else: + if not x.is_contiguous(): + x = x.contiguous() + if not mask.is_contiguous(): + mask = mask.contiguous() + if not 
mu.is_contiguous(): + mu = mu.contiguous() + if not t.is_contiguous(): + t = t.contiguous() + if not spks.is_contiguous(): + spks = spks.contiguous() + if not cond.is_contiguous(): + cond = cond.contiguous() + self.estimator.set_input_shape('x', (2, 80, x.size(2))) + self.estimator.set_input_shape('mask', (2, 1, x.size(2))) + self.estimator.set_input_shape('mu', (2, 80, x.size(2))) + self.estimator.set_input_shape('t', (2,)) + self.estimator.set_input_shape('spk', (2, 80)) + self.estimator.set_input_shape('cond', (2, 80, x.size(2))) + self.estimator.set_input_shape('mask_rand', (1, 1, 1)) + # run trt engine + self.estimator.execute_v2([x.data_ptr(), + mask.data_ptr(), + mu.data_ptr(), + t.data_ptr(), + spks.data_ptr(), + cond.data_ptr(), + torch.randn(1, 1, 1).to(x.device).data_ptr(), + x.data_ptr()]) + return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): """Computes diffusion loss @@ -199,7 +241,8 @@ class CausalConditionalCFM(ConditionalCFM): """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature - z[:] = 0 + if self.sp16 is True: + z = z.half() # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) if self.t_scheduler == 'cosine': diff --git a/requirements.txt b/requirements.txt index da758a0..8be865a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/torch_stable.html -conformer==0.3.2 +--extra-index-url https://download.pytorch.org/whl/cu121 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 gdown==5.1.0 @@ -26,8 +25,8 @@ rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 tensorrt-cu12==10.0.1 -torch==2.3.1+cu121 -torchaudio==2.3.1+cu121 +torch==2.3.1 +torchaudio==2.3.1 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From 2511a49a72af4971f1eb0c3204c2a9a54a6b6af4 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 18:48:25 +0800 Subject: [PATCH 06/10] update --- README.md | 2 +- cosyvoice/cli/model.py | 4 ---- cosyvoice/flow/flow_matching.py | 37 ++++++++++----------------------- requirements.txt | 8 +++++-- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 634d670..b8c1fd4 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ from cosyvoice.utils.file_utils import load_wav import torchaudio ## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, load_trt=False) +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_onnx=False, load_trt=False) # sft usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 1191e62..a66b463 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -287,8 +287,6 @@ class CosyVoice2Model: def load(self, llm_model, flow_model, hift_model): self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True) self.llm.to(self.device).eval() - if self.fp16 is True: - self.llm.half() self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True) self.flow.to(self.device).eval() self.flow.decoder.fp16 = False @@ -319,8 +317,6 @@ class CosyVoice2Model: self.flow.decoder.fp16 = True def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, 
uuid): - if self.fp16 is True: - llm_embedding = llm_embedding.half() with self.llm_context: for i in self.llm.inference(text=text.to(self.device), text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device), diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py index 1c293f8..d5458d4 100644 --- a/cosyvoice/flow/flow_matching.py +++ b/cosyvoice/flow/flow_matching.py @@ -136,41 +136,26 @@ class ConditionalCFM(BASECFM): 'mask': mask.cpu().numpy(), 'mu': mu.cpu().numpy(), 't': t.cpu().numpy(), - 'spk': spks.cpu().numpy(), - 'cond': cond.cpu().numpy(), - 'mask_rand': torch.randn(1, 1, 1).numpy() + 'spks': spks.cpu().numpy(), + 'cond': cond.cpu().numpy() } output = self.estimator.run(None, ort_inputs)[0] return torch.tensor(output, dtype=x.dtype, device=x.device) else: - if not x.is_contiguous(): - x = x.contiguous() - if not mask.is_contiguous(): - mask = mask.contiguous() - if not mu.is_contiguous(): - mu = mu.contiguous() - if not t.is_contiguous(): - t = t.contiguous() - if not spks.is_contiguous(): - spks = spks.contiguous() - if not cond.is_contiguous(): - cond = cond.contiguous() self.estimator.set_input_shape('x', (2, 80, x.size(2))) self.estimator.set_input_shape('mask', (2, 1, x.size(2))) self.estimator.set_input_shape('mu', (2, 80, x.size(2))) self.estimator.set_input_shape('t', (2,)) - self.estimator.set_input_shape('spk', (2, 80)) + self.estimator.set_input_shape('spks', (2, 80)) self.estimator.set_input_shape('cond', (2, 80, x.size(2))) - self.estimator.set_input_shape('mask_rand', (1, 1, 1)) # run trt engine - self.estimator.execute_v2([x.data_ptr(), - mask.data_ptr(), - mu.data_ptr(), - t.data_ptr(), - spks.data_ptr(), - cond.data_ptr(), - torch.randn(1, 1, 1).to(x.device).data_ptr(), - x.data_ptr()]) + self.estimator.execute_v2([x.contiguous().data_ptr(), + mask.contiguous().data_ptr(), + mu.contiguous().data_ptr(), + t.contiguous().data_ptr(), + spks.contiguous().data_ptr(), + cond.contiguous().data_ptr(), + x.data_ptr()]) return x def compute_loss(self, x1, mask, mu, spks=None, cond=None): @@ -241,7 +226,7 @@ class CausalConditionalCFM(ConditionalCFM): """ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature - if self.sp16 is True: + if self.fp16 is True: z = z.half() # fix prompt and overlap part mu and z t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) diff --git a/requirements.txt b/requirements.txt index 8be865a..04c600e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu121 +conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 gdown==5.1.0 @@ -16,8 +17,8 @@ modelscope==1.15.0 networkx==3.1 omegaconf==2.3.0 onnx==1.16.0 -onnxruntime-gpu==1.16.0; sys_platform == 'linux' -onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' +onnxruntime-gpu==1.18.0; sys_platform == 'linux' +onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'windows' openai-whisper==20231117 protobuf==4.25 pydantic==2.7.0 @@ -25,8 +26,11 @@ rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 tensorrt-cu12==10.0.1 +tensorrt-cu12-bindings==10.0.1 +tensorrt-cu12-libs==10.0.1 torch==2.3.1 torchaudio==2.3.1 +transformers==4.40.1 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0 From 9b8f28aa3200c0c98cff0cd7cdcbea2c535814f4 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 22:48:31 +0800 Subject: [PATCH 07/10] update --- README.md | 2 +- webui.py | 34 +++++++++++++++++----------------- 2 
files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index b8c1fd4..7df61d1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # CosyVoice ## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 -[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)] +[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)] ## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 [[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] diff --git a/webui.py b/webui.py index 0233a94..b402a64 100644 --- a/webui.py +++ b/webui.py @@ -22,7 +22,7 @@ import random import librosa ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) -from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav, logging from cosyvoice.utils.common import set_all_random_seed @@ -51,7 +51,7 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440): ) if speech.abs().max() > max_val: speech = speech / speech.abs().max() * max_val - speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1) + speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1) return speech @@ -71,31 +71,31 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if mode_checkbox_group in ['自然语言控制']: if cosyvoice.frontend.instruct is False: gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text == '': gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if prompt_wav is not None or prompt_text != '': gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language if mode_checkbox_group in ['跨语种复刻']: if cosyvoice.frontend.instruct is True: gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text != '': gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') if prompt_wav is None: gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: if prompt_wav is None: gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if torchaudio.info(prompt_wav).sample_rate < prompt_sr: gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) # sft mode only use sft_dropdown if mode_checkbox_group in ['预训练音色']: if instruct_text != '' or prompt_wav is not None or prompt_text != '': @@ -104,7 +104,7 @@ def 
generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if mode_checkbox_group in ['3s极速复刻']: if prompt_text == '': gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') - yield (target_sr, default_data) + yield (cosyvoice.sample_rate, default_data) if instruct_text != '': gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') @@ -112,24 +112,24 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro logging.info('get sft inference request') set_all_random_seed(seed) for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '3s极速复刻': logging.info('get zero_shot inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '跨语种复刻': logging.info('get cross_lingual inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) else: logging.info('get instruct inference request') set_all_random_seed(seed) for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) def main(): @@ -178,11 +178,11 @@ if __name__ == '__main__': default=8000) parser.add_argument('--model_dir', type=str, - default='pretrained_models/CosyVoice-300M', + default='pretrained_models/CosyVoice2-0.5B', help='local path or modelscope repo id') args = parser.parse_args() - cosyvoice = CosyVoice(args.model_dir) + cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir) sft_spk = cosyvoice.list_avaliable_spks() - prompt_sr, target_sr = 16000, 22050 - default_data = np.zeros(target_sr) + prompt_sr = 16000 + default_data = np.zeros(cosyvoice.sample_rate) main() From 6b9cebea14030ac8c76bbd3eeb0f898dfb150448 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Fri, 13 Dec 2024 15:33:36 +0800 Subject: [PATCH 08/10] update --- README.md | 4 ++-- cosyvoice/cli/cosyvoice.py | 2 +- requirements.txt | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7df61d1..65728e2 100644 --- a/README.md +++ b/README.md @@ -124,8 +124,8 @@ from cosyvoice.utils.file_utils import load_wav import torchaudio ## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_onnx=False, load_trt=False) -# sft usage +cosyvoice2 = CosyVoice('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_onnx=False, load_trt=False) +# zero_shot usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) diff --git a/cosyvoice/cli/cosyvoice.py 
b/cosyvoice/cli/cosyvoice.py index c95439a..7c2531d 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -128,7 +128,7 @@ class CosyVoice2(CosyVoice): if not os.path.exists(model_dir): model_dir = snapshot_download(model_dir) with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: - configs = load_hyperpyyaml(f) + configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'Qwen2-0.5B-CosyVoice-BlankEN')}) self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], configs['feat_extractor'], '{}/campplus.onnx'.format(model_dir), diff --git a/requirements.txt b/requirements.txt index 04c600e..7fd5596 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu121 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 From 6b5931dc70a9272485b0d931805bbe2accab4cb8 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Mon, 16 Dec 2024 09:42:08 +0800 Subject: [PATCH 09/10] update --- README.md | 89 +++++++++++++++++++-------------------- cosyvoice/cli/frontend.py | 35 +++++++-------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 65728e2..8ccc2c2 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,44 @@ -# CosyVoice +[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners) -## 👉🏻 [CosyVoice2 Demos](https://funaudiollm.github.io/cosyvoice2/) 👈🏻 -[[CosyVoice2 Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice2 Studio](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)] +## 👉🏻 CosyVoice 👈🏻 +**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_2.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B) -## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻 -[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)] +**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M) -For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice). +## Highlight🔥 + +**CosyVoice 2.0** has been released! Compared to version 1.0, the new version offers more accurate, more stable, faster, and better speech generation capabilities. +### Multilingual +- **Support Language**: Chinese, English, Japanese, Korean, Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjinese, Wuhanese, etc.) +- **Crosslingual & Mixlingual**:Support zero-shot voice cloning for cross-lingual and code-switching scenarios. +### Ultra-Low Latency +- **Bidirectional Streaming Support**: CosyVoice 2.0 integrates offline and streaming modeling technologies. +- **Rapid First Packet Synthesis**: Achieves latency as low as 150ms while maintaining high-quality audio output. 
+### High Accuracy +- **Improved Pronunciation**: Reduces pronunciation errors by 30% to 50% compared to CosyVoice 1.0. +- **Benchmark Achievements**: Attains the lowest character error rate on the hard test set of the Seed-TTS evaluation set. +### Strong Stability +- **Consistency in Timbre**: Ensures reliable voice consistency for zero-shot and cross-language speech synthesis. +- **Cross-language Synthesis**: Marked improvements compared to version 1.0. +### Natural Experience +- **Enhanced Prosody and Sound Quality**: Improved alignment of synthesized audio, raising MOS evaluation scores from 5.4 to 5.53. +- **Emotional and Dialectal Flexibility**: Now supports more granular emotional controls and accent adjustments. ## Roadmap - [x] 2024/12 - - [x] CosyVoice2-0.5B model release - - [x] CosyVoice2-0.5B streaming inference with no quality degradation + - [x] 25hz cosyvoice 2.0 released + +- [x] 2024/09 + + - [x] 25hz cosyvoice base model + - [x] 25hz cosyvoice voice conversion model + +- [x] 2024/08 + + - [x] Repetition Aware Sampling(RAS) inference for llm stability + - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization - [x] 2024/07 @@ -21,22 +46,6 @@ For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVo - [x] WeTextProcessing support when ttsfrd is not avaliable - [x] Fastapi server and client -- [x] 2024/08 - - - [x] Repetition Aware Sampling(RAS) inference for llm stability - - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization - -- [x] 2024/09 - - - [x] 25hz cosyvoice base model - - [x] 25hz cosyvoice voice conversion model - -- [ ] TBD - - - [ ] CosyVoice2-0.5B bistream inference support - - [ ] CosyVoice2-0.5B training and finetune recipie - - [ ] CosyVoice-500M trained with more multi-lingual data - - [ ] More... ## Install @@ -54,7 +63,7 @@ git submodule update --init --recursive - Create Conda env: ``` sh -conda create -n cosyvoice python=3.10 +conda create -n cosyvoice python=3.8 conda activate cosyvoice # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. conda install -y -c conda-forge pynini==2.1.5 @@ -102,16 +111,16 @@ Notice that this step is not necessary. If you do not install `ttsfrd` package, ``` sh cd pretrained_models/CosyVoice-ttsfrd/ unzip resource.zip -d . -pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl +pip install ttsfrd_dependency-0.1-py3-none-any.whl +pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl ``` **Basic Usage** -For zero_shot/cross_lingual inference, please use `CosyVoice2-0.5B` or `CosyVoice-300M` model. +We strongly recommend using `CosyVoice2-0.5B` for better performance. +For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model. For sft inference, please use `CosyVoice-300M-SFT` model. For instruct inference, please use `CosyVoice-300M-Instruct` model. -We strongly recommend using `CosyVoice2-0.5B` model for better streaming performance. - First, add `third_party/Matcha-TTS` to your `PYTHONPATH`. 
``` sh @@ -123,14 +132,15 @@ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav import torchaudio -## cosyvoice2 usage -cosyvoice2 = CosyVoice('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_onnx=False, load_trt=False) +# cosyvoice2 +cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False) + # zero_shot usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) -for i, j in enumerate(cosyvoice2.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=True)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice2.sample_rate) +for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) -## cosyvoice usage +# cosyvoice cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True) # sft usage print(cosyvoice.list_avaliable_spks()) @@ -209,16 +219,5 @@ You can also scan the QR code to join our official Dingding chat group. 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec). 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet). -## Citations - -``` bibtex -@article{du2024cosyvoice, - title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens}, - author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others}, - journal={arXiv preprint arXiv:2407.05407}, - year={2024} -} -``` - ## Disclaimer The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal. \ No newline at end of file diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index f73ba42..7b09c59 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from functools import partial +import json import onnxruntime import torch import numpy as np @@ -66,9 +67,7 @@ class CosyVoiceFrontEnd: ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \ 'failed to initialize ttsfrd resource' - self.frd.set_lang_type('pinyin') - self.frd.enable_pinyin_mix(True) - self.frd.set_breakmodel_index(1) + self.frd.set_lang_type('pinyinvg') else: self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False) self.en_tn_model = EnNormalizer() @@ -112,26 +111,28 @@ class CosyVoiceFrontEnd: text = text.strip() if contains_chinese(text): if self.use_ttsfrd: - text = self.frd.get_frd_extra_info(text, 'input') + texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] + text = ''.join(texts) else: text = self.zh_tn_model.normalize(text) - text = text.replace("\n", "") - text = replace_blank(text) - text = replace_corner_mark(text) - text = text.replace(".", "。") - text = text.replace(" - ", ",") - text = remove_bracket(text) - text = re.sub(r'[,,、]+$', '。', text) - texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + text = text.replace("\n", "") + text = replace_blank(text) + text = replace_corner_mark(text) + text = text.replace(".", "。") + text = text.replace(" - ", ",") + text = remove_bracket(text) + text = re.sub(r'[,,、]+$', '。', text) + texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, + token_min_n=60, merge_len=20, comma_split=False)) else: if self.use_ttsfrd: - text = self.frd.get_frd_extra_info(text, 'input') + texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] + text = ''.join(texts) else: text = self.en_tn_model.normalize(text) - text = spell_out_number(text, self.inflect_parser) - texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + text = spell_out_number(text, self.inflect_parser) + texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, + token_min_n=60, merge_len=20, comma_split=False)) if split is False: return text return texts From ac705603645e544d11461f879d4c593819c790f1 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Mon, 16 Dec 2024 09:54:24 +0800 Subject: [PATCH 10/10] fix lint --- cosyvoice/cli/cosyvoice.py | 3 ++- cosyvoice/cli/frontend.py | 4 ++-- cosyvoice/cli/model.py | 16 ++++++++-------- cosyvoice/flow/decoder.py | 22 ++++++++++++---------- cosyvoice/flow/flow.py | 5 ++--- cosyvoice/tokenizer/tokenizer.py | 2 +- cosyvoice/transformer/upsample_encoder.py | 11 +++++------ cosyvoice/utils/common.py | 2 +- 8 files changed, 33 insertions(+), 32 deletions(-) diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index 7c2531d..a6dc7a6 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -120,6 +120,7 @@ class CosyVoice: yield model_output start_time = time.time() + class CosyVoice2(CosyVoice): def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False): @@ -153,4 +154,4 @@ class CosyVoice2(CosyVoice): self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir)) if load_trt: 
self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir)) - del configs \ No newline at end of file + del configs diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 7b09c59..1281971 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -123,7 +123,7 @@ class CosyVoiceFrontEnd: text = remove_bracket(text) text = re.sub(r'[,,、]+$', '。', text) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + token_min_n=60, merge_len=20, comma_split=False)) else: if self.use_ttsfrd: texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]] @@ -132,7 +132,7 @@ class CosyVoiceFrontEnd: text = self.en_tn_model.normalize(text) text = spell_out_number(text, self.inflect_parser) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, - token_min_n=60, merge_len=20, comma_split=False)) + token_min_n=60, merge_len=20, comma_split=False)) if split is False: return text return texts diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index a66b463..bf962e9 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -330,13 +330,13 @@ class CosyVoice2Model: def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0): tts_mel, _ = self.flow.inference(token=token.to(self.device), - token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), - prompt_token=prompt_token.to(self.device), - prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), - prompt_feat=prompt_feat.to(self.device), - prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), - embedding=embedding.to(self.device), - finalize=finalize) + token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), + prompt_token=prompt_token.to(self.device), + prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device), + embedding=embedding.to(self.device), + finalize=finalize) tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:] # append hift cache if self.hift_cache_dict[uuid] is not None: @@ -418,4 +418,4 @@ class CosyVoice2Model: yield {'tts_speech': this_tts_speech.cpu()} with self.lock: self.tts_speech_token_dict.pop(this_uuid) - self.llm_end_dict.pop(this_uuid) \ No newline at end of file + self.llm_end_dict.pop(this_uuid) diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py index dfb3c07..6167a11 100644 --- a/cosyvoice/flow/decoder.py +++ b/cosyvoice/flow/decoder.py @@ -49,7 +49,7 @@ class CausalBlock1D(Block1D): class CausalResnetBlock1D(ResnetBlock1D): - def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int=8): + def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8): super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) self.block1 = CausalBlock1D(dim, dim_out) self.block2 = CausalBlock1D(dim_out, dim_out) @@ -70,12 +70,11 @@ class CausalConv1d(torch.nn.Conv1d): dtype=None ) -> None: super(CausalConv1d, self).__init__(in_channels, out_channels, - kernel_size, stride, - padding=0, dilation=dilation, - groups=groups, bias=bias, - 
padding_mode=padding_mode, - device=device, dtype=dtype - ) + kernel_size, stride, + padding=0, dilation=dilation, + groups=groups, bias=bias, + padding_mode=padding_mode, + device=device, dtype=dtype) assert stride == 1 self.causal_padding = (kernel_size - 1, 0) @@ -124,7 +123,8 @@ class ConditionalDecoder(nn.Module): input_channel = output_channel output_channel = channels[i] is_last = i == len(channels) - 1 - resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal \ + else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ BasicTransformerBlock( @@ -138,14 +138,16 @@ class ConditionalDecoder(nn.Module): ] ) downsample = ( - Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) + Downsample1D(output_channel) if not is_last else \ + CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) for _ in range(num_mid_blocks): input_channel = channels[-1] out_channels = channels[-1] - resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \ + ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index d99c495..c69ded6 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -202,7 +202,6 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): embedding = self.spk_embed_affine_layer(embedding) # concat text and prompt_text - token_len1, token_len2 = prompt_token.shape[1], token.shape[1] token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding) token = self.input_embedding(torch.clamp(token, min=0)) * mask @@ -211,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): h, h_lengths = self.encoder(token, token_len) if finalize is False: h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio] - mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] + mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] h = self.encoder_proj(h) # get conditions @@ -229,4 +228,4 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): ) feat = feat[:, :, mel_len1:] assert feat.shape[2] == mel_len2 - return feat, None \ No newline at end of file + return feat, None diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index fbe78ff..8113891 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -274,4 +274,4 @@ def get_qwen_tokenizer( token_path: str, skip_special_tokens: bool ) -> QwenTokenizer: - return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) \ No newline at end of file + 
return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens) diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py index 7c64726..05a99cc 100644 --- a/cosyvoice/transformer/upsample_encoder.py +++ b/cosyvoice/transformer/upsample_encoder.py @@ -19,7 +19,6 @@ from typing import Tuple import torch from torch import nn -import torch.utils.checkpoint as ckpt from torch.nn import functional as F from cosyvoice.transformer.convolution import ConvolutionModule @@ -49,14 +48,14 @@ class Upsample1D(nn.Module): number of output channels. Defaults to `channels`. """ - def __init__(self, channels: int, out_channels: int, stride: int=2): + def __init__(self, channels: int, out_channels: int, stride: int = 2): super().__init__() self.channels = channels self.out_channels = out_channels self.stride = stride # In this mode, first repeat interpolate, than conv with stride=1 self.conv = nn.Conv1d( - self.channels, self.out_channels, stride*2+1, stride=1, + self.channels, self.out_channels, stride * 2 + 1, stride = 1, padding=0, ) @@ -74,7 +73,7 @@ class PreLookaheadLayer(nn.Module): self.pre_lookahead_len = pre_lookahead_len self.conv1 = nn.Conv1d( channels, channels, - kernel_size=pre_lookahead_len+1, + kernel_size=pre_lookahead_len + 1, stride=1, padding=0, ) self.conv2 = nn.Conv1d( @@ -315,8 +314,8 @@ class UpsampleConformerEncoder(torch.nn.Module): return xs def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor) -> torch.Tensor: + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: for layer in self.up_encoders: xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) return xs diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 2e12ad2..b356f0c 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -163,4 +163,4 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: # NOTE(Mddct): torch.finfo jit issues # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min mask = (1.0 - mask) * torch.finfo(dtype).min - return mask \ No newline at end of file + return mask
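The `mask_to_bias` helper in the final hunk converts a boolean attention mask into an additive bias that can be added directly to attention scores before the softmax. A minimal, self-contained sketch of that behavior is below; the toy tensors are purely illustrative and not taken from the repository.

``` python
import torch

# True (valid) positions become 0.0, False (padded) positions become the most
# negative finite value, so adding the bias to raw attention scores drives the
# padded positions to ~zero probability after softmax.
mask = torch.tensor([[True, True, False]])
bias = (1.0 - mask.to(torch.float32)) * torch.finfo(torch.float32).min
scores = torch.zeros(1, 3)
attn = torch.softmax(scores + bias, dim=-1)
print(attn)  # tensor([[0.5000, 0.5000, 0.0000]])
```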