diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index b95a9e0..a51a304 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -26,7 +26,7 @@ from cosyvoice.utils.class_utils import get_model_type
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -48,7 +48,7 @@ class CosyVoice:
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
-        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
+        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, trt_concurrent)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 811b2cb..c1e441f 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -258,9 +258,6 @@ class CosyVoice2Model(CosyVoiceModel):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
-        # NOTE default setting for jit/onnx export, you can set to False when using pytorch inference
-        self.flow.encoder.streaming = True
-        self.flow.decoder.estimator.streaming = True
         self.hift = hift
         self.fp16 = fp16
         self.trt_concurrent = trt_concurrent
@@ -290,7 +287,7 @@ class CosyVoice2Model(CosyVoiceModel):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, finalize=False, speed=1.0):
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16), self.trt_context_dict[uuid]:
             tts_mel, _ = self.flow.inference(token=token.to(self.device),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
@@ -299,6 +296,7 @@ class CosyVoice2Model(CosyVoiceModel):
                                              prompt_feat=prompt_feat.to(self.device),
                                              prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
                                              embedding=embedding.to(self.device),
+                                             streaming=stream,
                                              finalize=finalize)
             tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
             # append hift cache
@@ -356,6 +354,7 @@ class CosyVoice2Model(CosyVoiceModel):
                                                        embedding=flow_embedding,
                                                        token_offset=token_offset,
                                                        uuid=this_uuid,
+                                                       stream=stream,
                                                        finalize=False)
                 token_offset += this_token_hop_len
                 yield {'tts_speech': this_tts_speech.cpu()}
diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py
index 9e28c3f..97768a4 100644
--- a/cosyvoice/flow/decoder.py
+++ b/cosyvoice/flow/decoder.py
@@ -419,10 +419,6 @@ class CausalConditionalDecoder(ConditionalDecoder):
         Returns:
             _type_: _description_
         """
-        if hasattr(self, 'streaming'):
-            assert self.training is False, 'you have self.streaming attr, make sure that you are running inference mode'
-            streaming = self.streaming
-
         t = self.time_embeddings(t).to(t.dtype)
         t = self.time_mlp(t)
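Taken together, these changes expose two knobs at the API surface: `trt_concurrent` (the size of the TensorRT execution-context pool) and the per-request `stream` flag that `token2wav` now forwards to `flow.inference` as `streaming`, replacing the old module-level `self.streaming` attribute. A minimal usage sketch — the checkpoint path and prompt file are illustrative, and `trt_concurrent` only has an effect with `load_trt=True`:

```python
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2

# pool two TRT execution contexts so two requests can run flow inference at once
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B',
                       load_jit=False, load_trt=False, fp16=False, trt_concurrent=2)

prompt_speech_16k = torchaudio.load('prompt.wav')[0]  # illustrative 16 kHz prompt clip
for i, out in enumerate(cosyvoice.inference_zero_shot(
        'Hello, this is a streaming synthesis test.', 'prompt transcript here.',
        prompt_speech_16k, stream=True)):  # stream=True now reaches flow.inference as streaming=True
    torchaudio.save('chunk_{}.wav'.format(i), out['tts_speech'], cosyvoice.sample_rate)
```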
diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index d9e832b..a068288 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -241,6 +241,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                   prompt_feat,
                   prompt_feat_len,
                   embedding,
+                  streaming,
                   finalize):
         assert token.shape[0] == 1
         # xvec projection
@@ -254,10 +255,10 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
 
         # text encode
         if finalize is True:
-            h, h_lengths = self.encoder(token, token_len)
+            h, h_lengths = self.encoder(token, token_len, streaming=streaming)
         else:
             token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
-            h, h_lengths = self.encoder(token, token_len, context=context)
+            h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
         mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
         h = self.encoder_proj(h)
@@ -273,6 +274,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                                      spks=embedding,
                                      cond=conds,
                                      n_timesteps=10,
+                                     streaming=streaming
                                      )
         feat = feat[:, :, mel_len1:]
         assert feat.shape[2] == mel_len2
diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py
index 735889f..704ced3 100644
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -69,7 +69,7 @@ class ConditionalCFM(BASECFM):
             t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
         return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), cache
 
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
+    def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
         """
         Fixed euler solver for ODEs.
         Args:
@@ -110,7 +110,8 @@ class ConditionalCFM(BASECFM):
                 x_in, mask_in,
                 mu_in, t_in,
                 spks_in,
-                cond_in
+                cond_in,
+                streaming
             )
             dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
             dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
@@ -122,9 +123,9 @@ class ConditionalCFM(BASECFM):
 
         return sol[-1].float()
 
-    def forward_estimator(self, x, mask, mu, t, spks, cond):
+    def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False):
         if isinstance(self.estimator, torch.nn.Module):
-            return self.estimator(x, mask, mu, t, spks, cond)
+            return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming)
         else:
             estimator, trt_engine = self.estimator.acquire_estimator()
             estimator.set_input_shape('x', (2, 80, x.size(2)))
@@ -196,7 +197,7 @@ class CausalConditionalCFM(ConditionalCFM):
         self.rand_noise = torch.randn([1, 80, 50 * 300])
 
     @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming=False):
        """Forward diffusion

        Args:
@@ -220,4 +221,4 @@ class CausalConditionalCFM(ConditionalCFM):
         t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
         if self.t_scheduler == 'cosine':
             t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
+        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming), None
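For reference, the doubled-batch classifier-free-guidance step that `solve_euler` performs around the patched `forward_estimator` call looks like this in isolation (shapes are illustrative; `inference_cfg_rate` matches the 0.7 used in the configs below):

```python
import torch

inference_cfg_rate = 0.7
x = torch.randn(1, 80, 120)                     # ODE state (batch, mel_bins, frames)
dphi_dt = torch.randn(2 * x.size(0), 80, 120)   # estimator output on the [cond; uncond] batch

# split the doubled batch back apart and blend, exactly as in solve_euler
dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
dphi_dt = (1.0 + inference_cfg_rate) * dphi_dt - inference_cfg_rate * cfg_dphi_dt
assert dphi_dt.shape == x.shape
```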
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
         """
-        if hasattr(self, 'streaming'):
-            assert self.training is False, 'you have self.streaming attr, make sure that you are running inference mode'
-            streaming = self.streaming
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
         if self.global_cmvn is not None:
diff --git a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
index 84d1bd5..df36109 100644
--- a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
+++ b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
@@ -158,6 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+    token_mel_ratio: 2
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
     sample_rate: !ref <sample_rate>
     hop_size: 480
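The new `token_mel_ratio: 2` mirrors the model geometry: CosyVoice2 speech tokens run at 25 Hz, while the 24 kHz mel features use a 480-sample hop (50 frames per second), so each token covers two mel frames. `token2wav` relies on the same ratio when trimming already-emitted frames in streaming mode. A quick sanity check:

```python
sample_rate, hop_size, token_rate = 24000, 480, 25
token_mel_ratio = (sample_rate // hop_size) // token_rate
assert token_mel_ratio == 2

token_offset = 25  # tokens already synthesized (illustrative)
mel_offset = token_offset * token_mel_ratio  # frames dropped via tts_mel[:, :, offset:]
print(mel_offset)  # -> 50
```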
diff --git a/examples/libritts/cosyvoice2/path.sh b/examples/libritts/cosyvoice2/path.sh
deleted file mode 100644
index e0fa06c..0000000
--- a/examples/libritts/cosyvoice2/path.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
diff --git a/examples/libritts/cosyvoice2/path.sh b/examples/libritts/cosyvoice2/path.sh
new file mode 120000
index 0000000..59f7179
--- /dev/null
+++ b/examples/libritts/cosyvoice2/path.sh
@@ -0,0 +1 @@
+../cosyvoice/path.sh
\ No newline at end of file
diff --git a/examples/libritts/cosyvoice2/tts_text.json b/examples/libritts/cosyvoice2/tts_text.json
deleted file mode 100644
index 9f3e8d9..0000000
--- a/examples/libritts/cosyvoice2/tts_text.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "1089_134686_000002_000000": [
-    "hello, my name is Jack. What is your name?"
-  ]
-}
\ No newline at end of file
diff --git a/examples/libritts/cosyvoice2/tts_text.json b/examples/libritts/cosyvoice2/tts_text.json
new file mode 120000
index 0000000..e85cf1c
--- /dev/null
+++ b/examples/libritts/cosyvoice2/tts_text.json
@@ -0,0 +1 @@
+../cosyvoice/tts_text.json
\ No newline at end of file
diff --git a/examples/magicdata-read/cosyvoice/conf b/examples/magicdata-read/cosyvoice/conf
new file mode 120000
index 0000000..e1368df
--- /dev/null
+++ b/examples/magicdata-read/cosyvoice/conf
@@ -0,0 +1 @@
+../../libritts/cosyvoice/conf
\ No newline at end of file
diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
deleted file mode 100644
index 0420d02..0000000
--- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.TransformerLM
-    text_encoder_input_size: !ref <text_encoder_input_size>
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
-    speech_token_size: 4096
-    length_normalized_loss: True
-    lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        input_size: !ref <text_encoder_input_size>
-        output_size: 1024
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        use_cnn_module: False
-        macaron_style: False
-        use_dynamic_chunk: False
-        use_dynamic_left_chunk: False
-        static_chunk_size: 1
-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
-        input_size: !ref <llm_input_size>
-        output_size: !ref <llm_output_size>
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 7
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: 'linear_legacy'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        static_chunk_size: 1
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-
-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 4096
-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
-    only_mask_loss: True
-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        output_size: 512
-        attention_heads: 4
-        linear_units: 1024
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256, 256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 8
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500 # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 12000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler: warmuplr
-    scheduler_conf:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
\ No newline at end of file
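These recipe configs (now provided through the `conf` symlink into the libritts recipe) are ordinary hyperpyyaml: `!new:` instantiates a class, `!name:` binds a callable, `!apply:` runs one at load time, and `!ref <key>` points back at another key. A minimal loading sketch, assuming hyperpyyaml is installed and run from the recipe directory:

```python
from hyperpyyaml import load_hyperpyyaml

# builds the llm/flow/hift modules the same way cosyvoice/cli/cosyvoice.py does
with open('conf/cosyvoice.yaml', 'r') as f:
    configs = load_hyperpyyaml(f)

llm, flow, hift = configs['llm'], configs['flow'], configs['hift']
print(type(llm).__name__, type(flow).__name__, type(hift).__name__)
```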
diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
deleted file mode 100644
index b2ff51c..0000000
--- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.TransformerLM
-    text_encoder_input_size: !ref <text_encoder_input_size>
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
-    speech_token_size: 4096
-    length_normalized_loss: True
-    lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        input_size: !ref <text_encoder_input_size>
-        output_size: 1024
-        attention_heads: 16
-        linear_units: 4096
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        use_cnn_module: False
-        macaron_style: False
-        use_dynamic_chunk: False
-        use_dynamic_left_chunk: False
-        static_chunk_size: 1
-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
-        input_size: !ref <llm_input_size>
-        output_size: !ref <llm_output_size>
-        attention_heads: 16
-        linear_units: 4096
-        num_blocks: 14
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: 'linear_legacy'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        static_chunk_size: 1
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-
-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 4096
-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
-    only_mask_loss: True
-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        output_size: 512
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256, 256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 12
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500 # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 2000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.001 # change to 1e-5 during sft
-    scheduler: warmuplr # change to constantlr during sft
-    scheduler_conf:
-        warmup_steps: 2500
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
\ No newline at end of file
diff --git a/examples/magicdata-read/cosyvoice/conf/ds_stage2.json b/examples/magicdata-read/cosyvoice/conf/ds_stage2.json
deleted file mode 100644
index 2b2de3d..0000000
--- a/examples/magicdata-read/cosyvoice/conf/ds_stage2.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
-  "train_micro_batch_size_per_gpu": 1,
-  "gradient_accumulation_steps": 1,
-  "steps_per_print": 100,
-  "gradient_clipping": 5,
-  "fp16": {
-    "enabled": false,
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 16,
-    "loss_scale_window": 256,
-    "hysteresis": 2,
-    "consecutive_hysteresis": false,
-    "min_loss_scale": 1
-  },
-  "bf16": {
-    "enabled": false
-  },
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "none",
-      "pin_memory": true
-    },
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5e8,
-    "overlap_comm": false,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5e8,
-    "contiguous_gradients": true
-  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": 0.001,
-      "weight_decay": 0.0001,
-      "torch_adam": true,
-      "adam_w_mode": true
-    }
-  }
-}
\ No newline at end of file
diff --git a/examples/magicdata-read/cosyvoice/path.sh b/examples/magicdata-read/cosyvoice/path.sh
deleted file mode 100644
index e0fa06c..0000000
--- a/examples/magicdata-read/cosyvoice/path.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
diff --git a/examples/magicdata-read/cosyvoice/path.sh b/examples/magicdata-read/cosyvoice/path.sh
new file mode 120000
index 0000000..4541305
--- /dev/null
+++ b/examples/magicdata-read/cosyvoice/path.sh
@@ -0,0 +1 @@
+../../libritts/cosyvoice/path.sh
\ No newline at end of file
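The run.sh change below adds hifigan to the training loop, enables AMP, and inserts a checkpoint-averaging stage. Conceptually, `average_model.py` takes an element-wise mean over the selected checkpoints; a simplified sketch (paths and epoch numbering are illustrative; the real script can also pick the best N checkpoints by validation loss via `--val_best`):

```python
import torch

ckpt_paths = ['exp/cosyvoice/llm/torch_ddp/epoch_{}_whole.pt'.format(e) for e in range(195, 200)]

avg = None
for path in ckpt_paths:
    states = torch.load(path, map_location='cpu')
    if avg is None:
        avg = {k: v.clone().float() for k, v in states.items()}
    else:
        for k in avg:
            avg[k] += states[k].float()
for k in avg:
    avg[k] /= len(ckpt_paths)  # element-wise mean over checkpoints

torch.save(avg, 'exp/cosyvoice/llm/torch_ddp/llm.pt')
```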
diff --git a/examples/magicdata-read/cosyvoice/run.sh b/examples/magicdata-read/cosyvoice/run.sh
index 1af1a28..888ac7b 100644
--- a/examples/magicdata-read/cosyvoice/run.sh
+++ b/examples/magicdata-read/cosyvoice/run.sh
@@ -83,7 +83,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   fi
   cp data/train/parquet/data.list data/train.data.list
   cp data/dev/parquet/data.list data/dev.data.list
-  for model in llm flow; do
+  for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
         --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
       cosyvoice/bin/train.py \
@@ -99,11 +99,26 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --num_workers ${num_workers} \
       --prefetch ${prefetch} \
       --pin_memory \
+      --use_amp \
       --deepspeed_config ./conf/ds_stage2.json \
       --deepspeed.save_states model+optimizer
   done
 fi
 
+# average model
+average_num=5
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  for model in llm flow hifigan; do
+    decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
+    echo "do model average and final checkpoint is $decode_checkpoint"
+    python cosyvoice/bin/average_model.py \
+      --dst_model $decode_checkpoint \
+      --src_path `pwd`/exp/cosyvoice/$model/$train_engine \
+      --num ${average_num} \
+      --val_best
+  done
+fi
+
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
   echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
   python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
diff --git a/runtime/python/grpc/server.py b/runtime/python/grpc/server.py
index 1cb48ae..76827e6 100644
--- a/runtime/python/grpc/server.py
+++ b/runtime/python/grpc/server.py
@@ -34,10 +34,10 @@ logging.basicConfig(level=logging.DEBUG,
 class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
     def __init__(self, args):
         try:
-            self.cosyvoice = CosyVoice(args.model_dir)
+            self.cosyvoice = CosyVoice(args.model_dir, trt_concurrent=args.max_conc)
         except Exception:
             try:
-                self.cosyvoice = CosyVoice2(args.model_dir)
+                self.cosyvoice = CosyVoice2(args.model_dir, trt_concurrent=args.max_conc)
             except Exception:
                 raise TypeError('no valid model_type!')
         logging.info('grpc service initialized')
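On the serving side, `--max_conc` already caps concurrent RPCs; forwarding it as `trt_concurrent` keeps one TRT execution context available per in-flight request. A sketch of the surrounding wiring, assuming the existing server.py argument names and the generated `cosyvoice_pb2_grpc` stubs (flag defaults are illustrative):

```python
import argparse
from concurrent import futures
import grpc
import cosyvoice_pb2_grpc  # generated gRPC stub module

parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int, default=50000)
parser.add_argument('--max_conc', type=int, default=4)  # concurrent RPCs == TRT contexts
parser.add_argument('--model_dir', type=str, default='pretrained_models/CosyVoice-300M')
args = parser.parse_args()

# one worker thread per allowed concurrent request, matching trt_concurrent
server = grpc.server(futures.ThreadPoolExecutor(max_workers=args.max_conc),
                     maximum_concurrent_rpcs=args.max_conc)
cosyvoice_pb2_grpc.add_CosyVoiceServicer_to_server(CosyVoiceServiceImpl(args), server)
server.add_insecure_port('0.0.0.0:{}'.format(args.port))
server.start()
server.wait_for_termination()
```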