diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index eacde5b..3d9ae01 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -698,7 +698,7 @@ class CosyVoice3LM(Qwen2LM):
         lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
         logits = self.llm_decoder(lm_output)
         loss = self.criterion_ce(logits, lm_target.to(device))
-        acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
+        acc = th_accuracy(logits.view(-1, self.speech_token_size + 200), lm_target, ignore_label=IGNORE_ID)
         return {'loss': loss, 'acc': acc}
 
     @torch.inference_mode()
diff --git a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
index df36109..2ec5fcb 100644
--- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
+++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
@@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 # model params
 # for all class/function included in this repo, we use !<ref> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
 # for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.Qwen2LM
+llm: !new:cosyvoice.llm.llm.CosyVoice3LM
     llm_input_size: !ref <llm_input_size>
     llm_output_size: !ref <llm_output_size>
     speech_token_size: 6561
@@ -231,4 +231,4 @@ train_conf_gan:
     grad_clip: 5
     accum_grad: 1 # in gan training, accum_grad must be 1
     log_interval: 100
-    save_per_step: -1
\ No newline at end of file
+    save_per_step: -1