diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py index 045cb4e..bbc2a21 100644 --- a/cosyvoice/hifigan/generator.py +++ b/cosyvoice/hifigan/generator.py @@ -713,8 +713,8 @@ class CausalHiFTGenerator(HiFTGenerator): @torch.inference_mode() def inference(self, speech_feat: torch.Tensor, finalize: bool = True) -> torch.Tensor: # mel->f0 NOTE f0_predictor precision is crucial for causal inference, move self.f0_predictor to cpu if necessary - self.f0_predictor.to('cpu') - f0 = self.f0_predictor(speech_feat.cpu(), finalize=finalize).to(speech_feat) + self.f0_predictor.to(torch.float64) + f0 = self.f0_predictor(speech_feat.to(torch.float64), finalize=finalize).to(speech_feat) # f0->source s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t s, _, _ = self.m_source(s)