add vc code

2026-02-04 17:39:25 +08:00 · 2024-09-26 10:49:22 +08:00
parent ed87445540
commit 49015f63e6
7 changed files with 43 additions and 216 deletions
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -42,6 +42,7 @@ class CosyVoiceFrontEnd:
                 speech_tokenizer_model: str,
                 spk2info: str = '',
                 instruct: bool = False,
+                 vc: bool = False,
                 allowed_special: str = 'all'):
        self.tokenizer = get_tokenizer()
        self.feat_extractor = feat_extractor
@@ -55,7 +56,10 @@ class CosyVoiceFrontEnd:
                                                                                "CPUExecutionProvider"])
        if os.path.exists(spk2info):
            self.spk2info = torch.load(spk2info, map_location=self.device)
+        else:
+            self.spk2info = {}
        self.instruct = instruct
+        self.vc = vc
        self.allowed_special = allowed_special
        self.inflect_parser = inflect.engine()
        self.use_ttsfrd = use_ttsfrd
@@ -172,3 +176,15 @@ class CosyVoiceFrontEnd:
        model_input['prompt_text'] = instruct_text_token
        model_input['prompt_text_len'] = instruct_text_token_len
        return model_input
+
+    def frontend_vc(self, source_speech_16k, prompt_speech_16k):
+        """Assemble the model input dict for the VC path from two speech inputs.
+
+        NOTE(review): "vc" presumably stands for voice conversion, with the
+        source supplying the content and the prompt supplying the target
+        voice -- confirm against the caller.
+
+        Args:
+            source_speech_16k: speech from which only speech tokens are
+                extracted. The name suggests a 16 kHz waveform; the exact
+                type/shape is whatever ``_extract_speech_token`` accepts
+                (not visible here -- TODO confirm).
+            prompt_speech_16k: speech used for prompt tokens, prompt acoustic
+                features and the speaker embedding. It is resampled below
+                with ``orig_freq=16000``, so it is treated as 16 kHz audio.
+
+        Returns:
+            dict with the keys ``source_speech_token``,
+            ``source_speech_token_len``, ``flow_prompt_speech_token``,
+            ``flow_prompt_speech_token_len``, ``prompt_speech_feat``,
+            ``prompt_speech_feat_len`` and ``flow_embedding``.
+        """
+        # Prompt side: tokens are extracted from the 16 kHz prompt as given.
+        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        # The feature extractor is fed a 22.05 kHz copy of the prompt, so resample first.
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        # The speaker embedding is computed from the original (non-resampled) prompt.
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        # Source side: only speech tokens are extracted; no features or embedding.
+        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
+        # Only source tokens plus prompt-derived entries are set here; no text or
+        # ``llm_``-prefixed keys are produced by this method.
+        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
+                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
+                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
+                       'flow_embedding': embedding}
+        return model_input