Mirror of https://github.com/FunAudioLLM/CosyVoice.git, synced 2026-02-04 17:39:25 +08:00
Commit: update readme
README.md: 16 lines changed
@@ -131,19 +131,29 @@ export PYTHONPATH=third_party/Matcha-TTS
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav
 import torchaudio
+```

-# cosyvoice2
+**CosyVoice2 Usage**
+```python
 cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)

 # zero_shot usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
     torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

+# fine grained control
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_cross_lingual('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_speech_16k, stream=False)):
+    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
 # instruct usage
 for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+```

-# cosyvoice
+**CosyVoice Usage**
+```python
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
 # sft usage
 print(cosyvoice.list_avaliable_spks())
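The README examples above all run with stream=False and write one file per yielded segment. As a hedged aside (not part of this commit), the same generator API can also be driven in streaming mode; the sketch below assumes inference_zero_shot with stream=True still yields dicts carrying a 'tts_speech' tensor of shape [1, T], and simply concatenates the chunks into one waveform.

```python
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

# Collect streamed chunks and write a single file at the model's sample rate.
chunks = []
for chunk in cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
                                           '希望你以后能够做的比我还好呦。',
                                           prompt_speech_16k, stream=True):
    chunks.append(chunk['tts_speech'])  # assumed [1, T] tensor per chunk
torchaudio.save('zero_shot_streamed.wav', torch.concat(chunks, dim=1), cosyvoice.sample_rate)
```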
@@ -85,7 +85,7 @@ class CosyVoice:
             start_time = time.time()

     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
-        if self.frontend.instruct is True:
+        if self.frontend.instruct is True and isinstance(self.model, CosyVoiceModel):
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
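The one change in this hunk narrows the guard: an instruct-mode frontend now rejects cross-lingual inference only when the loaded model is the original CosyVoiceModel, so CosyVoice2-style models are no longer blocked. A toy sketch of that pattern, using stand-in classes rather than the repository's real ones:

```python
# Stand-in classes used only to illustrate the isinstance() narrowing above.
class CosyVoiceModel: ...
class CosyVoice2Model: ...

def check_cross_lingual(instruct_mode: bool, model) -> None:
    # After the change: only the original model type is rejected in instruct mode.
    if instruct_mode is True and isinstance(model, CosyVoiceModel):
        raise ValueError('this model does not support cross_lingual inference')

check_cross_lingual(True, CosyVoice2Model())     # passes after the change
try:
    check_cross_lingual(True, CosyVoiceModel())  # still rejected
except ValueError as e:
    print(e)
```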
@@ -109,6 +109,10 @@ class CosyVoiceFrontEnd:

     def text_normalize(self, text, split=True):
         text = text.strip()
+        # NOTE(lyuxiang.lx) move this judgement into ttsfrd in the future
+        for token in self.tokenizer.special_tokens['additional_special_tokens']:
+            if token in text:
+                return text if split is False else [text]
         if contains_chinese(text):
             if self.use_ttsfrd:
                 texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
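The four added lines short-circuit normalization when the input already contains one of the tokenizer's additional special tokens, so a fine-grained control marker is returned untouched rather than being mangled by normalization or sentence splitting. A self-contained sketch of that early return, with a dummy tokenizer and illustrative token names instead of the real CosyVoiceFrontEnd:

```python
# Dummy tokenizer stand-in; the real token list lives in QwenTokenizer.
class DummyTokenizer:
    special_tokens = {'additional_special_tokens': ['[laughter]', '[breath]']}

def text_normalize(tokenizer, text, split=True):
    text = text.strip()
    # Same early return as the added lines: control tokens bypass normalization.
    for token in tokenizer.special_tokens['additional_special_tokens']:
        if token in text:
            return text if split is False else [text]
    # ... language detection, normalization and sentence splitting would follow
    return [text]

print(text_normalize(DummyTokenizer(), '他突然[laughter]停下来。'))
# -> ['他突然[laughter]停下来。'], with the [laughter] marker preserved
```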
@@ -255,6 +255,7 @@ class QwenTokenizer():
                 "[lipsmack]", "[mn]"
             ]
         }
+        self.special_tokens = special_tokens
         self.tokenizer = AutoTokenizer.from_pretrained(token_path)
         self.tokenizer.add_special_tokens(special_tokens)
         self.skip_special_tokens = skip_special_tokens
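The single added line keeps the special-token dict on the tokenizer instance, which is what allows the text_normalize change above to read self.tokenizer.special_tokens['additional_special_tokens']. A simplified stand-in (not the real QwenTokenizer) showing that linkage:

```python
# Storing the dict on the instance is what downstream code reads back.
class SimpleTokenizer:
    def __init__(self, special_tokens, skip_special_tokens=True):
        self.special_tokens = special_tokens            # the line added in this commit
        self.skip_special_tokens = skip_special_tokens

tok = SimpleTokenizer({'additional_special_tokens': ['[lipsmack]', '[mn]']})
assert '[mn]' in tok.special_tokens['additional_special_tokens']
```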