commit 56d9876037
parent b35ece675b
Author: lyuxiang.lx
Date:   2025-12-09 07:57:10 +00:00

8 changed files with 49 additions and 49 deletions

View File

@@ -193,13 +193,13 @@ class CosyVoiceFrontEnd:
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
         del model_input['llm_embedding']
-        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text)
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input

     def frontend_instruct2(self, tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id):
-        model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_wav, resample_rate, zero_shot_spk_id)
+        model_input = self.frontend_zero_shot(tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id)
         del model_input['llm_prompt_speech_token']
         del model_input['llm_prompt_speech_token_len']
         return model_input
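
Note (not part of the diff): after this hunk the frontend no longer appends the end-of-prompt marker itself ('<endofprompt>' in frontend_instruct, '<|endofprompt|>' in frontend_instruct2); the marker is expected to arrive already embedded in instruct_text, as the instruct_list hunk further down does. A minimal caller-side sketch of that assumption, with a hypothetical guard:

# illustration only: ensure the instruct text already carries the marker
instruct_text = "You are a helpful assistant. Please say a sentence in a very soft voice."
if not instruct_text.endswith("<endofprompt>"):
    instruct_text = instruct_text + "<endofprompt>"   # hypothetical caller-side guard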

View File

@@ -129,7 +129,7 @@ class CosyVoiceModel:
     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                                                       token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                                                       prompt_token=prompt_token.to(self.device),
                                                                       prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
@@ -284,7 +284,7 @@ class CosyVoice2Model(CosyVoiceModel):
     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
@@ -413,7 +413,7 @@ class CosyVoice3Model(CosyVoice2Model):
     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
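
Note (not part of the diff): all three token2wav hunks make the same change, casting the speech-token tensor to int32 in the same .to() call that moves it to the target device. A small standalone sketch of that cast; the device choice below is illustrative, not taken from the diff:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
token = torch.tensor([[1, 2, 3]])             # int64 by default
token = token.to(device, dtype=torch.int32)   # move and cast in one call
assert token.dtype == torch.int32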

View File

@@ -155,7 +155,7 @@ class TransformerLM(torch.nn.Module):
         num_trials, max_trials = 0, 100
         while True:
             top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
-            if (not ignore_eos) or (self.speech_token_size not in top_ids):
+            if (not ignore_eos) or (top_ids < self.speech_token_size):
                 break
             num_trials += 1
             if num_trials > max_trials:
@@ -506,7 +506,7 @@ class Qwen2LM(TransformerLM):
                                           masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
                                           cache=cache)
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
             if top_ids in self.stop_token_ids:
                 break
             # in stream mode, yield token one by one
@@ -654,7 +654,7 @@ class CosyVoice3LM(Qwen2LM):
         self.mix_ratio = mix_ratio

         # 5. vllm related
-        self.stop_token_ids = [speech_token_size + i for i in range(4)]
+        self.stop_token_ids = [speech_token_size + i for i in range(200)]
         self.vllm_output_queue = {}

     def forward(
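
Note (not part of the diff): these hunks assume the sampling helpers now hand back a plain Python int (see the .item() changes in the last file), so the EOS check becomes a numeric comparison against speech_token_size, the .item() at the Qwen2LM call site is dropped, and CosyVoice3LM reserves 200 stop ids instead of 4. A small sketch of the int-based checks; the speech_token_size value below is illustrative only:

speech_token_size = 6561                                     # illustrative value
stop_token_ids = [speech_token_size + i for i in range(200)]

top_ids = 42                                                 # ordinary speech token
assert top_ids < speech_token_size and top_ids not in stop_token_ids

top_ids = speech_token_size + 1                              # special / stop token
assert top_ids >= speech_token_size and top_ids in stop_token_ids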

View File

@@ -25,32 +25,32 @@ import torch
 IGNORE_ID = -1

-instruct_list = ["You are a helpful assistant. 请用广东话表达。",
-                 "You are a helpful assistant. 请用东北话表达。",
-                 "You are a helpful assistant. 请用甘肃话表达。",
-                 "You are a helpful assistant. 请用贵州话表达。",
-                 "You are a helpful assistant. 请用河南话表达。",
-                 "You are a helpful assistant. 请用湖北话表达。",
-                 "You are a helpful assistant. 请用湖南话表达。",
-                 "You are a helpful assistant. 请用江西话表达。",
-                 "You are a helpful assistant. 请用闽南话表达。",
-                 "You are a helpful assistant. 请用宁夏话表达。",
-                 "You are a helpful assistant. 请用山西话表达。",
-                 "You are a helpful assistant. 请用陕西话表达。",
-                 "You are a helpful assistant. 请用山东话表达。",
-                 "You are a helpful assistant. 请用上海话表达。",
-                 "You are a helpful assistant. 请用四川话表达。",
-                 "You are a helpful assistant. 请用天津话表达。",
-                 "You are a helpful assistant. 请用云南话表达。",
-                 "You are a helpful assistant. Please say a sentence as loudly as possible.",
-                 "You are a helpful assistant. Please say a sentence in a very soft voice.",
-                 "You are a helpful assistant. 请用尽可能慢地语速说一句话。",
-                 "You are a helpful assistant. 请用尽可能快地语速说一句话。",
-                 "You are a helpful assistant. 请非常开心地说一句话。",
-                 "You are a helpful assistant. 请非常伤心地说一句话。",
-                 "You are a helpful assistant. 请非常生气地说一句话。",
-                 "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?",
-                 "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?"]
+instruct_list = ["You are a helpful assistant. 请用广东话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用东北话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用甘肃话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用贵州话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用河南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用湖北话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用湖南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用江西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用闽南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用宁夏话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用山西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用陕西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用山东话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用上海话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用四川话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用天津话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用云南话表达。<endofprompt>",
+                 "You are a helpful assistant. Please say a sentence as loudly as possible.<endofprompt>",
+                 "You are a helpful assistant. Please say a sentence in a very soft voice.<endofprompt>",
+                 "You are a helpful assistant. 请用尽可能慢地语速说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请用尽可能快地语速说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常开心地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常伤心地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常生气地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<endofprompt>",
+                 "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<endofprompt>"]


 def pad_list(xs: List[torch.Tensor], pad_value: int):
     """Perform padding for the list of tensors.
@@ -156,12 +156,12 @@ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
             break
     prob = torch.tensor(prob).to(weighted_scores)
     indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
-    top_ids = indices[prob.multinomial(1, replacement=True)]
+    top_ids = indices[prob.multinomial(1, replacement=True)].item()
     return top_ids


 def random_sampling(weighted_scores, decoded_tokens, sampling):
-    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
+    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True).item()
     return top_ids
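
Note (not part of the diff): both sampling helpers now unwrap the 1-element tensor returned by multinomial() with .item(), so downstream code (sampling_ids and the stop-token checks in the earlier hunks) works with plain Python ints. A small standalone sketch:

import torch

weighted_scores = torch.randn(10)
top_ids_tensor = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)  # tensor([k])
top_ids = top_ids_tensor.item()                                                   # plain int k
assert isinstance(top_ids, int)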