mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-04 17:39:25 +08:00)

Commit: add llm bistream
@@ -293,14 +293,46 @@ class Qwen2LM(TransformerLM):
         self.sampling = sampling
         self.mix_ratio = mix_ratio
 
-    def pad_unpad_sequence(self, sos_eos_emb, text_token, text_token_len, task_id_emb, speech_token, speech_token_len, bistream):
+    def prepare_lm_input_target(self, text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len):
+        lm_target, lm_input = [], []
         text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
         speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
-        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0)
-                    for i in range(len(text_token))]
+        text_token_emb = unpad_sequence(text_token_emb, text_token_len.cpu(), batch_first=True)
+        speech_token_emb = unpad_sequence(speech_token_emb, speech_token_len.cpu(), batch_first=True)
+        for i in range(len(text_token)):
+            # bistream sequence
+            if random.random() < 0.5 and speech_token_len[i] / text_token_len[i] > self.mix_ratio[1] / self.mix_ratio[0]:
+                this_lm_target, this_lm_input = [], []
+                this_lm_target.append(IGNORE_ID)
+                this_lm_input.append(self.llm_embedding.weight[self.sos_eos].reshape(1, -1))
+                for j in range(((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()):
+                    this_text_token = text_token[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]].tolist()
+                    this_speech_token = speech_token[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]].tolist()
+                    if len(this_text_token) == self.mix_ratio[0]:
+                        assert len(this_speech_token) == self.mix_ratio[1]
+                        this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
+                        this_lm_target += this_speech_token
+                        this_lm_target.append(self.speech_token_size + 2)
+                        this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]])
+                        this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]])
+                    else:
+                        this_lm_target += [IGNORE_ID] * len(this_text_token)
+                        this_lm_target += speech_token[i][j * self.mix_ratio[1]:].tolist()
+                        this_lm_target.append(self.speech_token_size)
+                        this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]:])
+                        this_lm_input.append(self.llm_embedding.weight[self.task_id].reshape(1, -1))
+                        this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]:])
+                this_lm_target, this_lm_input = torch.tensor(this_lm_target), torch.concat(this_lm_input, dim=0)
+            # unistream sequence
+            else:
+                this_lm_target = torch.tensor([IGNORE_ID] * (1 + text_token_len[i]) + speech_token[i].tolist() + [self.speech_token_size])
+                this_lm_input = torch.concat([self.llm_embedding.weight[self.sos_eos].reshape(1, -1), text_token_emb[i], self.llm_embedding.weight[self.task_id].reshape(1, -1), speech_token_emb[i]], dim=0)
+            lm_target.append(this_lm_target)
+            lm_input.append(this_lm_input)
         lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
         lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
-        return lm_input, lm_input_len
+        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID)
+        return lm_target, lm_input, lm_input_len
 
     def forward(
             self,
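For reference, the chunking that the new bistream branch performs can be illustrated on plain token lists. A sample is only eligible for the bistream layout when speech_token_len / text_token_len > mix_ratio[1] / mix_ratio[0], i.e. when there are enough speech tokens to fill every full text chunk. Below is a minimal sketch, assuming mix_ratio = [5, 15]; interleave() is a hypothetical helper that mirrors only the chunk interleaving of prepare_lm_input_target, with string markers standing in for the embedding lookups:

import math

def interleave(text, speech, mix_ratio=(5, 15)):
    n_text, n_speech = mix_ratio
    out = ['<sos>']
    # The "+ 1" forces one final partial (possibly empty) text chunk, so the
    # tail branch below always runs exactly once.
    for j in range(math.ceil((len(text) + 1) / n_text)):
        text_chunk = text[j * n_text:(j + 1) * n_text]
        if len(text_chunk) == n_text:
            # Full chunk: n_text text tokens followed by n_speech speech tokens.
            out += text_chunk + speech[j * n_speech:(j + 1) * n_speech]
        else:
            # Tail: leftover text, then the task-id marker, then all leftover speech.
            out += text_chunk + ['<task_id>'] + speech[j * n_speech:]
            break
    return out

text = ['t%d' % i for i in range(7)]      # 7 text tokens
speech = ['s%d' % i for i in range(25)]   # 25 speech tokens; 25 / 7 > 15 / 5 holds
print(interleave(text, speech))
# ['<sos>', 't0'..'t4', 's0'..'s14', 't5', 't6', '<task_id>', 's15'..'s24']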
@@ -319,29 +351,20 @@ class Qwen2LM(TransformerLM):
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
 
-        # 1. prepare llm_target
-        bistream = True if random.random() < 0.5 else False
-        lm_target = [torch.tensor([IGNORE_ID] * (1 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() +
-                                  [self.speech_token_size]) for i in range(text_token.size(0))]
-        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
-
         # 1. encode text_token
-        text_token = self.llm.model.model.embed_tokens(text_token)
+        text_token_emb = self.llm.model.model.embed_tokens(text_token)
 
-        # 3. eos and task_id
-        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
-        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        # 2. encode speech_token
+        speech_token_emb = self.speech_embedding(speech_token)
 
-        # 4. encode speech_token
-        speech_token = self.speech_embedding(speech_token)
+        # 3. prepare llm_input/target
+        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len)
+        lm_target = lm_target.to(device)
 
-        # 5. unpad and pad
-        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, text_token, text_token_len, task_id_emb, speech_token, speech_token_len, bistream)
-
-        # 6. run lm forward
+        # 4. run lm forward
         lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
         logits = self.llm_decoder(lm_output)
-        loss = self.criterion_ce(logits, lm_target)
+        loss = self.criterion_ce(logits, lm_target.to(device))
         acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
         return {'loss': loss, 'acc': acc}
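A note on the target layout: the decoder vocabulary is speech_token_size + 3. Ids below speech_token_size are speech tokens, speech_token_size marks end of sequence (used by the unistream layout and the bistream tail), and speech_token_size + 2 closes each full text/speech chunk in the bistream layout; text positions are masked out of the loss with IGNORE_ID. The loss requires lm_input and lm_target to line up position for position, which the chunking guarantees. Below is a sanity sketch of that invariant, with assumed lengths and mix_ratio = [5, 15]; bistream_lengths() is hypothetical and only counts positions:

import math

def bistream_lengths(text_len, speech_len, n_text=5, n_speech=15):
    in_len, tgt_len = 1, 1                              # sos embedding / leading IGNORE_ID
    for j in range(math.ceil((text_len + 1) / n_text)):
        t = min(n_text, max(0, text_len - j * n_text))  # size of this text chunk
        if t == n_text:
            in_len += n_text + n_speech                 # text embs + speech embs
            tgt_len += (n_text - 1) + n_speech + 1      # IGNOREs + speech ids + chunk-end id
        else:
            s_left = max(0, speech_len - j * n_speech)  # leftover speech tokens
            in_len += t + 1 + s_left                    # text tail + task_id emb + speech tail
            tgt_len += t + s_left + 1                   # IGNOREs + speech ids + eos id
            break
    return in_len, tgt_len

print(bistream_lengths(7, 25))   # (34, 34): input and target lengths match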
|