mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 17:39:25 +08:00
update
This commit is contained in:
@@ -256,6 +256,10 @@ class CosyVoice2Model(CosyVoiceModel):
|
||||
self.fp16 = fp16
|
||||
# NOTE must match the static_chunk_size used in training
|
||||
self.token_hop_len = 25
|
||||
# NOTE increase token_hop_len incrementally to avoid duplicate inference
|
||||
self.token_max_hop_len = 4 * self.token_hop_len
|
||||
self.stream_scale_factor = 2
|
||||
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
|
||||
# hift cache
|
||||
self.mel_cache_len = 8
|
||||
self.source_cache_len = int(self.mel_cache_len * 480)
|
||||
@@ -353,6 +357,7 @@ class CosyVoice2Model(CosyVoiceModel):
|
||||
stream=stream,
|
||||
finalize=False)
|
||||
token_offset += this_token_hop_len
|
||||
self.token_hop_len = min(self.token_max_hop_len, self.token_hop_len * self.stream_scale_factor)
|
||||
yield {'tts_speech': this_tts_speech.cpu()}
|
||||
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
|
||||
break
|
||||
@@ -403,6 +408,10 @@ class CosyVoice3Model(CosyVoice2Model):
|
||||
self.fp16 = fp16
|
||||
# NOTE must match the static_chunk_size used in training
|
||||
self.token_hop_len = 25
|
||||
# NOTE increase token_hop_len incrementally to avoid duplicate inference
|
||||
self.token_max_hop_len = 4 * self.token_hop_len
|
||||
self.stream_scale_factor = 2
|
||||
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
|
||||
# rtf and decoding related
|
||||
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
|
||||
self.lock = threading.Lock()
|
||||
|
||||
Reference in New Issue
Block a user