Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 18:09:24 +08:00)
Merge pull request #360 from FunAudioLLM/dev/lyuxiang.lx
set load_onnx to False by default, as last-chunk RTF is unstable with ONNX
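For context, RTF (real-time factor) is wall-clock synthesis time divided by the duration of audio produced; RTF above 1 means synthesis falls behind playback, which matters most for the final streaming chunk. A minimal measurement sketch, with every name below illustrative rather than taken from the repo:

import time

def measure_rtf(synthesize, text, sample_rate=22050):
    # `synthesize` is a hypothetical callable returning a 1-D waveform;
    # 22050 Hz is only an example output rate.
    start = time.time()
    wav = synthesize(text)
    elapsed = time.time() - start
    return elapsed / (len(wav) / sample_rate)  # RTF < 1: faster than real time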
.github/workflows/lint.yml (vendored, 1 addition)
@@ -2,6 +2,7 @@ name: Lint
 
 on:
   pull_request:
+  push:
 
 jobs:
   quick-checks:
cosyvoice/cli/cosyvoice.py
@@ -23,7 +23,7 @@ from cosyvoice.utils.file_utils import logging
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=True, load_onnx=True):
+    def __init__(self, model_dir, load_jit=True, load_onnx=False):
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
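The practical effect of the one-line default change above, sketched under the assumption that the rest of the constructor is unchanged (the model path is a placeholder):

from cosyvoice.cli.cosyvoice import CosyVoice

# New default: TorchScript (JIT) modules still load, the ONNX path does not.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')

# ONNX can still be opted into explicitly where its last-chunk RTF is acceptable:
cosyvoice_onnx = CosyVoice('pretrained_models/CosyVoice-300M', load_jit=True, load_onnx=True)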
cosyvoice/cli/model.py
@@ -43,7 +43,6 @@ class CosyVoiceModel:
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
         self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
-        self.flow_hift_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
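For reference, the side-stream pattern that survives for the LLM, and that this hunk deletes for flow/hift, looks like the following; a minimal sketch reusing the diff's own expression:

from contextlib import nullcontext
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Side CUDA stream on GPU, no-op context on CPU, as in self.llm_context:
llm_context = torch.cuda.stream(torch.cuda.Stream(device)) if torch.cuda.is_available() else nullcontext()

with llm_context:
    pass  # on GPU, kernels launched here are queued on the side stream

Dropping the dedicated flow/hift stream moves that work onto the default stream, trading concurrency with the LLM for the steadier last-chunk timing the commit message cites.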
@@ -93,7 +92,6 @@ class CosyVoiceModel:
         self.llm_end_dict[uuid] = True
 
     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False):
-        with self.flow_hift_context:
         tts_mel = self.flow.inference(token=token.to(self.device),
                                       token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                       prompt_token=prompt_token.to(self.device),
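token2wav, now without its stream wrapper, still drives a two-stage pipeline: flow turns speech tokens into a mel spectrogram, and the HiFT vocoder turns the mel into a waveform. A stand-in sketch of that shape; only flow.inference's keyword names come from the diff, everything else is assumed:

import torch

class FakeFlow:
    # Stand-in: the real flow.inference takes further prompt/embedding arguments.
    def inference(self, token, token_len, prompt_token):
        return torch.zeros(1, 80, int(token_len) * 2)  # fake 80-bin mel

class FakeHift:
    # Stand-in vocoder: maps mel frames to waveform samples.
    def inference(self, mel):
        return torch.zeros(1, mel.shape[2] * 256)

flow, hift = FakeFlow(), FakeHift()
token = torch.zeros(1, 60, dtype=torch.int32)
mel = flow.inference(token=token,
                     token_len=torch.tensor([token.shape[1]], dtype=torch.int32),
                     prompt_token=torch.zeros(1, 0, dtype=torch.int32))
wav = hift.inference(mel=mel)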
@@ -139,7 +137,6 @@ class CosyVoiceModel:
             time.sleep(0.1)
             if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
                 this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len], dim=1)
-                with self.flow_hift_context:
                 this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                  prompt_token=flow_prompt_speech_token,
                                                  prompt_feat=prompt_speech_feat,
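The gate in this hunk starts a chunk only once token_hop_len + token_overlap_len tokens have buffered, keeping the overlap for smoothing chunk joins. A hedged sketch of that rule, with illustrative sizes:

def take_chunk(buffered, token_hop_len=50, token_overlap_len=10):
    # Sizes are illustrative; the real hop/overlap come from the model config.
    need = token_hop_len + token_overlap_len
    if len(buffered) < need:
        return None  # not enough tokens yet; the caller keeps polling (time.sleep(0.1))
    return buffered[:need]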
@@ -156,7 +153,6 @@ class CosyVoiceModel:
         p.join()
         # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
         this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
-        with self.flow_hift_context:
         this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                          prompt_token=flow_prompt_speech_token,
                                          prompt_feat=prompt_speech_feat,
@@ -168,7 +164,6 @@ class CosyVoiceModel:
         # deal with all tokens
         p.join()
         this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
-        with self.flow_hift_context:
         this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                          prompt_token=flow_prompt_speech_token,
                                          prompt_feat=prompt_speech_feat,
@@ -181,5 +176,3 @@ class CosyVoiceModel:
         self.llm_end_dict.pop(this_uuid)
         self.mel_overlap_dict.pop(this_uuid)
         self.hift_cache_dict.pop(this_uuid)
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
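With flow and hift back on the default stream, the trailing synchronize guard deleted here is arguably redundant for callers that copy results to host: a blocking device-to-host copy already waits for the queued kernels. A minimal illustration:

import torch

if torch.cuda.is_available():
    y = torch.randn(1, 16000, device='cuda') * 0.5  # queued on the default stream
    wav = y.cpu()  # synchronous copy: blocks until y has been computed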