Mirror of https://github.com/FunAudioLLM/CosyVoice.git, synced 2026-02-04 17:39:25 +08:00
Merge pull request #1331 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
@@ -26,7 +26,7 @@ from cosyvoice.utils.class_utils import get_model_type
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -48,7 +48,7 @@ class CosyVoice:
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
-        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
+        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, trt_concurrent)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
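The trt_concurrent argument introduced here defaults to 1 and is passed straight through to CosyVoiceModel; judging by the trt_context_dict usage later in the diff, it controls how many TensorRT execution contexts the flow estimator keeps for concurrent requests. A hypothetical usage sketch (the model path is a placeholder):

    # Hypothetical usage; 'pretrained_models/CosyVoice-300M' is a placeholder path.
    from cosyvoice.cli.cosyvoice import CosyVoice

    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M',
                          load_jit=False, load_trt=True, fp16=True,
                          trt_concurrent=4)  # keep 4 TRT contexts for parallel calls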
@@ -258,9 +258,6 @@ class CosyVoice2Model(CosyVoiceModel):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
-        # NOTE default setting for jit/onnx export, you can set to False when using pytorch inference
-        self.flow.encoder.streaming = True
-        self.flow.decoder.estimator.streaming = True
         self.hift = hift
         self.fp16 = fp16
         self.trt_concurrent = trt_concurrent
@@ -290,7 +287,7 @@ class CosyVoice2Model(CosyVoiceModel):
             flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
             self.flow.encoder = flow_encoder
 
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, finalize=False, speed=1.0):
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16), self.trt_context_dict[uuid]:
             tts_mel, _ = self.flow.inference(token=token.to(self.device),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
@@ -299,6 +296,7 @@ class CosyVoice2Model(CosyVoiceModel):
                                              prompt_feat=prompt_feat.to(self.device),
                                              prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
                                              embedding=embedding.to(self.device),
+                                             streaming=stream,
                                              finalize=finalize)
             tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
             # append hift cache
@@ -356,6 +354,7 @@ class CosyVoice2Model(CosyVoiceModel):
                                                  embedding=flow_embedding,
                                                  token_offset=token_offset,
                                                  uuid=this_uuid,
+                                                 stream=stream,
                                                  finalize=False)
                token_offset += this_token_hop_len
                yield {'tts_speech': this_tts_speech.cpu()}
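Together with the token2wav hunks above, this threads a stream flag from the chunk-synthesis loop into flow.inference as streaming=. A minimal standalone sketch of the flag-threading pattern (simplified toy classes, not the real CosyVoice ones):

    class Flow:
        def inference(self, token, streaming=False, finalize=False):
            # a real flow would switch to chunked attention/caching here
            return ('chunked' if streaming else 'offline', finalize)

    class Model:
        def __init__(self):
            self.flow = Flow()

        def token2wav(self, token, stream=False, finalize=False):
            # `stream` arrives from the caller and is forwarded as `streaming=`
            return self.flow.inference(token, streaming=stream, finalize=finalize)

    m = Model()
    print(m.token2wav([1, 2, 3], stream=True))    # ('chunked', False)
    print(m.token2wav([1, 2, 3], finalize=True))  # ('offline', True)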
@@ -419,10 +419,6 @@ class CausalConditionalDecoder(ConditionalDecoder):
         Returns:
             _type_: _description_
         """
-        if hasattr(self, 'streaming'):
-            assert self.training is False, 'you have self.streaming attr, make sure that you are running inference mode'
-            streaming = self.streaming
-
         t = self.time_embeddings(t).to(t.dtype)
         t = self.time_mlp(t)
 
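The deleted lines were the counterpart of the constructor cleanup above: the mode used to be stashed on the module as self.streaming before jit/onnx export and read back at forward time; it is now an explicit per-call argument. A toy contrast sketch (illustration only):

    # Old pattern (sketch): module attribute set once, overriding the argument.
    class DecoderOld:
        def forward(self, x, streaming=False):
            if hasattr(self, 'streaming'):
                streaming = self.streaming  # baked in for jit/onnx export
            return 'streaming' if streaming else 'offline'

    # New pattern (sketch): the caller decides on every call.
    class DecoderNew:
        def forward(self, x, streaming=False):
            return 'streaming' if streaming else 'offline'

    old = DecoderOld()
    old.streaming = True
    print(old.forward(0))                           # streaming (attribute wins)
    print(DecoderNew().forward(0, streaming=True))  # streaming (explicit)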
@@ -241,6 +241,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                   prompt_feat,
                   prompt_feat_len,
                   embedding,
+                  streaming,
                   finalize):
         assert token.shape[0] == 1
         # xvec projection
@@ -254,10 +255,10 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
 
         # text encode
         if finalize is True:
-            h, h_lengths = self.encoder(token, token_len)
+            h, h_lengths = self.encoder(token, token_len, streaming=streaming)
         else:
             token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
-            h, h_lengths = self.encoder(token, token_len, context=context)
+            h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
         mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
         h = self.encoder_proj(h)
 
@@ -273,6 +274,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                                spks=embedding,
                                cond=conds,
                                n_timesteps=10,
+                               streaming=streaming
                                )
         feat = feat[:, :, mel_len1:]
         assert feat.shape[2] == mel_len2
@@ -69,7 +69,7 @@ class ConditionalCFM(BASECFM):
             t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
         return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), cache
 
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
+    def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
         """
         Fixed euler solver for ODEs.
         Args:
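For reference, the fixed-step Euler solve over the cosine-warped t_span shown above can be sketched standalone as follows (toy vector field standing in for the real estimator):

    import torch

    def solve_euler(x, t_span, vector_field):
        # fixed-step Euler: x_{k+1} = x_k + (t_{k+1} - t_k) * v(x_k, t_k)
        t = t_span[0]
        for k in range(1, len(t_span)):
            dt = t_span[k] - t
            x = x + dt * vector_field(x, t)
            t = t_span[k]
        return x.float()

    n_timesteps = 10
    t_span = torch.linspace(0, 1, n_timesteps + 1)
    t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)  # cosine schedule, as in the diff
    z = torch.randn(1, 80, 4)                        # toy mel-shaped noise
    mel = solve_euler(z, t_span, lambda x, t: -x)    # toy vector field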
@@ -110,7 +110,8 @@ class ConditionalCFM(BASECFM):
                 x_in, mask_in,
                 mu_in, t_in,
                 spks_in,
-                cond_in
+                cond_in,
+                streaming
             )
             dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
             dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
@@ -122,9 +123,9 @@ class ConditionalCFM(BASECFM):
 
         return sol[-1].float()
 
-    def forward_estimator(self, x, mask, mu, t, spks, cond):
+    def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False):
         if isinstance(self.estimator, torch.nn.Module):
-            return self.estimator(x, mask, mu, t, spks, cond)
+            return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming)
         else:
             estimator, trt_engine = self.estimator.acquire_estimator()
             estimator.set_input_shape('x', (2, 80, x.size(2)))
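Only the torch.nn.Module branch forwards streaming=; the TensorRT branch drives a pre-built engine, so the mode has to be baked in at export time rather than passed per call. A rough dispatch sketch (placeholder estimator, TRT execution elided):

    import torch

    class TinyEstimator(torch.nn.Module):
        def forward(self, x, streaming=False):
            # a real streaming estimator would apply causal/chunked masks here
            return x * (0.5 if streaming else 1.0)

    def forward_estimator(estimator, x, streaming=False):
        if isinstance(estimator, torch.nn.Module):
            # PyTorch path: streaming is just another forward() kwarg
            return estimator(x, streaming=streaming)
        # TensorRT path (elided): fixed engine, explicit input-shape setup instead
        raise NotImplementedError

    print(forward_estimator(TinyEstimator(), torch.ones(2), streaming=True))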
@@ -196,7 +197,7 @@ class CausalConditionalCFM(ConditionalCFM):
         self.rand_noise = torch.randn([1, 80, 50 * 300])
 
     @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming=False):
         """Forward diffusion
 
         Args:
@@ -220,4 +221,4 @@ class CausalConditionalCFM(ConditionalCFM):
         t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
         if self.t_scheduler == 'cosine':
             t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
+        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming), None
@@ -272,9 +272,6 @@ class UpsampleConformerEncoder(torch.nn.Module):
             checkpointing API because `__call__` attaches all the hooks of the module.
             https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
         """
-        if hasattr(self, 'streaming'):
-            assert self.training is False, 'you have self.streaming attr, make sure that you are running inference mode'
-            streaming = self.streaming
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
         if self.global_cmvn is not None:
@@ -158,6 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+    token_mel_ratio: 2
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
     sample_rate: !ref <sample_rate>
     hop_size: 480
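The added token_mel_ratio: 2 matches the token_offset * self.flow.token_mel_ratio slicing in token2wav above: each speech token corresponds to two mel frames. A quick arithmetic check (illustrative numbers):

    token_mel_ratio = 2                # mel frames per speech token, as in the yaml
    token_offset = 100                 # tokens already synthesized in earlier chunks
    mel_offset = token_offset * token_mel_ratio
    print(mel_offset)                  # 200 -> tts_mel[:, :, 200:] keeps only new frames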
examples/libritts/cosyvoice2/path.sh (now a symbolic link)
@@ -1,3 +0,0 @@
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
@@ -0,0 +1 @@
+../cosyvoice/path.sh
examples/libritts/cosyvoice2/tts_text.json (now a symbolic link)
@@ -1,5 +0,0 @@
-{
-    "1089_134686_000002_000000": [
-        "hello, my name is Jack. What is your name?"
-    ]
-}
@@ -0,0 +1 @@
+../cosyvoice/tts_text.json
examples/magicdata-read/cosyvoice/conf (now a symbolic link)
@@ -0,0 +1 @@
+../../libritts/cosyvoice/conf
@@ -1,203 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.TransformerLM
-    text_encoder_input_size: !ref <text_encoder_input_size>
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
-    speech_token_size: 4096
-    length_normalized_loss: True
-    lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        input_size: !ref <text_encoder_input_size>
-        output_size: 1024
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        use_cnn_module: False
-        macaron_style: False
-        use_dynamic_chunk: False
-        use_dynamic_left_chunk: False
-        static_chunk_size: 1
-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
-        input_size: !ref <llm_input_size>
-        output_size: !ref <llm_output_size>
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 7
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: 'linear_legacy'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        static_chunk_size: 1
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-
-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 4096
-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
-    only_mask_loss: True
-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        output_size: 512
-        attention_heads: 4
-        linear_units: 1024
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256, 256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 8
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500 # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 12000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler: warmuplr
-    scheduler_conf:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
@@ -1,203 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.TransformerLM
-    text_encoder_input_size: !ref <text_encoder_input_size>
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
-    speech_token_size: 4096
-    length_normalized_loss: True
-    lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        input_size: !ref <text_encoder_input_size>
-        output_size: 1024
-        attention_heads: 16
-        linear_units: 4096
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        use_cnn_module: False
-        macaron_style: False
-        use_dynamic_chunk: False
-        use_dynamic_left_chunk: False
-        static_chunk_size: 1
-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
-        input_size: !ref <llm_input_size>
-        output_size: !ref <llm_output_size>
-        attention_heads: 16
-        linear_units: 4096
-        num_blocks: 14
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: 'linear_legacy'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        static_chunk_size: 1
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-
-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 4096
-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
-    only_mask_loss: True
-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        output_size: 512
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256, 256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 12
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500 # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 2000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.001 # change to 1e-5 during sft
-    scheduler: warmuplr # change to constantlr during sft
-    scheduler_conf:
-        warmup_steps: 2500
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
@@ -1,42 +0,0 @@
-{
-    "train_micro_batch_size_per_gpu": 1,
-    "gradient_accumulation_steps": 1,
-    "steps_per_print": 100,
-    "gradient_clipping": 5,
-    "fp16": {
-        "enabled": false,
-        "auto_cast": false,
-        "loss_scale": 0,
-        "initial_scale_power": 16,
-        "loss_scale_window": 256,
-        "hysteresis": 2,
-        "consecutive_hysteresis": false,
-        "min_loss_scale": 1
-    },
-    "bf16": {
-        "enabled": false
-    },
-    "zero_force_ds_cpu_optimizer": false,
-    "zero_optimization": {
-        "stage": 2,
-        "offload_optimizer": {
-            "device": "none",
-            "pin_memory": true
-        },
-        "allgather_partitions": true,
-        "allgather_bucket_size": 5e8,
-        "overlap_comm": false,
-        "reduce_scatter": true,
-        "reduce_bucket_size": 5e8,
-        "contiguous_gradients" : true
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": 0.001,
-            "weight_decay": 0.0001,
-            "torch_adam": true,
-            "adam_w_mode": true
-        }
-    }
-}
examples/magicdata-read/cosyvoice/path.sh (now a symbolic link)
@@ -1,3 +0,0 @@
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
@@ -0,0 +1 @@
+../../libritts/cosyvoice/path.sh
@@ -83,7 +83,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   fi
   cp data/train/parquet/data.list data/train.data.list
   cp data/dev/parquet/data.list data/dev.data.list
-  for model in llm flow; do
+  for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
      cosyvoice/bin/train.py \
@@ -99,11 +99,26 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --num_workers ${num_workers} \
       --prefetch ${prefetch} \
       --pin_memory \
+      --use_amp \
       --deepspeed_config ./conf/ds_stage2.json \
       --deepspeed.save_states model+optimizer
   done
 fi
 
+# average model
+average_num=5
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  for model in llm flow hifigan; do
+    decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
+    echo "do model average and final checkpoint is $decode_checkpoint"
+    python cosyvoice/bin/average_model.py \
+      --dst_model $decode_checkpoint \
+      --src_path `pwd`/exp/cosyvoice/$model/$train_engine \
+      --num ${average_num} \
+      --val_best
+  done
+fi
+
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
   echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
   python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
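The new stage runs cosyvoice/bin/average_model.py over saved checkpoints before export. Checkpoint averaging takes the elementwise mean of every parameter tensor across the selected snapshots (--val_best picks them by validation loss); a minimal standalone sketch, not the actual average_model.py implementation:

    import torch

    def average_checkpoints(paths):
        # accumulate elementwise sums of every tensor, then divide by the count
        avg = None
        for p in paths:
            state = torch.load(p, map_location='cpu')
            if avg is None:
                avg = {k: v.clone().float() for k, v in state.items()}
            else:
                for k in avg:
                    avg[k] += state[k].float()
        return {k: v / len(paths) for k, v in avg.items()}

    # usage sketch: torch.save(average_checkpoints(ckpt_paths), 'llm.pt')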
@@ -34,10 +34,10 @@ logging.basicConfig(level=logging.DEBUG,
 class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
     def __init__(self, args):
         try:
-            self.cosyvoice = CosyVoice(args.model_dir)
+            self.cosyvoice = CosyVoice(args.model_dir, trt_concurrent=args.max_conc)
         except Exception:
             try:
-                self.cosyvoice = CosyVoice2(args.model_dir)
+                self.cosyvoice = CosyVoice2(args.model_dir, trt_concurrent=args.max_conc)
             except Exception:
                 raise TypeError('no valid model_type!')
         logging.info('grpc service initialized')
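Both constructors now receive trt_concurrent=args.max_conc, so the server's concurrency cap and the TensorRT context pool stay in sync. A simplified wiring sketch under that assumption (argument parsing reduced to the two relevant flags, default paths are placeholders):

    import argparse
    from cosyvoice.cli.cosyvoice import CosyVoice

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', default='pretrained_models/CosyVoice-300M')  # placeholder
    parser.add_argument('--max_conc', type=int, default=4)
    args = parser.parse_args()

    # one TRT context per allowed concurrent request
    cosyvoice = CosyVoice(args.model_dir, trt_concurrent=args.max_conc)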