Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 09:59:23 +08:00)
Merge branch 'main' into inference_streaming
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import random
 from typing import Dict, Optional
 import torch
 import torch.nn as nn
@@ -77,6 +78,11 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         # get conditions
         conds = torch.zeros(feat.shape, device=token.device)
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
         conds = conds.transpose(1, 2)
 
         mask = (~make_pad_mask(feat_len)).to(h)
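The lines added in this hunk implement training-time prompt conditioning: with probability 0.5 an utterance keeps an all-zero condition, otherwise a random prefix of up to 30% of its valid mel frames is copied in as a prompt. Below is a minimal, self-contained sketch of that logic; the function name and the (B, T, n_mels) shape are assumptions for illustration, and the repository version allocates the zeros tensor on token.device instead of using zeros_like.

# Sketch of the prompt-conditioning logic added above (names/shapes assumed, not repo code).
import random

import torch


def build_prefix_conditions(feat: torch.Tensor, feat_len: torch.Tensor) -> torch.Tensor:
    # feat: (B, T, n_mels) target mel features; feat_len: (B,) valid frames per utterance.
    conds = torch.zeros_like(feat)                 # zero condition = train unconditionally
    for i, j in enumerate(feat_len):
        if random.random() < 0.5:                  # half the batch keeps the zero condition
            continue
        index = random.randint(0, int(0.3 * j))    # prompt length in [0, 30% of valid frames]
        conds[i, :index] = feat[i, :index]         # expose that prefix as the prompt
    return conds.transpose(1, 2)                   # (B, n_mels, T), matching the transpose above


feat = torch.randn(2, 100, 80)
feat_len = torch.tensor([100, 73])
print(build_prefix_conditions(feat, feat_len).shape)   # torch.Size([2, 80, 100])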
@@ -299,7 +299,7 @@ class BaseEncoder(torch.nn.Module):
                rate.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling, we need to rewrite it to make it work
-              with cache, which is not prefered.
+              with cache, which is not preferred.
         Args:
             xs (torch.Tensor): (1, max_len, dim)
             chunk_size (int): decoding chunk size
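The caveat in the docstring context above (convolution layers stacked in nn.Sequential cannot yet carry a cache) is the usual streaming problem: a convolution needs left context from the previous chunk to reproduce the offline result. The toy below is an illustration only, not CosyVoice/WeNet code; the Conv1d, chunk size, and cache handling are assumptions chosen to show the idea.

# Toy illustration of caching left context for chunk-by-chunk decoding (assumed, not repo code).
import torch
import torch.nn as nn

torch.manual_seed(0)
conv = nn.Conv1d(4, 4, kernel_size=3)            # no padding: needs 2 frames of left context

xs = torch.randn(1, 4, 12)                       # full 12-frame sequence
offline = conv(xs)                               # non-streaming reference, 10 output frames

outputs, cache = [], torch.zeros(1, 4, 2)        # cache carries the previous chunk's tail
for start in range(0, xs.size(2), 4):            # decode in 4-frame chunks
    chunk = xs[:, :, start:start + 4]
    outputs.append(conv(torch.cat([cache, chunk], dim=2)))
    cache = chunk[:, :, -2:]                     # keep the last 2 frames for the next chunk

streaming = torch.cat(outputs, dim=2)
# After discarding the 2 warm-up frames produced from the zero-initialized cache,
# chunked decoding reproduces the offline result exactly.
print(torch.allclose(offline, streaming[:, :, 2:]))   # True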