Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 09:59:23 +08:00)
Merge branch 'main' into inference_streaming
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import random
 from typing import Dict, Optional
 import torch
 import torch.nn as nn
@@ -77,6 +78,11 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         # get conditions
         conds = torch.zeros(feat.shape, device=token.device)
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
         conds = conds.transpose(1, 2)
 
         mask = (~make_pad_mask(feat_len)).to(h)
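The lines added in this hunk implement training-time prompt conditioning: with probability 0.5 an utterance keeps an all-zero condition, otherwise a random prefix of up to 30% of its valid mel frames is copied in as a prompt. Below is a minimal, self-contained sketch of that logic; the function name and the (B, T, n_mels) shape are assumptions for illustration, and the repository version allocates the zeros tensor on token.device instead of using zeros_like.

# Sketch of the prompt-conditioning logic added above (names/shapes assumed, not repo code).
import random

import torch


def build_prefix_conditions(feat: torch.Tensor, feat_len: torch.Tensor) -> torch.Tensor:
    # feat: (B, T, n_mels) target mel features; feat_len: (B,) valid frames per utterance.
    conds = torch.zeros_like(feat)                 # zero condition = train unconditionally
    for i, j in enumerate(feat_len):
        if random.random() < 0.5:                  # half the batch keeps the zero condition
            continue
        index = random.randint(0, int(0.3 * j))    # prompt length in [0, 30% of valid frames]
        conds[i, :index] = feat[i, :index]         # expose that prefix as the prompt
    return conds.transpose(1, 2)                   # (B, n_mels, T), matching the transpose above


feat = torch.randn(2, 100, 80)
feat_len = torch.tensor([100, 73])
print(build_prefix_conditions(feat, feat_len).shape)   # torch.Size([2, 80, 100])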
@@ -299,7 +299,7 @@ class BaseEncoder(torch.nn.Module):
                rate.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling, we need to rewrite it to make it work
-              with cache, which is not prefered.
+              with cache, which is not preferred.
         Args:
             xs (torch.Tensor): (1, max_len, dim)
             chunk_size (int): decoding chunk size
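The caveat in the docstring context above (convolution layers stacked in nn.Sequential cannot yet carry a cache) is the usual streaming problem: a convolution needs left context from the previous chunk to reproduce the offline result. The toy below is an illustration only, not CosyVoice/WeNet code; the Conv1d, chunk size, and cache handling are assumptions chosen to show the idea.

# Toy illustration of caching left context for chunk-by-chunk decoding (assumed, not repo code).
import torch
import torch.nn as nn

torch.manual_seed(0)
conv = nn.Conv1d(4, 4, kernel_size=3)            # no padding: needs 2 frames of left context

xs = torch.randn(1, 4, 12)                       # full 12-frame sequence
offline = conv(xs)                               # non-streaming reference, 10 output frames

outputs, cache = [], torch.zeros(1, 4, 2)        # cache carries the previous chunk's tail
for start in range(0, xs.size(2), 4):            # decode in 4-frame chunks
    chunk = xs[:, :, start:start + 4]
    outputs.append(conv(torch.cat([cache, chunk], dim=2)))
    cache = chunk[:, :, -2:]                     # keep the last 2 frames for the next chunk

streaming = torch.cat(outputs, dim=2)
# After discarding the 2 warm-up frames produced from the zero-initialized cache,
# chunked decoding reproduces the offline result exactly.
print(torch.allclose(offline, streaming[:, :, 2:]))   # True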