Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 18:09:24 +08:00)

Commit: add cosyvoice code
0
cosyvoice/transformer/__init__.py
Normal file
84
cosyvoice/transformer/activation.py
Normal file
@@ -0,0 +1,84 @@
# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
#               2020 Northwestern Polytechnical University (Pengcheng Guo)
#               2020 Mobvoi Inc (Binbin Zhang)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Swish() activation function for Conformer."""

import torch
from torch import nn, sin, pow
from torch.nn import Parameter


class Swish(torch.nn.Module):
    """Construct a Swish object."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Swish activation function."""
        return x * torch.sigmoid(x)


# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
# LICENSE is in incl_licenses directory.
class Snake(nn.Module):
    '''
    Implementation of a sine-based periodic activation function
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: shape of the input
            - alpha: trainable parameter
            alpha is initialized to 1 by default, higher values = higher-frequency.
            alpha will be trained along with the rest of your model.
        '''
        super(Snake, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2(x * a)
        '''
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)

        return x
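A minimal usage sketch for the activations above (not part of the committed file; shapes and values are illustrative). Both modules act elementwise; Snake expects channel-first input (B, C, T) because alpha is broadcast over the channel dimension.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from cosyvoice.transformer.activation import Swish, Snake

    x = torch.randn(2, 80, 100)                     # (B, C, T)
    y_swish = Swish()(x)                            # elementwise x * sigmoid(x), same shape
    snake = Snake(in_features=80, alpha_logscale=True)
    y_snake = snake(x)                              # x + (1/alpha) * sin^2(alpha * x), same shape
    assert y_snake.shape == x.shape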
326
cosyvoice/transformer/attention.py
Normal file
@@ -0,0 +1,326 @@
# Copyright (c) 2019 Shigeki Karita
#               2020 Mobvoi Inc (Binbin Zhang)
#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""

import math
from typing import Tuple

import torch
from torch import nn


class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct a MultiHeadedAttention object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).

        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        """
        n_batch = value.size(0)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
        #      1st chunk to ease the onnx export.]
        #   2. pytorch training
        if mask.size(2) > 0:  # time2 > 0
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For last chunk, time2 might be larger than scores.size(-1)
            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0)  # (batch, head, time1, time2)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
        #   1. onnx(16/-1, -1/-1, 16/0)
        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
                                                 self.h * self.d_k)
             )  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
                1. When applying cross attention between decoder and encoder,
                   the batch padding mask for input is in (#batch, 1, T) shape.
                2. When applying self attention of encoder,
                   the mask is in (#batch, T, T) shape.
                3. When applying self attention of decoder,
                   the mask is in (#batch, L, L) shape.
                4. If a different position in the decoder sees a different block
                   of the encoder, such as Mocha, the passed-in mask could be
                   in (#batch, L, T) shape. But there is no such case in current
                   WeNet.
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`

        """
        q, k, v = self.forward_qkv(query, key, value)

        # NOTE(xcsong):
        #   when exporting an onnx model, for the 1st chunk, we feed
        #     cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #     or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #     In all modes, `if cache.size(0) > 0` will always be `True`
        #     and we will always do splitting and
        #     concatenation (this will simplify onnx export). Note that
        #     it's OK to concat & split zero-shaped tensors (see code below).
        #   when exporting a jit model, for the 1st chunk, we always feed
        #     cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache


class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding.
    Paper: https://arxiv.org/abs/1901.02860
    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct a RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate, key_bias)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # these two learnable biases are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of the query vector.

        Returns:
            torch.Tensor: Output tensor.

        """
        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)[
            :, :, :, : x.size(-1) // 2 + 1
        ]  # only keep the positions from 0 to time2
        return x

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, time2, size).
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        # NOTE(xcsong):
        #   when exporting an onnx model, for the 1st chunk, we feed
        #     cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #     or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #     In all modes, `if cache.size(0) > 0` will always be `True`
        #     and we will always do splitting and
        #     concatenation (this will simplify onnx export). Note that
        #     it's OK to concat & split zero-shaped tensors (see code below).
        #   when exporting a jit model, for the 1st chunk, we always feed
        #     cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
        if matrix_ac.shape != matrix_bd.shape:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
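A minimal usage sketch for MultiHeadedAttention (not part of the committed file; shapes are illustrative). It shows the boolean mask convention and the returned KV cache, whose last dimension stacks keys and values.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from cosyvoice.transformer.attention import MultiHeadedAttention

    mha = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
    x = torch.randn(1, 10, 256)                        # (B, T, n_feat)
    mask = torch.ones(1, 10, 10, dtype=torch.bool)     # full self-attention mask
    out, new_cache = mha(x, x, x, mask=mask)           # out: (1, 10, 256)
    # new_cache concatenates K and V along the last dim: (1, n_head, T, 2 * d_k)
    assert new_cache.shape == (1, 4, 10, 2 * (256 // 4))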
145
cosyvoice/transformer/convolution.py
Normal file
@@ -0,0 +1,145 @@
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""

from typing import Tuple

import torch
from torch import nn


class ConvolutionModule(nn.Module):
    """ConvolutionModule in Conformer model."""

    def __init__(self,
                 channels: int,
                 kernel_size: int = 15,
                 activation: nn.Module = nn.ReLU(),
                 norm: str = "batch_norm",
                 causal: bool = False,
                 bias: bool = True):
        """Construct a ConvolutionModule object.
        Args:
            channels (int): The number of channels of conv layers.
            kernel_size (int): Kernel size of conv layers.
            causal (bool): Whether to use causal convolution or not.
        """
        super().__init__()

        self.pointwise_conv1 = nn.Conv1d(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # self.lorder is used to distinguish if it's a causal convolution,
        # if self.lorder > 0: it's a causal convolution, the input will be
        #    padded with self.lorder frames on the left in forward.
        # else: it's a symmetrical convolution
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for non-causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias=bias,
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1d(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation

    def forward(
        self,
        x: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        cache: torch.Tensor = torch.zeros((0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute convolution module.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).
            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
                (0, 0, 0) means fake mask.
            cache (torch.Tensor): left context cache, it is only
                used in causal convolution (#batch, channels, cache_t),
                (0, 0, 0) means fake cache.
        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose(1, 2)  # (#batch, channels, time)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        if self.lorder > 0:
            if cache.size(2) == 0:  # cache_t == 0
                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
            else:
                assert cache.size(0) == x.size(0)  # equal batch
                assert cache.size(1) == x.size(1)  # equal channel
                x = torch.cat((cache, x), dim=2)
            assert (x.size(2) > self.lorder)
            new_cache = x[:, :, -self.lorder:]
        else:
            # It's better we just return None if no cache is required,
            # However, for JIT export, here we just fake one tensor instead of
            # None.
            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.pointwise_conv2(x)
        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        return x.transpose(1, 2), new_cache
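A minimal usage sketch for ConvolutionModule in its causal configuration (not part of the committed file; shapes and hyperparameters are illustrative). With causal=True, the module pads kernel_size - 1 frames on the left and returns those frames as the left-context cache for the next chunk.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from torch import nn
    from cosyvoice.transformer.convolution import ConvolutionModule

    conv = ConvolutionModule(channels=256, kernel_size=15, activation=nn.SiLU(),
                             norm='layer_norm', causal=True)
    x = torch.randn(2, 50, 256)                          # (B, T, C)
    mask_pad = torch.ones(2, 1, 50, dtype=torch.bool)    # no padded frames
    y, new_cache = conv(x, mask_pad)                     # y: (2, 50, 256)
    # causal=True keeps the last kernel_size - 1 frames as left-context cache
    assert new_cache.shape == (2, 256, 14)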
396
cosyvoice/transformer/decoder.py
Normal file
@@ -0,0 +1,396 @@
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Decoder definition."""
from typing import Tuple, List, Optional

import torch
import torch.utils.checkpoint as ckpt
import logging

from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)


class TransformerDecoder(torch.nn.Module):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        src_attention: if false, encoder-decoder cross attention is not
            applied, such as CIF model
        key_bias: whether use bias in attention.linear_k, False for whisper models.
        gradient_checkpointing: rerunning a forward-pass segment for each
            checkpointed segment during backward.
        tie_word_embedding: Tie or clone module weights depending on whether we are
            using TorchScript or not
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        src_attention: bool = True,
        key_bias: bool = True,
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        attention_dim = encoder_output_size
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        self.embed = torch.nn.Sequential(
            torch.nn.Identity() if input_layer == "no_pos" else
            torch.nn.Embedding(vocab_size, attention_dim),
            COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
                                               positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
        self.use_output_layer = use_output_layer
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = torch.nn.Identity()
        self.num_blocks = num_blocks
        self.decoders = torch.nn.ModuleList([
            DecoderLayer(
                attention_dim,
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    self_attention_dropout_rate, key_bias),
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim, src_attention_dropout_rate,
                    key_bias) if src_attention else None,
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate, activation),
                dropout_rate,
                normalize_before,
            ) for _ in range(self.num_blocks)
        ])

        self.gradient_checkpointing = gradient_checkpointing
        self.tie_word_embedding = tie_word_embedding

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor = torch.empty(0),
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: not used in transformer decoder, in order to unify api
                with bidirectional decoder
            reverse_weight: not used in transformer decoder, in order to unify
                api with bidirectional decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                torch.tensor(0.0), in order to unify api with bidirectional decoder
                olens: (batch, )
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        tgt = ys_in_pad
        maxlen = tgt.size(1)
        # tgt_mask: (B, 1, L)
        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
        tgt_mask = tgt_mask.to(tgt.device)
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1),
                            device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m
        x, _ = self.embed(tgt)
        if self.gradient_checkpointing and self.training:
            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
                                                 memory_mask)
        else:
            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)
        olens = tgt_mask.sum(1)
        return x, torch.tensor(0.0), olens

    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
                       memory: torch.Tensor,
                       memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        return x

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                    tgt_mask: torch.Tensor,
                                    memory: torch.Tensor,
                                    memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
                layer.__call__, x, tgt_mask, memory, memory_mask)
        return x

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(x,
                                                       tgt_mask,
                                                       memory,
                                                       memory_mask,
                                                       cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = torch.log_softmax(self.output_layer(y), dim=-1)
        return y, new_cache

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending on whether we are using TorchScript or not"""
        if not self.use_output_layer:
            return
        if jit_mode:
            logging.info("clone emb.weight to output.weight")
            self.output_layer.weight = torch.nn.Parameter(
                self.embed[0].weight.clone())
        else:
            logging.info("tie emb.weight with output.weight")
            self.output_layer.weight = self.embed[0].weight

        if getattr(self.output_layer, "bias", None) is not None:
            self.output_layer.bias.data = torch.nn.functional.pad(
                self.output_layer.bias.data,
                (
                    0,
                    self.output_layer.weight.shape[0] -
                    self.output_layer.bias.shape[0],
                ),
                "constant",
                0,
            )


class BiTransformerDecoder(torch.nn.Module):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        r_num_blocks: the number of right to left decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        key_bias: whether use bias in attention.linear_k, False for whisper models.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        r_num_blocks: int = 0,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):

        super().__init__()
        self.tie_word_embedding = tie_word_embedding
        self.left_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

        self.right_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            r_num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor,
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
                used for right to left decoder
            reverse_weight: used for right to left decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                r_x: decoded token score (right to left decoder)
                    before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
                                          ys_in_lens)
        r_x = torch.tensor(0.0)
        if reverse_weight > 0.0:
            r_x, _, olens = self.right_decoder(memory, memory_mask,
                                               r_ys_in_pad, ys_in_lens)
        return l_x, r_x, olens

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
                                                  tgt_mask, cache)

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending on whether we are using TorchScript or not"""
        self.left_decoder.tie_or_clone_weights(jit_mode)
        self.right_decoder.tie_or_clone_weights(jit_mode)
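A minimal usage sketch for TransformerDecoder (not part of the committed file; shapes and hyperparameters are illustrative). It assumes the default registrations in cosyvoice.utils.class_utils (e.g. input_layer="embed", activation_type="relu"), which are imported by this file but not shown in this diff.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from cosyvoice.transformer.decoder import TransformerDecoder

    decoder = TransformerDecoder(vocab_size=1000, encoder_output_size=256,
                                 attention_heads=4, linear_units=1024, num_blocks=2)
    memory = torch.randn(2, 32, 256)                    # encoder output (B, T_in, D)
    memory_mask = torch.ones(2, 1, 32, dtype=torch.bool)
    ys_in_pad = torch.randint(0, 1000, (2, 8))          # padded target token ids
    ys_in_lens = torch.tensor([8, 6])
    logits, _, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
    # logits: (2, 8, 1000), token scores before softmax; olens is derived from the target mask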
132
cosyvoice/transformer/decoder_layer.py
Normal file
@@ -0,0 +1,132 @@
# Copyright (c) 2019 Shigeki Karita
#               2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple

import torch
from torch import nn


class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, inter-attention is not used, as in
            CIF, GPT, and other decoder-only models.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct a DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        x = residual + self.dropout(
            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0])
            if not self.normalize_before:
                x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
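A minimal sketch wiring a single DecoderLayer in pre-norm mode (not part of the committed file; shapes are illustrative). It assumes PositionwiseFeedForward can be built as PositionwiseFeedForward(size, hidden_units, dropout_rate), i.e. that its activation argument has a default; that module is imported by decoder.py but not shown in this diff.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from cosyvoice.transformer.attention import MultiHeadedAttention
    from cosyvoice.transformer.decoder_layer import DecoderLayer
    from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

    size = 256
    layer = DecoderLayer(
        size,
        self_attn=MultiHeadedAttention(4, size, 0.0),
        src_attn=MultiHeadedAttention(4, size, 0.0),
        feed_forward=PositionwiseFeedForward(size, 1024, 0.0),
        dropout_rate=0.0,
        normalize_before=True,
    )
    tgt = torch.randn(2, 8, size)                        # decoder input (B, L, size)
    tgt_mask = torch.ones(2, 8, 8, dtype=torch.bool)     # causal mask would go here
    memory = torch.randn(2, 32, size)                    # encoder output
    memory_mask = torch.ones(2, 1, 32, dtype=torch.bool)
    x, *_ = layer(tgt, tgt_mask, memory, memory_mask)    # x: (2, 8, size)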
293
cosyvoice/transformer/embedding.py
Normal file
@@ -0,0 +1,293 @@
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Positional Encoding Module."""

import math
from typing import Tuple, Union

import torch
import torch.nn.functional as F
import numpy as np


class PositionalEncoding(torch.nn.Module):
    """Positional encoding.

    :param int d_model: embedding dim
    :param float dropout_rate: dropout rate
    :param int max_len: maximum input length

    PE(pos, 2i)   = sin(pos / (10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos / (10000^(2i/dmodel)))
    """

    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int = 5000,
                 reverse: bool = False):
        """Construct a PositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.max_len = max_len

        self.pe = torch.zeros(self.max_len, self.d_model)
        position = torch.arange(0, self.max_len,
                                dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
            -(math.log(10000.0) / self.d_model))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, ...)
            offset (int, torch.tensor): position offset

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            torch.Tensor: for compatibility to RelPositionalEncoding
        """

        self.pe = self.pe.to(x.device)
        pos_emb = self.position_encoding(offset, x.size(1), False)
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int,
                          apply_dropout: bool = True) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a
        non-streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        # How to subscript a Union type:
        #   https://github.com/pytorch/pytorch/issues/69434
        if isinstance(offset, int):
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        else:  # for batched streaming decoding on GPU
            assert torch.max(offset) + size <= self.max_len
            index = offset.unsqueeze(1) + \
                torch.arange(0, size).to(offset.device)  # B X T
            flag = index > 0
            # remove negative offset
            index = index * flag
            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model

        if apply_dropout:
            pos_emb = self.dropout(pos_emb)
        return pos_emb


class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module.
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
        """Initialize class."""
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute positional encoding.
        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).
        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).
        """
        self.pe = self.pe.to(x.device)
        x = x * self.xscale
        pos_emb = self.position_encoding(offset, x.size(1), False)
        return self.dropout(x), self.dropout(pos_emb)


class WhisperPositionalEncoding(PositionalEncoding):
    """ Sinusoids position encoding used in openai-whisper.encoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
        super().__init__(d_model, dropout_rate, max_len)
        self.xscale = 1.0
        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
        inv_timescales = torch.exp(-log_timescale_increment *
                                   torch.arange(d_model // 2))
        scaled_time = torch.arange(max_len)[:, np.newaxis] * \
            inv_timescales[np.newaxis, :]
        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
        delattr(self, "pe")
        self.register_buffer("pe", pe.unsqueeze(0))


class LearnablePositionalEncoding(PositionalEncoding):
    """ Learnable position encoding used in openai-whisper.decoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
        super().__init__(d_model, dropout_rate, max_len)
        # NOTE(xcsong): overwrite self.pe & self.xscale
        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
        self.xscale = 1.0


class NoPositionalEncoding(torch.nn.Module):
    """ No position encoding
    """

    def __init__(self, d_model: int, dropout_rate: float):
        super().__init__()
        self.d_model = d_model
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """ Just return zero vector for interface compatibility
        """
        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
        return self.dropout(x), pos_emb

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return torch.zeros(1, size, self.d_model)


class EspnetRelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Construct a PositionalEncoding object."""
        super(EspnetRelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Reset the positional encodings."""
        if self.pe is not None:
            # self.pe contains both positive and negative parts
            # the length of self.pe is 2 * input_len - 1
            if self.pe.size(1) >= x.size(1) * 2 - 1:
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        # Suppose `i` means the position of the query vector and `j` means the
        # position of the key vector. We use positive relative positions when keys
        # are to the left (i>j) and negative relative positions otherwise (i<j).
        pe_positive = torch.zeros(x.size(1), self.d_model)
        pe_negative = torch.zeros(x.size(1), self.d_model)
        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)

        # Reverse the order of positive indices and concat both positive and
        # negative indices. This is used to support the shifting trick
        # as in https://arxiv.org/abs/1901.02860
        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        pe_negative = pe_negative[1:].unsqueeze(0)
        pe = torch.cat([pe_positive, pe_negative], dim=1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        """
        self.extend_pe(x)
        x = x * self.xscale
        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a
        non-streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        pos_emb = self.pe[
            :,
            self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
        ]
        return pos_emb
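A minimal usage sketch for EspnetRelPositionalEncoding (not part of the committed file; shapes are illustrative). The returned pos_emb spans relative offsets from -(T-1) to (T-1), which is what RelPositionMultiHeadedAttention consumes.

    # Hypothetical usage sketch, assuming the package is installed as `cosyvoice`.
    import torch
    from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding

    pos_enc = EspnetRelPositionalEncoding(d_model=256, dropout_rate=0.0)
    x = torch.randn(2, 50, 256)                  # (B, T, d_model)
    x_scaled, pos_emb = pos_enc(x)
    # x is scaled by sqrt(d_model); pos_emb covers relative offsets -(T-1)..(T-1)
    assert pos_emb.shape == (1, 2 * 50 - 1, 256)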
472
cosyvoice/transformer/encoder.py
Normal file
@@ -0,0 +1,472 @@
|
||||
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
|
||||
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
|
||||
# 2024 Alibaba Inc (Xiang Lyu)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from ESPnet(https://github.com/espnet/espnet)
|
||||
"""Encoder definition."""
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint as ckpt
|
||||
|
||||
from cosyvoice.transformer.convolution import ConvolutionModule
|
||||
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
|
||||
from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
|
||||
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
|
||||
from cosyvoice.utils.class_utils import (
|
||||
COSYVOICE_EMB_CLASSES,
|
||||
COSYVOICE_SUBSAMPLE_CLASSES,
|
||||
COSYVOICE_ATTENTION_CLASSES,
|
||||
COSYVOICE_ACTIVATION_CLASSES,
|
||||
)
|
||||
from cosyvoice.utils.mask import make_pad_mask
|
||||
from cosyvoice.utils.mask import add_optional_chunk_mask
|
||||
|
||||
|
||||
class BaseEncoder(torch.nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
output_size: int = 256,
|
||||
attention_heads: int = 4,
|
||||
linear_units: int = 2048,
|
||||
num_blocks: int = 6,
|
||||
dropout_rate: float = 0.1,
|
||||
positional_dropout_rate: float = 0.1,
|
||||
attention_dropout_rate: float = 0.0,
|
||||
input_layer: str = "conv2d",
|
||||
pos_enc_layer_type: str = "abs_pos",
|
||||
normalize_before: bool = True,
|
||||
static_chunk_size: int = 0,
|
||||
use_dynamic_chunk: bool = False,
|
||||
global_cmvn: torch.nn.Module = None,
|
||||
use_dynamic_left_chunk: bool = False,
|
||||
gradient_checkpointing: bool = False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
input_size (int): input dim
|
||||
output_size (int): dimension of attention
|
||||
attention_heads (int): the number of heads of multi head attention
|
||||
linear_units (int): the hidden units number of position-wise feed
|
||||
forward
|
||||
num_blocks (int): the number of decoder blocks
|
||||
dropout_rate (float): dropout rate
|
||||
attention_dropout_rate (float): dropout rate in attention
|
||||
positional_dropout_rate (float): dropout rate after adding
|
||||
positional encoding
|
||||
input_layer (str): input layer type.
|
||||
optional [linear, conv2d, conv2d6, conv2d8]
|
||||
pos_enc_layer_type (str): Encoder positional encoding layer type.
|
||||
optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
|
||||
normalize_before (bool):
|
||||
True: use layer_norm before each sub-block of a layer.
|
||||
False: use layer_norm after each sub-block of a layer.
|
||||
static_chunk_size (int): chunk size for static chunk training and
|
||||
decoding
|
||||
use_dynamic_chunk (bool): whether use dynamic chunk size for
|
||||
training or not. You can only use a fixed chunk (chunk_size > 0)
|
||||
or a dynamic chunk size (use_dynamic_chunk = True)
|
||||
global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
|
||||
use_dynamic_left_chunk (bool): whether use dynamic left chunk in
|
||||
dynamic chunk training
|
||||
key_bias: whether to use bias in attention.linear_k, False for whisper models.
|
||||
gradient_checkpointing: if True, re-run the forward pass of each
|
||||
checkpointed segment during the backward pass to save memory.
|
||||
"""
|
||||
super().__init__()
|
||||
self._output_size = output_size
|
||||
|
||||
self.global_cmvn = global_cmvn
|
||||
self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
|
||||
input_size,
|
||||
output_size,
|
||||
dropout_rate,
|
||||
COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
|
||||
positional_dropout_rate),
|
||||
)
|
||||
|
||||
self.normalize_before = normalize_before
|
||||
self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
|
||||
self.static_chunk_size = static_chunk_size
|
||||
self.use_dynamic_chunk = use_dynamic_chunk
|
||||
self.use_dynamic_left_chunk = use_dynamic_left_chunk
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
def output_size(self) -> int:
|
||||
return self._output_size
|
||||
|
||||
def forward(
|
||||
self,
|
||||
xs: torch.Tensor,
|
||||
xs_lens: torch.Tensor,
|
||||
decoding_chunk_size: int = 0,
|
||||
num_decoding_left_chunks: int = -1,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Embed positions in tensor.
|
||||
|
||||
Args:
|
||||
xs: padded input tensor (B, T, D)
|
||||
xs_lens: input length (B)
|
||||
decoding_chunk_size: decoding chunk size for dynamic chunk
|
||||
0: default for training, use random dynamic chunk.
|
||||
<0: for decoding, use full chunk.
|
||||
>0: for decoding, use fixed chunk size as set.
|
||||
num_decoding_left_chunks: number of left chunks, this is for decoding,
|
||||
the chunk size is decoding_chunk_size.
|
||||
>=0: use num_decoding_left_chunks
|
||||
<0: use all left chunks
|
||||
Returns:
|
||||
encoder output tensor xs, and subsampled masks
|
||||
xs: padded output tensor (B, T' ~= T/subsample_rate, D)
|
||||
masks: torch.Tensor batch padding mask after subsample
|
||||
(B, 1, T' ~= T/subsample_rate)
|
||||
NOTE(xcsong):
|
||||
We pass the `__call__` method of the modules instead of `forward` to the
|
||||
checkpointing API because `__call__` attaches all the hooks of the module.
|
||||
https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
|
||||
"""
|
||||
T = xs.size(1)
|
||||
masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
|
||||
if self.global_cmvn is not None:
|
||||
xs = self.global_cmvn(xs)
|
||||
xs, pos_emb, masks = self.embed(xs, masks)
|
||||
mask_pad = masks # (B, 1, T/subsample_rate)
|
||||
chunk_masks = add_optional_chunk_mask(xs, masks,
|
||||
self.use_dynamic_chunk,
|
||||
self.use_dynamic_left_chunk,
|
||||
decoding_chunk_size,
|
||||
self.static_chunk_size,
|
||||
num_decoding_left_chunks)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
|
||||
mask_pad)
|
||||
else:
|
||||
xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
|
||||
if self.normalize_before:
|
||||
xs = self.after_norm(xs)
|
||||
# Here we assume the mask is not changed in encoder layers, so just
|
||||
# return the masks before encoder layers, and the masks will be used
|
||||
# for cross attention with decoder later
|
||||
return xs, masks
|
||||
|
||||
def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
|
||||
pos_emb: torch.Tensor,
|
||||
mask_pad: torch.Tensor) -> torch.Tensor:
|
||||
for layer in self.encoders:
|
||||
xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
|
||||
return xs
|
||||
|
||||
@torch.jit.ignore(drop=True)
|
||||
def forward_layers_checkpointed(self, xs: torch.Tensor,
|
||||
chunk_masks: torch.Tensor,
|
||||
pos_emb: torch.Tensor,
|
||||
mask_pad: torch.Tensor) -> torch.Tensor:
|
||||
for layer in self.encoders:
|
||||
xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
|
||||
chunk_masks, pos_emb,
|
||||
mask_pad)
|
||||
return xs
|
||||
|
||||
def forward_chunk(
|
||||
self,
|
||||
xs: torch.Tensor,
|
||||
offset: int,
|
||||
required_cache_size: int,
|
||||
att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
|
||||
cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
|
||||
att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
""" Forward just one chunk
|
||||
|
||||
Args:
|
||||
xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
|
||||
where `time == (chunk_size - 1) * subsample_rate + \
|
||||
subsample.right_context + 1`
|
||||
offset (int): current offset in encoder output time stamp
|
||||
required_cache_size (int): cache size required for next chunk
|
||||
computation
|
||||
>=0: actual cache size
|
||||
<0: means all history cache is required
|
||||
att_cache (torch.Tensor): cache tensor for KEY & VALUE in
|
||||
transformer/conformer attention, with shape
|
||||
(elayers, head, cache_t1, d_k * 2), where
|
||||
`head * d_k == hidden-dim` and
|
||||
`cache_t1 == chunk_size * num_decoding_left_chunks`.
|
||||
cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
|
||||
(elayers, b=1, hidden-dim, cache_t2), where
|
||||
`cache_t2 == cnn.lorder - 1`
|
||||
|
||||
Returns:
|
||||
torch.Tensor: output of current input xs,
|
||||
with shape (b=1, chunk_size, hidden-dim).
|
||||
torch.Tensor: new attention cache required for next chunk, with
|
||||
dynamic shape (elayers, head, ?, d_k * 2)
|
||||
depending on required_cache_size.
|
||||
torch.Tensor: new conformer cnn cache required for next chunk, with
|
||||
same shape as the original cnn_cache.
|
||||
|
||||
"""
|
||||
assert xs.size(0) == 1
|
||||
# tmp_masks is just for interface compatibility
|
||||
tmp_masks = torch.ones(1,
|
||||
xs.size(1),
|
||||
device=xs.device,
|
||||
dtype=torch.bool)
|
||||
tmp_masks = tmp_masks.unsqueeze(1)
|
||||
if self.global_cmvn is not None:
|
||||
xs = self.global_cmvn(xs)
|
||||
# NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
|
||||
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
|
||||
# NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
|
||||
elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
|
||||
chunk_size = xs.size(1)
|
||||
attention_key_size = cache_t1 + chunk_size
|
||||
pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
|
||||
size=attention_key_size)
|
||||
if required_cache_size < 0:
|
||||
next_cache_start = 0
|
||||
elif required_cache_size == 0:
|
||||
next_cache_start = attention_key_size
|
||||
else:
|
||||
next_cache_start = max(attention_key_size - required_cache_size, 0)
|
||||
r_att_cache = []
|
||||
r_cnn_cache = []
|
||||
for i, layer in enumerate(self.encoders):
|
||||
# NOTE(xcsong): Before layer.forward
|
||||
# shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
|
||||
# shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
|
||||
xs, _, new_att_cache, new_cnn_cache = layer(
|
||||
xs,
|
||||
att_mask,
|
||||
pos_emb,
|
||||
att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
|
||||
cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
|
||||
# NOTE(xcsong): After layer.forward
|
||||
# shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
|
||||
# shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
|
||||
r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
|
||||
r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
|
||||
if self.normalize_before:
|
||||
xs = self.after_norm(xs)
|
||||
|
||||
# NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
|
||||
# ? may be larger than cache_t1, it depends on required_cache_size
|
||||
r_att_cache = torch.cat(r_att_cache, dim=0)
|
||||
# NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
|
||||
r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
|
||||
|
||||
return (xs, r_att_cache, r_cnn_cache)
|
||||
|
||||
def forward_chunk_by_chunk(
|
||||
self,
|
||||
xs: torch.Tensor,
|
||||
decoding_chunk_size: int,
|
||||
num_decoding_left_chunks: int = -1,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
""" Forward input chunk by chunk with chunk_size like a streaming
|
||||
fashion
|
||||
|
||||
Here we should pay special attention to computation cache in the
|
||||
streaming style forward chunk by chunk. Three things should be taken
|
||||
into account for computation in the current network:
|
||||
1. transformer/conformer encoder layers output cache
|
||||
2. convolution in conformer
|
||||
3. convolution in subsampling
|
||||
|
||||
However, we don't implement subsampling cache for:
|
||||
1. We can control the subsampling module to output the right result by
|
||||
overlapping the input instead of caching left context; even though this
|
||||
wastes some computation, subsampling only takes a very
|
||||
small fraction of computation in the whole model.
|
||||
2. Typically, there are several convolution layers with subsampling
|
||||
in the subsampling module, so it is tricky and complicated to cache
|
||||
across convolution layers with different subsampling
|
||||
rates.
|
||||
3. Currently, nn.Sequential is used to stack all the convolution
|
||||
layers in subsampling; we would need to rewrite it to make it work
|
||||
with a cache, which is not preferred.
|
||||
Args:
|
||||
xs (torch.Tensor): (1, max_len, dim)
|
||||
decoding_chunk_size (int): decoding chunk size
|
||||
"""
|
||||
assert decoding_chunk_size > 0
|
||||
# The model is trained by static or dynamic chunk
|
||||
assert self.static_chunk_size > 0 or self.use_dynamic_chunk
|
||||
subsampling = self.embed.subsampling_rate
|
||||
context = self.embed.right_context + 1 # Add current frame
|
||||
stride = subsampling * decoding_chunk_size
|
||||
decoding_window = (decoding_chunk_size - 1) * subsampling + context
|
||||
num_frames = xs.size(1)
|
||||
att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
|
||||
cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
|
||||
outputs = []
|
||||
offset = 0
|
||||
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
|
||||
|
||||
# Feed forward overlap input step by step
|
||||
for cur in range(0, num_frames - context + 1, stride):
|
||||
end = min(cur + decoding_window, num_frames)
|
||||
chunk_xs = xs[:, cur:end, :]
|
||||
(y, att_cache,
|
||||
cnn_cache) = self.forward_chunk(chunk_xs, offset,
|
||||
required_cache_size, att_cache,
|
||||
cnn_cache)
|
||||
outputs.append(y)
|
||||
offset += y.size(1)
|
||||
ys = torch.cat(outputs, 1)
|
||||
masks = torch.ones((1, 1, ys.size(1)),
|
||||
device=ys.device,
|
||||
dtype=torch.bool)
|
||||
return ys, masks
|
||||
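# Illustrative usage sketch (not from the original source; assumes an encoder
# configured for chunk-wise decoding, e.g. static_chunk_size > 0):
#     >>> encoder = ConformerEncoder(input_size=80, static_chunk_size=16)
#     >>> feats = torch.randn(1, 640, 80)    # (batch=1, frames, mel-dim)
#     >>> ys, masks = encoder.forward_chunk_by_chunk(
#     ...     feats, decoding_chunk_size=16, num_decoding_left_chunks=4)
#     >>> # ys: (1, ~frames // subsampling_rate, output_size), produced chunk by
#     >>> # chunk while att_cache / cnn_cache are carried between calls.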
|
||||
|
||||
class TransformerEncoder(BaseEncoder):
|
||||
"""Transformer encoder module."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
output_size: int = 256,
|
||||
attention_heads: int = 4,
|
||||
linear_units: int = 2048,
|
||||
num_blocks: int = 6,
|
||||
dropout_rate: float = 0.1,
|
||||
positional_dropout_rate: float = 0.1,
|
||||
attention_dropout_rate: float = 0.0,
|
||||
input_layer: str = "conv2d",
|
||||
pos_enc_layer_type: str = "abs_pos",
|
||||
normalize_before: bool = True,
|
||||
static_chunk_size: int = 0,
|
||||
use_dynamic_chunk: bool = False,
|
||||
global_cmvn: torch.nn.Module = None,
|
||||
use_dynamic_left_chunk: bool = False,
|
||||
key_bias: bool = True,
|
||||
selfattention_layer_type: str = "selfattn",
|
||||
activation_type: str = "relu",
|
||||
gradient_checkpointing: bool = False,
|
||||
):
|
||||
""" Construct TransformerEncoder
|
||||
|
||||
See Encoder for the meaning of each parameter.
|
||||
"""
|
||||
super().__init__(input_size, output_size, attention_heads,
|
||||
linear_units, num_blocks, dropout_rate,
|
||||
positional_dropout_rate, attention_dropout_rate,
|
||||
input_layer, pos_enc_layer_type, normalize_before,
|
||||
static_chunk_size, use_dynamic_chunk, global_cmvn,
|
||||
use_dynamic_left_chunk, gradient_checkpointing)
|
||||
activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
|
||||
self.encoders = torch.nn.ModuleList([
|
||||
TransformerEncoderLayer(
|
||||
output_size,
|
||||
COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](attention_heads,
|
||||
output_size,
|
||||
attention_dropout_rate,
|
||||
key_bias),
|
||||
PositionwiseFeedForward(output_size, linear_units,
|
||||
dropout_rate, activation),
|
||||
dropout_rate, normalize_before) for _ in range(num_blocks)
|
||||
])
|
||||
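# Illustrative usage sketch (not from the original source; parameter values are
# arbitrary examples):
#     >>> encoder = TransformerEncoder(input_size=80, output_size=256,
#     ...                              input_layer="linear",
#     ...                              pos_enc_layer_type="abs_pos")
#     >>> xs = torch.randn(2, 100, 80)         # (batch, time, feat-dim)
#     >>> xs_lens = torch.tensor([100, 80])    # valid length of each utterance
#     >>> ys, masks = encoder(xs, xs_lens)     # full, non-streaming forward
#     >>> # ys: (2, 100, 256) with the "linear" input layer (no subsampling)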
|
||||
|
||||
class ConformerEncoder(BaseEncoder):
|
||||
"""Conformer encoder module."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size: int,
|
||||
output_size: int = 256,
|
||||
attention_heads: int = 4,
|
||||
linear_units: int = 2048,
|
||||
num_blocks: int = 6,
|
||||
dropout_rate: float = 0.1,
|
||||
positional_dropout_rate: float = 0.1,
|
||||
attention_dropout_rate: float = 0.0,
|
||||
input_layer: str = "conv2d",
|
||||
pos_enc_layer_type: str = "rel_pos",
|
||||
normalize_before: bool = True,
|
||||
static_chunk_size: int = 0,
|
||||
use_dynamic_chunk: bool = False,
|
||||
global_cmvn: torch.nn.Module = None,
|
||||
use_dynamic_left_chunk: bool = False,
|
||||
positionwise_conv_kernel_size: int = 1,
|
||||
macaron_style: bool = True,
|
||||
selfattention_layer_type: str = "rel_selfattn",
|
||||
activation_type: str = "swish",
|
||||
use_cnn_module: bool = True,
|
||||
cnn_module_kernel: int = 15,
|
||||
causal: bool = False,
|
||||
cnn_module_norm: str = "batch_norm",
|
||||
key_bias: bool = True,
|
||||
gradient_checkpointing: bool = False,
|
||||
):
|
||||
"""Construct ConformerEncoder
|
||||
|
||||
Args:
|
||||
input_size to use_dynamic_chunk, see in BaseEncoder
|
||||
positionwise_conv_kernel_size (int): Kernel size of positionwise
|
||||
conv1d layer.
|
||||
macaron_style (bool): Whether to use macaron style for
|
||||
positionwise layer.
|
||||
selfattention_layer_type (str): Encoder attention layer type,
|
||||
the parameter has no effect now; it's just for configuration
|
||||
compatibility.
|
||||
activation_type (str): Encoder activation function type.
|
||||
use_cnn_module (bool): Whether to use convolution module.
|
||||
cnn_module_kernel (int): Kernel size of convolution module.
|
||||
causal (bool): whether to use causal convolution or not.
|
||||
key_bias: whether to use bias in attention.linear_k, False for whisper models.
|
||||
"""
|
||||
super().__init__(input_size, output_size, attention_heads,
|
||||
linear_units, num_blocks, dropout_rate,
|
||||
positional_dropout_rate, attention_dropout_rate,
|
||||
input_layer, pos_enc_layer_type, normalize_before,
|
||||
static_chunk_size, use_dynamic_chunk, global_cmvn,
|
||||
use_dynamic_left_chunk, gradient_checkpointing)
|
||||
activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
|
||||
|
||||
# self-attention module definition
|
||||
encoder_selfattn_layer_args = (
|
||||
attention_heads,
|
||||
output_size,
|
||||
attention_dropout_rate,
|
||||
key_bias,
|
||||
)
|
||||
# feed-forward module definition
|
||||
positionwise_layer_args = (
|
||||
output_size,
|
||||
linear_units,
|
||||
dropout_rate,
|
||||
activation,
|
||||
)
|
||||
# convolution module definition
|
||||
convolution_layer_args = (output_size, cnn_module_kernel, activation,
|
||||
cnn_module_norm, causal)
|
||||
|
||||
self.encoders = torch.nn.ModuleList([
|
||||
ConformerEncoderLayer(
|
||||
output_size,
|
||||
COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
|
||||
*encoder_selfattn_layer_args),
|
||||
PositionwiseFeedForward(*positionwise_layer_args),
|
||||
PositionwiseFeedForward(
|
||||
*positionwise_layer_args) if macaron_style else None,
|
||||
ConvolutionModule(
|
||||
*convolution_layer_args) if use_cnn_module else None,
|
||||
dropout_rate,
|
||||
normalize_before,
|
||||
) for _ in range(num_blocks)
|
||||
])
|
||||
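# Illustrative usage sketch (not from the original source): the defaults build a
# standard Conformer block (rel_pos encoding, rel_selfattn attention, macaron
# feed-forward and a depthwise conv module with kernel 15):
#     >>> encoder = ConformerEncoder(input_size=80)
#     >>> xs = torch.randn(2, 200, 80)
#     >>> xs_lens = torch.tensor([200, 150])
#     >>> ys, masks = encoder(xs, xs_lens)
#     >>> # with the default "conv2d" input layer time is subsampled by 4:
#     >>> # ys: (2, 49, 256), masks: (2, 1, 49)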
236
cosyvoice/transformer/encoder_layer.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
|
||||
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from ESPnet(https://github.com/espnet/espnet)
|
||||
"""Encoder self-attention layer definition."""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
class TransformerEncoderLayer(nn.Module):
|
||||
"""Encoder layer module.
|
||||
|
||||
Args:
|
||||
size (int): Input dimension.
|
||||
self_attn (torch.nn.Module): Self-attention module instance.
|
||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
|
||||
instance can be used as the argument.
|
||||
feed_forward (torch.nn.Module): Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, instance can be used as the argument.
|
||||
dropout_rate (float): Dropout rate.
|
||||
normalize_before (bool):
|
||||
True: use layer_norm before each sub-block.
|
||||
False: use layer_norm after each sub-block.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
size: int,
|
||||
self_attn: torch.nn.Module,
|
||||
feed_forward: torch.nn.Module,
|
||||
dropout_rate: float,
|
||||
normalize_before: bool = True,
|
||||
):
|
||||
"""Construct an EncoderLayer object."""
|
||||
super().__init__()
|
||||
self.self_attn = self_attn
|
||||
self.feed_forward = feed_forward
|
||||
self.norm1 = nn.LayerNorm(size, eps=1e-5)
|
||||
self.norm2 = nn.LayerNorm(size, eps=1e-5)
|
||||
self.dropout = nn.Dropout(dropout_rate)
|
||||
self.size = size
|
||||
self.normalize_before = normalize_before
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
pos_emb: torch.Tensor,
|
||||
mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
|
||||
att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
|
||||
cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Compute encoded features.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): (#batch, time, size)
|
||||
mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
|
||||
(0, 0, 0) means fake mask.
|
||||
pos_emb (torch.Tensor): just for interface compatibility
|
||||
to ConformerEncoderLayer
|
||||
mask_pad (torch.Tensor): not used in the transformer layer,
|
||||
just for a unified API with the conformer layer.
|
||||
att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
|
||||
(#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
|
||||
cnn_cache (torch.Tensor): Convolution cache in conformer layer
|
||||
(#batch=1, size, cache_t2), not used here, it's for interface
|
||||
compatibility to ConformerEncoderLayer.
|
||||
Returns:
|
||||
torch.Tensor: Output tensor (#batch, time, size).
|
||||
torch.Tensor: Mask tensor (#batch, time, time).
|
||||
torch.Tensor: att_cache tensor,
|
||||
(#batch=1, head, cache_t1 + time, d_k * 2).
|
||||
torch.Tensor: cnn_cache tensor (#batch=1, size, cache_t2).
|
||||
|
||||
"""
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm1(x)
|
||||
x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb=pos_emb, cache=att_cache)
|
||||
x = residual + self.dropout(x_att)
|
||||
if not self.normalize_before:
|
||||
x = self.norm1(x)
|
||||
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm2(x)
|
||||
x = residual + self.dropout(self.feed_forward(x))
|
||||
if not self.normalize_before:
|
||||
x = self.norm2(x)
|
||||
|
||||
fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
|
||||
return x, mask, new_att_cache, fake_cnn_cache
|
||||
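# Illustrative sketch (not from the original source): the layer only needs a
# self-attention module that returns (output, new_cache) and a feed-forward
# module; the stubs below are stand-ins used purely to show the call signature.
#     >>> class _AttnStub(torch.nn.Module):
#     ...     def forward(self, q, k, v, mask, pos_emb=None, cache=None):
#     ...         return q, torch.zeros(1, 1, 0, 0)
#     >>> layer = TransformerEncoderLayer(size=256, self_attn=_AttnStub(),
#     ...                                 feed_forward=torch.nn.Linear(256, 256),
#     ...                                 dropout_rate=0.1)
#     >>> x = torch.randn(1, 10, 256)
#     >>> mask = torch.ones(1, 10, 10, dtype=torch.bool)
#     >>> x, mask, att_cache, cnn_cache = layer(x, mask, torch.zeros(1, 10, 256))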
|
||||
|
||||
class ConformerEncoderLayer(nn.Module):
|
||||
"""Encoder layer module.
|
||||
Args:
|
||||
size (int): Input dimension.
|
||||
self_attn (torch.nn.Module): Self-attention module instance.
|
||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
|
||||
instance can be used as the argument.
|
||||
feed_forward (torch.nn.Module): Feed-forward module instance.
|
||||
`PositionwiseFeedForward` instance can be used as the argument.
|
||||
feed_forward_macaron (torch.nn.Module): Additional feed-forward module
|
||||
instance.
|
||||
`PositionwiseFeedForward` instance can be used as the argument.
|
||||
conv_module (torch.nn.Module): Convolution module instance.
|
||||
`ConvolutionModule` instance can be used as the argument.
|
||||
dropout_rate (float): Dropout rate.
|
||||
normalize_before (bool):
|
||||
True: use layer_norm before each sub-block.
|
||||
False: use layer_norm after each sub-block.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
size: int,
|
||||
self_attn: torch.nn.Module,
|
||||
feed_forward: Optional[nn.Module] = None,
|
||||
feed_forward_macaron: Optional[nn.Module] = None,
|
||||
conv_module: Optional[nn.Module] = None,
|
||||
dropout_rate: float = 0.1,
|
||||
normalize_before: bool = True,
|
||||
):
|
||||
"""Construct an EncoderLayer object."""
|
||||
super().__init__()
|
||||
self.self_attn = self_attn
|
||||
self.feed_forward = feed_forward
|
||||
self.feed_forward_macaron = feed_forward_macaron
|
||||
self.conv_module = conv_module
|
||||
self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module
|
||||
self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module
|
||||
if feed_forward_macaron is not None:
|
||||
self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
|
||||
self.ff_scale = 0.5
|
||||
else:
|
||||
self.ff_scale = 1.0
|
||||
if self.conv_module is not None:
|
||||
self.norm_conv = nn.LayerNorm(size, eps=1e-5) # for the CNN module
|
||||
self.norm_final = nn.LayerNorm(
|
||||
size, eps=1e-5) # for the final output of the block
|
||||
self.dropout = nn.Dropout(dropout_rate)
|
||||
self.size = size
|
||||
self.normalize_before = normalize_before
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
pos_emb: torch.Tensor,
|
||||
mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
|
||||
att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
|
||||
cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Compute encoded features.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): (#batch, time, size)
|
||||
mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
|
||||
(0, 0, 0) means fake mask.
|
||||
pos_emb (torch.Tensor): positional encoding, must not be None
|
||||
for ConformerEncoderLayer.
|
||||
mask_pad (torch.Tensor): batch padding mask used for conv module.
|
||||
(#batch, 1, time), (0, 0, 0) means fake mask.
|
||||
att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
|
||||
(#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
|
||||
cnn_cache (torch.Tensor): Convolution cache in conformer layer
|
||||
(#batch=1, size, cache_t2)
|
||||
Returns:
|
||||
torch.Tensor: Output tensor (#batch, time, size).
|
||||
torch.Tensor: Mask tensor (#batch, time, time).
|
||||
torch.Tensor: att_cache tensor,
|
||||
(#batch=1, head, cache_t1 + time, d_k * 2).
|
||||
torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
|
||||
"""
|
||||
|
||||
# whether to use macaron style
|
||||
if self.feed_forward_macaron is not None:
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm_ff_macaron(x)
|
||||
x = residual + self.ff_scale * self.dropout(
|
||||
self.feed_forward_macaron(x))
|
||||
if not self.normalize_before:
|
||||
x = self.norm_ff_macaron(x)
|
||||
|
||||
# multi-headed self-attention module
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm_mha(x)
|
||||
x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
|
||||
att_cache)
|
||||
x = residual + self.dropout(x_att)
|
||||
if not self.normalize_before:
|
||||
x = self.norm_mha(x)
|
||||
|
||||
# convolution module
|
||||
# Fake new cnn cache here, and then change it in conv_module
|
||||
new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
|
||||
if self.conv_module is not None:
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm_conv(x)
|
||||
x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
|
||||
x = residual + self.dropout(x)
|
||||
|
||||
if not self.normalize_before:
|
||||
x = self.norm_conv(x)
|
||||
|
||||
# feed forward module
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
x = self.norm_ff(x)
|
||||
|
||||
x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
|
||||
if not self.normalize_before:
|
||||
x = self.norm_ff(x)
|
||||
|
||||
if self.conv_module is not None:
|
||||
x = self.norm_final(x)
|
||||
|
||||
return x, mask, new_att_cache, new_cnn_cache
|
||||
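# Summary of the block above (illustrative, not from the original source), with
# pre-norm (normalize_before=True) and the macaron / conv modules enabled:
#     x = x + 0.5 * Dropout(FFN_macaron(LN(x)))
#     x = x + Dropout(MHSA(LN(x), pos_emb, att_cache))
#     x = x + Dropout(Conv(LN(x), mask_pad, cnn_cache))
#     x = x + 0.5 * Dropout(FFN(LN(x)))
#     x = LN_final(x)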
96
cosyvoice/transformer/label_smoothing_loss.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# Copyright (c) 2019 Shigeki Karita
|
||||
# 2020 Mobvoi Inc (Binbin Zhang)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Label smoothing module."""
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
class LabelSmoothingLoss(nn.Module):
|
||||
"""Label-smoothing loss.
|
||||
|
||||
In a standard CE loss, the target label distribution is one-hot:
|
||||
[0,1,2] ->
|
||||
[
|
||||
[1.0, 0.0, 0.0],
|
||||
[0.0, 1.0, 0.0],
|
||||
[0.0, 0.0, 1.0],
|
||||
]
|
||||
|
||||
In the label-smoothed CE loss, some probability mass
|
||||
is taken from the true label's probability (1.0) and divided
|
||||
among other labels.
|
||||
|
||||
e.g.
|
||||
smoothing=0.1
|
||||
[0,1,2] ->
|
||||
[
|
||||
[0.9, 0.05, 0.05],
|
||||
[0.05, 0.9, 0.05],
|
||||
[0.05, 0.05, 0.9],
|
||||
]
|
||||
|
||||
Args:
|
||||
size (int): the number of class
|
||||
padding_idx (int): padding class id which will be ignored for loss
|
||||
smoothing (float): smoothing rate (0.0 means the conventional CE)
|
||||
normalize_length (bool):
|
||||
normalize loss by sequence length if True
|
||||
normalize loss by batch size if False
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
size: int,
|
||||
padding_idx: int,
|
||||
smoothing: float,
|
||||
normalize_length: bool = False):
|
||||
"""Construct an LabelSmoothingLoss object."""
|
||||
super(LabelSmoothingLoss, self).__init__()
|
||||
self.criterion = nn.KLDivLoss(reduction="none")
|
||||
self.padding_idx = padding_idx
|
||||
self.confidence = 1.0 - smoothing
|
||||
self.smoothing = smoothing
|
||||
self.size = size
|
||||
self.normalize_length = normalize_length
|
||||
|
||||
def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
|
||||
"""Compute loss between x and target.
|
||||
|
||||
The model output and data label tensors are flattened to
|
||||
(batch*seqlen, class) shape and a mask is applied to the
|
||||
padding part, which should not contribute to the loss.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): prediction (batch, seqlen, class)
|
||||
target (torch.Tensor):
|
||||
target signal masked with self.padding_id (batch, seqlen)
|
||||
Returns:
|
||||
loss (torch.Tensor) : The KL loss, scalar float value
|
||||
"""
|
||||
assert x.size(2) == self.size
|
||||
batch_size = x.size(0)
|
||||
x = x.view(-1, self.size)
|
||||
target = target.view(-1)
|
||||
# use zeros_like instead of torch.no_grad() for true_dist,
|
||||
# since no_grad() can not be exported by JIT
|
||||
true_dist = torch.zeros_like(x)
|
||||
true_dist.fill_(self.smoothing / (self.size - 1))
|
||||
ignore = target == self.padding_idx # (B,)
|
||||
total = len(target) - ignore.sum().item()
|
||||
target = target.masked_fill(ignore, 0) # avoid -1 index
|
||||
true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
|
||||
kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
|
||||
denom = total if self.normalize_length else batch_size
|
||||
return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
|
||||
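# Illustrative usage sketch (not from the original source; sizes are arbitrary
# examples):
#     >>> criterion = LabelSmoothingLoss(size=5000, padding_idx=-1,
#     ...                                smoothing=0.1, normalize_length=True)
#     >>> logits = torch.randn(2, 10, 5000)           # (batch, seqlen, class)
#     >>> target = torch.randint(0, 5000, (2, 10))    # pad positions set to -1
#     >>> loss = criterion(logits, target)            # scalar KL loss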
115
cosyvoice/transformer/positionwise_feed_forward.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# Copyright (c) 2019 Shigeki Karita
|
||||
# 2020 Mobvoi Inc (Binbin Zhang)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Positionwise feed forward layer definition."""
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class PositionwiseFeedForward(torch.nn.Module):
|
||||
"""Positionwise feed forward layer.
|
||||
|
||||
The feed-forward layer is applied to each position of the sequence.
|
||||
The output dim is the same as the input dim.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
hidden_units (int): The number of hidden units.
|
||||
dropout_rate (float): Dropout rate.
|
||||
activation (torch.nn.Module): Activation function
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
idim: int,
|
||||
hidden_units: int,
|
||||
dropout_rate: float,
|
||||
activation: torch.nn.Module = torch.nn.ReLU(),
|
||||
):
|
||||
"""Construct a PositionwiseFeedForward object."""
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.w_1 = torch.nn.Linear(idim, hidden_units)
|
||||
self.activation = activation
|
||||
self.dropout = torch.nn.Dropout(dropout_rate)
|
||||
self.w_2 = torch.nn.Linear(hidden_units, idim)
|
||||
|
||||
def forward(self, xs: torch.Tensor) -> torch.Tensor:
|
||||
"""Forward function.
|
||||
|
||||
Args:
|
||||
xs: input tensor (B, L, D)
|
||||
Returns:
|
||||
output tensor, (B, L, D)
|
||||
"""
|
||||
return self.w_2(self.dropout(self.activation(self.w_1(xs))))
|
||||
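# Illustrative usage sketch (not from the original source):
#     >>> ffn = PositionwiseFeedForward(idim=256, hidden_units=2048,
#     ...                               dropout_rate=0.1)
#     >>> xs = torch.randn(4, 100, 256)
#     >>> ffn(xs).shape                    # output keeps the input dimension
#     torch.Size([4, 100, 256])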
|
||||
|
||||
class MoEFFNLayer(torch.nn.Module):
|
||||
"""
|
||||
Mixture of experts built from positionwise feed-forward layers
|
||||
See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
|
||||
The output dim is the same as the input dim.
|
||||
|
||||
Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
|
||||
https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
|
||||
Args:
|
||||
n_expert: number of experts.
|
||||
n_expert_per_token: The actual number of experts used for each frame
|
||||
idim (int): Input dimension.
|
||||
hidden_units (int): The number of hidden units.
|
||||
dropout_rate (float): Dropout rate.
|
||||
activation (torch.nn.Module): Activation function
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_expert: int,
|
||||
n_expert_per_token: int,
|
||||
idim: int,
|
||||
hidden_units: int,
|
||||
dropout_rate: float,
|
||||
activation: torch.nn.Module = torch.nn.ReLU(),
|
||||
):
|
||||
super(MoEFFNLayer, self).__init__()
|
||||
self.gate = torch.nn.Linear(idim, n_expert, bias=False)
|
||||
self.experts = torch.nn.ModuleList(
|
||||
PositionwiseFeedForward(idim, hidden_units, dropout_rate,
|
||||
activation) for _ in range(n_expert))
|
||||
self.n_expert_per_token = n_expert_per_token
|
||||
|
||||
def forward(self, xs: torch.Tensor) -> torch.Tensor:
|
||||
"""Foward function.
|
||||
Args:
|
||||
xs: input tensor (B, L, D)
|
||||
Returns:
|
||||
output tensor, (B, L, D)
|
||||
|
||||
"""
|
||||
B, L, D = xs.size(
|
||||
) # batch size, sequence length, embedding dimension (idim)
|
||||
xs = xs.view(-1, D) # (B*L, D)
|
||||
router = self.gate(xs) # (B*L, n_expert)
|
||||
logits, indices = torch.topk(
|
||||
router, self.n_expert_per_token
|
||||
) # logits: (B*L, n_expert_per_token), indices: (B*L, n_expert_per_token)
|
||||
weights = torch.nn.functional.softmax(
|
||||
logits, dim=1,
|
||||
dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token)
|
||||
output = torch.zeros_like(xs) # (B*L, D)
|
||||
for i, expert in enumerate(self.experts):
|
||||
mask = indices == i
|
||||
batch_idx, ith_expert = torch.where(mask)
|
||||
output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(
|
||||
xs[batch_idx])
|
||||
return output.view(B, L, D)
|
||||
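# Illustrative usage sketch (not from the original source): each frame is routed
# to its top n_expert_per_token experts and their outputs are combined with
# softmax-normalised gate weights.
#     >>> moe = MoEFFNLayer(n_expert=8, n_expert_per_token=2, idim=256,
#     ...                   hidden_units=1024, dropout_rate=0.1)
#     >>> xs = torch.randn(2, 50, 256)
#     >>> moe(xs).shape
#     torch.Size([2, 50, 256])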
383
cosyvoice/transformer/subsampling.py
Normal file
@@ -0,0 +1,383 @@
|
||||
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
|
||||
# 2024 Alibaba Inc (Xiang Lyu)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from ESPnet(https://github.com/espnet/espnet)
|
||||
"""Subsampling layer definition."""
|
||||
|
||||
from typing import Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class BaseSubsampling(torch.nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.right_context = 0
|
||||
self.subsampling_rate = 1
|
||||
|
||||
def position_encoding(self, offset: Union[int, torch.Tensor],
|
||||
size: int) -> torch.Tensor:
|
||||
return self.pos_enc.position_encoding(offset, size)
|
||||
|
||||
|
||||
class EmbedinigNoSubsampling(BaseSubsampling):
|
||||
"""Embedding input without subsampling
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
super().__init__()
|
||||
self.embed = torch.nn.Embedding(idim, odim)
|
||||
self.pos_enc = pos_enc_class
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Input x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: linear input tensor (#batch, time', odim),
|
||||
where time' = time .
|
||||
torch.Tensor: linear input mask (#batch, 1, time'),
|
||||
where time' = time .
|
||||
|
||||
"""
|
||||
x = self.embed(x)
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask
|
||||
|
||||
|
||||
class LinearNoSubsampling(BaseSubsampling):
|
||||
"""Linear transform the input without subsampling
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an linear object."""
|
||||
super().__init__()
|
||||
self.out = torch.nn.Sequential(
|
||||
torch.nn.Linear(idim, odim),
|
||||
torch.nn.LayerNorm(odim, eps=1e-5),
|
||||
torch.nn.Dropout(dropout_rate),
|
||||
)
|
||||
self.pos_enc = pos_enc_class
|
||||
self.right_context = 0
|
||||
self.subsampling_rate = 1
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Input x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: linear input tensor (#batch, time', odim),
|
||||
where time' = time .
|
||||
torch.Tensor: linear input mask (#batch, 1, time'),
|
||||
where time' = time .
|
||||
|
||||
"""
|
||||
x = self.out(x)
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask
|
||||
|
||||
|
||||
class Conv1dSubsampling2(BaseSubsampling):
|
||||
"""Convolutional 1D subsampling (to 1/2 length).
|
||||
It is designed for Whisper, ref:
|
||||
https://github.com/openai/whisper/blob/main/whisper/model.py
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an Conv1dSubsampling2 object."""
|
||||
super().__init__()
|
||||
self.conv = torch.nn.Sequential(
|
||||
torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
|
||||
torch.nn.GELU(),
|
||||
torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
|
||||
torch.nn.GELU(),
|
||||
)
|
||||
self.pos_enc = pos_enc_class
|
||||
# The right context for every conv layer is computed by:
|
||||
# (kernel_size - 1) * frame_rate_of_this_layer
|
||||
self.subsampling_rate = 2
|
||||
# 4 = (3 - 1) * 1 + (3 - 1) * 1
|
||||
self.right_context = 4
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Subsample x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Subsampled tensor (#batch, time', odim),
|
||||
where time' = time // 2.
|
||||
torch.Tensor: Subsampled mask (#batch, 1, time'),
|
||||
where time' = time // 2.
|
||||
torch.Tensor: positional encoding
|
||||
|
||||
"""
|
||||
time = x.size(1)
|
||||
x = x.transpose(1, 2) # (b, f, t)
|
||||
x = self.conv(x)
|
||||
x = x.transpose(1, 2) # (b, t, f)
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]
|
||||
|
||||
|
||||
class Conv2dSubsampling4(BaseSubsampling):
|
||||
"""Convolutional 2D subsampling (to 1/4 length).
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an Conv2dSubsampling4 object."""
|
||||
super().__init__()
|
||||
self.conv = torch.nn.Sequential(
|
||||
torch.nn.Conv2d(1, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv2d(odim, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
)
|
||||
self.out = torch.nn.Sequential(
|
||||
torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
|
||||
self.pos_enc = pos_enc_class
|
||||
# The right context for every conv layer is computed by:
|
||||
# (kernel_size - 1) * frame_rate_of_this_layer
|
||||
self.subsampling_rate = 4
|
||||
# 6 = (3 - 1) * 1 + (3 - 1) * 2
|
||||
self.right_context = 6
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Subsample x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Subsampled tensor (#batch, time', odim),
|
||||
where time' = time // 4.
|
||||
torch.Tensor: Subsampled mask (#batch, 1, time'),
|
||||
where time' = time // 4.
|
||||
torch.Tensor: positional encoding
|
||||
|
||||
"""
|
||||
x = x.unsqueeze(1) # (b, c=1, t, f)
|
||||
x = self.conv(x)
|
||||
b, c, t, f = x.size()
|
||||
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
|
||||
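# Illustrative shape check (not from the original source; `pos_enc` stands for
# any positional encoding module of this package that maps (x, offset) to
# (x, pos_emb) -- its construction is omitted here):
#     >>> sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1,
#     ...                          pos_enc_class=pos_enc)
#     >>> x = torch.randn(1, 100, 80)
#     >>> mask = torch.ones(1, 1, 100, dtype=torch.bool)
#     >>> y, pos_emb, y_mask = sub(x, mask)
#     >>> y.shape, y_mask.shape            # time 100 -> 24 (roughly time // 4)
#     (torch.Size([1, 24, 256]), torch.Size([1, 1, 24]))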
|
||||
|
||||
class Conv2dSubsampling6(BaseSubsampling):
|
||||
"""Convolutional 2D subsampling (to 1/6 length).
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
pos_enc (torch.nn.Module): Custom position encoding layer.
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an Conv2dSubsampling6 object."""
|
||||
super().__init__()
|
||||
self.conv = torch.nn.Sequential(
|
||||
torch.nn.Conv2d(1, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv2d(odim, odim, 5, 3),
|
||||
torch.nn.ReLU(),
|
||||
)
|
||||
self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
|
||||
odim)
|
||||
self.pos_enc = pos_enc_class
|
||||
# 10 = (3 - 1) * 1 + (5 - 1) * 2
|
||||
self.subsampling_rate = 6
|
||||
self.right_context = 10
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Subsample x.
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Subsampled tensor (#batch, time', odim),
|
||||
where time' = time // 6.
|
||||
torch.Tensor: Subsampled mask (#batch, 1, time'),
|
||||
where time' = time // 6.
|
||||
torch.Tensor: positional encoding
|
||||
"""
|
||||
x = x.unsqueeze(1) # (b, c, t, f)
|
||||
x = self.conv(x)
|
||||
b, c, t, f = x.size()
|
||||
x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
|
||||
|
||||
|
||||
class Conv2dSubsampling8(BaseSubsampling):
|
||||
"""Convolutional 2D subsampling (to 1/8 length).
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an Conv2dSubsampling8 object."""
|
||||
super().__init__()
|
||||
self.conv = torch.nn.Sequential(
|
||||
torch.nn.Conv2d(1, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv2d(odim, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
torch.nn.Conv2d(odim, odim, 3, 2),
|
||||
torch.nn.ReLU(),
|
||||
)
|
||||
self.linear = torch.nn.Linear(
|
||||
odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
|
||||
self.pos_enc = pos_enc_class
|
||||
self.subsampling_rate = 8
|
||||
# 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
|
||||
self.right_context = 14
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Subsample x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Subsampled tensor (#batch, time', odim),
|
||||
where time' = time // 8.
|
||||
torch.Tensor: Subsampled mask (#batch, 1, time'),
|
||||
where time' = time // 8.
|
||||
torch.Tensor: positional encoding
|
||||
"""
|
||||
x = x.unsqueeze(1) # (b, c, t, f)
|
||||
x = self.conv(x)
|
||||
b, c, t, f = x.size()
|
||||
x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
|
||||
|
||||
|
||||
class LegacyLinearNoSubsampling(BaseSubsampling):
|
||||
"""Linear transform the input without subsampling
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
odim (int): Output dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, idim: int, odim: int, dropout_rate: float,
|
||||
pos_enc_class: torch.nn.Module):
|
||||
"""Construct an linear object."""
|
||||
super().__init__()
|
||||
self.out = torch.nn.Sequential(
|
||||
torch.nn.Linear(idim, odim),
|
||||
torch.nn.LayerNorm(odim, eps=1e-5),
|
||||
torch.nn.Dropout(dropout_rate),
|
||||
torch.nn.ReLU(),
|
||||
)
|
||||
self.pos_enc = pos_enc_class
|
||||
self.right_context = 0
|
||||
self.subsampling_rate = 1
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
offset: Union[int, torch.Tensor] = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Input x.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor (#batch, time, idim).
|
||||
x_mask (torch.Tensor): Input mask (#batch, 1, time).
|
||||
|
||||
Returns:
|
||||
torch.Tensor: linear input tensor (#batch, time', odim),
|
||||
where time' = time .
|
||||
torch.Tensor: linear input mask (#batch, 1, time'),
|
||||
where time' = time .
|
||||
|
||||
"""
|
||||
x = self.out(x)
|
||||
x, pos_emb = self.pos_enc(x, offset)
|
||||
return x, pos_emb, x_mask
|
||||