mirror of https://github.com/HumanAIGC/lite-avatar.git
synced 2026-02-05 18:09:20 +08:00
add files
1 funasr_local/modules/rnn/__init__.py Normal file
@@ -0,0 +1 @@
"""Initialize sub package."""
156 funasr_local/modules/rnn/argument.py Normal file
@@ -0,0 +1,156 @@
# Copyright 2020 Hirofumi Inaguma
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN common arguments."""


def add_arguments_rnn_encoder_common(group):
    """Define common arguments for RNN encoder."""
    group.add_argument(
        "--etype",
        default="blstmp",
        type=str,
        choices=[
            "lstm",
            "blstm",
            "lstmp",
            "blstmp",
            "vgglstmp",
            "vggblstmp",
            "vgglstm",
            "vggblstm",
            "gru",
            "bgru",
            "grup",
            "bgrup",
            "vgggrup",
            "vggbgrup",
            "vgggru",
            "vggbgru",
        ],
        help="Type of encoder network architecture",
    )
    group.add_argument(
        "--elayers",
        default=4,
        type=int,
        help="Number of encoder layers",
    )
    group.add_argument(
        "--eunits",
        "-u",
        default=300,
        type=int,
        help="Number of encoder hidden units",
    )
    group.add_argument(
        "--eprojs", default=320, type=int, help="Number of encoder projection units"
    )
    group.add_argument(
        "--subsample",
        default="1",
        type=str,
        help="Subsample input frames x_y_z means "
        "subsample every x frame at 1st layer, "
        "every y frame at 2nd layer etc.",
    )
    return group


def add_arguments_rnn_decoder_common(group):
    """Define common arguments for RNN decoder."""
    group.add_argument(
        "--dtype",
        default="lstm",
        type=str,
        choices=["lstm", "gru"],
        help="Type of decoder network architecture",
    )
    group.add_argument(
        "--dlayers", default=1, type=int, help="Number of decoder layers"
    )
    group.add_argument(
        "--dunits", default=320, type=int, help="Number of decoder hidden units"
    )
    group.add_argument(
        "--dropout-rate-decoder",
        default=0.0,
        type=float,
        help="Dropout rate for the decoder",
    )
    group.add_argument(
        "--sampling-probability",
        default=0.0,
        type=float,
        help="Ratio of predicted labels fed back to decoder",
    )
    group.add_argument(
        "--lsm-type",
        const="",
        default="",
        type=str,
        nargs="?",
        choices=["", "unigram"],
        help="Apply label smoothing with a specified distribution type",
    )
    return group


def add_arguments_rnn_attention_common(group):
    """Define common arguments for RNN attention."""
    group.add_argument(
        "--atype",
        default="dot",
        type=str,
        choices=[
            "noatt",
            "dot",
            "add",
            "location",
            "coverage",
            "coverage_location",
            "location2d",
            "location_recurrent",
            "multi_head_dot",
            "multi_head_add",
            "multi_head_loc",
            "multi_head_multi_res_loc",
        ],
        help="Type of attention architecture",
    )
    group.add_argument(
        "--adim",
        default=320,
        type=int,
        help="Number of attention transformation dimensions",
    )
    group.add_argument(
        "--awin", default=5, type=int, help="Window size for location2d attention"
    )
    group.add_argument(
        "--aheads",
        default=4,
        type=int,
        help="Number of heads for multi head attention",
    )
    group.add_argument(
        "--aconv-chans",
        default=-1,
        type=int,
        help="Number of attention convolution channels \
             (negative value indicates no location-aware attention)",
    )
    group.add_argument(
        "--aconv-filts",
        default=100,
        type=int,
        help="Number of attention convolution filters \
             (negative value indicates no location-aware attention)",
    )
    group.add_argument(
        "--dropout-rate",
        default=0.0,
        type=float,
        help="Dropout rate for the encoder",
    )
    return group
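
# Usage sketch (illustrative; the parser wiring below is hypothetical, not
# part of this file): each helper mutates and returns the same argparse
# group, so the three can be applied back to back when building a CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group("rnn arguments")
    add_arguments_rnn_encoder_common(group)
    add_arguments_rnn_decoder_common(group)
    add_arguments_rnn_attention_common(group)
    args = parser.parse_args(["--etype", "vggblstmp", "--atype", "location"])
    # Options left unspecified keep their defaults, e.g. args.eunits == 300.
    print(args.etype, args.atype, args.eunits)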
1808 funasr_local/modules/rnn/attentions.py Normal file
File diff suppressed because it is too large
1211 funasr_local/modules/rnn/decoders.py Normal file
File diff suppressed because it is too large
372 funasr_local/modules/rnn/encoders.py Normal file
@@ -0,0 +1,372 @@
import logging

import numpy as np
import six
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from funasr_local.modules.e2e_asr_common import get_vgg2l_odim
from funasr_local.modules.nets_utils import make_pad_mask
from funasr_local.modules.nets_utils import to_device

class RNNP(torch.nn.Module):
    """RNN with projection layer module

    :param int idim: dimension of inputs
    :param int elayers: number of encoder layers
    :param int cdim: number of rnn units (resulted in cdim * 2 if bidirectional)
    :param int hdim: number of projection units
    :param np.ndarray subsample: list of subsampling numbers
    :param float dropout: dropout rate
    :param str typ: The RNN type
    """

    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        super(RNNP, self).__init__()
        bidir = typ[0] == "b"
        for i in six.moves.range(elayers):
            if i == 0:
                inputdim = idim
            else:
                inputdim = hdim

            RNN = torch.nn.LSTM if "lstm" in typ else torch.nn.GRU
            rnn = RNN(
                inputdim, cdim, num_layers=1, bidirectional=bidir, batch_first=True
            )

            setattr(self, "%s%d" % ("birnn" if bidir else "rnn", i), rnn)

            # bottleneck layer to merge
            if bidir:
                setattr(self, "bt%d" % i, torch.nn.Linear(2 * cdim, hdim))
            else:
                setattr(self, "bt%d" % i, torch.nn.Linear(cdim, hdim))

        self.elayers = elayers
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir
        self.dropout = dropout

    def forward(self, xs_pad, ilens, prev_state=None):
        """RNNP forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor prev_state: batch of previous RNN states
        :return: batch of hidden state sequences (B, Tmax, hdim)
        :rtype: torch.Tensor
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))
        elayer_states = []
        for layer in six.moves.range(self.elayers):
            if not isinstance(ilens, torch.Tensor):
                ilens = torch.tensor(ilens)
            xs_pack = pack_padded_sequence(xs_pad, ilens.cpu(), batch_first=True)
            rnn = getattr(self, ("birnn" if self.bidir else "rnn") + str(layer))
            rnn.flatten_parameters()
            if prev_state is not None and rnn.bidirectional:
                prev_state = reset_backward_rnn_state(prev_state)
            ys, states = rnn(
                xs_pack, hx=None if prev_state is None else prev_state[layer]
            )
            elayer_states.append(states)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            ys_pad, ilens = pad_packed_sequence(ys, batch_first=True)
            sub = self.subsample[layer + 1]
            if sub > 1:
                ys_pad = ys_pad[:, ::sub]
                ilens = torch.tensor([int(i + 1) // sub for i in ilens])
            # (sum _utt frame_utt) x dim
            projection_layer = getattr(self, "bt%d" % layer)
            projected = projection_layer(ys_pad.contiguous().view(-1, ys_pad.size(2)))
            xs_pad = projected.view(ys_pad.size(0), ys_pad.size(1), -1)
            if layer < self.elayers - 1:
                xs_pad = torch.tanh(F.dropout(xs_pad, p=self.dropout))

        return xs_pad, ilens, elayer_states  # x: utt list of frame x dim

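
# Shape sketch (illustrative; `_demo_rnnp` is a hypothetical helper, not part
# of the original file): with subsample = [1, 2, 1, 1, 1] the first layer
# keeps every 2nd frame, so 10 input frames shrink to 5 while features are
# projected down to hdim.
def _demo_rnnp():
    rnnp = RNNP(idim=40, elayers=4, cdim=32, hdim=16,
                subsample=[1, 2, 1, 1, 1], dropout=0.0)
    xs, ilens = torch.randn(2, 10, 40), torch.tensor([10, 8])
    out, olens, _ = rnnp(xs, ilens)
    print(out.shape, olens)  # torch.Size([2, 5, 16]) tensor([5, 4])
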
class RNN(torch.nn.Module):
    """RNN module

    :param int idim: dimension of inputs
    :param int elayers: number of encoder layers
    :param int cdim: number of rnn units (resulted in cdim * 2 if bidirectional)
    :param int hdim: number of final projection units
    :param float dropout: dropout rate
    :param str typ: The RNN type
    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="blstm"):
        super(RNN, self).__init__()
        bidir = typ[0] == "b"
        self.nbrnn = (
            torch.nn.LSTM(
                idim,
                cdim,
                elayers,
                batch_first=True,
                dropout=dropout,
                bidirectional=bidir,
            )
            if "lstm" in typ
            else torch.nn.GRU(
                idim,
                cdim,
                elayers,
                batch_first=True,
                dropout=dropout,
                bidirectional=bidir,
            )
        )
        if bidir:
            self.l_last = torch.nn.Linear(cdim * 2, hdim)
        else:
            self.l_last = torch.nn.Linear(cdim, hdim)
        self.typ = typ

    def forward(self, xs_pad, ilens, prev_state=None):
        """RNN forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor prev_state: batch of previous RNN states
        :return: batch of hidden state sequences (B, Tmax, eprojs)
        :rtype: torch.Tensor
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))
        if not isinstance(ilens, torch.Tensor):
            ilens = torch.tensor(ilens)
        xs_pack = pack_padded_sequence(xs_pad, ilens.cpu(), batch_first=True)
        self.nbrnn.flatten_parameters()
        if prev_state is not None and self.nbrnn.bidirectional:
            # We assume that when previous state is passed,
            # it means that we're streaming the input
            # and therefore cannot propagate backward BRNN state
            # (otherwise it goes in the wrong direction)
            prev_state = reset_backward_rnn_state(prev_state)
        ys, states = self.nbrnn(xs_pack, hx=prev_state)
        # ys: utt list of frame x cdim x 2 (2: means bidirectional)
        ys_pad, ilens = pad_packed_sequence(ys, batch_first=True)
        # (sum _utt frame_utt) x dim
        projected = torch.tanh(
            self.l_last(ys_pad.contiguous().view(-1, ys_pad.size(2)))
        )
        xs_pad = projected.view(ys_pad.size(0), ys_pad.size(1), -1)
        return xs_pad, ilens, states  # x: utt list of frame x dim

def reset_backward_rnn_state(states):
    """Sets backward BRNN states to zeroes

    Useful in processing of sliding windows over the inputs
    """
    if isinstance(states, (list, tuple)):
        for state in states:
            state[1::2] = 0.0
    else:
        states[1::2] = 0.0
    return states

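
# Layout sketch (illustrative; `_demo_reset` is hypothetical): bidirectional
# hidden states are stacked as (num_layers * 2, B, H) with the backward
# direction at odd indices, which is exactly what the [1::2] slice zeroes.
def _demo_reset():
    h = torch.ones(4, 2, 3)  # 2 bidirectional layers, batch of 2, 3 units
    h = reset_backward_rnn_state(h)
    print(h[0].sum().item(), h[1].sum().item())  # 6.0 (kept) 0.0 (reset)
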
class VGG2L(torch.nn.Module):
    """VGG-like module

    :param int in_channel: number of input channels
    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        # CNN layer (VGG motivated)
        self.conv1_1 = torch.nn.Conv2d(in_channel, 64, 3, stride=1, padding=1)
        self.conv1_2 = torch.nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv2_1 = torch.nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.conv2_2 = torch.nn.Conv2d(128, 128, 3, stride=1, padding=1)

        self.in_channel = in_channel

    def forward(self, xs_pad, ilens, **kwargs):
        """VGG2L forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :return: batch of padded hidden state sequences (B, Tmax // 4, 128 * D // 4)
        :rtype: torch.Tensor
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        # x: utt x frame x dim
        # xs_pad = F.pad_sequence(xs_pad)

        # x: utt x 1 (input channel num) x frame x dim
        xs_pad = xs_pad.view(
            xs_pad.size(0),
            xs_pad.size(1),
            self.in_channel,
            xs_pad.size(2) // self.in_channel,
        ).transpose(1, 2)

        # NOTE: max_pool1d ?
        xs_pad = F.relu(self.conv1_1(xs_pad))
        xs_pad = F.relu(self.conv1_2(xs_pad))
        xs_pad = F.max_pool2d(xs_pad, 2, stride=2, ceil_mode=True)

        xs_pad = F.relu(self.conv2_1(xs_pad))
        xs_pad = F.relu(self.conv2_2(xs_pad))
        xs_pad = F.max_pool2d(xs_pad, 2, stride=2, ceil_mode=True)
        if torch.is_tensor(ilens):
            ilens = ilens.cpu().numpy()
        else:
            ilens = np.array(ilens, dtype=np.float32)
        ilens = np.array(np.ceil(ilens / 2), dtype=np.int64)
        ilens = np.array(
            np.ceil(np.array(ilens, dtype=np.float32) / 2), dtype=np.int64
        ).tolist()

        # x: utt_list of frame (remove zeropaded frames) x (input channel num x dim)
        xs_pad = xs_pad.transpose(1, 2)
        xs_pad = xs_pad.contiguous().view(
            xs_pad.size(0), xs_pad.size(1), xs_pad.size(2) * xs_pad.size(3)
        )
        return xs_pad, ilens, None  # no state in this layer

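
# Shape sketch (illustrative; `_demo_vgg2l` is hypothetical): two 2x2
# max-pool stages reduce the time and feature axes by 4 each, so the output
# is (B, T // 4, 128 * D // 4) and the returned lengths are halved twice.
def _demo_vgg2l():
    vgg = VGG2L(in_channel=1)
    xs, ilens = torch.randn(2, 16, 40), torch.tensor([16, 12])
    out, olens, _ = vgg(xs, ilens)
    print(out.shape, olens)  # torch.Size([2, 4, 1280]) [4, 3]
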
class Encoder(torch.nn.Module):
    """Encoder module

    :param str etype: type of encoder network
    :param int idim: number of dimensions of encoder network
    :param int elayers: number of layers of encoder network
    :param int eunits: number of lstm units of encoder network
    :param int eprojs: number of projection units of encoder network
    :param np.ndarray subsample: list of subsampling numbers
    :param float dropout: dropout rate
    :param int in_channel: number of input channels
    """

    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        super(Encoder, self).__init__()
        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")

        if etype.startswith("vgg"):
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList(
                    [
                        VGG2L(in_channel),
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            subsample,
                            dropout,
                            typ=typ,
                        ),
                    ]
                )
                logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
            else:
                self.enc = torch.nn.ModuleList(
                    [
                        VGG2L(in_channel),
                        RNN(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            dropout,
                            typ=typ,
                        ),
                    ]
                )
                logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
            self.conv_subsampling_factor = 4
        else:
            if etype[-1] == "p":
                self.enc = torch.nn.ModuleList(
                    [RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)]
                )
                logging.info(typ.upper() + " with every-layer projection for encoder")
            else:
                self.enc = torch.nn.ModuleList(
                    [RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)]
                )
                logging.info(typ.upper() + " without projection for encoder")
            self.conv_subsampling_factor = 1

    def forward(self, xs_pad, ilens, prev_states=None):
        """Encoder forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor prev_states: batch of previous encoder hidden states (?, ...)
        :return: batch of hidden state sequences (B, Tmax, eprojs)
        :rtype: torch.Tensor
        """
        if prev_states is None:
            prev_states = [None] * len(self.enc)
        assert len(prev_states) == len(self.enc)

        current_states = []
        for module, prev_state in zip(self.enc, prev_states):
            xs_pad, ilens, states = module(xs_pad, ilens, prev_state=prev_state)
            current_states.append(states)

        # make mask to remove bias value in padded part
        mask = to_device(xs_pad, make_pad_mask(ilens).unsqueeze(-1))

        return xs_pad.masked_fill(mask, 0.0), ilens, current_states

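
# Behavior sketch (illustrative; `_demo_encoder_mask` is hypothetical): the
# pad mask zeroes every frame beyond each utterance's true length, so the
# second utterance below is all zeros from frame olens[1] onward.
def _demo_encoder_mask():
    enc = Encoder("blstmp", 40, 2, 64, 32, [1, 1, 1], 0.0)
    xs, ilens = torch.randn(2, 8, 40), torch.tensor([8, 5])
    out, olens, _ = enc(xs, ilens)
    print(out[1, olens[1]:].abs().sum().item())  # 0.0
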
def encoder_for(args, idim, subsample):
    """Instantiates an encoder module given the program arguments

    :param Namespace args: The arguments
    :param int or List of integer idim: dimension of input, e.g. 83, or
                                        List of dimensions of inputs, e.g. [83,83]
    :param List or List of List subsample: subsample factors, e.g. [1,2,2,1,1], or
                                           List of subsample factors of each encoder.
                                           e.g. [[1,2,2,1,1], [1,2,2,1,1]]
    :rtype torch.nn.Module
    :return: The encoder module
    """
    num_encs = getattr(args, "num_encs", 1)  # use getattr to keep compatibility
    if num_encs == 1:
        # compatible with single encoder asr mode
        return Encoder(
            args.etype,
            idim,
            args.elayers,
            args.eunits,
            args.eprojs,
            subsample,
            args.dropout_rate,
        )
    elif num_encs >= 1:
        enc_list = torch.nn.ModuleList()
        for idx in range(num_encs):
            enc = Encoder(
                args.etype[idx],
                idim[idx],
                args.elayers[idx],
                args.eunits[idx],
                args.eprojs,
                subsample[idx],
                args.dropout_rate[idx],
            )
            enc_list.append(enc)
        return enc_list
    else:
        raise ValueError(
            "Number of encoders needs to be at least one. {}".format(num_encs)
        )
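
# Usage sketch (illustrative; the Namespace values are hypothetical): build
# a single VGG-BLSTMP encoder the way a training script would, then run a
# dummy batch through it. Total subsampling here is 4 (VGG) x 4 (RNNP) = 16.
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(num_encs=1, etype="vggblstmp", elayers=4,
                     eunits=320, eprojs=320, dropout_rate=0.0)
    enc = encoder_for(args, idim=80, subsample=[1, 2, 2, 1, 1])
    xs, ilens = torch.randn(2, 32, 80), torch.tensor([32, 24])
    out, olens, _ = enc(xs, ilens)
    print(out.shape)  # (B, T', eprojs) = torch.Size([2, 2, 320])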