feat: data preprocessing and training (#294)

* docs: update readme

* docs: update readme

* feat: training code

* feat: data preprocess

* docs: release training
Zhizhou Zhong
2025-04-04 22:10:03 +08:00
committed by GitHub
parent e636166b85
commit 1ab53a626b
23 changed files with 3854 additions and 6 deletions

musetalk/loss/basic_loss.py (Executable file, 81 lines)

@@ -0,0 +1,81 @@
import torch
import torch.nn.functional as F
from omegaconf import OmegaConf
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from musetalk.loss.discriminator import MultiScaleDiscriminator, DiscriminatorFullModel
import musetalk.loss.vgg_face as vgg_face
class Interpolate(nn.Module):
def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None):
super(Interpolate, self).__init__()
self.size = size
self.scale_factor = scale_factor
self.mode = mode
self.align_corners = align_corners
def forward(self, input):
        return F.interpolate(input, size=self.size, scale_factor=self.scale_factor,
                             mode=self.mode, align_corners=self.align_corners)
def set_requires_grad(net, requires_grad=False):
if net is not None:
for param in net.parameters():
param.requires_grad = requires_grad
if __name__ == "__main__":
cfg = OmegaConf.load("config/audio_adapter/E7.yaml")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    pyramid_scale = [1, 0.5, 0.25, 0.125]  # default scales; the code below reads cfg.loss_params.pyramid_scale instead
vgg_IN = vgg_face.Vgg19().to(device)
pyramid = vgg_face.ImagePyramide(cfg.loss_params.pyramid_scale, 3).to(device)
vgg_IN.eval()
downsampler = Interpolate(size=(224, 224), mode='bilinear', align_corners=False)
image = torch.rand(8, 3, 256, 256).to(device)
image_pred = torch.rand(8, 3, 256, 256).to(device)
pyramide_real = pyramid(downsampler(image))
pyramide_generated = pyramid(downsampler(image_pred))
loss_IN = 0
for scale in cfg.loss_params.pyramid_scale:
x_vgg = vgg_IN(pyramide_generated['prediction_' + str(scale)])
y_vgg = vgg_IN(pyramide_real['prediction_' + str(scale)])
for i, weight in enumerate(cfg.loss_params.vgg_layer_weight):
value = torch.abs(x_vgg[i] - y_vgg[i].detach()).mean()
loss_IN += weight * value
    loss_IN /= sum(cfg.loss_params.vgg_layer_weight)  # normalize by the VGG layer weights; the pyramid loss itself is summed over every scale
print(loss_IN)
discriminator = MultiScaleDiscriminator(**cfg.model_params.discriminator_params).to(device)
discriminator_full = DiscriminatorFullModel(discriminator)
disc_scales = cfg.model_params.discriminator_params.scales
# Prepare optimizer and loss function
optimizer_D = optim.AdamW(discriminator.parameters(),
lr=cfg.discriminator_train_params.lr,
weight_decay=cfg.discriminator_train_params.weight_decay,
betas=cfg.discriminator_train_params.betas,
eps=cfg.discriminator_train_params.eps)
scheduler_D = CosineAnnealingLR(optimizer_D,
T_max=cfg.discriminator_train_params.epochs,
eta_min=1e-6)
discriminator.train()
    set_requires_grad(discriminator, False)  # freeze D while computing the generator's adversarial loss
loss_G = 0.
discriminator_maps_generated = discriminator(pyramide_generated)
discriminator_maps_real = discriminator(pyramide_real)
for scale in disc_scales:
key = 'prediction_map_%s' % scale
value = ((1 - discriminator_maps_generated[key]) ** 2).mean()
loss_G += value
print(loss_G)
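    # Sketch of the discriminator update that optimizer_D and scheduler_D above
    # would drive (an illustrative addition, not part of the original file);
    # DiscriminatorFullModel detaches the generated images internally.
    set_requires_grad(discriminator, True)
    loss_D = discriminator_full(downsampler(image), downsampler(image_pred), gan_mode='ls')
    optimizer_D.zero_grad()
    loss_D.backward()
    optimizer_D.step()
    scheduler_D.step()
    print(loss_D)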

musetalk/loss/conv.py (Executable file, 44 lines)

@@ -0,0 +1,44 @@
import torch
from torch import nn
from torch.nn import functional as F
class Conv2d(nn.Module):
def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
super().__init__(*args, **kwargs)
self.conv_block = nn.Sequential(
nn.Conv2d(cin, cout, kernel_size, stride, padding),
nn.BatchNorm2d(cout)
)
self.act = nn.ReLU()
self.residual = residual
def forward(self, x):
out = self.conv_block(x)
if self.residual:
out += x
return self.act(out)
class nonorm_Conv2d(nn.Module):
def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
super().__init__(*args, **kwargs)
self.conv_block = nn.Sequential(
nn.Conv2d(cin, cout, kernel_size, stride, padding),
)
self.act = nn.LeakyReLU(0.01, inplace=True)
def forward(self, x):
out = self.conv_block(x)
return self.act(out)
class Conv2dTranspose(nn.Module):
def __init__(self, cin, cout, kernel_size, stride, padding, output_padding=0, *args, **kwargs):
super().__init__(*args, **kwargs)
self.conv_block = nn.Sequential(
nn.ConvTranspose2d(cin, cout, kernel_size, stride, padding, output_padding),
nn.BatchNorm2d(cout)
)
self.act = nn.ReLU()
def forward(self, x):
out = self.conv_block(x)
return self.act(out)
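# Usage sketch (an illustrative addition, not part of the original file):
# residual=True adds the input to the conv output, so it requires cin == cout
# and shape-preserving stride/padding.
if __name__ == '__main__':
    block = Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True)
    y = block(torch.rand(2, 64, 32, 32))
    print(y.shape)  # torch.Size([2, 64, 32, 32])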

musetalk/loss/discriminator.py (Executable file, 145 lines)

@@ -0,0 +1,145 @@
from torch import nn
import torch.nn.functional as F
import torch
from musetalk.loss.vgg_face import ImagePyramide
class DownBlock2d(nn.Module):
"""
    Simple downsampling block used by the discriminator (encoder).
"""
def __init__(self, in_features, out_features, norm=False, kernel_size=4, pool=False, sn=False):
super(DownBlock2d, self).__init__()
self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size)
if sn:
self.conv = nn.utils.spectral_norm(self.conv)
if norm:
self.norm = nn.InstanceNorm2d(out_features, affine=True)
else:
self.norm = None
self.pool = pool
def forward(self, x):
out = x
out = self.conv(out)
if self.norm:
out = self.norm(out)
out = F.leaky_relu(out, 0.2)
if self.pool:
out = F.avg_pool2d(out, (2, 2))
return out
class Discriminator(nn.Module):
"""
Discriminator similar to Pix2Pix
"""
def __init__(self, num_channels=3, block_expansion=64, num_blocks=4, max_features=512,
sn=False, **kwargs):
super(Discriminator, self).__init__()
down_blocks = []
for i in range(num_blocks):
down_blocks.append(
DownBlock2d(num_channels if i == 0 else min(max_features, block_expansion * (2 ** i)),
min(max_features, block_expansion * (2 ** (i + 1))),
norm=(i != 0), kernel_size=4, pool=(i != num_blocks - 1), sn=sn))
self.down_blocks = nn.ModuleList(down_blocks)
self.conv = nn.Conv2d(self.down_blocks[-1].conv.out_channels, out_channels=1, kernel_size=1)
if sn:
self.conv = nn.utils.spectral_norm(self.conv)
def forward(self, x):
feature_maps = []
out = x
for down_block in self.down_blocks:
feature_maps.append(down_block(out))
out = feature_maps[-1]
prediction_map = self.conv(out)
return feature_maps, prediction_map
class MultiScaleDiscriminator(nn.Module):
"""
    Multi-scale discriminator (one Discriminator per pyramid scale).
"""
def __init__(self, scales=(), **kwargs):
super(MultiScaleDiscriminator, self).__init__()
self.scales = scales
discs = {}
for scale in scales:
discs[str(scale).replace('.', '-')] = Discriminator(**kwargs)
self.discs = nn.ModuleDict(discs)
def forward(self, x):
out_dict = {}
for scale, disc in self.discs.items():
scale = str(scale).replace('-', '.')
key = 'prediction_' + scale
feature_maps, prediction_map = disc(x[key])
out_dict['feature_maps_' + scale] = feature_maps
out_dict['prediction_map_' + scale] = prediction_map
return out_dict
class DiscriminatorFullModel(torch.nn.Module):
"""
    Merge all discriminator-related updates into a single model for better multi-GPU usage.
"""
def __init__(self, discriminator):
super(DiscriminatorFullModel, self).__init__()
self.discriminator = discriminator
self.scales = self.discriminator.scales
print("scales",self.scales)
self.pyramid = ImagePyramide(self.scales, 3)
if torch.cuda.is_available():
self.pyramid = self.pyramid.cuda()
self.zero_tensor = None
def get_zero_tensor(self, input):
if self.zero_tensor is None:
            self.zero_tensor = torch.zeros(1, device=input.device)
self.zero_tensor.requires_grad_(False)
return self.zero_tensor.expand_as(input)
def forward(self, x, generated, gan_mode='ls'):
pyramide_real = self.pyramid(x)
pyramide_generated = self.pyramid(generated.detach())
discriminator_maps_generated = self.discriminator(pyramide_generated)
discriminator_maps_real = self.discriminator(pyramide_real)
value_total = 0
for scale in self.scales:
key = 'prediction_map_%s' % scale
if gan_mode == 'hinge':
                value = (-torch.mean(torch.min(discriminator_maps_real[key] - 1,
                                               self.get_zero_tensor(discriminator_maps_real[key])))
                         - torch.mean(torch.min(-discriminator_maps_generated[key] - 1,
                                                self.get_zero_tensor(discriminator_maps_generated[key]))))
elif gan_mode == 'ls':
value = ((1 - discriminator_maps_real[key]) ** 2 + discriminator_maps_generated[key] ** 2).mean()
else:
                raise ValueError('Unexpected gan_mode {}'.format(gan_mode))
value_total += value
return value_total
def main():
discriminator = MultiScaleDiscriminator(scales=[1],
block_expansion=32,
max_features=512,
num_blocks=4,
sn=True,
image_channel=3,
estimate_jacobian=False)
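    # Smoke test (an illustrative addition, not part of the original file):
    # MultiScaleDiscriminator consumes the dict layout produced by
    # ImagePyramide, keyed 'prediction_<scale>'.
    x = {'prediction_1': torch.rand(2, 3, 256, 256)}
    out = discriminator(x)
    print(out['prediction_map_1'].shape)
if __name__ == '__main__':
    main()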

musetalk/loss/resnet.py (Executable file, 152 lines)

@@ -0,0 +1,152 @@
import torch.nn as nn
import math
__all__ = ['ResNet', 'resnet50']
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, include_top=True):
self.inplanes = 64
super(ResNet, self).__init__()
self.include_top = include_top
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
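        # Editorial note: the converted VGGFace2 Caffe weights expect BGR
        # images in [0, 255], so RGB inputs in [0, 1] are rescaled and
        # channel-flipped below.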
x = x * 255.
x = x.flip(1)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
if not self.include_top:
return x
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet50(**kwargs):
"""Constructs a ResNet-50 model.
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
return model

musetalk/loss/syncnet.py (Executable file, 95 lines)

@@ -0,0 +1,95 @@
import torch
from torch import nn
from torch.nn import functional as F
from .conv import Conv2d
logloss = nn.BCELoss(reduction="none")
def cosine_loss(a, v, y):
d = nn.functional.cosine_similarity(a, v)
    d = d.clamp(0, 1)  # cosine_similarity lies in [-1, 1]; BCELoss raises "RuntimeError: CUDA error: device-side assert triggered" on negative inputs
loss = logloss(d.unsqueeze(1), y).squeeze()
loss = loss.mean()
return loss, d
def get_sync_loss(
audio_embed,
gt_frames,
pred_frames,
syncnet,
adapted_weight,
frames_left_index=0,
frames_right_index=16,
):
    # splice pred_frames into the ground-truth clip instead of predicting every frame, to save GPU memory
    assert pred_frames.shape[1] == (frames_right_index - frames_left_index) * 3
    # frames are stacked channel-wise as 3-channel RGB images
frames_sync_loss = torch.cat(
[gt_frames[:, :3 * frames_left_index, ...], pred_frames, gt_frames[:, 3 * frames_right_index:, ...]],
axis=1
)
vision_embed = syncnet.get_image_embed(frames_sync_loss)
y = torch.ones(frames_sync_loss.size(0), 1).float().to(audio_embed.device)
loss, score = cosine_loss(audio_embed, vision_embed, y)
return loss, score
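# Note (editorial): `syncnet` here is assumed to expose get_image_embed();
# SyncNet_color below instead returns both embeddings from forward(), so a
# thin wrapper (not shown in this commit) would be needed to use it here.
# The `adapted_weight` argument is accepted but unused in this function.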
class SyncNet_color(nn.Module):
def __init__(self):
super(SyncNet_color, self).__init__()
self.face_encoder = nn.Sequential(
Conv2d(15, 32, kernel_size=(7, 7), stride=1, padding=3),
Conv2d(32, 64, kernel_size=5, stride=(1, 2), padding=1),
Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(128, 256, kernel_size=3, stride=2, padding=1),
Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(256, 512, kernel_size=3, stride=2, padding=1),
Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(512, 512, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(512, 512, kernel_size=3, stride=2, padding=1),
Conv2d(512, 512, kernel_size=3, stride=1, padding=0),
Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
self.audio_encoder = nn.Sequential(
Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
    def forward(self, audio_sequences, face_sequences):  # audio_sequences: e.g. (B, 1, 80, 16) mel window; face_sequences: (B, 15, H, W)
face_embedding = self.face_encoder(face_sequences)
audio_embedding = self.audio_encoder(audio_sequences)
audio_embedding = audio_embedding.view(audio_embedding.size(0), -1)
face_embedding = face_embedding.view(face_embedding.size(0), -1)
audio_embedding = F.normalize(audio_embedding, p=2, dim=1)
face_embedding = F.normalize(face_embedding, p=2, dim=1)
return audio_embedding, face_embedding
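# Smoke test (an illustrative addition, not part of the original file),
# assuming Wav2Lip-style inputs: five stacked RGB mouth crops (B, 15, 48, 96)
# and one mel window (B, 1, 80, 16); both encoders reduce to (B, 512).
if __name__ == '__main__':
    net = SyncNet_color()
    a_emb, f_emb = net(torch.rand(2, 1, 80, 16), torch.rand(2, 15, 48, 96))
    print(a_emb.shape, f_emb.shape)  # torch.Size([2, 512]) twice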

musetalk/loss/vgg_face.py (Executable file, 237 lines)

@@ -0,0 +1,237 @@
'''
This module contains pretrained VGG-Face models plus the VGG-19 and
image-pyramid utilities used by the perceptual losses.
ref link: https://github.com/prlz77/vgg-face.pytorch
'''
import pickle
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.model_zoo
from torch import nn
from torchvision import models
from musetalk.loss import resnet as ResNet
MODEL_URL = "https://github.com/claudio-unipv/vggface-pytorch/releases/download/v0.1/vggface-9d491dd7c30312.pth"
VGG_FACE_PATH = '/apdcephfs_cq8/share_1367250/zhentaoyu/Driving/00_VASA/00_data/models/pretrain_models/resnet50_ft_weight.pkl'
# It was 93.5940, 104.7624, 129.1863 before dividing by 255
MEAN_RGB = [
0.367035294117647,
0.41083294117647057,
0.5066129411764705
]
def load_state_dict(model, fname):
"""
Set parameters converted from Caffe models authors of VGGFace2 provide.
See https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/.
Arguments:
model: model
fname: file name of parameters converted from a Caffe model, assuming the file format is Pickle.
"""
with open(fname, 'rb') as f:
weights = pickle.load(f, encoding='latin1')
own_state = model.state_dict()
for name, param in weights.items():
if name in own_state:
try:
own_state[name].copy_(torch.from_numpy(param))
except Exception:
                raise RuntimeError('While copying the parameter named {}, whose dimensions in the model are {} and whose '
                                   'dimensions in the checkpoint are {}.'.format(name, own_state[name].size(), param.shape))
else:
raise KeyError('unexpected key "{}" in state_dict'.format(name))
def vggface2(pretrained=True):
    vggface = ResNet.resnet50(num_classes=8631, include_top=True)
    if pretrained:
        load_state_dict(vggface, VGG_FACE_PATH)
    return vggface
def vggface(pretrained=False, **kwargs):
"""VGGFace model.
Args:
pretrained (bool): If True, returns pre-trained model
"""
model = VggFace(**kwargs)
if pretrained:
state = torch.utils.model_zoo.load_url(MODEL_URL)
model.load_state_dict(state)
return model
class VggFace(torch.nn.Module):
def __init__(self, classes=2622):
"""VGGFace model.
Face recognition network. It takes as input a Bx3x224x224
batch of face images and gives as output a BxC score vector
(C is the number of identities).
Input images need to be scaled in the 0-1 range and then
normalized with respect to the mean RGB used during training.
Args:
classes (int): number of identities recognized by the
network
"""
super().__init__()
self.conv1 = _ConvBlock(3, 64, 64)
self.conv2 = _ConvBlock(64, 128, 128)
self.conv3 = _ConvBlock(128, 256, 256, 256)
self.conv4 = _ConvBlock(256, 512, 512, 512)
self.conv5 = _ConvBlock(512, 512, 512, 512)
self.dropout = torch.nn.Dropout(0.5)
self.fc1 = torch.nn.Linear(7 * 7 * 512, 4096)
self.fc2 = torch.nn.Linear(4096, 4096)
self.fc3 = torch.nn.Linear(4096, classes)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.conv5(x)
x = x.view(x.size(0), -1)
x = self.dropout(F.relu(self.fc1(x)))
x = self.dropout(F.relu(self.fc2(x)))
x = self.fc3(x)
return x
class _ConvBlock(torch.nn.Module):
"""A Convolutional block."""
def __init__(self, *units):
"""Create a block with len(units) - 1 convolutions.
convolution number i transforms the number of channels from
units[i - 1] to units[i] channels.
"""
super().__init__()
self.convs = torch.nn.ModuleList([
torch.nn.Conv2d(in_, out, 3, 1, 1)
for in_, out in zip(units[:-1], units[1:])
])
def forward(self, x):
# Each convolution is followed by a ReLU, then the block is
# concluded by a max pooling.
for c in self.convs:
x = F.relu(c(x))
return F.max_pool2d(x, 2, 2, 0, ceil_mode=True)
class Vgg19(torch.nn.Module):
"""
Vgg19 network for perceptual loss.
"""
def __init__(self, requires_grad=False):
super(Vgg19, self).__init__()
vgg_pretrained_features = models.vgg19(pretrained=True).features
self.slice1 = torch.nn.Sequential()
self.slice2 = torch.nn.Sequential()
self.slice3 = torch.nn.Sequential()
self.slice4 = torch.nn.Sequential()
self.slice5 = torch.nn.Sequential()
for x in range(2):
self.slice1.add_module(str(x), vgg_pretrained_features[x])
for x in range(2, 7):
self.slice2.add_module(str(x), vgg_pretrained_features[x])
for x in range(7, 12):
self.slice3.add_module(str(x), vgg_pretrained_features[x])
for x in range(12, 21):
self.slice4.add_module(str(x), vgg_pretrained_features[x])
for x in range(21, 30):
self.slice5.add_module(str(x), vgg_pretrained_features[x])
self.mean = torch.nn.Parameter(data=torch.Tensor(np.array([0.485, 0.456, 0.406]).reshape((1, 3, 1, 1))),
requires_grad=False)
self.std = torch.nn.Parameter(data=torch.Tensor(np.array([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))),
requires_grad=False)
if not requires_grad:
for param in self.parameters():
param.requires_grad = False
def forward(self, X):
X = (X - self.mean) / self.std
h_relu1 = self.slice1(X)
h_relu2 = self.slice2(h_relu1)
h_relu3 = self.slice3(h_relu2)
h_relu4 = self.slice4(h_relu3)
h_relu5 = self.slice5(h_relu4)
out = [h_relu1, h_relu2, h_relu3, h_relu4, h_relu5]
return out
class AntiAliasInterpolation2d(nn.Module):
"""
Band-limited downsampling, for better preservation of the input signal.
"""
def __init__(self, channels, scale):
super(AntiAliasInterpolation2d, self).__init__()
sigma = (1 / scale - 1) / 2
kernel_size = 2 * round(sigma * 4) + 1
self.ka = kernel_size // 2
self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka
kernel_size = [kernel_size, kernel_size]
sigma = [sigma, sigma]
# The gaussian kernel is the product of the
# gaussian function of each dimension.
kernel = 1
meshgrids = torch.meshgrid(
[
torch.arange(size, dtype=torch.float32)
for size in kernel_size
]
)
for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
mean = (size - 1) / 2
kernel *= torch.exp(-(mgrid - mean) ** 2 / (2 * std ** 2))
# Make sure sum of values in gaussian kernel equals 1.
kernel = kernel / torch.sum(kernel)
# Reshape to depthwise convolutional weight
kernel = kernel.view(1, 1, *kernel.size())
kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
self.register_buffer('weight', kernel)
self.groups = channels
self.scale = scale
inv_scale = 1 / scale
self.int_inv_scale = int(inv_scale)
def forward(self, input):
if self.scale == 1.0:
return input
out = F.pad(input, (self.ka, self.kb, self.ka, self.kb))
out = F.conv2d(out, weight=self.weight, groups=self.groups)
out = out[:, :, ::self.int_inv_scale, ::self.int_inv_scale]
return out
class ImagePyramide(torch.nn.Module):
"""
Create image pyramide for computing pyramide perceptual loss.
"""
def __init__(self, scales, num_channels):
super(ImagePyramide, self).__init__()
downs = {}
for scale in scales:
downs[str(scale).replace('.', '-')] = AntiAliasInterpolation2d(num_channels, scale)
self.downs = nn.ModuleDict(downs)
def forward(self, x):
out_dict = {}
for scale, down_module in self.downs.items():
out_dict['prediction_' + str(scale).replace('-', '.')] = down_module(x)
return out_dict
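# Usage sketch (an illustrative addition, not part of the original file):
# the output keys mirror the scales handed in.
if __name__ == '__main__':
    pyramid = ImagePyramide([1, 0.5, 0.25], 3)
    for key, value in pyramid(torch.rand(2, 3, 224, 224)).items():
        print(key, tuple(value.shape))
    # prediction_1 (2, 3, 224, 224), prediction_0.5 (2, 3, 112, 112), ...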