Merge pull request #75 from shivammehta25/dev

Adding alignment information to readme
2026-02-04 17:59:19 +08:00 · 2024-05-27 13:57:49 +02:00 · 2024-05-27 13:57:10 +02:00 · 2024-05-27 13:54:27 +02:00 · 2024-05-27 13:50:21 +02:00 · 2024-05-27 13:40:02 +02:00
23 changed files with 467 additions and 143 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,9 @@
 default_language_version:
-  python: python3.10
+  python: python3.11
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
    hooks:
      # list of supported hooks: https://pre-commit.com/hooks.html
      - id: trailing-whitespace
@@ -18,28 +18,28 @@ repos:
  # python code formatting
  - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.12.1
    hooks:
      - id: black
        args: [--line-length, "120"]
  # python import sorting
  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 5.13.2
    hooks:
      - id: isort
        args: ["--profile", "black", "--filter-files"]
  # python upgrading syntax to newer version
  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.3.1
+    rev: v3.15.0
    hooks:
      - id: pyupgrade
        args: [--py38-plus]
  # python check (PEP8), programming errors and code complexity
  - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 7.0.0
    hooks:
      - id: flake8
        args:
@@ -54,6 +54,6 @@ repos:
  # pylint
  - repo: https://github.com/pycqa/pylint
-    rev: v2.8.2
+    rev: v3.0.3
    hooks:
    -   id: pylint
--- a/.pylintrc
+++ b/.pylintrc
@@ -82,16 +82,6 @@ disable=missing-docstring,
        no-name-in-module,
        no-member,
        unsubscriptable-object,
        print-statement,
        parameter-unpacking,
        unpacking-in-except,
        old-raise-syntax,
        backtick,
        long-suffix,
        old-ne-operator,
        old-octal-literal,
        import-star-module-level,
        non-ascii-bytes-literal,
        raw-checker-failed,
        bad-inline-option,
        locally-disabled,
@@ -106,67 +96,6 @@ disable=missing-docstring,
        too-many-arguments,
        too-many-locals,
        too-many-statements,
        apply-builtin,
        basestring-builtin,
        buffer-builtin,
        cmp-builtin,
        coerce-builtin,
        execfile-builtin,
        file-builtin,
        long-builtin,
        raw_input-builtin,
        reduce-builtin,
        standarderror-builtin,
        unicode-builtin,
        xrange-builtin,
        coerce-method,
        delslice-method,
        getslice-method,
        setslice-method,
        no-absolute-import,
        old-division,
        dict-iter-method,
        dict-view-method,
        next-method-called,
        metaclass-assignment,
        indexing-exception,
        raising-string,
        reload-builtin,
        oct-method,
        hex-method,
        nonzero-method,
        cmp-method,
        input-builtin,
        round-builtin,
        intern-builtin,
        unichr-builtin,
        map-builtin-not-iterating,
        zip-builtin-not-iterating,
        range-builtin-not-iterating,
        filter-builtin-not-iterating,
        using-cmp-argument,
        eq-without-hash,
        div-method,
        idiv-method,
        rdiv-method,
        exception-message-attribute,
        invalid-str-codec,
        sys-max-int,
        bad-python3-import,
        deprecated-string-function,
        deprecated-str-translate-call,
        deprecated-itertools-function,
        deprecated-types-field,
        next-method-defined,
        dict-items-not-iterating,
        dict-keys-not-iterating,
        dict-values-not-iterating,
        deprecated-operator-function,
        deprecated-urllib-function,
        xreadlines-attribute,
        deprecated-sys-function,
        exception-escape,
        comprehension-escape,
        duplicate-code,
        not-callable,
        import-outside-toplevel,
@@ -363,13 +292,6 @@ max-line-length=120
 # Maximum number of lines in a module.
 max-module-lines=1000
 # List of optional constructs for which whitespace checking is disabled. `dict-
 # separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
 # `trailing-comma` allows a space between comma and closing bracket: (a, ).
 # `empty-line` allows space-only lines.
 no-space-check=trailing-comma,
               dict-separator
 # Allow the body of a class to be on the same line as the declaration if body
 # contains single statement.
 single-line-class-stmt=no
@@ -599,5 +521,5 @@ min-public-methods=2
 # Exceptions that will emit a warning when being caught. Defaults to
 # "BaseException, Exception".
-overgeneral-exceptions=BaseException,
+overgeneral-exceptions=builtins.BaseException,
-                       Exception
+                       builtins.Exception
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
 </div>
-> This is the official code implementation of 🍵 Matcha-TTS.
+> This is the official code implementation of 🍵 Matcha-TTS [ICASSP 2024].
 We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses [conditional flow matching](https://arxiv.org/abs/2210.02747) (similar to [rectified flows](https://arxiv.org/abs/2209.03003)) to speed up ODE-based speech synthesis. Our method:
@@ -26,13 +26,13 @@ We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, tha
 - Sounds highly natural
 - Is very fast to synthesise from
-Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our arXiv preprint](https://arxiv.org/abs/2309.03199) for more details.
+Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our ICASSP 2024 paper](https://arxiv.org/abs/2309.03199) for more details.
 [Pre-trained models](https://drive.google.com/drive/folders/17C_gYgEHOxI5ZypcfE_k1piKCtyR0isJ?usp=sharing) will be automatically downloaded with the CLI or gradio interface.
-[Try 🍵 Matcha-TTS on HuggingFace 🤗 spaces!](https://huggingface.co/spaces/shivammehta25/Matcha-TTS)
+You can also [try 🍵 Matcha-TTS in your browser on HuggingFace 🤗 spaces](https://huggingface.co/spaces/shivammehta25/Matcha-TTS).
-## Watch the teaser
+## Teaser video
 [![Watch the video](https://img.youtube.com/vi/xmvJkz3bqw0/hqdefault.jpg)](https://youtu.be/xmvJkz3bqw0)
@@ -252,16 +252,53 @@ python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --vo
 This will write `.wav` audio files to the output directory.
 ## Extract phoneme alignments from Matcha-TTS
 If the dataset is structured as
 ```bash
 data/
 └── LJSpeech-1.1
    ├── metadata.csv
    ├── README
    ├── test.txt
    ├── train.txt
    ├── val.txt
    └── wavs
 ```
 Then you can extract the phoneme-level alignments from a trained Matcha-TTS model using:
 ```bash
 python  matcha/utils/get_durations_from_trained_model.py -i dataset_yaml -c <checkpoint>
 ```
 Example:
 ```bash
 python  matcha/utils/get_durations_from_trained_model.py -i ljspeech.yaml -c matcha_ljspeech.ckpt
 ```
 or simply:
 ```bash
 matcha-tts-get-durations -i ljspeech.yaml -c matcha_ljspeech.ckpt
 ```
 ---
 ## Train using extracted alignments
 In the dataset config, turn on `load_durations`.
 Example: `ljspeech.yaml`
 ```
 load_durations: True
 ```
 or see an example in configs/experiment/ljspeech_from_durations.yaml
 ## Citation information
 If you use our code or otherwise find this work useful, please cite our paper:
 ```text
-@article{mehta2023matcha,
+@inproceedings{mehta2024matcha,
-  title={Matcha-TTS: A fast TTS architecture with conditional flow matching},
+  title={Matcha-{TTS}: A fast {TTS} architecture with conditional flow matching},
  author={Mehta, Shivam and Tu, Ruibo and Beskow, Jonas and Sz{\'e}kely, {\'E}va and Henter, Gustav Eje},
-  journal={arXiv preprint arXiv:2309.03199},
+  booktitle={Proc. ICASSP},
-  year={2023}
+  year={2024}
 }
 ```
@@ -269,7 +306,7 @@ If you use our code or otherwise find this work useful, please cite our paper:
 Since this code uses [Lightning-Hydra-Template](https://github.com/ashleve/lightning-hydra-template), you have all the powers that come with it.
-Other source code I would like to acknowledge:
+Other source code we would like to acknowledge:
 - [Coqui-TTS](https://github.com/coqui-ai/TTS/tree/dev): For helping me figure out how to make cython binaries pip installable and encouragement
 - [Hugging Face Diffusers](https://huggingface.co/): For their awesome diffusers library and its components
--- a/configs/data/hi-fi_en-US_female.yaml
+++ b/configs/data/hi-fi_en-US_female.yaml
@@ -0,0 +1,14 @@
 # Data config for the Hi-Fi-CAPTAIN en-US female dataset.
 # Settings not listed here are inherited from the `ljspeech` data config via the defaults list.
 defaults:
  - ljspeech
  - _self_
 # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/
 _target_: matcha.data.text_mel_datamodule.TextMelDataModule
 name: hi-fi_en-US_female
 train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt
 valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt
 batch_size: 32
 # NOTE(review): `english_cleaners_piper` is shipped commented out in matcha/text/cleaners.py;
 # it has to be enabled there (and piper-phonemize installed) before this config can be used.
 cleaners: [english_cleaners_piper]
 data_statistics:  # Computed for this dataset
  mel_mean: -6.38385
  mel_std: 2.541796
--- a/configs/data/ljspeech.yaml
+++ b/configs/data/ljspeech.yaml
@@ -1,7 +1,7 @@
 _target_: matcha.data.text_mel_datamodule.TextMelDataModule
 name: ljspeech
-train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt
+train_filelist_path: data/LJSpeech-1.1/train.txt
-valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt
+valid_filelist_path: data/LJSpeech-1.1/val.txt
 batch_size: 32
 num_workers: 20
 pin_memory: True
@@ -19,3 +19,4 @@ data_statistics:  # Computed for ljspeech dataset
  mel_mean: -5.536622
  mel_std: 2.116101
 seed: ${seed}
 load_durations: false
--- a/configs/experiment/hifi_dataset_piper_phonemizer.yaml
+++ b/configs/experiment/hifi_dataset_piper_phonemizer.yaml
@@ -0,0 +1,14 @@
 # @package _global_
 # to execute this experiment run:
 # python train.py experiment=hifi_dataset_piper_phonemizer
 defaults:
  - override /data: hi-fi_en-US_female.yaml
 # all parameters below will be merged with parameters from default configurations set above
 # this allows you to overwrite only specified parameters
 tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"]
 run_name: hi-fi_en-US_female_piper_phonemizer
--- a/configs/experiment/ljspeech_from_durations.yaml
+++ b/configs/experiment/ljspeech_from_durations.yaml
@@ -0,0 +1,19 @@
 # @package _global_
 # to execute this experiment run:
 # python train.py experiment=ljspeech_from_durations
 defaults:
  - override /data: ljspeech.yaml
 # all parameters below will be merged with parameters from default configurations set above
 # this allows you to overwrite only specified parameters
 tags: ["ljspeech"]
 run_name: ljspeech
 data:
  load_durations: True
  batch_size: 64
--- a/configs/model/matcha.yaml
+++ b/configs/model/matcha.yaml
@@ -12,3 +12,5 @@ spk_emb_dim: 64
 n_feats: 80
 data_statistics: ${data.data_statistics}
 out_size: null # Must be divisible by 4
 prior_loss: true
 use_precomputed_durations: ${data.load_durations}
--- a/matcha/VERSION
+++ b/matcha/VERSION
@@ -1 +1 @@
-0.0.4
+0.0.6.0
--- a/matcha/app.py
+++ b/matcha/app.py
@@ -29,8 +29,15 @@ args = Namespace(
 CURRENTLY_LOADED_MODEL = args.model
-MATCHA_TTS_LOC = lambda x: LOCATION / f"{x}.ckpt"  # noqa: E731
+
-VOCODER_LOC = lambda x: LOCATION / f"{x}"  # noqa: E731
+def MATCHA_TTS_LOC(x):
    return LOCATION / f"{x}.ckpt"
 def VOCODER_LOC(x):
    return LOCATION / f"{x}"
 LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png"
 RADIO_OPTIONS = {
    "Multi Speaker (VCTK)": {
--- a/matcha/cli.py
+++ b/matcha/cli.py
@@ -18,13 +18,13 @@ from matcha.text import sequence_to_text, text_to_sequence
 from matcha.utils.utils import assert_model_downloaded, get_user_data_dir, intersperse
 MATCHA_URLS = {
-    "matcha_ljspeech": "https://drive.google.com/file/d/1BBzmMU7k3a_WetDfaFblMoN18GqQeHCg/view?usp=drive_link",
+    "matcha_ljspeech": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_ljspeech.ckpt",
-    "matcha_vctk": "https://drive.google.com/file/d/1enuxmfslZciWGAl63WGh2ekVo00FYuQ9/view?usp=drive_link",
+    "matcha_vctk": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_vctk.ckpt",
 }
 VOCODER_URLS = {
-    "hifigan_T2_v1": "https://drive.google.com/file/d/14NENd4equCBLyyCSke114Mv6YR_j_uFs/view?usp=drive_link",
+    "hifigan_T2_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/generator_v1",  # Old url: https://drive.google.com/file/d/14NENd4equCBLyyCSke114Mv6YR_j_uFs/view?usp=drive_link
-    "hifigan_univ_v1": "https://drive.google.com/file/d/1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW/view?usp=drive_link",
+    "hifigan_univ_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/g_02500000",  # Old url: https://drive.google.com/file/d/1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW/view?usp=drive_link
 }
 MULTISPEAKER_MODEL = {
@@ -48,7 +48,7 @@ def plot_spectrogram_to_numpy(spectrogram, filename):
 def process_text(i: int, text: str, device: torch.device):
    print(f"[{i}] - Input text: {text}")
    x = torch.tensor(
-        intersperse(text_to_sequence(text, ["english_cleaners2"]), 0),
+        intersperse(text_to_sequence(text, ["english_cleaners2"])[0], 0),
        dtype=torch.long,
        device=device,
    )[None]
@@ -63,7 +63,7 @@ def get_texts(args):
    if args.text:
        texts = [args.text]
    else:
-        with open(args.file) as f:
+        with open(args.file, encoding="utf-8") as f:
            texts = f.readlines()
    return texts
@@ -140,7 +140,7 @@ def validate_args(args):
    if args.checkpoint_path is None:
        # When using pretrained models
-        if args.model in SINGLESPEAKER_MODEL.keys():
+        if args.model in SINGLESPEAKER_MODEL:
            args = validate_args_for_single_speaker_model(args)
        if args.model in MULTISPEAKER_MODEL:
@@ -326,12 +326,13 @@ def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
    for i, batch in enumerate(dataloader):
        i = i + 1
        start_t = dt.datetime.now()
        b = batch["x"].shape[0]
        output = model.synthesise(
            batch["x"].to(device),
            batch["x_lengths"].to(device),
            n_timesteps=args.steps,
            temperature=args.temperature,
-            spks=spk,
+            spks=spk.expand(b) if spk is not None else spk,
            length_scale=args.speaking_rate,
        )
--- a/matcha/data/text_mel_datamodule.py
+++ b/matcha/data/text_mel_datamodule.py
@@ -1,6 +1,8 @@
 import random
 from pathlib import Path
 from typing import Any, Dict, Optional
 import numpy as np
 import torch
 import torchaudio as ta
 from lightning import LightningDataModule
@@ -39,6 +41,7 @@ class TextMelDataModule(LightningDataModule):
        f_max,
        data_statistics,
        seed,
        load_durations,
    ):
        super().__init__()
@@ -68,6 +71,7 @@ class TextMelDataModule(LightningDataModule):
            self.hparams.f_max,
            self.hparams.data_statistics,
            self.hparams.seed,
            self.hparams.load_durations,
        )
        self.validset = TextMelDataset(  # pylint: disable=attribute-defined-outside-init
            self.hparams.valid_filelist_path,
@@ -83,6 +87,7 @@ class TextMelDataModule(LightningDataModule):
            self.hparams.f_max,
            self.hparams.data_statistics,
            self.hparams.seed,
            self.hparams.load_durations,
        )
    def train_dataloader(self):
@@ -109,7 +114,7 @@ class TextMelDataModule(LightningDataModule):
        """Clean up after fit or test."""
        pass  # pylint: disable=unnecessary-pass
-    def state_dict(self):  # pylint: disable=no-self-use
+    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}
@@ -134,6 +139,7 @@ class TextMelDataset(torch.utils.data.Dataset):
        f_max=8000,
        data_parameters=None,
        seed=None,
        load_durations=False,
    ):
        self.filepaths_and_text = parse_filelist(filelist_path)
        self.n_spks = n_spks
@@ -146,6 +152,8 @@ class TextMelDataset(torch.utils.data.Dataset):
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        self.load_durations = load_durations
        if data_parameters is not None:
            self.data_parameters = data_parameters
        else:
@@ -164,10 +172,29 @@ class TextMelDataset(torch.utils.data.Dataset):
            filepath, text = filepath_and_text[0], filepath_and_text[1]
            spk = None
-        text = self.get_text(text, add_blank=self.add_blank)
+        text, cleaned_text = self.get_text(text, add_blank=self.add_blank)
        mel = self.get_mel(filepath)
-        return {"x": text, "y": mel, "spk": spk}
+        durations = self.get_durations(filepath, text) if self.load_durations else None
        return {"x": text, "y": mel, "spk": spk, "filepath": filepath, "x_text": cleaned_text, "durations": durations}
    def get_durations(self, filepath, text):
        """Load the precomputed durations that belong to one audio file.

        The durations are read from `<audio dir>/../durations/<audio stem>.npy` and cast to integers.

        Args:
            filepath: Path of the audio file the durations belong to.
            text: Token sequence of the utterance; only its length is used, to validate the durations.

        Returns:
            torch.Tensor: Integer durations, one entry per element of `text`.

        Raises:
            FileNotFoundError: If no durations file exists for this audio file.
            AssertionError: If the number of durations differs from the length of `text`.
        """
        audio_path = Path(filepath)
        # The `durations` folder is a sibling of the folder that contains the audio file.
        dur_loc = audio_path.parent.parent / "durations" / f"{audio_path.stem}.npy"
        try:
            raw_durations = np.load(dur_loc)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Tried loading the durations but durations didn't exist at {dur_loc}, make sure you've generate the durations first using: python matcha/utils/get_durations_from_trained_model.py \n"
            ) from e
        durs = torch.from_numpy(raw_durations.astype(int))
        assert len(durs) == len(text), f"Length of durations {len(durs)} and text {len(text)} do not match"
        return durs
    def get_mel(self, filepath):
        audio, sr = ta.load(filepath)
@@ -187,11 +214,11 @@ class TextMelDataset(torch.utils.data.Dataset):
        return mel
    def get_text(self, text, add_blank=True):
-        text_norm = text_to_sequence(text, self.cleaners)
+        text_norm, cleaned_text = text_to_sequence(text, self.cleaners)
        if self.add_blank:
            text_norm = intersperse(text_norm, 0)
        text_norm = torch.IntTensor(text_norm)
-        return text_norm
+        return text_norm, cleaned_text
    def __getitem__(self, index):
        datapoint = self.get_datapoint(self.filepaths_and_text[index])
@@ -214,8 +241,11 @@ class TextMelBatchCollate:
        y = torch.zeros((B, n_feats, y_max_length), dtype=torch.float32)
        x = torch.zeros((B, x_max_length), dtype=torch.long)
        durations = torch.zeros((B, x_max_length), dtype=torch.long)
        y_lengths, x_lengths = [], []
        spks = []
        filepaths, x_texts = [], []
        for i, item in enumerate(batch):
            y_, x_ = item["y"], item["x"]
            y_lengths.append(y_.shape[-1])
@@ -223,9 +253,22 @@ class TextMelBatchCollate:
            y[i, :, : y_.shape[-1]] = y_
            x[i, : x_.shape[-1]] = x_
            spks.append(item["spk"])
            filepaths.append(item["filepath"])
            x_texts.append(item["x_text"])
            if item["durations"] is not None:
                durations[i, : item["durations"].shape[-1]] = item["durations"]
        y_lengths = torch.tensor(y_lengths, dtype=torch.long)
        x_lengths = torch.tensor(x_lengths, dtype=torch.long)
        spks = torch.tensor(spks, dtype=torch.long) if self.n_spks > 1 else None
-        return {"x": x, "x_lengths": x_lengths, "y": y, "y_lengths": y_lengths, "spks": spks}
+        return {
            "x": x,
            "x_lengths": x_lengths,
            "y": y,
            "y_lengths": y_lengths,
            "spks": spks,
            "filepaths": filepaths,
            "x_texts": x_texts,
            "durations": durations if not torch.eq(durations, 0).all() else None,
        }
--- a/matcha/models/baselightningmodule.py
+++ b/matcha/models/baselightningmodule.py
@@ -58,13 +58,14 @@ class BaseLightningClass(LightningModule, ABC):
        y, y_lengths = batch["y"], batch["y_lengths"]
        spks = batch["spks"]
-        dur_loss, prior_loss, diff_loss = self(
+        dur_loss, prior_loss, diff_loss, *_ = self(
            x=x,
            x_lengths=x_lengths,
            y=y,
            y_lengths=y_lengths,
            spks=spks,
            out_size=self.out_size,
            durations=batch["durations"],
        )
        return {
            "dur_loss": dur_loss,
@@ -81,7 +82,7 @@ class BaseLightningClass(LightningModule, ABC):
            "step",
            float(self.global_step),
            on_step=True,
-            on_epoch=True,
+            prog_bar=True,
            logger=True,
            sync_dist=True,
        )
--- a/matcha/models/components/flow_matching.py
+++ b/matcha/models/components/flow_matching.py
@@ -73,16 +73,14 @@ class BASECFM(torch.nn.Module, ABC):
        # Or in future might add like a return_all_steps flag
        sol = []
-        steps = 1
+        for step in range(1, len(t_span)):
        while steps <= len(t_span) - 1:
            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
-            if steps < len(t_span) - 1:
+            if step < len(t_span) - 1:
-                dt = t_span[steps + 1] - t
+                dt = t_span[step + 1] - t
            steps += 1
        return sol[-1]
--- a/matcha/models/matcha_tts.py
+++ b/matcha/models/matcha_tts.py
@@ -34,6 +34,8 @@ class MatchaTTS(BaseLightningClass):  # 🍵
        out_size,
        optimizer=None,
        scheduler=None,
        prior_loss=True,
        use_precomputed_durations=False,
    ):
        super().__init__()
@@ -44,6 +46,8 @@ class MatchaTTS(BaseLightningClass):  # 🍵
        self.spk_emb_dim = spk_emb_dim
        self.n_feats = n_feats
        self.out_size = out_size
        self.prior_loss = prior_loss
        self.use_precomputed_durations = use_precomputed_durations
        if n_spks > 1:
            self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim)
@@ -145,7 +149,7 @@ class MatchaTTS(BaseLightningClass):  # 🍵
            "rtf": rtf,
        }
-    def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=None):
+    def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=None, durations=None):
        """
        Computes 3 losses:
            1. duration loss: loss between predicted token durations and those extracted by Monotonic Alignment Search (MAS).
@@ -177,17 +181,20 @@ class MatchaTTS(BaseLightningClass):  # 🍵
        y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask)
        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
-        # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
+        if self.use_precomputed_durations:
-        with torch.no_grad():
+            attn = generate_path(durations.squeeze(1), attn_mask.squeeze(1))
-            const = -0.5 * math.log(2 * math.pi) * self.n_feats
+        else:
-            factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
+            # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
-            y_square = torch.matmul(factor.transpose(1, 2), y**2)
+            with torch.no_grad():
-            y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
+                const = -0.5 * math.log(2 * math.pi) * self.n_feats
-            mu_square = torch.sum(factor * (mu_x**2), 1).unsqueeze(-1)
+                factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
-            log_prior = y_square - y_mu_double + mu_square + const
+                y_square = torch.matmul(factor.transpose(1, 2), y**2)
                y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
                mu_square = torch.sum(factor * (mu_x**2), 1).unsqueeze(-1)
                log_prior = y_square - y_mu_double + mu_square + const
-            attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
+                attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
-            attn = attn.detach()
+                attn = attn.detach()  # b, t_text, T_mel
        # Compute loss between predicted log-scaled durations and those obtained from MAS
        # referred to as prior loss in the paper
@@ -228,7 +235,10 @@ class MatchaTTS(BaseLightningClass):  # 🍵
        # Compute loss of the decoder
        diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond)
-        prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
+        if self.prior_loss:
-        prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)
+            prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
            prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)
        else:
            prior_loss = 0
-        return dur_loss, prior_loss, diff_loss
+        return dur_loss, prior_loss, diff_loss, attn
--- a/matcha/text/init.py
+++ b/matcha/text/init.py
@@ -21,7 +21,7 @@ def text_to_sequence(text, cleaner_names):
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
-    return sequence
+    return sequence, clean_text
 def cleaned_text_to_sequence(cleaned_text):
--- a/matcha/text/cleaners.py
+++ b/matcha/text/cleaners.py
@@ -103,3 +103,19 @@ def english_cleaners2(text):
    phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0]
    phonemes = collapse_whitespace(phonemes)
    return phonemes
 # I am removing this due to incompatibility with several versions of python
 # However, if you want to use it, you can uncomment it
 # and install piper-phonemize with the following command:
 # pip install piper-phonemize
 # import piper_phonemize
 # def english_cleaners_piper(text):
 #     """Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
 #     text = convert_to_ascii(text)
 #     text = lowercase(text)
 #     text = expand_abbreviations(text)
 #     phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0])
 #     phonemes = collapse_whitespace(phonemes)
 #     return phonemes
--- a/matcha/utils/generate_data_statistics.py
+++ b/matcha/utils/generate_data_statistics.py
@@ -94,6 +94,7 @@ def main():
        cfg["batch_size"] = args.batch_size
        cfg["train_filelist_path"] = str(os.path.join(root_path, cfg["train_filelist_path"]))
        cfg["valid_filelist_path"] = str(os.path.join(root_path, cfg["valid_filelist_path"]))
        cfg["load_durations"] = False
    text_mel_datamodule = TextMelDataModule(**cfg)
    text_mel_datamodule.setup()
--- a/matcha/utils/get_durations_from_trained_model.py
+++ b/matcha/utils/get_durations_from_trained_model.py
@@ -0,0 +1,195 @@
 r"""
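 Extracts token-level durations from the alignments of a trained Matcha-TTS checkpoint and saves them, one `.npy`
 file (plus a `.json` file produced by `get_phoneme_durations`) per utterance, in the output folder.
 The dataset settings are read from the selected yaml config under configs/data.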
 """
 import argparse
 import json
 import os
 import sys
 from pathlib import Path
 import lightning
 import numpy as np
 import rootutils
 import torch
 from hydra import compose, initialize
 from omegaconf import open_dict
 from torch import nn
 from tqdm.auto import tqdm
 from matcha.cli import get_device
 from matcha.data.text_mel_datamodule import TextMelDataModule
 from matcha.models.matcha_tts import MatchaTTS
 from matcha.utils.logging_utils import pylogger
 from matcha.utils.utils import get_phoneme_durations
 log = pylogger.get_pylogger(__name__)
 def save_durations_to_folder(
    attn: torch.Tensor, x_length: int, y_length: int, filepath: str, output_folder: Path, text: str
 ):
    """Save the durations of a single utterance as `<stem>.npy` and `<stem>.json` in `output_folder`.

    Args:
        attn (torch.Tensor): Alignment of one utterance; after squeezing, it is summed over dim 1 to get one
            duration per text token.
        x_length (int): Number of valid (unpadded) text tokens; durations beyond it are dropped.
        y_length (int): Accepted for interface compatibility; not used here.
        filepath (str): Path of the audio file, only used to derive the output file name.
        output_folder (Path): Existing folder the two files are written to.
        text (str): Cleaned text of the utterance, passed on to `get_phoneme_durations`.
    """
    durations = attn.squeeze().sum(1)[:x_length].numpy()
    durations_json = get_phoneme_durations(durations, text)
    # Derive the name from the stem instead of replacing ".wav": this also works for other audio extensions and
    # matches the `<stem>.npy` name that the dataset looks for when loading durations.
    output = output_folder / f"{Path(filepath).stem}.npy"
    with open(output.with_suffix(".json"), "w", encoding="utf-8") as f:
        json.dump(durations_json, f, indent=4, ensure_ascii=False)
    np.save(output, durations)
@torch.inference_mode()
 def compute_durations(data_loader: torch.utils.data.DataLoader, model: nn.Module, device: torch.device, output_folder):
    """Generate durations from the model for each datapoint and save it in a folder
    Args:
        data_loader (torch.utils.data.DataLoader): Dataloader
        model (nn.Module): MatchaTTS model
        device (torch.device): GPU or CPU
    """
    for batch in tqdm(data_loader, desc="🍵 Computing durations 🍵:"):
        x, x_lengths = batch["x"], batch["x_lengths"]
        y, y_lengths = batch["y"], batch["y_lengths"]
        spks = batch["spks"]
        x = x.to(device)
        y = y.to(device)
        x_lengths = x_lengths.to(device)
        y_lengths = y_lengths.to(device)
        spks = spks.to(device) if spks is not None else None
        _, _, _, attn = model(
            x=x,
            x_lengths=x_lengths,
            y=y,
            y_lengths=y_lengths,
            spks=spks,
        )
        attn = attn.cpu()
        for i in range(attn.shape[0]):
            save_durations_to_folder(
                attn[i],
                x_lengths[i].item(),
                y_lengths[i].item(),
                batch["filepaths"][i],
                output_folder,
                batch["x_texts"][i],
            )
 def main():
    """Command-line entry point: compute durations with a trained checkpoint for a dataset config.

    Loads the data config given by `-i` from configs/data, loads the checkpoint given by `-c`, and writes the
    durations of the train/validation/test sets to the output folder. The output folder defaults to a
    `durations` folder next to the training filelist. Exits with status 1 if the output folder already exists
    and `-f` was not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input-config",
        type=str,
        default="ljspeech.yaml",
        help="The name of the yaml config file under configs/data",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=32,
        help="Can have increased batch size for faster computation",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        required=False,
        help="force overwrite the file",
    )
    parser.add_argument(
        "-c",
        "--checkpoint_path",
        type=str,
        required=True,
        help="Path to the checkpoint file to load the model from",
    )
    parser.add_argument(
        "-o",
        "--output-folder",
        type=str,
        default=None,
        help="Output folder to save the durations",
    )
    parser.add_argument(
        "--cpu", action="store_true", help="Use CPU for inference, not recommended (default: use GPU if available)"
    )
    args = parser.parse_args()
    with initialize(version_base="1.3", config_path="../../configs/data"):
        cfg = compose(config_name=args.input_config, return_hydra_config=True, overrides=[])
    root_path = rootutils.find_root(search_from=__file__, indicator=".project-root")
    with open_dict(cfg):
        # Drop the keys that are not constructor arguments of TextMelDataModule.
        del cfg["hydra"]
        del cfg["_target_"]
        cfg["seed"] = 1234
        cfg["batch_size"] = args.batch_size
        cfg["train_filelist_path"] = str(os.path.join(root_path, cfg["train_filelist_path"]))
        cfg["valid_filelist_path"] = str(os.path.join(root_path, cfg["valid_filelist_path"]))
        # Durations are being generated here, so the dataset must not try to load existing ones.
        cfg["load_durations"] = False
    if args.output_folder is not None:
        output_folder = Path(args.output_folder)
    else:
        output_folder = Path(cfg["train_filelist_path"]).parent / "durations"
    print(f"Output folder set to: {output_folder}")
    if os.path.exists(output_folder) and not args.force:
        print("Folder already exists. Use -f to force overwrite")
        sys.exit(1)
    output_folder.mkdir(parents=True, exist_ok=True)
    print(f"Preprocessing: {cfg['name']} from training filelist: {cfg['train_filelist_path']}")
    print("Loading model...")
    device = get_device(args)
    model = MatchaTTS.load_from_checkpoint(args.checkpoint_path, map_location=device)
    text_mel_datamodule = TextMelDataModule(**cfg)
    text_mel_datamodule.setup()
    try:
        print("Computing durations for training set if exists...")
        train_dataloader = text_mel_datamodule.train_dataloader()
        compute_durations(train_dataloader, model, device, output_folder)
    except lightning.fabric.utilities.exceptions.MisconfigurationException:
        print("No training set found")
    try:
        print("Computing durations for validation set if exists...")
        val_dataloader = text_mel_datamodule.val_dataloader()
        compute_durations(val_dataloader, model, device, output_folder)
    except lightning.fabric.utilities.exceptions.MisconfigurationException:
        print("No validation set found")
    try:
        print("Computing durations for test set if exists...")
        test_dataloader = text_mel_datamodule.test_dataloader()
        compute_durations(test_dataloader, model, device, output_folder)
    except lightning.fabric.utilities.exceptions.MisconfigurationException:
        print("No test set found")
    print(f"[+] Done! Durations saved to: {output_folder}")
 if __name__ == "__main__":
    # Helps with generating durations for the dataset to train other architectures
    # that cannot learn to align due to limited size of dataset
    # Example usage:
    # python matcha/utils/get_durations_from_trained_model.py -i ljspeech.yaml -c pretrained_model
    # By default this creates a `durations` folder next to the training filelist (override with -o)
    main()
--- a/matcha/utils/utils.py
+++ b/matcha/utils/utils.py
@@ -2,6 +2,7 @@ import os
 import sys
 import warnings
 from importlib.util import find_spec
 from math import ceil
 from pathlib import Path
 from typing import Any, Callable, Dict, Tuple
@@ -115,7 +116,7 @@ def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> float:
        return None
    if metric_name not in metric_dict:
-        raise Exception(
+        raise ValueError(
            f"Metric value not found! <metric_name={metric_name}>\n"
            "Make sure metric name logged in LightningModule is correct!\n"
            "Make sure `optimized_metric` name in `hparams_search` config is correct!"
@@ -205,13 +206,54 @@ def get_user_data_dir(appname="matcha_tts"):
    return final_path
-def assert_model_downloaded(checkpoint_path, url, use_wget=False):
+def assert_model_downloaded(checkpoint_path, url, use_wget=True):
    if Path(checkpoint_path).exists():
        log.debug(f"[+] Model already present at {checkpoint_path}!")
        print(f"[+] Model already present at {checkpoint_path}!")
        return
    log.info(f"[-] Model not found at {checkpoint_path}! Will download it")
    print(f"[-] Model not found at {checkpoint_path}! Will download it")
    checkpoint_path = str(checkpoint_path)
    if not use_wget:
        gdown.download(url=url, output=checkpoint_path, quiet=False, fuzzy=True)
    else:
        wget.download(url=url, out=checkpoint_path)
 def get_phoneme_durations(durations, phones):
    """Merge an interleaved duration sequence into per-phone time spans.

    Entries of ``durations`` at odd indices are treated as the primary items;
    entries at even indices are shared out between their neighbours
    (presumably these are the interspersed blank tokens -- confirm against the
    caller). The very first entry goes entirely to the first item and the very
    last entry entirely to the last item; every inner even-indexed entry is
    split so that the preceding item receives the rounded-up half and the
    following item the remainder.

    Args:
        durations: indexable sequence of numeric durations; the indexing used
            here expects an odd length (``2 * len(phones) + 1``).
        phones: indexable sequence with one label per odd-indexed entry.

    Returns:
        A list with one single-key dict per phone, mapping the phone label to
        ``{"starttime", "endtime", "duration"}``. Values are cumulative
        integer positions in the same unit as ``durations``.

    Raises:
        AssertionError: if the number of phones does not match the number of
            merged durations, or if the final end time differs from
            ``sum(durations)``.
    """
    total = len(durations)
    carry = durations[0]  # leading entry belongs wholly to the first phone
    per_phone = []
    # Walk the odd positions; each step consumes one item and its right neighbour.
    for idx in range(1, total, 2):
        following = durations[idx + 1]
        # The trailing entry is not shared with anyone, so the last item takes all of it.
        share = following if idx == total - 2 else ceil(following / 2)
        per_phone.append(carry + durations[idx] + share)
        # Whatever was not taken here is carried over to the next item.
        carry = following - share

    assert len(phones) == len(per_phone)
    assert len(per_phone) == (total - 1) // 2

    # Running totals give the end position of every phone.
    end_times = torch.cumsum(torch.tensor(per_phone), 0, dtype=torch.long)

    duration_json = []
    begin = torch.tensor(0).item()
    for idx, end_time in enumerate(end_times):
        finish = end_time.item()
        span = {
            "starttime": begin,
            "endtime": finish,
            "duration": finish - begin,
        }
        duration_json.append({phones[idx]: span})
        begin = finish

    # Sanity check: the spans must cover the whole input exactly.
    assert next(iter(duration_json[-1].values()))["endtime"] == sum(
        durations
    ), f"{next(iter(duration_json[-1].values()))['endtime'],  sum(durations)}"
    return duration_json
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,10 +35,10 @@ torchaudio
 matplotlib
 pandas
 conformer==0.3.2
-diffusers==0.21.3
+diffusers==0.25.0
 notebook
 ipywidgets
-gradio
+gradio==3.43.2
 gdown
 wget
 seaborn
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ setup(
            "matcha-data-stats=matcha.utils.generate_data_statistics:main",
            "matcha-tts=matcha.cli:cli",
            "matcha-tts-app=matcha.app:main",
            "matcha-tts-get-durations=matcha.utils.get_durations_from_trained_model:main",
        ]
    },
    ext_modules=cythonize(exts, language_level=3),
--- a/synthesis.ipynb
+++ b/synthesis.ipynb
@@ -19,7 +19,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "id": "148f4bc0-c28e-4670-9a5e-4c7928ab8992",
   "metadata": {},
   "outputs": [
@@ -192,7 +192,7 @@
   "source": [
    "@torch.inference_mode()\n",
    "def process_text(text: str):\n",
-    "    x = torch.tensor(intersperse(text_to_sequence(text, ['english_cleaners2']), 0),dtype=torch.long, device=device)[None]\n",
+    "    x = torch.tensor(intersperse(text_to_sequence(text, ['english_cleaners2'])[0], 0),dtype=torch.long, device=device)[None]\n",
    "    x_lengths = torch.tensor([x.shape[-1]],dtype=torch.long, device=device)\n",
    "    x_phones = sequence_to_text(x.squeeze(0).tolist())\n",
    "    return {\n",
Author	SHA1	Message	Date
Shivam Mehta	d31cd92a61	Merge pull request #75 from shivammehta25/dev Adding alginment information to readme	2024-05-27 13:57:49 +02:00
Shivam Mehta	068d135e20	Adding alginment information to readme	2024-05-27 13:57:10 +02:00
Shivam Mehta	bd37d03b62	Merge pull request #74 from shivammehta25/dev Adding the possibility to use Matcha-TTS as an aligner and train from pretrained extracted alignments.	2024-05-27 13:54:27 +02:00
Shivam Mehta	ac0b258f80	Adding configuration for training from durations	2024-05-27 13:50:21 +02:00
Shivam Mehta	de910380bc	Fixing batched synthesis for multispeaker model	2024-05-27 13:40:02 +02:00
Shivam Mehta	aa496aa13f	Adding the possibility to train with durations	2024-05-27 13:24:21 +02:00
Shivam Mehta	e658aee6a5	Pinning gradio	2024-05-25 20:15:17 +02:00
Shivam Mehta	d816c40e3d	Updating the notebook to adjust to the change	2024-05-24 11:46:03 +02:00
Shivam Mehta	4b39f6cad0	Adding the possibility of get durations out of pretrained model	2024-05-24 11:34:51 +02:00
Shivam Mehta	dd9105b34b	Merge pull request #60 from jimregan/patch-1 Pin gradio to 3.43.2	2024-02-27 13:29:42 +01:00
Jim O’Regan	7d9d4cfd40	Pin gradio to 3.43.2 Fixes #59	2024-02-27 13:25:08 +01:00
Shivam Mehta	256adc55d3	Adding ICASSP 2024	2024-01-12 11:31:01 +00:00
Shivam Mehta	bfcbdbc82e	Merge pull request #43 from shivammehta25/dev Removing gdown for HifiGAN checkpoints too	2024-01-12 12:29:03 +01:00
Shivam Mehta	fb7b954de5	Updating different url for hifigan as well	2024-01-12 11:21:51 +00:00
Shivam Mehta	5a52a67cf7	Version bump	2024-01-12 11:11:41 +00:00
Shivam Mehta	39cbd85236	Using Wget for new ckpt downloadsA	2024-01-12 11:09:25 +00:00
Shivam Mehta	47a629f128	Merge pull request #42 from shivammehta25/dev Merging dev adding another dataset, piper phonemizer and refractoring	2024-01-12 11:49:53 +01:00
Shivam Mehta	95ec24b599	Version bump	2024-01-12 10:48:52 +00:00
Shivam Mehta	5a2a893750	Merge pull request #19 from shivammehta25/pre-commit-ci-update-config [pre-commit.ci] pre-commit autoupdate	2024-01-12 11:47:10 +01:00
Shivam Mehta	13ca33fbe5	Merge pull request #37 from shivammehta25/dependabot/pip/dev/diffusers-0.25.0 Bump diffusers from 0.21.3 to 0.25.0	2024-01-12 11:46:40 +01:00
Shivam Mehta	19bea20928	Merge branch 'main' into dev	2024-01-12 10:37:17 +00:00
Shivam Mehta	8268360674	Update download urls	2024-01-12 10:32:59 +00:00
Shivam Mehta	a0bf4e9e9a	Merge pull request #40 from shivammehta25/ghenter-readme-update-1 Update README.md with ICASSP acceptance	2024-01-12 10:13:23 +01:00
Gustav Eje Henter	f1e8efdec2	Update README.md Add back full stop that erroneously went missing in the shuffle.	2024-01-09 22:53:09 +01:00
Gustav Eje Henter	4ec245e61e	Update README.md with ICASSP acceptance Added ICASSP acceptance to the README and made some tiny tweaks to the text	2024-01-09 22:48:16 +01:00
pre-commit-ci[bot]	dc035a09f2	[pre-commit.ci] pre-commit autoupdate updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.5.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.5.0) - [github.com/psf/black: 23.9.1 → 23.12.1](https://github.com/psf/black/compare/23.9.1...23.12.1) - [github.com/PyCQA/isort: 5.12.0 → 5.13.2](https://github.com/PyCQA/isort/compare/5.12.0...5.13.2) - [github.com/asottile/pyupgrade: v3.14.0 → v3.15.0](https://github.com/asottile/pyupgrade/compare/v3.14.0...v3.15.0) - [github.com/PyCQA/flake8: 6.1.0 → 7.0.0](https://github.com/PyCQA/flake8/compare/6.1.0...7.0.0) - [github.com/pycqa/pylint: v3.0.0 → v3.0.3](https://github.com/pycqa/pylint/compare/v3.0.0...v3.0.3)	2024-01-08 21:15:26 +00:00
dependabot[bot]	254a8e05ce	Bump diffusers from 0.21.3 to 0.25.0 Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.21.3 to 0.25.0. - [Release notes](https://github.com/huggingface/diffusers/releases) - [Commits](https://github.com/huggingface/diffusers/compare/v0.21.3...v0.25.0) --- updated-dependencies: - dependency-name: diffusers dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2023-12-28 13:20:11 +00:00
Shivam Mehta	0ed9290c31	Logging global step while training	2023-12-06 10:39:54 +00:00
Shivam Mehta	f39ee6cf3b	Changing while to for for more readibility	2023-12-05 12:10:52 +00:00
Shivam Mehta	6e71dc8b8f	adding prior loss as a configuration	2023-12-05 09:57:37 +00:00
Shivam Mehta	ae2417c175	Merge pull request #34 from shivammehta25/piper_phonemize Piper phonemize	2023-12-04 11:16:24 +01:00
Shivam Mehta	6c7a82a516	Adding dataset information	2023-12-04 10:15:13 +00:00
Shivam Mehta	009b09a8b2	Removing unwanted configs	2023-12-04 10:13:44 +00:00
Shivam Mehta	a18db17330	Removing the option for configuring prior loss, the durations predicted are not so good then	2023-12-04 10:12:39 +00:00
Shivam Mehta	263d5c4d4e	Adding piper phonemizer with different dataset	2023-12-01 12:06:26 +00:00
Shivam Mehta	df896301ca	Minor changes moving option to disable prior loss in config	2023-12-01 10:44:49 +00:00
Shivam Mehta	c8d0d60f87	Merge pull request #16 from shivammehta25/pre-commit-ci-update-config [pre-commit.ci] pre-commit autoupdate	2023-10-06 05:44:02 +02:00
pre-commit-ci[bot]	e540794e7e	[pre-commit.ci] pre-commit autoupdate updates: - [github.com/psf/black: 23.1.0 → 23.9.1](https://github.com/psf/black/compare/23.1.0...23.9.1) - [github.com/asottile/pyupgrade: v3.3.1 → v3.14.0](https://github.com/asottile/pyupgrade/compare/v3.3.1...v3.14.0) - [github.com/PyCQA/flake8: 6.0.0 → 6.1.0](https://github.com/PyCQA/flake8/compare/6.0.0...6.1.0) - [github.com/pycqa/pylint: v2.8.2 → v3.0.0](https://github.com/pycqa/pylint/compare/v2.8.2...v3.0.0)	2023-10-03 13:14:20 +00:00