From df896301ca88c31b2e5765c64c59225bd16273e1 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Fri, 1 Dec 2023 10:44:49 +0000 Subject: [PATCH 1/5] Minor changes moving option to disable prior loss in config --- configs/experiment/ljspeech_no_prior_loss.yaml | 17 +++++++++++++++++ configs/model/matcha.yaml | 1 + matcha/models/matcha_tts.py | 9 +++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 configs/experiment/ljspeech_no_prior_loss.yaml diff --git a/configs/experiment/ljspeech_no_prior_loss.yaml b/configs/experiment/ljspeech_no_prior_loss.yaml new file mode 100644 index 0000000..6181950 --- /dev/null +++ b/configs/experiment/ljspeech_no_prior_loss.yaml @@ -0,0 +1,17 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: ljspeech.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["ljspeech"] + +run_name: ljspeech + +model: + prior_loss: false diff --git a/configs/model/matcha.yaml b/configs/model/matcha.yaml index 4700855..36f6eaf 100644 --- a/configs/model/matcha.yaml +++ b/configs/model/matcha.yaml @@ -12,3 +12,4 @@ spk_emb_dim: 64 n_feats: 80 data_statistics: ${data.data_statistics} out_size: null # Must be divisible by 4 +prior_loss: true diff --git a/matcha/models/matcha_tts.py b/matcha/models/matcha_tts.py index 6feb9e7..64b2c07 100644 --- a/matcha/models/matcha_tts.py +++ b/matcha/models/matcha_tts.py @@ -34,6 +34,7 @@ class MatchaTTS(BaseLightningClass): # 🍵 out_size, optimizer=None, scheduler=None, + prior_loss=True, ): super().__init__() @@ -44,6 +45,7 @@ class MatchaTTS(BaseLightningClass): # 🍵 self.spk_emb_dim = spk_emb_dim self.n_feats = n_feats self.out_size = out_size + self.prior_loss = prior_loss if n_spks > 1: self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) @@ -228,7 +230,10 @@ class MatchaTTS(BaseLightningClass): # 
🍵 # Compute loss of the decoder diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) - prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) - prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + if self.prior_loss: + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + else: + prior_loss = 0 return dur_loss, prior_loss, diff_loss From 263d5c4d4ea23da432e6a6b5e5e19f1da8e00a45 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Fri, 1 Dec 2023 12:06:26 +0000 Subject: [PATCH 2/5] Adding piper phonemizer with different dataset --- configs/data/hi-fi_en-US_female.yaml | 13 +++++++++++++ .../experiment/hifi_dataset_piper_phonemizer.yaml | 14 ++++++++++++++ matcha/text/cleaners.py | 11 +++++++++++ requirements.txt | 1 + 4 files changed, 39 insertions(+) create mode 100644 configs/data/hi-fi_en-US_female.yaml create mode 100644 configs/experiment/hifi_dataset_piper_phonemizer.yaml diff --git a/configs/data/hi-fi_en-US_female.yaml b/configs/data/hi-fi_en-US_female.yaml new file mode 100644 index 0000000..2a95cda --- /dev/null +++ b/configs/data/hi-fi_en-US_female.yaml @@ -0,0 +1,13 @@ +defaults: + - ljspeech + - _self_ + +_target_: matcha.data.text_mel_datamodule.TextMelDataModule +name: hi-fi_en-US_female +train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt +valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt +batch_size: 32 +cleaners: [english_cleaners_piper] +data_statistics: # Computed for vctk dataset + mel_mean: -6.38385 + mel_std: 2.541796 diff --git a/configs/experiment/hifi_dataset_piper_phonemizer.yaml b/configs/experiment/hifi_dataset_piper_phonemizer.yaml new file mode 100644 index 0000000..7e6c57a --- /dev/null +++ b/configs/experiment/hifi_dataset_piper_phonemizer.yaml @@ -0,0 +1,14 @@ +# @package _global_ + +# to execute this experiment run: +# python 
train.py experiment=hifi_dataset_piper_phonemizer + +defaults: + - override /data: hi-fi_en-US_female.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] + +run_name: hi-fi_en-US_female_piper_phonemizer diff --git a/matcha/text/cleaners.py b/matcha/text/cleaners.py index 26b91d7..5e8d96b 100644 --- a/matcha/text/cleaners.py +++ b/matcha/text/cleaners.py @@ -15,6 +15,7 @@ import logging import re import phonemizer +import piper_phonemize from unidecode import unidecode # To avoid excessive logging we set the log level of the phonemizer package to Critical @@ -103,3 +104,13 @@ def english_cleaners2(text): phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] phonemes = collapse_whitespace(phonemes) return phonemes + + +def english_cleaners_piper(text): + """Pipeline for English text, including abbreviation expansion. 
+ punctuation + stress""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) + phonemes = collapse_whitespace(phonemes) + return phonemes diff --git a/requirements.txt b/requirements.txt index c1be781..f657dc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,3 +42,4 @@ gradio gdown wget seaborn +piper_phonemize From a18db173302052cd0bfff1fe8f70ef7b58ae87be Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:12:39 +0000 Subject: [PATCH 3/5] Removing the option for configuring prior loss, the durations predicted are not so good then --- configs/model/matcha.yaml | 1 - matcha/models/matcha_tts.py | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/configs/model/matcha.yaml b/configs/model/matcha.yaml index 36f6eaf..4700855 100644 --- a/configs/model/matcha.yaml +++ b/configs/model/matcha.yaml @@ -12,4 +12,3 @@ spk_emb_dim: 64 n_feats: 80 data_statistics: ${data.data_statistics} out_size: null # Must be divisible by 4 -prior_loss: true diff --git a/matcha/models/matcha_tts.py b/matcha/models/matcha_tts.py index 64b2c07..6feb9e7 100644 --- a/matcha/models/matcha_tts.py +++ b/matcha/models/matcha_tts.py @@ -34,7 +34,6 @@ class MatchaTTS(BaseLightningClass): # 🍵 out_size, optimizer=None, scheduler=None, - prior_loss=True, ): super().__init__() @@ -45,7 +44,6 @@ class MatchaTTS(BaseLightningClass): # 🍵 self.spk_emb_dim = spk_emb_dim self.n_feats = n_feats self.out_size = out_size - self.prior_loss = prior_loss if n_spks > 1: self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) @@ -230,10 +228,7 @@ class MatchaTTS(BaseLightningClass): # 🍵 # Compute loss of the decoder diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) - if self.prior_loss: - prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) - prior_loss = prior_loss / 
(torch.sum(y_mask) * self.n_feats) - else: - prior_loss = 0 + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) return dur_loss, prior_loss, diff_loss From 009b09a8b2ff5922e076dd4892be8a3ce5b95e3e Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:13:44 +0000 Subject: [PATCH 4/5] Removing unwanted configs --- configs/experiment/ljspeech_no_prior_loss.yaml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 configs/experiment/ljspeech_no_prior_loss.yaml diff --git a/configs/experiment/ljspeech_no_prior_loss.yaml b/configs/experiment/ljspeech_no_prior_loss.yaml deleted file mode 100644 index 6181950..0000000 --- a/configs/experiment/ljspeech_no_prior_loss.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# @package _global_ - -# to execute this experiment run: -# python train.py experiment=multispeaker - -defaults: - - override /data: ljspeech.yaml - -# all parameters below will be merged with parameters from default configurations set above -# this allows you to overwrite only specified parameters - -tags: ["ljspeech"] - -run_name: ljspeech - -model: - prior_loss: false From 6c7a82a51651370b562eb9f750b7f9a087cac293 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 4 Dec 2023 10:15:13 +0000 Subject: [PATCH 5/5] Adding dataset information --- configs/data/hi-fi_en-US_female.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/data/hi-fi_en-US_female.yaml b/configs/data/hi-fi_en-US_female.yaml index 2a95cda..1269f9b 100644 --- a/configs/data/hi-fi_en-US_female.yaml +++ b/configs/data/hi-fi_en-US_female.yaml @@ -2,12 +2,13 @@ defaults: - ljspeech - _self_ +# Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ _target_: matcha.data.text_mel_datamodule.TextMelDataModule name: hi-fi_en-US_female train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt valid_filelist_path: 
data/filelists/hi-fi-captain-en-us-female_val.txt batch_size: 32 cleaners: [english_cleaners_piper] -data_statistics: # Computed for vctk dataset +data_statistics: # Computed for this dataset mel_mean: -6.38385 mel_std: 2.541796