From 7e499df0b25ea86b7b916df9480003b8aff72a4b Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Mon, 2 Dec 2024 11:01:04 +0000 Subject: [PATCH] ljspeech/hificaptain from #99 --- .gitignore | 1 + configs/data/hi-fi_en-US_female.yaml | 4 +- data | 1 - matcha/utils/data/__init__.py | 0 matcha/utils/data/hificaptain.py | 148 +++++++++++++++++++++++++++ matcha/utils/data/ljspeech.py | 97 ++++++++++++++++++ matcha/utils/data/utils.py | 53 ++++++++++ 7 files changed, 301 insertions(+), 3 deletions(-) delete mode 120000 data create mode 100644 matcha/utils/data/__init__.py create mode 100644 matcha/utils/data/hificaptain.py create mode 100644 matcha/utils/data/ljspeech.py create mode 100644 matcha/utils/data/utils.py diff --git a/.gitignore b/.gitignore index cbec8b4..f353a3c 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ generator_v1 g_02500000 gradio_cached_examples/ synth_output/ +/data diff --git a/configs/data/hi-fi_en-US_female.yaml b/configs/data/hi-fi_en-US_female.yaml index 1269f9b..e9f888e 100644 --- a/configs/data/hi-fi_en-US_female.yaml +++ b/configs/data/hi-fi_en-US_female.yaml @@ -5,8 +5,8 @@ defaults: # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ _target_: matcha.data.text_mel_datamodule.TextMelDataModule name: hi-fi_en-US_female -train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt -valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt +train_filelist_path: data/hi-fi_en-US_female/train.txt +valid_filelist_path: data/hi-fi_en-US_female/val.txt batch_size: 32 cleaners: [english_cleaners_piper] data_statistics: # Computed for this dataset diff --git a/data b/data deleted file mode 120000 index 18e4b1a..0000000 --- a/data +++ /dev/null @@ -1 +0,0 @@ -/home/smehta/Projects/Speech-Backbones/Grad-TTS/data \ No newline at end of file diff --git a/matcha/utils/data/__init__.py b/matcha/utils/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
# --- matcha/utils/data/hificaptain.py ---
#!/usr/bin/env python
"""Download the Hi-Fi-CAPTAIN corpus and convert it into Matcha-TTS filelists."""
import argparse
import os
import sys
import tempfile
from pathlib import Path

import torchaudio
from torch.hub import download_url_to_file
from tqdm import tqdm

from matcha.utils.data.utils import _extract_zip

URLS = {
    "en-US": {
        "female": "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_en-US_F.zip",
        "male": "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_en-US_M.zip",
    },
    "ja-JP": {
        "female": "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_ja-JP_F.zip",
        "male": "https://ast-astrec.nict.go.jp/release/hi-fi-captain/hfc_ja-JP_M.zip",
    },
}

INFO_PAGE = "https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/"

# NOTE: the project page describes the corpus as "open-sourced", but it is
# actually distributed under a non-commercial licence.
LICENCE = "CC BY-NC-SA 4.0"

# Citation omitted; it is available on the info page above.


def get_args():
    """Parse command-line options for the Hi-Fi-CAPTAIN downloader."""
    parser = argparse.ArgumentParser()

    parser.add_argument("-s", "--save-dir", type=str, default=None, help="Place to store the downloaded zip files")
    parser.add_argument(
        "-r",
        "--skip-resampling",
        action="store_true",
        default=False,
        help="Skip resampling the data (from 48 to 22.05)",
    )
    parser.add_argument(
        "-l", "--language", type=str, choices=["en-US", "ja-JP"], default="en-US", help="The language to download"
    )
    parser.add_argument(
        "-g",
        "--gender",
        type=str,
        choices=["male", "female"],
        default="female",
        help="The gender of the speaker to download",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=str,
        default="data",
        help="Place to store the converted data. Top-level only, the subdirectory will be created",
    )

    return parser.parse_args()


def process_text(infile, outpath: Path):
    """Convert one corpus transcript file into a pipe-separated filelist.

    Mapping: ``*dev.txt`` -> ``val.txt``, ``*eval.txt`` -> ``test.txt``,
    anything else -> ``train.txt``. Output is appended when the filelist
    already exists, because the corpus spreads each split over several
    transcript files.
    """
    outmode = "w"
    # Fix: this previously wrote "valid.txt", but the data configs
    # (e.g. configs/data/hi-fi_en-US_female.yaml in this same change)
    # reference data/<dirname>/val.txt.
    if infile.endswith("dev.txt"):
        listfile = outpath / "val.txt"
    elif infile.endswith("eval.txt"):
        listfile = outpath / "test.txt"
    else:
        listfile = outpath / "train.txt"
    if listfile.exists():
        outmode = "a"
    with (
        open(infile, encoding="utf-8") as inf,
        open(listfile, outmode, encoding="utf-8") as of,
    ):
        for line in inf:
            line = line.strip()
            # Each line is "<fileid> <transcription>".
            fileid, rest = line.split(" ", maxsplit=1)
            # Use a distinct name here: the original clobbered the
            # filelist-path variable inside the loop.
            wavfile = str(outpath / f"{fileid}.wav")
            of.write(f"{wavfile}|{rest}\n")


def process_files(zipfile, outpath, resample=True):
    """Extract *zipfile* and write filelists plus (optionally resampled) wavs to *outpath*."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        for filename in tqdm(_extract_zip(zipfile, tmpdirname)):
            # _extract_zip returns archive-relative names; make them absolute.
            if not filename.startswith(tmpdirname):
                filename = os.path.join(tmpdirname, filename)
            if filename.endswith(".txt"):
                process_text(filename, outpath)
            elif filename.endswith(".wav"):
                filepart = filename.rsplit("/", maxsplit=1)[-1]
                outfile = str(outpath / filepart)
                arr, sr = torchaudio.load(filename)
                if resample:
                    arr = torchaudio.functional.resample(arr, orig_freq=sr, new_freq=22050)
                # Fix: when resampling is skipped, save with the source rate;
                # the original always tagged the file as 22050 Hz even though
                # the samples were still at the native rate.
                torchaudio.save(outfile, arr, 22050 if resample else sr)
            else:
                continue


def main():
    """Entry point: download the requested language/gender corpus and convert it."""
    args = get_args()

    save_dir = None
    if args.save_dir:
        save_dir = Path(args.save_dir)
        if not save_dir.is_dir():
            save_dir.mkdir()

    if not args.output_dir:
        print("output directory not specified, exiting")
        sys.exit(1)

    URL = URLS[args.language][args.gender]
    dirname = f"hi-fi_{args.language}_{args.gender}"

    outbasepath = Path(args.output_dir)
    if not outbasepath.is_dir():
        outbasepath.mkdir()
    outpath = outbasepath / dirname
    if not outpath.is_dir():
        outpath.mkdir()

    resample = not args.skip_resampling

    if save_dir:
        zipname = URL.rsplit("/", maxsplit=1)[-1]
        zippath = save_dir / zipname
        if not zippath.exists():
            # str() for consistency with ljspeech.py's use of download_url_to_file.
            download_url_to_file(URL, str(zippath), progress=True)
        process_files(zippath, outpath, resample)
    else:
        # No save dir: stream through a throwaway temp file.
        with tempfile.NamedTemporaryFile(suffix=".zip", delete=True) as zf:
            download_url_to_file(URL, zf.name, progress=True)
            process_files(zf.name, outpath, resample)


if __name__ == "__main__":
    main()


# --- matcha/utils/data/ljspeech.py ---
#!/usr/bin/env python
"""Download LJ Speech and split it into train/val filelists."""
import argparse
import random
import tempfile
from pathlib import Path

from torch.hub import download_url_to_file

from matcha.utils.data.utils import _extract_tar

URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"

INFO_PAGE = "https://keithito.com/LJ-Speech-Dataset/"

LICENCE = "Public domain (LibriVox copyright disclaimer)"

CITATION = """
@misc{ljspeech17,
  author = {Keith Ito and Linda Johnson},
  title = {The LJ Speech Dataset},
  howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},
  year = 2017
}
"""


def decision():
    """Randomly route a line to train (~98%) vs validation (~2%)."""
    return random.random() < 0.98


def get_args():
    """Parse command-line options for the LJ Speech downloader."""
    parser = argparse.ArgumentParser()

    parser.add_argument("-s", "--save-dir", type=str, default=None, help="Place to store the downloaded zip files")
    parser.add_argument(
        "output_dir",
        type=str,
        nargs="?",
        default="data",
        help="Place to store the converted data (subdirectory LJSpeech-1.1 will be created)",
    )

    return parser.parse_args()


def process_csv(ljpath: Path):
    """Split metadata.csv into train.txt / val.txt filelists with absolute wav paths.

    Raises:
        FileNotFoundError: if metadata.csv is not found under *ljpath* or
            *ljpath*/LJSpeech-1.1 (the original code hit a NameError here).
    """
    if (ljpath / "metadata.csv").exists():
        basepath = ljpath
    elif (ljpath / "LJSpeech-1.1" / "metadata.csv").exists():
        basepath = ljpath / "LJSpeech-1.1"
    else:
        raise FileNotFoundError(f"metadata.csv not found under {ljpath}")
    csvpath = basepath / "metadata.csv"
    wavpath = basepath / "wavs"

    with (
        open(csvpath, encoding="utf-8") as csvf,
        open(basepath / "train.txt", "w", encoding="utf-8") as tf,
        open(basepath / "val.txt", "w", encoding="utf-8") as vf,
    ):
        for line in csvf:
            line = line.strip()
            parts = line.split("|")
            wavfile = str(wavpath / f"{parts[0]}.wav")
            # NOTE(review): parts[1] is the raw transcription column; LJSpeech
            # also ships a normalized column at parts[2] — confirm which one
            # the training recipe expects.
            if decision():
                tf.write(f"{wavfile}|{parts[1]}\n")
            else:
                vf.write(f"{wavfile}|{parts[1]}\n")


def main():
    """Entry point: download the archive (or reuse a saved copy) and build filelists."""
    args = get_args()

    save_dir = None
    if args.save_dir:
        save_dir = Path(args.save_dir)
        if not save_dir.is_dir():
            save_dir.mkdir()

    outpath = Path(args.output_dir)
    if not outpath.is_dir():
        outpath.mkdir()

    if save_dir:
        tarname = URL.rsplit("/", maxsplit=1)[-1]
        tarpath = save_dir / tarname
        if not tarpath.exists():
            download_url_to_file(URL, str(tarpath), progress=True)
        _extract_tar(tarpath, outpath)
        process_csv(outpath)
    else:
        with tempfile.NamedTemporaryFile(suffix=".tar.bz2", delete=True) as zf:
            download_url_to_file(URL, zf.name, progress=True)
            _extract_tar(zf.name, outpath)
            process_csv(outpath)


if __name__ == "__main__":
    main()


# --- matcha/utils/data/utils.py ---
# taken from https://github.com/pytorch/audio/blob/main/src/torchaudio/datasets/utils.py
# Copyright (c) 2017 Facebook Inc.
# (Soumith Chintala)
# Licence: BSD 2-Clause
"""Archive-extraction helpers adapted from torchaudio's dataset utilities."""

import logging
import os
import tarfile
import zipfile
from pathlib import Path
from typing import List, Optional, Union

_LG = logging.getLogger(__name__)

# Export the underscore-prefixed helpers so they survive `import *`.
__all__ = ["_extract_tar", "_extract_zip"]


def _extract_tar(from_path: Union[str, Path], to_path: Optional[str] = None, overwrite: bool = False) -> List[str]:
    """Extract a tar archive and return the destination paths of its regular files.

    Args:
        from_path: path to the archive (str or Path).
        to_path: destination directory; defaults to the archive's directory.
        overwrite: re-extract members whose destination file already exists.

    Returns:
        Absolute-ish paths (``to_path``-joined) of the regular files in the archive.
    """
    # Bug fix: the original did str(Path) — stringifying the *class* — which
    # broke every call made with a pathlib.Path argument.
    from_path = os.fspath(from_path)

    if to_path is None:
        to_path = os.path.dirname(from_path)

    with tarfile.open(from_path, "r") as tar:
        files = []
        for member in tar:
            file_path = os.path.join(to_path, member.name)
            if member.isfile():
                files.append(file_path)
                if os.path.exists(file_path):
                    _LG.info("%s already extracted.", file_path)
                    if not overwrite:
                        continue
            tar.extract(member, to_path)
        return files


def _extract_zip(from_path: Union[str, Path], to_path: Optional[str] = None, overwrite: bool = False) -> List[str]:
    """Extract a zip archive and return the archive-relative member names.

    Args:
        from_path: path to the archive (str or Path).
        to_path: destination directory; defaults to the archive's directory.
        overwrite: re-extract members whose destination file already exists.

    Returns:
        The archive's member names (relative, as from ``ZipFile.namelist``) —
        note the asymmetry with ``_extract_tar``, which returns joined paths.
    """
    # Same str(Path) class-stringification bug as _extract_tar; fixed.
    from_path = os.fspath(from_path)

    if to_path is None:
        to_path = os.path.dirname(from_path)

    with zipfile.ZipFile(from_path, "r") as zfile:
        files = zfile.namelist()
        for member in files:
            file_path = os.path.join(to_path, member)
            if os.path.exists(file_path):
                _LG.info("%s already extracted.", file_path)
                if not overwrite:
                    continue
            zfile.extract(member, to_path)
    return files