def mmconv_ipa_simplify(text):
    """Normalise espeak-style IPA strings for the mmconv data.

    Different espeak versions spell some phonemes differently, and a few
    distinctions are not needed for this data, so they are folded together
    here before whitespace is collapsed.

    The substitutions are applied one after another, in the order listed.
    That order matters: "ɐ" is rewritten to "ə" before the "ˈə" rule runs,
    so an input "ˈɐ" also ends up as a plain "ə".

    Args:
        text: The phoneme string to simplify.

    Returns:
        The string with all substitutions applied, passed through
        ``collapse_whitespace``.
    """
    # (source, target) pairs; kept as an ordered sequence, not a mapping,
    # because later rules rely on the output of earlier ones.
    substitutions = (
        ("ɐ", "ə"),
        ("ˈə", "ə"),
        ("ʤ", "dʒ"),
        ("ʧ", "tʃ"),
        ("ᵻ", "ɪ"),
    )
    simplified = text
    for source, target in substitutions:
        simplified = simplified.replace(source, target)
    return collapse_whitespace(simplified)