add a cleaner for the mmconv data

Different versions of espeak represent things differently, it seems (also, there are some distinctions none of our speakers make, so normalising those away reduces perplexity a tiny amount).
2026-02-05 18:29:19 +08:00 · 2024-10-03 17:18:58 +00:00
parent 77804265f8
commit 4bc541705a
1 changed files with 14 additions and 0 deletions
--- a/matcha/text/cleaners.py
+++ b/matcha/text/cleaners.py
@@ -105,6 +105,20 @@ def english_cleaners2(text):
    return phonemes


+def mmconv_ipa_simplify(text):
+    replacements = [
+        ("ɐ", "ə"),
+        ("ˈə", "ə"),
+        ("ʤ", "dʒ"),
+        ("ʧ", "tʃ"),
+        ("ᵻ", "ɪ"),
+    ]
+    for replacement in replacements:
+        text = text.replace(replacement[0], replacement[1])
+    phonemes = collapse_whitespace(text)
+    return phonemes
+
+
 # I am removing this due to incompatibility with several versions of Python
 # However, if you want to use it, you can uncomment it
 # and install piper-phonemize with the following command: