add a cleaner for the mmconv data

Different versions of espeak represent things differently, it seems
(also, there are some distinctions none of our speakers make, so
normalising those away reduces perplexity a tiny amount).
This commit is contained in:
Jim O'Regan
2024-10-03 17:18:58 +00:00
parent 77804265f8
commit 4bc541705a

View File

@@ -105,6 +105,20 @@ def english_cleaners2(text):
return phonemes
def mmconv_ipa_simplify(text):
    """Normalise an IPA phoneme string for the mmconv data.

    Applies a fixed, ordered list of literal substring substitutions to
    ``text`` and then passes the result through ``collapse_whitespace``.

    Args:
        text: Phoneme string to normalise.

    Returns:
        The substituted string, as returned by ``collapse_whitespace``.
    """
    # Order matters: "ɐ" is folded into "ə" first, so the second rule also
    # strips a stress mark that originally stood in front of "ɐ".
    replacements = [
        ("ɐ", "ə"),
        ("ˈə", "ə"),
        # NOTE(review): these two rules delete the affricate ligatures
        # outright. Confirm that is intended rather than a mapping to a
        # decomposed form whose characters were lost from this table.
        ("ʤ", ""),
        ("ʧ", ""),
        # NOTE(review): the search string of this rule is empty, so the rule
        # is skipped below. TODO: restore the symbol that should map to "ɪ".
        ("", "ɪ"),
    ]
    for old, new in replacements:
        if not old:
            # str.replace("", new) inserts `new` before every character and
            # at the end of the string, which would corrupt every input.
            continue
        text = text.replace(old, new)
    return collapse_whitespace(text)
# I am removing this due to incompatibility with several versions of Python.
# However, if you want to use it, you can uncomment it
# and install piper-phonemize with the following command: