add files

2026-02-05 18:09:20 +08:00 · 2025-02-20 12:17:03 +08:00
parent a21dd4555c
commit edd008441b
667 changed files with 473123 additions and 0 deletions
--- a/funasr_local/text/korean_cleaner.py
+++ b/funasr_local/text/korean_cleaner.py
@@ -0,0 +1,77 @@
+# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
+
+import re
+
+
+class KoreanCleaner:
+    """Spell out ASCII digits and ASCII Latin letters as Hangul syllables.
+
+    All methods are classmethods; the class holds no instance state. The
+    lookup tables are built once at class-creation time and shared by
+    every call.
+    """
+
+    # Reading of each ASCII digit. Digits are converted one at a time, so
+    # "10" becomes "일영" rather than a positional reading.
+    _NUMBER_TO_KOR = {
+        "0": "영",
+        "1": "일",
+        "2": "이",
+        "3": "삼",
+        "4": "사",
+        "5": "오",
+        "6": "육",
+        "7": "칠",
+        "8": "팔",
+        "9": "구",
+    }
+
+    # Hangul spelling of each uppercase ASCII letter name.
+    _UPPER_ALPHABET_TO_KOR = {
+        "A": "에이",
+        "B": "비",
+        "C": "씨",
+        "D": "디",
+        "E": "이",
+        "F": "에프",
+        "G": "지",
+        "H": "에이치",
+        "I": "아이",
+        "J": "제이",
+        "K": "케이",
+        "L": "엘",
+        "M": "엠",
+        "N": "엔",
+        "O": "오",
+        "P": "피",
+        "Q": "큐",
+        "R": "알",
+        "S": "에스",
+        "T": "티",
+        "U": "유",
+        "V": "브이",
+        "W": "더블유",
+        "X": "엑스",
+        "Y": "와이",
+        "Z": "지",
+    }
+
+    # Precomputed str.translate tables. The letter table also carries the
+    # lowercase ASCII letters, so no separate upper-casing pass is needed;
+    # non-ASCII letters are deliberately absent and pass through untouched.
+    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)
+    _ENGLISH_TABLE = str.maketrans(
+        {
+            **_UPPER_ALPHABET_TO_KOR,
+            **{
+                letter.lower(): spelling
+                for letter, spelling in _UPPER_ALPHABET_TO_KOR.items()
+            },
+        }
+    )
+
+    @classmethod
+    def _normalize_numbers(cls, text):
+        """Replace every ASCII digit in *text* with its Korean reading.
+
+        Conversion is per character: each digit is mapped independently and
+        all other characters are returned unchanged.
+        """
+        return text.translate(cls._NUMBER_TABLE)
+
+    @classmethod
+    def _normalize_english_text(cls, text):
+        """Replace every ASCII letter in *text* with its Hangul letter name.
+
+        Lowercase and uppercase ASCII letters map to the same spelling.
+        Characters outside ``a-z`` / ``A-Z`` are returned unchanged.
+        """
+        return text.translate(cls._ENGLISH_TABLE)
+
+    @classmethod
+    def normalize_text(cls, text):
+        """Strip *text*, then spell out its digits and ASCII letters in Hangul.
+
+        Returns the normalized string.
+        """
+        # stage 0 : text strip
+        text = text.strip()
+
+        # stage 1 : normalize numbers
+        text = cls._normalize_numbers(text)
+
+        # stage 2 : normalize english text
+        return cls._normalize_english_text(text)