diff --git a/README.md b/README.md index 25e8c6f..7eb27d4 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,8 @@ The models are small enough to be included directly into this repository. Newer | `'silero_vad_mini_8k'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) | | `'silero_number_detector'` | 1.1M | Number Detector | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) | | `'silero_lang_detector'` | 1.1M | Language Classifier | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) | -| `'silero_lang_detector_116'` | 1.7M | Language Classifier | No | [116 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_116.json) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) | +| ~~`'silero_lang_detector_116'`~~ | ~~1.7M~~ | ~~Language Classifier~~ ||| | || +| `'silero_lang_detector_95'` | 4.7M | Language Classifier | No | [95 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_95.json) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) | (*) Though explicitly trained on these languages, VAD should work on any Germanic, Romance 
or Slavic Languages out of the box. @@ -103,7 +104,7 @@ What models do: - VAD - detects speech; - Number Detector - detects spoken numbers (i.e. thirty five); - Language Classifier - classifies utterances between language; -- Language Classifier 116 - classifies among 116 languages as well as 77 language groups (mutually intelligible languages -> same group) +- Language Classifier 95 - classifies among 95 languages as well as 58 language groups (mutually intelligible languages -> same group) ### Version History @@ -118,7 +119,8 @@ What models do: | `v2.1` | 2021-02-11 | Add micro (10k params) VAD models | | `v2.2` | 2021-03-22 | Add micro 8000 sample rate VAD models | | `v2.3` | 2021-04-12 | Add mini (100k params) VAD models (8k and 16k sample rate) + **new** adaptive utils for full audio and single audio stream | -| `v2.4` | 2021-07-09 | Add 116 languages classifier and group classifier +| `v2.4` | 2021-07-09 | Add 116 languages classifier and group classifier | +| `v2.4` | 2021-07-09 | Removed the 116-language classifier and added a 95-language classifier instead (dropped less widely spoken languages to improve quality) | ### PyTorch @@ -218,7 +220,7 @@ language = get_language(wav, model) pprint(language) ``` -##### 116 languages +##### 95 languages [![Open on Torch Hub](https://img.shields.io/badge/Torch-Hub-red?logo=pytorch&style=for-the-badge)](https://pytorch.org/hub/snakers4_silero-vad_language/) @@ -229,7 +231,7 @@ from pprint import pprint model, lang_dict, lang_group_dict, utils = torch.hub.load( repo_or_dir='snakers4/silero-vad', - model='silero_lang_detector_116', + model='silero_lang_detector_95', force_reload=True) get_language_and_group, read_audio = utils @@ -362,7 +364,7 @@ language = get_language(wav, model, run_function=validate_onnx) print(language) ``` -##### 116 languages +##### 95 languages ```python import torch @@ -371,7 +373,7 @@ from pprint import pprint model, lang_dict, lang_group_dict, utils = torch.hub.load( repo_or_dir='snakers4/silero-vad', - 
model='silero_lang_detector_116', + model='silero_lang_detector_95', force_reload=True) get_language_and_group, read_audio = utils @@ -388,7 +390,7 @@ def validate_onnx(model, inputs): outs = [torch.Tensor(x) for x in outs] return outs -model = init_onnx_model(f'{files_dir}/lang_classifier_116.onnx') +model = init_onnx_model(f'{files_dir}/lang_classifier_95.onnx') wav = read_audio(f'{files_dir}/de.wav') languages, language_groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=2, run_function=validate_onnx) @@ -539,10 +541,10 @@ Please see [Quality Metrics](#quality-metrics) - More languages TBD - Arbitrary audio length can be used, although network was trained using audio shorter than 15 seconds -### How Language Classifier 116 Works +### How Language Classifier 95 Works -- **83%** validation accuracy among 116 languages, **87%** validation accuracy among [77 language groups](https://github.com/snakers4/silero-vad/blob/master/files/lang_group_dict_116.json) -- Language classifier 116 was trained using audio samples in [116 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_116.json) +- **83%** validation accuracy among 95 languages, **87%** validation accuracy among [58 language groups](https://github.com/snakers4/silero-vad/blob/master/files/lang_group_dict_95.json) +- Language classifier 95 was trained using audio samples in [95 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_95.json) - Arbitrary audio length can be used, although network was trained using audio shorter than 20 seconds ## Contact diff --git a/files/lang_classifier_116.jit b/files/lang_classifier_116.jit deleted file mode 100644 index 8556f32..0000000 Binary files a/files/lang_classifier_116.jit and /dev/null differ diff --git a/files/lang_classifier_116.onnx b/files/lang_classifier_116.onnx deleted file mode 100644 index 78586fb..0000000 Binary files a/files/lang_classifier_116.onnx and /dev/null differ diff --git 
a/files/lang_classifier_95.jit b/files/lang_classifier_95.jit new file mode 100644 index 0000000..27caa4c Binary files /dev/null and b/files/lang_classifier_95.jit differ diff --git a/files/lang_classifier_95.onnx b/files/lang_classifier_95.onnx new file mode 100644 index 0000000..e99ab5a Binary files /dev/null and b/files/lang_classifier_95.onnx differ diff --git a/files/lang_dict_116.json b/files/lang_dict_116.json deleted file mode 100644 index 41fa734..0000000 --- a/files/lang_dict_116.json +++ /dev/null @@ -1 +0,0 @@ -{"43": "tt, Tatar", "37": "am, Amharic", "70": "uk, Ukrainian", "40": "si, Sinhala, Sinhalese", "77": "mk, Macedonian", "63": "lb, Luxembourgish, Letzeburgesch", "20": "ka, Georgian", "35": "ar, Arabic", "95": "lv, Latvian", "25": "tk, Turkmen", "67": "uz, Uzbek", "50": "mg, Malagasy", "56": "pt, Portuguese", "91": "sl, Slovenian", "115": "pa-IN, Punjabi, Panjabi", "52": "eo, Esperanto", "55": "sa, Sanskrit", "114": "gn, Guarani", "61": "de, German", "69": "id, Indonesian", "97": "sk, Slovak", "47": "kk, Kazakh", "111": "ps, Pashto, Pushto", "98": "et, Estonian", "102": "sv-SE, Swedish", "68": "su, Sundanese", "10": "ba, Bashkir", "76": "ga-IE, Irish", "24": "el, Greek, Modern (1453\u2013)", "65": "km, Central Khmer", "17": "sd, Sindhi", "30": "ne, Nepali", "99": "nn, Norwegian Nynorsk", "9": "da, Danish", "109": "bn, Bengali", "104": "ia, Interlingua (International Auxiliary Language Association)", "113": "ab, Abkhazian", "19": "nl, Dutch, Flemish", "96": "ur, Urdu", "16": "mr, Marathi", "86": "ms, Malay", "26": "br, Breton", "84": "tl, Tagalog", "4": "fy-NL, Western Frisian", "15": "lt, Lithuanian", "13": "mn, Mongolian", "29": "my, Burmese", "27": "cv, Chuvash", "38": "yi, Yiddish", "8": "yo, Yoruba", "112": "or, Oriya", "18": "gu, Gujarati", "101": "as, Assamese", "107": "sn, Shona", "34": "hu, Hungarian", "110": "ca, Catalan, Valencian", "44": "so, Somali", "23": "is, Icelandic", "60": "az, Azerbaijani", "2": "ln, Lingala", "5": "hi, Hindi", 
"31": "cs, Czech", "74": "bo, Tibetan", "90": "kn, Kannada", "49": "zh-HK, Chinese", "22": "ha, Hausa", "14": "cy, Welsh", "87": "ko, Korean", "0": "fr, French", "78": "la, Latin", "103": "zh-TW, Chinese", "53": "vi, Vietnamese", "81": "lg, Ganda", "83": "pl, Polish", "59": "ro, Romanian, Moldavian, Moldovan", "72": "hr, Croatian", "48": "ht, Haitian, Haitian Creole", "36": "mt, Maltese", "100": "fi, Finnish", "46": "tr, Turkish", "80": "ml, Malayalam", "32": "eu, Basque", "75": "rm-sursilv, Romansh", "92": "sw, Swahili", "51": "ta, Tamil", "3": "dv, Divehi, Dhivehi, Maldivian", "105": "rm-vallader, Romansh", "58": "hy, Armenian", "6": "ru, Russian", "94": "bg, Bulgarian", "7": "en, English", "85": "be, Belarusian", "62": "sq, Albanian", "106": "es, Spanish, Castilian", "64": "af, Afrikaans", "89": "fo, Faroese", "33": "sv, Swedish", "73": "th, Thai", "57": "tg, Tajik", "66": "bs, Bosnian", "39": "zh-CN, Chinese", "71": "gv, Manx", "21": "te, Telugu", "108": "mi, Maori", "93": "oc, Occitan", "88": "ja, Japanese", "82": "ky, Kirghiz, Kyrgyz", "28": "sr, Serbian", "12": "it, Italian", "42": "fa, Persian", "41": "lo, Lao", "1": "zh, Chinese", "54": "gl, Galician", "79": "no, Norwegian", "11": "rw, Kinyarwanda", "45": "pa, Punjabi, Panjabi"} \ No newline at end of file diff --git a/files/lang_dict_95.json b/files/lang_dict_95.json new file mode 100644 index 0000000..623bf86 --- /dev/null +++ b/files/lang_dict_95.json @@ -0,0 +1 @@ +{"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, 
Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"} \ No newline at end of file diff --git a/files/lang_group_dict_116.json b/files/lang_group_dict_116.json deleted file mode 100644 index 1238b63..0000000 --- a/files/lang_group_dict_116.json +++ /dev/null @@ -1 +0,0 @@ -{"0": ["Dutch, Flemish", "Afrikaans", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Polish", "Slovak", "Ukrainian", 
"Czech", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Danish", "Norwegian", "Norwegian Nynorsk", "Swedish"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Luxembourgish, Letzeburgesch", "German", "Yiddish"], "8": ["Irish"], "9": ["Portuguese", "Galician", "Spanish", "Catalan, Valencian", "Occitan", "Italian", "Spanish, Castilian"], "10": ["Maltese", "Arabic"], "11": ["Marathi"], "12": ["Welsh", "Breton"], "13": ["Hindi", "Urdu"], "14": ["Lao", "Thai"], "15": ["Malay", "Indonesian"], "16": ["Romanian, Moldavian, Moldovan"], "17": ["Tagalog"], "18": ["Persian", "Tajik"], "19": ["Kazakh", "Kirghiz, Kyrgyz", "Uzbek"], "20": ["Romansh"], "21": ["Kinyarwanda"], "22": ["Bashkir", "Tatar"], "23": ["French"], "24": ["Chinese"], "25": ["Lingala"], "26": ["Divehi, Dhivehi, Maldivian"], "27": ["Yoruba"], "28": ["Sinhala, Sinhalese"], "29": ["Lithuanian"], "30": ["Assamese"], "31": ["Korean"], "32": ["Gujarati"], "33": ["Basque"], "34": ["Hausa"], "35": ["Punjabi, Panjabi"], "36": ["Maori"], "37": ["Pashto, Pushto"], "38": ["Esperanto"], "39": ["Swahili"], "40": ["Abkhazian"], "41": ["Albanian"], "42": ["Armenian"], "43": ["Mongolian"], "44": ["Tamil"], "45": ["Haitian, Haitian Creole"], "46": ["Georgian"], "47": ["Japanese"], "48": ["Vietnamese"], "49": ["Amharic"], "50": ["Hungarian"], "51": ["Sanskrit"], "52": ["Chuvash"], "53": ["Shona"], "54": ["Latin"], "55": ["Central Khmer"], "56": ["Malagasy"], "57": ["Nepali"], "58": ["Ganda"], "59": ["Telugu"], "60": ["Oriya"], "61": ["Burmese"], "62": ["Icelandic"], "63": ["Greek, Modern (1453\u2013)"], "64": ["Guarani"], "65": ["Interlingua (International Auxiliary Language Association)"], "66": ["Malayalam"], "67": ["Tibetan"], "68": ["Faroese"], "69": ["Turkmen"], "70": ["Manx"], "71": ["Latvian"], "72": ["Somali"], "73": ["Bengali"], "74": ["Sundanese"], "75": ["Sindhi"], "76": ["Kannada"]} \ No newline at end of file diff --git a/files/lang_group_dict_95.json 
b/files/lang_group_dict_95.json new file mode 100644 index 0000000..1e612a0 --- /dev/null +++ b/files/lang_group_dict_95.json @@ -0,0 +1 @@ +{"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], "42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]} \ No newline at end of file diff --git a/hubconf.py b/hubconf.py index 138d66b..5b986df 100644 --- a/hubconf.py +++ b/hubconf.py @@ -134,19 +134,19 @@ def silero_lang_detector(**kwargs): return model, utils -def 
silero_lang_detector_116(**kwargs): - """Silero Language Classifier (116 languages) +def silero_lang_detector_95(**kwargs): + """Silero Language Classifier (95 languages) Returns a model with a set of utils Please see https://github.com/snakers4/silero-vad for usage examples """ hub_dir = torch.hub.get_dir() - model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/lang_classifier_116.jit') + model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/lang_classifier_95.jit') - with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_116.json', 'r') as f: + with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_95.json', 'r') as f: lang_dict = json.load(f) - with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_116.json', 'r') as f: + with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_95.json', 'r') as f: lang_group_dict = json.load(f) utils = (get_language_and_group, read_audio)