Merge pull request #80 from snakers4/adamnsandle

add 116 lang classifier
This commit is contained in:
Alexander Veysov
2021-07-09 15:02:21 +03:00
committed by GitHub
7 changed files with 112 additions and 2 deletions

View File

@@ -94,6 +94,7 @@ The models are small enough to be included directly into this repository. Newer
| `'silero_vad_mini_8k'` | 100K | VAD | Yes | `ru`, `en`, `de`, `es` (*) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
| `'silero_number_detector'` | 1.1M | Number Detector | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
| `'silero_lang_detector'` | 1.1M | Language Classifier | No | `ru`, `en`, `de`, `es` | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
| `'silero_lang_detector_116'` | 1.7M | Language Classifier | No | [116 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_116.json) | :heavy_check_mark: | :heavy_check_mark: | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) |
(*) Though explicitly trained on these languages, VAD should work on any Germanic, Romance or Slavic Languages out of the box.
@@ -102,6 +103,7 @@ What models do:
- VAD - detects speech;
- Number Detector - detects spoken numbers (i.e. thirty five);
- Language Classifier - classifies the language of an utterance;
- Language Classifier 116 - classifies among 116 languages as well as 77 language groups (mutually intelligible languages -> same group)
### Version History
@@ -116,6 +118,8 @@ What models do:
| `v2.1` | 2021-02-11 | Add micro (10k params) VAD models |
| `v2.2` | 2021-03-22 | Add micro 8000 sample rate VAD models |
| `v2.3` | 2021-04-12 | Add mini (100k params) VAD models (8k and 16k sample rate) + **new** adaptive utils for full audio and single audio stream |
| `v2.4` | 2021-07-09 | Add 116 languages classifier and group classifier |
### PyTorch
@@ -190,7 +194,7 @@ number_timestamps = get_number_ts(wav, model)
pprint(number_timestamps)
```
#### Language Classifier
#### Language Classifier (4 languages)
[![Open on Torch Hub](https://img.shields.io/badge/Torch-Hub-red?logo=pytorch&style=for-the-badge)](https://pytorch.org/hub/snakers4_silero-vad_language/)
@@ -213,6 +217,31 @@ language = get_language(wav, model)
pprint(language)
```
#### Language Classifier (116 languages)
[![Open on Torch Hub](https://img.shields.io/badge/Torch-Hub-red?logo=pytorch&style=for-the-badge)](https://pytorch.org/hub/snakers4_silero-vad_language/)
```python
import torch
torch.set_num_threads(1)
from pprint import pprint
model, lang_dict, lang_group_dict, utils = torch.hub.load(
repo_or_dir='snakers4/silero-vad',
model='silero_lang_detector_116',
force_reload=True)
get_language_and_group, read_audio = utils
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
wav = read_audio(f'{files_dir}/de.wav')
language, language_group = get_language_and_group(wav, model, lang_dict, lang_group_dict)
pprint(f'Language: {language}')
pprint(f'Language group: {language_group}')
```
### ONNX
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)
@@ -296,7 +325,7 @@ number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)
pprint(number_timestamps)
```
#### Language Classifier
#### Language Classifier (4 languages)
```python
import torch
@@ -327,6 +356,44 @@ wav = read_audio(f'{files_dir}/de.wav')
language = get_language(wav, model, run_function=validate_onnx)
print(language)
```
#### Language Classifier (116 languages)
```python
import torch
import onnxruntime
from pprint import pprint
model, lang_dict, lang_group_dict, utils = torch.hub.load(
repo_or_dir='snakers4/silero-vad',
model='silero_lang_detector_116',
force_reload=True)
get_language_and_group, read_audio = utils
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
def init_onnx_model(model_path: str):
return onnxruntime.InferenceSession(model_path)
def validate_onnx(model, inputs):
with torch.no_grad():
ort_inputs = {'input': inputs.cpu().numpy()}
outs = model.run(None, ort_inputs)
outs = [torch.Tensor(x) for x in outs]
return outs
model = init_onnx_model(f'{files_dir}/lang_classifier_116.onnx')
wav = read_audio(f'{files_dir}/de.wav')
language, language_group = get_language_and_group(wav, model, lang_dict, lang_group_dict, run_function=validate_onnx)
pprint(f'Language: {language}')
pprint(f'Language group: {language_group}')
```
[![Open on Torch Hub](https://img.shields.io/badge/Torch-Hub-red?logo=pytorch&style=for-the-badge)](https://pytorch.org/hub/snakers4_silero-vad_language/)
## Metrics
### Performance Metrics
@@ -464,6 +531,12 @@ Please see [Quality Metrics](#quality-metrics)
- More languages TBD
- Arbitrary audio length can be used, although network was trained using audio shorter than 15 seconds
### How Language Classifier 116 Works
- **83%** validation accuracy among 116 languages, **87%** validation accuracy among [77 language groups](https://github.com/snakers4/silero-vad/blob/master/files/lang_group_dict_116.json)
- Language classifier 116 was trained using audio samples in [116 languages](https://github.com/snakers4/silero-vad/blob/master/files/lang_dict_116.json)
- Arbitrary audio length can be used, although network was trained using audio shorter than 20 seconds
## Contact
### Get in Touch

Binary file not shown.

Binary file not shown.

1
files/lang_dict_116.json Normal file
View File

@@ -0,0 +1 @@
{"43": "tt, Tatar", "37": "am, Amharic", "70": "uk, Ukrainian", "40": "si, Sinhala, Sinhalese", "77": "mk, Macedonian", "63": "lb, Luxembourgish, Letzeburgesch", "20": "ka, Georgian", "35": "ar, Arabic", "95": "lv, Latvian", "25": "tk, Turkmen", "67": "uz, Uzbek", "50": "mg, Malagasy", "56": "pt, Portuguese", "91": "sl, Slovenian", "115": "pa-IN, Punjabi, Panjabi", "52": "eo, Esperanto", "55": "sa, Sanskrit", "114": "gn, Guarani", "61": "de, German", "69": "id, Indonesian", "97": "sk, Slovak", "47": "kk, Kazakh", "111": "ps, Pashto, Pushto", "98": "et, Estonian", "102": "sv-SE, Swedish", "68": "su, Sundanese", "10": "ba, Bashkir", "76": "ga-IE, Irish", "24": "el, Greek, Modern (1453\u2013)", "65": "km, Central Khmer", "17": "sd, Sindhi", "30": "ne, Nepali", "99": "nn, Norwegian Nynorsk", "9": "da, Danish", "109": "bn, Bengali", "104": "ia, Interlingua (International Auxiliary Language Association)", "113": "ab, Abkhazian", "19": "nl, Dutch, Flemish", "96": "ur, Urdu", "16": "mr, Marathi", "86": "ms, Malay", "26": "br, Breton", "84": "tl, Tagalog", "4": "fy-NL, Western Frisian", "15": "lt, Lithuanian", "13": "mn, Mongolian", "29": "my, Burmese", "27": "cv, Chuvash", "38": "yi, Yiddish", "8": "yo, Yoruba", "112": "or, Oriya", "18": "gu, Gujarati", "101": "as, Assamese", "107": "sn, Shona", "34": "hu, Hungarian", "110": "ca, Catalan, Valencian", "44": "so, Somali", "23": "is, Icelandic", "60": "az, Azerbaijani", "2": "ln, Lingala", "5": "hi, Hindi", "31": "cs, Czech", "74": "bo, Tibetan", "90": "kn, Kannada", "49": "zh-HK, Chinese", "22": "ha, Hausa", "14": "cy, Welsh", "87": "ko, Korean", "0": "fr, French", "78": "la, Latin", "103": "zh-TW, Chinese", "53": "vi, Vietnamese", "81": "lg, Ganda", "83": "pl, Polish", "59": "ro, Romanian, Moldavian, Moldovan", "72": "hr, Croatian", "48": "ht, Haitian, Haitian Creole", "36": "mt, Maltese", "100": "fi, Finnish", "46": "tr, Turkish", "80": "ml, Malayalam", "32": "eu, Basque", "75": "rm-sursilv, Romansh", "92": "sw, Swahili", 
"51": "ta, Tamil", "3": "dv, Divehi, Dhivehi, Maldivian", "105": "rm-vallader, Romansh", "58": "hy, Armenian", "6": "ru, Russian", "94": "bg, Bulgarian", "7": "en, English", "85": "be, Belarusian", "62": "sq, Albanian", "106": "es, Spanish, Castilian", "64": "af, Afrikaans", "89": "fo, Faroese", "33": "sv, Swedish", "73": "th, Thai", "57": "tg, Tajik", "66": "bs, Bosnian", "39": "zh-CN, Chinese", "71": "gv, Manx", "21": "te, Telugu", "108": "mi, Maori", "93": "oc, Occitan", "88": "ja, Japanese", "82": "ky, Kirghiz, Kyrgyz", "28": "sr, Serbian", "12": "it, Italian", "42": "fa, Persian", "41": "lo, Lao", "1": "zh, Chinese", "54": "gl, Galician", "79": "no, Norwegian", "11": "rw, Kinyarwanda", "45": "pa, Punjabi, Panjabi"}

View File

@@ -0,0 +1 @@
{"0": ["Dutch, Flemish", "Afrikaans", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Polish", "Slovak", "Ukrainian", "Czech", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Danish", "Norwegian", "Norwegian Nynorsk", "Swedish"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Luxembourgish, Letzeburgesch", "German", "Yiddish"], "8": ["Irish"], "9": ["Portuguese", "Galician", "Spanish", "Catalan, Valencian", "Occitan", "Italian", "Spanish, Castilian"], "10": ["Maltese", "Arabic"], "11": ["Marathi"], "12": ["Welsh", "Breton"], "13": ["Hindi", "Urdu"], "14": ["Lao", "Thai"], "15": ["Malay", "Indonesian"], "16": ["Romanian, Moldavian, Moldovan"], "17": ["Tagalog"], "18": ["Persian", "Tajik"], "19": ["Kazakh", "Kirghiz, Kyrgyz", "Uzbek"], "20": ["Romansh"], "21": ["Kinyarwanda"], "22": ["Bashkir", "Tatar"], "23": ["French"], "24": ["Chinese"], "25": ["Lingala"], "26": ["Divehi, Dhivehi, Maldivian"], "27": ["Yoruba"], "28": ["Sinhala, Sinhalese"], "29": ["Lithuanian"], "30": ["Assamese"], "31": ["Korean"], "32": ["Gujarati"], "33": ["Basque"], "34": ["Hausa"], "35": ["Punjabi, Panjabi"], "36": ["Maori"], "37": ["Pashto, Pushto"], "38": ["Esperanto"], "39": ["Swahili"], "40": ["Abkhazian"], "41": ["Albanian"], "42": ["Armenian"], "43": ["Mongolian"], "44": ["Tamil"], "45": ["Haitian, Haitian Creole"], "46": ["Georgian"], "47": ["Japanese"], "48": ["Vietnamese"], "49": ["Amharic"], "50": ["Hungarian"], "51": ["Sanskrit"], "52": ["Chuvash"], "53": ["Shona"], "54": ["Latin"], "55": ["Central Khmer"], "56": ["Malagasy"], "57": ["Nepali"], "58": ["Ganda"], "59": ["Telugu"], "60": ["Oriya"], "61": ["Burmese"], "62": ["Icelandic"], "63": ["Greek, Modern (1453\u2013)"], "64": ["Guarani"], "65": ["Interlingua (International Auxiliary Language Association)"], "66": ["Malayalam"], "67": ["Tibetan"], "68": ["Faroese"], "69": ["Turkmen"], "70": ["Manx"], "71": ["Latvian"], "72": ["Somali"], "73": 
["Bengali"], "74": ["Sundanese"], "75": ["Sindhi"], "76": ["Kannada"]}

View File

@@ -1,5 +1,6 @@
dependencies = ['torch', 'torchaudio']
import torch
import json
from utils_vad import (init_jit_model,
get_speech_ts,
get_speech_ts_adaptive,
@@ -130,3 +131,23 @@ def silero_lang_detector(**kwargs):
read_audio)
return model, utils
def silero_lang_detector_116(**kwargs):
    """Silero Language Classifier (116 languages)
    Returns a model with a set of utils
    Please see https://github.com/snakers4/silero-vad for usage examples

    Args:
        **kwargs: accepted for a uniform hub entry-point signature; not used
            by this function.

    Returns:
        A 4-tuple ``(model, lang_dict, lang_group_dict, utils)``:
        ``model`` is the JIT model loaded from ``lang_classifier_116.jit``,
        ``lang_dict`` and ``lang_group_dict`` are the parsed contents of
        ``lang_dict_116.json`` and ``lang_group_dict_116.json``, and
        ``utils`` is ``(get_language_and_group, read_audio)``.
    """
    # Every asset lives in the torch.hub checkout of this repository;
    # build the directory once instead of repeating it per file.
    files_dir = f'{torch.hub.get_dir()}/snakers4_silero-vad_master/files'

    model = init_jit_model(model_path=f'{files_dir}/lang_classifier_116.jit')

    # Decode explicitly as UTF-8 so that parsing never depends on the
    # platform's locale encoding.
    with open(f'{files_dir}/lang_dict_116.json', 'r', encoding='utf-8') as f:
        lang_dict = json.load(f)

    with open(f'{files_dir}/lang_group_dict_116.json', 'r', encoding='utf-8') as f:
        lang_group_dict = json.load(f)

    utils = (get_language_and_group, read_audio)

    return model, lang_dict, lang_group_dict, utils

View File

@@ -329,6 +329,20 @@ def get_language(wav: torch.Tensor,
return languages[lang_pred]
def get_language_and_group(wav: torch.Tensor,
                           model,
                           lang_dict: dict,
                           lang_group_dict: dict,
                           run_function=validate):
    """Predict the language and the language group of a single utterance.

    Args:
        wav: audio tensor; a leading batch dimension is added before it is
            passed to the model.
        model: model object handed to ``run_function`` unchanged.
        lang_dict: mapping from stringified class index to language label.
        lang_group_dict: mapping from stringified class index to language
            group.
        run_function: callable ``(model, batch)`` that must return a pair
            ``(lang_logits, lang_group_logits)``.

    Returns:
        A tuple ``(lang_dict[...], lang_group_dict[...])`` for the top-scoring
        language class and language-group class.
    """
    def _best_class(logits):
        # Highest-probability class along dim 1; .item() needs a one-element
        # result, which the single-utterance batch built below should give.
        probs = torch.softmax(logits, dim=1)
        return torch.argmax(probs, dim=1).item()

    batch = torch.unsqueeze(wav, dim=0)
    lang_logits, lang_group_logits = run_function(model, batch)

    lang_idx = _best_class(lang_logits)
    group_idx = _best_class(lang_group_logits)

    # The dictionaries are loaded from JSON, hence the string keys.
    return lang_dict[str(lang_idx)], lang_group_dict[str(group_idx)]
class VADiterator:
def __init__(self,
trig_sum: float = 0.26,